Commit 069486b6 authored by Yu Liao's avatar Yu Liao Committed by yanhaitao
Browse files

cpuinspect: add CPU-inspect infrastructure

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ZBQB

----------------------------------

This adds the CPU-inspect infrastructure. CPU-inspect is designed to
provide a framework for early detection of SDC by proactively executing
CPU inspection test cases.

Silent Data Corruption (SDC), sometimes referred to as Silent Data Error
(SDE), is an industry-wide issue impacting not only long-protected memory,
storage, and networking, but also computer CPUs. As with software issues,
hardware-induced SDC can contribute to data loss and corruption. An SDC
occurs when an impacted CPU inadvertently causes errors in the data it
processes. For example, an impacted CPU might miscalculate data (i.e.,
1+1=3). There may be no indication of these computational errors unless the
software systematically checks for errors [1].

SDC issues have been around for many years, but as chips have become more
advanced and compact in size, the transistors and lines have become so tiny
that small electrical fluctuations can cause errors. Most of these errors
are caused by defects during manufacturing and are screened out by the
vendors; others are caught by hardware error detection or correction.
However, some errors go undetected by hardware; therefore only detection
software can protect against such errors [1].

[1] https://support.google.com/cloud/answer/10759085



To use CPU-inspect, you need to load at least one inspector (the driver
that actually executes the CPU inspection code).

Here is an example using CPU-inspect:

	# Set the cpumask of CPU-inspect to 10-20
	echo 10-20 > /sys/devices/system/cpu/cpuinspect/cpumask
	# Set the max CPU utility of the inspection threads to 50%
	echo 50 > /sys/devices/system/cpu/cpuinspect/cpu_utility
	# start the CPU inspection task
	echo 1 > /sys/devices/system/cpu/cpuinspect/start_patrol
	# Check the result to see whether any faulty CPUs were found
	cat /sys/devices/system/cpu/cpuinspect/result

In addition to being readable, the 'result' file in cpuinspect can also be
polled. A user that calls poll() on 'result' is woken up when a faulty CPU
is found or when the inspection task has completed.

Signed-off-by: Yu Liao <liaoyu15@huawei.com>
parent 5a39e8f3
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -242,4 +242,6 @@ source "drivers/roh/Kconfig"

source "drivers/ub/Kconfig"

source "drivers/cpuinspect/Kconfig"

endmenu
+1 −0
Original line number Diff line number Diff line
@@ -128,6 +128,7 @@ obj-$(CONFIG_EISA) += eisa/
obj-$(CONFIG_PM_OPP)		+= opp/
obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
obj-$(CONFIG_CPU_INSPECT)	+= cpuinspect/
obj-y				+= mmc/
obj-$(CONFIG_MEMSTICK)		+= memstick/
obj-$(CONFIG_NEW_LEDS)		+= leds/
+13 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
menu "CPU Inspect"

config CPU_INSPECT
	tristate "CPU inspect support"
	depends on SYSFS && 64BIT
	help
	  CPU-inspect provides a framework for early detection of Silent
	  Data Corruption (SDC) by proactively executing CPU inspection
	  test cases. It supports modular inspectors that can be swapped
	  at runtime.

	  If unsure, say N.

endmenu
+6 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for cpuinspect: cpu_inspect.o is a composite object built from
# the core, the inspector management code and the sysfs interface.
#
obj-$(CONFIG_CPU_INSPECT)	+= cpu_inspect.o
cpu_inspect-y			:= cpuinspect.o inspector.o sysfs.o
+170 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0+
/*
 * cpuinspect.c - core cpuinspect infrastructure
 *
 * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
 *
 * Author: Yu Liao <liaoyu15@huawei.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/cpuinspect.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched/task.h>
#include <linux/delay.h>
#include <linux/atomic.h>

#include "cpuinspect.h"

/* Upper bound, in microseconds, on one pause between two test groups. */
#define CPUINSPECT_SLEEP_TIMEOUT	(1000UL * 1000UL)
/*
 * Central cpuinspect state. Only the defaults that differ from zero are
 * spelled out here:
 *  - cpu_utility:   percentage used by the inspection threads to derive how
 *                   long to pause after each test group;
 *  - inspect_times: how many passes over all test groups a thread performs
 *                   before it finishes.
 */
struct cpuinspect ci_core = {
	.cpu_utility	= 90,
	.inspect_times	= 1,
};

/* Count of inspection threads that were started and have not finished yet. */
static atomic_t active_threads_num;
/* Inspection thread of each CPU, indexed by CPU number; NULL when none. */
static struct task_struct *cpuinspect_threads[NR_CPUS];
/* One bit per CPU, set when start_inspect() returned non-zero on that CPU. */
DECLARE_BITMAP(result, NR_CPUS);
/*
 * Not taken anywhere in this file.
 * NOTE(review): presumably serialises the callers of
 * start_inspect_threads()/stop_inspect_threads() - confirm against sysfs.c.
 */
DEFINE_MUTEX(cpuinspect_lock);

/* inspection thread function */
static int run_inspector(void *data)
{
	unsigned int inspect_times = 0, group, ret;
	unsigned int cpu = (unsigned long)data;
	ktime_t start_time, duration;
	unsigned long sleep_us;

	while (!kthread_should_stop()) {
		if (inspect_times >= ci_core.inspect_times || !cpu_online(cpu))
			break;

		for (group = 0; group < curr_cpu_inspector->group_num; group++) {
			start_time = ktime_get();
			ret = curr_cpu_inspector->start_inspect(group);
			if (ret) {
				set_bit(cpu, result);
				cpuinspect_result_notify();
			}

			/*
			 * Sleep for a while if user set desired cpu utility.
			 */
			duration = ktime_get() - start_time;
			sleep_us = (duration * 100 / ci_core.cpu_utility - duration) / 1000;
			/*
			 * During low cpu utility in cpu inspect we might wait a
			 * while; let's avoid the hung task warning.
			 */
			sleep_us = min(sleep_us, CPUINSPECT_SLEEP_TIMEOUT);
			/*
			 * Since usleep_range is built on top of hrtimers,
			 * and we don't want to introduce a large number of
			 * undesired interrupts, choose a range of 200us
			 * to balance performance and latency. This can
			 * cause inspection threads cpu utility is lower
			 * than required cpu utility. And this also prevents
			 * soft lockup.
			 */
			usleep_range(sleep_us, sleep_us + 200);
		}
		inspect_times++;
	}

	cpuinspect_threads[cpu] = NULL;
	/*
	 * When this condition is met, it indicate this is the final cpuinspect
	 * thread, mark inspect state as 0 and notify user that it has been
	 * completed.
	 */
	if (atomic_dec_and_test(&active_threads_num)) {
		ci_core.inspect_on = 0;
		cpuinspect_result_notify();
	}

	return 0;
}

int start_inspect_threads(void)
{
	unsigned int cpu = 0;

	bitmap_zero(result, NR_CPUS);

	ci_core.inspect_on = 1;
	for_each_cpu(cpu, &ci_core.inspect_cpumask) {
		cpuinspect_threads[cpu] = kthread_create_on_node(run_inspector,
					(void *)(unsigned long)cpu,
					cpu_to_node(cpu), "cpuinspect/%u", cpu);
		if (IS_ERR(cpuinspect_threads[cpu])) {
			cpuinspect_threads[cpu] = NULL;
			continue;
		}

		kthread_bind(cpuinspect_threads[cpu], cpu);
		wake_up_process(cpuinspect_threads[cpu]);
		atomic_inc(&active_threads_num);
	}

	/*
	 * If creating inspection threads for all CPUs in mask fails (or
	 * inspect_cpumask is empty), notify user, mark the inspection status
	 * as 0 and simply exit.
	 */
	if (unlikely(!atomic_read(&active_threads_num))) {
		ci_core.inspect_on = 0;
		cpuinspect_result_notify();
	}

	return 0;
}

int stop_inspect_threads(void)
{
	unsigned int cpu = 0;

	/* All inspection threads has been stopped */
	if (atomic_read(&active_threads_num) == 0)
		return 0;

	for_each_cpu(cpu, &ci_core.inspect_cpumask) {
		if (cpuinspect_threads[cpu])
			kthread_stop(cpuinspect_threads[cpu]);
	}

	return 0;
}

/**
 * cpuinspect_init - set up the cpuinspect core
 *
 * Selects every CPU for inspection by default and adds the cpuinspect
 * interface below the CPU subsystem root device.
 *
 * Return: whatever cpuinspect_add_interface() returns.
 */
static int __init cpuinspect_init(void)
{
	int ret;

	/* Default inspection mask: all CPUs. */
	cpumask_copy(&ci_core.inspect_cpumask, cpu_all_mask);

	ret = cpuinspect_add_interface(cpu_subsys.dev_root);
	return ret;
}

/**
 * cpuinspect_exit - tear down the cpuinspect core
 *
 * Removes the cpuinspect interface from the CPU subsystem root device.
 *
 * NOTE(review): inspection threads are not stopped here. Confirm that none
 * can still be running when the module is unloaded.
 */
static void __exit cpuinspect_exit(void)
{
	/* A void function must not return an expression. */
	cpuinspect_remove_interface(cpu_subsys.dev_root);
}

module_init(cpuinspect_init);
module_exit(cpuinspect_exit);
/* Read-only (0444) string parameter "inspector", backed by param_inspector. */
module_param_string(inspector, param_inspector, CPUINSPECT_NAME_LEN, 0444);
MODULE_AUTHOR("Yu Liao <liaoyu15@huawei.com>");
MODULE_DESCRIPTION("Core CPU-inspect infrastructure");
MODULE_LICENSE("GPL");
Loading