Unverified Commit 0c1529d2 authored by openeuler-ci-bot, committed by Gitee
Browse files

!12300 [OLK-6.6] AMD Address Translation Library (ATL) patches

Merge Pull Request from: @PrithivishS 
 
*Description:*
--------------

Patches to introduce AMD Address Translation Library (ATL) on Turin systems.

RAS: Introduce AMD Address Translation Library
RAS/AMD/ATL: Add MI300 support
RAS/AMD/ATL: Add MI300 DRAM to normalized address translation support
RAS/AMD/ATL: Add MI300 row retirement support
RAS: Export helper to get ras_debugfs_dir
EDAC/amd64: Use new AMD Address Translation Library
RAS/AMD/ATL: Add amd_atl pr_fmt() prefix
RAS/AMD/ATL: Read DRAM hole base early
RAS/AMD/ATL: Expand helpers for adding and removing base and hole
RAS/AMD/ATL: Validate address map when information is gathered
RAS/AMD/ATL: Implement DF 4.5 NP2 denormalization
RAS: Introduce a FRU memory poison manager
RAS/AMD/FMPM: Save SPA values
RAS/AMD/FMPM: Add debugfs interface to print record entries
RAS/AMD/FMPM: Fix off by one when unwinding on error
RAS/AMD/FMPM: Avoid NULL ptr deref in get_saved_records()
RAS/AMD/FMPM: Safely handle saved records of various sizes
RAS/AMD/FMPM: Use atl internal.h for INVALID_SPA

Using RAS error injection tests, we captured the following logs:
**Without ATL patches in Turin systems**
```text
[  333.395989] mce: [Hardware Error]: Machine check events logged
[  333.396016] [Hardware Error]: Corrected error, no action required.
[  333.396028] [Hardware Error]: CPU:0 (1a:1:0) MC21_STATUS[Over|CE|MiscV|AddrV|-|-|SyndV|CECC|-|-|-]: 0xdc2040000400011b
[  333.396050] [Hardware Error]: Error Addr: 0x0000000000000000
[  333.396058] [Hardware Error]: PPIN: 0x008319f6b6f24004
[  333.396066] [Hardware Error]: IPID: 0x0000009600050f00, Syndrome: 0x7c7600010a800100
[  333.396078] [Hardware Error]: Unified Memory Controller Ext. Error Code: 0, DRAM ECC error.
[  333.396096] umc_normaddr_to_sysaddr: Invalid DramBaseAddress range: 0x0.
[  333.396119] EDAC MC0: 1 CE Cannot decode normalized address on mc#0csrow#0channel#0 (csrow:0 channel:0 page:0x0 offset:0x0 grain:64 syndrome:0x1)
[  333.396139] [Hardware Error]: cache level: L3/GEN, tx: GEN, mem-tx: RD
```

**With ATL patches in Turin systems**
```text
[  333.098899] [Hardware Error]: Corrected error, no action required.
[  333.098910] [Hardware Error]: CPU:0 (1a:1:0) MC21_STATUS[Over|CE|MiscV|AddrV|-|-|SyndV|CECC|-|-|-]: 0xdc2040000400011b
[  333.098929] [Hardware Error]: Error Addr: 0x0000000000000000
[  333.098936] [Hardware Error]: PPIN: 0x008319f6b6f24004
[  333.098943] [Hardware Error]: IPID: 0x0000009600050f00, Syndrome: 0x7c7600010a800100
[  333.098954] [Hardware Error]: Unified Memory Controller Ext. Error Code: 0, DRAM ECC error.
[  333.098992] EDAC MC0: 1 CE on mc#0csrow#0channel#0 (csrow:0 channel:0 page:0x0 offset:0x0 grain:64 syndrome:0x1)
[  333.099005] [Hardware Error]: cache level: L3/GEN, tx: GEN, mem-tx: RD
```

The Address Translation Library (ATL) backport has also been system tested.
 
Link:https://gitee.com/openeuler/kernel/pulls/12300

 

Reviewed-by: XiaoFei Tan <tanxiaofei@huawei.com>
Signed-off-by: Zhang Peng <zhangpeng362@huawei.com>
parents 43acbd37 0966ae2b
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -891,6 +891,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/
F:	drivers/infiniband/hw/efa/
F:	include/uapi/rdma/efa-abi.h
AMD ADDRESS TRANSLATION LIBRARY (ATL)
M:	Yazen Ghannam <Yazen.Ghannam@amd.com>
L:	linux-edac@vger.kernel.org
S:	Supported
F:	drivers/ras/amd/atl/*
AMD CDX BUS DRIVER
M:	Nipun Gupta <nipun.gupta@amd.com>
M:	Nikhil Agarwal <nikhil.agarwal@amd.com>
@@ -18050,6 +18056,12 @@ L: linux-wireless@vger.kernel.org
S:	Orphan
F:	drivers/net/wireless/legacy/ray*
RAS FRU MEMORY POISON MANAGER (FMPM)
M:	Yazen Ghannam <Yazen.Ghannam@amd.com>
L:	linux-edac@vger.kernel.org
S:	Maintained
F:	drivers/ras/amd/fmpm.c
RC-CORE / LIRC FRAMEWORK
M:	Sean Young <sean@mess.org>
L:	linux-media@vger.kernel.org
+22 −12
Original line number Diff line number Diff line
@@ -918,6 +918,12 @@ CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y
# end of GCOV-based kernel profiling

#
# Profile Guided Optimization (PGO)
#
CONFIG_ARCH_SUPPORTS_PGO_CLANG=y
# end of Profile Guided Optimization (PGO)

CONFIG_HAVE_GCC_PLUGINS=y
CONFIG_FUNCTION_ALIGNMENT_4B=y
CONFIG_FUNCTION_ALIGNMENT_16B=y
@@ -1143,6 +1149,7 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y
# CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set
CONFIG_THP_SWAP=y
CONFIG_READ_ONLY_THP_FOR_FS=y
CONFIG_PGTABLE_HAS_HUGE_LEAVES=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
CONFIG_USE_PERCPU_NUMA_NODE_ID=y
@@ -3082,6 +3089,8 @@ CONFIG_NET_VENDOR_BZWX=y
CONFIG_NCE=m
CONFIG_NE6X=m
CONFIG_NE6XVF=m
CONFIG_NET_VENDOR_NEBULA_MATRIX=y
CONFIG_NBL_CORE=m
# CONFIG_FDDI is not set
# CONFIG_HIPPI is not set
# CONFIG_NET_SB1000 is not set
@@ -3091,8 +3100,6 @@ CONFIG_SWPHY=y
CONFIG_LED_TRIGGER_PHY=y
CONFIG_FIXED_PHY=y
CONFIG_SFP=m
CONFIG_NET_VENDOR_NEBULA_MATRIX=y
CONFIG_NBL_CORE=m

#
# MII PHY device drivers
@@ -3898,6 +3905,8 @@ CONFIG_TCG_INFINEON=m
# CONFIG_TCG_XEN is not set
CONFIG_TCG_CRB=y
# CONFIG_TCG_VTPM_PROXY is not set
CONFIG_TCG_HYGON=m
CONFIG_TCM_HYGON=m
CONFIG_TCG_TIS_ST33ZP24=m
CONFIG_TCG_TIS_ST33ZP24_I2C=m
CONFIG_TCG_TIS_ST33ZP24_SPI=m
@@ -5526,6 +5535,7 @@ CONFIG_DVB_CXD2099=m
# Graphics support
#
CONFIG_APERTURE_HELPERS=y
CONFIG_SCREEN_INFO=y
CONFIG_VIDEO_CMDLINE=y
CONFIG_VIDEO_NOMODESET=y
# CONFIG_AUXDISPLAY is not set
@@ -5736,6 +5746,7 @@ CONFIG_FB_SYS_IMAGEBLIT=y
# CONFIG_FB_FOREIGN_ENDIAN is not set
CONFIG_FB_SYS_FOPS=y
CONFIG_FB_DEFERRED_IO=y
CONFIG_FB_IOMEM_FOPS=y
CONFIG_FB_IOMEM_HELPERS=y
CONFIG_FB_SYSMEM_HELPERS=y
CONFIG_FB_SYSMEM_HELPERS_DEFERRED=y
@@ -6856,8 +6867,8 @@ CONFIG_INFINIBAND_USER_MEM=y
CONFIG_INFINIBAND_ON_DEMAND_PAGING=y
CONFIG_INFINIBAND_ADDR_TRANS=y
CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y
CONFIG_INFINIBAND_VIRT_DMA=y
CONFIG_INFINIBAND_PEER_MEMORY=y
CONFIG_INFINIBAND_VIRT_DMA=y
CONFIG_INFINIBAND_BNXT_RE=m
CONFIG_INFINIBAND_CXGB4=m
# CONFIG_INFINIBAND_EFA is not set
@@ -7508,8 +7519,8 @@ CONFIG_HID_SENSOR_ACCEL_3D=m
# CONFIG_AD7923 is not set
# CONFIG_AD7949 is not set
# CONFIG_AD799X is not set
# CONFIG_ADI_AXI_ADC is not set
# CONFIG_AD9467 is not set
# CONFIG_ADI_AXI_ADC is not set
# CONFIG_ENVELOPE_DETECTOR is not set
# CONFIG_HI8435 is not set
# CONFIG_HX711 is not set
@@ -8026,6 +8037,8 @@ CONFIG_IDLE_INJECT=y

CONFIG_RAS=y
# CONFIG_RAS_CEC is not set
CONFIG_AMD_ATL=m
CONFIG_RAS_FMPM=m
CONFIG_USB4=m
# CONFIG_USB4_DEBUGFS_WRITE is not set
# CONFIG_USB4_DMA_TEST is not set
@@ -8568,8 +8581,6 @@ CONFIG_LIST_HARDENED=y

CONFIG_RANDSTRUCT_NONE=y
# end of Kernel hardening options

# CONFIG_SECURITY_BOOT_INIT is not set
# end of Security options

CONFIG_XOR_BLOCKS=m
@@ -8590,6 +8601,7 @@ CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_ALGAPI2=y
CONFIG_CRYPTO_AEAD=y
CONFIG_CRYPTO_AEAD2=y
CONFIG_CRYPTO_SIG=y
CONFIG_CRYPTO_SIG2=y
CONFIG_CRYPTO_SKCIPHER=y
CONFIG_CRYPTO_SKCIPHER2=y
@@ -8813,19 +8825,17 @@ CONFIG_CRYPTO_DEV_ZHAOXIN_AES=m
CONFIG_CRYPTO_DEV_ZHAOXIN_SHA=m
# CONFIG_CRYPTO_DEV_ATMEL_ECC is not set
# CONFIG_CRYPTO_DEV_ATMEL_SHA204A is not set
CONFIG_HYGON_GM=y
CONFIG_CRYPTO_DEV_CCP=y
CONFIG_CRYPTO_DEV_CCP_DD=m
CONFIG_CRYPTO_DEV_SP_CCP=y
CONFIG_CRYPTO_DEV_CCP_CRYPTO=m
CONFIG_CRYPTO_DEV_SP_PSP=y
CONFIG_HYGON_GM=y
# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set
CONFIG_HYGON_PSP2CPU_CMD=y
CONFIG_TCG_HYGON=m
CONFIG_TCM_HYGON=m
CONFIG_TDM_DEV_HYGON=y
CONFIG_TDM_KERNEL_GUARD=m
CONFIG_CRYPTO_DEV_HCT=m
# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set
CONFIG_CRYPTO_DEV_NITROX=m
CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m
CONFIG_CRYPTO_DEV_QAT=m
@@ -8838,13 +8848,13 @@ CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m
CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m
CONFIG_CRYPTO_DEV_QAT_C62XVF=m
# CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION is not set
CONFIG_CRYPTO_DEV_IAA_CRYPTO=m
CONFIG_CRYPTO_DEV_IAA_CRYPTO_STATS=y
CONFIG_CRYPTO_DEV_CHELSIO=m
# CONFIG_CRYPTO_DEV_VIRTIO is not set
# CONFIG_CRYPTO_DEV_SAFEXCEL is not set
# CONFIG_CRYPTO_DEV_AMLOGIC_GXL is not set
CONFIG_CRYPTO_DEV_TSSE=m
CONFIG_CRYPTO_DEV_IAA_CRYPTO=m
CONFIG_CRYPTO_DEV_IAA_CRYPTO_STATS=y
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
CONFIG_X509_CERTIFICATE_PARSER=y
+1 −0
Original line number Diff line number Diff line
@@ -78,6 +78,7 @@ config EDAC_GHES
config EDAC_AMD64
	tristate "AMD64 (Opteron, Athlon64)"
	depends on AMD_NB && EDAC_DECODE_MCE
	imply AMD_ATL
	help
	  Support for error detection and correction of DRAM ECC errors on
	  the AMD64 families (>= K8) of memory controllers.
+17 −8
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ras.h>
#include "amd64_edac.h"
#include <asm/amd_nb.h>

@@ -3115,6 +3116,7 @@ static void decode_umc_error(int node_id, struct mce *m)
	u8 ecc_type = (m->status >> 45) & 0x3;
	struct mem_ctl_info *mci;
	struct amd64_pvt *pvt;
	struct atl_err a_err;
	struct err_info err;
	u64 sys_addr;
	u8 umc;
@@ -3148,16 +3150,23 @@ static void decode_umc_error(int node_id, struct mce *m)

	pvt->ops->get_err_info(m, &err);

	if (hygon_f18h_m4h() && boot_cpu_data.x86_model == 0x6)
	if (hygon_f18h_m4h() && boot_cpu_data.x86_model == 0x6) {
		umc = err.channel << 1;
	else
		umc = err.channel;

		if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, umc, &sys_addr)) {
			err.err_code = ERR_NORM_ADDR;
			goto log_error;
		}
	} else {
		a_err.addr = m->addr;
		a_err.ipid = m->ipid;
		a_err.cpu  = m->extcpu;

		sys_addr = (u64)amd_convert_umc_mca_addr_to_sys_addr(&a_err);
		if (IS_ERR_VALUE(sys_addr)) {
			err.err_code = ERR_NORM_ADDR;
			goto log_error;
		}
	}
	error_address_to_page_and_offset(sys_addr, &err);

log_error:
+13 −0
Original line number Diff line number Diff line
@@ -44,5 +44,18 @@ if RAS

source "arch/x86/ras/Kconfig"
source "drivers/ras/hisilicon/Kconfig"
source "drivers/ras/amd/atl/Kconfig"

config RAS_FMPM
	tristate "FRU Memory Poison Manager"
	default m
	depends on AMD_ATL && ACPI_APEI
	help
	  Support saving and restoring memory error information across reboot
	  using ACPI ERST as persistent storage. Error information is saved with
	  the UEFI CPER "FRU Memory Poison" section format.

	  Memory will be retired during boot time and run time depending on
	  platform-specific policies.

endif
Loading