Commit 22106ed0, authored by Tao Zhou, committed by Alex Deucher
Browse files

drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err



bad_page_threshold controls page retirement behavior, so it should
also be checked.

v2: simplify the condition of bad page handling path.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f3cbe70e
Loading
Loading
Loading
Loading
+14 −5
Original line number Diff line number Diff line
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!__is_ras_eeprom_supported(adev))
	if (!__is_ras_eeprom_supported(adev) ||
	    !amdgpu_bad_page_threshold)
		return false;

	/* skip check eeprom table for VEGA20 Gaming */
@@ -428,11 +429,19 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
			return false;

	if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
		if (amdgpu_bad_page_threshold == -1) {
			dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
				con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
			dev_warn(adev->dev,
				"But GPU can be operated due to bad_page_threshold = -1.\n");
			return false;
		} else {
			dev_warn(adev->dev, "This GPU is in BAD status.");
			dev_warn(adev->dev, "Please retire it or set a larger "
				 "threshold value when reloading driver.\n");
			return true;
		}
	}

	return false;
}