Commit 69691c82 authored by Stanley.Yang's avatar Stanley.Yang Committed by Alex Deucher
Browse files

drm/amdgpu: message smu to update bad channel info



It should notice SMU to update bad channel info when detected
uncorrectable error in UMC block

Signed-off-by: default avatarStanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent d510eccf
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->eeprom_control.bad_channel_bitmap = 0;

	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
			goto free;

		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

		if (con->update_channel_flag == true) {
			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
			con->update_channel_flag = false;
		}
	}

#ifdef CONFIG_X86_MCE_AMD
@@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
		goto release_con;
	}

	con->update_channel_flag = false;
	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need get this flag from vbios. */
+3 −0
Original line number Diff line number Diff line
@@ -374,6 +374,9 @@ struct amdgpu_ras {

	/* record umc error info queried from smu */
	struct umc_ecc_info umc_ecc;

	/* Indicates smu whether need update bad channel info */
	bool update_channel_flag;
};

struct ras_fs_data {
+23 −2
Original line number Diff line number Diff line
@@ -267,6 +267,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	u8 csum;
	int res;

@@ -287,6 +288,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)

	amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);

	control->bad_channel_bitmap = 0;
	amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
	con->update_channel_flag = false;

	amdgpu_ras_debugfs_set_ret_size(control);

	mutex_unlock(&control->ras_tbl_mutex);
@@ -420,6 +425,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
			       struct eeprom_table_record *record,
			       const u32 num)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
	u32 a, b, i;
	u8 *buf, *pp;
	int res;
@@ -431,9 +437,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
	/* Encode all of them in one go.
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__encode_table_record_to_buf(control, &record[i], pp);

		/* update bad channel bitmap */
		if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}

	/* a, first record index to write into.
	 * b, last record index to write into.
	 * a = first index to read (fri) + number of records in the table,
@@ -686,6 +699,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
			   const u32 num)
{
	struct amdgpu_device *adev = to_amdgpu_device(control);
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int i, res;
	u8 *buf, *pp;
	u32 g0, g1;
@@ -753,8 +767,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
	/* Read up everything? Then transform.
	 */
	pp = buf;
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
	for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
		__decode_table_record_from_buf(control, &record[i], pp);

		/* update bad channel bitmap */
		if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
			control->bad_channel_bitmap |= 1 << record[i].mem_channel;
			con->update_channel_flag = true;
		}
	}
Out:
	kfree(buf);
	mutex_unlock(&control->ras_tbl_mutex);
+4 −0
Original line number Diff line number Diff line
@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
	/* Protect table access via this mutex.
	 */
	struct mutex ras_tbl_mutex;

	/* Record channel info which occurred bad pages
	 */
	u32 bad_channel_bitmap;
};

/*
+5 −0
Original line number Diff line number Diff line
@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
			amdgpu_ras_save_bad_pages(adev);

			amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

			if (con->update_channel_flag == true) {
				amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
				con->update_channel_flag = false;
			}
		}

		if (reset)