Commit c65b0805 authored by Luben Tuikov's avatar Luben Tuikov Committed by Alex Deucher
Browse files

drm/amdgpu: RAS EEPROM table is now in debugfs



Add "ras_eeprom_size" file in debugfs, which
reports the maximum size allocated to the RAS
table in EEROM, as the number of bytes and the
number of records it could store. For instance,

$cat /sys/kernel/debug/dri/0/ras/ras_eeprom_size
262144 bytes or 10921 records
$_

Add "ras_eeprom_table" file in debugfs, which
dumps the RAS table stored EEPROM, in a formatted
way. For instance,

$cat ras_eeprom_table
 Signature    Version  FirstOffs       Size   Checksum
0x414D4452 0x00010000 0x00000014 0x000000EC 0x000000DA
Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage
    0 0x00014      ue    0x00 0x00000000607608DC 0x000000000000   0x00    0x00 0x000000000000
    1 0x0002C      ue    0x00 0x00000000607608DC 0x000000001000   0x00    0x00 0x000000000001
    2 0x00044      ue    0x00 0x00000000607608DC 0x000000002000   0x00    0x00 0x000000000002
    3 0x0005C      ue    0x00 0x00000000607608DC 0x000000003000   0x00    0x00 0x000000000003
    4 0x00074      ue    0x00 0x00000000607608DC 0x000000004000   0x00    0x00 0x000000000004
    5 0x0008C      ue    0x00 0x00000000607608DC 0x000000005000   0x00    0x00 0x000000000005
    6 0x000A4      ue    0x00 0x00000000607608DC 0x000000006000   0x00    0x00 0x000000000006
    7 0x000BC      ue    0x00 0x00000000607608DC 0x000000007000   0x00    0x00 0x000000000007
    8 0x000D4      ue    0x00 0x00000000607608DD 0x000000008000   0x00    0x00 0x000000000008
$_

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Cc: Xinhui Pan <xinhui.pan@amd.com>
Signed-off-by: default avatarLuben Tuikov <luben.tuikov@amd.com>
Acked-by: default avatarAlexander Deucher <Alexander.Deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 63d4c081
Loading
Loading
Loading
Loading
+9 −3
Original line number Diff line number Diff line
@@ -404,8 +404,8 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
		/* umc ce/ue error injection for a bad page is not allowed */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
					"as bad before error injection!\n",
			dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
				 "already been marked as bad!\n",
				 data.inject.address);
			break;
		}
@@ -1301,6 +1301,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
			   &con->bad_page_cnt_threshold);
	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_size_ops);
	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
						       S_IRUGO, dir, adev,
						       &amdgpu_ras_debugfs_eeprom_table_ops);
	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);

	/*
	 * After one uncorrectable error happens, usually GPU recovery will
+1 −0
Original line number Diff line number Diff line
@@ -318,6 +318,7 @@ struct amdgpu_ras {
	/* sysfs */
	struct device_attribute features_attr;
	struct bin_attribute badpages_attr;
	struct dentry *de_ras_eeprom_table;
	/* block array */
	struct ras_manager *objs;

+234 −7
Original line number Diff line number Diff line
@@ -27,6 +27,8 @@
#include <linux/bits.h>
#include "atom.h"
#include "amdgpu_eeprom.h"
#include <linux/debugfs.h>
#include <linux/uaccess.h>

#define EEPROM_I2C_MADDR_VEGA20         0x0
#define EEPROM_I2C_MADDR_ARCTURUS       0x40000
@@ -70,6 +72,13 @@
#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
				      (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)

/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
 * of "fri", return the absolute record index off of the end of
 * the table header.
 */
#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
			      (_C)->ras_max_record_count)

#define RAS_NUM_RECS(_tbl_hdr)  (((_tbl_hdr)->tbl_size - \
				  RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)

@@ -77,13 +86,10 @@

static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
{
	if ((adev->asic_type == CHIP_VEGA20) ||
	    (adev->asic_type == CHIP_ARCTURUS) ||
	    (adev->asic_type == CHIP_SIENNA_CICHLID) ||
	    (adev->asic_type == CHIP_ALDEBARAN))
		return true;

	return false;
	return  adev->asic_type == CHIP_VEGA20 ||
		adev->asic_type == CHIP_ARCTURUS ||
		adev->asic_type == CHIP_SIENNA_CICHLID ||
		adev->asic_type == CHIP_ALDEBARAN;
}

static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
@@ -258,6 +264,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
	control->ras_num_recs = 0;
	control->ras_fri = 0;

	amdgpu_ras_debugfs_set_ret_size(control);

	mutex_unlock(&control->ras_tbl_mutex);

	return res;
@@ -591,6 +599,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
	res = amdgpu_ras_eeprom_append_table(control, record, num);
	if (!res)
		res = amdgpu_ras_eeprom_update_header(control);
	if (!res)
		amdgpu_ras_debugfs_set_ret_size(control);

	mutex_unlock(&control->ras_tbl_mutex);
	return res;
@@ -734,6 +744,223 @@ inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
	return RAS_MAX_RECORD_COUNT;
}

static ssize_t
amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
				    size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
	u8 data[50];
	int res;

	if (!size)
		return size;

	if (!ras || !control) {
		res = snprintf(data, sizeof(data), "Not supported\n");
	} else {
		res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
			       RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
	}

	if (*pos >= res)
		return 0;

	res -= *pos;
	res = min_t(size_t, res, size);

	if (copy_to_user(buf, &data[*pos], res))
		return -EINVAL;

	*pos += res;

	return res;
}

const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_size_read,
	.write = NULL,
	.llseek = default_llseek,
};

static const char *tbl_hdr_str = " Signature    Version  FirstOffs       Size   Checksum\n";
static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
static const char *rec_hdr_str = "Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage\n";
static const char *rec_hdr_fmt = "%5d 0x%05X %7s    0x%02X 0x%016llX 0x%012llX   0x%02X    0x%02X 0x%012llX\n";
#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)

static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
	"ignore",
	"re",
	"ue",
};

static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
{
	return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
		strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
}

void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
{
	struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
					      eeprom_control);
	struct dentry *de = ras->de_ras_eeprom_table;

	if (de)
		d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
}

static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
					     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
	const size_t orig_size = size;
	int res = -EINVAL;
	size_t data_len;

	mutex_lock(&control->ras_tbl_mutex);

	/* We want *pos - data_len > 0, which means there's
	 * bytes to be printed from data.
	 */
	data_len = strlen(tbl_hdr_str);
	if (*pos < data_len) {
		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
	if (*pos < data_len && size > 0) {
		u8 data[tbl_hdr_fmt_size + 1];
		loff_t lpos;

		snprintf(data, sizeof(data), tbl_hdr_fmt,
			 control->tbl_hdr.header,
			 control->tbl_hdr.version,
			 control->tbl_hdr.first_rec_offset,
			 control->tbl_hdr.tbl_size,
			 control->tbl_hdr.checksum);

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str);
		if (copy_to_user(buf, &data[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
	if (*pos < data_len && size > 0) {
		loff_t lpos;

		data_len -= *pos;
		data_len = min_t(size_t, data_len, size);
		lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
		if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
			goto Out;
		buf += data_len;
		size -= data_len;
		*pos += data_len;
	}

	data_len = amdgpu_ras_debugfs_table_size(control);
	if (*pos < data_len && size > 0) {
		u8 dare[RAS_TABLE_RECORD_SIZE];
		u8 data[rec_hdr_fmt_size + 1];
		/* Find the starting record index
		 */
		int s = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			 strlen(rec_hdr_str)) / rec_hdr_fmt_size;
		int r = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
			 strlen(rec_hdr_str)) % rec_hdr_fmt_size;
		struct eeprom_table_record record;

		for ( ; size > 0 && s < control->ras_num_recs; s++) {
			u32 ai = RAS_RI_TO_AI(control, s);
			/* Read a single record
			 */
			res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
			if (res)
				goto Out;
			__decode_table_record_from_buf(control, &record, dare);
			snprintf(data, sizeof(data), rec_hdr_fmt,
				 s,
				 RAS_INDEX_TO_OFFSET(control, ai),
				 record_err_type_str[record.err_type],
				 record.bank,
				 record.ts,
				 record.offset,
				 record.mem_channel,
				 record.mcumc_id,
				 record.retired_page);

			data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
			if (copy_to_user(buf, &data[r], data_len))
				return -EINVAL;
			buf += data_len;
			size -= data_len;
			*pos += data_len;
			r = 0;
		}
	}
	res = 0;
Out:
	mutex_unlock(&control->ras_tbl_mutex);
	return res < 0 ? res : orig_size - size;
}

static ssize_t
amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
				     size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
	u8 data[81];
	int res;

	if (!size)
		return size;

	if (!ras || !control) {
		res = snprintf(data, sizeof(data), "Not supported\n");
		if (*pos >= res)
			return 0;

		res -= *pos;
		res = min_t(size_t, res, size);

		if (copy_to_user(buf, &data[*pos], res))
			return -EINVAL;

		*pos += res;

		return res;
	} else {
		return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
	}
}

const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_eeprom_table_read,
	.write = NULL,
	.llseek = default_llseek,
};

/**
 * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
 * @control: pointer to control structure
+8 −2
Original line number Diff line number Diff line
@@ -29,9 +29,10 @@
struct amdgpu_device;

enum amdgpu_ras_eeprom_err_type {
	AMDGPU_RAS_EEPROM_ERR_PLACE_HOLDER,
	AMDGPU_RAS_EEPROM_ERR_NA,
	AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE
	AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
	AMDGPU_RAS_EEPROM_ERR_COUNT,
};

struct amdgpu_ras_eeprom_table_header {
@@ -121,4 +122,9 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,

inline uint32_t amdgpu_ras_eeprom_max_record_count(void);

void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);

extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;

#endif // _AMDGPU_RAS_EEPROM_H