Commit c0cc999f authored by Ma Jun, committed by Alex Deucher

drm/amdkfd: Fix array-index-out-of-bounds warning



For some GPUs with more CUs, the original sibling_map[32]
in struct crat_subtype_cache is not large enough
to hold the cache information when creating the VCRAT table.
Fix this by skipping the struct crat_subtype_cache info and
instead filling struct kfd_cache_properties directly.
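
For reference, a minimal standalone sketch of the size arithmetic (the CU
topology values below are illustrative assumptions, not taken from any
specific ASIC): fill_in_l2_l3_pcache() stores 4 sibling-map bytes per
(shader engine, shader array) pair, so a GPU with enough shader arrays
overruns the old 32-byte array:

	/* Hedged sketch; the CU counts are assumed for illustration only. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int num_shader_engines = 6;           /* assumed */
		unsigned int num_shader_arrays_per_engine = 2; /* assumed */

		/* fill_in_l2_l3_pcache() writes 4 sibling-map bytes per
		 * (SE, SA) pair, so the space required is:
		 */
		unsigned int needed = num_shader_engines *
				      num_shader_arrays_per_engine * 4;

		/* 6 * 2 * 4 = 48 bytes: this overflows the 32-byte
		 * crat_subtype_cache::sibling_map, but fits the 64-byte
		 * kfd_cache_properties::sibling_map (CACHE_SIBLINGMAP_SIZE).
		 */
		printf("sibling_map bytes needed: %u\n", needed);
		return 0;
	}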

Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent cfa61b8f
drivers/gpu/drm/amd/amdkfd/kfd_crat.c +34 −278
@@ -50,16 +50,6 @@ static inline unsigned int get_and_inc_gpu_processor_id(
	return current_id;
}

/* Static table to describe GPU Cache information */
struct kfd_gpu_cache_info {
	uint32_t	cache_size;
	uint32_t	cache_level;
	uint32_t	flags;
	/* Indicates how many Compute Units share this cache
	 * within a SA. Value = 1 indicates the cache is not shared
	 */
	uint32_t	num_cu_shared;
};

static struct kfd_gpu_cache_info kaveri_cache_info[] = {
	{
@@ -1119,9 +1109,13 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
			props->cachelines_per_tag = cache->lines_per_tag;
			props->cache_assoc = cache->associativity;
			props->cache_latency = cache->cache_latency;

			memcpy(props->sibling_map, cache->sibling_map,
					sizeof(props->sibling_map));

			/* set the sibling_map_size to 32 for CRAT from ACPI */
			props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;

			if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
				props->cache_type |= HSA_CACHE_TYPE_DATA;
			if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
@@ -1338,125 +1332,6 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
	return ret;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int mem_available,
				int cu_bitmask,
				int cache_type, unsigned int cu_processor_id,
				int cu_block)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;

	/* First check if enough memory is available */
	if (sizeof(struct crat_subtype_cache) > mem_available)
		return -ENOMEM;

	cu_sibling_map_mask = cu_bitmask;
	cu_sibling_map_mask >>= cu_block;
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU. In case of a non-shared cache, check if the CU is
	 * inactive. If inactive, skip it.
	 */
	if (first_active_cu) {
		memset(pcache, 0, sizeof(struct crat_subtype_cache));
		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
		pcache->length = sizeof(struct crat_subtype_cache);
		pcache->flags = pcache_info[cache_type].flags;
		pcache->processor_id_low = cu_processor_id
					 + (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask =
			cu_sibling_map_mask >> (first_active_cu - 1);

		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
		pcache->sibling_map[1] =
				(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
		pcache->sibling_map[2] =
				(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
		pcache->sibling_map[3] =
				(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
		return 0;
	}
	return 1;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int mem_available,
				int cache_type, unsigned int cu_processor_id)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;
	int i, j, k;

	/* First check if enough memory is available */
	if (sizeof(struct crat_subtype_cache) > mem_available)
		return -ENOMEM;

	cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU. In case of a non-shared cache, check if the CU is
	 * inactive. If inactive, skip it.
	 */
	if (first_active_cu) {
		memset(pcache, 0, sizeof(struct crat_subtype_cache));
		pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
		pcache->length = sizeof(struct crat_subtype_cache);
		pcache->flags = pcache_info[cache_type].flags;
		pcache->processor_id_low = cu_processor_id
					 + (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask =
			cu_sibling_map_mask >> (first_active_cu - 1);
		k = 0;
		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine;
				j++) {
				pcache->sibling_map[k] =
				 (uint8_t)(cu_sibling_map_mask & 0xFF);
				pcache->sibling_map[k+1] =
				 (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
				pcache->sibling_map[k+2] =
				 (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
				pcache->sibling_map[k+3] =
				 (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
				k += 4;
				cu_sibling_map_mask =
					cu_info->cu_bitmap[i % 4][j + i / 4];
				cu_sibling_map_mask &= (
				 (1 << pcache_info[cache_type].num_cu_shared)
				 - 1);
			}
		}
		return 0;
	}
	return 1;
}

#define KFD_MAX_CACHE_TYPES 6

static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
						   struct kfd_gpu_cache_info *pcache_info)
@@ -1530,231 +1405,133 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
	return i;
}

/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
 * tables
 *
 *	@kdev - [IN] GPU device
 *	@gpu_processor_id - [IN] GPU processor ID to which these caches
 *			    associate
 *	@available_size - [IN] Amount of memory available in pcache
 *	@cu_info - [IN] Compute Unit info obtained from KGD
 *	@pcache - [OUT] memory into which cache data is to be filled in.
 *	@size_filled - [OUT] amount of data used up in pcache.
 *	@num_of_entries - [OUT] number of caches added
 */
static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
			int gpu_processor_id,
			int available_size,
			struct kfd_cu_info *cu_info,
			struct crat_subtype_cache *pcache,
			int *size_filled,
			int *num_of_entries)
{
	struct kfd_gpu_cache_info *pcache_info;
	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info)
{
	int num_of_cache_types = 0;
	int i, j, k;
	int ct = 0;
	int mem_available = available_size;
	unsigned int cu_processor_id;
	int ret;
	unsigned int num_cu_shared;

	switch (kdev->adev->asic_type) {
	case CHIP_KAVERI:
		pcache_info = kaveri_cache_info;
		*pcache_info = kaveri_cache_info;
		num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
		break;
	case CHIP_HAWAII:
		pcache_info = hawaii_cache_info;
		*pcache_info = hawaii_cache_info;
		num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
		break;
	case CHIP_CARRIZO:
		pcache_info = carrizo_cache_info;
		*pcache_info = carrizo_cache_info;
		num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
		break;
	case CHIP_TONGA:
		pcache_info = tonga_cache_info;
		*pcache_info = tonga_cache_info;
		num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
		break;
	case CHIP_FIJI:
		pcache_info = fiji_cache_info;
		*pcache_info = fiji_cache_info;
		num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
		break;
	case CHIP_POLARIS10:
		pcache_info = polaris10_cache_info;
		*pcache_info = polaris10_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
		break;
	case CHIP_POLARIS11:
		pcache_info = polaris11_cache_info;
		*pcache_info = polaris11_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
		break;
	case CHIP_POLARIS12:
		pcache_info = polaris12_cache_info;
		*pcache_info = polaris12_cache_info;
		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
		break;
	case CHIP_VEGAM:
		pcache_info = vegam_cache_info;
		*pcache_info = vegam_cache_info;
		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
		break;
	default:
		switch (KFD_GC_VERSION(kdev)) {
		case IP_VERSION(9, 0, 1):
			pcache_info = vega10_cache_info;
			*pcache_info = vega10_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
			break;
		case IP_VERSION(9, 2, 1):
			pcache_info = vega12_cache_info;
			*pcache_info = vega12_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
			break;
		case IP_VERSION(9, 4, 0):
		case IP_VERSION(9, 4, 1):
			pcache_info = vega20_cache_info;
			*pcache_info = vega20_cache_info;
			num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
			break;
		case IP_VERSION(9, 4, 2):
			pcache_info = aldebaran_cache_info;
			*pcache_info = aldebaran_cache_info;
			num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
			break;
		case IP_VERSION(9, 1, 0):
		case IP_VERSION(9, 2, 2):
			pcache_info = raven_cache_info;
			*pcache_info = raven_cache_info;
			num_of_cache_types = ARRAY_SIZE(raven_cache_info);
			break;
		case IP_VERSION(9, 3, 0):
			pcache_info = renoir_cache_info;
			*pcache_info = renoir_cache_info;
			num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
			break;
		case IP_VERSION(10, 1, 10):
		case IP_VERSION(10, 1, 2):
		case IP_VERSION(10, 1, 3):
		case IP_VERSION(10, 1, 4):
			pcache_info = navi10_cache_info;
			*pcache_info = navi10_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
			break;
		case IP_VERSION(10, 1, 1):
			pcache_info = navi14_cache_info;
			*pcache_info = navi14_cache_info;
			num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
			break;
		case IP_VERSION(10, 3, 0):
			pcache_info = sienna_cichlid_cache_info;
			*pcache_info = sienna_cichlid_cache_info;
			num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
			break;
		case IP_VERSION(10, 3, 2):
			pcache_info = navy_flounder_cache_info;
			*pcache_info = navy_flounder_cache_info;
			num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
			break;
		case IP_VERSION(10, 3, 4):
			pcache_info = dimgrey_cavefish_cache_info;
			*pcache_info = dimgrey_cavefish_cache_info;
			num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
			break;
		case IP_VERSION(10, 3, 1):
			pcache_info = vangogh_cache_info;
			*pcache_info = vangogh_cache_info;
			num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
			break;
		case IP_VERSION(10, 3, 5):
			pcache_info = beige_goby_cache_info;
			*pcache_info = beige_goby_cache_info;
			num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
			break;
		case IP_VERSION(10, 3, 3):
			pcache_info = yellow_carp_cache_info;
			*pcache_info = yellow_carp_cache_info;
			num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
			break;
		case IP_VERSION(10, 3, 6):
			pcache_info = gc_10_3_6_cache_info;
			*pcache_info = gc_10_3_6_cache_info;
			num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
			break;
		case IP_VERSION(10, 3, 7):
			pcache_info = gfx1037_cache_info;
			*pcache_info = gfx1037_cache_info;
			num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info);
			break;
		case IP_VERSION(11, 0, 0):
		case IP_VERSION(11, 0, 1):
		case IP_VERSION(11, 0, 2):
		case IP_VERSION(11, 0, 3):
			pcache_info = cache_info;
			num_of_cache_types =
				kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
				kfd_fill_gpu_cache_info_from_gfx_config(kdev, *pcache_info);
			break;
		default:
			pcache_info = dummy_cache_info;
			*pcache_info = dummy_cache_info;
			num_of_cache_types = ARRAY_SIZE(dummy_cache_info);
			pr_warn("dummy cache info is used temporarily and real cache info needs to be updated later\n");
			break;
		}
	}

	*size_filled = 0;
	*num_of_entries = 0;

	/* For each type of cache listed in the kfd_gpu_cache_info table,
	 * go through all available Compute Units.
	 * The [i,j,k] loop will:
	 *		if kfd_gpu_cache_info.num_cu_shared == 1,
	 *			parse through all available CUs;
	 *		if kfd_gpu_cache_info.num_cu_shared != 1,
	 *			consider only one CU from each shared unit.
	 */

	for (ct = 0; ct < num_of_cache_types; ct++) {
	  cu_processor_id = gpu_processor_id;
	  if (pcache_info[ct].cache_level == 1) {
	    for (i = 0; i < cu_info->num_shader_engines; i++) {
	      for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
	        for (k = 0; k < cu_info->num_cu_per_sh;
		  k += pcache_info[ct].num_cu_shared) {
		  ret = fill_in_l1_pcache(pcache,
					pcache_info,
					cu_info,
					mem_available,
					cu_info->cu_bitmap[i % 4][j + i / 4],
					ct,
					cu_processor_id,
					k);

		  if (ret < 0)
			break;

		  if (!ret) {
				pcache++;
				(*num_of_entries)++;
				mem_available -= sizeof(*pcache);
				(*size_filled) += sizeof(*pcache);
		  }

		  /* Move to next CU block */
		  num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
					cu_info->num_cu_per_sh) ?
					pcache_info[ct].num_cu_shared :
					(cu_info->num_cu_per_sh - k);
		  cu_processor_id += num_cu_shared;
		}
	      }
	    }
	  } else {
			ret = fill_in_l2_l3_pcache(pcache,
				pcache_info,
				cu_info,
				mem_available,
				ct,
				cu_processor_id);

			if (ret < 0)
				break;

			if (!ret) {
				pcache++;
				(*num_of_entries)++;
				mem_available -= sizeof(*pcache);
				(*size_filled) += sizeof(*pcache);
			}
	  }
	}

	pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);

	return 0;
	return num_of_cache_types;
}

static bool kfd_ignore_crat(void)
@@ -2313,8 +2090,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
	struct kfd_cu_info cu_info;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	int num_of_cache_entries = 0;
	int cache_mem_filled = 0;
	uint32_t nid = 0;
	int ret = 0;

@@ -2415,31 +2190,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;

	/* TODO: Fill in cache information. This information is NOT readily
	 * available in KGD
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
		sub_type_hdr->length);
	ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
				avail_size,
				&cu_info,
				(struct crat_subtype_cache *)sub_type_hdr,
				&cache_mem_filled,
				&num_of_cache_entries);

	if (ret < 0)
		return ret;

	crat_table->length += cache_mem_filled;
	crat_table->total_entries += num_of_cache_entries;
	avail_size -= cache_mem_filled;

	/* Fill in Subtype: IO_LINKS
	 *  Only direct links are added here which is Link from GPU to
	 *  its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
		cache_mem_filled);
		sub_type_hdr->length);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

drivers/gpu/drm/amd/amdkfd/kfd_crat.h +12 −0
@@ -295,6 +295,18 @@ struct crat_subtype_generic {

struct kfd_dev;

/* Static table to describe GPU Cache information */
struct kfd_gpu_cache_info {
	uint32_t	cache_size;
	uint32_t	cache_level;
	uint32_t	flags;
	/* Indicates how many Compute Units share this cache
	 * within a SA. Value = 1 indicates the cache is not shared
	 */
	uint32_t	num_cu_shared;
};
int kfd_get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info);

int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
void kfd_destroy_crat_image(void *crat_image);
int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
drivers/gpu/drm/amd/amdkfd/kfd_topology.c +232 −13
@@ -364,7 +364,6 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,

	/* Making sure that the buffer is an empty string */
	buffer[0] = 0;

	cache = container_of(attr, struct kfd_cache_properties, attr);
	if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
		return -EPERM;
@@ -379,8 +378,9 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
	sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc);
	sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency);
	sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type);

	offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map ");
	for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
	for (i = 0; i < cache->sibling_map_size; i++)
		for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++)
			/* Check each bit */
			offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,",
@@ -1197,7 +1197,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
	struct kfd_iolink_properties *iolink;
	struct kfd_iolink_properties *p2plink;

	down_write(&topology_lock);
	list_for_each_entry(dev, &topology_device_list, list) {
		/* Discrete GPUs need their own topology device list
		 * entries. Don't assign them to CPU/APU nodes.
@@ -1221,7 +1220,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
			break;
		}
	}
	up_write(&topology_lock);
	return out_dev;
}

@@ -1592,6 +1590,221 @@ static int kfd_dev_create_p2p_links(void)
	return ret;
}


/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int cu_bitmask,
				int cache_type, unsigned int cu_processor_id,
				int cu_block)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;
	struct kfd_cache_properties *pcache = NULL;

	cu_sibling_map_mask = cu_bitmask;
	cu_sibling_map_mask >>= cu_block;
	cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU. In case of a non-shared cache, check if the CU is
	 * inactive. If inactive, skip it.
	 */
	if (first_active_cu) {
		pcache = kfd_alloc_struct(pcache);
		if (!pcache)
			return -ENOMEM;

		memset(pcache, 0, sizeof(struct kfd_cache_properties));
		pcache->processor_id_low = cu_processor_id + (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_DATA;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_CPU;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_HSACU;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask =
			cu_sibling_map_mask >> (first_active_cu - 1);

		pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
		pcache->sibling_map[1] =
				(uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
		pcache->sibling_map[2] =
				(uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
		pcache->sibling_map[3] =
				(uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);

		pcache->sibling_map_size = 4;
		*props_ext = pcache;

		return 0;
	}
	return 1;
}

/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
				struct kfd_gpu_cache_info *pcache_info,
				struct kfd_cu_info *cu_info,
				int cache_type, unsigned int cu_processor_id)
{
	unsigned int cu_sibling_map_mask;
	int first_active_cu;
	int i, j, k;
	struct kfd_cache_properties *pcache = NULL;

	cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
	cu_sibling_map_mask &=
		((1 << pcache_info[cache_type].num_cu_shared) - 1);
	first_active_cu = ffs(cu_sibling_map_mask);

	/* CU could be inactive. In case of a shared cache, find the first
	 * active CU. In case of a non-shared cache, check if the CU is
	 * inactive. If inactive, skip it.
	 */
	if (first_active_cu) {
		pcache = kfd_alloc_struct(pcache);
		if (!pcache)
			return -ENOMEM;

		memset(pcache, 0, sizeof(struct kfd_cache_properties));
		pcache->processor_id_low = cu_processor_id
					+ (first_active_cu - 1);
		pcache->cache_level = pcache_info[cache_type].cache_level;
		pcache->cache_size = pcache_info[cache_type].cache_size;

		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_DATA;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_CPU;
		if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
			pcache->cache_type |= HSA_CACHE_TYPE_HSACU;

		/* Sibling map is w.r.t processor_id_low, so shift out
		 * inactive CU
		 */
		cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
		k = 0;

		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
				pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
				pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
				pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
				pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
				k += 4;

				cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
				cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
			}
		}
		pcache->sibling_map_size = k;
		*props_ext = pcache;
		return 0;
	}
	return 1;
}

#define KFD_MAX_CACHE_TYPES 6

/* kfd_fill_cache_non_crat_info - Fill GPU cache info using kfd_gpu_cache_info
 * tables
 */
void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_dev *kdev)
{
	struct kfd_gpu_cache_info *pcache_info = NULL;
	int i, j, k;
	int ct = 0;
	unsigned int cu_processor_id;
	int ret;
	unsigned int num_cu_shared;
	struct kfd_cu_info cu_info;
	struct kfd_cu_info *pcu_info;
	int gpu_processor_id;
	struct kfd_cache_properties *props_ext;
	int num_of_entries = 0;
	int num_of_cache_types = 0;
	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];

	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
	pcu_info = &cu_info;

	gpu_processor_id = dev->node_props.simd_id_base;

	pcache_info = cache_info;
	num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info);
	if (!num_of_cache_types) {
		pr_warn("no cache info found\n");
		return;
	}

	/* For each type of cache listed in the kfd_gpu_cache_info table,
	 * go through all available Compute Units.
	 * The [i,j,k] loop will:
	 *		if kfd_gpu_cache_info.num_cu_shared == 1,
	 *			parse through all available CUs;
	 *		if kfd_gpu_cache_info.num_cu_shared != 1,
	 *			consider only one CU from each shared unit.
	 */
	for (ct = 0; ct < num_of_cache_types; ct++) {
		cu_processor_id = gpu_processor_id;
		if (pcache_info[ct].cache_level == 1) {
			for (i = 0; i < pcu_info->num_shader_engines; i++) {
				for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
					for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {

						ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
										pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
										cu_processor_id, k);

						if (ret < 0)
							break;

						if (!ret) {
							num_of_entries++;
							list_add_tail(&props_ext->list, &dev->cache_props);
						}

						/* Move to next CU block */
						num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
							pcu_info->num_cu_per_sh) ?
							pcache_info[ct].num_cu_shared :
							(pcu_info->num_cu_per_sh - k);
						cu_processor_id += num_cu_shared;
					}
				}
			}
		} else {
			ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
								pcu_info, ct, cu_processor_id);

			if (ret < 0)
				break;

			if (!ret) {
				num_of_entries++;
				list_add_tail(&props_ext->list, &dev->cache_props);
			}
		}
	}
	dev->node_props.caches_count += num_of_entries;
	pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
}

int kfd_topology_add_device(struct kfd_dev *gpu)
{
	uint32_t gpu_id;
@@ -1616,9 +1829,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
	 * CRAT to create a new topology device. Once created assign the gpu to
	 * that topology device
	 */
	down_write(&topology_lock);
	dev = kfd_assign_gpu(gpu);
	if (!dev) {
		down_write(&topology_lock);
		proximity_domain = ++topology_crat_proximity_domain;

		res = kfd_create_crat_image_virtual(&crat_image, &image_size,
@@ -1630,6 +1843,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
			topology_crat_proximity_domain--;
			return res;
		}

		res = kfd_parse_crat_table(crat_image,
					   &temp_topology_device_list,
					   proximity_domain);
@@ -1643,23 +1857,28 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
		kfd_topology_update_device_list(&temp_topology_device_list,
			&topology_device_list);

		dev = kfd_assign_gpu(gpu);
		if (WARN_ON(!dev)) {
			res = -ENODEV;
			goto err;
		}

		/* Fill the cache affinity information here for the GPUs
		 * using VCRAT
		 */
		kfd_fill_cache_non_crat_info(dev, gpu);

		/* Update the SYSFS tree, since we added another topology
		 * device
		 */
		res = kfd_topology_update_sysfs();
		up_write(&topology_lock);

		if (!res)
			sys_props.generation_count++;
		else
			pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
						gpu_id, res);
		dev = kfd_assign_gpu(gpu);
		if (WARN_ON(!dev)) {
			res = -ENODEV;
			goto err;
		}
	}
	up_write(&topology_lock);

	dev->gpu_id = gpu_id;
	gpu->id = gpu_id;
drivers/gpu/drm/amd/amdkfd/kfd_topology.h +4 −1
@@ -80,6 +80,8 @@ struct kfd_mem_properties {
	struct attribute	attr;
};

#define CACHE_SIBLINGMAP_SIZE 64

struct kfd_cache_properties {
	struct list_head	list;
	uint32_t		processor_id_low;
@@ -90,10 +92,11 @@ struct kfd_cache_properties {
	uint32_t		cache_assoc;
	uint32_t		cache_latency;
	uint32_t		cache_type;
	uint8_t			sibling_map[CRAT_SIBLINGMAP_SIZE];
	uint8_t			sibling_map[CACHE_SIBLINGMAP_SIZE];
	struct kfd_dev		*gpu;
	struct kobject		*kobj;
	struct attribute	attr;
	uint32_t		sibling_map_size;
};

struct kfd_iolink_properties {