Commit a805889a authored by Mukul Joshi, committed by Alex Deucher

drm/amdkfd: Update SDMA queue management for GFX9.4.3



This patch updates SDMA queue management for multi-XCC partitions in GFX9.4.3:
- Allocate/deallocate SDMA queues from the correct SDMA engines
  based on the partition mode.
- Update the kgd2kfd interface to fetch the correct SDMA register
  addresses.
- Fix debugfs to dump the correct SDMA queue info.

v2: squash in fix "drm/amdkfd: Fix XGMI SDMA user-mode queue allocation"

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f38f147a
+190 −4
@@ -31,6 +31,192 @@
#include "oss/osssys_4_0_sh_mask.h"
#include "v9_structs.h"
#include "soc15.h"
#include "sdma/sdma_4_4_2_offset.h"
#include "sdma/sdma_4_4_2_sh_mask.h"

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
					unsigned int engine_id,
					unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, engine_id,
					regSDMA_RLC0_RB_CNTL) -
					regSDMA_RLC0_RB_CNTL;
	uint32_t retval = sdma_engine_reg_base +
		  queue_id * (regSDMA_RLC1_RB_CNTL - regSDMA_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
							queue_id, retval);
	return retval;
}
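
For illustration, a minimal userspace sketch of the stride arithmetic above, with invented register offsets standing in for the sdma_4_4_2 register map (in the driver, the engine base comes from SOC15_REG_OFFSET()):

/* Hedged sketch of get_sdma_rlc_reg_offset(): each RLC queue's
 * register block sits a fixed stride after queue 0. All offsets
 * here are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define RLC0_RB_CNTL 0x100	/* hypothetical queue-0 register */
#define RLC1_RB_CNTL 0x140	/* hypothetical queue-1 register */

static uint32_t rlc_reg_offset(uint32_t engine_base, unsigned int queue_id)
{
	/* Queue N's registers start one stride per queue past queue 0 */
	return engine_base + queue_id * (RLC1_RB_CNTL - RLC0_RB_CNTL);
}

int main(void)
{
	/* engine_base plays the role of SOC15_REG_OFFSET(...) - regSDMA_RLC0_RB_CNTL */
	printf("SDMA0 RLC3 base: 0x%x\n", rlc_reg_offset(0x4000, 3));
	return 0;
}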

int kgd_gfx_v9_4_3_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
				 uint32_t __user *wptr, struct mm_struct *mm)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
							m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
		if (data & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL_OFFSET,
		m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA_RLC0_DOORBELL,
				ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR,
					m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI,
					m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
			lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
			upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
			m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
			m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA_RLC0_RB_CNTL,
				RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, data);

	return 0;
}
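
One detail worth calling out in the load path: the write pointer is restored as two 32-bit halves, falling back to the saved read pointer when the user-mode wptr cannot be read. A hedged sketch of the split, with an invented pointer value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t wptr = 0x100000004ULL;	/* hypothetical user-mode write pointer */

	/* Matches the lower_32_bits()/upper_32_bits() pair used above */
	uint32_t lo = (uint32_t)wptr;
	uint32_t hi = (uint32_t)(wptr >> 32);

	printf("RB_WPTR=0x%08x RB_WPTR_HI=0x%08x\n", lo, hi);
	return 0;
}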

int kgd_gfx_v9_4_3_hqd_sdma_dump(struct amdgpu_device *adev,
				 uint32_t engine_id, uint32_t queue_id,
				 uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
							engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+12)
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))      \
			break;				\
		(*dump)[i][0] = (addr) << 2;            \
		(*dump)[i++][1] = RREG32(addr);         \
	} while (0)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = regSDMA_RLC0_RB_CNTL; reg <= regSDMA_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA_RLC0_STATUS; reg <= regSDMA_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA_RLC0_IB_SUB_REMAIN;
	     reg <= regSDMA_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA_RLC0_MIDCMD_DATA0;
	     reg <= regSDMA_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}
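
For reference, the dump buffer built above holds HQD_N_REGS = 19+6+7+12 = 44 pairs of { dword address << 2, value }, one per register in the four contiguous ranges. A hedged userspace sketch of that layout (address and value invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HQD_N_REGS (19 + 6 + 7 + 12)	/* the four SDMA RLC register ranges */

int main(void)
{
	/* Same shape as the driver's (*dump)[2] array */
	uint32_t (*dump)[2] = calloc(HQD_N_REGS, sizeof(*dump));
	if (!dump)
		return 1;

	dump[0][0] = 0x100 << 2;	/* hypothetical register, as a byte address */
	dump[0][1] = 0x12345678;	/* hypothetical value from RREG32() */
	printf("%d entries, first: addr 0x%x val 0x%x\n",
	       HQD_N_REGS, dump[0][0], dump[0][1]);

	free(dump);
	return 0;
}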

bool kgd_gfx_v9_4_3_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
							m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

int kgd_gfx_v9_4_3_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
				    unsigned int utimeout)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
							m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
	temp = temp & ~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
		if (temp & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL) |
		SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr =
			RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
			RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI);

	return 0;
}
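
The destroy path uses the same bounded idle-poll as the load path: disable the ring, poll CONTEXT_STATUS until IDLE or until the timeout budget runs out. A hedged userspace sketch of that pattern, with jiffies/HZ replaced by a simple counter:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for RREG32(...CONTEXT_STATUS): pretend the engine goes
 * idle after a few polls. Entirely hypothetical.
 */
static bool engine_idle(void)
{
	static int polls;
	return ++polls >= 3;
}

int main(void)
{
	int budget = 2000 / 500;	/* ~2s at one poll per 500us sleep */

	while (!engine_idle()) {
		if (--budget < 0) {
			fprintf(stderr, "SDMA RLC not idle\n");
			return 1;	/* the driver returns -ETIME here */
		}
		/* usleep_range(500, 1000) in the kernel */
	}
	printf("ring idle; safe to tear down\n");
	return 0;
}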

static int kgd_gfx_v9_4_3_set_pasid_vmid_mapping(struct amdgpu_device *adev,
				u32 pasid, unsigned int vmid, uint32_t inst)
@@ -166,13 +352,13 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
	.init_interrupts = kgd_gfx_v9_init_interrupts,
	.hqd_load = kgd_gfx_v9_4_3_hqd_load,
	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
-	.hqd_sdma_load = kgd_arcturus_hqd_sdma_load,
+	.hqd_sdma_load = kgd_gfx_v9_4_3_hqd_sdma_load,
	.hqd_dump = kgd_gfx_v9_hqd_dump,
-	.hqd_sdma_dump = kgd_arcturus_hqd_sdma_dump,
+	.hqd_sdma_dump = kgd_gfx_v9_4_3_hqd_sdma_dump,
	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
-	.hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied,
+	.hqd_sdma_is_occupied = kgd_gfx_v9_4_3_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
-	.hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy,
+	.hqd_sdma_destroy = kgd_gfx_v9_4_3_hqd_sdma_destroy,
	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
	.get_atc_vmid_pasid_mapping_info =
				kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
+5 −3
@@ -741,6 +741,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
		if (!node)
			goto node_alloc_error;

+		node->node_id = i;
		node->adev = kfd->adev;
		node->kfd = kfd;
		node->kfd2kgd = kfd->kfd2kgd;
@@ -1323,15 +1324,16 @@ unsigned int kfd_get_num_sdma_engines(struct kfd_node *node)
{
	/* If XGMI is not supported, all SDMA engines are PCIe */
	if (!node->adev->gmc.xgmi.supported)
-		return node->adev->sdma.num_instances;
+		return node->adev->sdma.num_instances/(int)node->kfd->num_nodes;

-	return min(node->adev->sdma.num_instances, 2);
+	return min(node->adev->sdma.num_instances/(int)node->kfd->num_nodes, 2);
}

unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
{
	/* After reserved for PCIe, the rest of engines are XGMI */
-	return node->adev->sdma.num_instances - kfd_get_num_sdma_engines(node);
+	return node->adev->sdma.num_instances/(int)node->kfd->num_nodes -
+		kfd_get_num_sdma_engines(node);
}
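
To make the per-partition accounting concrete: a sketch with invented counts, assuming an XGMI-capable device with 16 SDMA instances split across 4 KFD nodes (the real values come from adev->sdma.num_instances and kfd->num_nodes):

#include <stdio.h>

int main(void)
{
	unsigned int num_instances = 16;	/* hypothetical total SDMA engines */
	unsigned int num_nodes = 4;		/* hypothetical KFD partitions */

	unsigned int per_node = num_instances / num_nodes;
	/* Mirrors kfd_get_num_sdma_engines(): at most 2 PCIe engines per node */
	unsigned int pcie = per_node < 2 ? per_node : 2;
	/* Mirrors kfd_get_num_xgmi_sdma_engines(): the remainder are XGMI */
	unsigned int xgmi = per_node - pcie;

	printf("each node: %u engines = %u PCIe + %u XGMI\n",
	       per_node, pcie, xgmi);
	return 0;
}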

#if defined(CONFIG_DEBUG_FS)
+27 −32
@@ -124,6 +124,15 @@ static inline uint64_t get_reserved_sdma_queues_bitmap(struct device_queue_manag
	return dqm->dev->kfd->device_info.reserved_sdma_queues_bitmap;
}

+static void init_sdma_bitmaps(struct device_queue_manager *dqm)
+{
+	bitmap_zero(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	bitmap_set(dqm->sdma_bitmap, 0, get_num_sdma_queues(dqm));
+
+	bitmap_zero(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	bitmap_set(dqm->xgmi_sdma_bitmap, 0, get_num_xgmi_sdma_queues(dqm));
+}

void program_sh_mem_settings(struct device_queue_manager *dqm,
					struct qcm_process_device *qpd)
{
@@ -1268,24 +1277,6 @@ static void init_interrupts(struct device_queue_manager *dqm)
	}
}

-static void init_sdma_bitmaps(struct device_queue_manager *dqm)
-{
-	unsigned int num_sdma_queues =
-		min_t(unsigned int, sizeof(dqm->sdma_bitmap)*8,
-		      get_num_sdma_queues(dqm));
-	unsigned int num_xgmi_sdma_queues =
-		min_t(unsigned int, sizeof(dqm->xgmi_sdma_bitmap)*8,
-		      get_num_xgmi_sdma_queues(dqm));
-
-	if (num_sdma_queues)
-		dqm->sdma_bitmap = GENMASK_ULL(num_sdma_queues-1, 0);
-	if (num_xgmi_sdma_queues)
-		dqm->xgmi_sdma_bitmap = GENMASK_ULL(num_xgmi_sdma_queues-1, 0);
-
-	dqm->sdma_bitmap &= ~get_reserved_sdma_queues_bitmap(dqm);
-	pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap);
-}
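
The motivation for this conversion, sketched: a uint64_t mask caps the per-node queue count at 64, while DECLARE_BITMAP() scales to KFD_MAX_SDMA_QUEUES (128). Below is a hedged userspace approximation of the kernel's set_bit()/clear_bit()/find_first_bit() on a two-word bitmap:

#include <stdint.h>
#include <stdio.h>

#define MAX_QUEUES 128	/* KFD_MAX_SDMA_QUEUES in this patch */

static uint64_t bm[MAX_QUEUES / 64];	/* roughly what DECLARE_BITMAP() expands to */

static void set_bit_(unsigned int b)   { bm[b / 64] |=  1ULL << (b % 64); }
static void clear_bit_(unsigned int b) { bm[b / 64] &= ~(1ULL << (b % 64)); }

static unsigned int find_first_bit_(unsigned int size)
{
	for (unsigned int b = 0; b < size; b++)
		if (bm[b / 64] & (1ULL << (b % 64)))
			return b;
	return size;	/* no bit set, as the kernel helper reports */
}

int main(void)
{
	set_bit_(70);	/* a queue id > 63: unrepresentable in a single u64 */
	unsigned int bit = find_first_bit_(MAX_QUEUES);
	clear_bit_(bit);	/* "allocate" it, as allocate_sdma_queue() does */
	printf("allocated SDMA queue %u\n", bit);
	return 0;
}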

static int initialize_nocpsch(struct device_queue_manager *dqm)
{
	int pipe, queue;
@@ -1375,46 +1366,49 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
	int bit;

	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		if (dqm->sdma_bitmap == 0) {
+		if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
			pr_err("No more SDMA queue to allocate\n");
			return -ENOMEM;
		}

		if (restore_sdma_id) {
			/* Re-use existing sdma_id */
-			if (!(dqm->sdma_bitmap & (1ULL << *restore_sdma_id))) {
+			if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
				pr_err("SDMA queue already in use\n");
				return -EBUSY;
			}
-			dqm->sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+			clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
			q->sdma_id = *restore_sdma_id;
		} else {
			/* Find first available sdma_id */
-			bit = __ffs64(dqm->sdma_bitmap);
-			dqm->sdma_bitmap &= ~(1ULL << bit);
+			bit = find_first_bit(dqm->sdma_bitmap,
+					     get_num_sdma_queues(dqm));
+			clear_bit(bit, dqm->sdma_bitmap);
			q->sdma_id = bit;
		}

-		q->properties.sdma_engine_id = q->sdma_id %
-				kfd_get_num_sdma_engines(dqm->dev);
+		q->properties.sdma_engine_id =
+			dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
+			q->sdma_id % kfd_get_num_sdma_engines(dqm->dev);
		q->properties.sdma_queue_id = q->sdma_id /
				kfd_get_num_sdma_engines(dqm->dev);
	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-		if (dqm->xgmi_sdma_bitmap == 0) {
+		if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
			pr_err("No more XGMI SDMA queue to allocate\n");
			return -ENOMEM;
		}
		if (restore_sdma_id) {
			/* Re-use existing sdma_id */
-			if (!(dqm->xgmi_sdma_bitmap & (1ULL << *restore_sdma_id))) {
+			if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
				pr_err("SDMA queue already in use\n");
				return -EBUSY;
			}
-			dqm->xgmi_sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+			clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
			q->sdma_id = *restore_sdma_id;
		} else {
-			bit = __ffs64(dqm->xgmi_sdma_bitmap);
-			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+			bit = find_first_bit(dqm->xgmi_sdma_bitmap,
+					     get_num_xgmi_sdma_queues(dqm));
+			clear_bit(bit, dqm->xgmi_sdma_bitmap);
			q->sdma_id = bit;
		}
		/* sdma_engine_id is sdma id including
@@ -1424,6 +1418,7 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
		 * PCIe-optimized ones
		 */
		q->properties.sdma_engine_id =
+			dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
			kfd_get_num_sdma_engines(dqm->dev) +
			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
		q->properties.sdma_queue_id = q->sdma_id /
@@ -1442,11 +1437,11 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
		if (q->sdma_id >= get_num_sdma_queues(dqm))
			return;
-		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+		set_bit(q->sdma_id, dqm->sdma_bitmap);
	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
			return;
-		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+		set_bit(q->sdma_id, dqm->xgmi_sdma_bitmap);
	}
}
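
A worked sketch of the new global engine numbering, with invented per-node counts (2 PCIe + 2 XGMI engines per node), matching the sdma_engine_id formulas above:

#include <stdio.h>

int main(void)
{
	unsigned int node_id = 1;	/* hypothetical: second partition */
	unsigned int all_engines = 4;	/* get_num_all_sdma_engines(): per-node total */
	unsigned int pcie_engines = 2;	/* kfd_get_num_sdma_engines() */
	unsigned int xgmi_engines = 2;	/* kfd_get_num_xgmi_sdma_engines() */
	unsigned int sdma_id = 5;	/* bit allocated from xgmi_sdma_bitmap */

	/* KFD_QUEUE_TYPE_SDMA_XGMI case from allocate_sdma_queue() */
	unsigned int engine = node_id * all_engines + pcie_engines +
			      sdma_id % xgmi_engines;
	unsigned int queue  = sdma_id / xgmi_engines;

	printf("XGMI sdma_id %u on node %u -> engine %u, queue %u\n",
	       sdma_id, node_id, engine, queue);
	return 0;
}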

+2 −2
@@ -239,8 +239,8 @@ struct device_queue_manager {
	unsigned int		total_queue_count;
	unsigned int		next_pipe_to_allocate;
	unsigned int		*allocated_queues;
-	uint64_t		sdma_bitmap;
-	uint64_t		xgmi_sdma_bitmap;
+	DECLARE_BITMAP(sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	DECLARE_BITMAP(xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
	/* the pasid mapping for each kfd vmid */
	uint16_t		vmid_pasid[VMID_NUM];
	uint64_t		pipelines_addr;
+3 −0
@@ -113,6 +113,8 @@

#define KFD_UNMAP_LATENCY_MS	(4000)

+#define KFD_MAX_SDMA_QUEUES	128

/*
 * 512 = 0x200
 * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
@@ -260,6 +262,7 @@ struct kfd_vmid_info {
struct kfd_dev;

struct kfd_node {
+	unsigned int node_id;
	struct amdgpu_device *adev;     /* Duplicated here along with keeping
					 * a copy in kfd_dev to save a hop
					 */