Commit 42c6c482 authored by David Yat Sin's avatar David Yat Sin Committed by Alex Deucher
Browse files

drm/amdkfd: CRIU checkpoint and restore queue mqds



Checkpoint contents of queue MQD's on CRIU dump and restore them during
CRIU restore.

Reviewed-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: default avatarDavid Yat Sin <david.yatsin@amd.com>
Signed-off-by: default avatarRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 5bb6a8fa
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -311,7 +311,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
			p->pasid,
			dev->id);

	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL,
	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL,
			&doorbell_offset_in_process);
	if (err != 0)
		goto err_create_queue;
+1 −1
Original line number Diff line number Diff line
@@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
	properties.type = KFD_QUEUE_TYPE_DIQ;

	status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,
				&properties, &qid, NULL, NULL);
				&properties, &qid, NULL, NULL, NULL);

	if (status) {
		pr_err("Failed to create DIQ\n");
+67 −6
Original line number Diff line number Diff line
@@ -322,7 +322,8 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
static int create_queue_nocpsch(struct device_queue_manager *dqm,
				struct queue *q,
				struct qcm_process_device *qpd,
				const struct kfd_criu_queue_priv_data *qd)
				const struct kfd_criu_queue_priv_data *qd,
				const void *restore_mqd)
{
	struct mqd_manager *mqd_mgr;
	int retval;
@@ -381,8 +382,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
		retval = -ENOMEM;
		goto out_deallocate_doorbell;
	}

	if (qd)
		mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
				     &q->properties, restore_mqd);
	else
		mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
					&q->gart_mqd_addr, &q->properties);

	if (q->properties.is_active) {
		if (!dqm->sched_running) {
			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
@@ -1334,7 +1341,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,

static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
			struct qcm_process_device *qpd,
			const struct kfd_criu_queue_priv_data *qd)
			const struct kfd_criu_queue_priv_data *qd,
			const void *restore_mqd)
{
	int retval;
	struct mqd_manager *mqd_mgr;
@@ -1380,6 +1388,11 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
	 * updates the is_evicted flag but is a no-op otherwise.
	 */
	q->properties.is_evicted = !!qpd->evicted;

	if (qd)
		mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr,
				     &q->properties, restore_mqd);
	else
		mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
					&q->gart_mqd_addr, &q->properties);

@@ -1784,6 +1797,50 @@ static int get_wave_state(struct device_queue_manager *dqm,
			ctl_stack_used_size, save_area_used_size);
}

static void get_queue_checkpoint_info(struct device_queue_manager *dqm,
			const struct queue *q,
			u32 *mqd_size)
{
	struct mqd_manager *mqd_mgr;
	enum KFD_MQD_TYPE mqd_type =
			get_mqd_type_from_queue_type(q->properties.type);

	dqm_lock(dqm);
	mqd_mgr = dqm->mqd_mgrs[mqd_type];
	*mqd_size = mqd_mgr->mqd_size;

	dqm_unlock(dqm);
}

static int checkpoint_mqd(struct device_queue_manager *dqm,
			  const struct queue *q,
			  void *mqd)
{
	struct mqd_manager *mqd_mgr;
	int r = 0;
	enum KFD_MQD_TYPE mqd_type =
			get_mqd_type_from_queue_type(q->properties.type);

	dqm_lock(dqm);

	if (q->properties.is_active || !q->device->cwsr_enabled) {
		r = -EINVAL;
		goto dqm_unlock;
	}

	mqd_mgr = dqm->mqd_mgrs[mqd_type];
	if (!mqd_mgr->checkpoint_mqd) {
		r = -EOPNOTSUPP;
		goto dqm_unlock;
	}

	mqd_mgr->checkpoint_mqd(mqd_mgr, q->mqd, mqd);

dqm_unlock:
	dqm_unlock(dqm);
	return r;
}

static int process_termination_cpsch(struct device_queue_manager *dqm,
		struct qcm_process_device *qpd)
{
@@ -1961,6 +2018,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		dqm->ops.restore_process_queues = restore_process_queues_cpsch;
		dqm->ops.get_wave_state = get_wave_state;
		dqm->ops.reset_queues = reset_queues_cpsch;
		dqm->ops.get_queue_checkpoint_info = get_queue_checkpoint_info;
		dqm->ops.checkpoint_mqd = checkpoint_mqd;
		break;
	case KFD_SCHED_POLICY_NO_HWS:
		/* initialize dqm for no cp scheduling */
@@ -1980,6 +2039,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		dqm->ops.restore_process_queues =
			restore_process_queues_nocpsch;
		dqm->ops.get_wave_state = get_wave_state;
		dqm->ops.get_queue_checkpoint_info = get_queue_checkpoint_info;
		dqm->ops.checkpoint_mqd = checkpoint_mqd;
		break;
	default:
		pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
+11 −1
Original line number Diff line number Diff line
@@ -83,13 +83,17 @@ struct device_process_node {
 * control stack, if kept in the MQD, to the given userspace address.
 *
 * @reset_queues: reset queues which consume RAS poison
 * @get_queue_checkpoint_info: Retrieves queue size information for CRIU checkpoint.
 *
 * @checkpoint_mqd: checkpoint queue MQD contents for CRIU.
 */

struct device_queue_manager_ops {
	int	(*create_queue)(struct device_queue_manager *dqm,
				struct queue *q,
				struct qcm_process_device *qpd,
				const struct kfd_criu_queue_priv_data *qd);
				const struct kfd_criu_queue_priv_data *qd,
				const void *restore_mqd);

	int	(*destroy_queue)(struct device_queue_manager *dqm,
				struct qcm_process_device *qpd,
@@ -140,6 +144,12 @@ struct device_queue_manager_ops {

	int (*reset_queues)(struct device_queue_manager *dqm,
					uint16_t pasid);
	void	(*get_queue_checkpoint_info)(struct device_queue_manager *dqm,
				  const struct queue *q, u32 *mqd_size);

	int	(*checkpoint_mqd)(struct device_queue_manager *dqm,
				  const struct queue *q,
				  void *mqd);
};

struct device_queue_manager_asic_ops {
+7 −0
Original line number Diff line number Diff line
@@ -100,6 +100,13 @@ struct mqd_manager {
				  u32 *ctl_stack_used_size,
				  u32 *save_area_used_size);

	void	(*checkpoint_mqd)(struct mqd_manager *mm, void *mqd, void *mqd_dst);

	void	(*restore_mqd)(struct mqd_manager *mm, void **mqd,
				struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
				struct queue_properties *p,
				const void *mqd_src);

#if defined(CONFIG_DEBUG_FS)
	int	(*debugfs_show_mqd)(struct seq_file *m, void *data);
#endif
Loading