Commit aa42d23d authored by Wang Wensheng's avatar Wang Wensheng
Browse files

mm/sharepool: Implement mg_sp_group_add_task()

ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8LNGH



---------------------------------------------

Add tasks to share pool group. The share memory regions alloced before
are mapped to the new joiner too.

Signed-off-by: default avatarWang Wensheng <wangwensheng4@huawei.com>
parent 25d9681d
Loading
Loading
Loading
Loading
+450 −2
Original line number Diff line number Diff line
@@ -477,11 +477,34 @@ static int sp_group_setup_mapping_normal(struct mm_struct *mm, struct sp_group *
	return 0;
}

static int sp_group_setup_mapping_local(struct mm_struct *mm, struct sp_group *local)
{
	struct sp_mapping *spm;

	spm = sp_mapping_create(SP_MAPPING_DVPP);
	if (!spm)
		return -ENOMEM;

	sp_mapping_attach(local, spm);
	sp_mapping_attach(local, sp_mapping_normal);
	sp_mapping_attach(local, sp_mapping_ro);

	return 0;
}

static inline bool is_local_group(int spg_id)
{
	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
}

static int sp_group_setup_mapping(struct mm_struct *mm, struct sp_group *spg)
{
	if (is_local_group(spg->id))
		return sp_group_setup_mapping_local(mm, spg);
	else
		return sp_group_setup_mapping_normal(mm, spg);
}

static int sp_init_group_master(struct task_struct *tsk, struct mm_struct *mm)
{
	return -EOPNOTSUPP;
@@ -707,6 +730,10 @@ struct sp_alloc_context {
	enum spa_type type;
};

static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa,
			    unsigned long prot, struct sp_alloc_context *ac,
			    const char *str);

struct sp_k2u_context {
	unsigned long kva;
	unsigned long kva_aligned;
@@ -716,6 +743,82 @@ struct sp_k2u_context {
	enum spa_type type;
};

static void free_sp_group_locked(struct sp_group *spg)
{
	int type;

	fput(spg->file);
	fput(spg->file_hugetlb);
	idr_remove(&sp_group_idr, spg->id);

	for (type = SP_MAPPING_START; type < SP_MAPPING_END; type++)
		sp_mapping_detach(spg, spg->mapping[type]);

	if (!is_local_group(spg->id))
		system_group_count--;

	kfree(spg);
	WARN(system_group_count < 0, "unexpected group count\n");
}

static void free_sp_group(struct sp_group *spg)
{
	down_write(&sp_global_sem);
	free_sp_group_locked(spg);
	up_write(&sp_global_sem);
}

static void sp_group_put_locked(struct sp_group *spg)
{
	lockdep_assert_held_write(&sp_global_sem);

	if (atomic_dec_and_test(&spg->use_count))
		free_sp_group_locked(spg);
}

static void sp_group_put(struct sp_group *spg)
{
	if (atomic_dec_and_test(&spg->use_count))
		free_sp_group(spg);
}

/* use with put_task_struct(task) */
static int get_task(int tgid, struct task_struct **task)
{
	struct task_struct *tsk;
	struct pid *p;

	rcu_read_lock();
	p = find_pid_ns(tgid, &init_pid_ns);
	tsk = pid_task(p, PIDTYPE_TGID);
	if (!tsk || (tsk->flags & PF_EXITING)) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(tsk);
	rcu_read_unlock();

	*task = tsk;
	return 0;
}

/*
 * the caller must:
 * 1. hold spg->rw_lock
 * 2. ensure no concurrency problem for mm_struct
 */
static bool is_process_in_group(struct sp_group *spg,
						 struct mm_struct *mm)
{
	struct sp_group_node *spg_node;

	list_for_each_entry(spg_node, &spg->proc_head, proc_node)
		if (spg_node->master->mm == mm)
			return true;

	return false;
}

/*
 * Get the sp_group from the mm and return the associated sp_group_node.
 * The caller should promise the @mm would not be deleted from the @spg.
@@ -780,6 +883,270 @@ static bool is_online_node_id(int node_id)
	return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id);
}

static void sp_group_init(struct sp_group *spg, int spg_id)
{
	spg->id = spg_id;
	spg->proc_num = 0;
	spg->spa_root = RB_ROOT;
	atomic_set(&spg->use_count, 1);
	atomic_set(&spg->spa_num, 0);
	INIT_LIST_HEAD(&spg->proc_head);
	INIT_LIST_HEAD(&spg->mnode);
	init_rwsem(&spg->rw_lock);
	meminfo_init(&spg->meminfo);
}

/*
 * sp_group_create - create a new sp_group
 * @spg_id: specify the id for the new sp_group
 *
 * valid @spg_id:
 * SPG_ID_AUTO:
 *	Allocate a id in range [SPG_ID_AUTO_MIN, APG_ID_AUTO_MAX]
 * SPG_ID_LOCAL:
 *	Allocate a id in range [SPG_ID_LOCAL_MIN, APG_ID_LOCAL_MAX]
 * [SPG_ID_MIN, SPG_ID_MAX]:
 *	Using the input @spg_id for the new sp_group.
 *
 * Return: the newly created sp_group or an errno.
 * Context: The caller should protect sp_group_idr from being access.
 */
static struct sp_group *sp_group_create(int spg_id)
{
	int ret, start, end;
	struct sp_group *spg;
	char name[DNAME_INLINE_LEN];
	int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT;

	if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM &&
		     spg_id != SPG_ID_LOCAL)) {
		pr_err("reach system max group num\n");
		return ERR_PTR(-ENOSPC);
	}

	if (spg_id == SPG_ID_LOCAL) {
		start = SPG_ID_LOCAL_MIN;
		end = SPG_ID_LOCAL_MAX + 1;
	} else if (spg_id == SPG_ID_AUTO) {
		start = SPG_ID_AUTO_MIN;
		end = SPG_ID_AUTO_MAX + 1;
	} else if (spg_id >= SPG_ID_MIN && spg_id <= SPG_ID_MAX) {
		start = spg_id;
		end = spg_id + 1;
	} else {
		pr_err("invalid input spg_id:%d\n", spg_id);
		return ERR_PTR(-EINVAL);
	}

	spg = kzalloc(sizeof(*spg), GFP_KERNEL);
	if (spg == NULL)
		return ERR_PTR(-ENOMEM);

	ret = idr_alloc(&sp_group_idr, spg, start, end, GFP_KERNEL);
	if (ret < 0) {
		pr_err("group %d idr alloc failed %d\n", spg_id, ret);
		goto out_kfree;
	}
	spg_id = ret;

	sprintf(name, "sp_group_%d", spg_id);
	spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, VM_NORESERVE);
	if (IS_ERR(spg->file)) {
		pr_err("spg file setup failed %ld\n", PTR_ERR(spg->file));
		ret = PTR_ERR(spg->file);
		goto out_idr_remove;
	}

	sprintf(name, "sp_group_%d_huge", spg_id);
	spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE,
				VM_NORESERVE, HUGETLB_ANONHUGE_INODE, hsize_log);
	if (IS_ERR(spg->file_hugetlb)) {
		pr_err("spg file_hugetlb setup failed %ld\n", PTR_ERR(spg->file_hugetlb));
		ret = PTR_ERR(spg->file_hugetlb);
		goto out_fput;
	}

	sp_group_init(spg, spg_id);

	if (!is_local_group(spg_id))
		system_group_count++;

	return spg;

out_fput:
	fput(spg->file);
out_idr_remove:
	idr_remove(&sp_group_idr, spg_id);
out_kfree:
	kfree(spg);
	return ERR_PTR(ret);
}

/* the caller must hold sp_global_sem */
static struct sp_group *sp_group_get_or_alloc(int spg_id)
{
	struct sp_group *spg;

	spg = idr_find(&sp_group_idr, spg_id);
	if (!spg || !atomic_inc_not_zero(&spg->use_count))
		spg = sp_group_create(spg_id);

	return spg;
}

/* the caller must hold sp_global_sem */
static struct sp_group_node *spg_node_alloc(struct mm_struct *mm,
	unsigned long prot, struct sp_group *spg)
{
	struct sp_group_master *master = mm->sp_group_master;
	struct sp_group_node *spg_node;

	spg_node = kzalloc(sizeof(struct sp_group_node), GFP_KERNEL);
	if (!spg_node)
		return NULL;

	INIT_LIST_HEAD(&spg_node->group_node);
	INIT_LIST_HEAD(&spg_node->proc_node);
	spg_node->spg = spg;
	spg_node->master = master;
	spg_node->prot = prot;
	meminfo_init(&spg_node->meminfo);

	return spg_node;
}

/*
 * sp_group_link_task - Actually add a task into a group
 * @mm: specify the input task
 * @spg: the sp_group
 * @prot: read/write protection for the task in the group
 *
 * The input @mm and @spg must have been initialized properly and could not
 * be freed during the sp_group_link_task().
 * the caller must hold sp_global_sem.
 */
static int sp_group_link_task(struct mm_struct *mm, struct sp_group *spg,
			      unsigned long prot, struct sp_group_node **pnode)
{
	int ret;
	struct sp_group_node *node;
	struct sp_group_master *master = mm->sp_group_master;

	if (master->group_num == MAX_GROUP_FOR_TASK) {
		pr_err("task reaches max group num\n");
		return -ENOSPC;
	}

	if (is_process_in_group(spg, mm)) {
		pr_err("task already in target group(%d)\n", spg->id);
		return -EEXIST;
	}

	if (spg->proc_num + 1 == MAX_PROC_PER_GROUP) {
		pr_err("add group: group(%d) reaches max process num\n", spg->id);
		return -ENOSPC;
	}

	node = spg_node_alloc(mm, prot, spg);
	if (!node)
		return -ENOMEM;

	ret = sp_group_setup_mapping(mm, spg);
	if (ret)
		goto out_kfree;

	/*
	 * We pin only the mm_struct instead of the memory space of the target mm.
	 * So we must ensure the existence of the memory space via mmget_not_zero
	 * before we would access it.
	 */
	mmgrab(mm);
	master->group_num++;
	list_add_tail(&node->group_node, &master->group_head);
	atomic_inc(&spg->use_count);
	spg->proc_num++;
	list_add_tail(&node->proc_node, &spg->proc_head);
	if (pnode)
		*pnode = node;

	return 0;

out_kfree:
	kfree(node);

	return ret;
}

static void sp_group_unlink_task(struct sp_group_node *spg_node)
{
	struct sp_group *spg = spg_node->spg;
	struct sp_group_master *master = spg_node->master;

	list_del(&spg_node->proc_node);
	spg->proc_num--;
	list_del(&spg_node->group_node);
	master->group_num--;

	mmdrop(master->mm);
	sp_group_put_locked(spg);
	kfree(spg_node);
}

/*
 * Find and initialize the mm of the task specified by @tgid.
 * We increace the usercount for the mm on success.
 */
static int mm_add_group_init(pid_t tgid, struct mm_struct **pmm)
{
	int ret;
	struct mm_struct *mm;
	struct task_struct *tsk;

	ret = get_task(tgid, &tsk);
	if (ret)
		return ret;

	/*
	 * group_leader: current thread may be exiting in a multithread process
	 *
	 * DESIGN IDEA
	 * We increase mm->mm_users deliberately to ensure it's decreased in
	 * share pool under only 2 circumstances, which will simply the overall
	 * design as mm won't be freed unexpectedly.
	 *
	 * The corresponding refcount decrements are as follows:
	 * 1. the error handling branch of THIS function.
	 * 2. In sp_group_exit(). It's called only when process is exiting.
	 */
	mm = get_task_mm(tsk->group_leader);
	if (!mm) {
		ret = -ESRCH;
		goto out_put_task;
	}

	ret = sp_init_group_master(tsk, mm);
	if (ret)
		goto out_put_mm;

	if (mm->sp_group_master && mm->sp_group_master->tgid != tgid) {
		pr_err("add: task(%d) is a vfork child of the original task(%d)\n",
			tgid, mm->sp_group_master->tgid);
		ret = -EINVAL;
		goto out_put_mm;
	}
	*pmm = mm;

out_put_mm:
	if (ret)
		mmput(mm);
out_put_task:
	put_task_struct(tsk);

	return ret;
}

static void sp_area_put_locked(struct sp_area *spa);
static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size);
/**
 * mg_sp_group_add_task() - Add a process to an share group (sp_group).
 * @tgid: the tgid of the task to be added.
@@ -808,7 +1175,89 @@ static bool is_online_node_id(int node_id)
 */
int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id)
{
	int ret = 0;
	struct sp_area *spa;
	struct mm_struct *mm;
	struct sp_group *spg;
	struct rb_node *p, *n;
	struct sp_group_node *spg_node;

	if (!sp_is_enabled())
		return -EOPNOTSUPP;

	check_interrupt_context();

	/* only allow READ, READ | WRITE */
	if (!((prot == PROT_READ) || (prot == (PROT_READ | PROT_WRITE)))) {
		pr_err_ratelimited("prot is invalid 0x%lx\n", prot);
		return -EINVAL;
	}

	if (spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) {
		pr_err_ratelimited("add group failed, invalid group id %d\n", spg_id);
		return -EINVAL;
	}

	ret = mm_add_group_init(tgid, &mm);
	if (ret < 0)
		return ret;

	down_write(&sp_global_sem);
	spg = sp_group_get_or_alloc(spg_id);
	if (IS_ERR(spg)) {
		ret = PTR_ERR(spg);
		goto out_unlock;
	}
	/* save spg_id before we release sp_global_sem, or UAF may occur */
	spg_id = spg->id;

	down_write(&spg->rw_lock);
	ret = sp_group_link_task(mm, spg, prot, &spg_node);
	if (ret < 0)
		goto put_spg;

	/*
	 * create mappings of existing shared memory segments into this
	 * new process' page table.
	 */
	for (p = rb_first(&spg->spa_root); p; p = n) {
		n = rb_next(p);
		spa = container_of(p, struct sp_area, spg_link);

		if (!atomic_inc_not_zero(&spa->use_count)) {
			pr_warn("be careful, add new task(%d) to an exiting group(%d)\n",
					tgid, spg_id);
			continue;
		}

		ret = sp_map_spa_to_mm(mm, spa, prot, NULL, "add_task");
		sp_area_put_locked(spa);
		if (ret) {
			pr_warn("mmap old spa to new task failed, %d\n", ret);
			/* it makes no scene to skip error for coredump here */
			ret = ret < 0 ? ret : -EFAULT;

			for (p = rb_prev(p); p; p = n) {
				n = rb_prev(p);
				spa = container_of(p, struct sp_area, spg_link);
				if (!atomic_inc_not_zero(&spa->use_count))
					continue;
				sp_munmap(mm, spa->va_start, spa_size(spa));
				sp_area_put_locked(spa);
			}
			sp_group_unlink_task(spg_node);
			break;
		}
	}
put_spg:
	up_write(&spg->rw_lock);
	sp_group_put_locked(spg);
out_unlock:
	up_write(&sp_global_sem);
	/* We put the mm_struct later to protect the mm from exiting while sp_mmap */
	mmput(mm);

	return ret < 0 ? ret : spg_id;
}
EXPORT_SYMBOL_GPL(mg_sp_group_add_task);

@@ -1098,7 +1547,6 @@ static void sp_area_put_locked(struct sp_area *spa)
	}
}

static void sp_group_put(struct sp_group *spg) {}
static void sp_area_drop_func(struct work_struct *work)
{
	bool spa_zero;