Commit aa55dfd3 authored by Andrii Nakryiko
Browse files

Merge branch 'Parameterize task iterators.'

Kui-Feng Lee says:

====================

Allow creating an iterator that loops through resources of one task/thread.

Previously, iterators could only loop through the files, vmas, and
tasks of every task in the system, even when the user was interested in
only the resources of a specific task or process.  By passing additional
parameters, users can now create an iterator that goes through either
all resources or only the resources of one task.

Major Changes:

 - Add new parameters to bpf_iter_link_info to select whether to go
   through all tasks or only a specific task.

 - Change the implementations of BPF iterators of vma, files, and
   tasks to allow going through only the resources of a specific task.

 - Provide the arguments of parameterized task iterators in
   bpf_link_info.

Differences from v10:

 - Check pid_alive() to avoid potential errors.

Differences from v9:

  - Fix the boundary check of computing page_shift.

  - Reword the reason for checking and returning the same task.

Differences from v8:

 - Fix uninitialized variable.

 - Avoid redundant work of getting task from pid.

 - Change format string to use %u instead of %d.

 - Use the value of page_shift to compute correct offset in
   bpf_iter_vm_offset.c.

Differences from v7:

 - Traverse the tasks of a process through the task_group linked list
   instead of traversing the whole namespace.

Differences from v6:

 - Add part 5 to make bpftool show the value of parameters.

 - Change the wording of show_fdinfo() to show pid or tid instead of
   always pid.

 - Simplify error handling and naming of test cases.

Differences from v5:

 - Use user-space tid/pid terminologies in bpf_iter_link_info and
   bpf_link_info.

 - Fix reference count

 - Merge all variants to one 'u32 pid' in internal structs.
   (bpf_iter_aux_info and bpf_iter_seq_task_common)

 - Compare the result of get_uprobe_offset() with that of an
   implementation based on the vma iterators.

 - Implement show_fdinfo.

Differences from v4:

 - Remove 'type' from bpf_iter_link_info and bpf_link_info.

v10: https://lore.kernel.org/all/20220831181039.2680134-1-kuifeng@fb.com/
v9: https://lore.kernel.org/bpf/20220829192317.486946-1-kuifeng@fb.com/
v8: https://lore.kernel.org/bpf/20220829192317.486946-1-kuifeng@fb.com/
v7: https://lore.kernel.org/bpf/20220826003712.2810158-1-kuifeng@fb.com/
v6: https://lore.kernel.org/bpf/20220819220927.3409575-1-kuifeng@fb.com/
v5: https://lore.kernel.org/bpf/20220811001654.1316689-1-kuifeng@fb.com/
v4: https://lore.kernel.org/bpf/20220809195429.1043220-1-kuifeng@fb.com/
v3: https://lore.kernel.org/bpf/20220809063501.667610-1-kuifeng@fb.com/
v2: https://lore.kernel.org/bpf/20220801232649.2306614-1-kuifeng@fb.com/
v1: https://lore.kernel.org/bpf/20220726051713.840431-1-kuifeng@fb.com/


====================

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
parents 87dbdc23 6bdb6d6b
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
	extern int bpf_iter_ ## target(args);			\
	int __init bpf_iter_ ## target(args) { return 0; }

/*
 * Scope selector for BPF task iterators.
 *
 * A BPF task iterator can be parameterized so that it visits only a
 * subset of the tasks instead of all of them.  The enumerator values
 * are spelled out because the type is also used as an array index.
 */
enum bpf_iter_task_type {
	/* Default: iterate over resources of every task. */
	BPF_TASK_ITER_ALL	= 0,
	/* Iterate over resources of a single task/tid. */
	BPF_TASK_ITER_TID	= 1,
	/* Iterate over resources of every task of a process / task group. */
	BPF_TASK_ITER_TGID	= 2,
};

struct bpf_iter_aux_info {
	/* for map_elem iter */
	struct bpf_map *map;
@@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info {
		struct cgroup *start; /* starting cgroup */
		enum bpf_cgroup_iter_order order;
	} cgroup;
	struct {
		enum bpf_iter_task_type	type;
		u32 pid;
	} task;
};

typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
+10 −0
Original line number Diff line number Diff line
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
		__u32	cgroup_fd;
		__u64	cgroup_id;
	} cgroup;
	/* Parameters of task iterators. */
	struct {
		__u32	tid;
		__u32	pid;
		__u32	pid_fd;
	} task;
};

/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6259,6 +6265,10 @@ struct bpf_link_info {
					__u64 cgroup_id;
					__u32 order;
				} cgroup;
				struct {
					__u32 tid;
					__u32 pid;
				} task;
			};
		} iter;
		struct  {
+202 −22
Original line number Diff line number Diff line
@@ -10,8 +10,17 @@
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"

/* Printable names of the task iterator types, indexed by the numeric
 * value of enum bpf_iter_task_type.
 */
static const char * const iter_task_type_names[] = {
	[0] = "ALL",	/* BPF_TASK_ITER_ALL */
	[1] = "TID",	/* BPF_TASK_ITER_TID */
	[2] = "PID",	/* BPF_TASK_ITER_TGID */
};

/* Iteration state embedded as ->common in the private data of the task,
 * task_file and task_vma iterators.
 */
struct bpf_iter_seq_task_common {
	/* pid namespace used for every pid lookup and pid-number translation */
	struct pid_namespace *ns;
	/* which tasks to visit; copied from aux->task.type at init time */
	enum bpf_iter_task_type	type;
	/* tid (BPF_TASK_ITER_TID) or tgid (BPF_TASK_ITER_TGID) to visit;
	 * copied from aux->task.pid at init time
	 */
	u32 pid;
	/* tid of the thread most recently handed out while walking a thread
	 * group; written only by task_group_seq_get_next()
	 */
	u32 pid_visiting;
};

struct bpf_iter_seq_task_info {
@@ -22,18 +31,115 @@ struct bpf_iter_seq_task_info {
	u32 tid;
};

static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
/* Get the next thread of the thread group identified by common->pid.
 *
 * @common: shared iterator state.  ->pid is the tgid being walked, ->ns is
 *	    the pid namespace used for lookups and ->pid_visiting is the tid
 *	    of the thread that was handed out most recently.
 * @tid: in/out cursor.  A value of 0 requests the first task, which is
 *	 found by looking up common->pid as a TGID.  On success it is set
 *	 to the tid of the returned task; when the walk is over it is left
 *	 at its previous value.
 * @skip_if_dup_files: skip threads whose files table is the one used by
 *	 their group leader.
 *
 * The caller in this file invokes this function under rcu_read_lock().
 *
 * Return: a task with an elevated reference count that the caller has to
 * release with put_task_struct(), or NULL when no further task exists.
 */
static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
						   u32 *tid,
						   bool skip_if_dup_files)
{
	struct task_struct *task, *next_task;
	struct pid *pid;
	u32 saved_tid;

	if (!*tid) {
		/* The first time, the iterator calls this function. */
		pid = find_pid_ns(common->pid, common->ns);
		if (!pid)
			return NULL;

		task = get_pid_task(pid, PIDTYPE_TGID);
		if (!task)
			return NULL;

		*tid = common->pid;
		common->pid_visiting = common->pid;

		return task;
	}

	/* If the control returns to user space and comes back to the
	 * kernel again, *tid and common->pid_visiting should be the
	 * same for task_seq_start() to pick up the correct task.
	 */
	if (*tid == common->pid_visiting) {
		pid = find_pid_ns(common->pid_visiting, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);

		return task;
	}

	/* Resume the walk from the thread that was handed out last. */
	pid = find_pid_ns(common->pid_visiting, common->ns);
	if (!pid)
		return NULL;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return NULL;

retry:
	if (!pid_alive(task)) {
		put_task_struct(task);
		return NULL;
	}

	next_task = next_thread(task);
	/* From here on 'task' must not be dereferenced anymore. */
	put_task_struct(task);
	if (!next_task)
		return NULL;

	saved_tid = *tid;
	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
	if (!*tid || *tid == common->pid) {
		/* Run out of tasks of a process.  The tasks of a
		 * thread_group are linked as circular linked list.
		 */
		*tid = saved_tid;
		return NULL;
	}

	get_task_struct(next_task);
	common->pid_visiting = *tid;

	/* The task about to be returned or skipped is next_task, so the
	 * duplicate-files test has to look at next_task.  Testing 'task'
	 * here would inspect the previous thread, whose reference has
	 * already been dropped above.
	 */
	if (skip_if_dup_files && next_task->files == next_task->group_leader->files) {
		task = next_task;
		goto retry;
	}

	return next_task;
}

static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
					     u32 *tid,
					     bool skip_if_dup_files)
{
	struct task_struct *task = NULL;
	struct pid *pid;

	if (common->type == BPF_TASK_ITER_TID) {
		if (*tid && *tid != common->pid)
			return NULL;
		rcu_read_lock();
		pid = find_pid_ns(common->pid, common->ns);
		if (pid) {
			task = get_pid_task(pid, PIDTYPE_TGID);
			*tid = common->pid;
		}
		rcu_read_unlock();

		return task;
	}

	if (common->type == BPF_TASK_ITER_TGID) {
		rcu_read_lock();
		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
		rcu_read_unlock();

		return task;
	}

	rcu_read_lock();
retry:
	pid = find_ge_pid(*tid, ns);
	pid = find_ge_pid(*tid, common->ns);
	if (pid) {
		*tid = pid_nr_ns(pid, ns);
		*tid = pid_nr_ns(pid, common->ns);
		task = get_pid_task(pid, PIDTYPE_PID);
		if (!task) {
			++*tid;
@@ -56,7 +162,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
	struct bpf_iter_seq_task_info *info = seq->private;
	struct task_struct *task;

	task = task_seq_get_next(info->common.ns, &info->tid, false);
	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

@@ -73,7 +179,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
	++*pos;
	++info->tid;
	put_task_struct((struct task_struct *)v);
	task = task_seq_get_next(info->common.ns, &info->tid, false);
	task = task_seq_get_next(&info->common, &info->tid, false);
	if (!task)
		return NULL;

@@ -117,6 +223,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
		put_task_struct((struct task_struct *)v);
}

/* Convert the user supplied task iterator parameters into aux->task.
 *
 * @prog: the BPF program being attached (not used here).
 * @linfo: link parameters; at most one of task.tid, task.pid and
 *	   task.pid_fd may be non-zero.
 * @aux: receives the iterator type and the tid/tgid to visit.
 *
 * A pid_fd is resolved to a pid number in the pid namespace of the
 * current task and is then treated like task.pid.
 *
 * Return: 0 on success, -EINVAL if more than one selector is set, or the
 * error reported by pidfd_get_pid().
 */
static int bpf_iter_attach_task(struct bpf_prog *prog,
				union bpf_iter_link_info *linfo,
				struct bpf_iter_aux_info *aux)
{
	unsigned int flags;
	struct pid *pid;
	int nr_selectors = 0;

	if (linfo->task.tid)
		nr_selectors++;
	if (linfo->task.pid)
		nr_selectors++;
	if (linfo->task.pid_fd)
		nr_selectors++;
	if (nr_selectors > 1)
		return -EINVAL;

	/* At most one selector is set from here on. */
	if (linfo->task.tid) {
		aux->task.type = BPF_TASK_ITER_TID;
		aux->task.pid = linfo->task.tid;
		return 0;
	}

	if (linfo->task.pid) {
		aux->task.type = BPF_TASK_ITER_TGID;
		aux->task.pid = linfo->task.pid;
		return 0;
	}

	if (!linfo->task.pid_fd) {
		/* No selector at all: visit every task. */
		aux->task.type = BPF_TASK_ITER_ALL;
		return 0;
	}

	aux->task.type = BPF_TASK_ITER_TGID;

	pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
	if (IS_ERR(pid))
		return PTR_ERR(pid);

	aux->task.pid = pid_nr_ns(pid, task_active_pid_ns(current));
	put_pid(pid);

	return 0;
}

static const struct seq_operations task_seq_ops = {
	.start	= task_seq_start,
	.next	= task_seq_next,
@@ -137,8 +278,7 @@ struct bpf_iter_seq_task_file_info {
static struct file *
task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	u32 curr_tid = info->tid;
	u32 saved_tid = info->tid;
	struct task_struct *curr_task;
	unsigned int curr_fd = info->fd;

@@ -151,22 +291,19 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
		curr_task = info->task;
		curr_fd = info->fd;
	} else {
                curr_task = task_seq_get_next(ns, &curr_tid, true);
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                if (!curr_task) {
                        info->task = NULL;
                        info->tid = curr_tid;
                        return NULL;
                }

                /* set info->task and info->tid */
		/* set info->task */
		info->task = curr_task;
		if (curr_tid == info->tid) {
		if (saved_tid == info->tid)
			curr_fd = info->fd;
		} else {
			info->tid = curr_tid;
		else
			curr_fd = 0;
	}
	}

	rcu_read_lock();
	for (;; curr_fd++) {
@@ -186,9 +323,15 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
	/* the current task is done, go to the next task */
	rcu_read_unlock();
	put_task_struct(curr_task);

	if (info->common.type == BPF_TASK_ITER_TID) {
		info->task = NULL;
		return NULL;
	}

	info->task = NULL;
	info->fd = 0;
	curr_tid = ++(info->tid);
	saved_tid = ++(info->tid);
	goto again;
}

@@ -269,6 +412,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
	struct bpf_iter_seq_task_common *common = priv_data;

	common->ns = get_pid_ns(task_active_pid_ns(current));
	common->type = aux->task.type;
	common->pid = aux->task.pid;

	return 0;
}

@@ -307,11 +453,10 @@ enum bpf_task_vma_iter_find_op {
static struct vm_area_struct *
task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
{
	struct pid_namespace *ns = info->common.ns;
	enum bpf_task_vma_iter_find_op op;
	struct vm_area_struct *curr_vma;
	struct task_struct *curr_task;
	u32 curr_tid = info->tid;
	u32 saved_tid = info->tid;

	/* If this function returns a non-NULL vma, it holds a reference to
	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +516,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
		}
	} else {
again:
		curr_task = task_seq_get_next(ns, &curr_tid, true);
		curr_task = task_seq_get_next(&info->common, &info->tid, true);
		if (!curr_task) {
			info->tid = curr_tid + 1;
			info->tid++;
			goto finish;
		}

		if (curr_tid != info->tid) {
			info->tid = curr_tid;
		if (saved_tid != info->tid) {
			/* new task, process the first vma */
			op = task_vma_iter_first_vma;
		} else {
@@ -430,9 +574,12 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
	return curr_vma;

next_task:
	if (info->common.type == BPF_TASK_ITER_TID)
		goto finish;

	put_task_struct(curr_task);
	info->task = NULL;
	curr_tid++;
	info->tid++;
	goto again;

finish:
@@ -531,8 +678,33 @@ static const struct bpf_iter_seq_info task_seq_info = {
	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
};

/* Report the task iterator parameters in bpf_link_info.
 *
 * Stores aux->task.pid in info->iter.task.tid for a TID iterator and in
 * info->iter.task.pid for a TGID iterator; any other type leaves @info
 * untouched.  Always returns 0.
 */
static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
{
	if (aux->task.type == BPF_TASK_ITER_TID)
		info->iter.task.tid = aux->task.pid;
	else if (aux->task.type == BPF_TASK_ITER_TGID)
		info->iter.task.pid = aux->task.pid;

	return 0;
}

/* Print the task iterator parameters to @seq: the type name first, then
 * a "tid" or "pid" line when the iterator is bound to a single task or
 * to a process.
 */
static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
{
	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);

	switch (aux->task.type) {
	case BPF_TASK_ITER_TID:
		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
		break;
	case BPF_TASK_ITER_TGID:
		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
		break;
	default:
		/* Nothing else to print for the other types. */
		break;
	}
}

static struct bpf_iter_reg task_reg_info = {
	.target			= "task",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
@@ -540,6 +712,8 @@ static struct bpf_iter_reg task_reg_info = {
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -551,6 +725,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {

static struct bpf_iter_reg task_file_reg_info = {
	.target			= "task_file",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
@@ -560,6 +735,8 @@ static struct bpf_iter_reg task_file_reg_info = {
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_file_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -571,6 +748,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {

static struct bpf_iter_reg task_vma_reg_info = {
	.target			= "task_vma",
	.attach_target		= bpf_iter_attach_task,
	.feature		= BPF_ITER_RESCHED,
	.ctx_arg_info_size	= 2,
	.ctx_arg_info		= {
@@ -580,6 +758,8 @@ static struct bpf_iter_reg task_vma_reg_info = {
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &task_vma_seq_info,
	.fill_link_info		= bpf_iter_fill_link_info,
	.show_fdinfo		= bpf_iter_task_show_fdinfo,
};

BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+19 −0
Original line number Diff line number Diff line
@@ -106,6 +106,13 @@ static const char *cgroup_order_string(__u32 order)
	}
}

/* Return true if @target_name names one of the task based iterator
 * targets ("task", "task_file" or "task_vma").
 */
static bool is_iter_task_target(const char *target_name)
{
	static const char * const task_targets[] = {
		"task",
		"task_file",
		"task_vma",
	};
	size_t i;

	for (i = 0; i < sizeof(task_targets) / sizeof(task_targets[0]); i++) {
		if (!strcmp(target_name, task_targets[i]))
			return true;
	}

	return false;
}

static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)
{
	const char *target_name = u64_to_ptr(info->iter.target_name);
@@ -114,6 +121,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)

	if (is_iter_map_target(target_name))
		jsonw_uint_field(wtr, "map_id", info->iter.map.map_id);
	else if (is_iter_task_target(target_name)) {
		if (info->iter.task.tid)
			jsonw_uint_field(wtr, "tid", info->iter.task.tid);
		else if (info->iter.task.pid)
			jsonw_uint_field(wtr, "pid", info->iter.task.pid);
	}

	if (is_iter_cgroup_target(target_name)) {
		jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id);
@@ -237,6 +250,12 @@ static void show_iter_plain(struct bpf_link_info *info)

	if (is_iter_map_target(target_name))
		printf("map_id %u  ", info->iter.map.map_id);
	else if (is_iter_task_target(target_name)) {
		if (info->iter.task.tid)
			printf("tid %u ", info->iter.task.tid);
		else if (info->iter.task.pid)
			printf("pid %u ", info->iter.task.pid);
	}

	if (is_iter_cgroup_target(target_name)) {
		printf("cgroup_id %llu  ", info->iter.cgroup.cgroup_id);
+10 −0
Original line number Diff line number Diff line
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
		__u32	cgroup_fd;
		__u64	cgroup_id;
	} cgroup;
	/* Parameters of task iterators. */
	struct {
		__u32	tid;
		__u32	pid;
		__u32	pid_fd;
	} task;
};

/* BPF syscall commands, see bpf(2) man-page for more details. */
@@ -6259,6 +6265,10 @@ struct bpf_link_info {
					__u64 cgroup_id;
					__u32 order;
				} cgroup;
				struct {
					__u32 tid;
					__u32 pid;
				} task;
			};
		} iter;
		struct  {
Loading