hugetlb: memcg: account hugetlb-backed memory in memory controller (d151025a) · Commits · EulixOS / Software / Kernel

Documentation/admin-guide/cgroup-v2.rst

+29 −0

Original line number	Diff line number	Diff line
		@@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options.
		relying on the original semantics (e.g. specifying bogusly
		high 'bypass' protection values at higher tree levels).

		memory_hugetlb_accounting
		Count HugeTLB memory usage towards the cgroup's overall
		memory usage for the memory controller (for the purpose of
		statistics reporting and memory protection). This is a new
		behavior that could regress existing setups, so it must be
		explicitly opted in with this mount option.

		A few caveats to keep in mind:

		* There is no HugeTLB pool management involved in the memory
		controller. The pre-allocated pool does not belong to anyone.
		Specifically, when a new HugeTLB folio is allocated to
		the pool, it is not accounted for from the perspective of the
		memory controller. It is only charged to a cgroup when it is
		actually used (e.g. at page fault time). Host memory
		overcommit management has to consider this when configuring
		hard limits. In general, HugeTLB pool management should be
		done via other mechanisms (such as the HugeTLB controller).
		* Failure to charge a HugeTLB folio to the memory controller
		results in SIGBUS. This could happen even if the HugeTLB pool
		still has pages available (but the cgroup limit is hit and
		reclaim attempt fails).
		* Charging HugeTLB memory towards the memory controller affects
		memory protection and reclaim dynamics. Any userspace tuning
		(of low and min limits, for example) needs to take this into account.
		* HugeTLB pages utilized while this option is not selected
		will not be tracked by the memory controller (even if cgroup
		v2 is remounted later on).


		Organizing Processes and Threads
		--------------------------------

include/linux/cgroup-defs.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -116,6 +116,11 @@ enum {
		* Enable recursive subtree protection
		*/
		CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),

		/*
		* Enable hugetlb accounting for the memory controller.
		*/
		CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
		};

		/* cftype->flags */

include/linux/memcontrol.h

+9 −0

Original line number	Diff line number	Diff line
		@@ -807,6 +807,9 @@ static inline int mem_cgroup_charge(struct folio folio, struct mm_struct mm,
		return __mem_cgroup_charge(folio, mm, gfp);
		}

		int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
		long nr_pages);

		int mem_cgroup_swapin_charge_folio(struct folio folio, struct mm_struct mm,
		gfp_t gfp, swp_entry_t entry);
		void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
		@@ -1407,6 +1410,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
		return 0;
		}

		/*
		 * No-op variant of mem_cgroup_hugetlb_try_charge(): ignores @memcg, @gfp
		 * and @nr_pages, charges nothing, and always returns 0.
		 *
		 * NOTE(review): this sits next to the other always-return-0 inline stubs
		 * in this header, so it is presumably the build-time fallback used when
		 * the memory controller is compiled out; the guarding #ifdef is outside
		 * this hunk, confirm before relying on that.
		 */
		static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
		gfp_t gfp, long nr_pages)
		{
		return 0;
		}

		static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
		struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
		{

kernel/cgroup/cgroup.c

+14 −1

Original line number	Diff line number	Diff line
		@@ -1906,6 +1906,7 @@ enum cgroup2_param {
		Opt_favordynmods,
		Opt_memory_localevents,
		Opt_memory_recursiveprot,
		Opt_memory_hugetlb_accounting,
		nr__cgroup2_params
		};

		@@ -1914,6 +1915,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
		fsparam_flag("favordynmods", Opt_favordynmods),
		fsparam_flag("memory_localevents", Opt_memory_localevents),
		fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
		fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
		{}
		};

		@@ -1940,6 +1942,9 @@ static int cgroup2_parse_param(struct fs_context fc, struct fs_parameter param
		case Opt_memory_recursiveprot:
		ctx->flags \|= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		return 0;
		case Opt_memory_hugetlb_accounting:
		ctx->flags \|= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
		return 0;
		}
		return -EINVAL;
		}
		@@ -1964,6 +1969,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
		cgrp_dfl_root.flags \|= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
		else
		cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;

		if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
		cgrp_dfl_root.flags \|= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
		else
		cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
		}
		}

		@@ -1977,6 +1987,8 @@ static int cgroup_show_options(struct seq_file seq, struct kernfs_root kf_root
		seq_puts(seq, ",memory_localevents");
		if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
		seq_puts(seq, ",memory_recursiveprot");
		if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
		seq_puts(seq, ",memory_hugetlb_accounting");
		return 0;
		}

		@@ -7163,7 +7175,8 @@ static ssize_t features_show(struct kobject kobj, struct kobj_attribute attr,
		"nsdelegate\n"
		"favordynmods\n"
		"memory_localevents\n"
		"memory_recursiveprot\n");
		"memory_recursiveprot\n"
		"memory_hugetlb_accounting\n");
		}
		static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

mm/hugetlb.c

+28 −8

Original line number	Diff line number	Diff line
		@@ -1947,7 +1947,7 @@ void free_huge_folio(struct folio *folio)
		pages_per_huge_page(h), folio);
		hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
		pages_per_huge_page(h), folio);

		mem_cgroup_uncharge(folio);
		if (page_from_dynamic_pool(folio_page(folio, 0))) {
		list_del(&folio->lru);
		spin_unlock_irqrestore(&hugetlb_lock, flags);
		@@ -3144,11 +3144,20 @@ struct folio alloc_hugetlb_folio(struct vm_area_struct vma,
		struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file));
		struct hstate *h = hstate_vma(vma);
		struct folio *folio;
		long map_chg, map_commit;
		long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
		long gbl_chg;
		int ret, idx;
		int memcg_charge_ret, ret, idx;
		struct hugetlb_cgroup *h_cg = NULL;
		struct mem_cgroup *memcg;
		bool deferred_reserve;
		gfp_t gfp = htlb_alloc_mask(h) \| __GFP_RETRY_MAYFAIL;

		memcg = get_mem_cgroup_from_current();
		memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
		if (memcg_charge_ret == -ENOMEM) {
		mem_cgroup_put(memcg);
		return ERR_PTR(-ENOMEM);
		}

		idx = hstate_index(h);
		/*
		@@ -3157,8 +3166,12 @@ struct folio alloc_hugetlb_folio(struct vm_area_struct vma,
		* code of zero indicates a reservation exists (no change).
		*/
		map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
		if (map_chg < 0)
		if (map_chg < 0) {
		if (!memcg_charge_ret)
		mem_cgroup_cancel_charge(memcg, nr_pages);
		mem_cgroup_put(memcg);
		return ERR_PTR(-ENOMEM);
		}

		/*
		* Processes that did not create the mapping will have no
		@@ -3169,10 +3182,8 @@ struct folio alloc_hugetlb_folio(struct vm_area_struct vma,
		*/
		if (map_chg \|\| avoid_reserve) {
		gbl_chg = hugepage_subpool_get_pages(spool, 1, info);
		if (gbl_chg < 0) {
		vma_end_reservation(h, vma, addr);
		return ERR_PTR(-ENOSPC);
		}
		if (gbl_chg < 0)
		goto out_end_reservation;

		/*
		* Even though there was no reservation in the region/reserve
		@@ -3268,6 +3279,11 @@ struct folio alloc_hugetlb_folio(struct vm_area_struct vma,
		hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
		pages_per_huge_page(h), folio);
		}

		if (!memcg_charge_ret)
		mem_cgroup_commit_charge(folio, memcg);
		mem_cgroup_put(memcg);

		return folio;

		out_uncharge_cgroup:
		@@ -3279,7 +3295,11 @@ struct folio alloc_hugetlb_folio(struct vm_area_struct vma,
		out_subpool_put:
		if (map_chg \|\| avoid_reserve)
		hugepage_subpool_put_pages(spool, 1, info);
		out_end_reservation:
		vma_end_reservation(h, vma, addr);
		if (!memcg_charge_ret)
		mem_cgroup_cancel_charge(memcg, nr_pages);
		mem_cgroup_put(memcg);
		return ERR_PTR(-ENOSPC);
		}