Commit b756a3b5 authored by Alistair Popple's avatar Alistair Popple Committed by Linus Torvalds
Browse files

mm: device exclusive memory access

Some devices require exclusive write access to shared virtual memory (SVM)
ranges to perform atomic operations on that memory.  This requires CPU
page tables to be updated to deny access whilst atomic operations are
occurring.

In order to do this introduce a new swap entry type
(SWP_DEVICE_EXCLUSIVE).  When a SVM range needs to be marked for exclusive
access by a device all page table mappings for the particular range are
replaced with device exclusive swap entries.  This causes any CPU access
to the page to result in a fault.

Faults are resovled by replacing the faulting entry with the original
mapping.  This results in MMU notifiers being called which a driver uses
to update access permissions such as revoking atomic access.  After
notifiers have been called the device will no longer have exclusive access
to the region.

Walking of the page tables to find the target pages is handled by
get_user_pages() rather than a direct page table walk.  A direct page
table walk similar to what migrate_vma_collect()/unmap() does could also
have been utilised.  However this resulted in more code similar in
functionality to what get_user_pages() provides as page faulting is
required to make the PTEs present and to break COW.

[dan.carpenter@oracle.com: fix signedness bug in make_device_exclusive_range()]
  Link: https://lkml.kernel.org/r/YNIz5NVnZ5GiZ3u1@mwanda

Link: https://lkml.kernel.org/r/20210616105937.23201-8-apopple@nvidia.com


Signed-off-by: default avatarAlistair Popple <apopple@nvidia.com>
Signed-off-by: default avatarDan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 9a5cc85c
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
@@ -405,6 +405,23 @@ between device driver specific code and shared common code:

   The lock can now be released.

Exclusive access memory
=======================

Some devices have features such as atomic PTE bits that can be used to implement
atomic access to system memory. To support atomic operations to a shared virtual
memory page such a device needs access to that page which is exclusive of any
userspace access from the CPU. The ``make_device_exclusive_range()`` function
can be used to make a memory range inaccessible from userspace.

This replaces all mappings for pages in the given range with special swap
entries. Any attempt to access the swap entry results in a fault which is
resovled by replacing the entry with the original mapping. A driver gets
notified that the mapping has been changed by MMU notifiers, after which point
it will no longer have exclusive access to the page. Exclusive access is
guranteed to last until the driver drops the page lock and page reference, at
which point any CPU faults on the page may proceed as described.

Memory cgroup (memcg) and rss accounting
========================================

+6 −0
Original line number Diff line number Diff line
@@ -42,6 +42,11 @@ struct mmu_interval_notifier;
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * owner field matches the driver's device private pgmap owner.
 *
 * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
 * longer have exclusive access to the page. When sent during creation of an
 * exclusive range the owner will be initialised to the value provided by the
 * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
 */
enum mmu_notifier_event {
	MMU_NOTIFY_UNMAP = 0,
@@ -51,6 +56,7 @@ enum mmu_notifier_event {
	MMU_NOTIFY_SOFT_DIRTY,
	MMU_NOTIFY_RELEASE,
	MMU_NOTIFY_MIGRATE,
	MMU_NOTIFY_EXCLUSIVE,
};

#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+4 −0
Original line number Diff line number Diff line
@@ -194,6 +194,10 @@ int page_referenced(struct page *, int is_locked,
void try_to_migrate(struct page *page, enum ttu_flags flags);
void try_to_unmap(struct page *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, struct page **pages,
				void *arg);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migarion entries rather than present PTEs */
+7 −2
Original line number Diff line number Diff line
@@ -62,12 +62,17 @@ static inline int current_is_kswapd(void)
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_* entry.
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 2
#define SWP_DEVICE_NUM 4
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
#else
#define SWP_DEVICE_NUM 0
#endif
+43 −1
Original line number Diff line number Diff line
@@ -127,6 +127,27 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}

static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
	return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
}

static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
	return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
}

static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
	return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
		swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
}

static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
{
	return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
}
#else /* CONFIG_DEVICE_PRIVATE */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
@@ -147,6 +168,26 @@ static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
	return false;
}

static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
	return swp_entry(0, 0);
}

static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
	return swp_entry(0, 0);
}

static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
	return false;
}

static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
{
	return false;
}
#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
@@ -226,7 +267,8 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 */
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
	return is_migration_entry(entry) || is_device_private_entry(entry);
	return is_migration_entry(entry) || is_device_private_entry(entry) ||
	       is_device_exclusive_entry(entry);
}

struct page_vma_mapped_walk;
Loading