fs/dax.c +128 −301

@@ -93,12 +93,6 @@ static unsigned long dax_to_pfn(void *entry)
 	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
-static void *dax_make_locked(unsigned long pfn, unsigned long flags)
-{
-	return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
-			DAX_LOCKED);
-}
-
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));

@@ -155,10 +149,11 @@ struct wait_exceptional_entry_queue {
 	struct exceptional_entry_key key;
 };
 
-static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
-		pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+		void *entry, struct exceptional_entry_key *key)
 {
 	unsigned long hash;
+	unsigned long index = xas->xa_index;
 
 	/*
 	 * If 'entry' is a PMD, align the 'index' that we use for the wait

@@ -167,11 +162,10 @@
 	 */
 	if (dax_is_pmd_entry(entry))
 		index &= ~PG_PMD_COLOUR;
-	key->xa = xa;
+	key->xa = xas->xa;
 	key->entry_start = index;
 
-	hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
 	return wait_table + hash;
 }

@@ -193,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
  * The important information it's conveying is whether the entry at
  * this index used to be a PMD entry.
  */
-static void dax_wake_mapping_entry_waiter(struct xarray *xa,
-		pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
 {
 	struct exceptional_entry_key key;
 	wait_queue_head_t *wq;
 
-	wq = dax_entry_waitqueue(xa, index, entry, &key);
+	wq = dax_entry_waitqueue(xas, entry, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens

@@ -211,12 +204,6 @@
 	__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
-{
-	return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry,
-			wake_all);
-}
-
 /*
  * Look up entry in page cache, wait for it to become unlocked if it
  * is a DAX entry and return it. The caller must subsequently call

@@ -241,8 +228,7 @@ static void *get_unlocked_entry(struct xa_state *xas)
 				!dax_is_locked(entry))
 			return entry;
 
-		wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry,
-				&ewait.key);
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		xas_unlock_irq(xas);
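A note on the waitqueue change above: with an xa_state, waiters are still hashed into a fixed table, now keyed by the xarray pointer and the (PMD-aligned) index taken from the cursor rather than passed separately. Below is a minimal user-space model of the bucket selection, assuming a 64-bit unsigned long, the kernel's multiplicative hash constant, and the DAX_WAIT_TABLE_BITS value from fs/dax.c of this era; the sample pointer and index are made up.

#include <stdio.h>

#define DAX_WAIT_TABLE_BITS	12
#define DAX_WAIT_TABLE_ENTRIES	(1 << DAX_WAIT_TABLE_BITS)
#define GOLDEN_RATIO_64		0x61C8864680B583EBull

/* Models the kernel's hash_long()/hash_64() on a 64-bit machine */
static unsigned long hash_long(unsigned long val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_64) >> (64 - bits);
}

int main(void)
{
	unsigned long xa = 0xffff888012345678;	/* address of the xarray */
	unsigned long index = 0x205;		/* pgoff of the faulting page */

	/* A PMD entry would first be aligned: index &= ~PG_PMD_COLOUR */
	unsigned long bucket = hash_long(xa ^ index, DAX_WAIT_TABLE_BITS);

	printf("waiters for (%#lx, %#lx) queue on bucket %lu of %d\n",
	       xa, index, bucket, DAX_WAIT_TABLE_ENTRIES);
	return 0;
}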
@@ -286,138 +272,6 @@ static void *dax_lock_entry(struct xa_state *xas, void *entry)
 	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
 }
 
-/*
- * Check whether the given slot is locked.  Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	return entry & DAX_LOCKED;
-}
-
-/*
- * Mark the given slot as locked.  Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v | DAX_LOCKED);
-
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Mark the given slot as unlocked.  Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v & ~DAX_LOCKED);
-
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Lookup entry in page cache, wait for it to become unlocked if it is
- * a DAX entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
- *
- * Must be called with the i_pages lock held.
- */
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp, bool (*wait_fn)(void))
-{
-	void *entry, **slot;
-	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq;
-
-	init_wait(&ewait.wait);
-	ewait.wait.func = wake_exceptional_entry_func;
-
-	for (;;) {
-		bool revalidate;
-
-		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
-					  &slot);
-		if (!entry ||
-		    WARN_ON_ONCE(!xa_is_value(entry)) ||
-		    !slot_locked(mapping, slot)) {
-			if (slotp)
-				*slotp = slot;
-			return entry;
-		}
-
-		wq = dax_entry_waitqueue(&mapping->i_pages, index, entry,
-				&ewait.key);
-		prepare_to_wait_exclusive(wq, &ewait.wait,
-					  TASK_UNINTERRUPTIBLE);
-		xa_unlock_irq(&mapping->i_pages);
-		revalidate = wait_fn();
-		finish_wait(wq, &ewait.wait);
-		xa_lock_irq(&mapping->i_pages);
-		if (revalidate)
-			return ERR_PTR(-EAGAIN);
-	}
-}
-
-static bool entry_wait(void)
-{
-	schedule();
-	/*
-	 * Never return an ERR_PTR() from
-	 * __get_unlocked_mapping_entry(), just keep looping.
-	 */
-	return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp)
-{
-	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	void *entry, **slot;
-
-	xa_lock_irq(&mapping->i_pages);
-	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
-	if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
-			 !slot_locked(mapping, slot))) {
-		xa_unlock_irq(&mapping->i_pages);
-		return;
-	}
-	unlock_slot(mapping, slot);
-	xa_unlock_irq(&mapping->i_pages);
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
-		pgoff_t index)
-{
-	unlock_mapping_entry(mapping, index);
-}
-
-/*
- * Called when we are done with page cache entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry)
-{
-	if (!entry)
-		return;
-
-	/* We have to wake up next waiter for the page cache entry lock */
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
 static unsigned long dax_entry_size(void *entry)
 {
 	if (dax_is_zero_entry(entry))
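All of the slot helpers removed above manipulated a single lock bit inside an XArray value entry; dax_lock_entry() keeps the same encoding and stores the result with xas_store(). A minimal user-space sketch of just the bit manipulation, assuming DAX_LOCKED is bit 0 of the value as in this era of fs/dax.c:

#include <assert.h>
#include <stdio.h>

#define DAX_LOCKED	(1UL << 0)	/* assumed lock bit inside the value */

/* XArray value entries are odd pointers: value v is stored as (v << 1) | 1 */
static void *xa_mk_value(unsigned long v) { return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(const void *e) { return (unsigned long)e >> 1; }

static int dax_is_locked(const void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

int main(void)
{
	void *entry = xa_mk_value(0x12340);	/* some unlocked DAX entry */

	assert(!dax_is_locked(entry));
	entry = xa_mk_value(xa_to_value(entry) | DAX_LOCKED);	/* lock */
	assert(dax_is_locked(entry));
	entry = xa_mk_value(xa_to_value(entry) & ~DAX_LOCKED);	/* unlock */
	assert(!dax_is_locked(entry));
	printf("lock bit round-trips inside the value entry\n");
	return 0;
}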
@@ -558,47 +412,52 @@ void dax_unlock_mapping_entry(struct page *page)
  * that index, add a locked empty entry.
  *
  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
  *
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries.  This happens for both PMD zero pages as
+ * well as PMD empty entries.
  *
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them.  We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them.  We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
  *
  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  * persistent memory the benefit is doubtful. We can add that later if we can
  * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR.  Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
+ * overlap with xarray value entries.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
-		unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+		struct address_space *mapping, unsigned long size_flag)
 {
-	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
-	void *entry, **slot;
+	unsigned long index = xas->xa_index;
+	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+	void *entry;
 
-restart:
-	xa_lock_irq(&mapping->i_pages);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-
-	if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
-		entry = ERR_PTR(-EIO);
-		goto out_unlock;
-	}
+retry:
+	xas_lock_irq(xas);
+	entry = get_unlocked_entry(xas);
+	if (xa_is_internal(entry))
+		goto fallback;
 
 	if (entry) {
+		if (WARN_ON_ONCE(!xa_is_value(entry))) {
+			xas_set_err(xas, EIO);
+			goto out_unlock;
+		}
+
 		if (size_flag & DAX_PMD) {
 			if (dax_is_pte_entry(entry)) {
-				put_unlocked_mapping_entry(mapping, index,
-						entry);
-				entry = ERR_PTR(-EEXIST);
-				goto out_unlock;
+				put_unlocked_entry(xas, entry);
+				goto fallback;
 			}
 		} else { /* trying to grab a PTE entry */
 			if (dax_is_pmd_entry(entry) &&
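The new comment's point about ERR_PTR values overlapping value entries can be made concrete. Internal entries carry their payload above two tag bits (binary 10), while value entries are odd, so a VM_FAULT code wrapped by xa_mk_internal() can never be confused with a real DAX entry. A standalone sketch of the round trip; the numeric VM_FAULT_FALLBACK value is copied in here as an assumption for the demo:

#include <assert.h>
#include <stdio.h>

#define VM_FAULT_FALLBACK 0x800

/* Internal entries: payload above two tag bits, low bits == binary 10 */
static void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); }
static unsigned long xa_to_internal(const void *e) { return (unsigned long)e >> 2; }
static int xa_is_internal(const void *e) { return ((unsigned long)e & 3) == 2; }
/* Value entries are odd pointers, so the two encodings never collide */
static int xa_is_value(const void *e) { return (unsigned long)e & 1; }

int main(void)
{
	void *err = xa_mk_internal(VM_FAULT_FALLBACK);

	assert(xa_is_internal(err) && !xa_is_value(err));
	assert(xa_to_internal(err) == VM_FAULT_FALLBACK);
	printf("fault code round-trips: %#lx\n", xa_to_internal(err));
	return 0;
}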
@@ -609,87 +468,57 @@ static void *grab_mapping_entry(struct xa_state *xas,
 		}
 	}
 
-	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry || pmd_downgrade) {
-		int err;
-
-		if (pmd_downgrade) {
-			/*
-			 * Make sure 'entry' remains valid while we drop
-			 * the i_pages lock.
-			 */
-			entry = lock_slot(mapping, slot);
-		}
+	if (pmd_downgrade) {
+		/*
+		 * Make sure 'entry' remains valid while we drop
+		 * the i_pages lock.
+		 */
+		dax_lock_entry(xas, entry);
 
-		xa_unlock_irq(&mapping->i_pages);
 		/*
 		 * Besides huge zero pages the only other thing that gets
 		 * downgraded are empty entries which don't need to be
 		 * unmapped.
 		 */
-		if (pmd_downgrade && dax_is_zero_entry(entry))
-			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
-
-		err = radix_tree_preload(
-				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err) {
-			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index);
-			return ERR_PTR(err);
-		}
-		xa_lock_irq(&mapping->i_pages);
-
-		if (!entry) {
-			/*
-			 * We needed to drop the i_pages lock while calling
-			 * radix_tree_preload() and we didn't have an entry to
-			 * lock.  See if another thread inserted an entry at
-			 * our index during this time.
-			 */
-			entry = __radix_tree_lookup(&mapping->i_pages, index,
-					NULL, &slot);
-			if (entry) {
-				radix_tree_preload_end();
-				xa_unlock_irq(&mapping->i_pages);
-				goto restart;
-			}
+		if (dax_is_zero_entry(entry)) {
+			xas_unlock_irq(xas);
+			unmap_mapping_pages(mapping,
+					xas->xa_index & ~PG_PMD_COLOUR,
+					PG_PMD_NR, false);
+			xas_reset(xas);
+			xas_lock_irq(xas);
 		}
 
-		if (pmd_downgrade) {
-			dax_disassociate_entry(entry, mapping, false);
-			radix_tree_delete(&mapping->i_pages, index);
-			mapping->nrexceptional--;
-			dax_wake_mapping_entry_waiter(&mapping->i_pages,
-					index, entry, true);
-		}
+		dax_disassociate_entry(entry, mapping, false);
+		xas_store(xas, NULL);	/* undo the PMD join */
+		dax_wake_entry(xas, entry, true);
+		mapping->nrexceptional--;
+		entry = NULL;
+		xas_set(xas, index);
+	}
 
-		entry = dax_make_locked(0, size_flag | DAX_EMPTY);
-
-		err = __radix_tree_insert(&mapping->i_pages, index,
-				dax_entry_order(entry), entry);
-		radix_tree_preload_end();
-		if (err) {
-			xa_unlock_irq(&mapping->i_pages);
-			/*
-			 * Our insertion of a DAX entry failed, most likely
-			 * because we were inserting a PMD entry and it
-			 * collided with a PTE sized entry at a different
-			 * index in the PMD range.  We haven't inserted
-			 * anything into the radix tree and have no waiters to
-			 * wake.
-			 */
-			return ERR_PTR(err);
-		}
-		/* Good, we have inserted empty locked entry into the tree. */
-		mapping->nrexceptional++;
-		xa_unlock_irq(&mapping->i_pages);
-		return entry;
+	if (entry) {
+		dax_lock_entry(xas, entry);
+	} else {
+		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		dax_lock_entry(xas, entry);
+		if (xas_error(xas))
+			goto out_unlock;
+		mapping->nrexceptional++;
 	}
-	entry = lock_slot(mapping, slot);
- out_unlock:
-	xa_unlock_irq(&mapping->i_pages);
+
+out_unlock:
+	xas_unlock_irq(xas);
+	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+		goto retry;
+	if (xas->xa_node == XA_ERROR(-ENOMEM))
+		return xa_mk_internal(VM_FAULT_OOM);
+	if (xas_error(xas))
+		return xa_mk_internal(VM_FAULT_SIGBUS);
 	return entry;
+fallback:
+	xas_unlock_irq(xas);
+	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
 /**
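The retry label above replaces radix_tree_preload(): an allocation failure is parked on the xa_state, and xas_nomem() performs the allocation and asks for a retry after the lock has been dropped. This is the store idiom documented in Documentation/core-api/xarray.rst; the sketch below wraps it in a hypothetical my_store() helper, it is not code from this patch:

/* Sketch of the standard XArray store/retry idiom that grab_mapping_entry()
 * now follows. my_store() is a hypothetical wrapper for illustration.
 */
static void my_store(struct xarray *xa, unsigned long index, void *entry)
{
	XA_STATE(xas, xa, index);

	do {
		xas_lock_irq(&xas);
		xas_store(&xas, entry);	/* may record ENOMEM on the xa_state */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, GFP_NOIO));	/* allocates, asks to retry */
}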
@@ -847,29 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  * already in the tree, we will skip the insertion and just dirty the PMD as
  * appropriate.
  */
-static void *dax_insert_entry(struct address_space *mapping,
-		struct vm_fault *vmf,
-		void *entry, pfn_t pfn_t, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+		struct address_space *mapping, struct vm_fault *vmf,
+		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
 {
-	struct radix_tree_root *pages = &mapping->i_pages;
-	unsigned long pfn = pfn_t_to_pfn(pfn_t);
-	pgoff_t index = vmf->pgoff;
-	void *new_entry;
+	void *new_entry = dax_make_entry(pfn, flags);
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+		unsigned long index = xas->xa_index;
 		/* we are replacing a zero page with block mapping */
 		if (dax_is_pmd_entry(entry))
 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
 					PG_PMD_NR, false);
 		else /* pte entry */
-			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+			unmap_mapping_pages(mapping, index, 1, false);
 	}
 
-	xa_lock_irq(pages);
-	new_entry = dax_make_locked(pfn, flags);
+	xas_reset(xas);
+	xas_lock_irq(xas);
 	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
 		dax_disassociate_entry(entry, mapping, false);
 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);

@@ -884,21 +711,18 @@
 	 * existing entry is a PMD, we will just leave the PMD in the
 	 * tree and dirty it if necessary.
 	 */
-		struct radix_tree_node *node;
-		void **slot;
-		void *ret;
-
-		ret = __radix_tree_lookup(pages, index, &node, &slot);
-		WARN_ON_ONCE(ret != entry);
-		__radix_tree_replace(pages, node, slot, new_entry, NULL);
+		void *old = dax_lock_entry(xas, new_entry);
+		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+					DAX_LOCKED));
 		entry = new_entry;
+	} else {
+		xas_load(xas);	/* Walk the xa_state */
 	}
 
 	if (dirty)
-		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
 
-	xa_unlock_irq(pages);
+	xas_unlock_irq(xas);
 	return entry;
 }

@@ -1166,7 +990,8 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+		struct address_space *mapping, void **entry,
 		struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;

@@ -1174,7 +999,7 @@
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_ZERO_PAGE, false);
 
 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
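dax_insert_entry() now builds new_entry up front with dax_make_entry(), which packs the pfn above the flag bits inside a value entry. A user-space model of that packing; the DAX_SHIFT and flag values mirror this era of fs/dax.c but should be treated as assumptions:

#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	4		/* assumed: low bits hold flags */
#define DAX_PMD		(1UL << 1)	/* assumed flag positions */
#define DAX_ZERO_PAGE	(1UL << 2)

static void *xa_mk_value(unsigned long v) { return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(const void *e) { return (unsigned long)e >> 1; }

/* Models dax_make_entry(): flags in the low bits, pfn above DAX_SHIFT */
static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

int main(void)
{
	void *e = dax_make_entry(0xabcd, DAX_PMD);

	assert(xa_to_value(e) >> DAX_SHIFT == 0xabcd);	/* pfn recoverable */
	assert(xa_to_value(e) & DAX_PMD);		/* flag recoverable */
	printf("entry %p -> pfn %#lx\n", e, xa_to_value(e) >> DAX_SHIFT);
	return 0;
}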
@@ -1384,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;

@@ -1410,9 +1236,9 @@
 	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		ret = dax_fault_return(PTR_ERR(entry));
+	entry = grab_mapping_entry(&xas, mapping, 0);
+	if (xa_is_internal(entry)) {
+		ret = xa_to_internal(entry);
 		goto out;
 	}

@@ -1485,7 +1311,7 @@
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						 0, write && !sync);
 
 		/*

@@ -1513,7 +1339,7 @@
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(&xas, mapping, &entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/

@@ -1540,21 +1366,20 @@
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff);
+	dax_unlock_entry(&xas, entry);
  out:
 	trace_dax_pte_fault_done(inode, vmf, ret);
 	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+		struct iomap *iomap, void **entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
-	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 	pfn_t pfn;

@@ -1565,7 +1390,7 @@
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	ret = dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_PMD | DAX_ZERO_PAGE, false);

@@ -1578,11 +1403,11 @@
 	pmd_entry = pmd_mkhuge(pmd_entry);
 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
 	return VM_FAULT_FALLBACK;
 }
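The PMD paths above keep relying on PG_PMD_COLOUR arithmetic: clearing the low bits finds the first page of the PMD (for unmapping and waitqueue keys), and OR-ing them in finds the last page for the end-of-file check. A small standalone model, assuming x86-64's 2MiB PMDs and 4KiB pages (colour 511); the sample index and file size are made up:

#include <stdio.h>

#define PG_PMD_COLOUR	511UL	/* (PMD_SIZE >> PAGE_SHIFT) - 1, assumed */

int main(void)
{
	unsigned long index = 0x205;		/* faulting pgoff */
	unsigned long max_pgoff = 0x400;	/* file size in pages */

	/* First page of the PMD this index falls in */
	printf("PMD start: %#lx\n", index & ~PG_PMD_COLOUR);

	/* Last page of the PMD must still be inside the file */
	if ((index | PG_PMD_COLOUR) >= max_pgoff)
		printf("PMD would extend past EOF: fall back to PTEs\n");
	else
		printf("whole PMD fits below max_pgoff %#lx\n", max_pgoff);
	return 0;
}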
@@ -1591,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;

@@ -1598,7 +1424,7 @@
 	struct inode *inode = mapping->host;
 	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
-	pgoff_t max_pgoff, pgoff;
+	pgoff_t max_pgoff;
 	void *entry;
 	loff_t pos;
 	int error;

@@ -1609,7 +1435,6 @@
 	 * supposed to hold locks serializing us with truncate / punch hole so
 	 * this is a reliable test.
 	 */
-	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

@@ -1634,24 +1459,26 @@
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff >= max_pgoff) {
+	if (xas.xa_index >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
-	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
-	 * is already in the tree, for instance), it will return -EEXIST and
-	 * we just fall back to 4k entries.
+	 * grab_mapping_entry() will make sure we get an empty PMD entry,
+	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
+	 * entry is already in the array, for instance), it will return
+	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(mapping, pgoff, DAX_PMD);
-	if (IS_ERR(entry))
+	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	if (xa_is_internal(entry)) {
+		result = xa_to_internal(entry);
 		goto fallback;
+	}
 
 	/*
 	 * It is possible, particularly with mixed reads & writes to private

@@ -1670,7 +1497,7 @@
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
 	 */
-	pos = (loff_t)pgoff << PAGE_SHIFT;
+	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
 		goto unlock_entry;

@@ -1686,7 +1513,7 @@
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						DAX_PMD, write && !sync);

@@ -1711,7 +1538,7 @@
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, entry);
+		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);

@@ -1734,7 +1561,7 @@
 		&iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff);
+	dax_unlock_entry(&xas, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);
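Why the separate pgoff variable could be deleted: XA_STATE_ORDER() initialises the cursor with the index rounded down to a 2^order boundary, so xas.xa_index already equals what linear_page_index() computed for the PMD start. A sketch with hypothetical helper names, not code from this patch:

/* Sketch: the two cursor declarations used by the fault handlers above.
 * pte_cursor_index()/pmd_cursor_index() are hypothetical helpers for
 * illustration only.
 */
static unsigned long pte_cursor_index(struct address_space *mapping,
				      unsigned long pgoff)
{
	XA_STATE(xas, &mapping->i_pages, pgoff);	/* order-0 cursor */
	return xas.xa_index;				/* == pgoff */
}

static unsigned long pmd_cursor_index(struct address_space *mapping,
				      unsigned long pgoff)
{
	/* replaces pgoff = linear_page_index(vma, vmf->address & PMD_MASK) */
	XA_STATE_ORDER(xas, &mapping->i_pages, pgoff, PMD_ORDER);
	return xas.xa_index;	/* pgoff rounded down to a PMD boundary */
}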