mm, x86: get_user_pages() for dax mappings
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver
has established a devm_memremap_pages() mapping, i.e. when the pfn_t
returned from ->direct_access() has PFN_DEV and PFN_MAP set.  Later, when
encountering _PAGE_DEVMAP during a page table walk, we look up and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().
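
The lookup-and-pin pattern the page-table walkers adopt below can be
condensed into a short sketch (assembled from the hunks in this diff;
error paths are abbreviated, this is not a new API):

	if (pte_devmap(pte)) {
		/* pin the device mapping so pfn_to_page() stays valid */
		pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
		if (unlikely(!pgmap))
			return 0;	/* mapping torn down; fall back to slow path */
		page = pte_page(pte);
		get_page(page);		/* the page ref now pins the pgmap itself */
		put_dev_pagemap(pgmap);	/* drop the walker's temporary ref */
	}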

Signed-off-by: Dan Williams <[email protected]>
Tested-by: Logan Gunthorpe <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
djbw authored and torvalds committed Jan 16, 2016
1 parent 5c7fb56 commit 3565fce
Showing 8 changed files with 212 additions and 39 deletions.
7 changes: 7 additions & 0 deletions arch/x86/include/asm/pgtable.h
@@ -479,6 +479,13 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}

#ifdef __HAVE_ARCH_PTE_DEVMAP
static inline int pte_devmap(pte_t a)
{
return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
}
#endif

#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
57 changes: 54 additions & 3 deletions arch/x86/mm/gup.c
Expand Up @@ -9,6 +9,7 @@
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/memremap.h>

#include <asm/pgtable.h>

@@ -63,6 +64,16 @@ static inline pte_t gup_get_pte(pte_t *ptep)
#endif
}

static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];

ClearPageReferenced(page);
put_page(page);
}
}

/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ static inline pte_t gup_get_pte(pte_t *ptep)
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
unsigned long mask;
int nr_start = *nr;
pte_t *ptep;

mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 0;
}

if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
page = pte_page(pte);
if (pte_devmap(pte)) {
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
pte_unmap(ptep);
return 0;
}
} else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
SetPageReferenced(page);
}

static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
int nr_start = *nr;
unsigned long pfn = pmd_pfn(pmd);
struct dev_pagemap *pgmap = NULL;

pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
do {
struct page *page = pfn_to_page(pfn);

pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
SetPageReferenced(page);
pages[*nr] = page;
get_page(page);
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
return 1;
}

static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
return 0;

VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
if (pmd_devmap(pmd))
return __gup_device_huge_pmd(pmd, addr, end, pages, nr);

/* hugepages are never "special" */
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));

refs = 0;
head = pmd_page(pmd);
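The sub-PMD offset math in __gup_device_huge_pmd() above is easy to check
standalone. The sketch below mirrors the kernel expression with x86-64
constants (2MiB PMDs, 4KiB pages); the base pfn and address are
hypothetical values for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT 21
#define PMD_MASK (~((1UL << PMD_SHIFT) - 1))

int main(void)
{
	unsigned long base_pfn = 0x100000;	/* hypothetical pmd_pfn() value */
	unsigned long addr = 0x7f0000201000UL;	/* 4KiB past a PMD boundary */

	/* same form as the kernel's: pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT */
	unsigned long pfn = base_pfn + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

	printf("first pinned pfn: 0x%lx\n", pfn);	/* prints 0x100001 */
	return 0;
}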
10 changes: 9 additions & 1 deletion include/linux/huge_mm.h
@@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
int prot_numa);
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
pfn_t pfn, bool write);

enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -55,6 +54,9 @@ enum transparent_hugepage_flag {
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, int flags);

#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -205,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
return false;
}


static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd, int flags)
{
return NULL;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#endif /* _LINUX_HUGE_MM_H */
59 changes: 41 additions & 18 deletions include/linux/mm.h
@@ -16,6 +16,7 @@
#include <linux/mm_types.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
@@ -465,17 +466,6 @@ static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}

static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}

static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
@@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page)

void __put_page(struct page *page);

static inline void put_page(struct page *page)
{
page = compound_head(page);
if (put_page_testzero(page))
__put_page(page);
}

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);
@@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page)
}

#ifdef CONFIG_ZONE_DEVICE
void get_zone_device_page(struct page *page);
void put_zone_device_page(struct page *page);
static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
#else
static inline void get_zone_device_page(struct page *page)
{
}
static inline void put_zone_device_page(struct page *page)
{
}
static inline bool is_zone_device_page(const struct page *page)
{
return false;
}
#endif

static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);

if (unlikely(is_zone_device_page(page)))
get_zone_device_page(page);
}

static inline void put_page(struct page *page)
{
page = compound_head(page);

if (put_page_testzero(page))
__put_page(page);

if (unlikely(is_zone_device_page(page)))
put_zone_device_page(page);
}

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
}
#endif

#ifndef __HAVE_ARCH_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
return 0;
}
#endif

int vma_wants_writenotify(struct vm_area_struct *vma);

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
12 changes: 12 additions & 0 deletions kernel/memremap.c
@@ -169,6 +169,18 @@ struct page_map {
struct vmem_altmap altmap;
};

void get_zone_device_page(struct page *page)
{
percpu_ref_get(page->pgmap->ref);
}
EXPORT_SYMBOL(get_zone_device_page);

void put_zone_device_page(struct page *page)
{
put_dev_pagemap(page->pgmap);
}
EXPORT_SYMBOL(put_zone_device_page);

static void pgmap_radix_release(struct resource *res)
{
resource_size_t key;
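Taken together with the mm.h hunk above, the effect is that every
reference on a ZONE_DEVICE page also holds a reference on its
dev_pagemap, so a devm_memremap_pages() teardown waits for the last
pinned page. A toy userspace model of that coupling (all names here
are illustrative stand-ins, not kernel code; a plain int stands in
for the percpu_ref):

#include <assert.h>
#include <stdio.h>

struct toy_pagemap { int ref; };
struct toy_page { int count; struct toy_pagemap *pgmap; };

static void toy_get_page(struct toy_page *p)
{
	assert(p->count > 0);	/* mirrors the VM_BUG_ON in get_page() */
	p->count++;
	p->pgmap->ref++;	/* mirrors get_zone_device_page() */
}

static void toy_put_page(struct toy_page *p)
{
	p->count--;
	p->pgmap->ref--;	/* mirrors put_zone_device_page() */
}

int main(void)
{
	struct toy_pagemap map = { .ref = 1 };	/* the driver's own ref */
	struct toy_page page = { .count = 1, .pgmap = &map };

	toy_get_page(&page);	/* e.g. gup pins the page */
	printf("pgmap refs while pinned: %d\n", map.ref);	/* 2 */
	toy_put_page(&page);
	printf("pgmap refs after unpin: %d\n", map.ref);	/* 1 */
	return 0;
}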
30 changes: 28 additions & 2 deletions mm/gup.c
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -98,7 +100,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}

page = vm_normal_page(vma, address, pte);
if (unlikely(!page)) {
if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
/*
* Only return device mapping pages in the FOLL_GET case since
* they are only valid while holding the pgmap reference.
*/
pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
if (pgmap)
page = pte_page(pte);
else
goto no_page;
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto retry;
}

if (flags & FOLL_GET)
if (flags & FOLL_GET) {
get_page(page);

/* drop the pgmap reference now that we hold the page */
if (pgmap) {
put_dev_pagemap(pgmap);
pgmap = NULL;
}
}
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(*pmd)))
return follow_page_pte(vma, address, pmd, flags);

Expand Down