From f95de70c484bb3fc79ae6c5e97147fc6022518f5 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 30 Jul 2015 00:01:06 +0000 Subject: [PATCH] mm: introduce VM_LOCKONFAULT The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used this can incur a high penalty for locking. For the example of a large file, this is the usage pattern for a large statistical language model (probably applies to other statistical or graphical models as well). For the security example, any application transacting in data that cannot be swapped out (credit card data, medical records, etc.). This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting the VM_LOCKONFAULT flag for a VMA will cause pages faulted into that VMA to be added to the unevictable LRU when they are faulted or if they are already present, but will not cause any missing pages to be faulted in. Exposing this new lock state means that we cannot overload the meaning of the FOLL_POPULATE flag any longer. Prior to this patch it was used to mean that the VMA for a fault was locked. This means we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control if the VMA should be populated and in the case of VM_LOCKONFAULT, it will not be set. Signed-off-by: Eric B Munson Acked-by: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Guenter Roeck Cc: Heiko Carstens Cc: Michael Kerrisk Cc: Ralf Baechle Cc: Shuah Khan Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_vm.c | 8 +++++++- fs/proc/task_mmu.c | 1 + include/linux/mm.h | 2 ++ kernel/fork.c | 3 ++- mm/debug.c | 1 + mm/gup.c | 10 ++++++++-- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mlock.c | 2 +- mm/mmap.c | 2 +- mm/rmap.c | 4 ++-- 11 files changed, 28 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index aab49ee4ed40d2..103a5f6b969a87 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -699,9 +699,15 @@ int drm_vma_info(struct seq_file *m, void *data) (void *)(unsigned long)virt_to_phys(high_memory)); list_for_each_entry(pt, &dev->vmalist, head) { + char lock_flag = '-'; + vma = pt->vma; if (!vma) continue; + if (vma->vm_flags & VM_LOCKONFAULT) + lock_flag = 'f'; + else if (vma->vm_flags & VM_LOCKED) + lock_flag = 'l'; seq_printf(m, "\n%5d 0x%pK-0x%pK %c%c%c%c%c%c 0x%08lx000", pt->pid, @@ -710,7 +716,7 @@ int drm_vma_info(struct seq_file *m, void *data) vma->vm_flags & VM_WRITE ? 'w' : '-', vma->vm_flags & VM_EXEC ? 'x' : '-', vma->vm_flags & VM_MAYSHARE ? 's' : 'p', - vma->vm_flags & VM_LOCKED ? 'l' : '-', + lock_flag, vma->vm_flags & VM_IO ? 
'i' : '-', vma->vm_pgoff); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3b4d8255e8068d..2010e21af54d2d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -579,6 +579,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_INTEL_MPX [ilog2(VM_MPX)] = "mp", #endif + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/include/linux/mm.h b/include/linux/mm.h index 56a243346bf479..03b90a9e9b8c8b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -129,6 +129,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ +#define VM_LOCKONFAULT 0x00001000 /* Lock the pages covered when they are faulted in */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ @@ -2045,6 +2046,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_MLOCK 0x1000 /* lock present pages */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/kernel/fork.c b/kernel/fork.c index 85f8f0cee705a2..f81ace08b481ac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT | + VM_UFFD_MISSING | VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; diff --git a/mm/debug.c b/mm/debug.c index 
76089ddf99ea1c..25176bb1efec24 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -121,6 +121,7 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_GROWSDOWN, "growsdown" }, {VM_PFNMAP, "pfnmap" }, {VM_DENYWRITE, "denywrite" }, + {VM_LOCKONFAULT, "lockonfault" }, {VM_LOCKED, "locked" }, {VM_IO, "io" }, {VM_SEQ_READ, "seqread" }, diff --git a/mm/gup.c b/mm/gup.c index 6297f6bccfb1e4..e6329082795600 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -92,7 +92,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ mark_page_accessed(page); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -265,6 +265,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; /* For mm_populate(), just skip the stack guard page. 
*/ if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || @@ -850,7 +853,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; + if ((vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) == VM_LOCKED) + gup_flags |= FOLL_POPULATE; + /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9f3337a3e783b..843f1199db1777 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1265,7 +1265,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8a8..82caa48a18fe05 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3764,8 +3764,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + unsigned long vm_flags = vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT); + unsigned long svm_flags = svma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT); /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/mlock.c b/mm/mlock.c index a23a533e946245..b8532ca0185c6f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - 
vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); while (start < end) { struct page *page = NULL; diff --git a/mm/mmap.c b/mm/mmap.c index a2fd40a595e152..75bb0ec2a79f02 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1664,7 +1664,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); else - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); } if (file) diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df147..47c855a86f7379 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -744,7 +744,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; + pra->vm_flags |= (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)); return SWAP_FAIL; /* To break the loop */ } @@ -765,7 +765,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { pte_unmap_unlock(pte, ptl); - pra->vm_flags |= VM_LOCKED; + pra->vm_flags |= (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)); return SWAP_FAIL; /* To break the loop */ }