diff --git a/linux-tkg-patches/6.6/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch b/linux-tkg-patches/6.6/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch
new file mode 100644
index 000000000..a56913ac8
--- /dev/null
+++ b/linux-tkg-patches/6.6/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch
@@ -0,0 +1,248 @@
+From 23d7461352e269625f418ff525de2451fbfaef54 Mon Sep 17 00:00:00 2001
+From: Paul Gofman
+Date: Wed, 6 May 2020 14:37:44 +0300
+Subject: [PATCH 1/2] mm: Support soft dirty flag reset for VA range.
+
+v2: ported from 6.1 to 6.6
+
+Signed-off-by: Kai Krakow
+---
+ fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 103 insertions(+), 26 deletions(-)
+
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index 3dd5be96691b4..daa4fc32de9a2 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1072,6 +1072,8 @@ enum clear_refs_types {
+ 
+ struct clear_refs_private {
+ 	enum clear_refs_types type;
++	unsigned long start, end;
++	bool clear_range;
+ };
+ 
+ #ifdef CONFIG_MEM_SOFT_DIRTY
+@@ -1163,6 +1165,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ 	spinlock_t *ptl;
+ 	struct page *page;
+ 
++	BUG_ON(addr < cp->start || end > cp->end);
++
+ 	ptl = pmd_trans_huge_lock(pmd, vma);
+ 	if (ptl) {
+ 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+@@ -1220,9 +1224,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ 	struct clear_refs_private *cp = walk->private;
+ 	struct vm_area_struct *vma = walk->vma;
+ 
+-	if (vma->vm_flags & VM_PFNMAP)
++	if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP))
+ 		return 1;
+ 
++	BUG_ON(start < cp->start || end > cp->end);
++
+ 	/*
+ 	 * Writing 1 to /proc/pid/clear_refs affects all pages.
+ 	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+@@ -1246,10 +1252,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+ 				size_t count, loff_t *ppos)
+ {
+ 	struct task_struct *task;
+-	char buffer[PROC_NUMBUF];
++	char buffer[18];
+ 	struct mm_struct *mm;
+ 	struct vm_area_struct *vma;
+ 	enum clear_refs_types type;
++	unsigned long start, end;
++	bool clear_range;
+ 	int itype;
+ 	int rv;
+ 
+@@ -1258,12 +1266,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+ 		count = sizeof(buffer) - 1;
+ 	if (copy_from_user(buffer, buf, count))
+ 		return -EFAULT;
+-	rv = kstrtoint(strstrip(buffer), 10, &itype);
+-	if (rv < 0)
+-		return rv;
+-	type = (enum clear_refs_types)itype;
+-	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
+-		return -EINVAL;
++
++	if (buffer[0] == '6')
++	{
++		static int once;
++
++		if (!once++)
++			printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n");
++
++		if (count != 17)
++			return -EINVAL;
++
++		type = CLEAR_REFS_SOFT_DIRTY;
++		start = *(unsigned long *)(buffer + 1);
++		end = *(unsigned long *)(buffer + 1 + 8);
++	}
++	else
++	{
++		rv = kstrtoint(strstrip(buffer), 10, &itype);
++		if (rv < 0)
++			return rv;
++		type = (enum clear_refs_types)itype;
++
++		if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
++			return -EINVAL;
++
++		start = 0;
++		end = -1UL;
++	}
+ 
+ 	task = get_proc_task(file_inode(file));
+ 	if (!task)
+@@ -1276,40 +1306,86 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
+ 			.type = type,
+ 		};
+ 
+-		if (mmap_write_lock_killable(mm)) {
+-			count = -EINTR;
+-			goto out_mm;
++		if (start || end != -1UL)
++		{
++			start = min(start, -1UL) & PAGE_MASK;
++			end = min(end, -1UL) & PAGE_MASK;
++
++			if (start >= end)
++			{
++				count = -EINVAL;
++				goto out_mm;
++			}
++			clear_range = true;
+ 		}
++		else
++		{
++			clear_range = false;
++		}
++
++		cp.start = start;
++		cp.end = end;
++		cp.clear_range = clear_range;
++
+ 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
++			if (mmap_write_lock_killable(mm)) {
++				count = -EINTR;
++				goto out_mm;
++			}
++
+ 			/*
+ 			 * Writing 5 to /proc/pid/clear_refs resets the peak
+ 			 * resident set size to this mm's current rss value.
+ 			 */
+ 			reset_mm_hiwater_rss(mm);
+-			goto out_unlock;
++			mmap_write_unlock(mm);
++			goto out_mm;
+ 		}
+ 
+ 		if (type == CLEAR_REFS_SOFT_DIRTY) {
+-			for_each_vma(vmi, vma) {
+-				if (!(vma->vm_flags & VM_SOFTDIRTY))
+-					continue;
+-				vm_flags_clear(vma, VM_SOFTDIRTY);
+-				vma_set_page_prot(vma);
++			if (mmap_read_lock_killable(mm)) {
++				count = -EINTR;
++				goto out_mm;
+ 			}
+-
++			if (!clear_range)
++				for_each_vma(vmi, vma) {
++					if (!(vma->vm_flags & VM_SOFTDIRTY))
++						continue;
++					mmap_read_unlock(mm);
++					if (mmap_write_lock_killable(mm)) {
++						count = -EINTR;
++						goto out_mm;
++					}
++					for_each_vma(vmi, vma) {
++						vm_flags_clear(vma, VM_SOFTDIRTY);
++						vma_set_page_prot(vma);
++					}
++					mmap_write_downgrade(mm);
++					break;
++				}
+ 			inc_tlb_flush_pending(mm);
+ 			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+-						0, mm, 0, -1UL);
++						0, mm, start, end);
+ 			mmu_notifier_invalidate_range_start(&range);
+ 		}
+-		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
++		else
++		{
++			if (mmap_write_lock_killable(mm)) {
++				count = -EINTR;
++				goto out_mm;
++			}
++		}
++		walk_page_range(mm, start, end == -1UL ? -1 : end, &clear_refs_walk_ops, &cp);
+ 		if (type == CLEAR_REFS_SOFT_DIRTY) {
+ 			mmu_notifier_invalidate_range_end(&range);
+ 			flush_tlb_mm(mm);
+ 			dec_tlb_flush_pending(mm);
++			mmap_read_unlock(mm);
++		}
++		else
++		{
++			mmap_write_unlock(mm);
+ 		}
+-out_unlock:
+-		mmap_write_unlock(mm);
+ out_mm:
+ 		mmput(mm);
+ 	}
+@@ -1341,6 +1417,7 @@ struct pagemapread {
+ #define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+ #define PM_SOFT_DIRTY		BIT_ULL(55)
+ #define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
++#define PM_SOFT_DIRTY_PAGE	BIT_ULL(57)
+ #define PM_UFFD_WP		BIT_ULL(57)
+ #define PM_FILE			BIT_ULL(61)
+ #define PM_SWAP			BIT_ULL(62)
+@@ -1415,13 +1492,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
+ 		flags |= PM_PRESENT;
+ 		page = vm_normal_page(vma, addr, pte);
+ 		if (pte_soft_dirty(pte))
+-			flags |= PM_SOFT_DIRTY;
++			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
+ 		if (pte_uffd_wp(pte))
+ 			flags |= PM_UFFD_WP;
+ 	} else if (is_swap_pte(pte)) {
+ 		swp_entry_t entry;
+ 		if (pte_swp_soft_dirty(pte))
+-			flags |= PM_SOFT_DIRTY;
++			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
+ 		if (pte_swp_uffd_wp(pte))
+ 			flags |= PM_UFFD_WP;
+ 		entry = pte_to_swp_entry(pte);
+@@ -1481,7 +1558,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ 
+ 		flags |= PM_PRESENT;
+ 		if (pmd_soft_dirty(pmd))
+-			flags |= PM_SOFT_DIRTY;
++			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
+ 		if (pmd_uffd_wp(pmd))
+ 			flags |= PM_UFFD_WP;
+ 		if (pm->show_pfn)
+@@ -1505,7 +1582,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ 		}
+ 		flags |= PM_SWAP;
+ 		if (pmd_swp_soft_dirty(pmd))
+-			flags |= PM_SOFT_DIRTY;
++			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
+ 		if (pmd_swp_uffd_wp(pmd))
+ 			flags |= PM_UFFD_WP;
+ 		VM_BUG_ON(!is_pmd_migration_entry(pmd));
+-- 
+2.41.0
+
diff --git a/linux-tkg-patches/6.6/0002-mm-Support-soft-dirty-flag-read-with-reset.patch b/linux-tkg-patches/6.6/0002-mm-Support-soft-dirty-flag-read-with-reset.patch
new file mode 100644
index 000000000..de390b8d5
--- /dev/null
+++ b/linux-tkg-patches/6.6/0002-mm-Support-soft-dirty-flag-read-with-reset.patch
@@ -0,0 +1,353 @@
+From 30aad6be7fa9513c818f33cf1e0e725920619145 Mon Sep 17 00:00:00 2001
+From: Paul Gofman
+Date: Thu, 7 May 2020 14:05:31 +0300
+Subject: [PATCH 2/2] mm: Support soft dirty flag read with reset.
+
+v2: ported from 6.1 to 6.6
+
+Signed-off-by: Kai Krakow
+---
+ fs/proc/base.c     |   3 +
+ fs/proc/internal.h |   1 +
+ fs/proc/task_mmu.c | 139 +++++++++++++++++++++++++++++++++++++++------
+ 3 files changed, 127 insertions(+), 16 deletions(-)
+
+diff --git a/fs/proc/base.c b/fs/proc/base.c
+index ffd54617c3547..5683da416a891 100644
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -3284,6 +3284,9 @@ static const struct pid_entry tgid_base_stuff[] = {
+ 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
+ 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
+ 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
++#ifdef CONFIG_MEM_SOFT_DIRTY
++	REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations),
++#endif
+ #endif
+ #ifdef CONFIG_SECURITY
+ 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
+diff --git a/fs/proc/internal.h b/fs/proc/internal.h
+index 9a8f32f21ff56..f3a16b26dd6e4 100644
+--- a/fs/proc/internal.h
++++ b/fs/proc/internal.h
+@@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations;
+ extern const struct file_operations proc_pid_smaps_rollup_operations;
+ extern const struct file_operations proc_clear_refs_operations;
+ extern const struct file_operations proc_pagemap_operations;
++extern const struct file_operations proc_pagemap_reset_operations;
+ 
+ extern unsigned long task_vsize(struct mm_struct *);
+ extern unsigned long task_statm(struct mm_struct *,
+diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
+index daa4fc32de9a2..291f530a8cfb2 100644
+--- a/fs/proc/task_mmu.c
++++ b/fs/proc/task_mmu.c
+@@ -1094,7 +1094,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
+ 	return page_maybe_dma_pinned(page);
+ }
+ 
+-static inline void clear_soft_dirty(struct vm_area_struct *vma,
++static inline bool clear_soft_dirty(struct vm_area_struct *vma,
+ 		unsigned long addr, pte_t *pte)
+ {
+ 	/*
+@@ -1104,37 +1104,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ 	 * of how soft-dirty works.
+ 	 */
+ 	pte_t ptent = ptep_get(pte);
++	bool ret = false;
+ 
+ 	if (pte_present(ptent)) {
+ 		pte_t old_pte;
+ 
+ 		if (pte_is_pinned(vma, addr, ptent))
+-			return;
++			return ret;
+ 		old_pte = ptep_modify_prot_start(vma, addr, pte);
++		ret = pte_soft_dirty(old_pte);
+ 		ptent = pte_wrprotect(old_pte);
+ 		ptent = pte_clear_soft_dirty(ptent);
+ 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ 	} else if (is_swap_pte(ptent)) {
++		ret = pte_swp_soft_dirty(ptent);
+ 		ptent = pte_swp_clear_soft_dirty(ptent);
+ 		set_pte_at(vma->vm_mm, addr, pte, ptent);
+ 	}
++	return ret;
+ }
+ #else
+-static inline void clear_soft_dirty(struct vm_area_struct *vma,
++static inline bool clear_soft_dirty(struct vm_area_struct *vma,
+ 		unsigned long addr, pte_t *pte)
+ {
++	return false;
+ }
+ #endif
+ 
+ #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ 		unsigned long addr, pmd_t *pmdp)
+ {
+ 	pmd_t old, pmd = *pmdp;
++	bool ret = false;
+ 
+ 	if (pmd_present(pmd)) {
+ 		/* See comment in change_huge_pmd() */
+ 		old = pmdp_invalidate(vma, addr, pmdp);
++
++		ret = pmd_soft_dirty(old);
++
+ 		if (pmd_dirty(old))
+ 			pmd = pmd_mkdirty(pmd);
+ 		if (pmd_young(old))
+@@ -1145,14 +1154,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ 
+ 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
++		ret = pmd_swp_soft_dirty(pmd);
+ 		pmd = pmd_swp_clear_soft_dirty(pmd);
+ 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ 	}
++	return ret;
+ }
+ #else
+-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ 		unsigned long addr, pmd_t *pmdp)
+ {
++	return false;
+ }
+ #endif
+ 
+@@ -1407,6 +1419,7 @@ struct pagemapread {
+ 	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
+ 	pagemap_entry_t *buffer;
+ 	bool show_pfn;
++	bool reset;
+ };
+ 
+ #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
+@@ -1439,6 +1452,14 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
+ 	return 0;
+ }
+ 
++static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm)
++{
++	((unsigned long *)pm->buffer)[pm->pos++] = addr;
++	if (pm->pos >= pm->len)
++		return PM_END_OF_BUFFER;
++	return 0;
++}
++
+ static int pagemap_pte_hole(unsigned long start, unsigned long end,
+ 			    __always_unused int depth, struct mm_walk *walk)
+ {
+@@ -1446,6 +1467,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
+ 	unsigned long addr = start;
+ 	int err = 0;
+ 
++	if (pm->reset)
++		goto out;
++
+ 	while (addr < end) {
+ 		struct vm_area_struct *vma = find_vma(walk->mm, addr);
+ 		pagemap_entry_t pme = make_pme(0, 0);
+@@ -1550,6 +1574,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ 		pmd_t pmd = *pmdp;
+ 		struct page *page = NULL;
+ 
++		if (pm->reset)
++		{
++			if (clear_soft_dirty_pmd(vma, addr, pmdp))
++			{
++				for (; addr != end; addr += PAGE_SIZE)
++				{
++					err = add_addr_to_pagemap(addr, pm);
++					if (err)
++						break;
++				}
++			}
++			goto trans_huge_done;
++		}
++
+ 		if (vma->vm_flags & VM_SOFTDIRTY)
+ 			flags |= PM_SOFT_DIRTY;
+ 
+@@ -1607,6 +1645,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ 				frame += (1 << MAX_SWAPFILES_SHIFT);
+ 		}
+ 	}
++trans_huge_done:
+ 	spin_unlock(ptl);
+ 	return err;
+ }
+@@ -1622,10 +1661,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ 		return err;
+ 	}
+ 	for (; addr < end; pte++, addr += PAGE_SIZE) {
+-		pagemap_entry_t pme;
++		if (pm->reset)
++		{
++			if (clear_soft_dirty(vma, addr, pte))
++				err = add_addr_to_pagemap(addr, pm);
++		}
++		else
++		{
++			pagemap_entry_t pme;
+ 
+-		pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
+-		err = add_to_pagemap(addr, &pme, pm);
++			pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
++			err = add_to_pagemap(addr, &pme, pm);
++		}
+ 		if (err)
+ 			break;
+ 	}
+@@ -1724,8 +1771,8 @@ static const struct mm_walk_ops pagemap_ops = {
+  * determine which areas of memory are actually mapped and llseek to
+  * skip over unmapped regions.
+  */
+-static ssize_t pagemap_read(struct file *file, char __user *buf,
+-			    size_t count, loff_t *ppos)
++static ssize_t do_pagemap_read(struct file *file, char __user *buf,
++			    size_t count, loff_t *ppos, bool reset)
+ {
+ 	struct mm_struct *mm = file->private_data;
+ 	struct pagemapread pm;
+@@ -1734,6 +1781,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
+ 	unsigned long start_vaddr;
+ 	unsigned long end_vaddr;
+ 	int ret = 0, copied = 0;
++	struct mmu_notifier_range range;
++	size_t buffer_len;
+ 
+ 	if (!mm || !mmget_not_zero(mm))
+ 		goto out;
+@@ -1749,19 +1798,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
+ 
+ 	/* do not disclose physical addresses: attack vector */
+ 	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
++	pm.reset = reset;
+ 
+-	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+-	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
++	buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES);
++
++	pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL);
+ 	ret = -ENOMEM;
+ 	if (!pm.buffer)
+ 		goto out_mm;
+ 
+ 	src = *ppos;
+ 	svpfn = src / PM_ENTRY_BYTES;
+-	end_vaddr = mm->task_size;
++
++	start_vaddr = svpfn << PAGE_SHIFT;
++
++	if (reset)
++	{
++		if (count < sizeof(end_vaddr))
++		{
++			ret = -EINVAL;
++			goto out_mm;
++		}
++		if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr)))
++			return -EFAULT;
++		end_vaddr = min(end_vaddr, mm->task_size);
++	}
++	else
++	{
++		end_vaddr = mm->task_size;
++		start_vaddr = end_vaddr;
++	}
+ 
+ 	/* watch out for wraparound */
+-	start_vaddr = end_vaddr;
+ 	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ 		unsigned long end;
+ 
+@@ -1786,18 +1854,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
+ 		unsigned long end;
+ 
+ 		pm.pos = 0;
+-		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
++		pm.len = min(buffer_len, count / PM_ENTRY_BYTES);
++
++		end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT));
+ 		/* overflow ? */
+ 		if (end < start_vaddr || end > end_vaddr)
+ 			end = end_vaddr;
++
+ 		ret = mmap_read_lock_killable(mm);
+ 		if (ret)
+ 			goto out_free;
++
++		if (reset)
++		{
++			inc_tlb_flush_pending(mm);
++			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
++						0, mm, start_vaddr, end);
++			mmu_notifier_invalidate_range_start(&range);
++		}
+ 		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
++		if (reset)
++		{
++			mmu_notifier_invalidate_range_end(&range);
++			flush_tlb_mm(mm);
++			dec_tlb_flush_pending(mm);
++		}
+ 		mmap_read_unlock(mm);
+-		start_vaddr = end;
+ 
+ 		len = min(count, PM_ENTRY_BYTES * pm.pos);
++		BUG_ON(ret && ret != PM_END_OF_BUFFER);
+ 		if (copy_to_user(buf, pm.buffer, len)) {
+ 			ret = -EFAULT;
+ 			goto out_free;
+@@ -1805,6 +1890,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
+ 		copied += len;
+ 		buf += len;
+ 		count -= len;
++
++		start_vaddr = reset && pm.pos == pm.len ? ((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end;
+ 	}
+ 	*ppos += copied;
+ 	if (!ret || ret == PM_END_OF_BUFFER)
+@@ -1818,6 +1905,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
+ 	return ret;
+ }
+ 
++static ssize_t pagemap_read(struct file *file, char __user *buf,
++			    size_t count, loff_t *ppos)
++{
++	return do_pagemap_read(file, buf, count, ppos, false);
++}
++
++static ssize_t pagemap_reset_read(struct file *file, char __user *buf,
++			    size_t count, loff_t *ppos)
++{
++	return do_pagemap_read(file, buf, count, ppos, true);
++}
++
+ static int pagemap_open(struct inode *inode, struct file *file)
+ {
+ 	struct mm_struct *mm;
+@@ -1844,6 +1943,14 @@ const struct file_operations proc_pagemap_operations = {
+ 	.open		= pagemap_open,
+ 	.release	= pagemap_release,
+ };
++
++const struct file_operations proc_pagemap_reset_operations = {
++	.llseek		= mem_lseek, /* borrow this */
++	.read		= pagemap_reset_read,
++	.open		= pagemap_open,
++	.release	= pagemap_release,
++};
++
+ #endif /* CONFIG_PROC_PAGE_MONITOR */
+ 
+ #ifdef CONFIG_NUMA
+-- 
+2.41.0
+
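Usage note on the interface the two patches above introduce. Writing the byte
'6' followed by two native-endian unsigned longs (17 bytes total) to
/proc/pid/clear_refs clears the soft-dirty flag only for [start, end); any
other write keeps the stock integer interface. The new /proc/pid/pagemap_reset
file is seeked like pagemap (file offset = virtual page number * 8); the first
8 bytes of the read buffer carry the end address in, and the bytes read back
are a packed array of page addresses whose soft-dirty flag was set and has
just been cleared. Below is a minimal userspace sketch, assuming a 64-bit
build with 4 KiB pages (PAGE_SHIFT == 12); the helper names are illustrative,
not part of the patches:

/* soft_dirty_watch.c - exercise the extended clear_refs / pagemap_reset ABI */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Clear soft-dirty for [start, end) of the current process. Patch 1 requires
 * exactly 17 bytes: '6', then start and end as native unsigned longs. */
static int clear_soft_dirty_range(uint64_t start, uint64_t end)
{
	char req[17];
	ssize_t n;
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	if (fd < 0)
		return -1;
	req[0] = '6';			/* selector byte checked by patch 1 */
	memcpy(req + 1, &start, 8);	/* native byte order, as the kernel reads it */
	memcpy(req + 9, &end, 8);
	n = write(fd, req, sizeof(req));
	close(fd);
	return n == (ssize_t)sizeof(req) ? 0 : -1;
}

/* Harvest and reset dirty pages in [start, end); fd is an open
 * /proc/<pid>/pagemap_reset. Returns the number of page addresses stored in
 * pages[] (max_pages must be >= 1 so the seeded 'end' fits), or -1. */
static ssize_t fetch_and_clear_dirty(int fd, uint64_t start, uint64_t end,
				     uint64_t *pages, size_t max_pages)
{
	ssize_t n;

	/* Offset selects the start page, 8 bytes per entry, like pagemap. */
	if (lseek(fd, (off_t)((start >> 12) * 8), SEEK_SET) == (off_t)-1)
		return -1;
	memcpy(pages, &end, sizeof(end));	/* first 8 bytes carry 'end' in */
	n = read(fd, pages, max_pages * sizeof(*pages));
	return n < 0 ? -1 : n / (ssize_t)sizeof(*pages);
}

A write-watch style tracker would arm a region once with
clear_soft_dirty_range() and then poll fetch_and_clear_dirty() to collect only
the pages written since the previous poll, which is the access pattern these
patches optimize.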