From f0a7422826e61264eedbe14bde55a19d477408bb Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 1 Apr 2023 14:14:19 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm: multi-gen LRU: use
 mmu_notifier_test_clear_young()

An existing selftest can quickly demonstrate the effectiveness of this
patch. On a generic workstation equipped with 64 CPUs and 256GB DRAM:

  $ sudo max_guest_memory_test -c 64 -m 256 -s 256

  MGLRU      run2
  ---------------
  Before    ~600s
  After      ~50s
  Off       ~250s

  kswapd (MGLRU before)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.06%  evict_folios
              97.41%  shrink_folio_list
                31.33%  folio_referenced
                  31.06%  rmap_walk_file
                    30.89%  folio_referenced_one
                      20.83%  __mmu_notifier_clear_flush_young
                        20.54%  kvm_mmu_notifier_clear_flush_young
  =>                      19.34%  _raw_write_lock

  kswapd (MGLRU after)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.51%  evict_folios
              71.70%  shrink_folio_list
                7.08%  folio_referenced
                  6.78%  rmap_walk_file
                    6.72%  folio_referenced_one
                      5.60%  lru_gen_look_around
  =>                    1.53%  __mmu_notifier_test_clear_young

  kswapd (MGLRU off)
    100.00%  balance_pgdat
      100.00%  shrink_node
        99.92%  shrink_lruvec
          69.95%  shrink_folio_list
            19.35%  folio_referenced
              18.37%  rmap_walk_file
                17.88%  folio_referenced_one
                  13.20%  __mmu_notifier_clear_flush_young
                    11.64%  kvm_mmu_notifier_clear_flush_young
  =>                    9.93%  _raw_write_lock
          26.23%  shrink_active_list
            25.50%  folio_referenced
              25.35%  rmap_walk_file
                25.28%  folio_referenced_one
                  23.87%  __mmu_notifier_clear_flush_young
                    23.69%  kvm_mmu_notifier_clear_flush_young
  =>                    18.98%  _raw_write_lock

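As the profiles show, the rmap path no longer spends its time contending
on a write lock under kvm_mmu_notifier_clear_flush_young(): accessed
bits in the secondary MMU are now checked through
mmu_notifier_test_clear_young(), batched up to MIN_LRU_BATCH pages per
call via a bitmap, instead of one ptep_clear_flush_young_notify() per
PTE. Both callers below, lru_gen_look_around() and walk_pte_range(),
follow the same pattern; a condensed sketch using the helpers added by
this patch (per-folio handling and stats omitted; mm and vma stand in
for the walker's mm_struct and VMA):

  unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
  unsigned long last = 0;

  for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
          pte_t ptent = ptep_get(pte + i);

          /* one notifier call can cover up to MIN_LRU_BATCH pages */
          if (!test_spte_young(mm, addr, end, bitmap, &last) &&
              !pte_young(ptent)) {
                  skip_spte_young(mm, addr, bitmap, &last);
                  continue;       /* old in both MMUs */
          }

          clear_spte_young(mm, addr, bitmap, &last);
          if (pte_young(ptent))
                  ptep_test_and_clear_young(vma, addr, pte + i);

          /* ... the folio is young: update its generation ... */
  }
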
Signed-off-by: Yu Zhao <yuzhao@google.com>
(am from https://patchwork.kernel.org/patch/13144315/)

BUG=b:266976439
UPSTREAM-TASK=b:266738578
TEST=ran crostini.VimCompile

Change-Id: I2d6c83ccad765d903d4c2d2e48ded7bd114942c8
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392744
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: T.J. Alumbaugh <talumbau@google.com>
Kcr-patch: aacc60cbb8febd8ac1fd442d810253c6228937c86a6829ca3f6d7f09.patch
---
 include/linux/mmzone.h |   6 +-
 mm/rmap.c              |   9 +--
 mm/vmscan.c            | 147 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 136 insertions(+), 26 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4ed33b12782151632e36aa114039cb4a0916fe06..f30d391089f743a69de67391fd963a6aed90a042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -394,6 +394,7 @@ enum {
 	LRU_GEN_CORE,
 	LRU_GEN_MM_WALK,
 	LRU_GEN_NONLEAF_YOUNG,
+	LRU_GEN_SPTE_WALK,
 	NR_LRU_GEN_CAPS
 };
 
@@ -551,7 +552,7 @@ struct lru_gen_memcg {
 
 void lru_gen_init_pgdat(struct pglist_data *pgdat);
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -570,8 +571,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+	return false;
 }
 
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529a76858a9aab5536a755f0a50ec67..47588cb3768f4b77206b219349221c7858412368 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -867,13 +867,10 @@ static bool folio_referenced_one(struct folio *folio,
 			continue;
 		}
 
-		if (pvmw.pte) {
-			if (lru_gen_enabled() &&
-			    pte_young(ptep_get(pvmw.pte))) {
-				lru_gen_look_around(&pvmw);
+		if (lru_gen_enabled() && pvmw.pte) {
+			if (lru_gen_look_around(&pvmw))
 				referenced++;
-			}
-
+		} else if (pvmw.pte) {
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte))
 				referenced++;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9644d7de33682875e21e8ea6bbaef6d70b9f33d0..d7126561849ad518c4dfdd8b498d182f153e32f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,10 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3313,6 +3317,59 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 	return folio;
 }
 
+#ifndef kvm_arch_has_test_clear_young
+#define kvm_arch_has_test_clear_young() 0
+#endif
+
+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK))
+		return false;
+
+	if (*last > addr)
+		goto done;
+
+	*last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ?
+		addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1;
+	bitmap_zero(bitmap, MIN_LRU_BATCH);
+
+	mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap);
+done:
+	return test_bit((*last - addr) / PAGE_SIZE, bitmap);
+}
+
+static void clear_spte_young(struct mm_struct *mm, unsigned long addr,
+			     unsigned long *bitmap, unsigned long *last)
+{
+	int i;
+	unsigned long start, end = *last + 1;
+
+	if (addr + PAGE_SIZE != end)
+		return;
+
+	i = find_last_bit(bitmap, MIN_LRU_BATCH);
+	if (i == MIN_LRU_BATCH)
+		return;
+
+	start = end - (i + 1) * PAGE_SIZE;
+
+	i = find_first_bit(bitmap, MIN_LRU_BATCH);
+
+	end -= i * PAGE_SIZE;
+
+	mmu_notifier_test_clear_young(mm, start, end, false, bitmap);
+}
+
+static void skip_spte_young(struct mm_struct *mm, unsigned long addr,
+			    unsigned long *bitmap, unsigned long *last)
+{
+	if (*last > addr)
+		__clear_bit((*last - addr) / PAGE_SIZE, bitmap);
+
+	clear_spte_young(mm, addr, bitmap, last);
+}
+
 static bool suitable_to_scan(int total, int young)
 {
 	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -3328,6 +3385,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 	unsigned long addr;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+	unsigned long last = 0;
 	int total = 0;
 	int young = 0;
 	struct lru_gen_mm_walk *walk = args->private;
@@ -3346,6 +3405,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 	arch_enter_lazy_mmu_mode();
 restart:
 	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool success;
 		unsigned long pfn;
 		struct folio *folio;
 		pte_t ptent = ptep_get(pte + i);
@@ -3354,20 +3414,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
 		pfn = get_pte_pfn(ptent, args->vma, addr);
-		if (pfn == -1)
+		if (pfn == -1) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(ptent)) {
+		success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last);
+		if (!success && !pte_young(ptent)) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			walk->mm_stats[MM_LEAF_OLD]++;
 			continue;
 		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(args->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(ptent))
+			ptep_test_and_clear_young(args->vma, addr, pte + i);
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3966,6 +4033,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	}
 }
 
+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr,
+			       pte_t *pte, int *young)
+{
+	unsigned long old = true;
+
+	if (!get_cap(LRU_GEN_SPTE_WALK)) {
+		old = !pte_young(*pte);
+		*young = ptep_clear_flush_young_notify(vma, addr, pte);
+
+		return !old;
+	}
+
+	*young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old);
+
+	if (!old)
+		*young = true;
+
+	if (pte_young(*pte)) {
+		ptep_test_and_clear_young(vma, addr, pte);
+		*young = true;
+		return true;
+	}
+
+	return !old && get_cap(LRU_GEN_SPTE_WALK);
+}
+
 /******************************************************************************
  *                          rmap/PT walk feedback
  ******************************************************************************/
@@ -3977,12 +4070,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+	unsigned long last = 0;
 	int young = 0;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
@@ -3999,12 +4094,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
-	if (spin_is_contended(pvmw->ptl))
-		return;
+	if (!should_look_around(pvmw->vma, addr, pte, &young))
+		return young;
 
-	/* exclude special VMAs containing anon pages from COW */
-	if (vma->vm_flags & VM_SPECIAL)
-		return;
+	if (spin_is_contended(pvmw->ptl))
+		return young;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4012,6 +4106,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	start = max(addr & PMD_MASK, vma->vm_start);
 	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
 
+	if (end - start == PAGE_SIZE)
+		return young;
+
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4025,29 +4122,38 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return young;
 
 	arch_enter_lazy_mmu_mode();
 
 	pte -= (addr - start) / PAGE_SIZE;
 
 	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		bool success;
 		unsigned long pfn;
 		pte_t ptent = ptep_get(pte + i);
 
-		pfn = get_pte_pfn(ptent, vma, addr);
-		if (pfn == -1)
+		pfn = get_pte_pfn(ptent, pvmw->vma, addr);
+		if (pfn == -1) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!pte_young(ptent))
+		success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last);
+		if (!success && !pte_young(ptent)) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
 		folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
-		if (!folio)
+		if (!folio) {
+			skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
 			continue;
+		}
 
-		if (!ptep_test_and_clear_young(vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
+		if (pte_young(ptent))
+			ptep_test_and_clear_young(pvmw->vma, addr, pte + i);
 
 		young++;
 
@@ -4077,6 +4183,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	/* feedback from rmap walkers to page table walkers */
 	if (mm_state && suitable_to_scan(i, young))
 		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+	return young;
 }
 
 /******************************************************************************
@@ -5118,6 +5226,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (should_clear_pmd_young())
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
+	if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
+		caps |= BIT(LRU_GEN_SPTE_WALK);
+
 	return sysfs_emit(buf, "0x%04x\n", caps);
 }
 
-- 
2.43.0.429.g432eaa2c6b-goog
