blob: 439294b9034aecb7586b3aa3e67a39f34699d4ba [file] [log] [blame]
From f0a7422826e61264eedbe14bde55a19d477408bb Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 1 Apr 2023 14:14:19 -0600
Subject: [PATCH] BACKPORT: FROMLIST: mm: multi-gen LRU: use
mmu_notifier_test_clear_young()
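Use mmu_notifier_test_clear_young() to test and clear the young
(accessed) state of KVM secondary page table entries in batches of up
to MIN_LRU_BATCH pages, when the architecture supports it and the
LRU_GEN_SPTE_WALK capability is enabled.
An existing selftest can quickly demonstrate the effectiveness of this
patch. On a generic workstation equipped with 64 CPUs and 256GB DRAM: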
$ sudo max_guest_memory_test -c 64 -m 256 -s 256
MGLRU      run2
---------------
Before    ~600s
After      ~50s
Off       ~250s
kswapd (MGLRU before)
  100.00%  balance_pgdat
    100.00%  shrink_node
      100.00%  shrink_one
        99.97%  try_to_shrink_lruvec
          99.06%  evict_folios
            97.41%  shrink_folio_list
              31.33%  folio_referenced
                31.06%  rmap_walk_file
                  30.89%  folio_referenced_one
                    20.83%  __mmu_notifier_clear_flush_young
                      20.54%  kvm_mmu_notifier_clear_flush_young
=>                      19.34%  _raw_write_lock
kswapd (MGLRU after)
  100.00%  balance_pgdat
    100.00%  shrink_node
      100.00%  shrink_one
        99.97%  try_to_shrink_lruvec
          99.51%  evict_folios
            71.70%  shrink_folio_list
              7.08%  folio_referenced
                6.78%  rmap_walk_file
                  6.72%  folio_referenced_one
                    5.60%  lru_gen_look_around
=>                    1.53%  __mmu_notifier_test_clear_young
kswapd (MGLRU off)
  100.00%  balance_pgdat
    100.00%  shrink_node
      99.92%  shrink_lruvec
        69.95%  shrink_folio_list
          19.35%  folio_referenced
            18.37%  rmap_walk_file
              17.88%  folio_referenced_one
                13.20%  __mmu_notifier_clear_flush_young
                  11.64%  kvm_mmu_notifier_clear_flush_young
=>                  9.93%  _raw_write_lock
        26.23%  shrink_active_list
          25.50%  folio_referenced
            25.35%  rmap_walk_file
              25.28%  folio_referenced_one
                23.87%  __mmu_notifier_clear_flush_young
                  23.69%  kvm_mmu_notifier_clear_flush_young
=>                  18.98%  _raw_write_lock
Signed-off-by: Yu Zhao <yuzhao@google.com>
(am from https://patchwork.kernel.org/patch/13144315/)
BUG=b:266976439
UPSTREAM-TASK=b:266738578
TEST=ran crostini.VimCompile
Change-Id: I2d6c83ccad765d903d4c2d2e48ded7bd114942c8
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/4392744
Commit-Queue: Yu Zhao <yuzhao@chromium.org>
Tested-by: Yu Zhao <yuzhao@chromium.org>
Reviewed-by: T.J. Alumbaugh <talumbau@google.com>
Kcr-patch: aacc60cbb8febd8ac1fd442d810253c6228937c86a6829ca3f6d7f09.patch
---
include/linux/mmzone.h | 6 +-
mm/rmap.c | 9 +--
mm/vmscan.c | 147 ++++++++++++++++++++++++++++++++++++-----
3 files changed, 136 insertions(+), 26 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4ed33b12782151632e36aa114039cb4a0916fe06..f30d391089f743a69de67391fd963a6aed90a042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -394,6 +394,7 @@ enum {
LRU_GEN_CORE,
LRU_GEN_MM_WALK,
LRU_GEN_NONLEAF_YOUNG,
+ LRU_GEN_SPTE_WALK,
NR_LRU_GEN_CAPS
};
@@ -551,7 +552,7 @@ struct lru_gen_memcg {
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -570,8 +571,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
+ return false;
}
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
diff --git a/mm/rmap.c b/mm/rmap.c
index f5d43edad529a76858a9aab5536a755f0a50ec67..47588cb3768f4b77206b219349221c7858412368 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -867,13 +867,10 @@ static bool folio_referenced_one(struct folio *folio,
continue;
}
- if (pvmw.pte) {
- if (lru_gen_enabled() &&
- pte_young(ptep_get(pvmw.pte))) {
- lru_gen_look_around(&pvmw);
+ if (lru_gen_enabled() && pvmw.pte) {
+ if (lru_gen_look_around(&pvmw))
referenced++;
- }
-
+ } else if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte))
referenced++;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9644d7de33682875e21e8ea6bbaef6d70b9f33d0..d7126561849ad518c4dfdd8b498d182f153e32f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,10 @@
#include <linux/khugepaged.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>
+#include <linux/mmu_notifier.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3313,6 +3317,59 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
return folio;
}
+#ifndef kvm_arch_has_test_clear_young
+#define kvm_arch_has_test_clear_young() 0
+#endif
+
+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned long end,
+ unsigned long *bitmap, unsigned long *last)
+{
+ if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK))
+ return false;
+
+ if (*last > addr)
+ goto done;
+
+ *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ?
+ addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1;
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
+
+ mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap);
+done:
+ return test_bit((*last - addr) / PAGE_SIZE, bitmap);
+}
+
+static void clear_spte_young(struct mm_struct *mm, unsigned long addr,
+ unsigned long *bitmap, unsigned long *last)
+{
+ int i;
+ unsigned long start, end = *last + 1;
+
+ if (addr + PAGE_SIZE != end)
+ return;
+
+ i = find_last_bit(bitmap, MIN_LRU_BATCH);
+ if (i == MIN_LRU_BATCH)
+ return;
+
+ start = end - (i + 1) * PAGE_SIZE;
+
+ i = find_first_bit(bitmap, MIN_LRU_BATCH);
+
+ end -= i * PAGE_SIZE;
+
+ mmu_notifier_test_clear_young(mm, start, end, false, bitmap);
+}
+
+static void skip_spte_young(struct mm_struct *mm, unsigned long addr,
+ unsigned long *bitmap, unsigned long *last)
+{
+ if (*last > addr)
+ __clear_bit((*last - addr) / PAGE_SIZE, bitmap);
+
+ clear_spte_young(mm, addr, bitmap, last);
+}
+
static bool suitable_to_scan(int total, int young)
{
int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -3328,6 +3385,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+ unsigned long last = 0;
int total = 0;
int young = 0;
struct lru_gen_mm_walk *walk = args->private;
@@ -3346,6 +3405,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
arch_enter_lazy_mmu_mode();
restart:
for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ bool success;
unsigned long pfn;
struct folio *folio;
pte_t ptent = ptep_get(pte + i);
@@ -3354,20 +3414,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
pfn = get_pte_pfn(ptent, args->vma, addr);
- if (pfn == -1)
+ if (pfn == -1) {
+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
continue;
+ }
- if (!pte_young(ptent)) {
+ success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, &last);
+ if (!success && !pte_young(ptent)) {
+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
- if (!folio)
+ if (!folio) {
+ skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
continue;
+ }
- if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ clear_spte_young(args->vma->vm_mm, addr, bitmap, &last);
+ if (pte_young(ptent))
+ ptep_test_and_clear_young(args->vma, addr, pte + i);
young++;
walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3966,6 +4033,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
}
}
+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *pte, int *young)
+{
+ unsigned long old = true;
+
+ if (!get_cap(LRU_GEN_SPTE_WALK)) {
+ old = !pte_young(*pte);
+ *young = ptep_clear_flush_young_notify(vma, addr, pte);
+
+ return !old;
+ }
+
+ *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE, true, &old);
+
+ if (!old)
+ *young = true;
+
+ if (pte_young(*pte)) {
+ ptep_test_and_clear_young(vma, addr, pte);
+ *young = true;
+ return true;
+ }
+
+ return !old && get_cap(LRU_GEN_SPTE_WALK);
+}
+
/******************************************************************************
* rmap/PT walk feedback
******************************************************************************/
@@ -3977,12 +4070,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
* the PTE table to the Bloom filter. This forms a feedback loop between the
* eviction and the aging.
*/
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
int i;
unsigned long start;
unsigned long end;
struct lru_gen_mm_walk *walk;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+ unsigned long last = 0;
int young = 0;
pte_t *pte = pvmw->pte;
unsigned long addr = pvmw->address;
@@ -3999,12 +4094,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
lockdep_assert_held(pvmw->ptl);
VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
- if (spin_is_contended(pvmw->ptl))
- return;
+ if (!should_look_around(pvmw->vma, addr, pte, &young))
+ return young;
- /* exclude special VMAs containing anon pages from COW */
- if (vma->vm_flags & VM_SPECIAL)
- return;
+ if (spin_is_contended(pvmw->ptl))
+ return young;
/* avoid taking the LRU lock under the PTL when possible */
walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4012,6 +4106,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
start = max(addr & PMD_MASK, vma->vm_start);
end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
+ if (end - start == PAGE_SIZE)
+ return young;
+
if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4025,29 +4122,38 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
/* folio_update_gen() requires stable folio_memcg() */
if (!mem_cgroup_trylock_pages(memcg))
- return;
+ return young;
arch_enter_lazy_mmu_mode();
pte -= (addr - start) / PAGE_SIZE;
for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ bool success;
unsigned long pfn;
pte_t ptent = ptep_get(pte + i);
- pfn = get_pte_pfn(ptent, vma, addr);
- if (pfn == -1)
+ pfn = get_pte_pfn(ptent, pvmw->vma, addr);
+ if (pfn == -1) {
+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
continue;
+ }
- if (!pte_young(ptent))
+ success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, &last);
+ if (!success && !pte_young(ptent)) {
+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
continue;
+ }
folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
- if (!folio)
+ if (!folio) {
+ skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
continue;
+ }
- if (!ptep_test_and_clear_young(vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
+ clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
+ if (pte_young(ptent))
+ ptep_test_and_clear_young(pvmw->vma, addr, pte + i);
young++;
@@ -4077,6 +4183,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
/* feedback from rmap walkers to page table walkers */
if (mm_state && suitable_to_scan(i, young))
update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+ return young;
}
/******************************************************************************
@@ -5118,6 +5226,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c
if (should_clear_pmd_young())
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+ if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
+ caps |= BIT(LRU_GEN_SPTE_WALK);
+
return sysfs_emit(buf, "0x%04x\n", caps);
}
--
2.43.0.429.g432eaa2c6b-goog