From: Rui Salvaterra Date: Tue, 29 Mar 2022 16:10:48 +0000 (+0100) Subject: kernel: add and enable MGLRU for Linux 5.15 X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=05158082f669aeb16a85604620abf7973a5ea807;p=openwrt%2Fstaging%2Fneocturne.git kernel: add and enable MGLRU for Linux 5.15 Backport a preliminary version of Yu Zhao's multi-generational LRU, for improved memory management. Refresh the patches while at it. Signed-off-by: Rui Salvaterra --- diff --git a/target/linux/generic/config-5.15 b/target/linux/generic/config-5.15 index efeaf2c452..f188ce36ac 100644 --- a/target/linux/generic/config-5.15 +++ b/target/linux/generic/config-5.15 @@ -3195,6 +3195,9 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 # CONFIG_LPC_ICH is not set # CONFIG_LPC_SCH is not set # CONFIG_LP_CONSOLE is not set +CONFIG_LRU_GEN=y +CONFIG_LRU_GEN_ENABLED=y +# CONFIG_LRU_GEN_STATS is not set # CONFIG_LSI_ET1011C_PHY is not set CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity" CONFIG_LSM_MMAP_MIN_ADDR=65536 @@ -4388,6 +4391,7 @@ CONFIG_NMI_LOG_BUF_SHIFT=13 # CONFIG_NO_HZ is not set # CONFIG_NO_HZ_FULL is not set # CONFIG_NO_HZ_IDLE is not set +CONFIG_NR_LRU_GENS=7 # CONFIG_NS83820 is not set # CONFIG_NTB is not set # CONFIG_NTFS3_64BIT_CLUSTER is not set @@ -6480,6 +6484,7 @@ CONFIG_THIN_ARCHIVES=y # CONFIG_THUNDER_NIC_VF is not set # CONFIG_TICK_CPU_ACCOUNTING is not set CONFIG_TICK_ONESHOT=y +CONFIG_TIERS_PER_GEN=4 # CONFIG_TIFM_CORE is not set # CONFIG_TIGON3 is not set # CONFIG_TIMB_DMA is not set diff --git a/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch new file mode 100644 index 0000000000..548d8e61b2 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch @@ -0,0 +1,169 @@ +From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 4 Aug 2021 01:31:34 -0600 +Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young() + +Some architectures automatically set the accessed bit in PTEs, e.g., +x86 and arm64 v8.2. On architectures that do not have this capability, +clearing the accessed bit in a PTE triggers a page fault following the +TLB miss of this PTE. + +Being aware of this capability can help make better decisions, i.e., +whether to limit the size of each batch of PTEs and the burst of +batches when clearing the accessed bit. + +Signed-off-by: Yu Zhao +Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a +--- + arch/arm64/include/asm/cpufeature.h | 5 +++++ + arch/arm64/include/asm/pgtable.h | 13 ++++++++----- + arch/arm64/kernel/cpufeature.c | 10 ++++++++++ + arch/arm64/tools/cpucaps | 1 + + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 13 +++++++++++++ + mm/memory.c | 14 +------------- + 7 files changed, 41 insertions(+), 21 deletions(-) + +--- a/arch/arm64/include/asm/cpufeature.h ++++ b/arch/arm64/include/asm/cpufeature.h +@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); + } + ++static inline bool system_has_hw_af(void) ++{ ++ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); ++} ++ + extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); + + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru + * page after fork() + CoW for pfn mappings. We don't always have a + * hardware-managed access flag on arm64. + */ +-static inline bool arch_faults_on_old_pte(void) ++static inline bool arch_has_hw_pte_young(bool local) + { +- WARN_ON(preemptible()); ++ if (local) { ++ WARN_ON(preemptible()); ++ return cpu_has_hw_af(); ++ } + +- return !cpu_has_hw_af(); ++ return system_has_hw_af(); + } +-#define arch_faults_on_old_pte arch_faults_on_old_pte ++#define arch_has_hw_pte_young arch_has_hw_pte_young + + /* + * Experimentally, it's cheap to set the access flag in hardware and we +@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt + */ + static inline bool arch_wants_old_prefaulted_pte(void) + { +- return !arch_faults_on_old_pte(); ++ return arch_has_hw_pte_young(true); + } + #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte + +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -2184,6 +2184,16 @@ static const struct arm64_cpu_capabiliti + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, ++ { ++ .desc = "Hardware update of the Access flag", ++ .type = ARM64_CPUCAP_SYSTEM_FEATURE, ++ .capability = ARM64_HW_AF, ++ .sys_reg = SYS_ID_AA64MMFR1_EL1, ++ .sign = FTR_UNSIGNED, ++ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, ++ .min_field_value = 1, ++ .matches = has_cpuid_feature, ++ }, + #endif + { + .desc = "CRC32 instructions", +--- a/arch/arm64/tools/cpucaps ++++ b/arch/arm64/tools/cpucaps +@@ -35,6 +35,7 @@ HAS_STAGE2_FWB + HAS_SYSREG_GIC_CPUIF + HAS_TLB_RANGE + HAS_VIRT_HOST_EXTN ++HW_AF + HW_DBM + KVM_PROTECTED_MODE + MISMATCHED_CACHE_TYPE +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c + return boot_cpu_has_bug(X86_BUG_L1TF); + } + +-#define arch_faults_on_old_pte arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) ++#define arch_has_hw_pte_young arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(bool local) + { +- return false; ++ return true; + } + + #endif /* __ASSEMBLY__ */ +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_pte_young ++/* ++ * Return whether the accessed bit is supported by the local CPU or all CPUs. ++ * ++ * Those arches which have hw access flag feature need to implement their own ++ * helper. By default, "false" means pagefault will be hit on old pte. ++ */ ++static inline bool arch_has_hw_pte_young(bool local) ++{ ++ return false; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = + 2; + #endif + +-#ifndef arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* +- * Those arches which don't have hw access flag feature need to +- * implement their own helper. By default, "true" means pagefault +- * will be hit on old pte. +- */ +- return true; +-} +-#endif +- + #ifndef arch_wants_old_prefaulted_pte + static inline bool arch_wants_old_prefaulted_pte(void) + { +@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ +- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); diff --git a/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch new file mode 100644 index 0000000000..785af275f5 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch @@ -0,0 +1,111 @@ +From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sat, 26 Sep 2020 21:17:18 -0600 +Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + +Some architectures support the accessed bit on non-leaf PMD entries, +e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using +it as part of linear address translation [1]. As an optimization, page +table walkers who are interested in the accessed bit can skip the PTEs +under a non-leaf PMD entry if the accessed bit is cleared on this PMD +entry. + +Although an inline function may be preferable, this capability is +added as a configuration option to look consistent when used with the +existing macros. + +[1]: Intel 64 and IA-32 Architectures Software Developer's Manual + Volume 3 (June 2021), section 4.8 + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 +--- + arch/Kconfig | 9 +++++++++ + arch/x86/Kconfig | 1 + + arch/x86/include/asm/pgtable.h | 3 ++- + arch/x86/mm/pgtable.c | 5 ++++- + include/linux/pgtable.h | 4 ++-- + 5 files changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT + config ARCH_HAS_PARANOID_L1D_FLUSH + bool + ++config ARCH_HAS_NONLEAF_PMD_YOUNG ++ bool ++ depends on PGTABLE_LEVELS > 2 ++ help ++ Architectures that select this are able to set the accessed bit on ++ non-leaf PMD entries in addition to leaf PTE entries where pages are ++ mapped. For them, page table walkers that clear the accessed bit may ++ stop at non-leaf PMD entries if they do not see the accessed bit. ++ + source "kernel/gcov/Kconfig" + + source "scripts/gcc-plugins/Kconfig" +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -84,6 +84,7 @@ config X86 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_COPY_MC if X86_64 + select ARCH_HAS_SET_MEMORY +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad + + static inline int pmd_bad(pmd_t pmd) + { +- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != ++ (_KERNPG_TABLE & ~_PAGE_ACCESSED); + } + + static inline unsigned long pages_to_mb(unsigned long npg) +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_ + return ret; + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_ + + return ret; + } ++#endif ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) + { +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH diff --git a/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch new file mode 100644 index 0000000000..f466f1105c --- /dev/null +++ b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch @@ -0,0 +1,224 @@ +From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sun, 27 Sep 2020 20:49:08 -0600 +Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node() + +This patch refactors shrink_node(). This will make the upcoming +changes to mm/vmscan.c more readable. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 +--- + mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- + 1 file changed, 98 insertions(+), 88 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2562,6 +2562,103 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. ++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. And as the file LRU ++ * shrinks, so does the window for rotation from references. ++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined +@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +@@ -3048,93 +3144,7 @@ again: + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. +- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. +- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. +- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. +- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + diff --git a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch new file mode 100644 index 0000000000..146f510d28 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch @@ -0,0 +1,996 @@ +From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:12:33 -0700 +Subject: [PATCH 04/10] mm: multigenerational lru: groundwork + +For each lruvec, evictable pages are divided into multiple +generations. The youngest generation number is stored in +lrugen->max_seq for both anon and file types as they are aged on an +equal footing. The oldest generation numbers are stored in +lrugen->min_seq[] separately for anon and file types as clean file +pages can be evicted regardless of swap constraints. These three +variables are monotonically increasing. Generation numbers are +truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into +page->flags. The sliding window technique is used to prevent truncated +generation numbers from overlapping. Each truncated generation number +is an index to +lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. + +The framework comprises two conceptually independent components: the +aging, which produces young generations, and the eviction, which +consumes old generations. Both can be invoked independently from user +space for the purpose of working set estimation and proactive reclaim. + +The protection of hot pages and the selection of cold pages are based +on page access types and patterns. There are two access types: one via +page tables and the other via file descriptors. The protection of the +former type is by design stronger because: + 1) The uncertainty in determining the access patterns of the former + type is higher due to the coalesced nature of the accessed bit. + 2) The cost of evicting the former type is higher due to the TLB + flushes required and the likelihood of involving I/O. + 3) The penalty of under-protecting the former type is higher because + applications usually do not prepare themselves for major faults like + they do for blocked I/O. For example, client applications commonly + dedicate blocked I/O to separate threads to avoid UI janks that + negatively affect user experience. + +There are also two access patterns: one with temporal locality and the +other without. The latter pattern, e.g., random and sequential, needs +to be explicitly excluded to avoid weakening the protection of the +former pattern. Generally the former type follows the former pattern +unless MADV_SEQUENTIAL is specified and the latter type follows the +latter pattern unless outlying refaults have been observed. + +Upon faulting, a page is added to the youngest generation, which +provides the strongest protection as the eviction will not consider +this page before the aging has scanned it at least twice. The first +scan clears the accessed bit set during the initial fault. And the +second scan makes sure this page has not been used since the first +scan. A page from any other generations is brought back to the +youngest generation whenever the aging finds the accessed bit set on +any of the PTEs mapping this page. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. This is done later [PATCH 07/10]. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 +--- + fs/fuse/dev.c | 3 +- + include/linux/cgroup.h | 15 +- + include/linux/mm.h | 36 ++++ + include/linux/mm_inline.h | 182 ++++++++++++++++++++ + include/linux/mmzone.h | 70 ++++++++ + include/linux/page-flags-layout.h | 19 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 3 + + kernel/bounds.c | 3 + + kernel/cgroup/cgroup-internal.h | 1 - + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 1 + + mm/memory.c | 7 + + mm/mm_init.c | 6 +- + mm/page_alloc.c | 1 + + mm/swap.c | 9 +- + mm/swapfile.c | 2 + + mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ + 18 files changed, 618 insertions(+), 15 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -707,6 +718,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. For non-existent +@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s + loff_t const holebegin, loff_t const holelen, int even_cows) { } + #endif + ++#ifdef CONFIG_LRU_GEN ++static inline void task_enter_nonseq_fault(void) ++{ ++ WARN_ON(current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 1; ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++ WARN_ON(!current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 0; ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return current->in_nonseq_fault; ++} ++#else ++static inline void task_enter_nonseq_fault(void) ++{ ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return false; ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) + { +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++#ifdef CONFIG_LRU_GEN_ENABLED ++ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); ++ ++ return static_branch_likely(&lru_gen_static_key); ++#else ++ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); ++ ++ return static_branch_unlikely(&lru_gen_static_key); ++#endif ++} ++ ++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++/* The youngest and the second youngest generations are counted as active. */ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->evictable.max_seq; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++/* Update the sizes of the multigenerational lru lists. */ ++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_FILE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_BUG_ON(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], ++ lrugen->sizes[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], ++ lrugen->sizes[new_gen][type][zone] + delta); ++ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); ++} ++ ++/* Add a page to one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (PageUnevictable(page) || !lrugen->enabled[type]) ++ return false; ++ /* ++ * If a page shouldn't be considered for eviction, i.e., a page mapped ++ * upon fault during which the accessed bit is set, add it to the ++ * youngest generation. ++ * ++ * If a page can't be evicted immediately, i.e., an anon page not in ++ * swap cache or a dirty page pending writeback, add it to the second ++ * oldest generation. ++ * ++ * If a page could be evicted immediately, e.g., a clean page, add it to ++ * the oldest generation. ++ */ ++ if (PageActive(page)) ++ gen = lru_gen_from_seq(lrugen->max_seq); ++ else if ((!type && !PageSwapCache(page)) || ++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) ++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); ++ else ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); ++ ++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, -1, gen); ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++/* Delete a page from one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ if (!(new_flags & LRU_GEN_MASK)) ++ return false; ++ ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ ++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ /* for shrink_page_list() */ ++ if (reclaiming) ++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ++ else if (lru_gen_is_active(lruvec, gen)) ++ new_flags |= BIT(PG_active); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(page, lruvec, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,72 @@ enum lruvec_flags { + */ + }; + ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * For each lruvec, evictable pages are divided into multiple generations. The ++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are ++ * monotonically increasing. The sliding window technique is used to track at ++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the ++ * window, AKA gen, indexes an array of per-type and per-zone lists for the ++ * corresponding generation. The counter in page->flags stores gen+1 while a ++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. ++ * ++ * After a page is faulted in, the aging must check the accessed bit at least ++ * twice before the eviction would consider it. The first check clears the ++ * accessed bit set during the initial fault. The second check makes sure this ++ * page hasn't been used since then. ++ */ ++#define MIN_NR_GENS 2 ++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++ ++struct lrugen { ++ /* the aging increments the max generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the min generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; ++ /* the multigenerational lru lists */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the sizes of the multigenerational lru lists in pages */ ++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* whether the multigenerational lru is enabled */ ++ bool enabled[ANON_AND_FILE]; ++}; ++ ++#define MAX_BATCH_SIZE 8192 ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); ++void lru_gen_change_state(bool enable, bool main, bool swap); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++} ++ ++static inline void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +377,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* unevictable pages are on LRU_UNEVICTABLE */ ++ struct lrugen evictable; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -26,6 +26,14 @@ + + #define ZONES_WIDTH ZONES_SHIFT + ++#ifdef CONFIG_LRU_GEN ++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ ++#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) ++#else ++#define LRU_GEN_WIDTH 0 ++#define LRU_REFS_WIDTH 0 ++#endif /* CONFIG_LRU_GEN */ ++ + #ifdef CONFIG_SPARSEMEM + #include + #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) +@@ -55,7 +63,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +98,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,8 +109,8 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -911,6 +911,9 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ unsigned in_nonseq_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,9 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); ++#endif + /* End of constants */ + + return 0; +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are + unsigned int flags, struct pt_regs *regs) + { + vm_fault_t ret; ++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); + + __set_current_state(TASK_RUNNING); + +@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ if (nonseq_fault) ++ task_enter_nonseq_fault(); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ if (nonseq_fault) ++ task_exit_nonseq_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna + + pgdat_page_ext_init(pgdat); + lruvec_init(&pgdat->__lruvec); ++ lru_gen_init_state(NULL, &pgdat->__lruvec); + } + + static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(true, false, true); + + error = 0; + goto out; +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ if (lruvec->pgdat != pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ return pgdat ? &pgdat->__lruvec : NULL; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static int lru_gen_nr_swapfiles; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; ++} ++ ++static bool fill_lists(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ if (!lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_lists(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(). ++ */ ++void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ cgroup_lock(); ++ mutex_lock(&state_mutex); ++ ++ if (swap) { ++ if (enable) ++ swap = !lru_gen_nr_swapfiles++; ++ else ++ swap = !--lru_gen_nr_swapfiles; ++ } ++ ++ if (main && enable != lru_gen_enabled()) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } else if (!swap || !lru_gen_enabled()) ++ goto unlock; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lruvec->evictable.enabled[1] = lru_gen_enabled(); ++ ++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ cgroup_unlock(); ++ mem_hotplug_done(); ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ lru_gen_init_state(memcg, lruvec); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; diff --git a/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch new file mode 100644 index 0000000000..75fd39d99d --- /dev/null +++ b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch @@ -0,0 +1,760 @@ +From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:17:41 -0600 +Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list + +To scan PTEs for accessed pages, a mm_struct list is maintained for +each memcg. When multiple threads traverse the same memcg->mm_list, +each of them gets a unique mm_struct and therefore they can run +walk_page_range() concurrently to reach page tables of all processes +of this memcg. + +This infrastructure also provides the following optimizations: + 1) it allows walkers to skip processes that have been sleeping since + the last walk by tracking the usage of mm_struct between context + switches. + 2) it allows walkers to add interesting items they find during a + walk to a Bloom filter so that they can skip uninteresting items + during the next walk by testing whether an item is in this Bloom + filter. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 4 + + include/linux/mm_inline.h | 6 + + include/linux/mm_types.h | 75 +++++++++ + include/linux/mmzone.h | 63 +++++++ + kernel/exit.c | 1 + + kernel/fork.c | 9 + + kernel/sched/core.c | 1 + + mm/memcontrol.c | 25 +++ + mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++ + 10 files changed, 517 insertions(+) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); ++ lru_gen_activate_mm(mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -348,6 +348,10 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ struct lru_gen_mm_list mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig + return seq % MAX_NR_GENS; + } + ++/* Return a proper index regardless whether we keep stats for historical generations. */ ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -3,6 +3,7 @@ + #define _LINUX_MM_TYPES_H + + #include ++#include + + #include + #include +@@ -15,6 +16,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -580,6 +583,18 @@ struct mm_struct { + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; + #endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* the node of a global or per-memcg mm_struct list */ ++ struct list_head list; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of the owner task above */ ++ struct mem_cgroup *memcg; ++#endif ++ /* whether this mm_struct has been used since the last walk */ ++ nodemask_t nodes; ++ } lrugen; ++#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* +@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++struct lru_gen_mm_list { ++ /* a global or per-memcg mm_struct list */ ++ struct list_head fifo; ++ /* protects the list above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lrugen.list); ++#ifdef CONFIG_MEMCG ++ mm->lrugen.memcg = NULL; ++#endif ++ nodes_clear(mm->lrugen.nodes); ++} ++ ++/* Track the usage of each mm_struct so that we can skip inactive ones. */ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ ++ VM_WARN_ON(list_empty(&mm->lrugen.list)); ++ ++ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes)) ++ nodes_setall(mm->lrugen.nodes); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -318,6 +318,13 @@ struct lruvec; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* Whether to keep stats for historical generations. */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + struct lrugen { + /* the aging increments the max generation number */ + unsigned long max_seq; +@@ -333,13 +340,63 @@ struct lrugen { + bool enabled[ANON_AND_FILE]; + }; + ++enum { ++ MM_LEAF_TOTAL, /* total leaf entries */ ++ MM_LEAF_OLD, /* old leaf entries */ ++ MM_LEAF_YOUNG, /* young leaf entries */ ++ MM_NONLEAF_TOTAL, /* total non-leaf entries */ ++ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ ++ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ ++ NR_MM_STATS ++}; ++ ++/* mnemonic codes for the stats above */ ++#define MM_STAT_CODES "toydpc" ++ ++/* double buffering bloom filters */ ++#define NR_BLOOM_FILTERS 2 ++ ++struct lru_gen_mm_walk { ++ /* set to max_seq after each round of walk */ ++ unsigned long seq; ++ /* the next mm_struct on the list to walk */ ++ struct list_head *head; ++ /* the first mm_struct never walked before */ ++ struct list_head *tail; ++ /* to wait for the last walker to finish */ ++ struct wait_queue_head wait; ++ /* bloom filters flip after each round of walk */ ++ unsigned long *filters[NR_BLOOM_FILTERS]; ++ /* page table stats for debugging */ ++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; ++ /* the number of concurrent walkers */ ++ int nr_walkers; ++}; ++ ++#define MIN_BATCH_SIZE 64 + #define MAX_BATCH_SIZE 8192 + ++struct mm_walk_args { ++ struct mem_cgroup *memcg; ++ unsigned long max_seq; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++ unsigned long next_addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; ++ int node_id; ++ int swappiness; ++ int batch_size; ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ int mm_stats[NR_MM_STATS]; ++ bool use_filter; ++}; ++ + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); ++void lru_gen_free_memcg(struct mem_cgroup *memcg); + #endif + + #else /* !CONFIG_LRU_GEN */ +@@ -356,6 +413,10 @@ static inline void lru_gen_change_state( + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } ++ ++static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++} + #endif + + #endif /* CONFIG_LRU_GEN */ +@@ -380,6 +441,8 @@ struct lruvec { + #ifdef CONFIG_LRU_GEN + /* unevictable pages are on LRU_UNEVICTABLE */ + struct lrugen evictable; ++ /* state for mm list and page table walks */ ++ struct lru_gen_mm_walk mm_walk; + #endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -422,6 +422,7 @@ assign_new_owner: + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); + } + +@@ -2616,6 +2618,13 @@ pid_t kernel_clone(struct kernel_clone_a + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete and child started to run, tell ptracer */ +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_activate_mm(next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem + + static void mem_cgroup_free(struct mem_cgroup *memcg) + { ++ lru_gen_free_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); + } +@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *task = NULL; ++ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid( + } + + /****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++ static struct lru_gen_mm_list mm_list = { ++ .fifo = LIST_HEAD_INIT(mm_list.fifo), ++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), ++ }; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ return &memcg->mm_list; ++#endif ++ return &mm_list; ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); ++#ifdef CONFIG_MEMCG ++ VM_BUG_ON_MM(mm->lrugen.memcg, mm); ++ mm->lrugen.memcg = memcg; ++#endif ++ spin_lock(&mm_list->lock); ++ ++ list_add_tail(&mm->lrugen.list, &mm_list->fifo); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm_list->fifo) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; ++ } ++ ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (list_empty(&mm->lrugen.list)) ++ return; ++ ++#ifdef CONFIG_MEMCG ++ memcg = mm->lrugen.memcg; ++#endif ++ mm_list = get_mm_list(memcg); ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm->lrugen.list) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; ++ ++ if (lruvec->mm_walk.head != &mm->lrugen.list) ++ continue; ++ ++ lruvec->mm_walk.head = lruvec->mm_walk.head->next; ++ if (lruvec->mm_walk.head == &mm_list->fifo) ++ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); ++ } ++ ++ list_del_init(&mm->lrugen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lrugen.memcg); ++ mm->lrugen.memcg = NULL; ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ ++ lockdep_assert_held(&mm->owner->alloc_lock); ++ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(mm->owner); ++ rcu_read_unlock(); ++ if (memcg == mm->lrugen.memcg) ++ return; ++ ++ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); ++ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++#endif ++ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ filter = lruvec->mm_walk.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); ++ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); ++} ++ ++static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return false; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) ++{ ++ int i; ++ int hist = lru_hist_from_seq(args->max_seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], ++ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); ++ args->mm_stats[i] = 0; ++ } ++ ++ if (!last || NR_HIST_GENS == 1) ++ return; ++ ++ hist = lru_hist_from_seq(args->max_seq + 1); ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ int type; ++ unsigned long size = 0; ++ ++ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes)) ++ return true; ++ ++ if (mm_is_oom_victim(mm)) ++ return true; ++ ++ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ if (size < MIN_BATCH_SIZE) ++ return true; ++ ++ if (!mmget_not_zero(mm)) ++ return true; ++ ++ node_clear(args->node_id, mm->lrugen.nodes); ++ ++ return false; ++} ++ ++/* To support multiple walkers that concurrently walk an mm_struct list. */ ++static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, ++ struct mm_struct **iter) ++{ ++ bool first = false; ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; ++ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); ++ ++ if (*iter) ++ mmput_async(*iter); ++ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) ++ return false; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); ++ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); ++ VM_BUG_ON(*iter && !mm_walk->nr_walkers); ++ ++ if (args->max_seq <= mm_walk->seq) { ++ if (!*iter) ++ last = false; ++ goto done; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) { ++ VM_BUG_ON(mm_walk->nr_walkers); ++ mm_walk->head = mm_walk->head->next; ++ first = true; ++ } ++ ++ while (!mm && mm_walk->head != &mm_list->fifo) { ++ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); ++ ++ mm_walk->head = mm_walk->head->next; ++ ++ if (mm_walk->tail == &mm->lrugen.list) { ++ mm_walk->tail = mm_walk->tail->next; ++ args->use_filter = false; ++ } ++ ++ if (should_skip_mm(mm, args)) ++ mm = NULL; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) ++ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); ++done: ++ if (*iter && !mm) ++ mm_walk->nr_walkers--; ++ if (!*iter && mm) ++ mm_walk->nr_walkers++; ++ ++ if (mm_walk->nr_walkers) ++ last = false; ++ ++ if (mm && first) ++ clear_bloom_filter(lruvec, args->max_seq + 1); ++ ++ if (*iter || last) ++ reset_mm_stats(lruvec, last, args); ++ ++ spin_unlock(&mm_list->lock); ++ ++ *iter = mm; ++ ++ return last; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou + int i; + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; +@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_lock(&mm_list->lock); ++ ++ lruvec->mm_walk.seq = MIN_NR_GENS; ++ lruvec->mm_walk.head = &mm_list->fifo; ++ lruvec->mm_walk.tail = &mm_list->fifo; ++ init_waitqueue_head(&lruvec->mm_walk.wait); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_unlock(&mm_list->lock); + } + + #ifdef CONFIG_MEMCG +@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou + { + int nid; + ++ INIT_LIST_HEAD(&memcg->mm_list.fifo); ++ spin_lock_init(&memcg->mm_list.lock); ++ + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + lru_gen_init_state(memcg, lruvec); + } + } ++ ++void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ int i; ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ for (i = 0; i < NR_BLOOM_FILTERS; i++) { ++ bitmap_free(lruvec->mm_walk.filters[i]); ++ lruvec->mm_walk.filters[i] = NULL; ++ } ++ } ++} + #endif + + static int __init init_lru_gen(void) + { + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + + return 0; + }; diff --git a/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch new file mode 100644 index 0000000000..e5622ccbbe --- /dev/null +++ b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch @@ -0,0 +1,1176 @@ +From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:35:07 -0600 +Subject: [PATCH 06/10] mm: multigenerational lru: aging + +The aging produces young generations. Given an lruvec, the aging +traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan +PTEs for accessed pages. Upon finding one, the aging updates its +generation number to max_seq (modulo MAX_NR_GENS). After each round of +traversal, the aging increments max_seq. The aging is due when +min_seq[] reaches max_seq-1. + +The aging uses the following optimizations when walking page tables: + 1) It skips non-leaf PMD entries that have the accessed bit cleared + when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. + 2) It does not zigzag between a PGD table and the same PMD or PTE + table spanning multiple VMAs. In other words, it finishes all the + VMAs within the range of the same PMD or PTE table before it returns + to this PGD table. This optimizes workloads that have large numbers + of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45 +--- + include/linux/memcontrol.h | 3 + + include/linux/mmzone.h | 9 + + include/linux/oom.h | 16 + + include/linux/swap.h | 3 + + mm/memcontrol.c | 5 + + mm/oom_kill.c | 4 +- + mm/rmap.c | 8 + + mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++ + 8 files changed, 994 insertions(+), 2 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_ + + static inline void lock_page_memcg(struct page *page) + { ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); + } + + static inline void unlock_page_memcg(struct page *page) + { ++ rcu_read_unlock(); + } + + static inline void mem_cgroup_handle_over_high(void) +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -295,6 +295,7 @@ enum lruvec_flags { + }; + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -393,6 +394,7 @@ struct mm_walk_args { + + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -409,6 +411,10 @@ static inline void lru_gen_change_state( + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +@@ -1028,6 +1034,9 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args mm_walk_args; ++#endif + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +--- a/include/linux/oom.h ++++ b/include/linux/oom.h +@@ -57,6 +57,22 @@ struct oom_control { + extern struct mutex oom_lock; + extern struct mutex oom_adj_mutex; + ++#ifdef CONFIG_MMU ++extern struct task_struct *oom_reaper_list; ++extern struct wait_queue_head oom_reaper_wait; ++ ++static inline bool oom_reaping_in_progress(void) ++{ ++ /* racy check to see if oom reaping could be in progress */ ++ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); ++} ++#else ++static inline bool oom_reaping_in_progress(void) ++{ ++ return false; ++} ++#endif ++ + static inline void set_current_oom_origin(void) + { + current->signal->oom_flag_origin = true; +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -137,6 +137,9 @@ union swap_header { + */ + struct reclaim_state { + unsigned long reclaimed_slab; ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args *mm_walk_args; ++#endif + }; + + #ifdef __KERNEL__ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l + *lru_size += nr_pages; + + size = *lru_size; ++#ifdef CONFIG_LRU_GEN ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); ++#else + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { + VM_BUG_ON(1); + *lru_size = 0; + } ++#endif + + if (nr_pages > 0) + *lru_size += nr_pages; +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc + * victim (if that is possible) to help the OOM killer to move on. + */ + static struct task_struct *oom_reaper_th; +-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +-static struct task_struct *oom_reaper_list; ++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); ++struct task_struct *oom_reaper_list; + static DEFINE_SPINLOCK(oom_reaper_lock); + + bool __oom_reap_task_mm(struct mm_struct *mm) +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -73,6 +73,7 @@ + #include + #include + #include ++#include + + #include + +@@ -790,6 +791,13 @@ static bool page_referenced_one(struct p + } + + if (pvmw.pte) { ++ /* the multigenerational lru exploits the spatial locality */ ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & VM_SEQ_READ)) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -51,6 +51,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg + * shorthand helpers + ******************************************************************************/ + ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->evictable.min_seq[0]), \ ++ READ_ONCE((lruvec)->evictable.min_seq[1]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int get_swappiness(struct mem_cgroup *memcg) ++{ ++ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? ++ mem_cgroup_swappiness(memcg) : 0; ++} ++ + static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -3229,6 +3246,926 @@ done: + } + + /****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long old_flags, new_flags; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & LRU_GEN_MASK)) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int old_gen, new_gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); ++ ++ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has updated this page? */ ++ if (new_gen >= 0 && new_gen != old_gen) { ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ return; ++ } ++ ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for end_page_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, old_gen, new_gen); ++ if (reclaiming) ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ else ++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++} ++ ++static void update_batch_size(struct page *page, int old_gen, int new_gen, ++ struct mm_walk_args *args) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON(old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen >= MAX_NR_GENS); ++ ++ args->batch_size++; ++ ++ args->nr_pages[old_gen][type][zone] -= delta; ++ args->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ args->batch_size = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_FILE; ++ int delta = args->nr_pages[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ args->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->sizes[gen][type][zone], ++ lrugen->sizes[gen][type][zone] + delta); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ } ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = walk->vma; ++ struct mm_walk_args *args = walk->private; ++ ++ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || ++ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !args->swappiness; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (!mapping->a_ops->writepage) ++ return true; ++ ++ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); ++} ++ ++/* ++ * Some userspace memory allocators create many single-page VMAs. So instead of ++ * returning back to the PGD table for each of such VMAs, we finish at least an ++ * entire PMD table and therefore avoid many zigzags. ++ */ ++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, ++ unsigned long *start, unsigned long *end) ++{ ++ unsigned long next = round_up(*end, size); ++ ++ VM_BUG_ON(mask & size); ++ VM_BUG_ON(*start >= *end); ++ VM_BUG_ON((next & mask) != (*start & mask)); ++ ++ while (walk->vma) { ++ if (next >= walk->vma->vm_end) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ if ((next & mask) != (walk->vma->vm_start & mask)) ++ return false; ++ ++ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ *start = max(next, walk->vma->vm_start); ++ next = (next | ~mask) + 1; ++ /* rounded-up boundaries can wrap to 0 */ ++ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int worth = 0; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pmd_leaf(*pmd)); ++ ++ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); ++ arch_enter_lazy_mmu_mode(); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ struct page *page; ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); ++ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ ++ worth++; ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) ++ goto restart; ++ ++ arch_leave_lazy_mmu_mode(); ++ pte_unmap_unlock(pte, ptl); ++ ++ return worth >= MIN_BATCH_SIZE / 2; ++} ++ ++/* ++ * We scan PMD entries in two passes. The first pass reaches to PTE tables and ++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD ++ * entries and needs to take the PMD lock. ++ */ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ start = (start & PUD_MASK) + offset * PMD_SIZE; ++ pmd = pmd_offset(pud, start); ++ ptl = pmd_lock(walk->mm, pmd); ++ arch_enter_lazy_mmu_mode(); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { ++ struct page *page; ++ unsigned long pfn = pmd_pfn(pmd[i]); ++ unsigned long addr = start + i * PMD_SIZE; ++ ++ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) ++ continue; ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) ++ continue; ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ pmdp_test_and_clear_young(vma, addr, pmd + i); ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = pfn_to_page(pfn); ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pmd_dirty(pmd[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ int offset = -1; ++ bool reset = false; ++ struct mm_walk_args *args = walk->private; ++ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ vma = walk->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val)) { ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (is_huge_zero_pmd(val)) ++ continue; ++ ++ if (!pmd_young(val)) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ } ++ __set_bit(i - offset, args->bitmap); ++ reset = true; ++ continue; ++ } ++#endif ++ args->mm_stats[MM_NONLEAF_TOTAL]++; ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if (!pmd_young(val)) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ reset = false; ++ } ++ __set_bit(i - offset, args->bitmap); ++#endif ++ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_PREV]++; ++ ++ if (!walk_pte_range(&val, addr, next, walk)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_CUR]++; ++ ++ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); ++ } ++ ++ if (reset) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = -1; ++ reset = false; ++ } ++ ++ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) ++ goto restart; ++ ++ if (offset >= 0) ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, walk); ++ ++ if (args->batch_size >= MAX_BATCH_SIZE) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ /* rounded-up boundaries can wrap to 0 */ ++ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ ++ args->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ unsigned long start = args->next_addr; ++ unsigned long end = mm->highest_vm_end; ++ ++ err = -EBUSY; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_MEMCG ++ if (args->memcg && atomic_read(&args->memcg->moving_account)) ++ goto contended; ++#endif ++ if (!mmap_read_trylock(mm)) ++ goto contended; ++ ++ err = walk_page_range(mm, start, end, &mm_walk_ops, args); ++ ++ mmap_read_unlock(mm); ++ ++ if (args->batch_size) { ++ spin_lock_irq(&lruvec->lru_lock); ++ reset_batch_size(lruvec, args); ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++contended: ++ rcu_read_unlock(); ++ ++ cond_resched(); ++ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); ++} ++ ++static struct mm_walk_args *alloc_mm_walk_args(void) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); ++ ++ return current->reclaim_state->mm_walk_args; ++} ++ ++static void free_mm_walk_args(struct mm_walk_args *args) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ kvfree(args); ++} ++ ++static bool inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ int gen, zone; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ return true; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ page_inc_gen(page, lruvec, false); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lrugen *lrugen = &lruvec->evictable; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ min_seq[0] = min(min_seq[0], min_seq[1]); ++ if (swappiness) ++ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ if (!try_to_inc_min_seq(lruvec, true)) { ++ for (type = ANON_AND_FILE - 1; type >= 0; type--) { ++ while (!inc_min_seq(lruvec, type)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq - 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq + 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ } ++ ++ WRITE_ONCE(lrugen->timestamps[gen], jiffies); ++ /* make sure all preceding modifications appear first */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++/* Main function used by the foreground, the background and the user-triggered aging. */ ++static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, bool use_filter) ++{ ++ bool last; ++ struct mm_walk_args *args; ++ struct mm_struct *mm = NULL; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ int nid = pgdat->node_id; ++ ++ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* ++ * If we are not from run_aging() and clearing the accessed bit may ++ * trigger page faults, then don't proceed to clearing all accessed ++ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a ++ * handful of accessed PTEs. This is less efficient but causes fewer ++ * page faults on CPUs that don't have the capability. ++ */ ++ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { ++ inc_max_seq(lruvec, max_seq); ++ return true; ++ } ++ ++ args = alloc_mm_walk_args(); ++ if (!args) ++ return false; ++ ++ args->memcg = memcg; ++ args->max_seq = max_seq; ++ args->start_pfn = pgdat->node_start_pfn; ++ args->end_pfn = pgdat_end_pfn(pgdat); ++ args->node_id = nid; ++ args->swappiness = swappiness; ++ args->use_filter = use_filter; ++ ++ do { ++ last = get_next_mm(lruvec, args, &mm); ++ if (mm) ++ walk_mm(lruvec, mm, args); ++ ++ cond_resched(); ++ } while (mm); ++ ++ free_mm_walk_args(args); ++ ++ if (!last) { ++ /* don't wait unless we may have trouble reclaiming */ ++ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) ++ wait_event_killable(lruvec->mm_walk.wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, max_seq); ++ /* either we see any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&lruvec->mm_walk.wait)) ++ wake_up_all(&lruvec->mm_walk.wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ ++static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, unsigned long *min_seq, bool *low) ++{ ++ int gen, type, zone; ++ long max = 0; ++ long min = 0; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (type = !swappiness; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone <= sc->reclaim_idx; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ max += size; ++ if (type && max_seq - seq >= MIN_NR_GENS) ++ min += size; ++ } ++ } ++ ++ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; ++ ++ return max > 0 ? max : 0; ++} ++ ++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int swappiness = get_swappiness(memcg); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return false; ++ ++ if (min_ttl) { ++ int gen = lru_gen_from_seq(min_seq[1]); ++ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); ++ ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return false; ++ ++ nr_to_scan >>= sc->priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); ++ ++ return true; ++} ++ ++/* Protect the working set accessed within the last N milliseconds. */ ++static unsigned long lru_gen_min_ttl __read_mostly; ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ bool success = false; ++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); ++ ++ VM_BUG_ON(!current_is_kswapd()); ++ ++ if (!sc->force_deactivate) { ++ sc->force_deactivate = 1; ++ return; ++ } ++ ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ if (age_lruvec(lruvec, sc, min_ttl)) ++ success = true; ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ if (!success && mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ .order = sc->order, ++ }; ++ ++ /* to avoid overkilling */ ++ if (!oom_reaping_in_progress()) ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } ++ ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ struct page *page; ++ int old_gen, new_gen; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ struct mm_walk_args *args; ++ int worth = 0; ++ struct mem_cgroup *memcg = page_memcg(pvmw->page); ++ struct pglist_data *pgdat = page_pgdat(pvmw->page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (!args) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ end = start + MIN_BATCH_SIZE * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ start = end - MIN_BATCH_SIZE * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ new_gen = lru_gen_from_seq(max_seq); ++ ++ lock_page_memcg(pvmw->page); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ worth++; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ continue; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ __set_bit(i, args->bitmap); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ unlock_page_memcg(pvmw->page); ++ ++ if (worth >= MIN_BATCH_SIZE / 2) ++ set_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) ++ set_page_dirty(pte_page(pte[i])); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + diff --git a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch new file mode 100644 index 0000000000..09e4315dcc --- /dev/null +++ b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch @@ -0,0 +1,1002 @@ +From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 29 Jun 2021 20:46:47 -0600 +Subject: [PATCH 07/10] mm: multigenerational lru: eviction + +The eviction consumes old generations. Given an lruvec, the eviction +scans pages on lrugen->lists indexed by anon and file min_seq[] +(modulo MAX_NR_GENS). It first tries to select a type based on the +values of min_seq[]. If they are equal, it selects the type that has +a lower refaulted %. The eviction sorts a page according to its +updated generation number if the aging has found this page accessed. +It also moves a page to the next generation if this page is from an +upper tier that has a higher refaulted % than the base tier. The +eviction increments min_seq[] of a selected type when it finds +lrugen->lists indexed by min_seq[] of this selected type are empty. + +Each generation is divided into multiple tiers. Tiers represent +different ranges of numbers of accesses from file descriptors only. +Pages accessed N times via file descriptors belong to tier +order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, +and they require additional MAX_NR_TIERS-2 bits in page->flags. In +contrast to moving between generations which requires list operations, +moving between tiers only involves operations on page->flags and +therefore has a negligible cost. A feedback loop modeled after the PID +controller monitors refaulted % across all tiers and decides when to +protect pages from which tiers. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. Each tier keeps track of how many +pages from it have refaulted. Tier 0 is the base tier and pages from +it are evicted unconditionally because there are no better candidates. +Pages from an upper tier are either evicted or moved to the next +generation, depending on whether this upper tier has a higher +refaulted % than the base tier. This model has the following +advantages: + 1) It removes the cost in the buffered access path and reduces the + overall cost of protection because pages are conditionally protected + in the reclaim path. + 2) It takes mapped pages into account and avoids overprotecting + pages accessed multiple times via file descriptors. + 3 Additional tiers improve the protection of pages accessed more + than twice. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac +--- + include/linux/mm_inline.h | 10 + + include/linux/mmzone.h | 33 +++ + mm/swap.c | 42 +++ + mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- + mm/workingset.c | 120 ++++++++- + 5 files changed, 757 insertions(+), 3 deletions(-) + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi + return seq % NR_HIST_GENS; + } + ++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); ++ ++ return order_base_2(refs + 1); ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; ++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* ++ * Each generation is divided into multiple tiers. Tiers represent different ++ * ranges of numbers of accesses from file descriptors, i.e., ++ * mark_page_accessed(). In contrast to moving between generations which ++ * requires the lru lock, moving between tiers only involves an atomic ++ * operation on page->flags and therefore has a negligible cost. ++ * ++ * The purposes of tiers are to: ++ * 1) estimate whether pages accessed multiple times via file descriptors are ++ * more active than pages accessed only via page tables by separating the two ++ * access types into upper tiers and the base tier, and comparing refaulted % ++ * across all tiers. ++ * 2) improve buffered io performance by deferring the protection of pages ++ * accessed multiple times until the eviction. That is the protection happens ++ * in the reclaim path, not the access path. ++ * ++ * Pages accessed N times via file descriptors belong to tier order_base_2(N). ++ * The base tier may be marked by PageReferenced(). All upper tiers are marked ++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are ++ * used to support more than one upper tier. ++ */ ++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ + /* Whether to keep stats for historical generations. */ + #ifdef CONFIG_LRU_GEN_STATS + #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +@@ -337,6 +361,15 @@ struct lrugen { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the multigenerational lru lists in pages */ + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of protected+evicted */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the base tier isn't protected, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* incremented without holding the lru lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multigenerational lru is enabled */ + bool enabled[ANON_AND_FILE]; + }; +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long refs; ++ unsigned long old_flags, new_flags; ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & BIT(PG_referenced))) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ if (!(new_flags & BIT(PG_workingset))) { ++ new_flags |= BIT(PG_workingset); ++ continue; ++ } ++ ++ refs = new_flags & LRU_REFS_MASK; ++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); ++ ++ new_flags &= ~LRU_REFS_MASK; ++ new_flags |= refs; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before page_memcg() is cleared */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -1410,6 +1412,11 @@ retry: + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* lru_gen_look_around() has updated this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int page_lru_tier(struct page *page) ++{ ++ int refs; ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? ++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; ++ ++ return lru_tier_from_refs(refs); ++} ++ + static int get_swappiness(struct mem_cgroup *memcg) + { + return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? +@@ -3246,6 +3267,91 @@ done: + } + + /****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop modeled after the PID controller. Currently supports the ++ * proportional (P) and the integral (I) terms; the derivative (D) term can be ++ * added if necessary. The setpoint (SP) is the desired position; the process ++ * variable (PV) is the measured position. The error is the difference between ++ * the SP and the PV. A positive error results in a positive control output ++ * correction, which, in our case, is to allow eviction. ++ * ++ * The P term is refaulted % of the current generation being evicted. The I ++ * term is the exponential moving average of refaulted % of previously evicted ++ * generations, using the smoothing factor 1/2. ++ * ++ * Our goal is to maintain proportional refaulted % across all tiers. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lrugen *lrugen = &lruvec->evictable; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) ++{ ++ int tier; ++ int hist = lru_hist_from_seq(gen); ++ struct lrugen *lrugen = &lruvec->evictable; ++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ ++ if (!carryover && !clear) ++ return; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Allow eviction if the PV has a limited number of refaulted pages or a ++ * lower refaulted % than the SP. ++ */ ++ return pv->refaulted < MIN_BATCH_SIZE || ++ pv->refaulted * max(sp->total, 1UL) * sp->gain <= ++ sp->refaulted * max(pv->total, 1UL) * pv->gain; ++} ++ ++/****************************************************************************** + * the aging + ******************************************************************************/ + +@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page * + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + +@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); +@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l + } + } + ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +@@ -3824,6 +3933,8 @@ next: + if (min_seq[type] == lrugen->min_seq[type]) + continue; + ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } +@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l + } + } + ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, gen, type); ++ + WRITE_ONCE(lrugen->timestamps[gen], jiffies); + /* make sure all preceding modifications appear first */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma + } + + /****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int tier = page_lru_tier(page); ++ int delta = thp_nr_pages(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* an mlocked page? */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* a lazy-free page that has been written into? */ ++ if (type && PageDirty(page) && PageAnon(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* page_update_gen() has updated this page? */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ /* protect this page if its tier has a higher refaulted % */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(gen); ++ ++ page_inc_gen(page, lruvec, false); ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* mark this page for reclaim if it's pending writeback */ ++ if (PageWriteback(page) || (type && PageDirty(page))) { ++ page_inc_gen(page, lruvec, true); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_BUG_ON(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ scanned += delta; ++ ++ if (sort_page(page, lruvec, tier)) ++ sorted += delta; ++ else if (isolate_page(page, lruvec, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * We may have trouble finding eligible pages due to reclaim_idx, ++ * may_unmap and may_writepage. Check `remaining` to make sure we won't ++ * be stuck if we aren't making enough progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * Ideally we don't want to evict upper tiers that have higher refaulted ++ * %. However, we need to leave a margin for the fluctuation in ++ * refaulted %. So we use a larger gain factor to make sure upper tiers ++ * are indeed more active. We choose 2 because the lowest upper tier ++ * would have twice of refaulted % of the base tier, according to their ++ * numbers of accesses. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare refaulted % between the base tiers of anon and file to ++ * determine which type to evict. Also need to compare refaulted % of ++ * the upper tiers of the selected type with that of the base tier of ++ * the other type to determine which tier of the selected type to evict. ++ */ ++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); ++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ /* ++ * Try to select a type based on generations and swappiness, and if that ++ * fails, fall back to get_type_to_scan(). When anon and file are both ++ * available from the same generation, swappiness 200 is interpreted as ++ * anon first and swappiness 1 is interpreted as file first. ++ */ ++ if (!swappiness) ++ type = 1; ++ else if (min_seq[0] < min_seq[1]) ++ type = 0; ++ else if (swappiness == 1) ++ type = 1; ++ else if (swappiness == 200) ++ type = 0; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++/* Main function used by the foreground, the background and the user-triggered eviction. */ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mm_walk_args *args; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ if (try_to_inc_min_seq(lruvec, swappiness)) ++ scanned++; ++ ++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ /* ++ * We need to prevent rejected pages from being added back to the same ++ * lists they were isolated from. Otherwise we may risk looping on them ++ * forever. ++ */ ++ list_for_each_entry(page, &list, lru) { ++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) ++ SetPageActive(page); ++ ++ ClearPageReferenced(page); ++ ClearPageWorkingset(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (args && args->batch_size) ++ reset_batch_size(lruvec, args); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int priority = sc->priority; ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ++ priority = DEF_PRIORITY; ++ sc->force_deactivate = 0; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return 0; ++ ++ nr_to_scan >>= priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (!nr_to_scan) ++ return 0; ++ ++ if (current_is_kswapd()) { ++ /* leave the work to lru_gen_age_node() */ ++ if (max_seq - min_seq[1] < MIN_NR_GENS) ++ return 0; ++ ++ if (!low) ++ sc->force_deactivate = 0; ++ ++ return nr_to_scan; ++ } ++ ++ if (max_seq - min_seq[1] >= MIN_NR_GENS) ++ return nr_to_scan; ++ ++ /* move onto slab and other memcgs if we haven't tried them all */ ++ if (!sc->force_deactivate) { ++ sc->skipped_deactivate = 1; ++ return 0; ++ } ++ ++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ long scanned = 0; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ lru_add_drain(); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(memcg); ++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli + { + } + ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ /* see the comment on MAX_NR_TIERS */ ++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; ++} ++ ++/* Return a token to be stored in the shadow entry of a page being evicted. */ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int refs = page_lru_refs(page); ++ int delta = thp_nr_pages(page); ++ bool workingset = PageWorkingset(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | refs; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); ++} ++ ++/* Count a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ if (page_pgdat(page) != pgdat) ++ return; ++ ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; ++ ++ refs = token & (BIT(LRU_REFS_WIDTH) - 1); ++ if (refs && !workingset) ++ goto unlock; ++ ++ token >>= LRU_REFS_WIDTH; ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Tiers don't offer any protection to pages accessed via page tables. ++ * That's what generations do. Tiers can't fully protect pages after ++ * their numbers of accesses has exceeded the max value. Conservatively ++ * count these two conditions as stalls even though they might not ++ * indicate any real memory pressure. ++ */ ++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* diff --git a/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch new file mode 100644 index 0000000000..a1a749fc38 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch @@ -0,0 +1,496 @@ +From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:38:02 -0700 +Subject: [PATCH 08/10] mm: multigenerational lru: user interface + +Add /sys/kernel/mm/lru_gen/enabled to enable and disable the +multigenerational lru at runtime. + +Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a +given number of milliseconds. The OOM killer is invoked if this +working set cannot be kept in memory. + +Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and +invoke the aging and the eviction. This file has the following output: + memcg memcg_id memcg_path + node node_id + min_gen birth_time anon_size file_size + ... + max_gen birth_time anon_size file_size + +min_gen is the oldest generation number and max_gen is the youngest +generation number. birth_time is in milliseconds. anon_size and +file_size are in pages. + +This file takes the following input: + + memcg_id node_id max_gen [swappiness] [use_bloom_filter] + - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] + +The first command line invokes the aging, which scans PTEs for +accessed pages and then creates the next generation max_gen+1. A swap +file and a non-zero swappiness, which overrides vm.swappiness, are +required to scan PTEs mapping anon pages. The second command line +invokes the eviction, which evicts generations less than or equal to +min_gen. min_gen should be less than max_gen-1 as max_gen and +max_gen-1 are not fully aged and therefore cannot be evicted. +Setting nr_to_reclaim to N limits the number of pages to evict. +Setting use_bloom_filter to 0 overrides the default behavior which +only scans PTE tables found populated. Multiple command lines are +supported, as is concatenation with delimiters "," and ";". + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3 +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 416 insertions(+) + +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -485,6 +485,7 @@ static inline int num_node_state(enum no + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -53,6 +53,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -4882,6 +4884,413 @@ unlock: + } + + /****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 10, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ ++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); ++} ++ ++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ bool enable; ++ ++ if (kstrtobool(buf, &enable)) ++ return -EINVAL; ++ ++ lru_gen_change_state(enable, true, false); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enable, store_enable ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return get_lruvec(nid, memcg); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return get_lruvec(nid, memcg); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ ++ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); ++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ ++ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); ++ } else ++ seq_puts(m, " 0 0 0 "); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ if (seq == max_seq && NR_HIST_GENS == 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ toupper(MM_STAT_CODES[i])); ++ else if (seq != max_seq && NR_HIST_GENS > 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ MM_STAT_CODES[i]); ++ else ++ seq_puts(m, " 0 "); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lrugen *lrugen = &lruvec->evictable; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min_seq[0]; ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int gen, type, zone; ++ unsigned int msecs; ++ ++ gen = lru_gen_from_seq(seq); ++ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); ++ ++ seq_printf(m, " %10lu %10u", seq, msecs); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ long size = 0; ++ ++ if (seq < min_seq[type]) { ++ seq_puts(m, " -0 "); ++ continue; ++ } ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ seq_printf(m, " %10lu ", max(size, 0L)); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, bool use_filter) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq == max_seq) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); ++ ++ return seq > max_seq ? -EINVAL : 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, unsigned long nr_to_reclaim) ++{ ++ struct blk_plug plug; ++ int err = -EINTR; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq >= max_seq - 1) ++ return -EINVAL; ++ ++ sc->nr_reclaimed = 0; ++ ++ blk_start_plug(&plug); ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || ++ !evict_pages(lruvec, sc, swappiness)) { ++ err = 0; ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ return err; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, ++ int swappiness, unsigned long seq, unsigned long opt) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ goto done; ++ } ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ goto done; ++ ++ lruvec = get_lruvec(nid, memcg); ++ ++ if (swappiness < 0) ++ swappiness = get_swappiness(memcg); ++ else if (swappiness > 200) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, sc, swappiness, seq, opt); ++ break; ++ case '-': ++ err = run_eviction(lruvec, sc, swappiness, seq, opt); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ unsigned int flags; ++ int err = 0; ++ struct scan_control sc = { ++ .may_writepage = 1, ++ .may_unmap = 1, ++ .may_swap = 1, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ buf = kvmalloc(len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); ++ if (!sc.reclaim_state.mm_walk_args) { ++ kvfree(buf); ++ return -ENOMEM; ++ } ++ ++ flags = memalloc_noreclaim_save(); ++ set_task_reclaim_state(current, &sc.reclaim_state); ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long opt = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &opt, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); ++ if (err) ++ break; ++ } ++ ++ set_task_reclaim_state(current, NULL); ++ memalloc_noreclaim_restore(flags); ++ ++ free_mm_walk_args(sc.reclaim_state.mm_walk_args); ++ kvfree(buf); ++ ++ return err ? : len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++/****************************************************************************** + * initialization + ******************************************************************************/ + +@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + late_initcall(init_lru_gen); diff --git a/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch new file mode 100644 index 0000000000..4462549f99 --- /dev/null +++ b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch @@ -0,0 +1,80 @@ +From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:47:24 -0700 +Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig + +Add configuration options for the multigenerational lru. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635 +--- + mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 59 insertions(+) + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -899,4 +899,63 @@ config SECRETMEM + + source "mm/damon/Kconfig" + ++# the multigenerational lru { ++config LRU_GEN ++ bool "Multigenerational LRU" ++ depends on MMU ++ # the following options may leave not enough spare bits in page->flags ++ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) ++ help ++ A high performance LRU implementation to heavily overcommit workloads ++ that are not IO bound. See Documentation/vm/multigen_lru.rst for ++ details. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces a small per-process and per-memcg and per-node memory ++ overhead. ++ ++config LRU_GEN_ENABLED ++ bool "Turn on by default" ++ depends on LRU_GEN ++ help ++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option ++ changes it to 1. ++ ++ Warning: the default value is the fast path. See ++ Documentation/static-keys.txt for details. ++ ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ This option keeps full stats for each generation, which can be read ++ from /sys/kernel/debug/lru_gen_full. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces an additional small per-process and per-memcg and ++ per-node memory overhead. ++ ++config NR_LRU_GENS ++ int "Max number of generations" ++ depends on LRU_GEN ++ range 4 31 ++ default 7 ++ help ++ This will use order_base_2(N+1) spare bits from page flags. ++ ++ Warning: do not use numbers larger than necessary because each ++ generation introduces a small per-node and per-memcg memory overhead. ++ ++config TIERS_PER_GEN ++ int "Number of tiers per generation" ++ depends on LRU_GEN ++ range 2 5 ++ default 4 ++ help ++ This will use N-2 spare bits from page flags. ++ ++ Larger values generally offer better protection to active pages under ++ heavy buffered I/O workloads. ++# } ++ + endmenu diff --git a/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch new file mode 100644 index 0000000000..f4716fb68d --- /dev/null +++ b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch @@ -0,0 +1,161 @@ +From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 2 Feb 2021 01:27:45 -0700 +Subject: [PATCH 10/10] mm: multigenerational lru: documentation + +Add Documentation/vm/multigen_lru.rst. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e +--- + Documentation/vm/index.rst | 1 + + Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++ + 2 files changed, 133 insertions(+) + create mode 100644 Documentation/vm/multigen_lru.rst + +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -17,6 +17,7 @@ various features of the Linux memory man + + swap_numa + zswap ++ multigen_lru + + Kernel developers MM documentation + ================================== +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,132 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++===================== ++Multigenerational LRU ++===================== ++ ++Quick Start ++=========== ++Build Configurations ++-------------------- ++:Required: Set ``CONFIG_LRU_GEN=y``. ++ ++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by ++ default. ++ ++Runtime Configurations ++---------------------- ++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the ++ feature was not turned on by default. ++ ++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to ++ protect the working set of ``N`` milliseconds. The OOM killer is ++ invoked if this working set cannot be kept in memory. ++ ++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature ++ is turned on. This file has the following output: ++ ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen birth_time anon_size file_size ++ ... ++ max_gen birth_time anon_size file_size ++ ++``min_gen`` is the oldest generation number and ``max_gen`` is the ++youngest generation number. ``birth_time`` is in milliseconds. ++``anon_size`` and ``file_size`` are in pages. ++ ++Phones/Laptops/Workstations ++--------------------------- ++No additional configurations required. ++ ++Servers/Data Centers ++-------------------- ++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a ++ larger number. ++ ++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger ++ number. ++ ++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. ++ ++:Working set estimation: Write ``+ memcg_id node_id max_gen ++ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to ++ invoke the aging, which scans PTEs for accessed pages and then ++ creates the next generation ``max_gen+1``. A swap file and a non-zero ++ ``swappiness``, which overrides ``vm.swappiness``, are required to ++ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to ++ override the default behavior which only scans PTE tables found ++ populated. ++ ++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] ++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the ++ eviction, which evicts generations less than or equal to ``min_gen``. ++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and ++ ``max_gen-1`` are not fully aged and therefore cannot be evicted. ++ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple ++ command lines are supported, so does concatenation with delimiters ++ ``,`` and ``;``. ++ ++Framework ++========= ++For each ``lruvec``, evictable pages are divided into multiple ++generations. The youngest generation number is stored in ++``lrugen->max_seq`` for both anon and file types as they are aged on ++an equal footing. The oldest generation numbers are stored in ++``lrugen->min_seq[]`` separately for anon and file types as clean ++file pages can be evicted regardless of swap and writeback ++constraints. These three variables are monotonically increasing. ++Generation numbers are truncated into ++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into ++``page->flags``. The sliding window technique is used to prevent ++truncated generation numbers from overlapping. Each truncated ++generation number is an index to an array of per-type and per-zone ++lists ``lrugen->lists``. ++ ++Each generation is divided into multiple tiers. Tiers represent ++different ranges of numbers of accesses from file descriptors only. ++Pages accessed ``N`` times via file descriptors belong to tier ++``order_base_2(N)``. Each generation contains at most ++``CONFIG_TIERS_PER_GEN`` tiers, and they require additional ++``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to ++moving between generations which requires list operations, moving ++between tiers only involves operations on ``page->flags`` and ++therefore has a negligible cost. A feedback loop modeled after the PID ++controller monitors refaulted % across all tiers and decides when to ++protect pages from which tiers. ++ ++The framework comprises two conceptually independent components: the ++aging and the eviction, which can be invoked separately from user ++space for the purpose of working set estimation and proactive reclaim. ++ ++Aging ++----- ++The aging produces young generations. Given an ``lruvec``, the aging ++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` ++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained ++for each ``memcg``). Upon finding one, the aging updates its ++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). ++After each round of traversal, the aging increments ``max_seq``. The ++aging is due when ``min_seq[]`` reaches ``max_seq-1``. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, the ++eviction scans pages on the per-zone lists indexed by anon and file ++``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to ++select a type based on the values of ``min_seq[]``. If they are ++equal, it selects the type that has a lower refaulted %. The eviction ++sorts a page according to its updated generation number if the aging ++has found this page accessed. It also moves a page to the next ++generation if this page is from an upper tier that has a higher ++refaulted % than the base tier. The eviction increments ``min_seq[]`` ++of a selected type when it finds all the per-zone lists indexed by ++``min_seq[]`` of this selected type are empty. ++ ++To-do List ++========== ++KVM Optimization ++---------------- ++Support shadow page table walk.