From: Christian Marangi Date: Fri, 21 Oct 2022 22:24:49 +0000 (+0200) Subject: generic: 5.15: move MGLRU patches from pending to backport X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=2a7bdde29bebcaeb18c8595d31ca0d89b5719a1a;p=openwrt%2Fstaging%2Fsvanheule.git generic: 5.15: move MGLRU patches from pending to backport Move MGLRU patches from pending to backport as they got merged upstream. These are direct porting from one of the dev so it's better to just move than trying to backport them again from upstream. Signed-off-by: Christian Marangi --- diff --git a/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch b/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch new file mode 100644 index 0000000000..48bcaf3e3e --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch @@ -0,0 +1,169 @@ +From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 4 Aug 2021 01:31:34 -0600 +Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young() + +Some architectures automatically set the accessed bit in PTEs, e.g., +x86 and arm64 v8.2. On architectures that do not have this capability, +clearing the accessed bit in a PTE triggers a page fault following the +TLB miss of this PTE. + +Being aware of this capability can help make better decisions, i.e., +whether to limit the size of each batch of PTEs and the burst of +batches when clearing the accessed bit. + +Signed-off-by: Yu Zhao +Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a +--- + arch/arm64/include/asm/cpufeature.h | 5 +++++ + arch/arm64/include/asm/pgtable.h | 13 ++++++++----- + arch/arm64/kernel/cpufeature.c | 10 ++++++++++ + arch/arm64/tools/cpucaps | 1 + + arch/x86/include/asm/pgtable.h | 6 +++--- + include/linux/pgtable.h | 13 +++++++++++++ + mm/memory.c | 14 +------------- + 7 files changed, 41 insertions(+), 21 deletions(-) + +--- a/arch/arm64/include/asm/cpufeature.h ++++ b/arch/arm64/include/asm/cpufeature.h +@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); + } + ++static inline bool system_has_hw_af(void) ++{ ++ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); ++} ++ + extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); + + static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru + * page after fork() + CoW for pfn mappings. We don't always have a + * hardware-managed access flag on arm64. + */ +-static inline bool arch_faults_on_old_pte(void) ++static inline bool arch_has_hw_pte_young(bool local) + { +- WARN_ON(preemptible()); ++ if (local) { ++ WARN_ON(preemptible()); ++ return cpu_has_hw_af(); ++ } + +- return !cpu_has_hw_af(); ++ return system_has_hw_af(); + } +-#define arch_faults_on_old_pte arch_faults_on_old_pte ++#define arch_has_hw_pte_young arch_has_hw_pte_young + + /* + * Experimentally, it's cheap to set the access flag in hardware and we +@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt + */ + static inline bool arch_wants_old_prefaulted_pte(void) + { +- return !arch_faults_on_old_pte(); ++ return arch_has_hw_pte_young(true); + } + #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte + +--- a/arch/arm64/kernel/cpufeature.c ++++ b/arch/arm64/kernel/cpufeature.c +@@ -2187,6 +2187,16 @@ static const struct arm64_cpu_capabiliti + .matches = has_hw_dbm, + .cpu_enable = cpu_enable_hw_dbm, + }, ++ { ++ .desc = "Hardware update of the Access flag", ++ .type = ARM64_CPUCAP_SYSTEM_FEATURE, ++ .capability = ARM64_HW_AF, ++ .sys_reg = SYS_ID_AA64MMFR1_EL1, ++ .sign = FTR_UNSIGNED, ++ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, ++ .min_field_value = 1, ++ .matches = has_cpuid_feature, ++ }, + #endif + { + .desc = "CRC32 instructions", +--- a/arch/arm64/tools/cpucaps ++++ b/arch/arm64/tools/cpucaps +@@ -35,6 +35,7 @@ HAS_STAGE2_FWB + HAS_SYSREG_GIC_CPUIF + HAS_TLB_RANGE + HAS_VIRT_HOST_EXTN ++HW_AF + HW_DBM + KVM_PROTECTED_MODE + MISMATCHED_CACHE_TYPE +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c + return boot_cpu_has_bug(X86_BUG_L1TF); + } + +-#define arch_faults_on_old_pte arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) ++#define arch_has_hw_pte_young arch_has_hw_pte_young ++static inline bool arch_has_hw_pte_young(bool local) + { +- return false; ++ return true; + } + + #endif /* __ASSEMBLY__ */ +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #endif + ++#ifndef arch_has_hw_pte_young ++/* ++ * Return whether the accessed bit is supported by the local CPU or all CPUs. ++ * ++ * Those arches which have hw access flag feature need to implement their own ++ * helper. By default, "false" means pagefault will be hit on old pte. ++ */ ++static inline bool arch_has_hw_pte_young(bool local) ++{ ++ return false; ++} ++#endif ++ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = + 2; + #endif + +-#ifndef arch_faults_on_old_pte +-static inline bool arch_faults_on_old_pte(void) +-{ +- /* +- * Those arches which don't have hw access flag feature need to +- * implement their own helper. By default, "true" means pagefault +- * will be hit on old pte. +- */ +- return true; +-} +-#endif +- + #ifndef arch_wants_old_prefaulted_pte + static inline bool arch_wants_old_prefaulted_pte(void) + { +@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. + */ +- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { ++ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); diff --git a/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch b/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch new file mode 100644 index 0000000000..785af275f5 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch @@ -0,0 +1,111 @@ +From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sat, 26 Sep 2020 21:17:18 -0600 +Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG + +Some architectures support the accessed bit on non-leaf PMD entries, +e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using +it as part of linear address translation [1]. As an optimization, page +table walkers who are interested in the accessed bit can skip the PTEs +under a non-leaf PMD entry if the accessed bit is cleared on this PMD +entry. + +Although an inline function may be preferable, this capability is +added as a configuration option to look consistent when used with the +existing macros. + +[1]: Intel 64 and IA-32 Architectures Software Developer's Manual + Volume 3 (June 2021), section 4.8 + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 +--- + arch/Kconfig | 9 +++++++++ + arch/x86/Kconfig | 1 + + arch/x86/include/asm/pgtable.h | 3 ++- + arch/x86/mm/pgtable.c | 5 ++++- + include/linux/pgtable.h | 4 ++-- + 5 files changed, 18 insertions(+), 4 deletions(-) + +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT + config ARCH_HAS_PARANOID_L1D_FLUSH + bool + ++config ARCH_HAS_NONLEAF_PMD_YOUNG ++ bool ++ depends on PGTABLE_LEVELS > 2 ++ help ++ Architectures that select this are able to set the accessed bit on ++ non-leaf PMD entries in addition to leaf PTE entries where pages are ++ mapped. For them, page table walkers that clear the accessed bit may ++ stop at non-leaf PMD entries if they do not see the accessed bit. ++ + source "kernel/gcov/Kconfig" + + source "scripts/gcc-plugins/Kconfig" +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -84,6 +84,7 @@ config X86 + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL ++ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 + select ARCH_HAS_COPY_MC if X86_64 + select ARCH_HAS_SET_MEMORY +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad + + static inline int pmd_bad(pmd_t pmd) + { +- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != ++ (_KERNPG_TABLE & ~_PAGE_ACCESSED); + } + + static inline unsigned long pages_to_mb(unsigned long npg) +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_ + return ret; + } + +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { +@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_ + + return ret; + } ++#endif ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) + { +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo + #endif + + #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) + static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo + BUILD_BUG(); + return 0; + } +-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ + #endif + + #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH diff --git a/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch b/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch new file mode 100644 index 0000000000..f466f1105c --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch @@ -0,0 +1,224 @@ +From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Sun, 27 Sep 2020 20:49:08 -0600 +Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node() + +This patch refactors shrink_node(). This will make the upcoming +changes to mm/vmscan.c more readable. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 +--- + mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- + 1 file changed, 98 insertions(+), 88 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2562,6 +2562,103 @@ enum scan_balance { + SCAN_FILE, + }; + ++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long file; ++ struct lruvec *target_lruvec; ++ ++ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); ++ ++ /* ++ * Determine the scan balance between anon and file LRUs. ++ */ ++ spin_lock_irq(&target_lruvec->lru_lock); ++ sc->anon_cost = target_lruvec->anon_cost; ++ sc->file_cost = target_lruvec->file_cost; ++ spin_unlock_irq(&target_lruvec->lru_lock); ++ ++ /* ++ * Target desirable inactive:active list ratios for the anon ++ * and file LRU lists. ++ */ ++ if (!sc->force_deactivate) { ++ unsigned long refaults; ++ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_ANON); ++ if (refaults != target_lruvec->refaults[0] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) ++ sc->may_deactivate |= DEACTIVATE_ANON; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_ANON; ++ ++ /* ++ * When refaults are being observed, it means a new ++ * workingset is being established. Deactivate to get ++ * rid of any stale active pages quickly. ++ */ ++ refaults = lruvec_page_state(target_lruvec, ++ WORKINGSET_ACTIVATE_FILE); ++ if (refaults != target_lruvec->refaults[1] || ++ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) ++ sc->may_deactivate |= DEACTIVATE_FILE; ++ else ++ sc->may_deactivate &= ~DEACTIVATE_FILE; ++ } else ++ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; ++ ++ /* ++ * If we have plenty of inactive file pages that aren't ++ * thrashing, try to reclaim those first before touching ++ * anonymous pages. ++ */ ++ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); ++ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) ++ sc->cache_trim_mode = 1; ++ else ++ sc->cache_trim_mode = 0; ++ ++ /* ++ * Prevent the reclaimer from falling into the cache trap: as ++ * cache pages start out inactive, every cache fault will tip ++ * the scan balance towards the file LRU. And as the file LRU ++ * shrinks, so does the window for rotation from references. ++ * This means we have a runaway feedback loop where a tiny ++ * thrashing file LRU becomes infinitely more attractive than ++ * anon pages. Try to detect this based on file LRU size. ++ */ ++ if (!cgroup_reclaim(sc)) { ++ unsigned long total_high_wmark = 0; ++ unsigned long free, anon; ++ int z; ++ ++ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); ++ file = node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE); ++ ++ for (z = 0; z < MAX_NR_ZONES; z++) { ++ struct zone *zone = &pgdat->node_zones[z]; ++ ++ if (!managed_zone(zone)) ++ continue; ++ ++ total_high_wmark += high_wmark_pages(zone); ++ } ++ ++ /* ++ * Consider anon: if that's low too, this isn't a ++ * runaway file reclaim problem, but rather just ++ * extreme pressure. Reclaim as per usual then. ++ */ ++ anon = node_page_state(pgdat, NR_INACTIVE_ANON); ++ ++ sc->file_is_tiny = ++ file + free <= total_high_wmark && ++ !(sc->may_deactivate & DEACTIVATE_ANON) && ++ anon >> sc->priority; ++ } ++} ++ + /* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined +@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; +- unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +@@ -3048,93 +3144,7 @@ again: + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + +- /* +- * Determine the scan balance between anon and file LRUs. +- */ +- spin_lock_irq(&target_lruvec->lru_lock); +- sc->anon_cost = target_lruvec->anon_cost; +- sc->file_cost = target_lruvec->file_cost; +- spin_unlock_irq(&target_lruvec->lru_lock); +- +- /* +- * Target desirable inactive:active list ratios for the anon +- * and file LRU lists. +- */ +- if (!sc->force_deactivate) { +- unsigned long refaults; +- +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_ANON); +- if (refaults != target_lruvec->refaults[0] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) +- sc->may_deactivate |= DEACTIVATE_ANON; +- else +- sc->may_deactivate &= ~DEACTIVATE_ANON; +- +- /* +- * When refaults are being observed, it means a new +- * workingset is being established. Deactivate to get +- * rid of any stale active pages quickly. +- */ +- refaults = lruvec_page_state(target_lruvec, +- WORKINGSET_ACTIVATE_FILE); +- if (refaults != target_lruvec->refaults[1] || +- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) +- sc->may_deactivate |= DEACTIVATE_FILE; +- else +- sc->may_deactivate &= ~DEACTIVATE_FILE; +- } else +- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; +- +- /* +- * If we have plenty of inactive file pages that aren't +- * thrashing, try to reclaim those first before touching +- * anonymous pages. +- */ +- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) +- sc->cache_trim_mode = 1; +- else +- sc->cache_trim_mode = 0; +- +- /* +- * Prevent the reclaimer from falling into the cache trap: as +- * cache pages start out inactive, every cache fault will tip +- * the scan balance towards the file LRU. And as the file LRU +- * shrinks, so does the window for rotation from references. +- * This means we have a runaway feedback loop where a tiny +- * thrashing file LRU becomes infinitely more attractive than +- * anon pages. Try to detect this based on file LRU size. +- */ +- if (!cgroup_reclaim(sc)) { +- unsigned long total_high_wmark = 0; +- unsigned long free, anon; +- int z; +- +- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- file = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { +- struct zone *zone = &pgdat->node_zones[z]; +- if (!managed_zone(zone)) +- continue; +- +- total_high_wmark += high_wmark_pages(zone); +- } +- +- /* +- * Consider anon: if that's low too, this isn't a +- * runaway file reclaim problem, but rather just +- * extreme pressure. Reclaim as per usual then. +- */ +- anon = node_page_state(pgdat, NR_INACTIVE_ANON); +- +- sc->file_is_tiny = +- file + free <= total_high_wmark && +- !(sc->may_deactivate & DEACTIVATE_ANON) && +- anon >> sc->priority; +- } ++ prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + diff --git a/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch new file mode 100644 index 0000000000..fbc15d8c91 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-04-mm-multigenerational-lru-groundwork.patch @@ -0,0 +1,996 @@ +From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:12:33 -0700 +Subject: [PATCH 04/10] mm: multigenerational lru: groundwork + +For each lruvec, evictable pages are divided into multiple +generations. The youngest generation number is stored in +lrugen->max_seq for both anon and file types as they are aged on an +equal footing. The oldest generation numbers are stored in +lrugen->min_seq[] separately for anon and file types as clean file +pages can be evicted regardless of swap constraints. These three +variables are monotonically increasing. Generation numbers are +truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into +page->flags. The sliding window technique is used to prevent truncated +generation numbers from overlapping. Each truncated generation number +is an index to +lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. + +The framework comprises two conceptually independent components: the +aging, which produces young generations, and the eviction, which +consumes old generations. Both can be invoked independently from user +space for the purpose of working set estimation and proactive reclaim. + +The protection of hot pages and the selection of cold pages are based +on page access types and patterns. There are two access types: one via +page tables and the other via file descriptors. The protection of the +former type is by design stronger because: + 1) The uncertainty in determining the access patterns of the former + type is higher due to the coalesced nature of the accessed bit. + 2) The cost of evicting the former type is higher due to the TLB + flushes required and the likelihood of involving I/O. + 3) The penalty of under-protecting the former type is higher because + applications usually do not prepare themselves for major faults like + they do for blocked I/O. For example, client applications commonly + dedicate blocked I/O to separate threads to avoid UI janks that + negatively affect user experience. + +There are also two access patterns: one with temporal locality and the +other without. The latter pattern, e.g., random and sequential, needs +to be explicitly excluded to avoid weakening the protection of the +former pattern. Generally the former type follows the former pattern +unless MADV_SEQUENTIAL is specified and the latter type follows the +latter pattern unless outlying refaults have been observed. + +Upon faulting, a page is added to the youngest generation, which +provides the strongest protection as the eviction will not consider +this page before the aging has scanned it at least twice. The first +scan clears the accessed bit set during the initial fault. And the +second scan makes sure this page has not been used since the first +scan. A page from any other generations is brought back to the +youngest generation whenever the aging finds the accessed bit set on +any of the PTEs mapping this page. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. This is done later [PATCH 07/10]. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 +--- + fs/fuse/dev.c | 3 +- + include/linux/cgroup.h | 15 +- + include/linux/mm.h | 36 ++++ + include/linux/mm_inline.h | 182 ++++++++++++++++++++ + include/linux/mmzone.h | 70 ++++++++ + include/linux/page-flags-layout.h | 19 ++- + include/linux/page-flags.h | 4 +- + include/linux/sched.h | 3 + + kernel/bounds.c | 3 + + kernel/cgroup/cgroup-internal.h | 1 - + mm/huge_memory.c | 3 +- + mm/memcontrol.c | 1 + + mm/memory.c | 7 + + mm/mm_init.c | 6 +- + mm/page_alloc.c | 1 + + mm/swap.c | 9 +- + mm/swapfile.c | 2 + + mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ + 18 files changed, 618 insertions(+), 15 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * + 1 << PG_active | + 1 << PG_workingset | + 1 << PG_reclaim | +- 1 << PG_waiters))) { ++ 1 << PG_waiters | ++ LRU_GEN_MASK | LRU_REFS_MASK))) { + dump_page(page, "fuse: trying to steal weird page"); + return 1; + } +--- a/include/linux/cgroup.h ++++ b/include/linux/cgroup.h +@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr + css_put(&cgrp->self); + } + ++extern struct mutex cgroup_mutex; ++ ++static inline void cgroup_lock(void) ++{ ++ mutex_lock(&cgroup_mutex); ++} ++ ++static inline void cgroup_unlock(void) ++{ ++ mutex_unlock(&cgroup_mutex); ++} ++ + /** + * task_css_set_check - obtain a task's css_set with extra access conditions + * @task: the task to obtain css_set for +@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr + * as locks used during the cgroup_subsys::attach() methods. + */ + #ifdef CONFIG_PROVE_RCU +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + #define task_css_set_check(task, __c) \ + rcu_dereference_check((task)->cgroups, \ +@@ -707,6 +718,8 @@ struct cgroup; + static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } + static inline void css_get(struct cgroup_subsys_state *css) {} + static inline void css_put(struct cgroup_subsys_state *css) {} ++static inline void cgroup_lock(void) {} ++static inline void cgroup_unlock(void) {} + static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) { return 0; } + static inline int cgroupstats_build(struct cgroupstats *stats, +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v + #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) + #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) + #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) ++#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) ++#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) + + /* + * Define the bit shifts to access each section. For non-existent +@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s + loff_t const holebegin, loff_t const holelen, int even_cows) { } + #endif + ++#ifdef CONFIG_LRU_GEN ++static inline void task_enter_nonseq_fault(void) ++{ ++ WARN_ON(current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 1; ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++ WARN_ON(!current->in_nonseq_fault); ++ ++ current->in_nonseq_fault = 0; ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return current->in_nonseq_fault; ++} ++#else ++static inline void task_enter_nonseq_fault(void) ++{ ++} ++ ++static inline void task_exit_nonseq_fault(void) ++{ ++} ++ ++static inline bool task_in_nonseq_fault(void) ++{ ++ return false; ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) + { +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag + return lru; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static inline bool lru_gen_enabled(void) ++{ ++#ifdef CONFIG_LRU_GEN_ENABLED ++ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); ++ ++ return static_branch_likely(&lru_gen_static_key); ++#else ++ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); ++ ++ return static_branch_unlikely(&lru_gen_static_key); ++#endif ++} ++ ++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ ++static inline int lru_gen_from_seq(unsigned long seq) ++{ ++ return seq % MAX_NR_GENS; ++} ++ ++/* The youngest and the second youngest generations are counted as active. */ ++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) ++{ ++ unsigned long max_seq = lruvec->evictable.max_seq; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); ++} ++ ++/* Update the sizes of the multigenerational lru lists. */ ++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, ++ int old_gen, int new_gen) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ enum lru_list lru = type * LRU_FILE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lockdep_assert_held(&lruvec->lru_lock); ++ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); ++ VM_BUG_ON(old_gen == -1 && new_gen == -1); ++ ++ if (old_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], ++ lrugen->sizes[old_gen][type][zone] - delta); ++ if (new_gen >= 0) ++ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], ++ lrugen->sizes[new_gen][type][zone] + delta); ++ ++ if (old_gen < 0) { ++ if (lru_gen_is_active(lruvec, new_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ return; ++ } ++ ++ if (new_gen < 0) { ++ if (lru_gen_is_active(lruvec, old_gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, -delta); ++ return; ++ } ++ ++ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ ++ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); ++} ++ ++/* Add a page to one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ if (PageUnevictable(page) || !lrugen->enabled[type]) ++ return false; ++ /* ++ * If a page shouldn't be considered for eviction, i.e., a page mapped ++ * upon fault during which the accessed bit is set, add it to the ++ * youngest generation. ++ * ++ * If a page can't be evicted immediately, i.e., an anon page not in ++ * swap cache or a dirty page pending writeback, add it to the second ++ * oldest generation. ++ * ++ * If a page could be evicted immediately, e.g., a clean page, add it to ++ * the oldest generation. ++ */ ++ if (PageActive(page)) ++ gen = lru_gen_from_seq(lrugen->max_seq); ++ else if ((!type && !PageSwapCache(page)) || ++ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) ++ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); ++ else ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); ++ ++ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, -1, gen); ++ /* for rotate_reclaimable_page() */ ++ if (reclaiming) ++ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); ++ else ++ list_add(&page->lru, &lrugen->lists[gen][type][zone]); ++ ++ return true; ++} ++ ++/* Delete a page from one of the multigenerational lru lists. Return true on success. */ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int gen; ++ unsigned long old_flags, new_flags; ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ if (!(new_flags & LRU_GEN_MASK)) ++ return false; ++ ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ ++ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ /* for shrink_page_list() */ ++ if (reclaiming) ++ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ++ else if (lru_gen_is_active(lruvec, gen)) ++ new_flags |= BIT(PG_active); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, gen, -1); ++ list_del(&page->lru); ++ ++ return true; ++} ++ ++#else ++ ++static inline bool lru_gen_enabled(void) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec) + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, false)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); + } +@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ + { + enum lru_list lru = page_lru(page); + ++ if (lru_gen_add_page(page, lruvec, true)) ++ return; ++ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); + } +@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ + static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec) + { ++ if (lru_gen_del_page(page, lruvec, false)) ++ return; ++ + list_del(&page->lru); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -294,6 +294,72 @@ enum lruvec_flags { + */ + }; + ++struct lruvec; ++ ++#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) ++#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) ++ ++#ifdef CONFIG_LRU_GEN ++ ++/* ++ * For each lruvec, evictable pages are divided into multiple generations. The ++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are ++ * monotonically increasing. The sliding window technique is used to track at ++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the ++ * window, AKA gen, indexes an array of per-type and per-zone lists for the ++ * corresponding generation. The counter in page->flags stores gen+1 while a ++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. ++ * ++ * After a page is faulted in, the aging must check the accessed bit at least ++ * twice before the eviction would consider it. The first check clears the ++ * accessed bit set during the initial fault. The second check makes sure this ++ * page hasn't been used since then. ++ */ ++#define MIN_NR_GENS 2 ++#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++ ++struct lrugen { ++ /* the aging increments the max generation number */ ++ unsigned long max_seq; ++ /* the eviction increments the min generation numbers */ ++ unsigned long min_seq[ANON_AND_FILE]; ++ /* the birth time of each generation in jiffies */ ++ unsigned long timestamps[MAX_NR_GENS]; ++ /* the multigenerational lru lists */ ++ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the sizes of the multigenerational lru lists in pages */ ++ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* whether the multigenerational lru is enabled */ ++ bool enabled[ANON_AND_FILE]; ++}; ++ ++#define MAX_BATCH_SIZE 8192 ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); ++void lru_gen_change_state(bool enable, bool main, bool swap); ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg); ++#endif ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++} ++ ++static inline void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++} ++#endif ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct lruvec { + struct list_head lists[NR_LRU_LISTS]; + /* per lruvec lru_lock for memcg */ +@@ -311,6 +377,10 @@ struct lruvec { + unsigned long refaults[ANON_AND_FILE]; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; ++#ifdef CONFIG_LRU_GEN ++ /* unevictable pages are on LRU_UNEVICTABLE */ ++ struct lrugen evictable; ++#endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +--- a/include/linux/page-flags-layout.h ++++ b/include/linux/page-flags-layout.h +@@ -26,6 +26,14 @@ + + #define ZONES_WIDTH ZONES_SHIFT + ++#ifdef CONFIG_LRU_GEN ++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ ++#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) ++#else ++#define LRU_GEN_WIDTH 0 ++#define LRU_REFS_WIDTH 0 ++#endif /* CONFIG_LRU_GEN */ ++ + #ifdef CONFIG_SPARSEMEM + #include + #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) +@@ -55,7 +63,8 @@ + #define SECTIONS_WIDTH 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ ++ <= BITS_PER_LONG - NR_PAGEFLAGS + #define NODES_WIDTH NODES_SHIFT + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) + #error "Vmemmap: No space for nodes field in page flags" +@@ -89,8 +98,8 @@ + #define LAST_CPUPID_SHIFT 0 + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ +- <= BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS + #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT + #else + #define LAST_CPUPID_WIDTH 0 +@@ -100,8 +109,8 @@ + #define LAST_CPUPID_NOT_IN_PAGE_FLAGS + #endif + +-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ +- > BITS_PER_LONG - NR_PAGEFLAGS ++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ ++ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS + #error "Not enough bits in page flags" + #endif + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall + 1UL << PG_private | 1UL << PG_private_2 | \ + 1UL << PG_writeback | 1UL << PG_reserved | \ + 1UL << PG_slab | 1UL << PG_active | \ +- 1UL << PG_unevictable | __PG_MLOCKED) ++ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) + + /* + * Flags checked when a page is prepped for return by the page allocator. +@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall + * alloc-free cycle to prevent from reusing the page. + */ + #define PAGE_FLAGS_CHECK_AT_PREP \ +- (PAGEFLAGS_MASK & ~__PG_HWPOISON) ++ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) + + #define PAGE_FLAGS_PRIVATE \ + (1UL << PG_private | 1UL << PG_private_2) +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -911,6 +911,9 @@ struct task_struct { + #ifdef CONFIG_MEMCG + unsigned in_user_fault:1; + #endif ++#ifdef CONFIG_LRU_GEN ++ unsigned in_nonseq_fault:1; ++#endif + #ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; + #endif +--- a/kernel/bounds.c ++++ b/kernel/bounds.c +@@ -22,6 +22,9 @@ int main(void) + DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + #endif + DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); ++#ifdef CONFIG_LRU_GEN ++ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); ++#endif + /* End of constants */ + + return 0; +--- a/kernel/cgroup/cgroup-internal.h ++++ b/kernel/cgroup/cgroup-internal.h +@@ -165,7 +165,6 @@ struct cgroup_mgctx { + #define DEFINE_CGROUP_MGCTX(name) \ + struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) + +-extern struct mutex cgroup_mutex; + extern spinlock_t css_set_lock; + extern struct cgroup_subsys *cgroup_subsys[]; + extern struct list_head cgroup_roots; +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc + #ifdef CONFIG_64BIT + (1L << PG_arch_2) | + #endif +- (1L << PG_dirty))); ++ (1L << PG_dirty) | ++ LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all + memcg->deferred_split_queue.split_queue_len = 0; + #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); ++ lru_gen_init_memcg(memcg); + return memcg; + fail: + mem_cgroup_id_remove(memcg); +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are + unsigned int flags, struct pt_regs *regs) + { + vm_fault_t ret; ++ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); + + __set_current_state(TASK_RUNNING); + +@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are + if (flags & FAULT_FLAG_USER) + mem_cgroup_enter_user_fault(); + ++ if (nonseq_fault) ++ task_enter_nonseq_fault(); ++ + if (unlikely(is_vm_hugetlb_page(vma))) + ret = hugetlb_fault(vma->vm_mm, vma, address, flags); + else + ret = __handle_mm_fault(vma, address, flags); + ++ if (nonseq_fault) ++ task_exit_nonseq_fault(); ++ + if (flags & FAULT_FLAG_USER) { + mem_cgroup_exit_user_fault(); + /* +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH +- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; ++ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", +- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", ++ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, ++ LRU_GEN_WIDTH, ++ LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7456,6 +7456,7 @@ static void __meminit pgdat_init_interna + + pgdat_page_ext_init(pgdat); + lruvec_init(&pgdat->__lruvec); ++ lru_gen_init_state(NULL, &pgdat->__lruvec); + } + + static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + ++ /* see the comment in lru_gen_add_page() */ ++ if (lru_gen_enabled() && !PageUnevictable(page) && ++ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) ++ SetPageActive(page); ++ + get_page(page); + local_lock(&lru_pvecs.lock); + pvec = this_cpu_ptr(&lru_pvecs.lru_add); +@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc + + static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) + { +- if (PageActive(page) && !PageUnevictable(page)) { ++ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + int nr_pages = thp_nr_pages(page); + + del_page_from_lru_list(page, lruvec); +@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p + */ + void deactivate_page(struct page *page) + { +- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { ++ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { + struct pagevec *pvec; + + local_lock(&lru_pvecs.lock); +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(false, false, true); + + out_dput: + filp_close(victim, NULL); +@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); ++ lru_gen_change_state(true, false, true); + + error = 0; + goto out; +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -50,6 +50,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg + return can_demote(pgdat->node_id, sc); + } + ++#ifdef CONFIG_LRU_GEN ++ ++/****************************************************************************** ++ * shorthand helpers ++ ******************************************************************************/ ++ ++#define for_each_gen_type_zone(gen, type, zone) \ ++ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ ++ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ ++ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) ++ ++static int page_lru_gen(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) ++{ ++ struct pglist_data *pgdat = NODE_DATA(nid); ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) { ++ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; ++ ++ if (lruvec->pgdat != pgdat) ++ lruvec->pgdat = pgdat; ++ ++ return lruvec; ++ } ++#endif ++ return pgdat ? &pgdat->__lruvec : NULL; ++} ++ ++static int get_nr_gens(struct lruvec *lruvec, int type) ++{ ++ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; ++} ++ ++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) ++{ ++ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && ++ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && ++ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; ++} ++ ++/****************************************************************************** ++ * state change ++ ******************************************************************************/ ++ ++#ifdef CONFIG_LRU_GEN_ENABLED ++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); ++#else ++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); ++#endif ++ ++static int lru_gen_nr_swapfiles; ++ ++static bool __maybe_unused state_is_valid(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ enum lru_list lru; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for_each_evictable_lru(lru) { ++ type = is_file_lru(lru); ++ ++ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) ++ return false; ++ } ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) ++ return false; ++ ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); ++ } ++ ++ return true; ++} ++ ++static bool fill_lists(struct lruvec *lruvec) ++{ ++ enum lru_list lru; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_evictable_lru(lru) { ++ int type = is_file_lru(lru); ++ bool active = is_active_lru(lru); ++ struct list_head *head = &lruvec->lists[lru]; ++ ++ if (!lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page) != active, page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ del_page_from_lru_list(page, lruvec); ++ success = lru_gen_add_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static bool drain_lists(struct lruvec *lruvec) ++{ ++ int gen, type, zone; ++ int remaining = MAX_BATCH_SIZE; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; ++ ++ if (lruvec->evictable.enabled[type]) ++ continue; ++ ++ while (!list_empty(head)) { ++ bool success; ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ success = lru_gen_del_page(page, lruvec, false); ++ VM_BUG_ON(!success); ++ add_page_to_lru_list(page, lruvec); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * For file page tracking, we enable/disable it according to the main switch. ++ * For anon page tracking, we only enabled it when the main switch is on and ++ * there is at least one swapfile; we disable it when there are no swapfiles ++ * regardless of the value of the main switch. Otherwise, we will eventually ++ * reach the max size of the sliding window and have to call inc_min_seq(). ++ */ ++void lru_gen_change_state(bool enable, bool main, bool swap) ++{ ++ static DEFINE_MUTEX(state_mutex); ++ ++ struct mem_cgroup *memcg; ++ ++ mem_hotplug_begin(); ++ cgroup_lock(); ++ mutex_lock(&state_mutex); ++ ++ if (swap) { ++ if (enable) ++ swap = !lru_gen_nr_swapfiles++; ++ else ++ swap = !--lru_gen_nr_swapfiles; ++ } ++ ++ if (main && enable != lru_gen_enabled()) { ++ if (enable) ++ static_branch_enable(&lru_gen_static_key); ++ else ++ static_branch_disable(&lru_gen_static_key); ++ } else if (!swap || !lru_gen_enabled()) ++ goto unlock; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ VM_BUG_ON(!state_is_valid(lruvec)); ++ ++ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lruvec->evictable.enabled[1] = lru_gen_enabled(); ++ ++ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++unlock: ++ mutex_unlock(&state_mutex); ++ cgroup_unlock(); ++ mem_hotplug_done(); ++} ++ ++/****************************************************************************** ++ * initialization ++ ******************************************************************************/ ++ ++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) ++{ ++ int i; ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ lrugen->max_seq = MIN_NR_GENS + 1; ++ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; ++ lrugen->enabled[1] = lru_gen_enabled(); ++ ++ for (i = 0; i <= MIN_NR_GENS + 1; i++) ++ lrugen->timestamps[i] = jiffies; ++ ++ for_each_gen_type_zone(gen, type, zone) ++ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_init_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ lru_gen_init_state(memcg, lruvec); ++ } ++} ++#endif ++ ++static int __init init_lru_gen(void) ++{ ++ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); ++ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ ++ return 0; ++}; ++late_initcall(init_lru_gen); ++ ++#endif /* CONFIG_LRU_GEN */ ++ + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) + { + unsigned long nr[NR_LRU_LISTS]; diff --git a/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch b/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch new file mode 100644 index 0000000000..2592f18e06 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-05-mm-multigenerational-lru-mm_struct-list.patch @@ -0,0 +1,760 @@ +From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:17:41 -0600 +Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list + +To scan PTEs for accessed pages, a mm_struct list is maintained for +each memcg. When multiple threads traverse the same memcg->mm_list, +each of them gets a unique mm_struct and therefore they can run +walk_page_range() concurrently to reach page tables of all processes +of this memcg. + +This infrastructure also provides the following optimizations: + 1) it allows walkers to skip processes that have been sleeping since + the last walk by tracking the usage of mm_struct between context + switches. + 2) it allows walkers to add interesting items they find during a + walk to a Bloom filter so that they can skip uninteresting items + during the next walk by testing whether an item is in this Bloom + filter. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 +--- + fs/exec.c | 2 + + include/linux/memcontrol.h | 4 + + include/linux/mm_inline.h | 6 + + include/linux/mm_types.h | 75 +++++++++ + include/linux/mmzone.h | 63 +++++++ + kernel/exit.c | 1 + + kernel/fork.c | 9 + + kernel/sched/core.c | 1 + + mm/memcontrol.c | 25 +++ + mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++ + 10 files changed, 517 insertions(+) + +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; ++ lru_gen_add_mm(mm); + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for +@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); ++ lru_gen_activate_mm(mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -348,6 +348,10 @@ struct mem_cgroup { + struct deferred_split deferred_split_queue; + #endif + ++#ifdef CONFIG_LRU_GEN ++ struct lru_gen_mm_list mm_list; ++#endif ++ + struct mem_cgroup_per_node *nodeinfo[]; + }; + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig + return seq % MAX_NR_GENS; + } + ++/* Return a proper index regardless whether we keep stats for historical generations. */ ++static inline int lru_hist_from_seq(unsigned long seq) ++{ ++ return seq % NR_HIST_GENS; ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -3,6 +3,7 @@ + #define _LINUX_MM_TYPES_H + + #include ++#include + + #include + #include +@@ -15,6 +16,8 @@ + #include + #include + #include ++#include ++#include + + #include + +@@ -580,6 +583,18 @@ struct mm_struct { + #ifdef CONFIG_IOMMU_SUPPORT + u32 pasid; + #endif ++#ifdef CONFIG_LRU_GEN ++ struct { ++ /* the node of a global or per-memcg mm_struct list */ ++ struct list_head list; ++#ifdef CONFIG_MEMCG ++ /* points to the memcg of the owner task above */ ++ struct mem_cgroup *memcg; ++#endif ++ /* whether this mm_struct has been used since the last walk */ ++ nodemask_t nodes; ++ } lrugen; ++#endif /* CONFIG_LRU_GEN */ + } __randomize_layout; + + /* +@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru + return (struct cpumask *)&mm->cpu_bitmap; + } + ++#ifdef CONFIG_LRU_GEN ++ ++struct lru_gen_mm_list { ++ /* a global or per-memcg mm_struct list */ ++ struct list_head fifo; ++ /* protects the list above */ ++ spinlock_t lock; ++}; ++ ++void lru_gen_add_mm(struct mm_struct *mm); ++void lru_gen_del_mm(struct mm_struct *mm); ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm); ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++ INIT_LIST_HEAD(&mm->lrugen.list); ++#ifdef CONFIG_MEMCG ++ mm->lrugen.memcg = NULL; ++#endif ++ nodes_clear(mm->lrugen.nodes); ++} ++ ++/* Track the usage of each mm_struct so that we can skip inactive ones. */ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ ++ VM_WARN_ON(list_empty(&mm->lrugen.list)); ++ ++ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes)) ++ nodes_setall(mm->lrugen.nodes); ++} ++ ++#else /* !CONFIG_LRU_GEN */ ++ ++static inline void lru_gen_add_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_del_mm(struct mm_struct *mm) ++{ ++} ++ ++#ifdef CONFIG_MEMCG ++static inline void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++} ++#endif ++ ++static inline void lru_gen_init_mm(struct mm_struct *mm) ++{ ++} ++ ++static inline void lru_gen_activate_mm(struct mm_struct *mm) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -318,6 +318,13 @@ struct lruvec; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* Whether to keep stats for historical generations. */ ++#ifdef CONFIG_LRU_GEN_STATS ++#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) ++#else ++#define NR_HIST_GENS 1U ++#endif ++ + struct lrugen { + /* the aging increments the max generation number */ + unsigned long max_seq; +@@ -333,13 +340,63 @@ struct lrugen { + bool enabled[ANON_AND_FILE]; + }; + ++enum { ++ MM_LEAF_TOTAL, /* total leaf entries */ ++ MM_LEAF_OLD, /* old leaf entries */ ++ MM_LEAF_YOUNG, /* young leaf entries */ ++ MM_NONLEAF_TOTAL, /* total non-leaf entries */ ++ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ ++ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ ++ NR_MM_STATS ++}; ++ ++/* mnemonic codes for the stats above */ ++#define MM_STAT_CODES "toydpc" ++ ++/* double buffering bloom filters */ ++#define NR_BLOOM_FILTERS 2 ++ ++struct lru_gen_mm_walk { ++ /* set to max_seq after each round of walk */ ++ unsigned long seq; ++ /* the next mm_struct on the list to walk */ ++ struct list_head *head; ++ /* the first mm_struct never walked before */ ++ struct list_head *tail; ++ /* to wait for the last walker to finish */ ++ struct wait_queue_head wait; ++ /* bloom filters flip after each round of walk */ ++ unsigned long *filters[NR_BLOOM_FILTERS]; ++ /* page table stats for debugging */ ++ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; ++ /* the number of concurrent walkers */ ++ int nr_walkers; ++}; ++ ++#define MIN_BATCH_SIZE 64 + #define MAX_BATCH_SIZE 8192 + ++struct mm_walk_args { ++ struct mem_cgroup *memcg; ++ unsigned long max_seq; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++ unsigned long next_addr; ++ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; ++ int node_id; ++ int swappiness; ++ int batch_size; ++ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ int mm_stats[NR_MM_STATS]; ++ bool use_filter; ++}; ++ + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); ++void lru_gen_free_memcg(struct mem_cgroup *memcg); + #endif + + #else /* !CONFIG_LRU_GEN */ +@@ -356,6 +413,10 @@ static inline void lru_gen_change_state( + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { + } ++ ++static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++} + #endif + + #endif /* CONFIG_LRU_GEN */ +@@ -380,6 +441,8 @@ struct lruvec { + #ifdef CONFIG_LRU_GEN + /* unevictable pages are on LRU_UNEVICTABLE */ + struct lrugen evictable; ++ /* state for mm list and page table walks */ ++ struct lru_gen_mm_walk mm_walk; + #endif + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -422,6 +422,7 @@ assign_new_owner: + goto retry; + } + WRITE_ONCE(mm->owner, c); ++ lru_gen_migrate_mm(mm); + task_unlock(c); + put_task_struct(c); + } +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct + goto fail_nocontext; + + mm->user_ns = get_user_ns(user_ns); ++ lru_gen_init_mm(mm); + return mm; + + fail_nocontext: +@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str + } + if (mm->binfmt) + module_put(mm->binfmt->module); ++ lru_gen_del_mm(mm); + mmdrop(mm); + } + +@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a + get_task_struct(p); + } + ++ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { ++ /* lock the task to synchronize with memcg migration */ ++ task_lock(p); ++ lru_gen_add_mm(p->mm); ++ task_unlock(p); ++ } ++ + wake_up_new_task(p); + + /* forking complete and child started to run, tell ptracer */ +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas + * finish_task_switch()'s mmdrop(). + */ + switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ lru_gen_activate_mm(next->mm); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem + + static void mem_cgroup_free(struct mem_cgroup *memcg) + { ++ lru_gen_free_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); + } +@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void) + } + #endif + ++#ifdef CONFIG_LRU_GEN ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++ struct cgroup_subsys_state *css; ++ struct task_struct *task = NULL; ++ ++ cgroup_taskset_for_each_leader(task, css, tset) ++ break; ++ ++ if (!task) ++ return; ++ ++ task_lock(task); ++ if (task->mm && task->mm->owner == task) ++ lru_gen_migrate_mm(task->mm); ++ task_unlock(task); ++} ++#else ++static void mem_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) + { + if (value == PAGE_COUNTER_MAX) +@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, ++ .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid( + } + + /****************************************************************************** ++ * mm_struct list ++ ******************************************************************************/ ++ ++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) ++{ ++ static struct lru_gen_mm_list mm_list = { ++ .fifo = LIST_HEAD_INIT(mm_list.fifo), ++ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), ++ }; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ return &memcg->mm_list; ++#endif ++ return &mm_list; ++} ++ ++void lru_gen_add_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); ++ ++ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); ++#ifdef CONFIG_MEMCG ++ VM_BUG_ON_MM(mm->lrugen.memcg, mm); ++ mm->lrugen.memcg = memcg; ++#endif ++ spin_lock(&mm_list->lock); ++ ++ list_add_tail(&mm->lrugen.list, &mm_list->fifo); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm_list->fifo) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; ++ } ++ ++ spin_unlock(&mm_list->lock); ++} ++ ++void lru_gen_del_mm(struct mm_struct *mm) ++{ ++ int nid; ++ struct lru_gen_mm_list *mm_list; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (list_empty(&mm->lrugen.list)) ++ return; ++ ++#ifdef CONFIG_MEMCG ++ memcg = mm->lrugen.memcg; ++#endif ++ mm_list = get_mm_list(memcg); ++ ++ spin_lock(&mm_list->lock); ++ ++ for_each_node(nid) { ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ if (!lruvec) ++ continue; ++ ++ if (lruvec->mm_walk.tail == &mm->lrugen.list) ++ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; ++ ++ if (lruvec->mm_walk.head != &mm->lrugen.list) ++ continue; ++ ++ lruvec->mm_walk.head = lruvec->mm_walk.head->next; ++ if (lruvec->mm_walk.head == &mm_list->fifo) ++ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); ++ } ++ ++ list_del_init(&mm->lrugen.list); ++ ++ spin_unlock(&mm_list->lock); ++ ++#ifdef CONFIG_MEMCG ++ mem_cgroup_put(mm->lrugen.memcg); ++ mm->lrugen.memcg = NULL; ++#endif ++} ++ ++#ifdef CONFIG_MEMCG ++void lru_gen_migrate_mm(struct mm_struct *mm) ++{ ++ struct mem_cgroup *memcg; ++ ++ lockdep_assert_held(&mm->owner->alloc_lock); ++ ++ if (mem_cgroup_disabled()) ++ return; ++ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(mm->owner); ++ rcu_read_unlock(); ++ if (memcg == mm->lrugen.memcg) ++ return; ++ ++ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); ++ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); ++ ++ lru_gen_del_mm(mm); ++ lru_gen_add_mm(mm); ++} ++#endif ++ ++#define BLOOM_FILTER_SHIFT 15 ++ ++static inline int filter_gen_from_seq(unsigned long seq) ++{ ++ return seq % NR_BLOOM_FILTERS; ++} ++ ++static void get_item_key(void *item, int *key) ++{ ++ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); ++ ++ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); ++ ++ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); ++ key[1] = hash >> BLOOM_FILTER_SHIFT; ++} ++ ++static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) ++{ ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ filter = lruvec->mm_walk.filters[gen]; ++ if (filter) { ++ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); ++ return; ++ } ++ ++ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); ++ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); ++} ++ ++static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return; ++ ++ get_item_key(item, key); ++ ++ if (!test_bit(key[0], filter)) ++ set_bit(key[0], filter); ++ if (!test_bit(key[1], filter)) ++ set_bit(key[1], filter); ++} ++ ++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) ++{ ++ int key[2]; ++ unsigned long *filter; ++ int gen = filter_gen_from_seq(seq); ++ ++ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); ++ if (!filter) ++ return false; ++ ++ get_item_key(item, key); ++ ++ return test_bit(key[0], filter) && test_bit(key[1], filter); ++} ++ ++static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) ++{ ++ int i; ++ int hist = lru_hist_from_seq(args->max_seq); ++ ++ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); ++ ++ for (i = 0; i < NR_MM_STATS; i++) { ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], ++ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); ++ args->mm_stats[i] = 0; ++ } ++ ++ if (!last || NR_HIST_GENS == 1) ++ return; ++ ++ hist = lru_hist_from_seq(args->max_seq + 1); ++ for (i = 0; i < NR_MM_STATS; i++) ++ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); ++} ++ ++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ int type; ++ unsigned long size = 0; ++ ++ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes)) ++ return true; ++ ++ if (mm_is_oom_victim(mm)) ++ return true; ++ ++ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { ++ size += type ? get_mm_counter(mm, MM_FILEPAGES) : ++ get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ } ++ ++ if (size < MIN_BATCH_SIZE) ++ return true; ++ ++ if (!mmget_not_zero(mm)) ++ return true; ++ ++ node_clear(args->node_id, mm->lrugen.nodes); ++ ++ return false; ++} ++ ++/* To support multiple walkers that concurrently walk an mm_struct list. */ ++static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, ++ struct mm_struct **iter) ++{ ++ bool first = false; ++ bool last = true; ++ struct mm_struct *mm = NULL; ++ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; ++ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); ++ ++ if (*iter) ++ mmput_async(*iter); ++ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) ++ return false; ++ ++ spin_lock(&mm_list->lock); ++ ++ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); ++ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); ++ VM_BUG_ON(*iter && !mm_walk->nr_walkers); ++ ++ if (args->max_seq <= mm_walk->seq) { ++ if (!*iter) ++ last = false; ++ goto done; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) { ++ VM_BUG_ON(mm_walk->nr_walkers); ++ mm_walk->head = mm_walk->head->next; ++ first = true; ++ } ++ ++ while (!mm && mm_walk->head != &mm_list->fifo) { ++ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); ++ ++ mm_walk->head = mm_walk->head->next; ++ ++ if (mm_walk->tail == &mm->lrugen.list) { ++ mm_walk->tail = mm_walk->tail->next; ++ args->use_filter = false; ++ } ++ ++ if (should_skip_mm(mm, args)) ++ mm = NULL; ++ } ++ ++ if (mm_walk->head == &mm_list->fifo) ++ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); ++done: ++ if (*iter && !mm) ++ mm_walk->nr_walkers--; ++ if (!*iter && mm) ++ mm_walk->nr_walkers++; ++ ++ if (mm_walk->nr_walkers) ++ last = false; ++ ++ if (mm && first) ++ clear_bloom_filter(lruvec, args->max_seq + 1); ++ ++ if (*iter || last) ++ reset_mm_stats(lruvec, last, args); ++ ++ spin_unlock(&mm_list->lock); ++ ++ *iter = mm; ++ ++ return last; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou + int i; + int gen, type, zone; + struct lrugen *lrugen = &lruvec->evictable; ++ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; +@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_lock(&mm_list->lock); ++ ++ lruvec->mm_walk.seq = MIN_NR_GENS; ++ lruvec->mm_walk.head = &mm_list->fifo; ++ lruvec->mm_walk.tail = &mm_list->fifo; ++ init_waitqueue_head(&lruvec->mm_walk.wait); ++ ++ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) ++ spin_unlock(&mm_list->lock); + } + + #ifdef CONFIG_MEMCG +@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou + { + int nid; + ++ INIT_LIST_HEAD(&memcg->mm_list.fifo); ++ spin_lock_init(&memcg->mm_list.lock); ++ + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(nid, memcg); + + lru_gen_init_state(memcg, lruvec); + } + } ++ ++void lru_gen_free_memcg(struct mem_cgroup *memcg) ++{ ++ int nid; ++ ++ for_each_node(nid) { ++ int i; ++ struct lruvec *lruvec = get_lruvec(nid, memcg); ++ ++ for (i = 0; i < NR_BLOOM_FILTERS; i++) { ++ bitmap_free(lruvec->mm_walk.filters[i]); ++ lruvec->mm_walk.filters[i] = NULL; ++ } ++ } ++} + #endif + + static int __init init_lru_gen(void) + { + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); ++ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + + return 0; + }; diff --git a/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch new file mode 100644 index 0000000000..6fc93d9422 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-06-mm-multigenerational-lru-aging.patch @@ -0,0 +1,1176 @@ +From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 5 Apr 2021 04:35:07 -0600 +Subject: [PATCH 06/10] mm: multigenerational lru: aging + +The aging produces young generations. Given an lruvec, the aging +traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan +PTEs for accessed pages. Upon finding one, the aging updates its +generation number to max_seq (modulo MAX_NR_GENS). After each round of +traversal, the aging increments max_seq. The aging is due when +min_seq[] reaches max_seq-1. + +The aging uses the following optimizations when walking page tables: + 1) It skips non-leaf PMD entries that have the accessed bit cleared + when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. + 2) It does not zigzag between a PGD table and the same PMD or PTE + table spanning multiple VMAs. In other words, it finishes all the + VMAs within the range of the same PMD or PTE table before it returns + to this PGD table. This optimizes workloads that have large numbers + of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45 +--- + include/linux/memcontrol.h | 3 + + include/linux/mmzone.h | 9 + + include/linux/oom.h | 16 + + include/linux/swap.h | 3 + + mm/memcontrol.c | 5 + + mm/oom_kill.c | 4 +- + mm/rmap.c | 8 + + mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++ + 8 files changed, 994 insertions(+), 2 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_ + + static inline void lock_page_memcg(struct page *page) + { ++ /* to match page_memcg_rcu() */ ++ rcu_read_lock(); + } + + static inline void unlock_page_memcg(struct page *page) + { ++ rcu_read_unlock(); + } + + static inline void mem_cgroup_handle_over_high(void) +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -295,6 +295,7 @@ enum lruvec_flags { + }; + + struct lruvec; ++struct page_vma_mapped_walk; + + #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) + #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +@@ -393,6 +394,7 @@ struct mm_walk_args { + + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); + void lru_gen_change_state(bool enable, bool main, bool swap); ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); + + #ifdef CONFIG_MEMCG + void lru_gen_init_memcg(struct mem_cgroup *memcg); +@@ -409,6 +411,10 @@ static inline void lru_gen_change_state( + { + } + ++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++} ++ + #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) + { +@@ -1028,6 +1034,9 @@ typedef struct pglist_data { + + unsigned long flags; + ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args mm_walk_args; ++#endif + ZONE_PADDING(_pad2_) + + /* Per-node vmstats */ +--- a/include/linux/oom.h ++++ b/include/linux/oom.h +@@ -57,6 +57,22 @@ struct oom_control { + extern struct mutex oom_lock; + extern struct mutex oom_adj_mutex; + ++#ifdef CONFIG_MMU ++extern struct task_struct *oom_reaper_list; ++extern struct wait_queue_head oom_reaper_wait; ++ ++static inline bool oom_reaping_in_progress(void) ++{ ++ /* racy check to see if oom reaping could be in progress */ ++ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); ++} ++#else ++static inline bool oom_reaping_in_progress(void) ++{ ++ return false; ++} ++#endif ++ + static inline void set_current_oom_origin(void) + { + current->signal->oom_flag_origin = true; +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -137,6 +137,9 @@ union swap_header { + */ + struct reclaim_state { + unsigned long reclaimed_slab; ++#ifdef CONFIG_LRU_GEN ++ struct mm_walk_args *mm_walk_args; ++#endif + }; + + #ifdef __KERNEL__ +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l + *lru_size += nr_pages; + + size = *lru_size; ++#ifdef CONFIG_LRU_GEN ++ /* unlikely but not a bug when reset_batch_size() is pending */ ++ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); ++#else + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { + VM_BUG_ON(1); + *lru_size = 0; + } ++#endif + + if (nr_pages > 0) + *lru_size += nr_pages; +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc + * victim (if that is possible) to help the OOM killer to move on. + */ + static struct task_struct *oom_reaper_th; +-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +-static struct task_struct *oom_reaper_list; ++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); ++struct task_struct *oom_reaper_list; + static DEFINE_SPINLOCK(oom_reaper_lock); + + bool __oom_reap_task_mm(struct mm_struct *mm) +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -73,6 +73,7 @@ + #include + #include + #include ++#include + + #include + +@@ -793,6 +794,13 @@ static bool page_referenced_one(struct p + } + + if (pvmw.pte) { ++ /* the multigenerational lru exploits the spatial locality */ ++ if (lru_gen_enabled() && pte_young(*pvmw.pte) && ++ !(vma->vm_flags & VM_SEQ_READ)) { ++ lru_gen_look_around(&pvmw); ++ referenced++; ++ } ++ + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -51,6 +51,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg + * shorthand helpers + ******************************************************************************/ + ++#define DEFINE_MAX_SEQ(lruvec) \ ++ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) ++ ++#define DEFINE_MIN_SEQ(lruvec) \ ++ unsigned long min_seq[ANON_AND_FILE] = { \ ++ READ_ONCE((lruvec)->evictable.min_seq[0]), \ ++ READ_ONCE((lruvec)->evictable.min_seq[1]), \ ++ } ++ + #define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ +@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int get_swappiness(struct mem_cgroup *memcg) ++{ ++ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? ++ mem_cgroup_swappiness(memcg) : 0; ++} ++ + static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) + { + struct pglist_data *pgdat = NODE_DATA(nid); +@@ -3229,6 +3246,926 @@ done: + } + + /****************************************************************************** ++ * the aging ++ ******************************************************************************/ ++ ++static int page_update_gen(struct page *page, int gen) ++{ ++ unsigned long old_flags, new_flags; ++ ++ VM_BUG_ON(gen >= MAX_NR_GENS); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & LRU_GEN_MASK)) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++} ++ ++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) ++{ ++ int old_gen, new_gen; ++ unsigned long old_flags, new_flags; ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); ++ ++ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; ++ /* page_update_gen() has updated this page? */ ++ if (new_gen >= 0 && new_gen != old_gen) { ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ return; ++ } ++ ++ new_gen = (old_gen + 1) % MAX_NR_GENS; ++ ++ new_flags &= ~LRU_GEN_MASK; ++ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ /* for end_page_writeback() */ ++ if (reclaiming) ++ new_flags |= BIT(PG_reclaim); ++ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++ ++ lru_gen_update_size(page, lruvec, old_gen, new_gen); ++ if (reclaiming) ++ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); ++ else ++ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); ++} ++ ++static void update_batch_size(struct page *page, int old_gen, int new_gen, ++ struct mm_walk_args *args) ++{ ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON(old_gen >= MAX_NR_GENS); ++ VM_BUG_ON(new_gen >= MAX_NR_GENS); ++ ++ args->batch_size++; ++ ++ args->nr_pages[old_gen][type][zone] -= delta; ++ args->nr_pages[new_gen][type][zone] += delta; ++} ++ ++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ args->batch_size = 0; ++ ++ for_each_gen_type_zone(gen, type, zone) { ++ enum lru_list lru = type * LRU_FILE; ++ int delta = args->nr_pages[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ args->nr_pages[gen][type][zone] = 0; ++ WRITE_ONCE(lrugen->sizes[gen][type][zone], ++ lrugen->sizes[gen][type][zone] + delta); ++ ++ if (lru_gen_is_active(lruvec, gen)) ++ lru += LRU_ACTIVE; ++ update_lru_size(lruvec, lru, zone, delta); ++ } ++} ++ ++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) ++{ ++ struct address_space *mapping; ++ struct vm_area_struct *vma = walk->vma; ++ struct mm_walk_args *args = walk->private; ++ ++ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || ++ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) ++ return true; ++ ++ if (vma_is_anonymous(vma)) ++ return !args->swappiness; ++ ++ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) ++ return true; ++ ++ mapping = vma->vm_file->f_mapping; ++ if (!mapping->a_ops->writepage) ++ return true; ++ ++ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); ++} ++ ++/* ++ * Some userspace memory allocators create many single-page VMAs. So instead of ++ * returning back to the PGD table for each of such VMAs, we finish at least an ++ * entire PMD table and therefore avoid many zigzags. ++ */ ++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, ++ unsigned long *start, unsigned long *end) ++{ ++ unsigned long next = round_up(*end, size); ++ ++ VM_BUG_ON(mask & size); ++ VM_BUG_ON(*start >= *end); ++ VM_BUG_ON((next & mask) != (*start & mask)); ++ ++ while (walk->vma) { ++ if (next >= walk->vma->vm_end) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ if ((next & mask) != (walk->vma->vm_start & mask)) ++ return false; ++ ++ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { ++ walk->vma = walk->vma->vm_next; ++ continue; ++ } ++ ++ *start = max(next, walk->vma->vm_start); ++ next = (next | ~mask) + 1; ++ /* rounded-up boundaries can wrap to 0 */ ++ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pte_t *pte; ++ spinlock_t *ptl; ++ unsigned long addr; ++ int worth = 0; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pmd_leaf(*pmd)); ++ ++ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); ++ arch_enter_lazy_mmu_mode(); ++restart: ++ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ struct page *page; ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ if (!pte_young(pte[i])) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); ++ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ ++ worth++; ++ } ++ ++ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) ++ goto restart; ++ ++ arch_leave_lazy_mmu_mode(); ++ pte_unmap_unlock(pte, ptl); ++ ++ return worth >= MIN_BATCH_SIZE / 2; ++} ++ ++/* ++ * We scan PMD entries in two passes. The first pass reaches to PTE tables and ++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD ++ * entries and needs to take the PMD lock. ++ */ ++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ spinlock_t *ptl; ++ struct mm_walk_args *args = walk->private; ++ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ start = (start & PUD_MASK) + offset * PMD_SIZE; ++ pmd = pmd_offset(pud, start); ++ ptl = pmd_lock(walk->mm, pmd); ++ arch_enter_lazy_mmu_mode(); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { ++ struct page *page; ++ unsigned long pfn = pmd_pfn(pmd[i]); ++ unsigned long addr = start + i * PMD_SIZE; ++ ++ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) ++ continue; ++ ++ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) ++ continue; ++ ++ if (!pmd_trans_huge(pmd[i])) { ++ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) ++ pmdp_test_and_clear_young(vma, addr, pmd + i); ++ continue; ++ } ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ page = pfn_to_page(pfn); ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ if (page_to_nid(page) != args->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != args->memcg) ++ continue; ++ ++ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); ++ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_LEAF_YOUNG]++; ++ ++ if (pmd_dirty(pmd[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ set_page_dirty(page); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ spin_unlock(ptl); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++#else ++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, ++ struct vm_area_struct *vma, struct mm_walk *walk) ++{ ++} ++#endif ++ ++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pmd_t *pmd; ++ unsigned long next; ++ unsigned long addr; ++ struct vm_area_struct *vma; ++ int offset = -1; ++ bool reset = false; ++ struct mm_walk_args *args = walk->private; ++ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); ++ ++ VM_BUG_ON(pud_leaf(*pud)); ++ ++ pmd = pmd_offset(pud, start & PUD_MASK); ++restart: ++ vma = walk->vma; ++ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { ++ pmd_t val = pmd_read_atomic(pmd + i); ++ ++ /* for pmd_read_atomic() */ ++ barrier(); ++ ++ next = pmd_addr_end(addr, end); ++ ++ if (!pmd_present(val)) { ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ continue; ++ } ++ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ if (pmd_trans_huge(val)) { ++ unsigned long pfn = pmd_pfn(val); ++ ++ args->mm_stats[MM_LEAF_TOTAL]++; ++ ++ if (is_huge_zero_pmd(val)) ++ continue; ++ ++ if (!pmd_young(val)) { ++ args->mm_stats[MM_LEAF_OLD]++; ++ continue; ++ } ++ ++ if (pfn < args->start_pfn || pfn >= args->end_pfn) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ } ++ __set_bit(i - offset, args->bitmap); ++ reset = true; ++ continue; ++ } ++#endif ++ args->mm_stats[MM_NONLEAF_TOTAL]++; ++ ++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG ++ if (!pmd_young(val)) ++ continue; ++ ++ if (offset < 0) ++ offset = i; ++ else if (i - offset >= MIN_BATCH_SIZE) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = i; ++ reset = false; ++ } ++ __set_bit(i - offset, args->bitmap); ++#endif ++ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_PREV]++; ++ ++ if (!walk_pte_range(&val, addr, next, walk)) ++ continue; ++ ++ args->mm_stats[MM_NONLEAF_CUR]++; ++ ++ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); ++ } ++ ++ if (reset) { ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++ offset = -1; ++ reset = false; ++ } ++ ++ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) ++ goto restart; ++ ++ if (offset >= 0) ++ walk_pmd_range_locked(pud, start, offset, vma, walk); ++} ++ ++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, ++ struct mm_walk *walk) ++{ ++ int i; ++ pud_t *pud; ++ unsigned long addr; ++ unsigned long next; ++ struct mm_walk_args *args = walk->private; ++ ++ VM_BUG_ON(p4d_leaf(*p4d)); ++ ++ pud = pud_offset(p4d, start & P4D_MASK); ++restart: ++ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { ++ pud_t val = READ_ONCE(pud[i]); ++ ++ next = pud_addr_end(addr, end); ++ ++ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) ++ continue; ++ ++ walk_pmd_range(&val, addr, next, walk); ++ ++ if (args->batch_size >= MAX_BATCH_SIZE) { ++ end = (addr | ~PUD_MASK) + 1; ++ goto done; ++ } ++ } ++ ++ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) ++ goto restart; ++ ++ end = round_up(end, P4D_SIZE); ++done: ++ /* rounded-up boundaries can wrap to 0 */ ++ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; ++ ++ return -EAGAIN; ++} ++ ++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) ++{ ++ static const struct mm_walk_ops mm_walk_ops = { ++ .test_walk = should_skip_vma, ++ .p4d_entry = walk_pud_range, ++ }; ++ ++ int err; ++ ++ args->next_addr = FIRST_USER_ADDRESS; ++ ++ do { ++ unsigned long start = args->next_addr; ++ unsigned long end = mm->highest_vm_end; ++ ++ err = -EBUSY; ++ ++ rcu_read_lock(); ++#ifdef CONFIG_MEMCG ++ if (args->memcg && atomic_read(&args->memcg->moving_account)) ++ goto contended; ++#endif ++ if (!mmap_read_trylock(mm)) ++ goto contended; ++ ++ err = walk_page_range(mm, start, end, &mm_walk_ops, args); ++ ++ mmap_read_unlock(mm); ++ ++ if (args->batch_size) { ++ spin_lock_irq(&lruvec->lru_lock); ++ reset_batch_size(lruvec, args); ++ spin_unlock_irq(&lruvec->lru_lock); ++ } ++contended: ++ rcu_read_unlock(); ++ ++ cond_resched(); ++ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); ++} ++ ++static struct mm_walk_args *alloc_mm_walk_args(void) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); ++ ++ return current->reclaim_state->mm_walk_args; ++} ++ ++static void free_mm_walk_args(struct mm_walk_args *args) ++{ ++ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) ++ kvfree(args); ++} ++ ++static bool inc_min_seq(struct lruvec *lruvec, int type) ++{ ++ int gen, zone; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) ++ return true; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ page_inc_gen(page, lruvec, false); ++ ++ if (!--remaining) ++ return false; ++ } ++ } ++ ++ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); ++ ++ return true; ++} ++ ++static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) ++{ ++ int gen, type, zone; ++ bool success = false; ++ struct lrugen *lrugen = &lruvec->evictable; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { ++ gen = lru_gen_from_seq(min_seq[type]); ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ if (!list_empty(&lrugen->lists[gen][type][zone])) ++ goto next; ++ } ++ ++ min_seq[type]++; ++ } ++next: ++ ; ++ } ++ ++ min_seq[0] = min(min_seq[0], min_seq[1]); ++ if (swappiness) ++ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ if (min_seq[type] == lrugen->min_seq[type]) ++ continue; ++ ++ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); ++ success = true; ++ } ++ ++ return success; ++} ++ ++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) ++{ ++ int gen, type, zone; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ if (max_seq != lrugen->max_seq) ++ goto unlock; ++ ++ if (!try_to_inc_min_seq(lruvec, true)) { ++ for (type = ANON_AND_FILE - 1; type >= 0; type--) { ++ while (!inc_min_seq(lruvec, type)) { ++ spin_unlock_irq(&lruvec->lru_lock); ++ cond_resched(); ++ spin_lock_irq(&lruvec->lru_lock); ++ } ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq - 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); ++ } ++ } ++ ++ gen = lru_gen_from_seq(lrugen->max_seq + 1); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) { ++ enum lru_list lru = type * LRU_FILE; ++ long delta = lrugen->sizes[gen][type][zone]; ++ ++ if (!delta) ++ continue; ++ ++ WARN_ON_ONCE(delta != (int)delta); ++ ++ update_lru_size(lruvec, lru, zone, -delta); ++ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); ++ } ++ } ++ ++ WRITE_ONCE(lrugen->timestamps[gen], jiffies); ++ /* make sure all preceding modifications appear first */ ++ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); ++unlock: ++ spin_unlock_irq(&lruvec->lru_lock); ++} ++ ++/* Main function used by the foreground, the background and the user-triggered aging. */ ++static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, bool use_filter) ++{ ++ bool last; ++ struct mm_walk_args *args; ++ struct mm_struct *mm = NULL; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ int nid = pgdat->node_id; ++ ++ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); ++ ++ /* ++ * If we are not from run_aging() and clearing the accessed bit may ++ * trigger page faults, then don't proceed to clearing all accessed ++ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a ++ * handful of accessed PTEs. This is less efficient but causes fewer ++ * page faults on CPUs that don't have the capability. ++ */ ++ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { ++ inc_max_seq(lruvec, max_seq); ++ return true; ++ } ++ ++ args = alloc_mm_walk_args(); ++ if (!args) ++ return false; ++ ++ args->memcg = memcg; ++ args->max_seq = max_seq; ++ args->start_pfn = pgdat->node_start_pfn; ++ args->end_pfn = pgdat_end_pfn(pgdat); ++ args->node_id = nid; ++ args->swappiness = swappiness; ++ args->use_filter = use_filter; ++ ++ do { ++ last = get_next_mm(lruvec, args, &mm); ++ if (mm) ++ walk_mm(lruvec, mm, args); ++ ++ cond_resched(); ++ } while (mm); ++ ++ free_mm_walk_args(args); ++ ++ if (!last) { ++ /* don't wait unless we may have trouble reclaiming */ ++ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) ++ wait_event_killable(lruvec->mm_walk.wait, ++ max_seq < READ_ONCE(lrugen->max_seq)); ++ ++ return max_seq < READ_ONCE(lrugen->max_seq); ++ } ++ ++ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); ++ ++ inc_max_seq(lruvec, max_seq); ++ /* either we see any waiters or they will see updated max_seq */ ++ if (wq_has_sleeper(&lruvec->mm_walk.wait)) ++ wake_up_all(&lruvec->mm_walk.wait); ++ ++ wakeup_flusher_threads(WB_REASON_VMSCAN); ++ ++ return true; ++} ++ ++static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long max_seq, unsigned long *min_seq, bool *low) ++{ ++ int gen, type, zone; ++ long max = 0; ++ long min = 0; ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (type = !swappiness; type < ANON_AND_FILE; type++) { ++ unsigned long seq; ++ ++ for (seq = min_seq[type]; seq <= max_seq; seq++) { ++ long size = 0; ++ ++ gen = lru_gen_from_seq(seq); ++ ++ for (zone = 0; zone <= sc->reclaim_idx; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ max += size; ++ if (type && max_seq - seq >= MIN_NR_GENS) ++ min += size; ++ } ++ } ++ ++ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; ++ ++ return max > 0 ? max : 0; ++} ++ ++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, ++ unsigned long min_ttl) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int swappiness = get_swappiness(memcg); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg)) ++ return false; ++ ++ if (min_ttl) { ++ int gen = lru_gen_from_seq(min_seq[1]); ++ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); ++ ++ if (time_is_after_jiffies(birth + min_ttl)) ++ return false; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return false; ++ ++ nr_to_scan >>= sc->priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); ++ ++ return true; ++} ++ ++/* Protect the working set accessed within the last N milliseconds. */ ++static unsigned long lru_gen_min_ttl __read_mostly; ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++ struct mem_cgroup *memcg; ++ bool success = false; ++ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); ++ ++ VM_BUG_ON(!current_is_kswapd()); ++ ++ if (!sc->force_deactivate) { ++ sc->force_deactivate = 1; ++ return; ++ } ++ ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ ++ if (age_lruvec(lruvec, sc, min_ttl)) ++ success = true; ++ ++ cond_resched(); ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ if (!success && mutex_trylock(&oom_lock)) { ++ struct oom_control oc = { ++ .gfp_mask = sc->gfp_mask, ++ .order = sc->order, ++ }; ++ ++ /* to avoid overkilling */ ++ if (!oom_reaping_in_progress()) ++ out_of_memory(&oc); ++ ++ mutex_unlock(&oom_lock); ++ } ++ ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ ++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) ++{ ++ int i; ++ pte_t *pte; ++ struct page *page; ++ int old_gen, new_gen; ++ unsigned long start; ++ unsigned long end; ++ unsigned long addr; ++ struct mm_walk_args *args; ++ int worth = 0; ++ struct mem_cgroup *memcg = page_memcg(pvmw->page); ++ struct pglist_data *pgdat = page_pgdat(pvmw->page); ++ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ DEFINE_MAX_SEQ(lruvec); ++ ++ lockdep_assert_held(pvmw->ptl); ++ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (!args) ++ return; ++ ++ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); ++ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; ++ ++ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { ++ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ end = start + MIN_BATCH_SIZE * PAGE_SIZE; ++ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) ++ start = end - MIN_BATCH_SIZE * PAGE_SIZE; ++ else { ++ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; ++ } ++ } ++ ++ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; ++ new_gen = lru_gen_from_seq(max_seq); ++ ++ lock_page_memcg(pvmw->page); ++ arch_enter_lazy_mmu_mode(); ++ ++ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { ++ unsigned long pfn = pte_pfn(pte[i]); ++ ++ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) ++ continue; ++ ++ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) ++ continue; ++ ++ VM_BUG_ON(!pfn_valid(pfn)); ++ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) ++ continue; ++ ++ worth++; ++ ++ if (!pte_young(pte[i])) ++ continue; ++ ++ page = compound_head(pfn_to_page(pfn)); ++ if (page_to_nid(page) != pgdat->node_id) ++ continue; ++ ++ if (page_memcg_rcu(page) != memcg) ++ continue; ++ ++ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); ++ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) ++ continue; ++ ++ if (pte_dirty(pte[i]) && !PageDirty(page) && ++ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) ++ __set_bit(i, args->bitmap); ++ ++ old_gen = page_update_gen(page, new_gen); ++ if (old_gen >= 0 && old_gen != new_gen) ++ update_batch_size(page, old_gen, new_gen, args); ++ } ++ ++ arch_leave_lazy_mmu_mode(); ++ unlock_page_memcg(pvmw->page); ++ ++ if (worth >= MIN_BATCH_SIZE / 2) ++ set_bloom_filter(lruvec, max_seq, pvmw->pmd); ++ ++ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) ++ set_page_dirty(pte_page(pte[i])); ++ ++ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void) + }; + late_initcall(init_lru_gen); + ++#else ++ ++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis + struct mem_cgroup *memcg; + struct lruvec *lruvec; + ++ if (lru_gen_enabled()) { ++ lru_gen_age_node(pgdat, sc); ++ return; ++ } ++ + if (!can_age_anon_pages(pgdat, sc)) + return; + diff --git a/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch new file mode 100644 index 0000000000..09e4315dcc --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-07-mm-multigenerational-lru-eviction.patch @@ -0,0 +1,1002 @@ +From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 29 Jun 2021 20:46:47 -0600 +Subject: [PATCH 07/10] mm: multigenerational lru: eviction + +The eviction consumes old generations. Given an lruvec, the eviction +scans pages on lrugen->lists indexed by anon and file min_seq[] +(modulo MAX_NR_GENS). It first tries to select a type based on the +values of min_seq[]. If they are equal, it selects the type that has +a lower refaulted %. The eviction sorts a page according to its +updated generation number if the aging has found this page accessed. +It also moves a page to the next generation if this page is from an +upper tier that has a higher refaulted % than the base tier. The +eviction increments min_seq[] of a selected type when it finds +lrugen->lists indexed by min_seq[] of this selected type are empty. + +Each generation is divided into multiple tiers. Tiers represent +different ranges of numbers of accesses from file descriptors only. +Pages accessed N times via file descriptors belong to tier +order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, +and they require additional MAX_NR_TIERS-2 bits in page->flags. In +contrast to moving between generations which requires list operations, +moving between tiers only involves operations on page->flags and +therefore has a negligible cost. A feedback loop modeled after the PID +controller monitors refaulted % across all tiers and decides when to +protect pages from which tiers. + +Unmapped pages are initially added to the oldest generation and then +conditionally protected by tiers. Each tier keeps track of how many +pages from it have refaulted. Tier 0 is the base tier and pages from +it are evicted unconditionally because there are no better candidates. +Pages from an upper tier are either evicted or moved to the next +generation, depending on whether this upper tier has a higher +refaulted % than the base tier. This model has the following +advantages: + 1) It removes the cost in the buffered access path and reduces the + overall cost of protection because pages are conditionally protected + in the reclaim path. + 2) It takes mapped pages into account and avoids overprotecting + pages accessed multiple times via file descriptors. + 3 Additional tiers improve the protection of pages accessed more + than twice. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac +--- + include/linux/mm_inline.h | 10 + + include/linux/mmzone.h | 33 +++ + mm/swap.c | 42 +++ + mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- + mm/workingset.c | 120 ++++++++- + 5 files changed, 757 insertions(+), 3 deletions(-) + +--- a/include/linux/mm_inline.h ++++ b/include/linux/mm_inline.h +@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi + return seq % NR_HIST_GENS; + } + ++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ ++static inline int lru_tier_from_refs(int refs) ++{ ++ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); ++ ++ return order_base_2(refs + 1); ++} ++ + /* The youngest and the second youngest generations are counted as active. */ + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) + { +@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + + new_flags &= ~LRU_GEN_MASK; ++ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for shrink_page_list() */ + if (reclaiming) + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; + #define MIN_NR_GENS 2 + #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) + ++/* ++ * Each generation is divided into multiple tiers. Tiers represent different ++ * ranges of numbers of accesses from file descriptors, i.e., ++ * mark_page_accessed(). In contrast to moving between generations which ++ * requires the lru lock, moving between tiers only involves an atomic ++ * operation on page->flags and therefore has a negligible cost. ++ * ++ * The purposes of tiers are to: ++ * 1) estimate whether pages accessed multiple times via file descriptors are ++ * more active than pages accessed only via page tables by separating the two ++ * access types into upper tiers and the base tier, and comparing refaulted % ++ * across all tiers. ++ * 2) improve buffered io performance by deferring the protection of pages ++ * accessed multiple times until the eviction. That is the protection happens ++ * in the reclaim path, not the access path. ++ * ++ * Pages accessed N times via file descriptors belong to tier order_base_2(N). ++ * The base tier may be marked by PageReferenced(). All upper tiers are marked ++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are ++ * used to support more than one upper tier. ++ */ ++#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) ++#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) ++ + /* Whether to keep stats for historical generations. */ + #ifdef CONFIG_LRU_GEN_STATS + #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +@@ -337,6 +361,15 @@ struct lrugen { + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the sizes of the multigenerational lru lists in pages */ + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; ++ /* the exponential moving average of refaulted */ ++ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the exponential moving average of protected+evicted */ ++ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; ++ /* the base tier isn't protected, hence the minus one */ ++ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; ++ /* incremented without holding the lru lock */ ++ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; ++ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + /* whether the multigenerational lru is enabled */ + bool enabled[ANON_AND_FILE]; + }; +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st + local_unlock(&lru_pvecs.lock); + } + ++#ifdef CONFIG_LRU_GEN ++static void page_inc_refs(struct page *page) ++{ ++ unsigned long refs; ++ unsigned long old_flags, new_flags; ++ ++ if (PageUnevictable(page)) ++ return; ++ ++ /* see the comment on MAX_NR_TIERS */ ++ do { ++ new_flags = old_flags = READ_ONCE(page->flags); ++ ++ if (!(new_flags & BIT(PG_referenced))) { ++ new_flags |= BIT(PG_referenced); ++ continue; ++ } ++ ++ if (!(new_flags & BIT(PG_workingset))) { ++ new_flags |= BIT(PG_workingset); ++ continue; ++ } ++ ++ refs = new_flags & LRU_REFS_MASK; ++ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); ++ ++ new_flags &= ~LRU_REFS_MASK; ++ new_flags |= refs; ++ } while (new_flags != old_flags && ++ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); ++} ++#else ++static void page_inc_refs(struct page *page) ++{ ++} ++#endif /* CONFIG_LRU_GEN */ ++ + /* + * Mark a page as having seen activity. + * +@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag + { + page = compound_head(page); + ++ if (lru_gen_enabled()) { ++ page_inc_refs(page); ++ return; ++ } ++ + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; +- mem_cgroup_swapout(page, swap); ++ ++ /* get a shadow entry before page_memcg() is cleared */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(page, target_memcg); ++ mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_page(page, swap); +@@ -1410,6 +1412,11 @@ retry: + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + ++ /* lru_gen_look_around() has updated this page? */ ++ if (lru_gen_enabled() && !ignore_references && ++ page_mapped(page) && PageReferenced(page)) ++ goto keep_locked; ++ + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + +@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t + unsigned long file; + struct lruvec *target_lruvec; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* +@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + } + ++static int page_lru_tier(struct page *page) ++{ ++ int refs; ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? ++ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; ++ ++ return lru_tier_from_refs(refs); ++} ++ + static int get_swappiness(struct mem_cgroup *memcg) + { + return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? +@@ -3246,6 +3267,91 @@ done: + } + + /****************************************************************************** ++ * refault feedback loop ++ ******************************************************************************/ ++ ++/* ++ * A feedback loop modeled after the PID controller. Currently supports the ++ * proportional (P) and the integral (I) terms; the derivative (D) term can be ++ * added if necessary. The setpoint (SP) is the desired position; the process ++ * variable (PV) is the measured position. The error is the difference between ++ * the SP and the PV. A positive error results in a positive control output ++ * correction, which, in our case, is to allow eviction. ++ * ++ * The P term is refaulted % of the current generation being evicted. The I ++ * term is the exponential moving average of refaulted % of previously evicted ++ * generations, using the smoothing factor 1/2. ++ * ++ * Our goal is to maintain proportional refaulted % across all tiers. ++ */ ++struct ctrl_pos { ++ unsigned long refaulted; ++ unsigned long total; ++ int gain; ++}; ++ ++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, ++ struct ctrl_pos *pos) ++{ ++ struct lrugen *lrugen = &lruvec->evictable; ++ int hist = lru_hist_from_seq(lrugen->min_seq[type]); ++ ++ pos->refaulted = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ pos->total = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ pos->total += lrugen->protected[hist][type][tier - 1]; ++ pos->gain = gain; ++} ++ ++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) ++{ ++ int tier; ++ int hist = lru_hist_from_seq(gen); ++ struct lrugen *lrugen = &lruvec->evictable; ++ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); ++ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; ++ ++ if (!carryover && !clear) ++ return; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ if (carryover) { ++ unsigned long sum; ++ ++ sum = lrugen->avg_refaulted[type][tier] + ++ atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); ++ ++ sum = lrugen->avg_total[type][tier] + ++ atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ sum += lrugen->protected[hist][type][tier - 1]; ++ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); ++ } ++ ++ if (clear) { ++ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); ++ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); ++ if (tier) ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); ++ } ++ } ++} ++ ++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) ++{ ++ /* ++ * Allow eviction if the PV has a limited number of refaulted pages or a ++ * lower refaulted % than the SP. ++ */ ++ return pv->refaulted < MIN_BATCH_SIZE || ++ pv->refaulted * max(sp->total, 1UL) * sp->gain <= ++ sp->refaulted * max(pv->total, 1UL) * pv->gain; ++} ++ ++/****************************************************************************** + * the aging + ******************************************************************************/ + +@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page * + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); + +@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa + + new_flags &= ~LRU_GEN_MASK; + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; ++ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); + /* for end_page_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); +@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l + } + } + ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +@@ -3824,6 +3933,8 @@ next: + if (min_seq[type] == lrugen->min_seq[type]) + continue; + ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } +@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l + } + } + ++ for (type = 0; type < ANON_AND_FILE; type++) ++ reset_ctrl_pos(lruvec, gen, type); ++ + WRITE_ONCE(lrugen->timestamps[gen], jiffies); + /* make sure all preceding modifications appear first */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma + } + + /****************************************************************************** ++ * the eviction ++ ******************************************************************************/ ++ ++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) ++{ ++ bool success; ++ int gen = page_lru_gen(page); ++ int type = page_is_file_lru(page); ++ int zone = page_zonenum(page); ++ int tier = page_lru_tier(page); ++ int delta = thp_nr_pages(page); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); ++ ++ /* an mlocked page? */ ++ if (!page_evictable(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageUnevictable(page); ++ add_page_to_lru_list(page, lruvec); ++ __count_vm_events(UNEVICTABLE_PGCULLED, delta); ++ return true; ++ } ++ ++ /* a lazy-free page that has been written into? */ ++ if (type && PageDirty(page) && PageAnon(page)) { ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ SetPageSwapBacked(page); ++ add_page_to_lru_list_tail(page, lruvec); ++ return true; ++ } ++ ++ /* page_update_gen() has updated this page? */ ++ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { ++ list_move(&page->lru, &lrugen->lists[gen][type][zone]); ++ return true; ++ } ++ ++ /* protect this page if its tier has a higher refaulted % */ ++ if (tier > tier_idx) { ++ int hist = lru_hist_from_seq(gen); ++ ++ page_inc_gen(page, lruvec, false); ++ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], ++ lrugen->protected[hist][type][tier - 1] + delta); ++ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); ++ return true; ++ } ++ ++ /* mark this page for reclaim if it's pending writeback */ ++ if (PageWriteback(page) || (type && PageDirty(page))) { ++ page_inc_gen(page, lruvec, true); ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) ++{ ++ bool success; ++ ++ if (!sc->may_unmap && page_mapped(page)) ++ return false; ++ ++ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && ++ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) ++ return false; ++ ++ if (!get_page_unless_zero(page)) ++ return false; ++ ++ if (!TestClearPageLRU(page)) { ++ put_page(page); ++ return false; ++ } ++ ++ success = lru_gen_del_page(page, lruvec, true); ++ VM_BUG_ON_PAGE(!success, page); ++ ++ return true; ++} ++ ++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, ++ int type, int tier, struct list_head *list) ++{ ++ int gen, zone; ++ enum vm_event_item item; ++ int sorted = 0; ++ int scanned = 0; ++ int isolated = 0; ++ int remaining = MAX_BATCH_SIZE; ++ struct lrugen *lrugen = &lruvec->evictable; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ ++ VM_BUG_ON(!list_empty(list)); ++ ++ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) ++ return 0; ++ ++ gen = lru_gen_from_seq(lrugen->min_seq[type]); ++ ++ for (zone = sc->reclaim_idx; zone >= 0; zone--) { ++ LIST_HEAD(moved); ++ int skipped = 0; ++ struct list_head *head = &lrugen->lists[gen][type][zone]; ++ ++ while (!list_empty(head)) { ++ struct page *page = lru_to_page(head); ++ int delta = thp_nr_pages(page); ++ ++ VM_BUG_ON_PAGE(PageTail(page), page); ++ VM_BUG_ON_PAGE(PageUnevictable(page), page); ++ VM_BUG_ON_PAGE(PageActive(page), page); ++ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); ++ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); ++ ++ prefetchw_prev_lru_page(page, head, flags); ++ ++ scanned += delta; ++ ++ if (sort_page(page, lruvec, tier)) ++ sorted += delta; ++ else if (isolate_page(page, lruvec, sc)) { ++ list_add(&page->lru, list); ++ isolated += delta; ++ } else { ++ list_move(&page->lru, &moved); ++ skipped += delta; ++ } ++ ++ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ if (skipped) { ++ list_splice(&moved, head); ++ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); ++ } ++ ++ if (!remaining || isolated >= MIN_BATCH_SIZE) ++ break; ++ } ++ ++ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; ++ if (!cgroup_reclaim(sc)) { ++ __count_vm_events(item, isolated); ++ __count_vm_events(PGREFILL, sorted); ++ } ++ __count_memcg_events(memcg, item, isolated); ++ __count_memcg_events(memcg, PGREFILL, sorted); ++ __count_vm_events(PGSCAN_ANON + type, isolated); ++ ++ /* ++ * We may have trouble finding eligible pages due to reclaim_idx, ++ * may_unmap and may_writepage. Check `remaining` to make sure we won't ++ * be stuck if we aren't making enough progress. ++ */ ++ return isolated || !remaining ? scanned : 0; ++} ++ ++static int get_tier_idx(struct lruvec *lruvec, int type) ++{ ++ int tier; ++ struct ctrl_pos sp, pv; ++ ++ /* ++ * Ideally we don't want to evict upper tiers that have higher refaulted ++ * %. However, we need to leave a margin for the fluctuation in ++ * refaulted %. So we use a larger gain factor to make sure upper tiers ++ * are indeed more active. We choose 2 because the lowest upper tier ++ * would have twice of refaulted % of the base tier, according to their ++ * numbers of accesses. ++ */ ++ read_ctrl_pos(lruvec, type, 0, 1, &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, 2, &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ return tier - 1; ++} ++ ++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) ++{ ++ int type, tier; ++ struct ctrl_pos sp, pv; ++ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; ++ ++ /* ++ * Compare refaulted % between the base tiers of anon and file to ++ * determine which type to evict. Also need to compare refaulted % of ++ * the upper tiers of the selected type with that of the base tier of ++ * the other type to determine which tier of the selected type to evict. ++ */ ++ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); ++ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); ++ type = positive_ctrl_err(&sp, &pv); ++ ++ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); ++ for (tier = 1; tier < MAX_NR_TIERS; tier++) { ++ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); ++ if (!positive_ctrl_err(&sp, &pv)) ++ break; ++ } ++ ++ *tier_idx = tier - 1; ++ ++ return type; ++} ++ ++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ int *type_scanned, struct list_head *list) ++{ ++ int i; ++ int type; ++ int scanned; ++ int tier = -1; ++ DEFINE_MIN_SEQ(lruvec); ++ ++ VM_BUG_ON(!seq_is_valid(lruvec)); ++ ++ /* ++ * Try to select a type based on generations and swappiness, and if that ++ * fails, fall back to get_type_to_scan(). When anon and file are both ++ * available from the same generation, swappiness 200 is interpreted as ++ * anon first and swappiness 1 is interpreted as file first. ++ */ ++ if (!swappiness) ++ type = 1; ++ else if (min_seq[0] < min_seq[1]) ++ type = 0; ++ else if (swappiness == 1) ++ type = 1; ++ else if (swappiness == 200) ++ type = 0; ++ else ++ type = get_type_to_scan(lruvec, swappiness, &tier); ++ ++ for (i = !swappiness; i < ANON_AND_FILE; i++) { ++ if (tier < 0) ++ tier = get_tier_idx(lruvec, type); ++ ++ scanned = scan_pages(lruvec, sc, type, tier, list); ++ if (scanned) ++ break; ++ ++ type = !type; ++ tier = -1; ++ } ++ ++ *type_scanned = type; ++ ++ return scanned; ++} ++ ++/* Main function used by the foreground, the background and the user-triggered eviction. */ ++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ int type; ++ int scanned; ++ int reclaimed; ++ LIST_HEAD(list); ++ struct page *page; ++ enum vm_event_item item; ++ struct reclaim_stat stat; ++ struct mm_walk_args *args; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); ++ ++ if (try_to_inc_min_seq(lruvec, swappiness)) ++ scanned++; ++ ++ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) ++ scanned = 0; ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ if (list_empty(&list)) ++ return scanned; ++ ++ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); ++ /* ++ * We need to prevent rejected pages from being added back to the same ++ * lists they were isolated from. Otherwise we may risk looping on them ++ * forever. ++ */ ++ list_for_each_entry(page, &list, lru) { ++ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) ++ SetPageActive(page); ++ ++ ClearPageReferenced(page); ++ ClearPageWorkingset(page); ++ } ++ ++ spin_lock_irq(&lruvec->lru_lock); ++ ++ move_pages_to_lru(lruvec, &list); ++ ++ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; ++ if (args && args->batch_size) ++ reset_batch_size(lruvec, args); ++ ++ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; ++ if (!cgroup_reclaim(sc)) ++ __count_vm_events(item, reclaimed); ++ __count_memcg_events(memcg, item, reclaimed); ++ __count_vm_events(PGSTEAL_ANON + type, reclaimed); ++ ++ spin_unlock_irq(&lruvec->lru_lock); ++ ++ mem_cgroup_uncharge_list(&list); ++ free_unref_page_list(&list); ++ ++ sc->nr_reclaimed += reclaimed; ++ ++ return scanned; ++} ++ ++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) ++{ ++ bool low; ++ long nr_to_scan; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ int priority = sc->priority; ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (mem_cgroup_below_min(memcg) || ++ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) ++ return 0; ++ ++ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ++ priority = DEF_PRIORITY; ++ sc->force_deactivate = 0; ++ } ++ ++ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); ++ if (!nr_to_scan) ++ return 0; ++ ++ nr_to_scan >>= priority; ++ ++ if (!mem_cgroup_online(memcg)) ++ nr_to_scan++; ++ ++ if (!nr_to_scan) ++ return 0; ++ ++ if (current_is_kswapd()) { ++ /* leave the work to lru_gen_age_node() */ ++ if (max_seq - min_seq[1] < MIN_NR_GENS) ++ return 0; ++ ++ if (!low) ++ sc->force_deactivate = 0; ++ ++ return nr_to_scan; ++ } ++ ++ if (max_seq - min_seq[1] >= MIN_NR_GENS) ++ return nr_to_scan; ++ ++ /* move onto slab and other memcgs if we haven't tried them all */ ++ if (!sc->force_deactivate) { ++ sc->skipped_deactivate = 1; ++ return 0; ++ } ++ ++ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; ++} ++ ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++ struct blk_plug plug; ++ long scanned = 0; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ ++ lru_add_drain(); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; ++ ++ blk_start_plug(&plug); ++ ++ while (true) { ++ int delta; ++ int swappiness; ++ long nr_to_scan; ++ ++ if (sc->may_swap) ++ swappiness = get_swappiness(memcg); ++ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) ++ swappiness = 1; ++ else ++ swappiness = 0; ++ ++ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); ++ if (!nr_to_scan) ++ break; ++ ++ delta = evict_pages(lruvec, sc, swappiness); ++ if (!delta) ++ break; ++ ++ scanned += delta; ++ if (scanned >= nr_to_scan) ++ break; ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ if (current_is_kswapd()) ++ current->reclaim_state->mm_walk_args = NULL; ++} ++ ++/****************************************************************************** + * state change + ******************************************************************************/ + +@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli + { + } + ++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) ++{ ++} ++ + #endif /* CONFIG_LRU_GEN */ + + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec + struct blk_plug plug; + bool scan_adjusted; + ++ if (lru_gen_enabled()) { ++ lru_gen_shrink_lruvec(lruvec, sc); ++ return; ++ } ++ + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ +@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem + struct lruvec *target_lruvec; + unsigned long refaults; + ++ if (lru_gen_enabled()) ++ return; ++ + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ + static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) + { +- eviction >>= bucket_order; + eviction &= EVICTION_MASK; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; +@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); +- *evictionp = entry << bucket_order; ++ *evictionp = entry; + *workingsetp = workingset; + } + ++#ifdef CONFIG_LRU_GEN ++ ++static int page_lru_refs(struct page *page) ++{ ++ unsigned long flags = READ_ONCE(page->flags); ++ ++ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); ++ ++ /* see the comment on MAX_NR_TIERS */ ++ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; ++} ++ ++/* Return a token to be stored in the shadow entry of a page being evicted. */ ++static void *lru_gen_eviction(struct page *page) ++{ ++ int hist, tier; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ int type = page_is_file_lru(page); ++ int refs = page_lru_refs(page); ++ int delta = thp_nr_pages(page); ++ bool workingset = PageWorkingset(page); ++ struct mem_cgroup *memcg = page_memcg(page); ++ struct pglist_data *pgdat = page_pgdat(page); ++ ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ token = (min_seq << LRU_REFS_WIDTH) | refs; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); ++ ++ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); ++} ++ ++/* Count a refaulted page based on the token stored in its shadow entry. */ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++ int hist, tier, refs; ++ int memcg_id; ++ bool workingset; ++ unsigned long token; ++ unsigned long min_seq; ++ struct lruvec *lruvec; ++ struct lrugen *lrugen; ++ struct mem_cgroup *memcg; ++ struct pglist_data *pgdat; ++ int type = page_is_file_lru(page); ++ int delta = thp_nr_pages(page); ++ ++ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); ++ if (page_pgdat(page) != pgdat) ++ return; ++ ++ rcu_read_lock(); ++ memcg = page_memcg_rcu(page); ++ if (mem_cgroup_id(memcg) != memcg_id) ++ goto unlock; ++ ++ refs = token & (BIT(LRU_REFS_WIDTH) - 1); ++ if (refs && !workingset) ++ goto unlock; ++ ++ token >>= LRU_REFS_WIDTH; ++ lruvec = mem_cgroup_lruvec(memcg, pgdat); ++ lrugen = &lruvec->evictable; ++ min_seq = READ_ONCE(lrugen->min_seq[type]); ++ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) ++ goto unlock; ++ ++ hist = lru_hist_from_seq(min_seq); ++ tier = lru_tier_from_refs(refs + workingset); ++ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); ++ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); ++ ++ /* ++ * Tiers don't offer any protection to pages accessed via page tables. ++ * That's what generations do. Tiers can't fully protect pages after ++ * their numbers of accesses has exceeded the max value. Conservatively ++ * count these two conditions as stalls even though they might not ++ * indicate any real memory pressure. ++ */ ++ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { ++ SetPageWorkingset(page); ++ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); ++ } ++unlock: ++ rcu_read_unlock(); ++} ++ ++#else ++ ++static void *lru_gen_eviction(struct page *page) ++{ ++ return NULL; ++} ++ ++static void lru_gen_refault(struct page *page, void *shadow) ++{ ++} ++ ++#endif /* CONFIG_LRU_GEN */ ++ + /** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged +@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + ++ if (lru_gen_enabled()) ++ return lru_gen_eviction(page); ++ + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); ++ eviction >>= bucket_order; + workingset_age_nonresident(lruvec, thp_nr_pages(page)); + return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } +@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag + bool workingset; + int memcgid; + ++ if (lru_gen_enabled()) { ++ lru_gen_refault(page, shadow); ++ return; ++ } ++ + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); ++ eviction <<= bucket_order; + + rcu_read_lock(); + /* diff --git a/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch b/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch new file mode 100644 index 0000000000..a1a749fc38 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-08-mm-multigenerational-lru-user-interface.patch @@ -0,0 +1,496 @@ +From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:38:02 -0700 +Subject: [PATCH 08/10] mm: multigenerational lru: user interface + +Add /sys/kernel/mm/lru_gen/enabled to enable and disable the +multigenerational lru at runtime. + +Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a +given number of milliseconds. The OOM killer is invoked if this +working set cannot be kept in memory. + +Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and +invoke the aging and the eviction. This file has the following output: + memcg memcg_id memcg_path + node node_id + min_gen birth_time anon_size file_size + ... + max_gen birth_time anon_size file_size + +min_gen is the oldest generation number and max_gen is the youngest +generation number. birth_time is in milliseconds. anon_size and +file_size are in pages. + +This file takes the following input: + + memcg_id node_id max_gen [swappiness] [use_bloom_filter] + - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] + +The first command line invokes the aging, which scans PTEs for +accessed pages and then creates the next generation max_gen+1. A swap +file and a non-zero swappiness, which overrides vm.swappiness, are +required to scan PTEs mapping anon pages. The second command line +invokes the eviction, which evicts generations less than or equal to +min_gen. min_gen should be less than max_gen-1 as max_gen and +max_gen-1 are not fully aged and therefore cannot be evicted. +Setting nr_to_reclaim to N limits the number of pages to evict. +Setting use_bloom_filter to 0 overrides the default behavior which +only scans PTE tables found populated. Multiple command lines are +supported, as is concatenation with delimiters "," and ";". + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3 +--- + include/linux/nodemask.h | 1 + + mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++ + 2 files changed, 416 insertions(+) + +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -485,6 +485,7 @@ static inline int num_node_state(enum no + #define first_online_node 0 + #define first_memory_node 0 + #define next_online_node(nid) (MAX_NUMNODES) ++#define next_memory_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1U + #define nr_online_nodes 1U + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -53,6 +53,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -4882,6 +4884,413 @@ unlock: + } + + /****************************************************************************** ++ * sysfs interface ++ ******************************************************************************/ ++ ++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++} ++ ++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ unsigned int msecs; ++ ++ if (kstrtouint(buf, 10, &msecs)) ++ return -EINVAL; ++ ++ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( ++ min_ttl_ms, 0644, show_min_ttl, store_min_ttl ++); ++ ++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); ++} ++ ++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) ++{ ++ bool enable; ++ ++ if (kstrtobool(buf, &enable)) ++ return -EINVAL; ++ ++ lru_gen_change_state(enable, true, false); ++ ++ return len; ++} ++ ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR( ++ enabled, 0644, show_enable, store_enable ++); ++ ++static struct attribute *lru_gen_attrs[] = { ++ &lru_gen_min_ttl_attr.attr, ++ &lru_gen_enabled_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group lru_gen_attr_group = { ++ .name = "lru_gen", ++ .attrs = lru_gen_attrs, ++}; ++ ++/****************************************************************************** ++ * debugfs interface ++ ******************************************************************************/ ++ ++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct mem_cgroup *memcg; ++ loff_t nr_to_skip = *pos; ++ ++ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); ++ if (!m->private) ++ return ERR_PTR(-ENOMEM); ++ ++ memcg = mem_cgroup_iter(NULL, NULL, NULL); ++ do { ++ int nid; ++ ++ for_each_node_state(nid, N_MEMORY) { ++ if (!nr_to_skip--) ++ return get_lruvec(nid, memcg); ++ } ++ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); ++ ++ return NULL; ++} ++ ++static void lru_gen_seq_stop(struct seq_file *m, void *v) ++{ ++ if (!IS_ERR_OR_NULL(v)) ++ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); ++ ++ kvfree(m->private); ++ m->private = NULL; ++} ++ ++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ int nid = lruvec_pgdat(v)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(v); ++ ++ ++*pos; ++ ++ nid = next_memory_node(nid); ++ if (nid == MAX_NUMNODES) { ++ memcg = mem_cgroup_iter(NULL, memcg, NULL); ++ if (!memcg) ++ return NULL; ++ ++ nid = first_memory_node; ++ } ++ ++ return get_lruvec(nid, memcg); ++} ++ ++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, ++ unsigned long max_seq, unsigned long *min_seq, ++ unsigned long seq) ++{ ++ int i; ++ int type, tier; ++ int hist = lru_hist_from_seq(seq); ++ struct lrugen *lrugen = &lruvec->evictable; ++ ++ for (tier = 0; tier < MAX_NR_TIERS; tier++) { ++ seq_printf(m, " %10d", tier); ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ unsigned long n[3] = {}; ++ ++ if (seq == max_seq) { ++ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); ++ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); ++ ++ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); ++ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { ++ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); ++ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); ++ if (tier) ++ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); ++ ++ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); ++ } else ++ seq_puts(m, " 0 0 0 "); ++ } ++ seq_putc(m, '\n'); ++ } ++ ++ seq_puts(m, " "); ++ for (i = 0; i < NR_MM_STATS; i++) { ++ if (seq == max_seq && NR_HIST_GENS == 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ toupper(MM_STAT_CODES[i])); ++ else if (seq != max_seq && NR_HIST_GENS > 1) ++ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), ++ MM_STAT_CODES[i]); ++ else ++ seq_puts(m, " 0 "); ++ } ++ seq_putc(m, '\n'); ++} ++ ++static int lru_gen_seq_show(struct seq_file *m, void *v) ++{ ++ unsigned long seq; ++ bool full = !debugfs_real_fops(m->file)->write; ++ struct lruvec *lruvec = v; ++ struct lrugen *lrugen = &lruvec->evictable; ++ int nid = lruvec_pgdat(lruvec)->node_id; ++ struct mem_cgroup *memcg = lruvec_memcg(lruvec); ++ DEFINE_MAX_SEQ(lruvec); ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (nid == first_memory_node) { ++ const char *path = memcg ? m->private : ""; ++ ++#ifdef CONFIG_MEMCG ++ if (memcg) ++ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); ++#endif ++ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); ++ } ++ ++ seq_printf(m, " node %5d\n", nid); ++ ++ if (!full) ++ seq = min_seq[0]; ++ else if (max_seq >= MAX_NR_GENS) ++ seq = max_seq - MAX_NR_GENS + 1; ++ else ++ seq = 0; ++ ++ for (; seq <= max_seq; seq++) { ++ int gen, type, zone; ++ unsigned int msecs; ++ ++ gen = lru_gen_from_seq(seq); ++ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); ++ ++ seq_printf(m, " %10lu %10u", seq, msecs); ++ ++ for (type = 0; type < ANON_AND_FILE; type++) { ++ long size = 0; ++ ++ if (seq < min_seq[type]) { ++ seq_puts(m, " -0 "); ++ continue; ++ } ++ ++ for (zone = 0; zone < MAX_NR_ZONES; zone++) ++ size += READ_ONCE(lrugen->sizes[gen][type][zone]); ++ ++ seq_printf(m, " %10lu ", max(size, 0L)); ++ } ++ ++ seq_putc(m, '\n'); ++ ++ if (full) ++ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); ++ } ++ ++ return 0; ++} ++ ++static const struct seq_operations lru_gen_seq_ops = { ++ .start = lru_gen_seq_start, ++ .stop = lru_gen_seq_stop, ++ .next = lru_gen_seq_next, ++ .show = lru_gen_seq_show, ++}; ++ ++static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, bool use_filter) ++{ ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq == max_seq) ++ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); ++ ++ return seq > max_seq ? -EINVAL : 0; ++} ++ ++static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, ++ unsigned long seq, unsigned long nr_to_reclaim) ++{ ++ struct blk_plug plug; ++ int err = -EINTR; ++ DEFINE_MAX_SEQ(lruvec); ++ ++ if (seq >= max_seq - 1) ++ return -EINVAL; ++ ++ sc->nr_reclaimed = 0; ++ ++ blk_start_plug(&plug); ++ ++ while (!signal_pending(current)) { ++ DEFINE_MIN_SEQ(lruvec); ++ ++ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || ++ !evict_pages(lruvec, sc, swappiness)) { ++ err = 0; ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ blk_finish_plug(&plug); ++ ++ return err; ++} ++ ++static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, ++ int swappiness, unsigned long seq, unsigned long opt) ++{ ++ struct lruvec *lruvec; ++ int err = -EINVAL; ++ struct mem_cgroup *memcg = NULL; ++ ++ if (!mem_cgroup_disabled()) { ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_id(memcg_id); ++#ifdef CONFIG_MEMCG ++ if (memcg && !css_tryget(&memcg->css)) ++ memcg = NULL; ++#endif ++ rcu_read_unlock(); ++ ++ if (!memcg) ++ goto done; ++ } ++ if (memcg_id != mem_cgroup_id(memcg)) ++ goto done; ++ ++ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) ++ goto done; ++ ++ lruvec = get_lruvec(nid, memcg); ++ ++ if (swappiness < 0) ++ swappiness = get_swappiness(memcg); ++ else if (swappiness > 200) ++ goto done; ++ ++ switch (cmd) { ++ case '+': ++ err = run_aging(lruvec, sc, swappiness, seq, opt); ++ break; ++ case '-': ++ err = run_eviction(lruvec, sc, swappiness, seq, opt); ++ break; ++ } ++done: ++ mem_cgroup_put(memcg); ++ ++ return err; ++} ++ ++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, ++ size_t len, loff_t *pos) ++{ ++ void *buf; ++ char *cur, *next; ++ unsigned int flags; ++ int err = 0; ++ struct scan_control sc = { ++ .may_writepage = 1, ++ .may_unmap = 1, ++ .may_swap = 1, ++ .reclaim_idx = MAX_NR_ZONES - 1, ++ .gfp_mask = GFP_KERNEL, ++ }; ++ ++ buf = kvmalloc(len + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ if (copy_from_user(buf, src, len)) { ++ kvfree(buf); ++ return -EFAULT; ++ } ++ ++ next = buf; ++ next[len] = '\0'; ++ ++ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); ++ if (!sc.reclaim_state.mm_walk_args) { ++ kvfree(buf); ++ return -ENOMEM; ++ } ++ ++ flags = memalloc_noreclaim_save(); ++ set_task_reclaim_state(current, &sc.reclaim_state); ++ ++ while ((cur = strsep(&next, ",;\n"))) { ++ int n; ++ int end; ++ char cmd; ++ unsigned int memcg_id; ++ unsigned int nid; ++ unsigned long seq; ++ unsigned int swappiness = -1; ++ unsigned long opt = -1; ++ ++ cur = skip_spaces(cur); ++ if (!*cur) ++ continue; ++ ++ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, ++ &seq, &end, &swappiness, &end, &opt, &end); ++ if (n < 4 || cur[end]) { ++ err = -EINVAL; ++ break; ++ } ++ ++ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); ++ if (err) ++ break; ++ } ++ ++ set_task_reclaim_state(current, NULL); ++ memalloc_noreclaim_restore(flags); ++ ++ free_mm_walk_args(sc.reclaim_state.mm_walk_args); ++ kvfree(buf); ++ ++ return err ? : len; ++} ++ ++static int lru_gen_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &lru_gen_seq_ops); ++} ++ ++static const struct file_operations lru_gen_rw_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .write = lru_gen_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static const struct file_operations lru_gen_ro_fops = { ++ .open = lru_gen_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++/****************************************************************************** + * initialization + ******************************************************************************/ + +@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void) + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); + ++ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) ++ pr_err("lru_gen: failed to create sysfs group\n"); ++ ++ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); ++ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); ++ + return 0; + }; + late_initcall(init_lru_gen); diff --git a/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch new file mode 100644 index 0000000000..4462549f99 --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-09-mm-multigenerational-lru-Kconfig.patch @@ -0,0 +1,80 @@ +From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Mon, 25 Jan 2021 21:47:24 -0700 +Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig + +Add configuration options for the multigenerational lru. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635 +--- + mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 59 insertions(+) + +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -899,4 +899,63 @@ config SECRETMEM + + source "mm/damon/Kconfig" + ++# the multigenerational lru { ++config LRU_GEN ++ bool "Multigenerational LRU" ++ depends on MMU ++ # the following options may leave not enough spare bits in page->flags ++ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) ++ help ++ A high performance LRU implementation to heavily overcommit workloads ++ that are not IO bound. See Documentation/vm/multigen_lru.rst for ++ details. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces a small per-process and per-memcg and per-node memory ++ overhead. ++ ++config LRU_GEN_ENABLED ++ bool "Turn on by default" ++ depends on LRU_GEN ++ help ++ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option ++ changes it to 1. ++ ++ Warning: the default value is the fast path. See ++ Documentation/static-keys.txt for details. ++ ++config LRU_GEN_STATS ++ bool "Full stats for debugging" ++ depends on LRU_GEN ++ help ++ This option keeps full stats for each generation, which can be read ++ from /sys/kernel/debug/lru_gen_full. ++ ++ Warning: do not enable this option unless you plan to use it because ++ it introduces an additional small per-process and per-memcg and ++ per-node memory overhead. ++ ++config NR_LRU_GENS ++ int "Max number of generations" ++ depends on LRU_GEN ++ range 4 31 ++ default 7 ++ help ++ This will use order_base_2(N+1) spare bits from page flags. ++ ++ Warning: do not use numbers larger than necessary because each ++ generation introduces a small per-node and per-memcg memory overhead. ++ ++config TIERS_PER_GEN ++ int "Number of tiers per generation" ++ depends on LRU_GEN ++ range 2 5 ++ default 4 ++ help ++ This will use N-2 spare bits from page flags. ++ ++ Larger values generally offer better protection to active pages under ++ heavy buffered I/O workloads. ++# } ++ + endmenu diff --git a/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch new file mode 100644 index 0000000000..f4716fb68d --- /dev/null +++ b/target/linux/generic/backport-5.15/020-v6.1-10-mm-multigenerational-lru-documentation.patch @@ -0,0 +1,161 @@ +From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Tue, 2 Feb 2021 01:27:45 -0700 +Subject: [PATCH 10/10] mm: multigenerational lru: documentation + +Add Documentation/vm/multigen_lru.rst. + +Signed-off-by: Yu Zhao +Tested-by: Konstantin Kharlamov +Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e +--- + Documentation/vm/index.rst | 1 + + Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++ + 2 files changed, 133 insertions(+) + create mode 100644 Documentation/vm/multigen_lru.rst + +--- a/Documentation/vm/index.rst ++++ b/Documentation/vm/index.rst +@@ -17,6 +17,7 @@ various features of the Linux memory man + + swap_numa + zswap ++ multigen_lru + + Kernel developers MM documentation + ================================== +--- /dev/null ++++ b/Documentation/vm/multigen_lru.rst +@@ -0,0 +1,132 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++===================== ++Multigenerational LRU ++===================== ++ ++Quick Start ++=========== ++Build Configurations ++-------------------- ++:Required: Set ``CONFIG_LRU_GEN=y``. ++ ++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by ++ default. ++ ++Runtime Configurations ++---------------------- ++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the ++ feature was not turned on by default. ++ ++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to ++ protect the working set of ``N`` milliseconds. The OOM killer is ++ invoked if this working set cannot be kept in memory. ++ ++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature ++ is turned on. This file has the following output: ++ ++:: ++ ++ memcg memcg_id memcg_path ++ node node_id ++ min_gen birth_time anon_size file_size ++ ... ++ max_gen birth_time anon_size file_size ++ ++``min_gen`` is the oldest generation number and ``max_gen`` is the ++youngest generation number. ``birth_time`` is in milliseconds. ++``anon_size`` and ``file_size`` are in pages. ++ ++Phones/Laptops/Workstations ++--------------------------- ++No additional configurations required. ++ ++Servers/Data Centers ++-------------------- ++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a ++ larger number. ++ ++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger ++ number. ++ ++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. ++ ++:Working set estimation: Write ``+ memcg_id node_id max_gen ++ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to ++ invoke the aging, which scans PTEs for accessed pages and then ++ creates the next generation ``max_gen+1``. A swap file and a non-zero ++ ``swappiness``, which overrides ``vm.swappiness``, are required to ++ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to ++ override the default behavior which only scans PTE tables found ++ populated. ++ ++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] ++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the ++ eviction, which evicts generations less than or equal to ``min_gen``. ++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and ++ ``max_gen-1`` are not fully aged and therefore cannot be evicted. ++ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple ++ command lines are supported, so does concatenation with delimiters ++ ``,`` and ``;``. ++ ++Framework ++========= ++For each ``lruvec``, evictable pages are divided into multiple ++generations. The youngest generation number is stored in ++``lrugen->max_seq`` for both anon and file types as they are aged on ++an equal footing. The oldest generation numbers are stored in ++``lrugen->min_seq[]`` separately for anon and file types as clean ++file pages can be evicted regardless of swap and writeback ++constraints. These three variables are monotonically increasing. ++Generation numbers are truncated into ++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into ++``page->flags``. The sliding window technique is used to prevent ++truncated generation numbers from overlapping. Each truncated ++generation number is an index to an array of per-type and per-zone ++lists ``lrugen->lists``. ++ ++Each generation is divided into multiple tiers. Tiers represent ++different ranges of numbers of accesses from file descriptors only. ++Pages accessed ``N`` times via file descriptors belong to tier ++``order_base_2(N)``. Each generation contains at most ++``CONFIG_TIERS_PER_GEN`` tiers, and they require additional ++``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to ++moving between generations which requires list operations, moving ++between tiers only involves operations on ``page->flags`` and ++therefore has a negligible cost. A feedback loop modeled after the PID ++controller monitors refaulted % across all tiers and decides when to ++protect pages from which tiers. ++ ++The framework comprises two conceptually independent components: the ++aging and the eviction, which can be invoked separately from user ++space for the purpose of working set estimation and proactive reclaim. ++ ++Aging ++----- ++The aging produces young generations. Given an ``lruvec``, the aging ++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` ++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained ++for each ``memcg``). Upon finding one, the aging updates its ++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). ++After each round of traversal, the aging increments ``max_seq``. The ++aging is due when ``min_seq[]`` reaches ``max_seq-1``. ++ ++Eviction ++-------- ++The eviction consumes old generations. Given an ``lruvec``, the ++eviction scans pages on the per-zone lists indexed by anon and file ++``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to ++select a type based on the values of ``min_seq[]``. If they are ++equal, it selects the type that has a lower refaulted %. The eviction ++sorts a page according to its updated generation number if the aging ++has found this page accessed. It also moves a page to the next ++generation if this page is from an upper tier that has a higher ++refaulted % than the base tier. The eviction increments ``min_seq[]`` ++of a selected type when it finds all the per-zone lists indexed by ++``min_seq[]`` of this selected type are empty. ++ ++To-do List ++========== ++KVM Optimization ++---------------- ++Support shadow page table walk. diff --git a/target/linux/generic/backport-5.15/021-v6.1-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch b/target/linux/generic/backport-5.15/021-v6.1-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch new file mode 100644 index 0000000000..269e5eb204 --- /dev/null +++ b/target/linux/generic/backport-5.15/021-v6.1-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch @@ -0,0 +1,32 @@ +From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001 +From: Yu Zhao +Date: Wed, 28 Sep 2022 13:36:58 -0600 +Subject: [PATCH 1/1] mm/mglru: don't sync disk for each aging cycle + +wakeup_flusher_threads() was added under the assumption that if a system +runs out of clean cold pages, it might want to write back dirty pages more +aggressively so that they can become clean and be dropped. + +However, doing so can breach the rate limit a system wants to impose on +writeback, resulting in early SSD wearout. + +Link: https://lkml.kernel.org/r/YzSiWq9UEER5LKup@google.com +Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") +Signed-off-by: Yu Zhao +Reported-by: Axel Rasmussen +Signed-off-by: Andrew Morton +--- + mm/vmscan.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4072,8 +4072,6 @@ static bool try_to_inc_max_seq(struct lr + if (wq_has_sleeper(&lruvec->mm_walk.wait)) + wake_up_all(&lruvec->mm_walk.wait); + +- wakeup_flusher_threads(WB_REASON_VMSCAN); +- + return true; + } + diff --git a/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch deleted file mode 100644 index 48bcaf3e3e..0000000000 --- a/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch +++ /dev/null @@ -1,169 +0,0 @@ -From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Wed, 4 Aug 2021 01:31:34 -0600 -Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young() - -Some architectures automatically set the accessed bit in PTEs, e.g., -x86 and arm64 v8.2. On architectures that do not have this capability, -clearing the accessed bit in a PTE triggers a page fault following the -TLB miss of this PTE. - -Being aware of this capability can help make better decisions, i.e., -whether to limit the size of each batch of PTEs and the burst of -batches when clearing the accessed bit. - -Signed-off-by: Yu Zhao -Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a ---- - arch/arm64/include/asm/cpufeature.h | 5 +++++ - arch/arm64/include/asm/pgtable.h | 13 ++++++++----- - arch/arm64/kernel/cpufeature.c | 10 ++++++++++ - arch/arm64/tools/cpucaps | 1 + - arch/x86/include/asm/pgtable.h | 6 +++--- - include/linux/pgtable.h | 13 +++++++++++++ - mm/memory.c | 14 +------------- - 7 files changed, 41 insertions(+), 21 deletions(-) - ---- a/arch/arm64/include/asm/cpufeature.h -+++ b/arch/arm64/include/asm/cpufeature.h -@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r - cpus_have_const_cap(ARM64_HAS_TLB_RANGE); - } - -+static inline bool system_has_hw_af(void) -+{ -+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF); -+} -+ - extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); - - static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) ---- a/arch/arm64/include/asm/pgtable.h -+++ b/arch/arm64/include/asm/pgtable.h -@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru - * page after fork() + CoW for pfn mappings. We don't always have a - * hardware-managed access flag on arm64. - */ --static inline bool arch_faults_on_old_pte(void) -+static inline bool arch_has_hw_pte_young(bool local) - { -- WARN_ON(preemptible()); -+ if (local) { -+ WARN_ON(preemptible()); -+ return cpu_has_hw_af(); -+ } - -- return !cpu_has_hw_af(); -+ return system_has_hw_af(); - } --#define arch_faults_on_old_pte arch_faults_on_old_pte -+#define arch_has_hw_pte_young arch_has_hw_pte_young - - /* - * Experimentally, it's cheap to set the access flag in hardware and we -@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt - */ - static inline bool arch_wants_old_prefaulted_pte(void) - { -- return !arch_faults_on_old_pte(); -+ return arch_has_hw_pte_young(true); - } - #define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte - ---- a/arch/arm64/kernel/cpufeature.c -+++ b/arch/arm64/kernel/cpufeature.c -@@ -2187,6 +2187,16 @@ static const struct arm64_cpu_capabiliti - .matches = has_hw_dbm, - .cpu_enable = cpu_enable_hw_dbm, - }, -+ { -+ .desc = "Hardware update of the Access flag", -+ .type = ARM64_CPUCAP_SYSTEM_FEATURE, -+ .capability = ARM64_HW_AF, -+ .sys_reg = SYS_ID_AA64MMFR1_EL1, -+ .sign = FTR_UNSIGNED, -+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT, -+ .min_field_value = 1, -+ .matches = has_cpuid_feature, -+ }, - #endif - { - .desc = "CRC32 instructions", ---- a/arch/arm64/tools/cpucaps -+++ b/arch/arm64/tools/cpucaps -@@ -35,6 +35,7 @@ HAS_STAGE2_FWB - HAS_SYSREG_GIC_CPUIF - HAS_TLB_RANGE - HAS_VIRT_HOST_EXTN -+HW_AF - HW_DBM - KVM_PROTECTED_MODE - MISMATCHED_CACHE_TYPE ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c - return boot_cpu_has_bug(X86_BUG_L1TF); - } - --#define arch_faults_on_old_pte arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) -+#define arch_has_hw_pte_young arch_has_hw_pte_young -+static inline bool arch_has_hw_pte_young(bool local) - { -- return false; -+ return true; - } - - #endif /* __ASSEMBLY__ */ ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #endif - -+#ifndef arch_has_hw_pte_young -+/* -+ * Return whether the accessed bit is supported by the local CPU or all CPUs. -+ * -+ * Those arches which have hw access flag feature need to implement their own -+ * helper. By default, "false" means pagefault will be hit on old pte. -+ */ -+static inline bool arch_has_hw_pte_young(bool local) -+{ -+ return false; -+} -+#endif -+ - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR - static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long address, ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly = - 2; - #endif - --#ifndef arch_faults_on_old_pte --static inline bool arch_faults_on_old_pte(void) --{ -- /* -- * Those arches which don't have hw access flag feature need to -- * implement their own helper. By default, "true" means pagefault -- * will be hit on old pte. -- */ -- return true; --} --#endif -- - #ifndef arch_wants_old_prefaulted_pte - static inline bool arch_wants_old_prefaulted_pte(void) - { -@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct - * On architectures with software "accessed" bits, we would - * take a double page fault, so mark it accessed here. - */ -- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { -+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) { - pte_t entry; - - vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); diff --git a/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch deleted file mode 100644 index 785af275f5..0000000000 --- a/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch +++ /dev/null @@ -1,111 +0,0 @@ -From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Sat, 26 Sep 2020 21:17:18 -0600 -Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG - -Some architectures support the accessed bit on non-leaf PMD entries, -e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using -it as part of linear address translation [1]. As an optimization, page -table walkers who are interested in the accessed bit can skip the PTEs -under a non-leaf PMD entry if the accessed bit is cleared on this PMD -entry. - -Although an inline function may be preferable, this capability is -added as a configuration option to look consistent when used with the -existing macros. - -[1]: Intel 64 and IA-32 Architectures Software Developer's Manual - Volume 3 (June 2021), section 4.8 - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8 ---- - arch/Kconfig | 9 +++++++++ - arch/x86/Kconfig | 1 + - arch/x86/include/asm/pgtable.h | 3 ++- - arch/x86/mm/pgtable.c | 5 ++++- - include/linux/pgtable.h | 4 ++-- - 5 files changed, 18 insertions(+), 4 deletions(-) - ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT - config ARCH_HAS_PARANOID_L1D_FLUSH - bool - -+config ARCH_HAS_NONLEAF_PMD_YOUNG -+ bool -+ depends on PGTABLE_LEVELS > 2 -+ help -+ Architectures that select this are able to set the accessed bit on -+ non-leaf PMD entries in addition to leaf PTE entries where pages are -+ mapped. For them, page table walkers that clear the accessed bit may -+ stop at non-leaf PMD entries if they do not see the accessed bit. -+ - source "kernel/gcov/Kconfig" - - source "scripts/gcc-plugins/Kconfig" ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -84,6 +84,7 @@ config X86 - select ARCH_HAS_PMEM_API if X86_64 - select ARCH_HAS_PTE_DEVMAP if X86_64 - select ARCH_HAS_PTE_SPECIAL -+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64 - select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_COPY_MC if X86_64 - select ARCH_HAS_SET_MEMORY ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad - - static inline int pmd_bad(pmd_t pmd) - { -- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != -+ (_KERNPG_TABLE & ~_PAGE_ACCESSED); - } - - static inline unsigned long pages_to_mb(unsigned long npg) ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_ - return ret; - } - --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) - { -@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_ - - return ret; - } -+#endif -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE - int pudp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pud_t *pudp) - { ---- a/include/linux/pgtable.h -+++ b/include/linux/pgtable.h -@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo - #endif - - #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG --#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) - static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp) -@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo - BUILD_BUG(); - return 0; - } --#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ - #endif - - #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH diff --git a/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch deleted file mode 100644 index f466f1105c..0000000000 --- a/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch +++ /dev/null @@ -1,224 +0,0 @@ -From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Sun, 27 Sep 2020 20:49:08 -0600 -Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node() - -This patch refactors shrink_node(). This will make the upcoming -changes to mm/vmscan.c more readable. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04 ---- - mm/vmscan.c | 186 +++++++++++++++++++++++++++------------------------- - 1 file changed, 98 insertions(+), 88 deletions(-) - ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2562,6 +2562,103 @@ enum scan_balance { - SCAN_FILE, - }; - -+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) -+{ -+ unsigned long file; -+ struct lruvec *target_lruvec; -+ -+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); -+ -+ /* -+ * Determine the scan balance between anon and file LRUs. -+ */ -+ spin_lock_irq(&target_lruvec->lru_lock); -+ sc->anon_cost = target_lruvec->anon_cost; -+ sc->file_cost = target_lruvec->file_cost; -+ spin_unlock_irq(&target_lruvec->lru_lock); -+ -+ /* -+ * Target desirable inactive:active list ratios for the anon -+ * and file LRU lists. -+ */ -+ if (!sc->force_deactivate) { -+ unsigned long refaults; -+ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_ANON); -+ if (refaults != target_lruvec->refaults[0] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -+ sc->may_deactivate |= DEACTIVATE_ANON; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_ANON; -+ -+ /* -+ * When refaults are being observed, it means a new -+ * workingset is being established. Deactivate to get -+ * rid of any stale active pages quickly. -+ */ -+ refaults = lruvec_page_state(target_lruvec, -+ WORKINGSET_ACTIVATE_FILE); -+ if (refaults != target_lruvec->refaults[1] || -+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -+ sc->may_deactivate |= DEACTIVATE_FILE; -+ else -+ sc->may_deactivate &= ~DEACTIVATE_FILE; -+ } else -+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -+ -+ /* -+ * If we have plenty of inactive file pages that aren't -+ * thrashing, try to reclaim those first before touching -+ * anonymous pages. -+ */ -+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -+ sc->cache_trim_mode = 1; -+ else -+ sc->cache_trim_mode = 0; -+ -+ /* -+ * Prevent the reclaimer from falling into the cache trap: as -+ * cache pages start out inactive, every cache fault will tip -+ * the scan balance towards the file LRU. And as the file LRU -+ * shrinks, so does the window for rotation from references. -+ * This means we have a runaway feedback loop where a tiny -+ * thrashing file LRU becomes infinitely more attractive than -+ * anon pages. Try to detect this based on file LRU size. -+ */ -+ if (!cgroup_reclaim(sc)) { -+ unsigned long total_high_wmark = 0; -+ unsigned long free, anon; -+ int z; -+ -+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -+ file = node_page_state(pgdat, NR_ACTIVE_FILE) + -+ node_page_state(pgdat, NR_INACTIVE_FILE); -+ -+ for (z = 0; z < MAX_NR_ZONES; z++) { -+ struct zone *zone = &pgdat->node_zones[z]; -+ -+ if (!managed_zone(zone)) -+ continue; -+ -+ total_high_wmark += high_wmark_pages(zone); -+ } -+ -+ /* -+ * Consider anon: if that's low too, this isn't a -+ * runaway file reclaim problem, but rather just -+ * extreme pressure. Reclaim as per usual then. -+ */ -+ anon = node_page_state(pgdat, NR_INACTIVE_ANON); -+ -+ sc->file_is_tiny = -+ file + free <= total_high_wmark && -+ !(sc->may_deactivate & DEACTIVATE_ANON) && -+ anon >> sc->priority; -+ } -+} -+ - /* - * Determine how aggressively the anon and file LRU lists should be - * scanned. The relative value of each set of LRU lists is determined -@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat - unsigned long nr_reclaimed, nr_scanned; - struct lruvec *target_lruvec; - bool reclaimable = false; -- unsigned long file; - - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - -@@ -3048,93 +3144,7 @@ again: - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - -- /* -- * Determine the scan balance between anon and file LRUs. -- */ -- spin_lock_irq(&target_lruvec->lru_lock); -- sc->anon_cost = target_lruvec->anon_cost; -- sc->file_cost = target_lruvec->file_cost; -- spin_unlock_irq(&target_lruvec->lru_lock); -- -- /* -- * Target desirable inactive:active list ratios for the anon -- * and file LRU lists. -- */ -- if (!sc->force_deactivate) { -- unsigned long refaults; -- -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_ANON); -- if (refaults != target_lruvec->refaults[0] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) -- sc->may_deactivate |= DEACTIVATE_ANON; -- else -- sc->may_deactivate &= ~DEACTIVATE_ANON; -- -- /* -- * When refaults are being observed, it means a new -- * workingset is being established. Deactivate to get -- * rid of any stale active pages quickly. -- */ -- refaults = lruvec_page_state(target_lruvec, -- WORKINGSET_ACTIVATE_FILE); -- if (refaults != target_lruvec->refaults[1] || -- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) -- sc->may_deactivate |= DEACTIVATE_FILE; -- else -- sc->may_deactivate &= ~DEACTIVATE_FILE; -- } else -- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; -- -- /* -- * If we have plenty of inactive file pages that aren't -- * thrashing, try to reclaim those first before touching -- * anonymous pages. -- */ -- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); -- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) -- sc->cache_trim_mode = 1; -- else -- sc->cache_trim_mode = 0; -- -- /* -- * Prevent the reclaimer from falling into the cache trap: as -- * cache pages start out inactive, every cache fault will tip -- * the scan balance towards the file LRU. And as the file LRU -- * shrinks, so does the window for rotation from references. -- * This means we have a runaway feedback loop where a tiny -- * thrashing file LRU becomes infinitely more attractive than -- * anon pages. Try to detect this based on file LRU size. -- */ -- if (!cgroup_reclaim(sc)) { -- unsigned long total_high_wmark = 0; -- unsigned long free, anon; -- int z; -- -- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); -- file = node_page_state(pgdat, NR_ACTIVE_FILE) + -- node_page_state(pgdat, NR_INACTIVE_FILE); -- -- for (z = 0; z < MAX_NR_ZONES; z++) { -- struct zone *zone = &pgdat->node_zones[z]; -- if (!managed_zone(zone)) -- continue; -- -- total_high_wmark += high_wmark_pages(zone); -- } -- -- /* -- * Consider anon: if that's low too, this isn't a -- * runaway file reclaim problem, but rather just -- * extreme pressure. Reclaim as per usual then. -- */ -- anon = node_page_state(pgdat, NR_INACTIVE_ANON); -- -- sc->file_is_tiny = -- file + free <= total_high_wmark && -- !(sc->may_deactivate & DEACTIVATE_ANON) && -- anon >> sc->priority; -- } -+ prepare_scan_count(pgdat, sc); - - shrink_node_memcgs(pgdat, sc); - diff --git a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch deleted file mode 100644 index fbc15d8c91..0000000000 --- a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch +++ /dev/null @@ -1,996 +0,0 @@ -From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Mon, 25 Jan 2021 21:12:33 -0700 -Subject: [PATCH 04/10] mm: multigenerational lru: groundwork - -For each lruvec, evictable pages are divided into multiple -generations. The youngest generation number is stored in -lrugen->max_seq for both anon and file types as they are aged on an -equal footing. The oldest generation numbers are stored in -lrugen->min_seq[] separately for anon and file types as clean file -pages can be evicted regardless of swap constraints. These three -variables are monotonically increasing. Generation numbers are -truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into -page->flags. The sliding window technique is used to prevent truncated -generation numbers from overlapping. Each truncated generation number -is an index to -lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]. - -The framework comprises two conceptually independent components: the -aging, which produces young generations, and the eviction, which -consumes old generations. Both can be invoked independently from user -space for the purpose of working set estimation and proactive reclaim. - -The protection of hot pages and the selection of cold pages are based -on page access types and patterns. There are two access types: one via -page tables and the other via file descriptors. The protection of the -former type is by design stronger because: - 1) The uncertainty in determining the access patterns of the former - type is higher due to the coalesced nature of the accessed bit. - 2) The cost of evicting the former type is higher due to the TLB - flushes required and the likelihood of involving I/O. - 3) The penalty of under-protecting the former type is higher because - applications usually do not prepare themselves for major faults like - they do for blocked I/O. For example, client applications commonly - dedicate blocked I/O to separate threads to avoid UI janks that - negatively affect user experience. - -There are also two access patterns: one with temporal locality and the -other without. The latter pattern, e.g., random and sequential, needs -to be explicitly excluded to avoid weakening the protection of the -former pattern. Generally the former type follows the former pattern -unless MADV_SEQUENTIAL is specified and the latter type follows the -latter pattern unless outlying refaults have been observed. - -Upon faulting, a page is added to the youngest generation, which -provides the strongest protection as the eviction will not consider -this page before the aging has scanned it at least twice. The first -scan clears the accessed bit set during the initial fault. And the -second scan makes sure this page has not been used since the first -scan. A page from any other generations is brought back to the -youngest generation whenever the aging finds the accessed bit set on -any of the PTEs mapping this page. - -Unmapped pages are initially added to the oldest generation and then -conditionally protected by tiers. This is done later [PATCH 07/10]. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7 ---- - fs/fuse/dev.c | 3 +- - include/linux/cgroup.h | 15 +- - include/linux/mm.h | 36 ++++ - include/linux/mm_inline.h | 182 ++++++++++++++++++++ - include/linux/mmzone.h | 70 ++++++++ - include/linux/page-flags-layout.h | 19 ++- - include/linux/page-flags.h | 4 +- - include/linux/sched.h | 3 + - kernel/bounds.c | 3 + - kernel/cgroup/cgroup-internal.h | 1 - - mm/huge_memory.c | 3 +- - mm/memcontrol.c | 1 + - mm/memory.c | 7 + - mm/mm_init.c | 6 +- - mm/page_alloc.c | 1 + - mm/swap.c | 9 +- - mm/swapfile.c | 2 + - mm/vmscan.c | 268 ++++++++++++++++++++++++++++++ - 18 files changed, 618 insertions(+), 15 deletions(-) - ---- a/fs/fuse/dev.c -+++ b/fs/fuse/dev.c -@@ -785,7 +785,8 @@ static int fuse_check_page(struct page * - 1 << PG_active | - 1 << PG_workingset | - 1 << PG_reclaim | -- 1 << PG_waiters))) { -+ 1 << PG_waiters | -+ LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); - return 1; - } ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr - css_put(&cgrp->self); - } - -+extern struct mutex cgroup_mutex; -+ -+static inline void cgroup_lock(void) -+{ -+ mutex_lock(&cgroup_mutex); -+} -+ -+static inline void cgroup_unlock(void) -+{ -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * task_css_set_check - obtain a task's css_set with extra access conditions - * @task: the task to obtain css_set for -@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr - * as locks used during the cgroup_subsys::attach() methods. - */ - #ifdef CONFIG_PROVE_RCU --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - #define task_css_set_check(task, __c) \ - rcu_dereference_check((task)->cgroups, \ -@@ -707,6 +718,8 @@ struct cgroup; - static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } - static inline void css_get(struct cgroup_subsys_state *css) {} - static inline void css_put(struct cgroup_subsys_state *css) {} -+static inline void cgroup_lock(void) {} -+static inline void cgroup_unlock(void) {} - static inline int cgroup_attach_task_all(struct task_struct *from, - struct task_struct *t) { return 0; } - static inline int cgroupstats_build(struct cgroupstats *stats, ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v - #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) - #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) - #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) -+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) -+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) - - /* - * Define the bit shifts to access each section. For non-existent -@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s - loff_t const holebegin, loff_t const holelen, int even_cows) { } - #endif - -+#ifdef CONFIG_LRU_GEN -+static inline void task_enter_nonseq_fault(void) -+{ -+ WARN_ON(current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 1; -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+ WARN_ON(!current->in_nonseq_fault); -+ -+ current->in_nonseq_fault = 0; -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return current->in_nonseq_fault; -+} -+#else -+static inline void task_enter_nonseq_fault(void) -+{ -+} -+ -+static inline void task_exit_nonseq_fault(void) -+{ -+} -+ -+static inline bool task_in_nonseq_fault(void) -+{ -+ return false; -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) - { ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag - return lru; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static inline bool lru_gen_enabled(void) -+{ -+#ifdef CONFIG_LRU_GEN_ENABLED -+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key); -+ -+ return static_branch_likely(&lru_gen_static_key); -+#else -+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key); -+ -+ return static_branch_unlikely(&lru_gen_static_key); -+#endif -+} -+ -+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */ -+static inline int lru_gen_from_seq(unsigned long seq) -+{ -+ return seq % MAX_NR_GENS; -+} -+ -+/* The youngest and the second youngest generations are counted as active. */ -+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) -+{ -+ unsigned long max_seq = lruvec->evictable.max_seq; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); -+} -+ -+/* Update the sizes of the multigenerational lru lists. */ -+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec, -+ int old_gen, int new_gen) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ enum lru_list lru = type * LRU_FILE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lockdep_assert_held(&lruvec->lru_lock); -+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS); -+ VM_BUG_ON(old_gen == -1 && new_gen == -1); -+ -+ if (old_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone], -+ lrugen->sizes[old_gen][type][zone] - delta); -+ if (new_gen >= 0) -+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone], -+ lrugen->sizes[new_gen][type][zone] + delta); -+ -+ if (old_gen < 0) { -+ if (lru_gen_is_active(lruvec, new_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ return; -+ } -+ -+ if (new_gen < 0) { -+ if (lru_gen_is_active(lruvec, old_gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, -delta); -+ return; -+ } -+ -+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ -+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); -+} -+ -+/* Add a page to one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ if (PageUnevictable(page) || !lrugen->enabled[type]) -+ return false; -+ /* -+ * If a page shouldn't be considered for eviction, i.e., a page mapped -+ * upon fault during which the accessed bit is set, add it to the -+ * youngest generation. -+ * -+ * If a page can't be evicted immediately, i.e., an anon page not in -+ * swap cache or a dirty page pending writeback, add it to the second -+ * oldest generation. -+ * -+ * If a page could be evicted immediately, e.g., a clean page, add it to -+ * the oldest generation. -+ */ -+ if (PageActive(page)) -+ gen = lru_gen_from_seq(lrugen->max_seq); -+ else if ((!type && !PageSwapCache(page)) || -+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page)))) -+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1); -+ else -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page); -+ -+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active)); -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, -1, gen); -+ /* for rotate_reclaimable_page() */ -+ if (reclaiming) -+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]); -+ else -+ list_add(&page->lru, &lrugen->lists[gen][type][zone]); -+ -+ return true; -+} -+ -+/* Delete a page from one of the multigenerational lru lists. Return true on success. */ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int gen; -+ unsigned long old_flags, new_flags; -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ if (!(new_flags & LRU_GEN_MASK)) -+ return false; -+ -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ -+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ /* for shrink_page_list() */ -+ if (reclaiming) -+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); -+ else if (lru_gen_is_active(lruvec, gen)) -+ new_flags |= BIT(PG_active); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, gen, -1); -+ list_del(&page->lru); -+ -+ return true; -+} -+ -+#else -+ -+static inline bool lru_gen_enabled(void) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec) - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, false)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); - } -@@ -93,6 +269,9 @@ static __always_inline void add_page_to_ - { - enum lru_list lru = page_lru(page); - -+ if (lru_gen_add_page(page, lruvec, true)) -+ return; -+ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); - } -@@ -100,6 +279,9 @@ static __always_inline void add_page_to_ - static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec) - { -+ if (lru_gen_del_page(page, lruvec, false)) -+ return; -+ - list_del(&page->lru); - update_lru_size(lruvec, page_lru(page), page_zonenum(page), - -thp_nr_pages(page)); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -294,6 +294,72 @@ enum lruvec_flags { - */ - }; - -+struct lruvec; -+ -+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) -+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -+ -+#ifdef CONFIG_LRU_GEN -+ -+/* -+ * For each lruvec, evictable pages are divided into multiple generations. The -+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are -+ * monotonically increasing. The sliding window technique is used to track at -+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the -+ * window, AKA gen, indexes an array of per-type and per-zone lists for the -+ * corresponding generation. The counter in page->flags stores gen+1 while a -+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0. -+ * -+ * After a page is faulted in, the aging must check the accessed bit at least -+ * twice before the eviction would consider it. The first check clears the -+ * accessed bit set during the initial fault. The second check makes sure this -+ * page hasn't been used since then. -+ */ -+#define MIN_NR_GENS 2 -+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+ -+struct lrugen { -+ /* the aging increments the max generation number */ -+ unsigned long max_seq; -+ /* the eviction increments the min generation numbers */ -+ unsigned long min_seq[ANON_AND_FILE]; -+ /* the birth time of each generation in jiffies */ -+ unsigned long timestamps[MAX_NR_GENS]; -+ /* the multigenerational lru lists */ -+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the sizes of the multigenerational lru lists in pages */ -+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* whether the multigenerational lru is enabled */ -+ bool enabled[ANON_AND_FILE]; -+}; -+ -+#define MAX_BATCH_SIZE 8192 -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); -+void lru_gen_change_state(bool enable, bool main, bool swap); -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg); -+#endif -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+} -+ -+static inline void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+} -+#endif -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct lruvec { - struct list_head lists[NR_LRU_LISTS]; - /* per lruvec lru_lock for memcg */ -@@ -311,6 +377,10 @@ struct lruvec { - unsigned long refaults[ANON_AND_FILE]; - /* Various lruvec state flags (enum lruvec_flags) */ - unsigned long flags; -+#ifdef CONFIG_LRU_GEN -+ /* unevictable pages are on LRU_UNEVICTABLE */ -+ struct lrugen evictable; -+#endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; - #endif ---- a/include/linux/page-flags-layout.h -+++ b/include/linux/page-flags-layout.h -@@ -26,6 +26,14 @@ - - #define ZONES_WIDTH ZONES_SHIFT - -+#ifdef CONFIG_LRU_GEN -+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */ -+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2) -+#else -+#define LRU_GEN_WIDTH 0 -+#define LRU_REFS_WIDTH 0 -+#endif /* CONFIG_LRU_GEN */ -+ - #ifdef CONFIG_SPARSEMEM - #include - #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) -@@ -55,7 +63,8 @@ - #define SECTIONS_WIDTH 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \ -+ <= BITS_PER_LONG - NR_PAGEFLAGS - #define NODES_WIDTH NODES_SHIFT - #elif defined(CONFIG_SPARSEMEM_VMEMMAP) - #error "Vmemmap: No space for nodes field in page flags" -@@ -89,8 +98,8 @@ - #define LAST_CPUPID_SHIFT 0 - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \ -- <= BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS - #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT - #else - #define LAST_CPUPID_WIDTH 0 -@@ -100,8 +109,8 @@ - #define LAST_CPUPID_NOT_IN_PAGE_FLAGS - #endif - --#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \ -- > BITS_PER_LONG - NR_PAGEFLAGS -+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \ -+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS - #error "Not enough bits in page flags" - #endif - ---- a/include/linux/page-flags.h -+++ b/include/linux/page-flags.h -@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall - 1UL << PG_private | 1UL << PG_private_2 | \ - 1UL << PG_writeback | 1UL << PG_reserved | \ - 1UL << PG_slab | 1UL << PG_active | \ -- 1UL << PG_unevictable | __PG_MLOCKED) -+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) - - /* - * Flags checked when a page is prepped for return by the page allocator. -@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall - * alloc-free cycle to prevent from reusing the page. - */ - #define PAGE_FLAGS_CHECK_AT_PREP \ -- (PAGEFLAGS_MASK & ~__PG_HWPOISON) -+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) - - #define PAGE_FLAGS_PRIVATE \ - (1UL << PG_private | 1UL << PG_private_2) ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -911,6 +911,9 @@ struct task_struct { - #ifdef CONFIG_MEMCG - unsigned in_user_fault:1; - #endif -+#ifdef CONFIG_LRU_GEN -+ unsigned in_nonseq_fault:1; -+#endif - #ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; - #endif ---- a/kernel/bounds.c -+++ b/kernel/bounds.c -@@ -22,6 +22,9 @@ int main(void) - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); - #endif - DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); -+#ifdef CONFIG_LRU_GEN -+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1)); -+#endif - /* End of constants */ - - return 0; ---- a/kernel/cgroup/cgroup-internal.h -+++ b/kernel/cgroup/cgroup-internal.h -@@ -165,7 +165,6 @@ struct cgroup_mgctx { - #define DEFINE_CGROUP_MGCTX(name) \ - struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) - --extern struct mutex cgroup_mutex; - extern spinlock_t css_set_lock; - extern struct cgroup_subsys *cgroup_subsys[]; - extern struct list_head cgroup_roots; ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc - #ifdef CONFIG_64BIT - (1L << PG_arch_2) | - #endif -- (1L << PG_dirty))); -+ (1L << PG_dirty) | -+ LRU_GEN_MASK | LRU_REFS_MASK)); - - /* ->mapping in first tail page is compound_mapcount */ - VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all - memcg->deferred_split_queue.split_queue_len = 0; - #endif - idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); -+ lru_gen_init_memcg(memcg); - return memcg; - fail: - mem_cgroup_id_remove(memcg); ---- a/mm/memory.c -+++ b/mm/memory.c -@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are - unsigned int flags, struct pt_regs *regs) - { - vm_fault_t ret; -+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ); - - __set_current_state(TASK_RUNNING); - -@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are - if (flags & FAULT_FLAG_USER) - mem_cgroup_enter_user_fault(); - -+ if (nonseq_fault) -+ task_enter_nonseq_fault(); -+ - if (unlikely(is_vm_hugetlb_page(vma))) - ret = hugetlb_fault(vma->vm_mm, vma, address, flags); - else - ret = __handle_mm_fault(vma, address, flags); - -+ if (nonseq_fault) -+ task_exit_nonseq_fault(); -+ - if (flags & FAULT_FLAG_USER) { - mem_cgroup_exit_user_fault(); - /* ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo - - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH -- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; -+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", -- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", -+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", - SECTIONS_WIDTH, - NODES_WIDTH, - ZONES_WIDTH, - LAST_CPUPID_WIDTH, - KASAN_TAG_WIDTH, -+ LRU_GEN_WIDTH, -+ LRU_REFS_WIDTH, - NR_PAGEFLAGS); - mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -7456,6 +7456,7 @@ static void __meminit pgdat_init_interna - - pgdat_page_ext_init(pgdat); - lruvec_init(&pgdat->__lruvec); -+ lru_gen_init_state(NULL, &pgdat->__lruvec); - } - - static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page) - VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - -+ /* see the comment in lru_gen_add_page() */ -+ if (lru_gen_enabled() && !PageUnevictable(page) && -+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC)) -+ SetPageActive(page); -+ - get_page(page); - local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); -@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc - - static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) - { -- if (PageActive(page) && !PageUnevictable(page)) { -+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - int nr_pages = thp_nr_pages(page); - - del_page_from_lru_list(page, lruvec); -@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p - */ - void deactivate_page(struct page *page) - { -- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { -+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) { - struct pagevec *pvec; - - local_lock(&lru_pvecs.lock); ---- a/mm/swapfile.c -+++ b/mm/swapfile.c -@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us - err = 0; - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(false, false, true); - - out_dput: - filp_close(victim, NULL); -@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use - mutex_unlock(&swapon_mutex); - atomic_inc(&proc_poll_event); - wake_up_interruptible(&proc_poll_wait); -+ lru_gen_change_state(true, false, true); - - error = 0; - goto out; ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -50,6 +50,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg - return can_demote(pgdat->node_id, sc); - } - -+#ifdef CONFIG_LRU_GEN -+ -+/****************************************************************************** -+ * shorthand helpers -+ ******************************************************************************/ -+ -+#define for_each_gen_type_zone(gen, type, zone) \ -+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ -+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) -+ -+static int page_lru_gen(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) -+{ -+ struct pglist_data *pgdat = NODE_DATA(nid); -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) { -+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; -+ -+ if (lruvec->pgdat != pgdat) -+ lruvec->pgdat = pgdat; -+ -+ return lruvec; -+ } -+#endif -+ return pgdat ? &pgdat->__lruvec : NULL; -+} -+ -+static int get_nr_gens(struct lruvec *lruvec, int type) -+{ -+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1; -+} -+ -+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) -+{ -+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS && -+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) && -+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS; -+} -+ -+/****************************************************************************** -+ * state change -+ ******************************************************************************/ -+ -+#ifdef CONFIG_LRU_GEN_ENABLED -+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key); -+#else -+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key); -+#endif -+ -+static int lru_gen_nr_swapfiles; -+ -+static bool __maybe_unused state_is_valid(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ enum lru_list lru; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for_each_evictable_lru(lru) { -+ type = is_file_lru(lru); -+ -+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru])) -+ return false; -+ } -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone])) -+ return false; -+ -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]); -+ } -+ -+ return true; -+} -+ -+static bool fill_lists(struct lruvec *lruvec) -+{ -+ enum lru_list lru; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_evictable_lru(lru) { -+ int type = is_file_lru(lru); -+ bool active = is_active_lru(lru); -+ struct list_head *head = &lruvec->lists[lru]; -+ -+ if (!lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page) != active, page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ del_page_from_lru_list(page, lruvec); -+ success = lru_gen_add_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+static bool drain_lists(struct lruvec *lruvec) -+{ -+ int gen, type, zone; -+ int remaining = MAX_BATCH_SIZE; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone]; -+ -+ if (lruvec->evictable.enabled[type]) -+ continue; -+ -+ while (!list_empty(head)) { -+ bool success; -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ success = lru_gen_del_page(page, lruvec, false); -+ VM_BUG_ON(!success); -+ add_page_to_lru_list(page, lruvec); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ return true; -+} -+ -+/* -+ * For file page tracking, we enable/disable it according to the main switch. -+ * For anon page tracking, we only enabled it when the main switch is on and -+ * there is at least one swapfile; we disable it when there are no swapfiles -+ * regardless of the value of the main switch. Otherwise, we will eventually -+ * reach the max size of the sliding window and have to call inc_min_seq(). -+ */ -+void lru_gen_change_state(bool enable, bool main, bool swap) -+{ -+ static DEFINE_MUTEX(state_mutex); -+ -+ struct mem_cgroup *memcg; -+ -+ mem_hotplug_begin(); -+ cgroup_lock(); -+ mutex_lock(&state_mutex); -+ -+ if (swap) { -+ if (enable) -+ swap = !lru_gen_nr_swapfiles++; -+ else -+ swap = !--lru_gen_nr_swapfiles; -+ } -+ -+ if (main && enable != lru_gen_enabled()) { -+ if (enable) -+ static_branch_enable(&lru_gen_static_key); -+ else -+ static_branch_disable(&lru_gen_static_key); -+ } else if (!swap || !lru_gen_enabled()) -+ goto unlock; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ VM_BUG_ON(!state_is_valid(lruvec)); -+ -+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lruvec->evictable.enabled[1] = lru_gen_enabled(); -+ -+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+unlock: -+ mutex_unlock(&state_mutex); -+ cgroup_unlock(); -+ mem_hotplug_done(); -+} -+ -+/****************************************************************************** -+ * initialization -+ ******************************************************************************/ -+ -+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) -+{ -+ int i; -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ lrugen->max_seq = MIN_NR_GENS + 1; -+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -+ lrugen->enabled[1] = lru_gen_enabled(); -+ -+ for (i = 0; i <= MIN_NR_GENS + 1; i++) -+ lrugen->timestamps[i] = jiffies; -+ -+ for_each_gen_type_zone(gen, type, zone) -+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_init_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ lru_gen_init_state(memcg, lruvec); -+ } -+} -+#endif -+ -+static int __init init_lru_gen(void) -+{ -+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); -+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ -+ return 0; -+}; -+late_initcall(init_lru_gen); -+ -+#endif /* CONFIG_LRU_GEN */ -+ - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) - { - unsigned long nr[NR_LRU_LISTS]; diff --git a/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch deleted file mode 100644 index 2592f18e06..0000000000 --- a/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch +++ /dev/null @@ -1,760 +0,0 @@ -From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Mon, 5 Apr 2021 04:17:41 -0600 -Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list - -To scan PTEs for accessed pages, a mm_struct list is maintained for -each memcg. When multiple threads traverse the same memcg->mm_list, -each of them gets a unique mm_struct and therefore they can run -walk_page_range() concurrently to reach page tables of all processes -of this memcg. - -This infrastructure also provides the following optimizations: - 1) it allows walkers to skip processes that have been sleeping since - the last walk by tracking the usage of mm_struct between context - switches. - 2) it allows walkers to add interesting items they find during a - walk to a Bloom filter so that they can skip uninteresting items - during the next walk by testing whether an item is in this Bloom - filter. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8 ---- - fs/exec.c | 2 + - include/linux/memcontrol.h | 4 + - include/linux/mm_inline.h | 6 + - include/linux/mm_types.h | 75 +++++++++ - include/linux/mmzone.h | 63 +++++++ - kernel/exit.c | 1 + - kernel/fork.c | 9 + - kernel/sched/core.c | 1 + - mm/memcontrol.c | 25 +++ - mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++ - 10 files changed, 517 insertions(+) - ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m - active_mm = tsk->active_mm; - tsk->active_mm = mm; - tsk->mm = mm; -+ lru_gen_add_mm(mm); - /* - * This prevents preemption while active_mm is being loaded and - * it and mm are being updated, which could cause problems for -@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m - if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - activate_mm(active_mm, mm); -+ lru_gen_activate_mm(mm); - if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) - local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -348,6 +348,10 @@ struct mem_cgroup { - struct deferred_split deferred_split_queue; - #endif - -+#ifdef CONFIG_LRU_GEN -+ struct lru_gen_mm_list mm_list; -+#endif -+ - struct mem_cgroup_per_node *nodeinfo[]; - }; - ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig - return seq % MAX_NR_GENS; - } - -+/* Return a proper index regardless whether we keep stats for historical generations. */ -+static inline int lru_hist_from_seq(unsigned long seq) -+{ -+ return seq % NR_HIST_GENS; -+} -+ - /* The youngest and the second youngest generations are counted as active. */ - static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) - { ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -3,6 +3,7 @@ - #define _LINUX_MM_TYPES_H - - #include -+#include - - #include - #include -@@ -15,6 +16,8 @@ - #include - #include - #include -+#include -+#include - - #include - -@@ -580,6 +583,18 @@ struct mm_struct { - #ifdef CONFIG_IOMMU_SUPPORT - u32 pasid; - #endif -+#ifdef CONFIG_LRU_GEN -+ struct { -+ /* the node of a global or per-memcg mm_struct list */ -+ struct list_head list; -+#ifdef CONFIG_MEMCG -+ /* points to the memcg of the owner task above */ -+ struct mem_cgroup *memcg; -+#endif -+ /* whether this mm_struct has been used since the last walk */ -+ nodemask_t nodes; -+ } lrugen; -+#endif /* CONFIG_LRU_GEN */ - } __randomize_layout; - - /* -@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru - return (struct cpumask *)&mm->cpu_bitmap; - } - -+#ifdef CONFIG_LRU_GEN -+ -+struct lru_gen_mm_list { -+ /* a global or per-memcg mm_struct list */ -+ struct list_head fifo; -+ /* protects the list above */ -+ spinlock_t lock; -+}; -+ -+void lru_gen_add_mm(struct mm_struct *mm); -+void lru_gen_del_mm(struct mm_struct *mm); -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm); -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+ INIT_LIST_HEAD(&mm->lrugen.list); -+#ifdef CONFIG_MEMCG -+ mm->lrugen.memcg = NULL; -+#endif -+ nodes_clear(mm->lrugen.nodes); -+} -+ -+/* Track the usage of each mm_struct so that we can skip inactive ones. */ -+static inline void lru_gen_activate_mm(struct mm_struct *mm) -+{ -+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ -+ VM_WARN_ON(list_empty(&mm->lrugen.list)); -+ -+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes)) -+ nodes_setall(mm->lrugen.nodes); -+} -+ -+#else /* !CONFIG_LRU_GEN */ -+ -+static inline void lru_gen_add_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_del_mm(struct mm_struct *mm) -+{ -+} -+ -+#ifdef CONFIG_MEMCG -+static inline void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+} -+#endif -+ -+static inline void lru_gen_init_mm(struct mm_struct *mm) -+{ -+} -+ -+static inline void lru_gen_activate_mm(struct mm_struct *mm) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - struct mmu_gather; - extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); - extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -318,6 +318,13 @@ struct lruvec; - #define MIN_NR_GENS 2 - #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) - -+/* Whether to keep stats for historical generations. */ -+#ifdef CONFIG_LRU_GEN_STATS -+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -+#else -+#define NR_HIST_GENS 1U -+#endif -+ - struct lrugen { - /* the aging increments the max generation number */ - unsigned long max_seq; -@@ -333,13 +340,63 @@ struct lrugen { - bool enabled[ANON_AND_FILE]; - }; - -+enum { -+ MM_LEAF_TOTAL, /* total leaf entries */ -+ MM_LEAF_OLD, /* old leaf entries */ -+ MM_LEAF_YOUNG, /* young leaf entries */ -+ MM_NONLEAF_TOTAL, /* total non-leaf entries */ -+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */ -+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */ -+ NR_MM_STATS -+}; -+ -+/* mnemonic codes for the stats above */ -+#define MM_STAT_CODES "toydpc" -+ -+/* double buffering bloom filters */ -+#define NR_BLOOM_FILTERS 2 -+ -+struct lru_gen_mm_walk { -+ /* set to max_seq after each round of walk */ -+ unsigned long seq; -+ /* the next mm_struct on the list to walk */ -+ struct list_head *head; -+ /* the first mm_struct never walked before */ -+ struct list_head *tail; -+ /* to wait for the last walker to finish */ -+ struct wait_queue_head wait; -+ /* bloom filters flip after each round of walk */ -+ unsigned long *filters[NR_BLOOM_FILTERS]; -+ /* page table stats for debugging */ -+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; -+ /* the number of concurrent walkers */ -+ int nr_walkers; -+}; -+ -+#define MIN_BATCH_SIZE 64 - #define MAX_BATCH_SIZE 8192 - -+struct mm_walk_args { -+ struct mem_cgroup *memcg; -+ unsigned long max_seq; -+ unsigned long start_pfn; -+ unsigned long end_pfn; -+ unsigned long next_addr; -+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)]; -+ int node_id; -+ int swappiness; -+ int batch_size; -+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ int mm_stats[NR_MM_STATS]; -+ bool use_filter; -+}; -+ - void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); - void lru_gen_change_state(bool enable, bool main, bool swap); - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg); -+void lru_gen_free_memcg(struct mem_cgroup *memcg); - #endif - - #else /* !CONFIG_LRU_GEN */ -@@ -356,6 +413,10 @@ static inline void lru_gen_change_state( - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { - } -+ -+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+} - #endif - - #endif /* CONFIG_LRU_GEN */ -@@ -380,6 +441,8 @@ struct lruvec { - #ifdef CONFIG_LRU_GEN - /* unevictable pages are on LRU_UNEVICTABLE */ - struct lrugen evictable; -+ /* state for mm list and page table walks */ -+ struct lru_gen_mm_walk mm_walk; - #endif - #ifdef CONFIG_MEMCG - struct pglist_data *pgdat; ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -422,6 +422,7 @@ assign_new_owner: - goto retry; - } - WRITE_ONCE(mm->owner, c); -+ lru_gen_migrate_mm(mm); - task_unlock(c); - put_task_struct(c); - } ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct - goto fail_nocontext; - - mm->user_ns = get_user_ns(user_ns); -+ lru_gen_init_mm(mm); - return mm; - - fail_nocontext: -@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str - } - if (mm->binfmt) - module_put(mm->binfmt->module); -+ lru_gen_del_mm(mm); - mmdrop(mm); - } - -@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a - get_task_struct(p); - } - -+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { -+ /* lock the task to synchronize with memcg migration */ -+ task_lock(p); -+ lru_gen_add_mm(p->mm); -+ task_unlock(p); -+ } -+ - wake_up_new_task(p); - - /* forking complete and child started to run, tell ptracer */ ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas - * finish_task_switch()'s mmdrop(). - */ - switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ lru_gen_activate_mm(next->mm); - - if (!prev->mm) { // from kernel - /* will mmdrop() in finish_task_switch(). */ ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem - - static void mem_cgroup_free(struct mem_cgroup *memcg) - { -+ lru_gen_free_memcg(memcg); - memcg_wb_domain_exit(memcg); - __mem_cgroup_free(memcg); - } -@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void) - } - #endif - -+#ifdef CONFIG_LRU_GEN -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *task = NULL; -+ -+ cgroup_taskset_for_each_leader(task, css, tset) -+ break; -+ -+ if (!task) -+ return; -+ -+ task_lock(task); -+ if (task->mm && task->mm->owner == task) -+ lru_gen_migrate_mm(task->mm); -+ task_unlock(task); -+} -+#else -+static void mem_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) - { - if (value == PAGE_COUNTER_MAX) -@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys - .css_reset = mem_cgroup_css_reset, - .css_rstat_flush = mem_cgroup_css_rstat_flush, - .can_attach = mem_cgroup_can_attach, -+ .attach = mem_cgroup_attach, - .cancel_attach = mem_cgroup_cancel_attach, - .post_attach = mem_cgroup_move_task, - .dfl_cftypes = memory_files, ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid( - } - - /****************************************************************************** -+ * mm_struct list -+ ******************************************************************************/ -+ -+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) -+{ -+ static struct lru_gen_mm_list mm_list = { -+ .fifo = LIST_HEAD_INIT(mm_list.fifo), -+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), -+ }; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ return &memcg->mm_list; -+#endif -+ return &mm_list; -+} -+ -+void lru_gen_add_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); -+ -+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm); -+#ifdef CONFIG_MEMCG -+ VM_BUG_ON_MM(mm->lrugen.memcg, mm); -+ mm->lrugen.memcg = memcg; -+#endif -+ spin_lock(&mm_list->lock); -+ -+ list_add_tail(&mm->lrugen.list, &mm_list->fifo); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm_list->fifo) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev; -+ } -+ -+ spin_unlock(&mm_list->lock); -+} -+ -+void lru_gen_del_mm(struct mm_struct *mm) -+{ -+ int nid; -+ struct lru_gen_mm_list *mm_list; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (list_empty(&mm->lrugen.list)) -+ return; -+ -+#ifdef CONFIG_MEMCG -+ memcg = mm->lrugen.memcg; -+#endif -+ mm_list = get_mm_list(memcg); -+ -+ spin_lock(&mm_list->lock); -+ -+ for_each_node(nid) { -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ if (!lruvec) -+ continue; -+ -+ if (lruvec->mm_walk.tail == &mm->lrugen.list) -+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next; -+ -+ if (lruvec->mm_walk.head != &mm->lrugen.list) -+ continue; -+ -+ lruvec->mm_walk.head = lruvec->mm_walk.head->next; -+ if (lruvec->mm_walk.head == &mm_list->fifo) -+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1); -+ } -+ -+ list_del_init(&mm->lrugen.list); -+ -+ spin_unlock(&mm_list->lock); -+ -+#ifdef CONFIG_MEMCG -+ mem_cgroup_put(mm->lrugen.memcg); -+ mm->lrugen.memcg = NULL; -+#endif -+} -+ -+#ifdef CONFIG_MEMCG -+void lru_gen_migrate_mm(struct mm_struct *mm) -+{ -+ struct mem_cgroup *memcg; -+ -+ lockdep_assert_held(&mm->owner->alloc_lock); -+ -+ if (mem_cgroup_disabled()) -+ return; -+ -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_task(mm->owner); -+ rcu_read_unlock(); -+ if (memcg == mm->lrugen.memcg) -+ return; -+ -+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm); -+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm); -+ -+ lru_gen_del_mm(mm); -+ lru_gen_add_mm(mm); -+} -+#endif -+ -+#define BLOOM_FILTER_SHIFT 15 -+ -+static inline int filter_gen_from_seq(unsigned long seq) -+{ -+ return seq % NR_BLOOM_FILTERS; -+} -+ -+static void get_item_key(void *item, int *key) -+{ -+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); -+ -+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); -+ -+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); -+ key[1] = hash >> BLOOM_FILTER_SHIFT; -+} -+ -+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) -+{ -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ filter = lruvec->mm_walk.filters[gen]; -+ if (filter) { -+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); -+ return; -+ } -+ -+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); -+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter); -+} -+ -+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return; -+ -+ get_item_key(item, key); -+ -+ if (!test_bit(key[0], filter)) -+ set_bit(key[0], filter); -+ if (!test_bit(key[1], filter)) -+ set_bit(key[1], filter); -+} -+ -+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -+{ -+ int key[2]; -+ unsigned long *filter; -+ int gen = filter_gen_from_seq(seq); -+ -+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]); -+ if (!filter) -+ return false; -+ -+ get_item_key(item, key); -+ -+ return test_bit(key[0], filter) && test_bit(key[1], filter); -+} -+ -+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args) -+{ -+ int i; -+ int hist = lru_hist_from_seq(args->max_seq); -+ -+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); -+ -+ for (i = 0; i < NR_MM_STATS; i++) { -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], -+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]); -+ args->mm_stats[i] = 0; -+ } -+ -+ if (!last || NR_HIST_GENS == 1) -+ return; -+ -+ hist = lru_hist_from_seq(args->max_seq + 1); -+ for (i = 0; i < NR_MM_STATS; i++) -+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0); -+} -+ -+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ int type; -+ unsigned long size = 0; -+ -+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes)) -+ return true; -+ -+ if (mm_is_oom_victim(mm)) -+ return true; -+ -+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) { -+ size += type ? get_mm_counter(mm, MM_FILEPAGES) : -+ get_mm_counter(mm, MM_ANONPAGES) + -+ get_mm_counter(mm, MM_SHMEMPAGES); -+ } -+ -+ if (size < MIN_BATCH_SIZE) -+ return true; -+ -+ if (!mmget_not_zero(mm)) -+ return true; -+ -+ node_clear(args->node_id, mm->lrugen.nodes); -+ -+ return false; -+} -+ -+/* To support multiple walkers that concurrently walk an mm_struct list. */ -+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args, -+ struct mm_struct **iter) -+{ -+ bool first = false; -+ bool last = true; -+ struct mm_struct *mm = NULL; -+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk; -+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg); -+ -+ if (*iter) -+ mmput_async(*iter); -+ else if (args->max_seq <= READ_ONCE(mm_walk->seq)) -+ return false; -+ -+ spin_lock(&mm_list->lock); -+ -+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1); -+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq); -+ VM_BUG_ON(*iter && !mm_walk->nr_walkers); -+ -+ if (args->max_seq <= mm_walk->seq) { -+ if (!*iter) -+ last = false; -+ goto done; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) { -+ VM_BUG_ON(mm_walk->nr_walkers); -+ mm_walk->head = mm_walk->head->next; -+ first = true; -+ } -+ -+ while (!mm && mm_walk->head != &mm_list->fifo) { -+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list); -+ -+ mm_walk->head = mm_walk->head->next; -+ -+ if (mm_walk->tail == &mm->lrugen.list) { -+ mm_walk->tail = mm_walk->tail->next; -+ args->use_filter = false; -+ } -+ -+ if (should_skip_mm(mm, args)) -+ mm = NULL; -+ } -+ -+ if (mm_walk->head == &mm_list->fifo) -+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1); -+done: -+ if (*iter && !mm) -+ mm_walk->nr_walkers--; -+ if (!*iter && mm) -+ mm_walk->nr_walkers++; -+ -+ if (mm_walk->nr_walkers) -+ last = false; -+ -+ if (mm && first) -+ clear_bloom_filter(lruvec, args->max_seq + 1); -+ -+ if (*iter || last) -+ reset_mm_stats(lruvec, last, args); -+ -+ spin_unlock(&mm_list->lock); -+ -+ *iter = mm; -+ -+ return last; -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou - int i; - int gen, type, zone; - struct lrugen *lrugen = &lruvec->evictable; -+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - - lrugen->max_seq = MIN_NR_GENS + 1; - lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles; -@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou - - for_each_gen_type_zone(gen, type, zone) - INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_lock(&mm_list->lock); -+ -+ lruvec->mm_walk.seq = MIN_NR_GENS; -+ lruvec->mm_walk.head = &mm_list->fifo; -+ lruvec->mm_walk.tail = &mm_list->fifo; -+ init_waitqueue_head(&lruvec->mm_walk.wait); -+ -+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg) -+ spin_unlock(&mm_list->lock); - } - - #ifdef CONFIG_MEMCG -@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou - { - int nid; - -+ INIT_LIST_HEAD(&memcg->mm_list.fifo); -+ spin_lock_init(&memcg->mm_list.lock); -+ - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(nid, memcg); - - lru_gen_init_state(memcg, lruvec); - } - } -+ -+void lru_gen_free_memcg(struct mem_cgroup *memcg) -+{ -+ int nid; -+ -+ for_each_node(nid) { -+ int i; -+ struct lruvec *lruvec = get_lruvec(nid, memcg); -+ -+ for (i = 0; i < NR_BLOOM_FILTERS; i++) { -+ bitmap_free(lruvec->mm_walk.filters[i]); -+ lruvec->mm_walk.filters[i] = NULL; -+ } -+ } -+} - #endif - - static int __init init_lru_gen(void) - { - BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); -+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); - - return 0; - }; diff --git a/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch deleted file mode 100644 index 6fc93d9422..0000000000 --- a/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch +++ /dev/null @@ -1,1176 +0,0 @@ -From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Mon, 5 Apr 2021 04:35:07 -0600 -Subject: [PATCH 06/10] mm: multigenerational lru: aging - -The aging produces young generations. Given an lruvec, the aging -traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan -PTEs for accessed pages. Upon finding one, the aging updates its -generation number to max_seq (modulo MAX_NR_GENS). After each round of -traversal, the aging increments max_seq. The aging is due when -min_seq[] reaches max_seq-1. - -The aging uses the following optimizations when walking page tables: - 1) It skips non-leaf PMD entries that have the accessed bit cleared - when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y. - 2) It does not zigzag between a PGD table and the same PMD or PTE - table spanning multiple VMAs. In other words, it finishes all the - VMAs within the range of the same PMD or PTE table before it returns - to this PGD table. This optimizes workloads that have large numbers - of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45 ---- - include/linux/memcontrol.h | 3 + - include/linux/mmzone.h | 9 + - include/linux/oom.h | 16 + - include/linux/swap.h | 3 + - mm/memcontrol.c | 5 + - mm/oom_kill.c | 4 +- - mm/rmap.c | 8 + - mm/vmscan.c | 948 +++++++++++++++++++++++++++++++++++++ - 8 files changed, 994 insertions(+), 2 deletions(-) - ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_ - - static inline void lock_page_memcg(struct page *page) - { -+ /* to match page_memcg_rcu() */ -+ rcu_read_lock(); - } - - static inline void unlock_page_memcg(struct page *page) - { -+ rcu_read_unlock(); - } - - static inline void mem_cgroup_handle_over_high(void) ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -295,6 +295,7 @@ enum lruvec_flags { - }; - - struct lruvec; -+struct page_vma_mapped_walk; - - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) - #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -@@ -393,6 +394,7 @@ struct mm_walk_args { - - void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); - void lru_gen_change_state(bool enable, bool main, bool swap); -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - - #ifdef CONFIG_MEMCG - void lru_gen_init_memcg(struct mem_cgroup *memcg); -@@ -409,6 +411,10 @@ static inline void lru_gen_change_state( - { - } - -+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+} -+ - #ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) - { -@@ -1028,6 +1034,9 @@ typedef struct pglist_data { - - unsigned long flags; - -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args mm_walk_args; -+#endif - ZONE_PADDING(_pad2_) - - /* Per-node vmstats */ ---- a/include/linux/oom.h -+++ b/include/linux/oom.h -@@ -57,6 +57,22 @@ struct oom_control { - extern struct mutex oom_lock; - extern struct mutex oom_adj_mutex; - -+#ifdef CONFIG_MMU -+extern struct task_struct *oom_reaper_list; -+extern struct wait_queue_head oom_reaper_wait; -+ -+static inline bool oom_reaping_in_progress(void) -+{ -+ /* racy check to see if oom reaping could be in progress */ -+ return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait); -+} -+#else -+static inline bool oom_reaping_in_progress(void) -+{ -+ return false; -+} -+#endif -+ - static inline void set_current_oom_origin(void) - { - current->signal->oom_flag_origin = true; ---- a/include/linux/swap.h -+++ b/include/linux/swap.h -@@ -137,6 +137,9 @@ union swap_header { - */ - struct reclaim_state { - unsigned long reclaimed_slab; -+#ifdef CONFIG_LRU_GEN -+ struct mm_walk_args *mm_walk_args; -+#endif - }; - - #ifdef __KERNEL__ ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l - *lru_size += nr_pages; - - size = *lru_size; -+#ifdef CONFIG_LRU_GEN -+ /* unlikely but not a bug when reset_batch_size() is pending */ -+ VM_WARN_ON(size + MAX_BATCH_SIZE < 0); -+#else - if (WARN_ONCE(size < 0, - "%s(%p, %d, %d): lru_size %ld\n", - __func__, lruvec, lru, nr_pages, size)) { - VM_BUG_ON(1); - *lru_size = 0; - } -+#endif - - if (nr_pages > 0) - *lru_size += nr_pages; ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc - * victim (if that is possible) to help the OOM killer to move on. - */ - static struct task_struct *oom_reaper_th; --static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); --static struct task_struct *oom_reaper_list; -+DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); -+struct task_struct *oom_reaper_list; - static DEFINE_SPINLOCK(oom_reaper_lock); - - bool __oom_reap_task_mm(struct mm_struct *mm) ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -73,6 +73,7 @@ - #include - #include - #include -+#include - - #include - -@@ -793,6 +794,13 @@ static bool page_referenced_one(struct p - } - - if (pvmw.pte) { -+ /* the multigenerational lru exploits the spatial locality */ -+ if (lru_gen_enabled() && pte_young(*pvmw.pte) && -+ !(vma->vm_flags & VM_SEQ_READ)) { -+ lru_gen_look_around(&pvmw); -+ referenced++; -+ } -+ - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - /* ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -51,6 +51,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg - * shorthand helpers - ******************************************************************************/ - -+#define DEFINE_MAX_SEQ(lruvec) \ -+ unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) -+ -+#define DEFINE_MIN_SEQ(lruvec) \ -+ unsigned long min_seq[ANON_AND_FILE] = { \ -+ READ_ONCE((lruvec)->evictable.min_seq[0]), \ -+ READ_ONCE((lruvec)->evictable.min_seq[1]), \ -+ } -+ - #define for_each_gen_type_zone(gen, type, zone) \ - for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ - for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ -@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag - return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - } - -+static int get_swappiness(struct mem_cgroup *memcg) -+{ -+ return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? -+ mem_cgroup_swappiness(memcg) : 0; -+} -+ - static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg) - { - struct pglist_data *pgdat = NODE_DATA(nid); -@@ -3229,6 +3246,926 @@ done: - } - - /****************************************************************************** -+ * the aging -+ ******************************************************************************/ -+ -+static int page_update_gen(struct page *page, int gen) -+{ -+ unsigned long old_flags, new_flags; -+ -+ VM_BUG_ON(gen >= MAX_NR_GENS); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & LRU_GEN_MASK)) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+} -+ -+static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming) -+{ -+ int old_gen, new_gen; -+ unsigned long old_flags, new_flags; -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ old_gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page); -+ -+ new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; -+ /* page_update_gen() has updated this page? */ -+ if (new_gen >= 0 && new_gen != old_gen) { -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ return; -+ } -+ -+ new_gen = (old_gen + 1) % MAX_NR_GENS; -+ -+ new_flags &= ~LRU_GEN_MASK; -+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ /* for end_page_writeback() */ -+ if (reclaiming) -+ new_flags |= BIT(PG_reclaim); -+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+ -+ lru_gen_update_size(page, lruvec, old_gen, new_gen); -+ if (reclaiming) -+ list_move(&page->lru, &lrugen->lists[new_gen][type][zone]); -+ else -+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]); -+} -+ -+static void update_batch_size(struct page *page, int old_gen, int new_gen, -+ struct mm_walk_args *args) -+{ -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON(old_gen >= MAX_NR_GENS); -+ VM_BUG_ON(new_gen >= MAX_NR_GENS); -+ -+ args->batch_size++; -+ -+ args->nr_pages[old_gen][type][zone] -= delta; -+ args->nr_pages[new_gen][type][zone] += delta; -+} -+ -+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ args->batch_size = 0; -+ -+ for_each_gen_type_zone(gen, type, zone) { -+ enum lru_list lru = type * LRU_FILE; -+ int delta = args->nr_pages[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ args->nr_pages[gen][type][zone] = 0; -+ WRITE_ONCE(lrugen->sizes[gen][type][zone], -+ lrugen->sizes[gen][type][zone] + delta); -+ -+ if (lru_gen_is_active(lruvec, gen)) -+ lru += LRU_ACTIVE; -+ update_lru_size(lruvec, lru, zone, delta); -+ } -+} -+ -+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk) -+{ -+ struct address_space *mapping; -+ struct vm_area_struct *vma = walk->vma; -+ struct mm_walk_args *args = walk->private; -+ -+ if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) || -+ (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ))) -+ return true; -+ -+ if (vma_is_anonymous(vma)) -+ return !args->swappiness; -+ -+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) -+ return true; -+ -+ mapping = vma->vm_file->f_mapping; -+ if (!mapping->a_ops->writepage) -+ return true; -+ -+ return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping); -+} -+ -+/* -+ * Some userspace memory allocators create many single-page VMAs. So instead of -+ * returning back to the PGD table for each of such VMAs, we finish at least an -+ * entire PMD table and therefore avoid many zigzags. -+ */ -+static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size, -+ unsigned long *start, unsigned long *end) -+{ -+ unsigned long next = round_up(*end, size); -+ -+ VM_BUG_ON(mask & size); -+ VM_BUG_ON(*start >= *end); -+ VM_BUG_ON((next & mask) != (*start & mask)); -+ -+ while (walk->vma) { -+ if (next >= walk->vma->vm_end) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ if ((next & mask) != (walk->vma->vm_start & mask)) -+ return false; -+ -+ if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) { -+ walk->vma = walk->vma->vm_next; -+ continue; -+ } -+ -+ *start = max(next, walk->vma->vm_start); -+ next = (next | ~mask) + 1; -+ /* rounded-up boundaries can wrap to 0 */ -+ *end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pte_t *pte; -+ spinlock_t *ptl; -+ unsigned long addr; -+ int worth = 0; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pmd_leaf(*pmd)); -+ -+ pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl); -+ arch_enter_lazy_mmu_mode(); -+restart: -+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ struct page *page; -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ if (!pte_young(pte[i])) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end); -+ if (!ptep_test_and_clear_young(walk->vma, addr, pte + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ -+ worth++; -+ } -+ -+ if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end)) -+ goto restart; -+ -+ arch_leave_lazy_mmu_mode(); -+ pte_unmap_unlock(pte, ptl); -+ -+ return worth >= MIN_BATCH_SIZE / 2; -+} -+ -+/* -+ * We scan PMD entries in two passes. The first pass reaches to PTE tables and -+ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD -+ * entries and needs to take the PMD lock. -+ */ -+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ spinlock_t *ptl; -+ struct mm_walk_args *args = walk->private; -+ int old_gen, new_gen = lru_gen_from_seq(args->max_seq); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ start = (start & PUD_MASK) + offset * PMD_SIZE; -+ pmd = pmd_offset(pud, start); -+ ptl = pmd_lock(walk->mm, pmd); -+ arch_enter_lazy_mmu_mode(); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) { -+ struct page *page; -+ unsigned long pfn = pmd_pfn(pmd[i]); -+ unsigned long addr = start + i * PMD_SIZE; -+ -+ if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i])) -+ continue; -+ -+ if (WARN_ON_ONCE(pmd_devmap(pmd[i]))) -+ continue; -+ -+ if (!pmd_trans_huge(pmd[i])) { -+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) -+ pmdp_test_and_clear_young(vma, addr, pmd + i); -+ continue; -+ } -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ page = pfn_to_page(pfn); -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ if (page_to_nid(page) != args->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != args->memcg) -+ continue; -+ -+ VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end); -+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_LEAF_YOUNG]++; -+ -+ if (pmd_dirty(pmd[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ set_page_dirty(page); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ spin_unlock(ptl); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+#else -+static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset, -+ struct vm_area_struct *vma, struct mm_walk *walk) -+{ -+} -+#endif -+ -+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pmd_t *pmd; -+ unsigned long next; -+ unsigned long addr; -+ struct vm_area_struct *vma; -+ int offset = -1; -+ bool reset = false; -+ struct mm_walk_args *args = walk->private; -+ struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg); -+ -+ VM_BUG_ON(pud_leaf(*pud)); -+ -+ pmd = pmd_offset(pud, start & PUD_MASK); -+restart: -+ vma = walk->vma; -+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { -+ pmd_t val = pmd_read_atomic(pmd + i); -+ -+ /* for pmd_read_atomic() */ -+ barrier(); -+ -+ next = pmd_addr_end(addr, end); -+ -+ if (!pmd_present(val)) { -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ continue; -+ } -+ -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ if (pmd_trans_huge(val)) { -+ unsigned long pfn = pmd_pfn(val); -+ -+ args->mm_stats[MM_LEAF_TOTAL]++; -+ -+ if (is_huge_zero_pmd(val)) -+ continue; -+ -+ if (!pmd_young(val)) { -+ args->mm_stats[MM_LEAF_OLD]++; -+ continue; -+ } -+ -+ if (pfn < args->start_pfn || pfn >= args->end_pfn) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ } -+ __set_bit(i - offset, args->bitmap); -+ reset = true; -+ continue; -+ } -+#endif -+ args->mm_stats[MM_NONLEAF_TOTAL]++; -+ -+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG -+ if (!pmd_young(val)) -+ continue; -+ -+ if (offset < 0) -+ offset = i; -+ else if (i - offset >= MIN_BATCH_SIZE) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = i; -+ reset = false; -+ } -+ __set_bit(i - offset, args->bitmap); -+#endif -+ if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_PREV]++; -+ -+ if (!walk_pte_range(&val, addr, next, walk)) -+ continue; -+ -+ args->mm_stats[MM_NONLEAF_CUR]++; -+ -+ set_bloom_filter(lruvec, args->max_seq + 1, pmd + i); -+ } -+ -+ if (reset) { -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+ offset = -1; -+ reset = false; -+ } -+ -+ if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end)) -+ goto restart; -+ -+ if (offset >= 0) -+ walk_pmd_range_locked(pud, start, offset, vma, walk); -+} -+ -+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, -+ struct mm_walk *walk) -+{ -+ int i; -+ pud_t *pud; -+ unsigned long addr; -+ unsigned long next; -+ struct mm_walk_args *args = walk->private; -+ -+ VM_BUG_ON(p4d_leaf(*p4d)); -+ -+ pud = pud_offset(p4d, start & P4D_MASK); -+restart: -+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { -+ pud_t val = READ_ONCE(pud[i]); -+ -+ next = pud_addr_end(addr, end); -+ -+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) -+ continue; -+ -+ walk_pmd_range(&val, addr, next, walk); -+ -+ if (args->batch_size >= MAX_BATCH_SIZE) { -+ end = (addr | ~PUD_MASK) + 1; -+ goto done; -+ } -+ } -+ -+ if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end)) -+ goto restart; -+ -+ end = round_up(end, P4D_SIZE); -+done: -+ /* rounded-up boundaries can wrap to 0 */ -+ args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0; -+ -+ return -EAGAIN; -+} -+ -+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args) -+{ -+ static const struct mm_walk_ops mm_walk_ops = { -+ .test_walk = should_skip_vma, -+ .p4d_entry = walk_pud_range, -+ }; -+ -+ int err; -+ -+ args->next_addr = FIRST_USER_ADDRESS; -+ -+ do { -+ unsigned long start = args->next_addr; -+ unsigned long end = mm->highest_vm_end; -+ -+ err = -EBUSY; -+ -+ rcu_read_lock(); -+#ifdef CONFIG_MEMCG -+ if (args->memcg && atomic_read(&args->memcg->moving_account)) -+ goto contended; -+#endif -+ if (!mmap_read_trylock(mm)) -+ goto contended; -+ -+ err = walk_page_range(mm, start, end, &mm_walk_ops, args); -+ -+ mmap_read_unlock(mm); -+ -+ if (args->batch_size) { -+ spin_lock_irq(&lruvec->lru_lock); -+ reset_batch_size(lruvec, args); -+ spin_unlock_irq(&lruvec->lru_lock); -+ } -+contended: -+ rcu_read_unlock(); -+ -+ cond_resched(); -+ } while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm)); -+} -+ -+static struct mm_walk_args *alloc_mm_walk_args(void) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL); -+ -+ return current->reclaim_state->mm_walk_args; -+} -+ -+static void free_mm_walk_args(struct mm_walk_args *args) -+{ -+ if (!current->reclaim_state || !current->reclaim_state->mm_walk_args) -+ kvfree(args); -+} -+ -+static bool inc_min_seq(struct lruvec *lruvec, int type) -+{ -+ int gen, zone; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS) -+ return true; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ page_inc_gen(page, lruvec, false); -+ -+ if (!--remaining) -+ return false; -+ } -+ } -+ -+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); -+ -+ return true; -+} -+ -+static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) -+{ -+ int gen, type, zone; -+ bool success = false; -+ struct lrugen *lrugen = &lruvec->evictable; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) { -+ gen = lru_gen_from_seq(min_seq[type]); -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ if (!list_empty(&lrugen->lists[gen][type][zone])) -+ goto next; -+ } -+ -+ min_seq[type]++; -+ } -+next: -+ ; -+ } -+ -+ min_seq[0] = min(min_seq[0], min_seq[1]); -+ if (swappiness) -+ min_seq[1] = max(min_seq[0], lrugen->min_seq[1]); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ if (min_seq[type] == lrugen->min_seq[type]) -+ continue; -+ -+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); -+ success = true; -+ } -+ -+ return success; -+} -+ -+static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq) -+{ -+ int gen, type, zone; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ if (max_seq != lrugen->max_seq) -+ goto unlock; -+ -+ if (!try_to_inc_min_seq(lruvec, true)) { -+ for (type = ANON_AND_FILE - 1; type >= 0; type--) { -+ while (!inc_min_seq(lruvec, type)) { -+ spin_unlock_irq(&lruvec->lru_lock); -+ cond_resched(); -+ spin_lock_irq(&lruvec->lru_lock); -+ } -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq - 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); -+ } -+ } -+ -+ gen = lru_gen_from_seq(lrugen->max_seq + 1); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) { -+ enum lru_list lru = type * LRU_FILE; -+ long delta = lrugen->sizes[gen][type][zone]; -+ -+ if (!delta) -+ continue; -+ -+ WARN_ON_ONCE(delta != (int)delta); -+ -+ update_lru_size(lruvec, lru, zone, -delta); -+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); -+ } -+ } -+ -+ WRITE_ONCE(lrugen->timestamps[gen], jiffies); -+ /* make sure all preceding modifications appear first */ -+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -+unlock: -+ spin_unlock_irq(&lruvec->lru_lock); -+} -+ -+/* Main function used by the foreground, the background and the user-triggered aging. */ -+static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, bool use_filter) -+{ -+ bool last; -+ struct mm_walk_args *args; -+ struct mm_struct *mm = NULL; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ int nid = pgdat->node_id; -+ -+ VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq)); -+ -+ /* -+ * If we are not from run_aging() and clearing the accessed bit may -+ * trigger page faults, then don't proceed to clearing all accessed -+ * PTEs. Instead, fallback to lru_gen_look_around(), which only clears a -+ * handful of accessed PTEs. This is less efficient but causes fewer -+ * page faults on CPUs that don't have the capability. -+ */ -+ if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) { -+ inc_max_seq(lruvec, max_seq); -+ return true; -+ } -+ -+ args = alloc_mm_walk_args(); -+ if (!args) -+ return false; -+ -+ args->memcg = memcg; -+ args->max_seq = max_seq; -+ args->start_pfn = pgdat->node_start_pfn; -+ args->end_pfn = pgdat_end_pfn(pgdat); -+ args->node_id = nid; -+ args->swappiness = swappiness; -+ args->use_filter = use_filter; -+ -+ do { -+ last = get_next_mm(lruvec, args, &mm); -+ if (mm) -+ walk_mm(lruvec, mm, args); -+ -+ cond_resched(); -+ } while (mm); -+ -+ free_mm_walk_args(args); -+ -+ if (!last) { -+ /* don't wait unless we may have trouble reclaiming */ -+ if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2) -+ wait_event_killable(lruvec->mm_walk.wait, -+ max_seq < READ_ONCE(lrugen->max_seq)); -+ -+ return max_seq < READ_ONCE(lrugen->max_seq); -+ } -+ -+ VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq)); -+ -+ inc_max_seq(lruvec, max_seq); -+ /* either we see any waiters or they will see updated max_seq */ -+ if (wq_has_sleeper(&lruvec->mm_walk.wait)) -+ wake_up_all(&lruvec->mm_walk.wait); -+ -+ wakeup_flusher_threads(WB_REASON_VMSCAN); -+ -+ return true; -+} -+ -+static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long max_seq, unsigned long *min_seq, bool *low) -+{ -+ int gen, type, zone; -+ long max = 0; -+ long min = 0; -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (type = !swappiness; type < ANON_AND_FILE; type++) { -+ unsigned long seq; -+ -+ for (seq = min_seq[type]; seq <= max_seq; seq++) { -+ long size = 0; -+ -+ gen = lru_gen_from_seq(seq); -+ -+ for (zone = 0; zone <= sc->reclaim_idx; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ max += size; -+ if (type && max_seq - seq >= MIN_NR_GENS) -+ min += size; -+ } -+ } -+ -+ *low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE; -+ -+ return max > 0 ? max : 0; -+} -+ -+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, -+ unsigned long min_ttl) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int swappiness = get_swappiness(memcg); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg)) -+ return false; -+ -+ if (min_ttl) { -+ int gen = lru_gen_from_seq(min_seq[1]); -+ unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]); -+ -+ if (time_is_after_jiffies(birth + min_ttl)) -+ return false; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return false; -+ -+ nr_to_scan >>= sc->priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim)) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true); -+ -+ return true; -+} -+ -+/* Protect the working set accessed within the last N milliseconds. */ -+static unsigned long lru_gen_min_ttl __read_mostly; -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+ struct mem_cgroup *memcg; -+ bool success = false; -+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); -+ -+ VM_BUG_ON(!current_is_kswapd()); -+ -+ if (!sc->force_deactivate) { -+ sc->force_deactivate = 1; -+ return; -+ } -+ -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ -+ if (age_lruvec(lruvec, sc, min_ttl)) -+ success = true; -+ -+ cond_resched(); -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ if (!success && mutex_trylock(&oom_lock)) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ .order = sc->order, -+ }; -+ -+ /* to avoid overkilling */ -+ if (!oom_reaping_in_progress()) -+ out_of_memory(&oc); -+ -+ mutex_unlock(&oom_lock); -+ } -+ -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */ -+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) -+{ -+ int i; -+ pte_t *pte; -+ struct page *page; -+ int old_gen, new_gen; -+ unsigned long start; -+ unsigned long end; -+ unsigned long addr; -+ struct mm_walk_args *args; -+ int worth = 0; -+ struct mem_cgroup *memcg = page_memcg(pvmw->page); -+ struct pglist_data *pgdat = page_pgdat(pvmw->page); -+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ DEFINE_MAX_SEQ(lruvec); -+ -+ lockdep_assert_held(pvmw->ptl); -+ VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (!args) -+ return; -+ -+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); -+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; -+ -+ if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) { -+ if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ end = start + MIN_BATCH_SIZE * PAGE_SIZE; -+ else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2) -+ start = end - MIN_BATCH_SIZE * PAGE_SIZE; -+ else { -+ start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2; -+ } -+ } -+ -+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; -+ new_gen = lru_gen_from_seq(max_seq); -+ -+ lock_page_memcg(pvmw->page); -+ arch_enter_lazy_mmu_mode(); -+ -+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { -+ unsigned long pfn = pte_pfn(pte[i]); -+ -+ if (!pte_present(pte[i]) || is_zero_pfn(pfn)) -+ continue; -+ -+ if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i]))) -+ continue; -+ -+ VM_BUG_ON(!pfn_valid(pfn)); -+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) -+ continue; -+ -+ worth++; -+ -+ if (!pte_young(pte[i])) -+ continue; -+ -+ page = compound_head(pfn_to_page(pfn)); -+ if (page_to_nid(page) != pgdat->node_id) -+ continue; -+ -+ if (page_memcg_rcu(page) != memcg) -+ continue; -+ -+ VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end); -+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) -+ continue; -+ -+ if (pte_dirty(pte[i]) && !PageDirty(page) && -+ !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page))) -+ __set_bit(i, args->bitmap); -+ -+ old_gen = page_update_gen(page, new_gen); -+ if (old_gen >= 0 && old_gen != new_gen) -+ update_batch_size(page, old_gen, new_gen, args); -+ } -+ -+ arch_leave_lazy_mmu_mode(); -+ unlock_page_memcg(pvmw->page); -+ -+ if (worth >= MIN_BATCH_SIZE / 2) -+ set_bloom_filter(lruvec, max_seq, pvmw->pmd); -+ -+ for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) -+ set_page_dirty(pte_page(pte[i])); -+ -+ bitmap_zero(args->bitmap, MIN_BATCH_SIZE); -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void) - }; - late_initcall(init_lru_gen); - -+#else -+ -+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis - struct mem_cgroup *memcg; - struct lruvec *lruvec; - -+ if (lru_gen_enabled()) { -+ lru_gen_age_node(pgdat, sc); -+ return; -+ } -+ - if (!can_age_anon_pages(pgdat, sc)) - return; - diff --git a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch deleted file mode 100644 index 09e4315dcc..0000000000 --- a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch +++ /dev/null @@ -1,1002 +0,0 @@ -From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Tue, 29 Jun 2021 20:46:47 -0600 -Subject: [PATCH 07/10] mm: multigenerational lru: eviction - -The eviction consumes old generations. Given an lruvec, the eviction -scans pages on lrugen->lists indexed by anon and file min_seq[] -(modulo MAX_NR_GENS). It first tries to select a type based on the -values of min_seq[]. If they are equal, it selects the type that has -a lower refaulted %. The eviction sorts a page according to its -updated generation number if the aging has found this page accessed. -It also moves a page to the next generation if this page is from an -upper tier that has a higher refaulted % than the base tier. The -eviction increments min_seq[] of a selected type when it finds -lrugen->lists indexed by min_seq[] of this selected type are empty. - -Each generation is divided into multiple tiers. Tiers represent -different ranges of numbers of accesses from file descriptors only. -Pages accessed N times via file descriptors belong to tier -order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, -and they require additional MAX_NR_TIERS-2 bits in page->flags. In -contrast to moving between generations which requires list operations, -moving between tiers only involves operations on page->flags and -therefore has a negligible cost. A feedback loop modeled after the PID -controller monitors refaulted % across all tiers and decides when to -protect pages from which tiers. - -Unmapped pages are initially added to the oldest generation and then -conditionally protected by tiers. Each tier keeps track of how many -pages from it have refaulted. Tier 0 is the base tier and pages from -it are evicted unconditionally because there are no better candidates. -Pages from an upper tier are either evicted or moved to the next -generation, depending on whether this upper tier has a higher -refaulted % than the base tier. This model has the following -advantages: - 1) It removes the cost in the buffered access path and reduces the - overall cost of protection because pages are conditionally protected - in the reclaim path. - 2) It takes mapped pages into account and avoids overprotecting - pages accessed multiple times via file descriptors. - 3 Additional tiers improve the protection of pages accessed more - than twice. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac ---- - include/linux/mm_inline.h | 10 + - include/linux/mmzone.h | 33 +++ - mm/swap.c | 42 +++ - mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- - mm/workingset.c | 120 ++++++++- - 5 files changed, 757 insertions(+), 3 deletions(-) - ---- a/include/linux/mm_inline.h -+++ b/include/linux/mm_inline.h -@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi - return seq % NR_HIST_GENS; - } - -+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ -+static inline int lru_tier_from_refs(int refs) -+{ -+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); -+ -+ return order_base_2(refs + 1); -+} -+ - /* The youngest and the second youngest generations are counted as active. */ - static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) - { -@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru - gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - - new_flags &= ~LRU_GEN_MASK; -+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for shrink_page_list() */ - if (reclaiming) - new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); ---- a/include/linux/mmzone.h -+++ b/include/linux/mmzone.h -@@ -319,6 +319,30 @@ struct page_vma_mapped_walk; - #define MIN_NR_GENS 2 - #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) - -+/* -+ * Each generation is divided into multiple tiers. Tiers represent different -+ * ranges of numbers of accesses from file descriptors, i.e., -+ * mark_page_accessed(). In contrast to moving between generations which -+ * requires the lru lock, moving between tiers only involves an atomic -+ * operation on page->flags and therefore has a negligible cost. -+ * -+ * The purposes of tiers are to: -+ * 1) estimate whether pages accessed multiple times via file descriptors are -+ * more active than pages accessed only via page tables by separating the two -+ * access types into upper tiers and the base tier, and comparing refaulted % -+ * across all tiers. -+ * 2) improve buffered io performance by deferring the protection of pages -+ * accessed multiple times until the eviction. That is the protection happens -+ * in the reclaim path, not the access path. -+ * -+ * Pages accessed N times via file descriptors belong to tier order_base_2(N). -+ * The base tier may be marked by PageReferenced(). All upper tiers are marked -+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are -+ * used to support more than one upper tier. -+ */ -+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) -+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) -+ - /* Whether to keep stats for historical generations. */ - #ifdef CONFIG_LRU_GEN_STATS - #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) -@@ -337,6 +361,15 @@ struct lrugen { - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; - /* the sizes of the multigenerational lru lists in pages */ - unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; -+ /* the exponential moving average of refaulted */ -+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the exponential moving average of protected+evicted */ -+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; -+ /* the base tier isn't protected, hence the minus one */ -+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; -+ /* incremented without holding the lru lock */ -+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; -+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; - /* whether the multigenerational lru is enabled */ - bool enabled[ANON_AND_FILE]; - }; ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st - local_unlock(&lru_pvecs.lock); - } - -+#ifdef CONFIG_LRU_GEN -+static void page_inc_refs(struct page *page) -+{ -+ unsigned long refs; -+ unsigned long old_flags, new_flags; -+ -+ if (PageUnevictable(page)) -+ return; -+ -+ /* see the comment on MAX_NR_TIERS */ -+ do { -+ new_flags = old_flags = READ_ONCE(page->flags); -+ -+ if (!(new_flags & BIT(PG_referenced))) { -+ new_flags |= BIT(PG_referenced); -+ continue; -+ } -+ -+ if (!(new_flags & BIT(PG_workingset))) { -+ new_flags |= BIT(PG_workingset); -+ continue; -+ } -+ -+ refs = new_flags & LRU_REFS_MASK; -+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); -+ -+ new_flags &= ~LRU_REFS_MASK; -+ new_flags |= refs; -+ } while (new_flags != old_flags && -+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags); -+} -+#else -+static void page_inc_refs(struct page *page) -+{ -+} -+#endif /* CONFIG_LRU_GEN */ -+ - /* - * Mark a page as having seen activity. - * -@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag - { - page = compound_head(page); - -+ if (lru_gen_enabled()) { -+ page_inc_refs(page); -+ return; -+ } -+ - if (!PageReferenced(page)) { - SetPageReferenced(page); - } else if (PageUnevictable(page)) { ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre - - if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page_private(page) }; -- mem_cgroup_swapout(page, swap); -+ -+ /* get a shadow entry before page_memcg() is cleared */ - if (reclaimed && !mapping_exiting(mapping)) - shadow = workingset_eviction(page, target_memcg); -+ mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irq(&mapping->i_pages); - put_swap_page(page, swap); -@@ -1410,6 +1412,11 @@ retry: - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; - -+ /* lru_gen_look_around() has updated this page? */ -+ if (lru_gen_enabled() && !ignore_references && -+ page_mapped(page) && PageReferenced(page)) -+ goto keep_locked; -+ - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); - -@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t - unsigned long file; - struct lruvec *target_lruvec; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - - /* -@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag - return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; - } - -+static int page_lru_tier(struct page *page) -+{ -+ int refs; -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? -+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; -+ -+ return lru_tier_from_refs(refs); -+} -+ - static int get_swappiness(struct mem_cgroup *memcg) - { - return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? -@@ -3246,6 +3267,91 @@ done: - } - - /****************************************************************************** -+ * refault feedback loop -+ ******************************************************************************/ -+ -+/* -+ * A feedback loop modeled after the PID controller. Currently supports the -+ * proportional (P) and the integral (I) terms; the derivative (D) term can be -+ * added if necessary. The setpoint (SP) is the desired position; the process -+ * variable (PV) is the measured position. The error is the difference between -+ * the SP and the PV. A positive error results in a positive control output -+ * correction, which, in our case, is to allow eviction. -+ * -+ * The P term is refaulted % of the current generation being evicted. The I -+ * term is the exponential moving average of refaulted % of previously evicted -+ * generations, using the smoothing factor 1/2. -+ * -+ * Our goal is to maintain proportional refaulted % across all tiers. -+ */ -+struct ctrl_pos { -+ unsigned long refaulted; -+ unsigned long total; -+ int gain; -+}; -+ -+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, -+ struct ctrl_pos *pos) -+{ -+ struct lrugen *lrugen = &lruvec->evictable; -+ int hist = lru_hist_from_seq(lrugen->min_seq[type]); -+ -+ pos->refaulted = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ pos->total = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ pos->total += lrugen->protected[hist][type][tier - 1]; -+ pos->gain = gain; -+} -+ -+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) -+{ -+ int tier; -+ int hist = lru_hist_from_seq(gen); -+ struct lrugen *lrugen = &lruvec->evictable; -+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); -+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; -+ -+ if (!carryover && !clear) -+ return; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ if (carryover) { -+ unsigned long sum; -+ -+ sum = lrugen->avg_refaulted[type][tier] + -+ atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); -+ -+ sum = lrugen->avg_total[type][tier] + -+ atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ sum += lrugen->protected[hist][type][tier - 1]; -+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); -+ } -+ -+ if (clear) { -+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); -+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0); -+ if (tier) -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); -+ } -+ } -+} -+ -+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) -+{ -+ /* -+ * Allow eviction if the PV has a limited number of refaulted pages or a -+ * lower refaulted % than the SP. -+ */ -+ return pv->refaulted < MIN_BATCH_SIZE || -+ pv->refaulted * max(sp->total, 1UL) * sp->gain <= -+ sp->refaulted * max(pv->total, 1UL) * pv->gain; -+} -+ -+/****************************************************************************** - * the aging - ******************************************************************************/ - -@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page * - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - } while (new_flags != old_flags && - cmpxchg(&page->flags, old_flags, new_flags) != old_flags); - -@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa - - new_flags &= ~LRU_GEN_MASK; - new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; -+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); - /* for end_page_writeback() */ - if (reclaiming) - new_flags |= BIT(PG_reclaim); -@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l - } - } - -+ reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); - - return true; -@@ -3824,6 +3933,8 @@ next: - if (min_seq[type] == lrugen->min_seq[type]) - continue; - -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ reset_ctrl_pos(lruvec, gen, type); - WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); - success = true; - } -@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l - } - } - -+ for (type = 0; type < ANON_AND_FILE; type++) -+ reset_ctrl_pos(lruvec, gen, type); -+ - WRITE_ONCE(lrugen->timestamps[gen], jiffies); - /* make sure all preceding modifications appear first */ - smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); -@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma - } - - /****************************************************************************** -+ * the eviction -+ ******************************************************************************/ -+ -+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) -+{ -+ bool success; -+ int gen = page_lru_gen(page); -+ int type = page_is_file_lru(page); -+ int zone = page_zonenum(page); -+ int tier = page_lru_tier(page); -+ int delta = thp_nr_pages(page); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); -+ -+ /* an mlocked page? */ -+ if (!page_evictable(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageUnevictable(page); -+ add_page_to_lru_list(page, lruvec); -+ __count_vm_events(UNEVICTABLE_PGCULLED, delta); -+ return true; -+ } -+ -+ /* a lazy-free page that has been written into? */ -+ if (type && PageDirty(page) && PageAnon(page)) { -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ SetPageSwapBacked(page); -+ add_page_to_lru_list_tail(page, lruvec); -+ return true; -+ } -+ -+ /* page_update_gen() has updated this page? */ -+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { -+ list_move(&page->lru, &lrugen->lists[gen][type][zone]); -+ return true; -+ } -+ -+ /* protect this page if its tier has a higher refaulted % */ -+ if (tier > tier_idx) { -+ int hist = lru_hist_from_seq(gen); -+ -+ page_inc_gen(page, lruvec, false); -+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], -+ lrugen->protected[hist][type][tier - 1] + delta); -+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); -+ return true; -+ } -+ -+ /* mark this page for reclaim if it's pending writeback */ -+ if (PageWriteback(page) || (type && PageDirty(page))) { -+ page_inc_gen(page, lruvec, true); -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) -+{ -+ bool success; -+ -+ if (!sc->may_unmap && page_mapped(page)) -+ return false; -+ -+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && -+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) -+ return false; -+ -+ if (!get_page_unless_zero(page)) -+ return false; -+ -+ if (!TestClearPageLRU(page)) { -+ put_page(page); -+ return false; -+ } -+ -+ success = lru_gen_del_page(page, lruvec, true); -+ VM_BUG_ON_PAGE(!success, page); -+ -+ return true; -+} -+ -+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, -+ int type, int tier, struct list_head *list) -+{ -+ int gen, zone; -+ enum vm_event_item item; -+ int sorted = 0; -+ int scanned = 0; -+ int isolated = 0; -+ int remaining = MAX_BATCH_SIZE; -+ struct lrugen *lrugen = &lruvec->evictable; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ -+ VM_BUG_ON(!list_empty(list)); -+ -+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS) -+ return 0; -+ -+ gen = lru_gen_from_seq(lrugen->min_seq[type]); -+ -+ for (zone = sc->reclaim_idx; zone >= 0; zone--) { -+ LIST_HEAD(moved); -+ int skipped = 0; -+ struct list_head *head = &lrugen->lists[gen][type][zone]; -+ -+ while (!list_empty(head)) { -+ struct page *page = lru_to_page(head); -+ int delta = thp_nr_pages(page); -+ -+ VM_BUG_ON_PAGE(PageTail(page), page); -+ VM_BUG_ON_PAGE(PageUnevictable(page), page); -+ VM_BUG_ON_PAGE(PageActive(page), page); -+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); -+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); -+ -+ prefetchw_prev_lru_page(page, head, flags); -+ -+ scanned += delta; -+ -+ if (sort_page(page, lruvec, tier)) -+ sorted += delta; -+ else if (isolate_page(page, lruvec, sc)) { -+ list_add(&page->lru, list); -+ isolated += delta; -+ } else { -+ list_move(&page->lru, &moved); -+ skipped += delta; -+ } -+ -+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ if (skipped) { -+ list_splice(&moved, head); -+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); -+ } -+ -+ if (!remaining || isolated >= MIN_BATCH_SIZE) -+ break; -+ } -+ -+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; -+ if (!cgroup_reclaim(sc)) { -+ __count_vm_events(item, isolated); -+ __count_vm_events(PGREFILL, sorted); -+ } -+ __count_memcg_events(memcg, item, isolated); -+ __count_memcg_events(memcg, PGREFILL, sorted); -+ __count_vm_events(PGSCAN_ANON + type, isolated); -+ -+ /* -+ * We may have trouble finding eligible pages due to reclaim_idx, -+ * may_unmap and may_writepage. Check `remaining` to make sure we won't -+ * be stuck if we aren't making enough progress. -+ */ -+ return isolated || !remaining ? scanned : 0; -+} -+ -+static int get_tier_idx(struct lruvec *lruvec, int type) -+{ -+ int tier; -+ struct ctrl_pos sp, pv; -+ -+ /* -+ * Ideally we don't want to evict upper tiers that have higher refaulted -+ * %. However, we need to leave a margin for the fluctuation in -+ * refaulted %. So we use a larger gain factor to make sure upper tiers -+ * are indeed more active. We choose 2 because the lowest upper tier -+ * would have twice of refaulted % of the base tier, according to their -+ * numbers of accesses. -+ */ -+ read_ctrl_pos(lruvec, type, 0, 1, &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, 2, &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ return tier - 1; -+} -+ -+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) -+{ -+ int type, tier; -+ struct ctrl_pos sp, pv; -+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; -+ -+ /* -+ * Compare refaulted % between the base tiers of anon and file to -+ * determine which type to evict. Also need to compare refaulted % of -+ * the upper tiers of the selected type with that of the base tier of -+ * the other type to determine which tier of the selected type to evict. -+ */ -+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); -+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); -+ type = positive_ctrl_err(&sp, &pv); -+ -+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); -+ for (tier = 1; tier < MAX_NR_TIERS; tier++) { -+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv); -+ if (!positive_ctrl_err(&sp, &pv)) -+ break; -+ } -+ -+ *tier_idx = tier - 1; -+ -+ return type; -+} -+ -+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ int *type_scanned, struct list_head *list) -+{ -+ int i; -+ int type; -+ int scanned; -+ int tier = -1; -+ DEFINE_MIN_SEQ(lruvec); -+ -+ VM_BUG_ON(!seq_is_valid(lruvec)); -+ -+ /* -+ * Try to select a type based on generations and swappiness, and if that -+ * fails, fall back to get_type_to_scan(). When anon and file are both -+ * available from the same generation, swappiness 200 is interpreted as -+ * anon first and swappiness 1 is interpreted as file first. -+ */ -+ if (!swappiness) -+ type = 1; -+ else if (min_seq[0] < min_seq[1]) -+ type = 0; -+ else if (swappiness == 1) -+ type = 1; -+ else if (swappiness == 200) -+ type = 0; -+ else -+ type = get_type_to_scan(lruvec, swappiness, &tier); -+ -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ if (tier < 0) -+ tier = get_tier_idx(lruvec, type); -+ -+ scanned = scan_pages(lruvec, sc, type, tier, list); -+ if (scanned) -+ break; -+ -+ type = !type; -+ tier = -1; -+ } -+ -+ *type_scanned = type; -+ -+ return scanned; -+} -+ -+/* Main function used by the foreground, the background and the user-triggered eviction. */ -+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ int type; -+ int scanned; -+ int reclaimed; -+ LIST_HEAD(list); -+ struct page *page; -+ enum vm_event_item item; -+ struct reclaim_stat stat; -+ struct mm_walk_args *args; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); -+ -+ if (try_to_inc_min_seq(lruvec, swappiness)) -+ scanned++; -+ -+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) -+ scanned = 0; -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ if (list_empty(&list)) -+ return scanned; -+ -+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); -+ /* -+ * We need to prevent rejected pages from being added back to the same -+ * lists they were isolated from. Otherwise we may risk looping on them -+ * forever. -+ */ -+ list_for_each_entry(page, &list, lru) { -+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) -+ SetPageActive(page); -+ -+ ClearPageReferenced(page); -+ ClearPageWorkingset(page); -+ } -+ -+ spin_lock_irq(&lruvec->lru_lock); -+ -+ move_pages_to_lru(lruvec, &list); -+ -+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; -+ if (args && args->batch_size) -+ reset_batch_size(lruvec, args); -+ -+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; -+ if (!cgroup_reclaim(sc)) -+ __count_vm_events(item, reclaimed); -+ __count_memcg_events(memcg, item, reclaimed); -+ __count_vm_events(PGSTEAL_ANON + type, reclaimed); -+ -+ spin_unlock_irq(&lruvec->lru_lock); -+ -+ mem_cgroup_uncharge_list(&list); -+ free_unref_page_list(&list); -+ -+ sc->nr_reclaimed += reclaimed; -+ -+ return scanned; -+} -+ -+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) -+{ -+ bool low; -+ long nr_to_scan; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ int priority = sc->priority; -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (mem_cgroup_below_min(memcg) || -+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) -+ return 0; -+ -+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) { -+ priority = DEF_PRIORITY; -+ sc->force_deactivate = 0; -+ } -+ -+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); -+ if (!nr_to_scan) -+ return 0; -+ -+ nr_to_scan >>= priority; -+ -+ if (!mem_cgroup_online(memcg)) -+ nr_to_scan++; -+ -+ if (!nr_to_scan) -+ return 0; -+ -+ if (current_is_kswapd()) { -+ /* leave the work to lru_gen_age_node() */ -+ if (max_seq - min_seq[1] < MIN_NR_GENS) -+ return 0; -+ -+ if (!low) -+ sc->force_deactivate = 0; -+ -+ return nr_to_scan; -+ } -+ -+ if (max_seq - min_seq[1] >= MIN_NR_GENS) -+ return nr_to_scan; -+ -+ /* move onto slab and other memcgs if we haven't tried them all */ -+ if (!sc->force_deactivate) { -+ sc->skipped_deactivate = 1; -+ return 0; -+ } -+ -+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; -+} -+ -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+ struct blk_plug plug; -+ long scanned = 0; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ struct pglist_data *pgdat = lruvec_pgdat(lruvec); -+ -+ lru_add_drain(); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; -+ -+ blk_start_plug(&plug); -+ -+ while (true) { -+ int delta; -+ int swappiness; -+ long nr_to_scan; -+ -+ if (sc->may_swap) -+ swappiness = get_swappiness(memcg); -+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) -+ swappiness = 1; -+ else -+ swappiness = 0; -+ -+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); -+ if (!nr_to_scan) -+ break; -+ -+ delta = evict_pages(lruvec, sc, swappiness); -+ if (!delta) -+ break; -+ -+ scanned += delta; -+ if (scanned >= nr_to_scan) -+ break; -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ if (current_is_kswapd()) -+ current->reclaim_state->mm_walk_args = NULL; -+} -+ -+/****************************************************************************** - * state change - ******************************************************************************/ - -@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli - { - } - -+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -+{ -+} -+ - #endif /* CONFIG_LRU_GEN */ - - static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec - struct blk_plug plug; - bool scan_adjusted; - -+ if (lru_gen_enabled()) { -+ lru_gen_shrink_lruvec(lruvec, sc); -+ return; -+ } -+ - get_scan_count(lruvec, sc, nr); - - /* Record the original scan target for proportional adjustments later */ -@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem - struct lruvec *target_lruvec; - unsigned long refaults; - -+ if (lru_gen_enabled()) -+ return; -+ - target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; ---- a/mm/workingset.c -+++ b/mm/workingset.c -@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ - static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, - bool workingset) - { -- eviction >>= bucket_order; - eviction &= EVICTION_MASK; - eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; - eviction = (eviction << NODES_SHIFT) | pgdat->node_id; -@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, - - *memcgidp = memcgid; - *pgdat = NODE_DATA(nid); -- *evictionp = entry << bucket_order; -+ *evictionp = entry; - *workingsetp = workingset; - } - -+#ifdef CONFIG_LRU_GEN -+ -+static int page_lru_refs(struct page *page) -+{ -+ unsigned long flags = READ_ONCE(page->flags); -+ -+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); -+ -+ /* see the comment on MAX_NR_TIERS */ -+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; -+} -+ -+/* Return a token to be stored in the shadow entry of a page being evicted. */ -+static void *lru_gen_eviction(struct page *page) -+{ -+ int hist, tier; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ int type = page_is_file_lru(page); -+ int refs = page_lru_refs(page); -+ int delta = thp_nr_pages(page); -+ bool workingset = PageWorkingset(page); -+ struct mem_cgroup *memcg = page_memcg(page); -+ struct pglist_data *pgdat = page_pgdat(page); -+ -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ token = (min_seq << LRU_REFS_WIDTH) | refs; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); -+ -+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); -+} -+ -+/* Count a refaulted page based on the token stored in its shadow entry. */ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+ int hist, tier, refs; -+ int memcg_id; -+ bool workingset; -+ unsigned long token; -+ unsigned long min_seq; -+ struct lruvec *lruvec; -+ struct lrugen *lrugen; -+ struct mem_cgroup *memcg; -+ struct pglist_data *pgdat; -+ int type = page_is_file_lru(page); -+ int delta = thp_nr_pages(page); -+ -+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); -+ if (page_pgdat(page) != pgdat) -+ return; -+ -+ rcu_read_lock(); -+ memcg = page_memcg_rcu(page); -+ if (mem_cgroup_id(memcg) != memcg_id) -+ goto unlock; -+ -+ refs = token & (BIT(LRU_REFS_WIDTH) - 1); -+ if (refs && !workingset) -+ goto unlock; -+ -+ token >>= LRU_REFS_WIDTH; -+ lruvec = mem_cgroup_lruvec(memcg, pgdat); -+ lrugen = &lruvec->evictable; -+ min_seq = READ_ONCE(lrugen->min_seq[type]); -+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) -+ goto unlock; -+ -+ hist = lru_hist_from_seq(min_seq); -+ tier = lru_tier_from_refs(refs + workingset); -+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); -+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); -+ -+ /* -+ * Tiers don't offer any protection to pages accessed via page tables. -+ * That's what generations do. Tiers can't fully protect pages after -+ * their numbers of accesses has exceeded the max value. Conservatively -+ * count these two conditions as stalls even though they might not -+ * indicate any real memory pressure. -+ */ -+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { -+ SetPageWorkingset(page); -+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); -+ } -+unlock: -+ rcu_read_unlock(); -+} -+ -+#else -+ -+static void *lru_gen_eviction(struct page *page) -+{ -+ return NULL; -+} -+ -+static void lru_gen_refault(struct page *page, void *shadow) -+{ -+} -+ -+#endif /* CONFIG_LRU_GEN */ -+ - /** - * workingset_age_nonresident - age non-resident entries as LRU ages - * @lruvec: the lruvec that was aged -@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - -+ if (lru_gen_enabled()) -+ return lru_gen_eviction(page); -+ - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - /* XXX: target_memcg can be NULL, go through lruvec */ - memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->nonresident_age); -+ eviction >>= bucket_order; - workingset_age_nonresident(lruvec, thp_nr_pages(page)); - return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); - } -@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag - bool workingset; - int memcgid; - -+ if (lru_gen_enabled()) { -+ lru_gen_refault(page, shadow); -+ return; -+ } -+ - unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); -+ eviction <<= bucket_order; - - rcu_read_lock(); - /* diff --git a/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch deleted file mode 100644 index a1a749fc38..0000000000 --- a/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch +++ /dev/null @@ -1,496 +0,0 @@ -From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Mon, 25 Jan 2021 21:38:02 -0700 -Subject: [PATCH 08/10] mm: multigenerational lru: user interface - -Add /sys/kernel/mm/lru_gen/enabled to enable and disable the -multigenerational lru at runtime. - -Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a -given number of milliseconds. The OOM killer is invoked if this -working set cannot be kept in memory. - -Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and -invoke the aging and the eviction. This file has the following output: - memcg memcg_id memcg_path - node node_id - min_gen birth_time anon_size file_size - ... - max_gen birth_time anon_size file_size - -min_gen is the oldest generation number and max_gen is the youngest -generation number. birth_time is in milliseconds. anon_size and -file_size are in pages. - -This file takes the following input: - + memcg_id node_id max_gen [swappiness] [use_bloom_filter] - - memcg_id node_id min_gen [swappiness] [nr_to_reclaim] - -The first command line invokes the aging, which scans PTEs for -accessed pages and then creates the next generation max_gen+1. A swap -file and a non-zero swappiness, which overrides vm.swappiness, are -required to scan PTEs mapping anon pages. The second command line -invokes the eviction, which evicts generations less than or equal to -min_gen. min_gen should be less than max_gen-1 as max_gen and -max_gen-1 are not fully aged and therefore cannot be evicted. -Setting nr_to_reclaim to N limits the number of pages to evict. -Setting use_bloom_filter to 0 overrides the default behavior which -only scans PTE tables found populated. Multiple command lines are -supported, as is concatenation with delimiters "," and ";". - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3 ---- - include/linux/nodemask.h | 1 + - mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++ - 2 files changed, 416 insertions(+) - ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -485,6 +485,7 @@ static inline int num_node_state(enum no - #define first_online_node 0 - #define first_memory_node 0 - #define next_online_node(nid) (MAX_NUMNODES) -+#define next_memory_node(nid) (MAX_NUMNODES) - #define nr_node_ids 1U - #define nr_online_nodes 1U - ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -53,6 +53,8 @@ - #include - #include - #include -+#include -+#include - - #include - #include -@@ -4882,6 +4884,413 @@ unlock: - } - - /****************************************************************************** -+ * sysfs interface -+ ******************************************************************************/ -+ -+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); -+} -+ -+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ unsigned int msecs; -+ -+ if (kstrtouint(buf, 10, &msecs)) -+ return -EINVAL; -+ -+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( -+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl -+); -+ -+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -+{ -+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled()); -+} -+ -+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t len) -+{ -+ bool enable; -+ -+ if (kstrtobool(buf, &enable)) -+ return -EINVAL; -+ -+ lru_gen_change_state(enable, true, false); -+ -+ return len; -+} -+ -+static struct kobj_attribute lru_gen_enabled_attr = __ATTR( -+ enabled, 0644, show_enable, store_enable -+); -+ -+static struct attribute *lru_gen_attrs[] = { -+ &lru_gen_min_ttl_attr.attr, -+ &lru_gen_enabled_attr.attr, -+ NULL -+}; -+ -+static struct attribute_group lru_gen_attr_group = { -+ .name = "lru_gen", -+ .attrs = lru_gen_attrs, -+}; -+ -+/****************************************************************************** -+ * debugfs interface -+ ******************************************************************************/ -+ -+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) -+{ -+ struct mem_cgroup *memcg; -+ loff_t nr_to_skip = *pos; -+ -+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL); -+ if (!m->private) -+ return ERR_PTR(-ENOMEM); -+ -+ memcg = mem_cgroup_iter(NULL, NULL, NULL); -+ do { -+ int nid; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ if (!nr_to_skip--) -+ return get_lruvec(nid, memcg); -+ } -+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); -+ -+ return NULL; -+} -+ -+static void lru_gen_seq_stop(struct seq_file *m, void *v) -+{ -+ if (!IS_ERR_OR_NULL(v)) -+ mem_cgroup_iter_break(NULL, lruvec_memcg(v)); -+ -+ kvfree(m->private); -+ m->private = NULL; -+} -+ -+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) -+{ -+ int nid = lruvec_pgdat(v)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(v); -+ -+ ++*pos; -+ -+ nid = next_memory_node(nid); -+ if (nid == MAX_NUMNODES) { -+ memcg = mem_cgroup_iter(NULL, memcg, NULL); -+ if (!memcg) -+ return NULL; -+ -+ nid = first_memory_node; -+ } -+ -+ return get_lruvec(nid, memcg); -+} -+ -+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, -+ unsigned long max_seq, unsigned long *min_seq, -+ unsigned long seq) -+{ -+ int i; -+ int type, tier; -+ int hist = lru_hist_from_seq(seq); -+ struct lrugen *lrugen = &lruvec->evictable; -+ -+ for (tier = 0; tier < MAX_NR_TIERS; tier++) { -+ seq_printf(m, " %10d", tier); -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ unsigned long n[3] = {}; -+ -+ if (seq == max_seq) { -+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); -+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]); -+ -+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]); -+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { -+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); -+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); -+ if (tier) -+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); -+ -+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]); -+ } else -+ seq_puts(m, " 0 0 0 "); -+ } -+ seq_putc(m, '\n'); -+ } -+ -+ seq_puts(m, " "); -+ for (i = 0; i < NR_MM_STATS; i++) { -+ if (seq == max_seq && NR_HIST_GENS == 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ toupper(MM_STAT_CODES[i])); -+ else if (seq != max_seq && NR_HIST_GENS > 1) -+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]), -+ MM_STAT_CODES[i]); -+ else -+ seq_puts(m, " 0 "); -+ } -+ seq_putc(m, '\n'); -+} -+ -+static int lru_gen_seq_show(struct seq_file *m, void *v) -+{ -+ unsigned long seq; -+ bool full = !debugfs_real_fops(m->file)->write; -+ struct lruvec *lruvec = v; -+ struct lrugen *lrugen = &lruvec->evictable; -+ int nid = lruvec_pgdat(lruvec)->node_id; -+ struct mem_cgroup *memcg = lruvec_memcg(lruvec); -+ DEFINE_MAX_SEQ(lruvec); -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (nid == first_memory_node) { -+ const char *path = memcg ? m->private : ""; -+ -+#ifdef CONFIG_MEMCG -+ if (memcg) -+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); -+#endif -+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); -+ } -+ -+ seq_printf(m, " node %5d\n", nid); -+ -+ if (!full) -+ seq = min_seq[0]; -+ else if (max_seq >= MAX_NR_GENS) -+ seq = max_seq - MAX_NR_GENS + 1; -+ else -+ seq = 0; -+ -+ for (; seq <= max_seq; seq++) { -+ int gen, type, zone; -+ unsigned int msecs; -+ -+ gen = lru_gen_from_seq(seq); -+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen])); -+ -+ seq_printf(m, " %10lu %10u", seq, msecs); -+ -+ for (type = 0; type < ANON_AND_FILE; type++) { -+ long size = 0; -+ -+ if (seq < min_seq[type]) { -+ seq_puts(m, " -0 "); -+ continue; -+ } -+ -+ for (zone = 0; zone < MAX_NR_ZONES; zone++) -+ size += READ_ONCE(lrugen->sizes[gen][type][zone]); -+ -+ seq_printf(m, " %10lu ", max(size, 0L)); -+ } -+ -+ seq_putc(m, '\n'); -+ -+ if (full) -+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); -+ } -+ -+ return 0; -+} -+ -+static const struct seq_operations lru_gen_seq_ops = { -+ .start = lru_gen_seq_start, -+ .stop = lru_gen_seq_stop, -+ .next = lru_gen_seq_next, -+ .show = lru_gen_seq_show, -+}; -+ -+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, bool use_filter) -+{ -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq == max_seq) -+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter); -+ -+ return seq > max_seq ? -EINVAL : 0; -+} -+ -+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness, -+ unsigned long seq, unsigned long nr_to_reclaim) -+{ -+ struct blk_plug plug; -+ int err = -EINTR; -+ DEFINE_MAX_SEQ(lruvec); -+ -+ if (seq >= max_seq - 1) -+ return -EINVAL; -+ -+ sc->nr_reclaimed = 0; -+ -+ blk_start_plug(&plug); -+ -+ while (!signal_pending(current)) { -+ DEFINE_MIN_SEQ(lruvec); -+ -+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim || -+ !evict_pages(lruvec, sc, swappiness)) { -+ err = 0; -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ blk_finish_plug(&plug); -+ -+ return err; -+} -+ -+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc, -+ int swappiness, unsigned long seq, unsigned long opt) -+{ -+ struct lruvec *lruvec; -+ int err = -EINVAL; -+ struct mem_cgroup *memcg = NULL; -+ -+ if (!mem_cgroup_disabled()) { -+ rcu_read_lock(); -+ memcg = mem_cgroup_from_id(memcg_id); -+#ifdef CONFIG_MEMCG -+ if (memcg && !css_tryget(&memcg->css)) -+ memcg = NULL; -+#endif -+ rcu_read_unlock(); -+ -+ if (!memcg) -+ goto done; -+ } -+ if (memcg_id != mem_cgroup_id(memcg)) -+ goto done; -+ -+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) -+ goto done; -+ -+ lruvec = get_lruvec(nid, memcg); -+ -+ if (swappiness < 0) -+ swappiness = get_swappiness(memcg); -+ else if (swappiness > 200) -+ goto done; -+ -+ switch (cmd) { -+ case '+': -+ err = run_aging(lruvec, sc, swappiness, seq, opt); -+ break; -+ case '-': -+ err = run_eviction(lruvec, sc, swappiness, seq, opt); -+ break; -+ } -+done: -+ mem_cgroup_put(memcg); -+ -+ return err; -+} -+ -+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, -+ size_t len, loff_t *pos) -+{ -+ void *buf; -+ char *cur, *next; -+ unsigned int flags; -+ int err = 0; -+ struct scan_control sc = { -+ .may_writepage = 1, -+ .may_unmap = 1, -+ .may_swap = 1, -+ .reclaim_idx = MAX_NR_ZONES - 1, -+ .gfp_mask = GFP_KERNEL, -+ }; -+ -+ buf = kvmalloc(len + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ -+ if (copy_from_user(buf, src, len)) { -+ kvfree(buf); -+ return -EFAULT; -+ } -+ -+ next = buf; -+ next[len] = '\0'; -+ -+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args(); -+ if (!sc.reclaim_state.mm_walk_args) { -+ kvfree(buf); -+ return -ENOMEM; -+ } -+ -+ flags = memalloc_noreclaim_save(); -+ set_task_reclaim_state(current, &sc.reclaim_state); -+ -+ while ((cur = strsep(&next, ",;\n"))) { -+ int n; -+ int end; -+ char cmd; -+ unsigned int memcg_id; -+ unsigned int nid; -+ unsigned long seq; -+ unsigned int swappiness = -1; -+ unsigned long opt = -1; -+ -+ cur = skip_spaces(cur); -+ if (!*cur) -+ continue; -+ -+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, -+ &seq, &end, &swappiness, &end, &opt, &end); -+ if (n < 4 || cur[end]) { -+ err = -EINVAL; -+ break; -+ } -+ -+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt); -+ if (err) -+ break; -+ } -+ -+ set_task_reclaim_state(current, NULL); -+ memalloc_noreclaim_restore(flags); -+ -+ free_mm_walk_args(sc.reclaim_state.mm_walk_args); -+ kvfree(buf); -+ -+ return err ? : len; -+} -+ -+static int lru_gen_seq_open(struct inode *inode, struct file *file) -+{ -+ return seq_open(file, &lru_gen_seq_ops); -+} -+ -+static const struct file_operations lru_gen_rw_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .write = lru_gen_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+static const struct file_operations lru_gen_ro_fops = { -+ .open = lru_gen_seq_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release, -+}; -+ -+/****************************************************************************** - * initialization - ******************************************************************************/ - -@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void) - BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); - BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); - -+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) -+ pr_err("lru_gen: failed to create sysfs group\n"); -+ -+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); -+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); -+ - return 0; - }; - late_initcall(init_lru_gen); diff --git a/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch deleted file mode 100644 index 4462549f99..0000000000 --- a/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch +++ /dev/null @@ -1,80 +0,0 @@ -From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Mon, 25 Jan 2021 21:47:24 -0700 -Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig - -Add configuration options for the multigenerational lru. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635 ---- - mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 59 insertions(+) - ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -899,4 +899,63 @@ config SECRETMEM - - source "mm/damon/Kconfig" - -+# the multigenerational lru { -+config LRU_GEN -+ bool "Multigenerational LRU" -+ depends on MMU -+ # the following options may leave not enough spare bits in page->flags -+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP) -+ help -+ A high performance LRU implementation to heavily overcommit workloads -+ that are not IO bound. See Documentation/vm/multigen_lru.rst for -+ details. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces a small per-process and per-memcg and per-node memory -+ overhead. -+ -+config LRU_GEN_ENABLED -+ bool "Turn on by default" -+ depends on LRU_GEN -+ help -+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option -+ changes it to 1. -+ -+ Warning: the default value is the fast path. See -+ Documentation/static-keys.txt for details. -+ -+config LRU_GEN_STATS -+ bool "Full stats for debugging" -+ depends on LRU_GEN -+ help -+ This option keeps full stats for each generation, which can be read -+ from /sys/kernel/debug/lru_gen_full. -+ -+ Warning: do not enable this option unless you plan to use it because -+ it introduces an additional small per-process and per-memcg and -+ per-node memory overhead. -+ -+config NR_LRU_GENS -+ int "Max number of generations" -+ depends on LRU_GEN -+ range 4 31 -+ default 7 -+ help -+ This will use order_base_2(N+1) spare bits from page flags. -+ -+ Warning: do not use numbers larger than necessary because each -+ generation introduces a small per-node and per-memcg memory overhead. -+ -+config TIERS_PER_GEN -+ int "Number of tiers per generation" -+ depends on LRU_GEN -+ range 2 5 -+ default 4 -+ help -+ This will use N-2 spare bits from page flags. -+ -+ Larger values generally offer better protection to active pages under -+ heavy buffered I/O workloads. -+# } -+ - endmenu diff --git a/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch deleted file mode 100644 index f4716fb68d..0000000000 --- a/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch +++ /dev/null @@ -1,161 +0,0 @@ -From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001 -From: Yu Zhao -Date: Tue, 2 Feb 2021 01:27:45 -0700 -Subject: [PATCH 10/10] mm: multigenerational lru: documentation - -Add Documentation/vm/multigen_lru.rst. - -Signed-off-by: Yu Zhao -Tested-by: Konstantin Kharlamov -Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e ---- - Documentation/vm/index.rst | 1 + - Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++ - 2 files changed, 133 insertions(+) - create mode 100644 Documentation/vm/multigen_lru.rst - ---- a/Documentation/vm/index.rst -+++ b/Documentation/vm/index.rst -@@ -17,6 +17,7 @@ various features of the Linux memory man - - swap_numa - zswap -+ multigen_lru - - Kernel developers MM documentation - ================================== ---- /dev/null -+++ b/Documentation/vm/multigen_lru.rst -@@ -0,0 +1,132 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+===================== -+Multigenerational LRU -+===================== -+ -+Quick Start -+=========== -+Build Configurations -+-------------------- -+:Required: Set ``CONFIG_LRU_GEN=y``. -+ -+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by -+ default. -+ -+Runtime Configurations -+---------------------- -+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the -+ feature was not turned on by default. -+ -+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to -+ protect the working set of ``N`` milliseconds. The OOM killer is -+ invoked if this working set cannot be kept in memory. -+ -+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature -+ is turned on. This file has the following output: -+ -+:: -+ -+ memcg memcg_id memcg_path -+ node node_id -+ min_gen birth_time anon_size file_size -+ ... -+ max_gen birth_time anon_size file_size -+ -+``min_gen`` is the oldest generation number and ``max_gen`` is the -+youngest generation number. ``birth_time`` is in milliseconds. -+``anon_size`` and ``file_size`` are in pages. -+ -+Phones/Laptops/Workstations -+--------------------------- -+No additional configurations required. -+ -+Servers/Data Centers -+-------------------- -+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a -+ larger number. -+ -+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger -+ number. -+ -+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``. -+ -+:Working set estimation: Write ``+ memcg_id node_id max_gen -+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to -+ invoke the aging, which scans PTEs for accessed pages and then -+ creates the next generation ``max_gen+1``. A swap file and a non-zero -+ ``swappiness``, which overrides ``vm.swappiness``, are required to -+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to -+ override the default behavior which only scans PTE tables found -+ populated. -+ -+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness] -+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the -+ eviction, which evicts generations less than or equal to ``min_gen``. -+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and -+ ``max_gen-1`` are not fully aged and therefore cannot be evicted. -+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple -+ command lines are supported, so does concatenation with delimiters -+ ``,`` and ``;``. -+ -+Framework -+========= -+For each ``lruvec``, evictable pages are divided into multiple -+generations. The youngest generation number is stored in -+``lrugen->max_seq`` for both anon and file types as they are aged on -+an equal footing. The oldest generation numbers are stored in -+``lrugen->min_seq[]`` separately for anon and file types as clean -+file pages can be evicted regardless of swap and writeback -+constraints. These three variables are monotonically increasing. -+Generation numbers are truncated into -+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into -+``page->flags``. The sliding window technique is used to prevent -+truncated generation numbers from overlapping. Each truncated -+generation number is an index to an array of per-type and per-zone -+lists ``lrugen->lists``. -+ -+Each generation is divided into multiple tiers. Tiers represent -+different ranges of numbers of accesses from file descriptors only. -+Pages accessed ``N`` times via file descriptors belong to tier -+``order_base_2(N)``. Each generation contains at most -+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional -+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to -+moving between generations which requires list operations, moving -+between tiers only involves operations on ``page->flags`` and -+therefore has a negligible cost. A feedback loop modeled after the PID -+controller monitors refaulted % across all tiers and decides when to -+protect pages from which tiers. -+ -+The framework comprises two conceptually independent components: the -+aging and the eviction, which can be invoked separately from user -+space for the purpose of working set estimation and proactive reclaim. -+ -+Aging -+----- -+The aging produces young generations. Given an ``lruvec``, the aging -+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()`` -+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained -+for each ``memcg``). Upon finding one, the aging updates its -+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``). -+After each round of traversal, the aging increments ``max_seq``. The -+aging is due when ``min_seq[]`` reaches ``max_seq-1``. -+ -+Eviction -+-------- -+The eviction consumes old generations. Given an ``lruvec``, the -+eviction scans pages on the per-zone lists indexed by anon and file -+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to -+select a type based on the values of ``min_seq[]``. If they are -+equal, it selects the type that has a lower refaulted %. The eviction -+sorts a page according to its updated generation number if the aging -+has found this page accessed. It also moves a page to the next -+generation if this page is from an upper tier that has a higher -+refaulted % than the base tier. The eviction increments ``min_seq[]`` -+of a selected type when it finds all the per-zone lists indexed by -+``min_seq[]`` of this selected type are empty. -+ -+To-do List -+========== -+KVM Optimization -+---------------- -+Support shadow page table walk.