2f4fb911f5b580d42b6cf7c2fd1734a441f6f100
[openwrt/staging/nbd.git] /
1 From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:19:02 -0700
4 Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard
5
6 Recall that the aging produces the youngest generation: first it scans
7 for accessed folios and updates their gen counters; then it increments
8 lrugen->max_seq.
9
10 The current aging fairness safeguard for kswapd uses two passes to
11 ensure fairness among multiple eligible memcgs. On the first pass,
12 which is shared with the eviction, it checks whether all eligible
13 memcgs are low on cold folios. If so, it requires a second pass, in
14 which it ages all those memcgs at the same time.
15
16 With memcg LRU, the aging, while ensuring eventual fairness, will run
17 when necessary. Therefore the current aging fairness safeguard for
18 kswapd is no longer needed.
19
20 Note that memcg LRU only applies to global reclaim. For memcg reclaim,
21 the aging can be unfair to different memcgs, i.e., their
22 lrugen->max_seq can be incremented at different paces.
23
24 Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com
25 Signed-off-by: Yu Zhao <yuzhao@google.com>
26 Cc: Johannes Weiner <hannes@cmpxchg.org>
27 Cc: Jonathan Corbet <corbet@lwn.net>
28 Cc: Michael Larabel <Michael@MichaelLarabel.com>
29 Cc: Michal Hocko <mhocko@kernel.org>
30 Cc: Mike Rapoport <rppt@kernel.org>
31 Cc: Roman Gushchin <roman.gushchin@linux.dev>
32 Cc: Suren Baghdasaryan <surenb@google.com>
33 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
34 ---
35 mm/vmscan.c | 126 ++++++++++++++++++++++++----------------------------
36 1 file changed, 59 insertions(+), 67 deletions(-)
37
38 --- a/mm/vmscan.c
39 +++ b/mm/vmscan.c
40 @@ -131,7 +131,6 @@ struct scan_control {
41
42 #ifdef CONFIG_LRU_GEN
43 /* help kswapd make better choices among multiple memcgs */
44 - unsigned int memcgs_need_aging:1;
45 unsigned long last_reclaimed;
46 #endif
47
48 @@ -4184,7 +4183,7 @@ done:
49 return true;
50 }
51
52 -static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
53 +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
54 struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
55 {
56 int gen, type, zone;
57 @@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruv
58 unsigned long total = 0;
59 struct lru_gen_folio *lrugen = &lruvec->lrugen;
60 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
61 + DEFINE_MIN_SEQ(lruvec);
62 +
63 + /* whether this lruvec is completely out of cold folios */
64 + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
65 + *nr_to_scan = 0;
66 + return true;
67 + }
68
69 for (type = !can_swap; type < ANON_AND_FILE; type++) {
70 unsigned long seq;
71 @@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruv
72 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
73 * ideal number of generations is MIN_NR_GENS+1.
74 */
75 - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
76 - return true;
77 if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
78 return false;
79
80 @@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruv
81 return false;
82 }
83
84 -static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
85 +static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
86 {
87 - bool need_aging;
88 - unsigned long nr_to_scan;
89 - int swappiness = get_swappiness(lruvec, sc);
90 + int gen, type, zone;
91 + unsigned long total = 0;
92 + bool can_swap = get_swappiness(lruvec, sc);
93 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
94 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
95 DEFINE_MAX_SEQ(lruvec);
96 DEFINE_MIN_SEQ(lruvec);
97
98 - VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
99 + for (type = !can_swap; type < ANON_AND_FILE; type++) {
100 + unsigned long seq;
101
102 - mem_cgroup_calculate_protection(NULL, memcg);
103 + for (seq = min_seq[type]; seq <= max_seq; seq++) {
104 + gen = lru_gen_from_seq(seq);
105
106 - if (mem_cgroup_below_min(memcg))
107 - return false;
108 + for (zone = 0; zone < MAX_NR_ZONES; zone++)
109 + total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
110 + }
111 + }
112
113 - need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
114 + /* whether the size is big enough to be helpful */
115 + return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
116 +}
117
118 - if (min_ttl) {
119 - int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
120 - unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
121 +static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
122 + unsigned long min_ttl)
123 +{
124 + int gen;
125 + unsigned long birth;
126 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
127 + DEFINE_MIN_SEQ(lruvec);
128
129 - if (time_is_after_jiffies(birth + min_ttl))
130 - return false;
131 + VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
132
133 - /* the size is likely too small to be helpful */
134 - if (!nr_to_scan && sc->priority != DEF_PRIORITY)
135 - return false;
136 - }
137 + /* see the comment on lru_gen_folio */
138 + gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
139 + birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
140
141 - if (need_aging)
142 - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
143 + if (time_is_after_jiffies(birth + min_ttl))
144 + return false;
145
146 - return true;
147 + if (!lruvec_is_sizable(lruvec, sc))
148 + return false;
149 +
150 + mem_cgroup_calculate_protection(NULL, memcg);
151 +
152 + return !mem_cgroup_below_min(memcg);
153 }
154
155 /* to protect the working set of the last N jiffies */
156 @@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __r
157 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
158 {
159 struct mem_cgroup *memcg;
160 - bool success = false;
161 unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
162
163 VM_WARN_ON_ONCE(!current_is_kswapd());
164
165 sc->last_reclaimed = sc->nr_reclaimed;
166
167 - /*
168 - * To reduce the chance of going into the aging path, which can be
169 - * costly, optimistically skip it if the flag below was cleared in the
170 - * eviction path. This improves the overall performance when multiple
171 - * memcgs are available.
172 - */
173 - if (!sc->memcgs_need_aging) {
174 - sc->memcgs_need_aging = true;
175 + /* check the order to exclude compaction-induced reclaim */
176 + if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
177 return;
178 - }
179 -
180 - set_mm_walk(pgdat);
181
182 memcg = mem_cgroup_iter(NULL, NULL, NULL);
183 do {
184 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
185
186 - if (age_lruvec(lruvec, sc, min_ttl))
187 - success = true;
188 + if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
189 + mem_cgroup_iter_break(NULL, memcg);
190 + return;
191 + }
192
193 cond_resched();
194 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
195
196 - clear_mm_walk();
197 -
198 - /* check the order to exclude compaction-induced reclaim */
199 - if (success || !min_ttl || sc->order)
200 - return;
201 -
202 /*
203 * The main goal is to OOM kill if every generation from all memcgs is
204 * younger than min_ttl. However, another possibility is all memcgs are
205 - * either below min or empty.
206 + * either too small or below min.
207 */
208 if (mutex_trylock(&oom_lock)) {
209 struct oom_control oc = {
210 @@ -4830,33 +4834,27 @@ retry:
211 * reclaim.
212 */
213 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
214 - bool can_swap, bool *need_aging)
215 + bool can_swap)
216 {
217 unsigned long nr_to_scan;
218 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
219 DEFINE_MAX_SEQ(lruvec);
220 - DEFINE_MIN_SEQ(lruvec);
221
222 if (mem_cgroup_below_min(memcg) ||
223 (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
224 return 0;
225
226 - *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
227 - if (!*need_aging)
228 + if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
229 return nr_to_scan;
230
231 /* skip the aging path at the default priority */
232 if (sc->priority == DEF_PRIORITY)
233 - goto done;
234 + return nr_to_scan;
235
236 - /* leave the work to lru_gen_age_node() */
237 - if (current_is_kswapd())
238 - return 0;
239 + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
240
241 - if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
242 - return nr_to_scan;
243 -done:
244 - return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
245 + /* skip this lruvec as it's low on cold folios */
246 + return 0;
247 }
248
249 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
250 @@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(s
251 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
252 {
253 struct blk_plug plug;
254 - bool need_aging = false;
255 unsigned long scanned = 0;
256 - unsigned long reclaimed = sc->nr_reclaimed;
257 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
258
259 lru_add_drain();
260 @@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct
261 else
262 swappiness = 0;
263
264 - nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
265 + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
266 if (!nr_to_scan)
267 - goto done;
268 + break;
269
270 delta = evict_folios(lruvec, sc, swappiness);
271 if (!delta)
272 - goto done;
273 + break;
274
275 scanned += delta;
276 if (scanned >= nr_to_scan)
277 @@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct
278 cond_resched();
279 }
280
281 - /* see the comment in lru_gen_age_node() */
282 - if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
283 - sc->memcgs_need_aging = false;
284 -done:
285 clear_mm_walk();
286
287 blk_finish_plug(&plug);