14f28820ec015da8ad92e57c591508c8fba9d2a5
[openwrt/staging/linusw.git] /
1 From 8ee8571e47aa75221e5fbd4c9c7802fc4244c346 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:19:04 -0700
4 Subject: [PATCH 06/19] BACKPORT: mm: multi-gen LRU: per-node lru_gen_folio
5 lists
6
7 For each node, memcgs are divided into two generations: the old and
8 the young. For each generation, memcgs are randomly sharded into
9 multiple bins to improve scalability. For each bin, an RCU hlist_nulls
10 is virtually divided into three segments: the head, the tail and the
11 default.
12
13 An onlining memcg is added to the tail of a random bin in the old
14 generation. The eviction starts at the head of a random bin in the old
15 generation. The per-node memcg generation counter, whose remainder (mod
16 2) indexes the old generation, is incremented when all its bins become
17 empty.
18
19 There are four operations:
20 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
21 its current generation (old or young) and updates its "seg" to
22 "head";
23 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
24 its current generation (old or young) and updates its "seg" to
25 "tail";
26 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
27 the old generation, updates its "gen" to "old" and resets its "seg"
28 to "default";
29 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
30 in the young generation, updates its "gen" to "young" and resets
31 its "seg" to "default".
32
33 The events that trigger the above operations are:
34 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
35 2. The first attempt to reclaim an memcg below low, which triggers
36 MEMCG_LRU_TAIL;
37 3. The first attempt to reclaim an memcg below reclaimable size
38 threshold, which triggers MEMCG_LRU_TAIL;
39 4. The second attempt to reclaim an memcg below reclaimable size
40 threshold, which triggers MEMCG_LRU_YOUNG;
41 5. Attempting to reclaim an memcg below min, which triggers
42 MEMCG_LRU_YOUNG;
43 6. Finishing the aging on the eviction path, which triggers
44 MEMCG_LRU_YOUNG;
45 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
46
47 Note that memcg LRU only applies to global reclaim, and the
48 round-robin incrementing of their max_seq counters ensures the
49 eventual fairness to all eligible memcgs. For memcg reclaim, it still
50 relies on mem_cgroup_iter().
51
52 Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
53 Signed-off-by: Yu Zhao <yuzhao@google.com>
54 Cc: Johannes Weiner <hannes@cmpxchg.org>
55 Cc: Jonathan Corbet <corbet@lwn.net>
56 Cc: Michael Larabel <Michael@MichaelLarabel.com>
57 Cc: Michal Hocko <mhocko@kernel.org>
58 Cc: Mike Rapoport <rppt@kernel.org>
59 Cc: Roman Gushchin <roman.gushchin@linux.dev>
60 Cc: Suren Baghdasaryan <surenb@google.com>
61 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
62 Bug: 274865848
63 (cherry picked from commit e4dde56cd208674ce899b47589f263499e5b8cdc)
64 [TJ: Resolved conflicts with older function signatures for
65 mem_cgroup_below_min / mem_cgroup_below_low and includes]
66 Change-Id: Idc8a0f635e035d72dd911f807d1224cb47cbd655
67 Signed-off-by: T.J. Mercier <tjmercier@google.com>
68 ---
69 include/linux/memcontrol.h | 10 +
70 include/linux/mm_inline.h | 17 ++
71 include/linux/mmzone.h | 117 +++++++++++-
72 mm/memcontrol.c | 16 ++
73 mm/page_alloc.c | 1 +
74 mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++----
75 6 files changed, 500 insertions(+), 35 deletions(-)
76
77 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
78 index e039763029563..82d28b052a9e5 100644
79 --- a/include/linux/memcontrol.h
80 +++ b/include/linux/memcontrol.h
81 @@ -790,6 +790,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
82 percpu_ref_put(&objcg->refcnt);
83 }
84
85 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
86 +{
87 + return !memcg || css_tryget(&memcg->css);
88 +}
89 +
90 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
91 {
92 if (memcg)
93 @@ -1290,6 +1295,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
94 {
95 }
96
97 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
98 +{
99 + return true;
100 +}
101 +
102 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
103 {
104 }
105 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
106 index da38e3d962e2f..c1fd3922dc5dd 100644
107 --- a/include/linux/mm_inline.h
108 +++ b/include/linux/mm_inline.h
109 @@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void)
110 return current->in_lru_fault;
111 }
112
113 +#ifdef CONFIG_MEMCG
114 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
115 +{
116 + return READ_ONCE(lruvec->lrugen.seg);
117 +}
118 +#else
119 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
120 +{
121 + return 0;
122 +}
123 +#endif
124 +
125 static inline int lru_gen_from_seq(unsigned long seq)
126 {
127 return seq % MAX_NR_GENS;
128 @@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void)
129 return false;
130 }
131
132 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
133 +{
134 + return 0;
135 +}
136 +
137 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
138 {
139 return false;
140 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
141 index 02e4323744715..66e067a635682 100644
142 --- a/include/linux/mmzone.h
143 +++ b/include/linux/mmzone.h
144 @@ -7,6 +7,7 @@
145
146 #include <linux/spinlock.h>
147 #include <linux/list.h>
148 +#include <linux/list_nulls.h>
149 #include <linux/wait.h>
150 #include <linux/bitops.h>
151 #include <linux/cache.h>
152 @@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
153 #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
154 #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
155
156 +/* see the comment on MEMCG_NR_GENS */
157 +enum {
158 + MEMCG_LRU_NOP,
159 + MEMCG_LRU_HEAD,
160 + MEMCG_LRU_TAIL,
161 + MEMCG_LRU_OLD,
162 + MEMCG_LRU_YOUNG,
163 +};
164 +
165 #ifdef CONFIG_LRU_GEN
166
167 enum {
168 @@ -426,6 +436,14 @@ struct lru_gen_folio {
169 atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
170 /* whether the multi-gen LRU is enabled */
171 bool enabled;
172 +#ifdef CONFIG_MEMCG
173 + /* the memcg generation this lru_gen_folio belongs to */
174 + u8 gen;
175 + /* the list segment this lru_gen_folio belongs to */
176 + u8 seg;
177 + /* per-node lru_gen_folio list for global reclaim */
178 + struct hlist_nulls_node list;
179 +#endif
180 };
181
182 enum {
183 @@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec);
184 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
185
186 #ifdef CONFIG_MEMCG
187 +
188 +/*
189 + * For each node, memcgs are divided into two generations: the old and the
190 + * young. For each generation, memcgs are randomly sharded into multiple bins
191 + * to improve scalability. For each bin, the hlist_nulls is virtually divided
192 + * into three segments: the head, the tail and the default.
193 + *
194 + * An onlining memcg is added to the tail of a random bin in the old generation.
195 + * The eviction starts at the head of a random bin in the old generation. The
196 + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
197 + * the old generation, is incremented when all its bins become empty.
198 + *
199 + * There are four operations:
200 + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
201 + * current generation (old or young) and updates its "seg" to "head";
202 + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
203 + * current generation (old or young) and updates its "seg" to "tail";
204 + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
205 + * generation, updates its "gen" to "old" and resets its "seg" to "default";
206 + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
207 + * young generation, updates its "gen" to "young" and resets its "seg" to
208 + * "default".
209 + *
210 + * The events that trigger the above operations are:
211 + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
212 + * 2. The first attempt to reclaim an memcg below low, which triggers
213 + * MEMCG_LRU_TAIL;
214 + * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
215 + * which triggers MEMCG_LRU_TAIL;
216 + * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
217 + * which triggers MEMCG_LRU_YOUNG;
218 + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
219 + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
220 + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
221 + *
222 + * Note that memcg LRU only applies to global reclaim, and the round-robin
223 + * incrementing of their max_seq counters ensures the eventual fairness to all
224 + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
225 + */
226 +#define MEMCG_NR_GENS 2
227 +#define MEMCG_NR_BINS 8
228 +
229 +struct lru_gen_memcg {
230 + /* the per-node memcg generation counter */
231 + unsigned long seq;
232 + /* each memcg has one lru_gen_folio per node */
233 + unsigned long nr_memcgs[MEMCG_NR_GENS];
234 + /* per-node lru_gen_folio list for global reclaim */
235 + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
236 + /* protects the above */
237 + spinlock_t lock;
238 +};
239 +
240 +void lru_gen_init_pgdat(struct pglist_data *pgdat);
241 +
242 void lru_gen_init_memcg(struct mem_cgroup *memcg);
243 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
244 -#endif
245 +void lru_gen_online_memcg(struct mem_cgroup *memcg);
246 +void lru_gen_offline_memcg(struct mem_cgroup *memcg);
247 +void lru_gen_release_memcg(struct mem_cgroup *memcg);
248 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
249 +
250 +#else /* !CONFIG_MEMCG */
251 +
252 +#define MEMCG_NR_GENS 1
253 +
254 +struct lru_gen_memcg {
255 +};
256 +
257 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
258 +{
259 +}
260 +
261 +#endif /* CONFIG_MEMCG */
262
263 #else /* !CONFIG_LRU_GEN */
264
265 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
266 +{
267 +}
268 +
269 static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
270 {
271 }
272 @@ -494,6 +587,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
273 }
274
275 #ifdef CONFIG_MEMCG
276 +
277 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
278 {
279 }
280 @@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
281 static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
282 {
283 }
284 -#endif
285 +
286 +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
287 +{
288 +}
289 +
290 +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
291 +{
292 +}
293 +
294 +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
295 +{
296 +}
297 +
298 +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
299 +{
300 +}
301 +
302 +#endif /* CONFIG_MEMCG */
303
304 #endif /* CONFIG_LRU_GEN */
305
306 @@ -1219,6 +1330,8 @@ typedef struct pglist_data {
307 #ifdef CONFIG_LRU_GEN
308 /* kswap mm walk data */
309 struct lru_gen_mm_walk mm_walk;
310 + /* lru_gen_folio list */
311 + struct lru_gen_memcg memcg_lru;
312 #endif
313
314 CACHELINE_PADDING(_pad2_);
315 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
316 index 3e8f1ad0fe9db..7815d556e38cc 100644
317 --- a/mm/memcontrol.c
318 +++ b/mm/memcontrol.c
319 @@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
320 struct mem_cgroup_per_node *mz;
321 struct mem_cgroup_tree_per_node *mctz;
322
323 + if (lru_gen_enabled()) {
324 + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
325 +
326 + /* see the comment on MEMCG_NR_GENS */
327 + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
328 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
329 +
330 + return;
331 + }
332 +
333 mctz = soft_limit_tree.rb_tree_per_node[nid];
334 if (!mctz)
335 return;
336 @@ -3522,6 +3532,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
337 struct mem_cgroup_tree_per_node *mctz;
338 unsigned long excess;
339
340 + if (lru_gen_enabled())
341 + return 0;
342 +
343 if (order > 0)
344 return 0;
345
346 @@ -5382,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
347 if (unlikely(mem_cgroup_is_root(memcg)))
348 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
349 2UL*HZ);
350 + lru_gen_online_memcg(memcg);
351 return 0;
352 offline_kmem:
353 memcg_offline_kmem(memcg);
354 @@ -5413,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
355 memcg_offline_kmem(memcg);
356 reparent_shrinker_deferred(memcg);
357 wb_memcg_offline(memcg);
358 + lru_gen_offline_memcg(memcg);
359
360 drain_all_stock(memcg);
361
362 @@ -5424,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
363 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
364
365 invalidate_reclaim_iterators(memcg);
366 + lru_gen_release_memcg(memcg);
367 }
368
369 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
370 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
371 index 69668817fed37..473057b81a9df 100644
372 --- a/mm/page_alloc.c
373 +++ b/mm/page_alloc.c
374 @@ -7957,6 +7957,7 @@ static void __init free_area_init_node(int nid)
375 pgdat_set_deferred_range(pgdat);
376
377 free_area_init_core(pgdat);
378 + lru_gen_init_pgdat(pgdat);
379 }
380
381 static void __init free_area_init_memoryless_node(int nid)
382 diff --git a/mm/vmscan.c b/mm/vmscan.c
383 index 0c47952714b26..65eb28448f216 100644
384 --- a/mm/vmscan.c
385 +++ b/mm/vmscan.c
386 @@ -54,6 +54,8 @@
387 #include <linux/shmem_fs.h>
388 #include <linux/ctype.h>
389 #include <linux/debugfs.h>
390 +#include <linux/rculist_nulls.h>
391 +#include <linux/random.h>
392
393 #include <asm/tlbflush.h>
394 #include <asm/div64.h>
395 @@ -134,11 +136,6 @@ struct scan_control {
396 /* Always discard instead of demoting to lower tier memory */
397 unsigned int no_demotion:1;
398
399 -#ifdef CONFIG_LRU_GEN
400 - /* help kswapd make better choices among multiple memcgs */
401 - unsigned long last_reclaimed;
402 -#endif
403 -
404 /* Allocation order */
405 s8 order;
406
407 @@ -3160,6 +3157,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
408 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
409 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
410
411 +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
412 +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
413 +
414 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
415 {
416 struct pglist_data *pgdat = NODE_DATA(nid);
417 @@ -4440,8 +4440,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
418 if (sc->priority <= DEF_PRIORITY - 2)
419 wait_event_killable(lruvec->mm_state.wait,
420 max_seq < READ_ONCE(lrugen->max_seq));
421 -
422 - return max_seq < READ_ONCE(lrugen->max_seq);
423 + return false;
424 }
425
426 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
427 @@ -4514,8 +4513,6 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
428
429 VM_WARN_ON_ONCE(!current_is_kswapd());
430
431 - sc->last_reclaimed = sc->nr_reclaimed;
432 -
433 /* check the order to exclude compaction-induced reclaim */
434 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
435 return;
436 @@ -5104,8 +5101,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
437 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
438 * reclaim.
439 */
440 -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
441 - bool can_swap)
442 +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
443 {
444 unsigned long nr_to_scan;
445 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
446 @@ -5122,10 +5118,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
447 if (sc->priority == DEF_PRIORITY)
448 return nr_to_scan;
449
450 - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
451 -
452 /* skip this lruvec as it's low on cold folios */
453 - return 0;
454 + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
455 }
456
457 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
458 @@ -5134,29 +5128,18 @@ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
459 if (!global_reclaim(sc))
460 return -1;
461
462 - /* discount the previous progress for kswapd */
463 - if (current_is_kswapd())
464 - return sc->nr_to_reclaim + sc->last_reclaimed;
465 -
466 return max(sc->nr_to_reclaim, compact_gap(sc->order));
467 }
468
469 -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
470 +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
471 {
472 - struct blk_plug plug;
473 + long nr_to_scan;
474 unsigned long scanned = 0;
475 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
476
477 - lru_add_drain();
478 -
479 - blk_start_plug(&plug);
480 -
481 - set_mm_walk(lruvec_pgdat(lruvec));
482 -
483 while (true) {
484 int delta;
485 int swappiness;
486 - unsigned long nr_to_scan;
487
488 if (sc->may_swap)
489 swappiness = get_swappiness(lruvec, sc);
490 @@ -5166,7 +5149,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
491 swappiness = 0;
492
493 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
494 - if (!nr_to_scan)
495 + if (nr_to_scan <= 0)
496 break;
497
498 delta = evict_folios(lruvec, sc, swappiness);
499 @@ -5183,11 +5166,252 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
500 cond_resched();
501 }
502
503 + /* whether try_to_inc_max_seq() was successful */
504 + return nr_to_scan < 0;
505 +}
506 +
507 +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
508 +{
509 + bool success;
510 + unsigned long scanned = sc->nr_scanned;
511 + unsigned long reclaimed = sc->nr_reclaimed;
512 + int seg = lru_gen_memcg_seg(lruvec);
513 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
514 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
515 +
516 + /* see the comment on MEMCG_NR_GENS */
517 + if (!lruvec_is_sizable(lruvec, sc))
518 + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
519 +
520 + mem_cgroup_calculate_protection(NULL, memcg);
521 +
522 + if (mem_cgroup_below_min(memcg))
523 + return MEMCG_LRU_YOUNG;
524 +
525 + if (mem_cgroup_below_low(memcg)) {
526 + /* see the comment on MEMCG_NR_GENS */
527 + if (seg != MEMCG_LRU_TAIL)
528 + return MEMCG_LRU_TAIL;
529 +
530 + memcg_memory_event(memcg, MEMCG_LOW);
531 + }
532 +
533 + success = try_to_shrink_lruvec(lruvec, sc);
534 +
535 + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
536 +
537 + if (!sc->proactive)
538 + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
539 + sc->nr_reclaimed - reclaimed);
540 +
541 + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
542 + current->reclaim_state->reclaimed_slab = 0;
543 +
544 + return success ? MEMCG_LRU_YOUNG : 0;
545 +}
546 +
547 +#ifdef CONFIG_MEMCG
548 +
549 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
550 +{
551 + int gen;
552 + int bin;
553 + int first_bin;
554 + struct lruvec *lruvec;
555 + struct lru_gen_folio *lrugen;
556 + const struct hlist_nulls_node *pos;
557 + int op = 0;
558 + struct mem_cgroup *memcg = NULL;
559 + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
560 +
561 + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
562 +restart:
563 + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
564 +
565 + rcu_read_lock();
566 +
567 + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
568 + if (op)
569 + lru_gen_rotate_memcg(lruvec, op);
570 +
571 + mem_cgroup_put(memcg);
572 +
573 + lruvec = container_of(lrugen, struct lruvec, lrugen);
574 + memcg = lruvec_memcg(lruvec);
575 +
576 + if (!mem_cgroup_tryget(memcg)) {
577 + op = 0;
578 + memcg = NULL;
579 + continue;
580 + }
581 +
582 + rcu_read_unlock();
583 +
584 + op = shrink_one(lruvec, sc);
585 +
586 + if (sc->nr_reclaimed >= nr_to_reclaim)
587 + goto success;
588 +
589 + rcu_read_lock();
590 + }
591 +
592 + rcu_read_unlock();
593 +
594 + /* restart if raced with lru_gen_rotate_memcg() */
595 + if (gen != get_nulls_value(pos))
596 + goto restart;
597 +
598 + /* try the rest of the bins of the current generation */
599 + bin = get_memcg_bin(bin + 1);
600 + if (bin != first_bin)
601 + goto restart;
602 +success:
603 + if (op)
604 + lru_gen_rotate_memcg(lruvec, op);
605 +
606 + mem_cgroup_put(memcg);
607 +}
608 +
609 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
610 +{
611 + struct blk_plug plug;
612 +
613 + VM_WARN_ON_ONCE(global_reclaim(sc));
614 +
615 + lru_add_drain();
616 +
617 + blk_start_plug(&plug);
618 +
619 + set_mm_walk(lruvec_pgdat(lruvec));
620 +
621 + if (try_to_shrink_lruvec(lruvec, sc))
622 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
623 +
624 clear_mm_walk();
625
626 blk_finish_plug(&plug);
627 }
628
629 +#else /* !CONFIG_MEMCG */
630 +
631 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
632 +{
633 + BUILD_BUG();
634 +}
635 +
636 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
637 +{
638 + BUILD_BUG();
639 +}
640 +
641 +#endif
642 +
643 +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
644 +{
645 + int priority;
646 + unsigned long reclaimable;
647 + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
648 +
649 + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
650 + return;
651 + /*
652 + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
653 + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
654 + * estimated reclaimed_to_scanned_ratio = inactive / total.
655 + */
656 + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
657 + if (get_swappiness(lruvec, sc))
658 + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
659 +
660 + reclaimable /= MEMCG_NR_GENS;
661 +
662 + /* round down reclaimable and round up sc->nr_to_reclaim */
663 + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
664 +
665 + sc->priority = clamp(priority, 0, DEF_PRIORITY);
666 +}
667 +
668 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
669 +{
670 + struct blk_plug plug;
671 + unsigned long reclaimed = sc->nr_reclaimed;
672 +
673 + VM_WARN_ON_ONCE(!global_reclaim(sc));
674 +
675 + lru_add_drain();
676 +
677 + blk_start_plug(&plug);
678 +
679 + set_mm_walk(pgdat);
680 +
681 + set_initial_priority(pgdat, sc);
682 +
683 + if (current_is_kswapd())
684 + sc->nr_reclaimed = 0;
685 +
686 + if (mem_cgroup_disabled())
687 + shrink_one(&pgdat->__lruvec, sc);
688 + else
689 + shrink_many(pgdat, sc);
690 +
691 + if (current_is_kswapd())
692 + sc->nr_reclaimed += reclaimed;
693 +
694 + clear_mm_walk();
695 +
696 + blk_finish_plug(&plug);
697 +
698 + /* kswapd should never fail */
699 + pgdat->kswapd_failures = 0;
700 +}
701 +
702 +#ifdef CONFIG_MEMCG
703 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
704 +{
705 + int seg;
706 + int old, new;
707 + int bin = get_random_u32_below(MEMCG_NR_BINS);
708 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
709 +
710 + spin_lock(&pgdat->memcg_lru.lock);
711 +
712 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
713 +
714 + seg = 0;
715 + new = old = lruvec->lrugen.gen;
716 +
717 + /* see the comment on MEMCG_NR_GENS */
718 + if (op == MEMCG_LRU_HEAD)
719 + seg = MEMCG_LRU_HEAD;
720 + else if (op == MEMCG_LRU_TAIL)
721 + seg = MEMCG_LRU_TAIL;
722 + else if (op == MEMCG_LRU_OLD)
723 + new = get_memcg_gen(pgdat->memcg_lru.seq);
724 + else if (op == MEMCG_LRU_YOUNG)
725 + new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
726 + else
727 + VM_WARN_ON_ONCE(true);
728 +
729 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
730 +
731 + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
732 + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
733 + else
734 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
735 +
736 + pgdat->memcg_lru.nr_memcgs[old]--;
737 + pgdat->memcg_lru.nr_memcgs[new]++;
738 +
739 + lruvec->lrugen.gen = new;
740 + WRITE_ONCE(lruvec->lrugen.seg, seg);
741 +
742 + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
743 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
744 +
745 + spin_unlock(&pgdat->memcg_lru.lock);
746 +}
747 +#endif
748 +
749 /******************************************************************************
750 * state change
751 ******************************************************************************/
752 @@ -5644,11 +5868,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
753
754 if (!mem_cgroup_disabled()) {
755 rcu_read_lock();
756 +
757 memcg = mem_cgroup_from_id(memcg_id);
758 -#ifdef CONFIG_MEMCG
759 - if (memcg && !css_tryget(&memcg->css))
760 + if (!mem_cgroup_tryget(memcg))
761 memcg = NULL;
762 -#endif
763 +
764 rcu_read_unlock();
765
766 if (!memcg)
767 @@ -5796,6 +6020,19 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
768 }
769
770 #ifdef CONFIG_MEMCG
771 +
772 +void lru_gen_init_pgdat(struct pglist_data *pgdat)
773 +{
774 + int i, j;
775 +
776 + spin_lock_init(&pgdat->memcg_lru.lock);
777 +
778 + for (i = 0; i < MEMCG_NR_GENS; i++) {
779 + for (j = 0; j < MEMCG_NR_BINS; j++)
780 + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
781 + }
782 +}
783 +
784 void lru_gen_init_memcg(struct mem_cgroup *memcg)
785 {
786 INIT_LIST_HEAD(&memcg->mm_list.fifo);
787 @@ -5819,7 +6056,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
788 }
789 }
790 }
791 -#endif
792 +
793 +void lru_gen_online_memcg(struct mem_cgroup *memcg)
794 +{
795 + int gen;
796 + int nid;
797 + int bin = get_random_u32_below(MEMCG_NR_BINS);
798 +
799 + for_each_node(nid) {
800 + struct pglist_data *pgdat = NODE_DATA(nid);
801 + struct lruvec *lruvec = get_lruvec(memcg, nid);
802 +
803 + spin_lock(&pgdat->memcg_lru.lock);
804 +
805 + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
806 +
807 + gen = get_memcg_gen(pgdat->memcg_lru.seq);
808 +
809 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
810 + pgdat->memcg_lru.nr_memcgs[gen]++;
811 +
812 + lruvec->lrugen.gen = gen;
813 +
814 + spin_unlock(&pgdat->memcg_lru.lock);
815 + }
816 +}
817 +
818 +void lru_gen_offline_memcg(struct mem_cgroup *memcg)
819 +{
820 + int nid;
821 +
822 + for_each_node(nid) {
823 + struct lruvec *lruvec = get_lruvec(memcg, nid);
824 +
825 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
826 + }
827 +}
828 +
829 +void lru_gen_release_memcg(struct mem_cgroup *memcg)
830 +{
831 + int gen;
832 + int nid;
833 +
834 + for_each_node(nid) {
835 + struct pglist_data *pgdat = NODE_DATA(nid);
836 + struct lruvec *lruvec = get_lruvec(memcg, nid);
837 +
838 + spin_lock(&pgdat->memcg_lru.lock);
839 +
840 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
841 +
842 + gen = lruvec->lrugen.gen;
843 +
844 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
845 + pgdat->memcg_lru.nr_memcgs[gen]--;
846 +
847 + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
848 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
849 +
850 + spin_unlock(&pgdat->memcg_lru.lock);
851 + }
852 +}
853 +
854 +#endif /* CONFIG_MEMCG */
855
856 static int __init init_lru_gen(void)
857 {
858 @@ -5846,6 +6145,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
859 {
860 }
861
862 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
863 +{
864 +}
865 +
866 #endif /* CONFIG_LRU_GEN */
867
868 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
869 @@ -5859,7 +6162,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
870 bool proportional_reclaim;
871 struct blk_plug plug;
872
873 - if (lru_gen_enabled()) {
874 + if (lru_gen_enabled() && !global_reclaim(sc)) {
875 lru_gen_shrink_lruvec(lruvec, sc);
876 return;
877 }
878 @@ -6102,6 +6405,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
879 struct lruvec *target_lruvec;
880 bool reclaimable = false;
881
882 + if (lru_gen_enabled() && global_reclaim(sc)) {
883 + lru_gen_shrink_node(pgdat, sc);
884 + return;
885 + }
886 +
887 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
888
889 again:
890 --
891 2.40.1
892