ef98d52b78d6166c523487aa4ea4be3610637bf6
[openwrt/staging/linusw.git] /
1 From 8ee8571e47aa75221e5fbd4c9c7802fc4244c346 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:19:04 -0700
4 Subject: [PATCH 06/19] BACKPORT: mm: multi-gen LRU: per-node lru_gen_folio
5 lists
6
7 For each node, memcgs are divided into two generations: the old and
8 the young. For each generation, memcgs are randomly sharded into
9 multiple bins to improve scalability. For each bin, an RCU hlist_nulls
10 is virtually divided into three segments: the head, the tail and the
11 default.
12
13 An onlining memcg is added to the tail of a random bin in the old
14 generation. The eviction starts at the head of a random bin in the old
15 generation. The per-node memcg generation counter, whose remainder (mod
16 2) indexes the old generation, is incremented when all its bins become
17 empty.
18
19 There are four operations:
20 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
21 its current generation (old or young) and updates its "seg" to
22 "head";
23 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
24 its current generation (old or young) and updates its "seg" to
25 "tail";
26 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
27 the old generation, updates its "gen" to "old" and resets its "seg"
28 to "default";
29 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
30 in the young generation, updates its "gen" to "young" and resets
31 its "seg" to "default".
32
33 The events that trigger the above operations are:
34 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
35 2. The first attempt to reclaim an memcg below low, which triggers
36 MEMCG_LRU_TAIL;
37 3. The first attempt to reclaim an memcg below reclaimable size
38 threshold, which triggers MEMCG_LRU_TAIL;
39 4. The second attempt to reclaim an memcg below reclaimable size
40 threshold, which triggers MEMCG_LRU_YOUNG;
41 5. Attempting to reclaim an memcg below min, which triggers
42 MEMCG_LRU_YOUNG;
43 6. Finishing the aging on the eviction path, which triggers
44 MEMCG_LRU_YOUNG;
45 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
46
47 Note that memcg LRU only applies to global reclaim, and the
48 round-robin incrementing of their max_seq counters ensures the
49 eventual fairness to all eligible memcgs. For memcg reclaim, it still
50 relies on mem_cgroup_iter().
51
52 Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
53 Signed-off-by: Yu Zhao <yuzhao@google.com>
54 Cc: Johannes Weiner <hannes@cmpxchg.org>
55 Cc: Jonathan Corbet <corbet@lwn.net>
56 Cc: Michael Larabel <Michael@MichaelLarabel.com>
57 Cc: Michal Hocko <mhocko@kernel.org>
58 Cc: Mike Rapoport <rppt@kernel.org>
59 Cc: Roman Gushchin <roman.gushchin@linux.dev>
60 Cc: Suren Baghdasaryan <surenb@google.com>
61 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
62 Bug: 274865848
63 (cherry picked from commit e4dde56cd208674ce899b47589f263499e5b8cdc)
64 [TJ: Resolved conflicts with older function signatures for
65 mem_cgroup_below_min / mem_cgroup_below_low and includes]
66 Change-Id: Idc8a0f635e035d72dd911f807d1224cb47cbd655
67 Signed-off-by: T.J. Mercier <tjmercier@google.com>
68 ---
69 include/linux/memcontrol.h | 10 +
70 include/linux/mm_inline.h | 17 ++
71 include/linux/mmzone.h | 117 +++++++++++-
72 mm/memcontrol.c | 16 ++
73 mm/page_alloc.c | 1 +
74 mm/vmscan.c | 374 +++++++++++++++++++++++++++++++++----
75 6 files changed, 500 insertions(+), 35 deletions(-)
76
77 --- a/include/linux/memcontrol.h
78 +++ b/include/linux/memcontrol.h
79 @@ -795,6 +795,11 @@ static inline void obj_cgroup_put(struct
80 percpu_ref_put(&objcg->refcnt);
81 }
82
83 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
84 +{
85 + return !memcg || css_tryget(&memcg->css);
86 +}
87 +
88 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
89 {
90 if (memcg)
91 @@ -1295,6 +1300,11 @@ static inline void obj_cgroup_put(struct
92 {
93 }
94
95 +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
96 +{
97 + return true;
98 +}
99 +
100 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
101 {
102 }
103 --- a/include/linux/mm_inline.h
104 +++ b/include/linux/mm_inline.h
105 @@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void
106 return current->in_lru_fault;
107 }
108
109 +#ifdef CONFIG_MEMCG
110 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
111 +{
112 + return READ_ONCE(lruvec->lrugen.seg);
113 +}
114 +#else
115 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
116 +{
117 + return 0;
118 +}
119 +#endif
120 +
121 static inline int lru_gen_from_seq(unsigned long seq)
122 {
123 return seq % MAX_NR_GENS;
124 @@ -297,6 +309,11 @@ static inline bool lru_gen_in_fault(void
125 return false;
126 }
127
128 +static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
129 +{
130 + return 0;
131 +}
132 +
133 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
134 {
135 return false;
136 --- a/include/linux/mmzone.h
137 +++ b/include/linux/mmzone.h
138 @@ -7,6 +7,7 @@
139
140 #include <linux/spinlock.h>
141 #include <linux/list.h>
142 +#include <linux/list_nulls.h>
143 #include <linux/wait.h>
144 #include <linux/bitops.h>
145 #include <linux/cache.h>
146 @@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
147 #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
148 #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
149
150 +/* see the comment on MEMCG_NR_GENS */
151 +enum {
152 + MEMCG_LRU_NOP,
153 + MEMCG_LRU_HEAD,
154 + MEMCG_LRU_TAIL,
155 + MEMCG_LRU_OLD,
156 + MEMCG_LRU_YOUNG,
157 +};
158 +
159 #ifdef CONFIG_LRU_GEN
160
161 enum {
162 @@ -426,6 +436,14 @@ struct lru_gen_folio {
163 atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
164 /* whether the multi-gen LRU is enabled */
165 bool enabled;
166 +#ifdef CONFIG_MEMCG
167 + /* the memcg generation this lru_gen_folio belongs to */
168 + u8 gen;
169 + /* the list segment this lru_gen_folio belongs to */
170 + u8 seg;
171 + /* per-node lru_gen_folio list for global reclaim */
172 + struct hlist_nulls_node list;
173 +#endif
174 };
175
176 enum {
177 @@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *
178 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
179
180 #ifdef CONFIG_MEMCG
181 +
182 +/*
183 + * For each node, memcgs are divided into two generations: the old and the
184 + * young. For each generation, memcgs are randomly sharded into multiple bins
185 + * to improve scalability. For each bin, the hlist_nulls is virtually divided
186 + * into three segments: the head, the tail and the default.
187 + *
188 + * An onlining memcg is added to the tail of a random bin in the old generation.
189 + * The eviction starts at the head of a random bin in the old generation. The
190 + * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
191 + * the old generation, is incremented when all its bins become empty.
192 + *
193 + * There are four operations:
194 + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
195 + * current generation (old or young) and updates its "seg" to "head";
196 + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
197 + * current generation (old or young) and updates its "seg" to "tail";
198 + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
199 + * generation, updates its "gen" to "old" and resets its "seg" to "default";
200 + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
201 + * young generation, updates its "gen" to "young" and resets its "seg" to
202 + * "default".
203 + *
204 + * The events that trigger the above operations are:
205 + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
206 + * 2. The first attempt to reclaim an memcg below low, which triggers
207 + * MEMCG_LRU_TAIL;
208 + * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
209 + * which triggers MEMCG_LRU_TAIL;
210 + * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
211 + * which triggers MEMCG_LRU_YOUNG;
212 + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
213 + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
214 + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
215 + *
216 + * Note that memcg LRU only applies to global reclaim, and the round-robin
217 + * incrementing of their max_seq counters ensures the eventual fairness to all
218 + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
219 + */
220 +#define MEMCG_NR_GENS 2
221 +#define MEMCG_NR_BINS 8
222 +
223 +struct lru_gen_memcg {
224 + /* the per-node memcg generation counter */
225 + unsigned long seq;
226 + /* each memcg has one lru_gen_folio per node */
227 + unsigned long nr_memcgs[MEMCG_NR_GENS];
228 + /* per-node lru_gen_folio list for global reclaim */
229 + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
230 + /* protects the above */
231 + spinlock_t lock;
232 +};
233 +
234 +void lru_gen_init_pgdat(struct pglist_data *pgdat);
235 +
236 void lru_gen_init_memcg(struct mem_cgroup *memcg);
237 void lru_gen_exit_memcg(struct mem_cgroup *memcg);
238 -#endif
239 +void lru_gen_online_memcg(struct mem_cgroup *memcg);
240 +void lru_gen_offline_memcg(struct mem_cgroup *memcg);
241 +void lru_gen_release_memcg(struct mem_cgroup *memcg);
242 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
243 +
244 +#else /* !CONFIG_MEMCG */
245 +
246 +#define MEMCG_NR_GENS 1
247 +
248 +struct lru_gen_memcg {
249 +};
250 +
251 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
252 +{
253 +}
254 +
255 +#endif /* CONFIG_MEMCG */
256
257 #else /* !CONFIG_LRU_GEN */
258
259 +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
260 +{
261 +}
262 +
263 static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
264 {
265 }
266 @@ -494,6 +587,7 @@ static inline void lru_gen_look_around(s
267 }
268
269 #ifdef CONFIG_MEMCG
270 +
271 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
272 {
273 }
274 @@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(st
275 static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
276 {
277 }
278 -#endif
279 +
280 +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
281 +{
282 +}
283 +
284 +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
285 +{
286 +}
287 +
288 +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
289 +{
290 +}
291 +
292 +static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
293 +{
294 +}
295 +
296 +#endif /* CONFIG_MEMCG */
297
298 #endif /* CONFIG_LRU_GEN */
299
300 @@ -1219,6 +1330,8 @@ typedef struct pglist_data {
301 #ifdef CONFIG_LRU_GEN
302 /* kswap mm walk data */
303 struct lru_gen_mm_walk mm_walk;
304 + /* lru_gen_folio list */
305 + struct lru_gen_memcg memcg_lru;
306 #endif
307
308 CACHELINE_PADDING(_pad2_);
309 --- a/mm/memcontrol.c
310 +++ b/mm/memcontrol.c
311 @@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struc
312 struct mem_cgroup_per_node *mz;
313 struct mem_cgroup_tree_per_node *mctz;
314
315 + if (lru_gen_enabled()) {
316 + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
317 +
318 + /* see the comment on MEMCG_NR_GENS */
319 + if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
320 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
321 +
322 + return;
323 + }
324 +
325 mctz = soft_limit_tree.rb_tree_per_node[nid];
326 if (!mctz)
327 return;
328 @@ -3523,6 +3533,9 @@ unsigned long mem_cgroup_soft_limit_recl
329 struct mem_cgroup_tree_per_node *mctz;
330 unsigned long excess;
331
332 + if (lru_gen_enabled())
333 + return 0;
334 +
335 if (order > 0)
336 return 0;
337
338 @@ -5383,6 +5396,7 @@ static int mem_cgroup_css_online(struct
339 if (unlikely(mem_cgroup_is_root(memcg)))
340 queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
341 2UL*HZ);
342 + lru_gen_online_memcg(memcg);
343 return 0;
344 offline_kmem:
345 memcg_offline_kmem(memcg);
346 @@ -5414,6 +5428,7 @@ static void mem_cgroup_css_offline(struc
347 memcg_offline_kmem(memcg);
348 reparent_shrinker_deferred(memcg);
349 wb_memcg_offline(memcg);
350 + lru_gen_offline_memcg(memcg);
351
352 drain_all_stock(memcg);
353
354 @@ -5425,6 +5440,7 @@ static void mem_cgroup_css_released(stru
355 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
356
357 invalidate_reclaim_iterators(memcg);
358 + lru_gen_release_memcg(memcg);
359 }
360
361 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
362 --- a/mm/page_alloc.c
363 +++ b/mm/page_alloc.c
364 @@ -7957,6 +7957,7 @@ static void __init free_area_init_node(i
365 pgdat_set_deferred_range(pgdat);
366
367 free_area_init_core(pgdat);
368 + lru_gen_init_pgdat(pgdat);
369 }
370
371 static void __init free_area_init_memoryless_node(int nid)
372 --- a/mm/vmscan.c
373 +++ b/mm/vmscan.c
374 @@ -54,6 +54,8 @@
375 #include <linux/shmem_fs.h>
376 #include <linux/ctype.h>
377 #include <linux/debugfs.h>
378 +#include <linux/rculist_nulls.h>
379 +#include <linux/random.h>
380
381 #include <asm/tlbflush.h>
382 #include <asm/div64.h>
383 @@ -134,11 +136,6 @@ struct scan_control {
384 /* Always discard instead of demoting to lower tier memory */
385 unsigned int no_demotion:1;
386
387 -#ifdef CONFIG_LRU_GEN
388 - /* help kswapd make better choices among multiple memcgs */
389 - unsigned long last_reclaimed;
390 -#endif
391 -
392 /* Allocation order */
393 s8 order;
394
395 @@ -3160,6 +3157,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
396 for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
397 for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
398
399 +#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
400 +#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
401 +
402 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
403 {
404 struct pglist_data *pgdat = NODE_DATA(nid);
405 @@ -4442,8 +4442,7 @@ done:
406 if (sc->priority <= DEF_PRIORITY - 2)
407 wait_event_killable(lruvec->mm_state.wait,
408 max_seq < READ_ONCE(lrugen->max_seq));
409 -
410 - return max_seq < READ_ONCE(lrugen->max_seq);
411 + return false;
412 }
413
414 VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
415 @@ -4516,8 +4515,6 @@ static void lru_gen_age_node(struct pgli
416
417 VM_WARN_ON_ONCE(!current_is_kswapd());
418
419 - sc->last_reclaimed = sc->nr_reclaimed;
420 -
421 /* check the order to exclude compaction-induced reclaim */
422 if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
423 return;
424 @@ -5116,8 +5113,7 @@ static bool should_run_aging(struct lruv
425 * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
426 * reclaim.
427 */
428 -static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
429 - bool can_swap)
430 +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
431 {
432 unsigned long nr_to_scan;
433 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
434 @@ -5134,10 +5130,8 @@ static unsigned long get_nr_to_scan(stru
435 if (sc->priority == DEF_PRIORITY)
436 return nr_to_scan;
437
438 - try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
439 -
440 /* skip this lruvec as it's low on cold folios */
441 - return 0;
442 + return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
443 }
444
445 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
446 @@ -5146,29 +5140,18 @@ static unsigned long get_nr_to_reclaim(s
447 if (!global_reclaim(sc))
448 return -1;
449
450 - /* discount the previous progress for kswapd */
451 - if (current_is_kswapd())
452 - return sc->nr_to_reclaim + sc->last_reclaimed;
453 -
454 return max(sc->nr_to_reclaim, compact_gap(sc->order));
455 }
456
457 -static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
458 +static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
459 {
460 - struct blk_plug plug;
461 + long nr_to_scan;
462 unsigned long scanned = 0;
463 unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
464
465 - lru_add_drain();
466 -
467 - blk_start_plug(&plug);
468 -
469 - set_mm_walk(lruvec_pgdat(lruvec));
470 -
471 while (true) {
472 int delta;
473 int swappiness;
474 - unsigned long nr_to_scan;
475
476 if (sc->may_swap)
477 swappiness = get_swappiness(lruvec, sc);
478 @@ -5178,7 +5161,7 @@ static void lru_gen_shrink_lruvec(struct
479 swappiness = 0;
480
481 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
482 - if (!nr_to_scan)
483 + if (nr_to_scan <= 0)
484 break;
485
486 delta = evict_folios(lruvec, sc, swappiness);
487 @@ -5195,10 +5178,251 @@ static void lru_gen_shrink_lruvec(struct
488 cond_resched();
489 }
490
491 + /* whether try_to_inc_max_seq() was successful */
492 + return nr_to_scan < 0;
493 +}
494 +
495 +static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
496 +{
497 + bool success;
498 + unsigned long scanned = sc->nr_scanned;
499 + unsigned long reclaimed = sc->nr_reclaimed;
500 + int seg = lru_gen_memcg_seg(lruvec);
501 + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
502 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
503 +
504 + /* see the comment on MEMCG_NR_GENS */
505 + if (!lruvec_is_sizable(lruvec, sc))
506 + return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
507 +
508 + mem_cgroup_calculate_protection(NULL, memcg);
509 +
510 + if (mem_cgroup_below_min(memcg))
511 + return MEMCG_LRU_YOUNG;
512 +
513 + if (mem_cgroup_below_low(memcg)) {
514 + /* see the comment on MEMCG_NR_GENS */
515 + if (seg != MEMCG_LRU_TAIL)
516 + return MEMCG_LRU_TAIL;
517 +
518 + memcg_memory_event(memcg, MEMCG_LOW);
519 + }
520 +
521 + success = try_to_shrink_lruvec(lruvec, sc);
522 +
523 + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
524 +
525 + if (!sc->proactive)
526 + vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
527 + sc->nr_reclaimed - reclaimed);
528 +
529 + sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
530 + current->reclaim_state->reclaimed_slab = 0;
531 +
532 + return success ? MEMCG_LRU_YOUNG : 0;
533 +}
534 +
535 +#ifdef CONFIG_MEMCG
536 +
537 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
538 +{
539 + int gen;
540 + int bin;
541 + int first_bin;
542 + struct lruvec *lruvec;
543 + struct lru_gen_folio *lrugen;
544 + const struct hlist_nulls_node *pos;
545 + int op = 0;
546 + struct mem_cgroup *memcg = NULL;
547 + unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
548 +
549 + bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
550 +restart:
551 + gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
552 +
553 + rcu_read_lock();
554 +
555 + hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
556 + if (op)
557 + lru_gen_rotate_memcg(lruvec, op);
558 +
559 + mem_cgroup_put(memcg);
560 +
561 + lruvec = container_of(lrugen, struct lruvec, lrugen);
562 + memcg = lruvec_memcg(lruvec);
563 +
564 + if (!mem_cgroup_tryget(memcg)) {
565 + op = 0;
566 + memcg = NULL;
567 + continue;
568 + }
569 +
570 + rcu_read_unlock();
571 +
572 + op = shrink_one(lruvec, sc);
573 +
574 + if (sc->nr_reclaimed >= nr_to_reclaim)
575 + goto success;
576 +
577 + rcu_read_lock();
578 + }
579 +
580 + rcu_read_unlock();
581 +
582 + /* restart if raced with lru_gen_rotate_memcg() */
583 + if (gen != get_nulls_value(pos))
584 + goto restart;
585 +
586 + /* try the rest of the bins of the current generation */
587 + bin = get_memcg_bin(bin + 1);
588 + if (bin != first_bin)
589 + goto restart;
590 +success:
591 + if (op)
592 + lru_gen_rotate_memcg(lruvec, op);
593 +
594 + mem_cgroup_put(memcg);
595 +}
596 +
597 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
598 +{
599 + struct blk_plug plug;
600 +
601 + VM_WARN_ON_ONCE(global_reclaim(sc));
602 +
603 + lru_add_drain();
604 +
605 + blk_start_plug(&plug);
606 +
607 + set_mm_walk(lruvec_pgdat(lruvec));
608 +
609 + if (try_to_shrink_lruvec(lruvec, sc))
610 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
611 +
612 + clear_mm_walk();
613 +
614 + blk_finish_plug(&plug);
615 +}
616 +
617 +#else /* !CONFIG_MEMCG */
618 +
619 +static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
620 +{
621 + BUILD_BUG();
622 +}
623 +
624 +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
625 +{
626 + BUILD_BUG();
627 +}
628 +
629 +#endif
630 +
631 +static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
632 +{
633 + int priority;
634 + unsigned long reclaimable;
635 + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
636 +
637 + if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
638 + return;
639 + /*
640 + * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
641 + * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
642 + * estimated reclaimed_to_scanned_ratio = inactive / total.
643 + */
644 + reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
645 + if (get_swappiness(lruvec, sc))
646 + reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
647 +
648 + reclaimable /= MEMCG_NR_GENS;
649 +
650 + /* round down reclaimable and round up sc->nr_to_reclaim */
651 + priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
652 +
653 + sc->priority = clamp(priority, 0, DEF_PRIORITY);
654 +}
655 +
656 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
657 +{
658 + struct blk_plug plug;
659 + unsigned long reclaimed = sc->nr_reclaimed;
660 +
661 + VM_WARN_ON_ONCE(!global_reclaim(sc));
662 +
663 + lru_add_drain();
664 +
665 + blk_start_plug(&plug);
666 +
667 + set_mm_walk(pgdat);
668 +
669 + set_initial_priority(pgdat, sc);
670 +
671 + if (current_is_kswapd())
672 + sc->nr_reclaimed = 0;
673 +
674 + if (mem_cgroup_disabled())
675 + shrink_one(&pgdat->__lruvec, sc);
676 + else
677 + shrink_many(pgdat, sc);
678 +
679 + if (current_is_kswapd())
680 + sc->nr_reclaimed += reclaimed;
681 +
682 clear_mm_walk();
683
684 blk_finish_plug(&plug);
685 +
686 + /* kswapd should never fail */
687 + pgdat->kswapd_failures = 0;
688 +}
689 +
690 +#ifdef CONFIG_MEMCG
691 +void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
692 +{
693 + int seg;
694 + int old, new;
695 + int bin = get_random_u32_below(MEMCG_NR_BINS);
696 + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
697 +
698 + spin_lock(&pgdat->memcg_lru.lock);
699 +
700 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
701 +
702 + seg = 0;
703 + new = old = lruvec->lrugen.gen;
704 +
705 + /* see the comment on MEMCG_NR_GENS */
706 + if (op == MEMCG_LRU_HEAD)
707 + seg = MEMCG_LRU_HEAD;
708 + else if (op == MEMCG_LRU_TAIL)
709 + seg = MEMCG_LRU_TAIL;
710 + else if (op == MEMCG_LRU_OLD)
711 + new = get_memcg_gen(pgdat->memcg_lru.seq);
712 + else if (op == MEMCG_LRU_YOUNG)
713 + new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
714 + else
715 + VM_WARN_ON_ONCE(true);
716 +
717 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
718 +
719 + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
720 + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
721 + else
722 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
723 +
724 + pgdat->memcg_lru.nr_memcgs[old]--;
725 + pgdat->memcg_lru.nr_memcgs[new]++;
726 +
727 + lruvec->lrugen.gen = new;
728 + WRITE_ONCE(lruvec->lrugen.seg, seg);
729 +
730 + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
731 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
732 +
733 + spin_unlock(&pgdat->memcg_lru.lock);
734 }
735 +#endif
736
737 /******************************************************************************
738 * state change
739 @@ -5656,11 +5880,11 @@ static int run_cmd(char cmd, int memcg_i
740
741 if (!mem_cgroup_disabled()) {
742 rcu_read_lock();
743 +
744 memcg = mem_cgroup_from_id(memcg_id);
745 -#ifdef CONFIG_MEMCG
746 - if (memcg && !css_tryget(&memcg->css))
747 + if (!mem_cgroup_tryget(memcg))
748 memcg = NULL;
749 -#endif
750 +
751 rcu_read_unlock();
752
753 if (!memcg)
754 @@ -5808,6 +6032,19 @@ void lru_gen_init_lruvec(struct lruvec *
755 }
756
757 #ifdef CONFIG_MEMCG
758 +
759 +void lru_gen_init_pgdat(struct pglist_data *pgdat)
760 +{
761 + int i, j;
762 +
763 + spin_lock_init(&pgdat->memcg_lru.lock);
764 +
765 + for (i = 0; i < MEMCG_NR_GENS; i++) {
766 + for (j = 0; j < MEMCG_NR_BINS; j++)
767 + INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
768 + }
769 +}
770 +
771 void lru_gen_init_memcg(struct mem_cgroup *memcg)
772 {
773 INIT_LIST_HEAD(&memcg->mm_list.fifo);
774 @@ -5831,7 +6068,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
775 }
776 }
777 }
778 -#endif
779 +
780 +void lru_gen_online_memcg(struct mem_cgroup *memcg)
781 +{
782 + int gen;
783 + int nid;
784 + int bin = get_random_u32_below(MEMCG_NR_BINS);
785 +
786 + for_each_node(nid) {
787 + struct pglist_data *pgdat = NODE_DATA(nid);
788 + struct lruvec *lruvec = get_lruvec(memcg, nid);
789 +
790 + spin_lock(&pgdat->memcg_lru.lock);
791 +
792 + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
793 +
794 + gen = get_memcg_gen(pgdat->memcg_lru.seq);
795 +
796 + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
797 + pgdat->memcg_lru.nr_memcgs[gen]++;
798 +
799 + lruvec->lrugen.gen = gen;
800 +
801 + spin_unlock(&pgdat->memcg_lru.lock);
802 + }
803 +}
804 +
805 +void lru_gen_offline_memcg(struct mem_cgroup *memcg)
806 +{
807 + int nid;
808 +
809 + for_each_node(nid) {
810 + struct lruvec *lruvec = get_lruvec(memcg, nid);
811 +
812 + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
813 + }
814 +}
815 +
816 +void lru_gen_release_memcg(struct mem_cgroup *memcg)
817 +{
818 + int gen;
819 + int nid;
820 +
821 + for_each_node(nid) {
822 + struct pglist_data *pgdat = NODE_DATA(nid);
823 + struct lruvec *lruvec = get_lruvec(memcg, nid);
824 +
825 + spin_lock(&pgdat->memcg_lru.lock);
826 +
827 + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
828 +
829 + gen = lruvec->lrugen.gen;
830 +
831 + hlist_nulls_del_rcu(&lruvec->lrugen.list);
832 + pgdat->memcg_lru.nr_memcgs[gen]--;
833 +
834 + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
835 + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
836 +
837 + spin_unlock(&pgdat->memcg_lru.lock);
838 + }
839 +}
840 +
841 +#endif /* CONFIG_MEMCG */
842
843 static int __init init_lru_gen(void)
844 {
845 @@ -5858,6 +6157,10 @@ static void lru_gen_shrink_lruvec(struct
846 {
847 }
848
849 +static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
850 +{
851 +}
852 +
853 #endif /* CONFIG_LRU_GEN */
854
855 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
856 @@ -5871,7 +6174,7 @@ static void shrink_lruvec(struct lruvec
857 bool proportional_reclaim;
858 struct blk_plug plug;
859
860 - if (lru_gen_enabled()) {
861 + if (lru_gen_enabled() && !global_reclaim(sc)) {
862 lru_gen_shrink_lruvec(lruvec, sc);
863 return;
864 }
865 @@ -6114,6 +6417,11 @@ static void shrink_node(pg_data_t *pgdat
866 struct lruvec *target_lruvec;
867 bool reclaimable = false;
868
869 + if (lru_gen_enabled() && global_reclaim(sc)) {
870 + lru_gen_shrink_node(pgdat, sc);
871 + return;
872 + }
873 +
874 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
875
876 again: