fe32acc9851223e553576ecf6e284812df3625fa
[openwrt/openwrt.git] /
1 From 8c20e2eb5f2a0175b774134685e4d7bd93e85ff8 Mon Sep 17 00:00:00 2001
2 From: Yu Zhao <yuzhao@google.com>
3 Date: Wed, 21 Dec 2022 21:18:59 -0700
4 Subject: [PATCH 01/19] UPSTREAM: mm: multi-gen LRU: rename lru_gen_struct to
5 lru_gen_folio
6
7 Patch series "mm: multi-gen LRU: memcg LRU", v3.
8
9 Overview
10 ========
11
12 An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
13 since each node and memcg combination has an LRU of folios (see
14 mem_cgroup_lruvec()).
15
16 Its goal is to improve the scalability of global reclaim, which is
17 critical to system-wide memory overcommit in data centers. Note that
18 memcg reclaim is currently out of scope.
19
20 Its memory overhead is one pointer per lruvec and a negligible amount per
21 pglist_data. In terms of traversing memcgs during global reclaim, it
22 improves the best-case complexity from O(n) to O(1) and does not affect
23 the worst-case complexity O(n). Therefore, on average, it has a sublinear
24 complexity in contrast to the current linear complexity.
25
26 The basic structure of an memcg LRU can be understood by an analogy to
27 the active/inactive LRU (of folios):
28 1. It has the young and the old (generations), i.e., the counterparts
29 to the active and the inactive;
30 2. The increment of max_seq triggers promotion, i.e., the counterpart
31 to activation;
32 3. Other events trigger similar operations, e.g., offlining an memcg
33 triggers demotion, i.e., the counterpart to deactivation.
34
35 In terms of global reclaim, it has two distinct features:
36 1. Sharding, which allows each thread to start at a random memcg (in
37 the old generation) and improves parallelism;
38 2. Eventual fairness, which allows direct reclaim to bail out at will
39 and reduces latency without affecting fairness over some time.
40
41 The commit message in patch 6 details the workflow:
42 https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/
43
44 The following is a simple test to quickly verify its effectiveness.
45
46 Test design:
47 1. Create multiple memcgs.
48 2. Each memcg contains a job (fio).
49 3. All jobs access the same amount of memory randomly.
50 4. The system does not experience global memory pressure.
51 5. Periodically write to the root memory.reclaim.
52
53 Desired outcome:
54 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
55 over mean(pgsteal) is close to 0%.
56 2. The total pgsteal is close to the total requested through
57 memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
58 to 100%.
59
60 Actual outcome [1]:
61                                    MGLRU off    MGLRU on
62   stddev(pgsteal) / mean(pgsteal)  75%          20%
63   sum(pgsteal) / sum(requested)    425%         95%
64
65 ####################################################################
66 MEMCGS=128
67
68 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
69 mkdir /sys/fs/cgroup/memcg$memcg
70 done
71
72 start() {
73 echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
74
75 fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
76 --filename=/dev/zero --size=1920M --rw=randrw \
77 --rate=64m,64m --random_distribution=random \
78 --fadvise_hint=0 --time_based --runtime=10h \
79 --group_reporting --minimal
80 }
81
82 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
83 start &
84 done
85
86 sleep 600
87
88 for ((i = 0; i < 600; i++)); do
89 echo 256m >/sys/fs/cgroup/memory.reclaim
90 sleep 6
91 done
92
93 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
94 grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
95 done
96 ####################################################################
97
98 [1]: This was obtained from running the above script (touches less
99 than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
100 hour.
101
102 This patch (of 8):
103
104 The new name lru_gen_folio will be more distinct from the coming
105 lru_gen_memcg.
106
107 Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
108 Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
109 Signed-off-by: Yu Zhao <yuzhao@google.com>
110 Cc: Johannes Weiner <hannes@cmpxchg.org>
111 Cc: Jonathan Corbet <corbet@lwn.net>
112 Cc: Michael Larabel <Michael@MichaelLarabel.com>
113 Cc: Michal Hocko <mhocko@kernel.org>
114 Cc: Mike Rapoport <rppt@kernel.org>
115 Cc: Roman Gushchin <roman.gushchin@linux.dev>
116 Cc: Suren Baghdasaryan <surenb@google.com>
117 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
118 Bug: 274865848
119 (cherry picked from commit 391655fe08d1f942359a11148aa9aaf3f99d6d6f)
120 Change-Id: I7df67e0e2435ba28f10eaa57d28d98b61a9210a6
121 Signed-off-by: T.J. Mercier <tjmercier@google.com>
122 ---
123 include/linux/mm_inline.h | 4 ++--
124 include/linux/mmzone.h | 6 +++---
125 mm/vmscan.c | 34 +++++++++++++++++-----------------
126 mm/workingset.c | 4 ++--
127 4 files changed, 24 insertions(+), 24 deletions(-)
128
129 --- a/include/linux/mm_inline.h
130 +++ b/include/linux/mm_inline.h
131 @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(s
132 int zone = folio_zonenum(folio);
133 int delta = folio_nr_pages(folio);
134 enum lru_list lru = type * LRU_INACTIVE_FILE;
135 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
136 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
137
138 VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
139 VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
140 @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(str
141 int gen = folio_lru_gen(folio);
142 int type = folio_is_file_lru(folio);
143 int zone = folio_zonenum(folio);
144 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
145 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
146
147 VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
148
149 --- a/include/linux/mmzone.h
150 +++ b/include/linux/mmzone.h
151 @@ -404,7 +404,7 @@ enum {
152 * The number of pages in each generation is eventually consistent and therefore
153 * can be transiently negative when reset_batch_size() is pending.
154 */
155 -struct lru_gen_struct {
156 +struct lru_gen_folio {
157 /* the aging increments the youngest generation number */
158 unsigned long max_seq;
159 /* the eviction increments the oldest generation numbers */
160 @@ -461,7 +461,7 @@ struct lru_gen_mm_state {
161 struct lru_gen_mm_walk {
162 /* the lruvec under reclaim */
163 struct lruvec *lruvec;
164 - /* unstable max_seq from lru_gen_struct */
165 + /* unstable max_seq from lru_gen_folio */
166 unsigned long max_seq;
167 /* the next address within an mm to scan */
168 unsigned long next_addr;
169 @@ -524,7 +524,7 @@ struct lruvec {
170 unsigned long flags;
171 #ifdef CONFIG_LRU_GEN
172 /* evictable pages divided into generations */
173 - struct lru_gen_struct lrugen;
174 + struct lru_gen_folio lrugen;
175 /* to concurrently iterate lru_gen_mm_list */
176 struct lru_gen_mm_state mm_state;
177 #endif
178 --- a/mm/vmscan.c
179 +++ b/mm/vmscan.c
180 @@ -3190,7 +3190,7 @@ static int get_nr_gens(struct lruvec *lr
181
182 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
183 {
184 - /* see the comment on lru_gen_struct */
185 + /* see the comment on lru_gen_folio */
186 return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
187 get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
188 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
189 @@ -3596,7 +3596,7 @@ struct ctrl_pos {
190 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
191 struct ctrl_pos *pos)
192 {
193 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
194 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
195 int hist = lru_hist_from_seq(lrugen->min_seq[type]);
196
197 pos->refaulted = lrugen->avg_refaulted[type][tier] +
198 @@ -3611,7 +3611,7 @@ static void read_ctrl_pos(struct lruvec
199 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
200 {
201 int hist, tier;
202 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
203 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
204 bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
205 unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
206
207 @@ -3688,7 +3688,7 @@ static int folio_update_gen(struct folio
208 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
209 {
210 int type = folio_is_file_lru(folio);
211 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
212 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
213 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
214 unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
215
216 @@ -3733,7 +3733,7 @@ static void update_batch_size(struct lru
217 static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
218 {
219 int gen, type, zone;
220 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
221 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
222
223 walk->batched = 0;
224
225 @@ -4250,7 +4250,7 @@ static bool inc_min_seq(struct lruvec *l
226 {
227 int zone;
228 int remaining = MAX_LRU_BATCH;
229 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
230 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
231 int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
232
233 if (type == LRU_GEN_ANON && !can_swap)
234 @@ -4286,7 +4286,7 @@ static bool try_to_inc_min_seq(struct lr
235 {
236 int gen, type, zone;
237 bool success = false;
238 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
239 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
240 DEFINE_MIN_SEQ(lruvec);
241
242 VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
243 @@ -4307,7 +4307,7 @@ next:
244 ;
245 }
246
247 - /* see the comment on lru_gen_struct */
248 + /* see the comment on lru_gen_folio */
249 if (can_swap) {
250 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
251 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
252 @@ -4329,7 +4329,7 @@ static void inc_max_seq(struct lruvec *l
253 {
254 int prev, next;
255 int type, zone;
256 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
257 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
258
259 restart:
260 spin_lock_irq(&lruvec->lru_lock);
261 @@ -4389,7 +4389,7 @@ static bool try_to_inc_max_seq(struct lr
262 bool success;
263 struct lru_gen_mm_walk *walk;
264 struct mm_struct *mm = NULL;
265 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
266 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
267
268 VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
269
270 @@ -4454,7 +4454,7 @@ static bool should_run_aging(struct lruv
271 unsigned long old = 0;
272 unsigned long young = 0;
273 unsigned long total = 0;
274 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
275 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
276 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
277
278 for (type = !can_swap; type < ANON_AND_FILE; type++) {
279 @@ -4740,7 +4740,7 @@ static bool sort_folio(struct lruvec *lr
280 int delta = folio_nr_pages(folio);
281 int refs = folio_lru_refs(folio);
282 int tier = lru_tier_from_refs(refs);
283 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
284 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
285
286 VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
287
288 @@ -4848,7 +4848,7 @@ static int scan_folios(struct lruvec *lr
289 int scanned = 0;
290 int isolated = 0;
291 int remaining = MAX_LRU_BATCH;
292 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
293 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
294 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
295
296 VM_WARN_ON_ONCE(!list_empty(list));
297 @@ -5249,7 +5249,7 @@ done:
298
299 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
300 {
301 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
302 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
303
304 if (lrugen->enabled) {
305 enum lru_list lru;
306 @@ -5531,7 +5531,7 @@ static void lru_gen_seq_show_full(struct
307 int i;
308 int type, tier;
309 int hist = lru_hist_from_seq(seq);
310 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
311 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
312
313 for (tier = 0; tier < MAX_NR_TIERS; tier++) {
314 seq_printf(m, " %10d", tier);
315 @@ -5581,7 +5581,7 @@ static int lru_gen_seq_show(struct seq_f
316 unsigned long seq;
317 bool full = !debugfs_real_fops(m->file)->write;
318 struct lruvec *lruvec = v;
319 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
320 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
321 int nid = lruvec_pgdat(lruvec)->node_id;
322 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
323 DEFINE_MAX_SEQ(lruvec);
324 @@ -5835,7 +5835,7 @@ void lru_gen_init_lruvec(struct lruvec *
325 {
326 int i;
327 int gen, type, zone;
328 - struct lru_gen_struct *lrugen = &lruvec->lrugen;
329 + struct lru_gen_folio *lrugen = &lruvec->lrugen;
330
331 lrugen->max_seq = MIN_NR_GENS + 1;
332 lrugen->enabled = lru_gen_enabled();
333 --- a/mm/workingset.c
334 +++ b/mm/workingset.c
335 @@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct fol
336 unsigned long token;
337 unsigned long min_seq;
338 struct lruvec *lruvec;
339 - struct lru_gen_struct *lrugen;
340 + struct lru_gen_folio *lrugen;
341 int type = folio_is_file_lru(folio);
342 int delta = folio_nr_pages(folio);
343 int refs = folio_lru_refs(folio);
344 @@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio
345 unsigned long token;
346 unsigned long min_seq;
347 struct lruvec *lruvec;
348 - struct lru_gen_struct *lrugen;
349 + struct lru_gen_folio *lrugen;
350 struct mem_cgroup *memcg;
351 struct pglist_data *pgdat;
352 int type = folio_is_file_lru(folio);