f98588c4d897d368b157e0a30d8494bf7fe3af9f
[openwrt/openwrt.git] /
1 From 397624e12244ec038f51cb1f178ccb7a2ec562e5 Mon Sep 17 00:00:00 2001
2 From: "T.J. Alumbaugh" <talumbau@google.com>
3 Date: Wed, 18 Jan 2023 00:18:23 +0000
4 Subject: [PATCH 14/19] UPSTREAM: mm: multi-gen LRU: section for Bloom filters
5
6 Move Bloom filters code into a dedicated section. Improve the design doc
7 to explain Bloom filter usage and connection between aging and eviction in
8 their use.
9
10 Link: https://lkml.kernel.org/r/20230118001827.1040870-4-talumbau@google.com
11 Change-Id: I73e866f687c1ed9f5c8538086aa39408b79897db
12 Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
13 Cc: Yu Zhao <yuzhao@google.com>
14 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
15 (cherry picked from commit ccbbbb85945d8f0255aa9dbc1b617017e2294f2c)
16 Bug: 274865848
17 Signed-off-by: T.J. Mercier <tjmercier@google.com>
18 ---
19 Documentation/mm/multigen_lru.rst | 16 +++
20 mm/vmscan.c | 180 +++++++++++++++---------------
21 2 files changed, 108 insertions(+), 88 deletions(-)
22
23 diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
24 index bd988a142bc2f..770b5d539856c 100644
25 --- a/Documentation/mm/multigen_lru.rst
26 +++ b/Documentation/mm/multigen_lru.rst
27 @@ -170,6 +170,22 @@ promotes hot pages. If the scan was done cacheline efficiently, it
28 adds the PMD entry pointing to the PTE table to the Bloom filter. This
29 forms a feedback loop between the eviction and the aging.
30
31 +Bloom Filters
32 +-------------
33 +Bloom filters are a space and memory efficient data structure for set
34 +membership test, i.e., test if an element is not in the set or may be
35 +in the set.
36 +
37 +In the eviction path, specifically, in ``lru_gen_look_around()``, if a
38 +PMD has a sufficient number of hot pages, its address is placed in the
39 +filter. In the aging path, set membership means that the PTE range
40 +will be scanned for young pages.
41 +
42 +Note that Bloom filters are probabilistic on set membership. If a test
43 +is false positive, the cost is an additional scan of a range of PTEs,
44 +which may yield hot pages anyway. Parameters of the filter itself can
45 +control the false positive rate in the limit.
46 +
47 Summary
48 -------
49 The multi-gen LRU can be disassembled into the following parts:
50 diff --git a/mm/vmscan.c b/mm/vmscan.c
51 index 8fa82630240d6..74b4f9d660b56 100644
52 --- a/mm/vmscan.c
53 +++ b/mm/vmscan.c
54 @@ -3208,6 +3208,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
55 get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
56 }
57
58 +/******************************************************************************
59 + * Bloom filters
60 + ******************************************************************************/
61 +
62 +/*
63 + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
64 + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
65 + * bits in a bitmap, k is the number of hash functions and n is the number of
66 + * inserted items.
67 + *
68 + * Page table walkers use one of the two filters to reduce their search space.
69 + * To get rid of non-leaf entries that no longer have enough leaf entries, the
70 + * aging uses the double-buffering technique to flip to the other filter each
71 + * time it produces a new generation. For non-leaf entries that have enough
72 + * leaf entries, the aging carries them over to the next generation in
73 + * walk_pmd_range(); the eviction also report them when walking the rmap
74 + * in lru_gen_look_around().
75 + *
76 + * For future optimizations:
77 + * 1. It's not necessary to keep both filters all the time. The spare one can be
78 + * freed after the RCU grace period and reallocated if needed again.
79 + * 2. And when reallocating, it's worth scaling its size according to the number
80 + * of inserted entries in the other filter, to reduce the memory overhead on
81 + * small systems and false positives on large systems.
82 + * 3. Jenkins' hash function is an alternative to Knuth's.
83 + */
84 +#define BLOOM_FILTER_SHIFT 15
85 +
86 +static inline int filter_gen_from_seq(unsigned long seq)
87 +{
88 + return seq % NR_BLOOM_FILTERS;
89 +}
90 +
91 +static void get_item_key(void *item, int *key)
92 +{
93 + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
94 +
95 + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
96 +
97 + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
98 + key[1] = hash >> BLOOM_FILTER_SHIFT;
99 +}
100 +
101 +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
102 +{
103 + int key[2];
104 + unsigned long *filter;
105 + int gen = filter_gen_from_seq(seq);
106 +
107 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
108 + if (!filter)
109 + return true;
110 +
111 + get_item_key(item, key);
112 +
113 + return test_bit(key[0], filter) && test_bit(key[1], filter);
114 +}
115 +
116 +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
117 +{
118 + int key[2];
119 + unsigned long *filter;
120 + int gen = filter_gen_from_seq(seq);
121 +
122 + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
123 + if (!filter)
124 + return;
125 +
126 + get_item_key(item, key);
127 +
128 + if (!test_bit(key[0], filter))
129 + set_bit(key[0], filter);
130 + if (!test_bit(key[1], filter))
131 + set_bit(key[1], filter);
132 +}
133 +
134 +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
135 +{
136 + unsigned long *filter;
137 + int gen = filter_gen_from_seq(seq);
138 +
139 + filter = lruvec->mm_state.filters[gen];
140 + if (filter) {
141 + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
142 + return;
143 + }
144 +
145 + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
146 + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
147 + WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
148 +}
149 +
150 /******************************************************************************
151 * mm_struct list
152 ******************************************************************************/
153 @@ -3333,94 +3425,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
154 }
155 #endif
156
157 -/*
158 - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
159 - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
160 - * bits in a bitmap, k is the number of hash functions and n is the number of
161 - * inserted items.
162 - *
163 - * Page table walkers use one of the two filters to reduce their search space.
164 - * To get rid of non-leaf entries that no longer have enough leaf entries, the
165 - * aging uses the double-buffering technique to flip to the other filter each
166 - * time it produces a new generation. For non-leaf entries that have enough
167 - * leaf entries, the aging carries them over to the next generation in
168 - * walk_pmd_range(); the eviction also report them when walking the rmap
169 - * in lru_gen_look_around().
170 - *
171 - * For future optimizations:
172 - * 1. It's not necessary to keep both filters all the time. The spare one can be
173 - * freed after the RCU grace period and reallocated if needed again.
174 - * 2. And when reallocating, it's worth scaling its size according to the number
175 - * of inserted entries in the other filter, to reduce the memory overhead on
176 - * small systems and false positives on large systems.
177 - * 3. Jenkins' hash function is an alternative to Knuth's.
178 - */
179 -#define BLOOM_FILTER_SHIFT 15
180 -
181 -static inline int filter_gen_from_seq(unsigned long seq)
182 -{
183 - return seq % NR_BLOOM_FILTERS;
184 -}
185 -
186 -static void get_item_key(void *item, int *key)
187 -{
188 - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
189 -
190 - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
191 -
192 - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
193 - key[1] = hash >> BLOOM_FILTER_SHIFT;
194 -}
195 -
196 -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
197 -{
198 - unsigned long *filter;
199 - int gen = filter_gen_from_seq(seq);
200 -
201 - filter = lruvec->mm_state.filters[gen];
202 - if (filter) {
203 - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
204 - return;
205 - }
206 -
207 - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
208 - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
209 - WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
210 -}
211 -
212 -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
213 -{
214 - int key[2];
215 - unsigned long *filter;
216 - int gen = filter_gen_from_seq(seq);
217 -
218 - filter = READ_ONCE(lruvec->mm_state.filters[gen]);
219 - if (!filter)
220 - return;
221 -
222 - get_item_key(item, key);
223 -
224 - if (!test_bit(key[0], filter))
225 - set_bit(key[0], filter);
226 - if (!test_bit(key[1], filter))
227 - set_bit(key[1], filter);
228 -}
229 -
230 -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
231 -{
232 - int key[2];
233 - unsigned long *filter;
234 - int gen = filter_gen_from_seq(seq);
235 -
236 - filter = READ_ONCE(lruvec->mm_state.filters[gen]);
237 - if (!filter)
238 - return true;
239 -
240 - get_item_key(item, key);
241 -
242 - return test_bit(key[0], filter) && test_bit(key[1], filter);
243 -}
244 -
245 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
246 {
247 int i;
248 --
249 2.40.1
250