c044ad25af4c05366a2065c6f8e2eab473cc70af
[openwrt/staging/nbd.git] /
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Fri, 4 Jun 2021 17:17:38 +0200
4 Subject: [PATCH] wireguard: allowedips: free empty intermediate nodes when
5 removing single node
6
7 commit bf7b042dc62a31f66d3a41dd4dfc7806f267b307 upstream.
8
9 When removing single nodes, it's possible that that node's parent is an
10 empty intermediate node, in which case, it too should be removed.
11 Otherwise the trie fills up and never is fully emptied, leading to
12 gradual memory leaks over time for tries that are modified often. There
13 was originally code to do this, but was removed during refactoring in
14 2016 and never reworked. Now that we have proper parent pointers from
15 the previous commits, we can implement this properly.
16
17 In order to reduce branching and expensive comparisons, we want to keep
18 the double pointer for parent assignment (which lets us easily chain up
19 to the root), but we still need to actually get the parent's base
20 address. So encode the bit number into the last two bits of the pointer,
21 and pack and unpack it as needed. This is a little bit clumsy but is the
22 fastest and less memory wasteful of the compromises. Note that we align
23 the root struct here to a minimum of 4, because it's embedded into a
24 larger struct, and we're relying on having the bottom two bits for our
25 flag, which would only be 16-bit aligned on m68k.
26
27 The existing macro-based helpers were a bit unwieldy for adding the bit
28 packing to, so this commit replaces them with safer and clearer ordinary
29 functions.
30
31 We add a test to the randomized/fuzzer part of the selftests, to free
32 the randomized tries by-peer, refuzz it, and repeat, until it's supposed
33 to be empty, and then then see if that actually resulted in the whole
34 thing being emptied. That combined with kmemcheck should hopefully make
35 sure this commit is doing what it should. Along the way this resulted in
36 various other cleanups of the tests and fixes for recent graphviz.
37
38 Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
39 Cc: stable@vger.kernel.org
40 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
41 Signed-off-by: David S. Miller <davem@davemloft.net>
42 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
43 ---
44 drivers/net/wireguard/allowedips.c | 102 ++++++------
45 drivers/net/wireguard/allowedips.h | 4 +-
46 drivers/net/wireguard/selftest/allowedips.c | 162 ++++++++++----------
47 3 files changed, 137 insertions(+), 131 deletions(-)
48
49 --- a/drivers/net/wireguard/allowedips.c
50 +++ b/drivers/net/wireguard/allowedips.c
51 @@ -30,8 +30,11 @@ static void copy_and_assign_cidr(struct
52 node->bitlen = bits;
53 memcpy(node->bits, src, bits / 8U);
54 }
55 -#define CHOOSE_NODE(parent, key) \
56 - parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
57 +
58 +static inline u8 choose(struct allowedips_node *node, const u8 *key)
59 +{
60 + return (key[node->bit_at_a] >> node->bit_at_b) & 1;
61 +}
62
63 static void push_rcu(struct allowedips_node **stack,
64 struct allowedips_node __rcu *p, unsigned int *len)
65 @@ -112,7 +115,7 @@ static struct allowedips_node *find_node
66 found = node;
67 if (node->cidr == bits)
68 break;
69 - node = rcu_dereference_bh(CHOOSE_NODE(node, key));
70 + node = rcu_dereference_bh(node->bit[choose(node, key)]);
71 }
72 return found;
73 }
74 @@ -144,8 +147,7 @@ static bool node_placement(struct allowe
75 u8 cidr, u8 bits, struct allowedips_node **rnode,
76 struct mutex *lock)
77 {
78 - struct allowedips_node *node = rcu_dereference_protected(trie,
79 - lockdep_is_held(lock));
80 + struct allowedips_node *node = rcu_dereference_protected(trie, lockdep_is_held(lock));
81 struct allowedips_node *parent = NULL;
82 bool exact = false;
83
84 @@ -155,13 +157,24 @@ static bool node_placement(struct allowe
85 exact = true;
86 break;
87 }
88 - node = rcu_dereference_protected(CHOOSE_NODE(parent, key),
89 - lockdep_is_held(lock));
90 + node = rcu_dereference_protected(parent->bit[choose(parent, key)], lockdep_is_held(lock));
91 }
92 *rnode = parent;
93 return exact;
94 }
95
96 +static inline void connect_node(struct allowedips_node **parent, u8 bit, struct allowedips_node *node)
97 +{
98 + node->parent_bit_packed = (unsigned long)parent | bit;
99 + rcu_assign_pointer(*parent, node);
100 +}
101 +
102 +static inline void choose_and_connect_node(struct allowedips_node *parent, struct allowedips_node *node)
103 +{
104 + u8 bit = choose(parent, node->bits);
105 + connect_node(&parent->bit[bit], bit, node);
106 +}
107 +
108 static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
109 u8 cidr, struct wg_peer *peer, struct mutex *lock)
110 {
111 @@ -177,8 +190,7 @@ static int add(struct allowedips_node __
112 RCU_INIT_POINTER(node->peer, peer);
113 list_add_tail(&node->peer_list, &peer->allowedips_list);
114 copy_and_assign_cidr(node, key, cidr, bits);
115 - rcu_assign_pointer(node->parent_bit, trie);
116 - rcu_assign_pointer(*trie, node);
117 + connect_node(trie, 2, node);
118 return 0;
119 }
120 if (node_placement(*trie, key, cidr, bits, &node, lock)) {
121 @@ -197,10 +209,10 @@ static int add(struct allowedips_node __
122 if (!node) {
123 down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
124 } else {
125 - down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
126 + const u8 bit = choose(node, key);
127 + down = rcu_dereference_protected(node->bit[bit], lockdep_is_held(lock));
128 if (!down) {
129 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
130 - rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
131 + connect_node(&node->bit[bit], bit, newnode);
132 return 0;
133 }
134 }
135 @@ -208,15 +220,11 @@ static int add(struct allowedips_node __
136 parent = node;
137
138 if (newnode->cidr == cidr) {
139 - rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
140 - rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
141 - if (!parent) {
142 - rcu_assign_pointer(newnode->parent_bit, trie);
143 - rcu_assign_pointer(*trie, newnode);
144 - } else {
145 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
146 - rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
147 - }
148 + choose_and_connect_node(newnode, down);
149 + if (!parent)
150 + connect_node(trie, 2, newnode);
151 + else
152 + choose_and_connect_node(parent, newnode);
153 return 0;
154 }
155
156 @@ -229,17 +237,12 @@ static int add(struct allowedips_node __
157 INIT_LIST_HEAD(&node->peer_list);
158 copy_and_assign_cidr(node, newnode->bits, cidr, bits);
159
160 - rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
161 - rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
162 - rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
163 - rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
164 - if (!parent) {
165 - rcu_assign_pointer(node->parent_bit, trie);
166 - rcu_assign_pointer(*trie, node);
167 - } else {
168 - rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
169 - rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
170 - }
171 + choose_and_connect_node(node, down);
172 + choose_and_connect_node(node, newnode);
173 + if (!parent)
174 + connect_node(trie, 2, node);
175 + else
176 + choose_and_connect_node(parent, node);
177 return 0;
178 }
179
180 @@ -297,7 +300,8 @@ int wg_allowedips_insert_v6(struct allow
181 void wg_allowedips_remove_by_peer(struct allowedips *table,
182 struct wg_peer *peer, struct mutex *lock)
183 {
184 - struct allowedips_node *node, *child, *tmp;
185 + struct allowedips_node *node, *child, **parent_bit, *parent, *tmp;
186 + bool free_parent;
187
188 if (list_empty(&peer->allowedips_list))
189 return;
190 @@ -307,19 +311,29 @@ void wg_allowedips_remove_by_peer(struct
191 RCU_INIT_POINTER(node->peer, NULL);
192 if (node->bit[0] && node->bit[1])
193 continue;
194 - child = rcu_dereference_protected(
195 - node->bit[!rcu_access_pointer(node->bit[0])],
196 - lockdep_is_held(lock));
197 + child = rcu_dereference_protected(node->bit[!rcu_access_pointer(node->bit[0])],
198 + lockdep_is_held(lock));
199 if (child)
200 - child->parent_bit = node->parent_bit;
201 - *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
202 + child->parent_bit_packed = node->parent_bit_packed;
203 + parent_bit = (struct allowedips_node **)(node->parent_bit_packed & ~3UL);
204 + *parent_bit = child;
205 + parent = (void *)parent_bit -
206 + offsetof(struct allowedips_node, bit[node->parent_bit_packed & 1]);
207 + free_parent = !rcu_access_pointer(node->bit[0]) &&
208 + !rcu_access_pointer(node->bit[1]) &&
209 + (node->parent_bit_packed & 3) <= 1 &&
210 + !rcu_access_pointer(parent->peer);
211 + if (free_parent)
212 + child = rcu_dereference_protected(
213 + parent->bit[!(node->parent_bit_packed & 1)],
214 + lockdep_is_held(lock));
215 call_rcu(&node->rcu, node_free_rcu);
216 -
217 - /* TODO: Note that we currently don't walk up and down in order to
218 - * free any potential filler nodes. This means that this function
219 - * doesn't free up as much as it could, which could be revisited
220 - * at some point.
221 - */
222 + if (!free_parent)
223 + continue;
224 + if (child)
225 + child->parent_bit_packed = parent->parent_bit_packed;
226 + *(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child;
227 + call_rcu(&parent->rcu, node_free_rcu);
228 }
229 }
230
231 --- a/drivers/net/wireguard/allowedips.h
232 +++ b/drivers/net/wireguard/allowedips.h
233 @@ -19,7 +19,7 @@ struct allowedips_node {
234 u8 bits[16] __aligned(__alignof(u64));
235
236 /* Keep rarely used members at bottom to be beyond cache line. */
237 - struct allowedips_node *__rcu *parent_bit;
238 + unsigned long parent_bit_packed;
239 union {
240 struct list_head peer_list;
241 struct rcu_head rcu;
242 @@ -30,7 +30,7 @@ struct allowedips {
243 struct allowedips_node __rcu *root4;
244 struct allowedips_node __rcu *root6;
245 u64 seq;
246 -};
247 +} __aligned(4); /* We pack the lower 2 bits of &root, but m68k only gives 16-bit alignment. */
248
249 void wg_allowedips_init(struct allowedips *table);
250 void wg_allowedips_free(struct allowedips *table, struct mutex *mutex);
251 --- a/drivers/net/wireguard/selftest/allowedips.c
252 +++ b/drivers/net/wireguard/selftest/allowedips.c
253 @@ -19,32 +19,22 @@
254
255 #include <linux/siphash.h>
256
257 -static __init void swap_endian_and_apply_cidr(u8 *dst, const u8 *src, u8 bits,
258 - u8 cidr)
259 -{
260 - swap_endian(dst, src, bits);
261 - memset(dst + (cidr + 7) / 8, 0, bits / 8 - (cidr + 7) / 8);
262 - if (cidr)
263 - dst[(cidr + 7) / 8 - 1] &= ~0U << ((8 - (cidr % 8)) % 8);
264 -}
265 -
266 static __init void print_node(struct allowedips_node *node, u8 bits)
267 {
268 char *fmt_connection = KERN_DEBUG "\t\"%p/%d\" -> \"%p/%d\";\n";
269 - char *fmt_declaration = KERN_DEBUG
270 - "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
271 + char *fmt_declaration = KERN_DEBUG "\t\"%p/%d\"[style=%s, color=\"#%06x\"];\n";
272 + u8 ip1[16], ip2[16], cidr1, cidr2;
273 char *style = "dotted";
274 - u8 ip1[16], ip2[16];
275 u32 color = 0;
276
277 + if (node == NULL)
278 + return;
279 if (bits == 32) {
280 fmt_connection = KERN_DEBUG "\t\"%pI4/%d\" -> \"%pI4/%d\";\n";
281 - fmt_declaration = KERN_DEBUG
282 - "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
283 + fmt_declaration = KERN_DEBUG "\t\"%pI4/%d\"[style=%s, color=\"#%06x\"];\n";
284 } else if (bits == 128) {
285 fmt_connection = KERN_DEBUG "\t\"%pI6/%d\" -> \"%pI6/%d\";\n";
286 - fmt_declaration = KERN_DEBUG
287 - "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
288 + fmt_declaration = KERN_DEBUG "\t\"%pI6/%d\"[style=%s, color=\"#%06x\"];\n";
289 }
290 if (node->peer) {
291 hsiphash_key_t key = { { 0 } };
292 @@ -55,24 +45,20 @@ static __init void print_node(struct all
293 hsiphash_1u32(0xabad1dea, &key) % 200;
294 style = "bold";
295 }
296 - swap_endian_and_apply_cidr(ip1, node->bits, bits, node->cidr);
297 - printk(fmt_declaration, ip1, node->cidr, style, color);
298 + wg_allowedips_read_node(node, ip1, &cidr1);
299 + printk(fmt_declaration, ip1, cidr1, style, color);
300 if (node->bit[0]) {
301 - swap_endian_and_apply_cidr(ip2,
302 - rcu_dereference_raw(node->bit[0])->bits, bits,
303 - node->cidr);
304 - printk(fmt_connection, ip1, node->cidr, ip2,
305 - rcu_dereference_raw(node->bit[0])->cidr);
306 - print_node(rcu_dereference_raw(node->bit[0]), bits);
307 + wg_allowedips_read_node(rcu_dereference_raw(node->bit[0]), ip2, &cidr2);
308 + printk(fmt_connection, ip1, cidr1, ip2, cidr2);
309 }
310 if (node->bit[1]) {
311 - swap_endian_and_apply_cidr(ip2,
312 - rcu_dereference_raw(node->bit[1])->bits,
313 - bits, node->cidr);
314 - printk(fmt_connection, ip1, node->cidr, ip2,
315 - rcu_dereference_raw(node->bit[1])->cidr);
316 - print_node(rcu_dereference_raw(node->bit[1]), bits);
317 + wg_allowedips_read_node(rcu_dereference_raw(node->bit[1]), ip2, &cidr2);
318 + printk(fmt_connection, ip1, cidr1, ip2, cidr2);
319 }
320 + if (node->bit[0])
321 + print_node(rcu_dereference_raw(node->bit[0]), bits);
322 + if (node->bit[1])
323 + print_node(rcu_dereference_raw(node->bit[1]), bits);
324 }
325
326 static __init void print_tree(struct allowedips_node __rcu *top, u8 bits)
327 @@ -121,8 +107,8 @@ static __init inline union nf_inet_addr
328 {
329 union nf_inet_addr mask;
330
331 - memset(&mask, 0x00, 128 / 8);
332 - memset(&mask, 0xff, cidr / 8);
333 + memset(&mask, 0, sizeof(mask));
334 + memset(&mask.all, 0xff, cidr / 8);
335 if (cidr % 32)
336 mask.all[cidr / 32] = (__force u32)htonl(
337 (0xFFFFFFFFUL << (32 - (cidr % 32))) & 0xFFFFFFFFUL);
338 @@ -149,42 +135,36 @@ horrible_mask_self(struct horrible_allow
339 }
340
341 static __init inline bool
342 -horrible_match_v4(const struct horrible_allowedips_node *node,
343 - struct in_addr *ip)
344 +horrible_match_v4(const struct horrible_allowedips_node *node, struct in_addr *ip)
345 {
346 return (ip->s_addr & node->mask.ip) == node->ip.ip;
347 }
348
349 static __init inline bool
350 -horrible_match_v6(const struct horrible_allowedips_node *node,
351 - struct in6_addr *ip)
352 +horrible_match_v6(const struct horrible_allowedips_node *node, struct in6_addr *ip)
353 {
354 - return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) ==
355 - node->ip.ip6[0] &&
356 - (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) ==
357 - node->ip.ip6[1] &&
358 - (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) ==
359 - node->ip.ip6[2] &&
360 + return (ip->in6_u.u6_addr32[0] & node->mask.ip6[0]) == node->ip.ip6[0] &&
361 + (ip->in6_u.u6_addr32[1] & node->mask.ip6[1]) == node->ip.ip6[1] &&
362 + (ip->in6_u.u6_addr32[2] & node->mask.ip6[2]) == node->ip.ip6[2] &&
363 (ip->in6_u.u6_addr32[3] & node->mask.ip6[3]) == node->ip.ip6[3];
364 }
365
366 static __init void
367 -horrible_insert_ordered(struct horrible_allowedips *table,
368 - struct horrible_allowedips_node *node)
369 +horrible_insert_ordered(struct horrible_allowedips *table, struct horrible_allowedips_node *node)
370 {
371 struct horrible_allowedips_node *other = NULL, *where = NULL;
372 u8 my_cidr = horrible_mask_to_cidr(node->mask);
373
374 hlist_for_each_entry(other, &table->head, table) {
375 - if (!memcmp(&other->mask, &node->mask,
376 - sizeof(union nf_inet_addr)) &&
377 - !memcmp(&other->ip, &node->ip,
378 - sizeof(union nf_inet_addr)) &&
379 - other->ip_version == node->ip_version) {
380 + if (other->ip_version == node->ip_version &&
381 + !memcmp(&other->mask, &node->mask, sizeof(union nf_inet_addr)) &&
382 + !memcmp(&other->ip, &node->ip, sizeof(union nf_inet_addr))) {
383 other->value = node->value;
384 kfree(node);
385 return;
386 }
387 + }
388 + hlist_for_each_entry(other, &table->head, table) {
389 where = other;
390 if (horrible_mask_to_cidr(other->mask) <= my_cidr)
391 break;
392 @@ -201,8 +181,7 @@ static __init int
393 horrible_allowedips_insert_v4(struct horrible_allowedips *table,
394 struct in_addr *ip, u8 cidr, void *value)
395 {
396 - struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
397 - GFP_KERNEL);
398 + struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
399
400 if (unlikely(!node))
401 return -ENOMEM;
402 @@ -219,8 +198,7 @@ static __init int
403 horrible_allowedips_insert_v6(struct horrible_allowedips *table,
404 struct in6_addr *ip, u8 cidr, void *value)
405 {
406 - struct horrible_allowedips_node *node = kzalloc(sizeof(*node),
407 - GFP_KERNEL);
408 + struct horrible_allowedips_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
409
410 if (unlikely(!node))
411 return -ENOMEM;
412 @@ -234,39 +212,43 @@ horrible_allowedips_insert_v6(struct hor
413 }
414
415 static __init void *
416 -horrible_allowedips_lookup_v4(struct horrible_allowedips *table,
417 - struct in_addr *ip)
418 +horrible_allowedips_lookup_v4(struct horrible_allowedips *table, struct in_addr *ip)
419 {
420 struct horrible_allowedips_node *node;
421 - void *ret = NULL;
422
423 hlist_for_each_entry(node, &table->head, table) {
424 - if (node->ip_version != 4)
425 - continue;
426 - if (horrible_match_v4(node, ip)) {
427 - ret = node->value;
428 - break;
429 - }
430 + if (node->ip_version == 4 && horrible_match_v4(node, ip))
431 + return node->value;
432 }
433 - return ret;
434 + return NULL;
435 }
436
437 static __init void *
438 -horrible_allowedips_lookup_v6(struct horrible_allowedips *table,
439 - struct in6_addr *ip)
440 +horrible_allowedips_lookup_v6(struct horrible_allowedips *table, struct in6_addr *ip)
441 {
442 struct horrible_allowedips_node *node;
443 - void *ret = NULL;
444
445 hlist_for_each_entry(node, &table->head, table) {
446 - if (node->ip_version != 6)
447 + if (node->ip_version == 6 && horrible_match_v6(node, ip))
448 + return node->value;
449 + }
450 + return NULL;
451 +}
452 +
453 +
454 +static __init void
455 +horrible_allowedips_remove_by_value(struct horrible_allowedips *table, void *value)
456 +{
457 + struct horrible_allowedips_node *node;
458 + struct hlist_node *h;
459 +
460 + hlist_for_each_entry_safe(node, h, &table->head, table) {
461 + if (node->value != value)
462 continue;
463 - if (horrible_match_v6(node, ip)) {
464 - ret = node->value;
465 - break;
466 - }
467 + hlist_del(&node->table);
468 + kfree(node);
469 }
470 - return ret;
471 +
472 }
473
474 static __init bool randomized_test(void)
475 @@ -397,23 +379,33 @@ static __init bool randomized_test(void)
476 print_tree(t.root6, 128);
477 }
478
479 - for (i = 0; i < NUM_QUERIES; ++i) {
480 - prandom_bytes(ip, 4);
481 - if (lookup(t.root4, 32, ip) !=
482 - horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
483 - pr_err("allowedips random self-test: FAIL\n");
484 - goto free;
485 + for (j = 0;; ++j) {
486 + for (i = 0; i < NUM_QUERIES; ++i) {
487 + prandom_bytes(ip, 4);
488 + if (lookup(t.root4, 32, ip) != horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip)) {
489 + horrible_allowedips_lookup_v4(&h, (struct in_addr *)ip);
490 + pr_err("allowedips random v4 self-test: FAIL\n");
491 + goto free;
492 + }
493 + prandom_bytes(ip, 16);
494 + if (lookup(t.root6, 128, ip) != horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
495 + pr_err("allowedips random v6 self-test: FAIL\n");
496 + goto free;
497 + }
498 }
499 + if (j >= NUM_PEERS)
500 + break;
501 + mutex_lock(&mutex);
502 + wg_allowedips_remove_by_peer(&t, peers[j], &mutex);
503 + mutex_unlock(&mutex);
504 + horrible_allowedips_remove_by_value(&h, peers[j]);
505 }
506
507 - for (i = 0; i < NUM_QUERIES; ++i) {
508 - prandom_bytes(ip, 16);
509 - if (lookup(t.root6, 128, ip) !=
510 - horrible_allowedips_lookup_v6(&h, (struct in6_addr *)ip)) {
511 - pr_err("allowedips random self-test: FAIL\n");
512 - goto free;
513 - }
514 + if (t.root4 || t.root6) {
515 + pr_err("allowedips random self-test removal: FAIL\n");
516 + goto free;
517 }
518 +
519 ret = true;
520
521 free: