147c133919951d5d6f633d2a47f8cb074df6064b
[openwrt/staging/neocturne.git] /
1 From 1771bbcc5bc99f569dd82ec9e1b7c397a2fb50ac Mon Sep 17 00:00:00 2001
2 From: "Jason A. Donenfeld" <Jason@zx2c4.com>
3 Date: Mon, 22 Feb 2021 17:25:48 +0100
4 Subject: [PATCH 123/124] wireguard: queueing: get rid of per-peer ring buffers
5 MIME-Version: 1.0
6 Content-Type: text/plain; charset=UTF-8
7 Content-Transfer-Encoding: 8bit
8
9 commit 8b5553ace83cced775eefd0f3f18b5c6214ccf7a upstream.
10
11 Having two ring buffers per-peer means that every peer results in two
12 massive ring allocations. On an 8-core x86_64 machine, this commit
13 reduces the per-peer allocation from 18,688 bytes to 1,856 bytes, which
14 is an 90% reduction. Ninety percent! With some single-machine
15 deployments approaching 500,000 peers, we're talking about a reduction
16 from 7 gigs of memory down to 700 megs of memory.
17
18 In order to get rid of these per-peer allocations, this commit switches
19 to using a list-based queueing approach. Currently GSO fragments are
20 chained together using the skb->next pointer (the skb_list_* singly
21 linked list approach), so we form the per-peer queue around the unused
22 skb->prev pointer (which sort of makes sense because the links are
23 pointing backwards). Use of skb_queue_* is not possible here, because
24 that is based on doubly linked lists and spinlocks. Multiple cores can
25 write into the queue at any given time, because its writes occur in the
26 start_xmit path or in the udp_recv path. But reads happen in a single
27 workqueue item per-peer, amounting to a multi-producer, single-consumer
28 paradigm.
29
30 The MPSC queue is implemented locklessly and never blocks. However, it
31 is not linearizable (though it is serializable), with a very tight and
32 unlikely race on writes, which, when hit (some tiny fraction of the
33 0.15% of partial adds on a fully loaded 16-core x86_64 system), causes
34 the queue reader to terminate early. However, because every packet sent
35 queues up the same workqueue item after it is fully added, the worker
36 resumes again, and stopping early isn't actually a problem, since at
37 that point the packet wouldn't have yet been added to the encryption
38 queue. These properties allow us to avoid disabling interrupts or
39 spinning. The design is based on Dmitry Vyukov's algorithm [1].
40
41 Performance-wise, ordinarily list-based queues aren't preferable to
42 ringbuffers, because of cache misses when following pointers around.
43 However, we *already* have to follow the adjacent pointers when working
44 through fragments, so there shouldn't actually be any change there. A
45 potential downside is that dequeueing is a bit more complicated, but the
46 ptr_ring structure used prior had a spinlock when dequeueing, so all and
47 all the difference appears to be a wash.
48
49 Actually, from profiling, the biggest performance hit, by far, of this
50 commit winds up being atomic_add_unless(count, 1, max) and atomic_
51 dec(count), which account for the majority of CPU time, according to
52 perf. In that sense, the previous ring buffer was superior in that it
53 could check if it was full by head==tail, which the list-based approach
54 cannot do.
55
56 But all and all, this enables us to get massive memory savings, allowing
57 WireGuard to scale for real world deployments, without taking much of a
58 performance hit.
59
60 [1] http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue
61
62 Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
63 Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
64 Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
65 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
66 Signed-off-by: Jakub Kicinski <kuba@kernel.org>
67 Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
68 ---
69 drivers/net/wireguard/device.c | 12 ++---
70 drivers/net/wireguard/device.h | 15 +++---
71 drivers/net/wireguard/peer.c | 28 ++++-------
72 drivers/net/wireguard/peer.h | 4 +-
73 drivers/net/wireguard/queueing.c | 86 +++++++++++++++++++++++++-------
74 drivers/net/wireguard/queueing.h | 45 ++++++++++++-----
75 drivers/net/wireguard/receive.c | 16 +++---
76 drivers/net/wireguard/send.c | 31 ++++--------
77 8 files changed, 144 insertions(+), 93 deletions(-)
78
79 --- a/drivers/net/wireguard/device.c
80 +++ b/drivers/net/wireguard/device.c
81 @@ -235,8 +235,8 @@ static void wg_destruct(struct net_devic
82 destroy_workqueue(wg->handshake_receive_wq);
83 destroy_workqueue(wg->handshake_send_wq);
84 destroy_workqueue(wg->packet_crypt_wq);
85 - wg_packet_queue_free(&wg->decrypt_queue, true);
86 - wg_packet_queue_free(&wg->encrypt_queue, true);
87 + wg_packet_queue_free(&wg->decrypt_queue);
88 + wg_packet_queue_free(&wg->encrypt_queue);
89 rcu_barrier(); /* Wait for all the peers to be actually freed. */
90 wg_ratelimiter_uninit();
91 memzero_explicit(&wg->static_identity, sizeof(wg->static_identity));
92 @@ -338,12 +338,12 @@ static int wg_newlink(struct net *src_ne
93 goto err_destroy_handshake_send;
94
95 ret = wg_packet_queue_init(&wg->encrypt_queue, wg_packet_encrypt_worker,
96 - true, MAX_QUEUED_PACKETS);
97 + MAX_QUEUED_PACKETS);
98 if (ret < 0)
99 goto err_destroy_packet_crypt;
100
101 ret = wg_packet_queue_init(&wg->decrypt_queue, wg_packet_decrypt_worker,
102 - true, MAX_QUEUED_PACKETS);
103 + MAX_QUEUED_PACKETS);
104 if (ret < 0)
105 goto err_free_encrypt_queue;
106
107 @@ -368,9 +368,9 @@ static int wg_newlink(struct net *src_ne
108 err_uninit_ratelimiter:
109 wg_ratelimiter_uninit();
110 err_free_decrypt_queue:
111 - wg_packet_queue_free(&wg->decrypt_queue, true);
112 + wg_packet_queue_free(&wg->decrypt_queue);
113 err_free_encrypt_queue:
114 - wg_packet_queue_free(&wg->encrypt_queue, true);
115 + wg_packet_queue_free(&wg->encrypt_queue);
116 err_destroy_packet_crypt:
117 destroy_workqueue(wg->packet_crypt_wq);
118 err_destroy_handshake_send:
119 --- a/drivers/net/wireguard/device.h
120 +++ b/drivers/net/wireguard/device.h
121 @@ -27,13 +27,14 @@ struct multicore_worker {
122
123 struct crypt_queue {
124 struct ptr_ring ring;
125 - union {
126 - struct {
127 - struct multicore_worker __percpu *worker;
128 - int last_cpu;
129 - };
130 - struct work_struct work;
131 - };
132 + struct multicore_worker __percpu *worker;
133 + int last_cpu;
134 +};
135 +
136 +struct prev_queue {
137 + struct sk_buff *head, *tail, *peeked;
138 + struct { struct sk_buff *next, *prev; } empty; // Match first 2 members of struct sk_buff.
139 + atomic_t count;
140 };
141
142 struct wg_device {
143 --- a/drivers/net/wireguard/peer.c
144 +++ b/drivers/net/wireguard/peer.c
145 @@ -32,27 +32,22 @@ struct wg_peer *wg_peer_create(struct wg
146 peer = kzalloc(sizeof(*peer), GFP_KERNEL);
147 if (unlikely(!peer))
148 return ERR_PTR(ret);
149 - peer->device = wg;
150 + if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
151 + goto err;
152
153 + peer->device = wg;
154 wg_noise_handshake_init(&peer->handshake, &wg->static_identity,
155 public_key, preshared_key, peer);
156 - if (dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))
157 - goto err_1;
158 - if (wg_packet_queue_init(&peer->tx_queue, wg_packet_tx_worker, false,
159 - MAX_QUEUED_PACKETS))
160 - goto err_2;
161 - if (wg_packet_queue_init(&peer->rx_queue, NULL, false,
162 - MAX_QUEUED_PACKETS))
163 - goto err_3;
164 -
165 peer->internal_id = atomic64_inc_return(&peer_counter);
166 peer->serial_work_cpu = nr_cpumask_bits;
167 wg_cookie_init(&peer->latest_cookie);
168 wg_timers_init(peer);
169 wg_cookie_checker_precompute_peer_keys(peer);
170 spin_lock_init(&peer->keypairs.keypair_update_lock);
171 - INIT_WORK(&peer->transmit_handshake_work,
172 - wg_packet_handshake_send_worker);
173 + INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker);
174 + INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker);
175 + wg_prev_queue_init(&peer->tx_queue);
176 + wg_prev_queue_init(&peer->rx_queue);
177 rwlock_init(&peer->endpoint_lock);
178 kref_init(&peer->refcount);
179 skb_queue_head_init(&peer->staged_packet_queue);
180 @@ -68,11 +63,7 @@ struct wg_peer *wg_peer_create(struct wg
181 pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id);
182 return peer;
183
184 -err_3:
185 - wg_packet_queue_free(&peer->tx_queue, false);
186 -err_2:
187 - dst_cache_destroy(&peer->endpoint_cache);
188 -err_1:
189 +err:
190 kfree(peer);
191 return ERR_PTR(ret);
192 }
193 @@ -197,8 +188,7 @@ static void rcu_release(struct rcu_head
194 struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu);
195
196 dst_cache_destroy(&peer->endpoint_cache);
197 - wg_packet_queue_free(&peer->rx_queue, false);
198 - wg_packet_queue_free(&peer->tx_queue, false);
199 + WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue));
200
201 /* The final zeroing takes care of clearing any remaining handshake key
202 * material and other potentially sensitive information.
203 --- a/drivers/net/wireguard/peer.h
204 +++ b/drivers/net/wireguard/peer.h
205 @@ -36,7 +36,7 @@ struct endpoint {
206
207 struct wg_peer {
208 struct wg_device *device;
209 - struct crypt_queue tx_queue, rx_queue;
210 + struct prev_queue tx_queue, rx_queue;
211 struct sk_buff_head staged_packet_queue;
212 int serial_work_cpu;
213 bool is_dead;
214 @@ -46,7 +46,7 @@ struct wg_peer {
215 rwlock_t endpoint_lock;
216 struct noise_handshake handshake;
217 atomic64_t last_sent_handshake;
218 - struct work_struct transmit_handshake_work, clear_peer_work;
219 + struct work_struct transmit_handshake_work, clear_peer_work, transmit_packet_work;
220 struct cookie latest_cookie;
221 struct hlist_node pubkey_hash;
222 u64 rx_bytes, tx_bytes;
223 --- a/drivers/net/wireguard/queueing.c
224 +++ b/drivers/net/wireguard/queueing.c
225 @@ -9,8 +9,7 @@ struct multicore_worker __percpu *
226 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr)
227 {
228 int cpu;
229 - struct multicore_worker __percpu *worker =
230 - alloc_percpu(struct multicore_worker);
231 + struct multicore_worker __percpu *worker = alloc_percpu(struct multicore_worker);
232
233 if (!worker)
234 return NULL;
235 @@ -23,7 +22,7 @@ wg_packet_percpu_multicore_worker_alloc(
236 }
237
238 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
239 - bool multicore, unsigned int len)
240 + unsigned int len)
241 {
242 int ret;
243
244 @@ -31,25 +30,78 @@ int wg_packet_queue_init(struct crypt_qu
245 ret = ptr_ring_init(&queue->ring, len, GFP_KERNEL);
246 if (ret)
247 return ret;
248 - if (function) {
249 - if (multicore) {
250 - queue->worker = wg_packet_percpu_multicore_worker_alloc(
251 - function, queue);
252 - if (!queue->worker) {
253 - ptr_ring_cleanup(&queue->ring, NULL);
254 - return -ENOMEM;
255 - }
256 - } else {
257 - INIT_WORK(&queue->work, function);
258 - }
259 + queue->worker = wg_packet_percpu_multicore_worker_alloc(function, queue);
260 + if (!queue->worker) {
261 + ptr_ring_cleanup(&queue->ring, NULL);
262 + return -ENOMEM;
263 }
264 return 0;
265 }
266
267 -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore)
268 +void wg_packet_queue_free(struct crypt_queue *queue)
269 {
270 - if (multicore)
271 - free_percpu(queue->worker);
272 + free_percpu(queue->worker);
273 WARN_ON(!__ptr_ring_empty(&queue->ring));
274 ptr_ring_cleanup(&queue->ring, NULL);
275 }
276 +
277 +#define NEXT(skb) ((skb)->prev)
278 +#define STUB(queue) ((struct sk_buff *)&queue->empty)
279 +
280 +void wg_prev_queue_init(struct prev_queue *queue)
281 +{
282 + NEXT(STUB(queue)) = NULL;
283 + queue->head = queue->tail = STUB(queue);
284 + queue->peeked = NULL;
285 + atomic_set(&queue->count, 0);
286 + BUILD_BUG_ON(
287 + offsetof(struct sk_buff, next) != offsetof(struct prev_queue, empty.next) -
288 + offsetof(struct prev_queue, empty) ||
289 + offsetof(struct sk_buff, prev) != offsetof(struct prev_queue, empty.prev) -
290 + offsetof(struct prev_queue, empty));
291 +}
292 +
293 +static void __wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
294 +{
295 + WRITE_ONCE(NEXT(skb), NULL);
296 + WRITE_ONCE(NEXT(xchg_release(&queue->head, skb)), skb);
297 +}
298 +
299 +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb)
300 +{
301 + if (!atomic_add_unless(&queue->count, 1, MAX_QUEUED_PACKETS))
302 + return false;
303 + __wg_prev_queue_enqueue(queue, skb);
304 + return true;
305 +}
306 +
307 +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue)
308 +{
309 + struct sk_buff *tail = queue->tail, *next = smp_load_acquire(&NEXT(tail));
310 +
311 + if (tail == STUB(queue)) {
312 + if (!next)
313 + return NULL;
314 + queue->tail = next;
315 + tail = next;
316 + next = smp_load_acquire(&NEXT(next));
317 + }
318 + if (next) {
319 + queue->tail = next;
320 + atomic_dec(&queue->count);
321 + return tail;
322 + }
323 + if (tail != READ_ONCE(queue->head))
324 + return NULL;
325 + __wg_prev_queue_enqueue(queue, STUB(queue));
326 + next = smp_load_acquire(&NEXT(tail));
327 + if (next) {
328 + queue->tail = next;
329 + atomic_dec(&queue->count);
330 + return tail;
331 + }
332 + return NULL;
333 +}
334 +
335 +#undef NEXT
336 +#undef STUB
337 --- a/drivers/net/wireguard/queueing.h
338 +++ b/drivers/net/wireguard/queueing.h
339 @@ -17,12 +17,13 @@ struct wg_device;
340 struct wg_peer;
341 struct multicore_worker;
342 struct crypt_queue;
343 +struct prev_queue;
344 struct sk_buff;
345
346 /* queueing.c APIs: */
347 int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function,
348 - bool multicore, unsigned int len);
349 -void wg_packet_queue_free(struct crypt_queue *queue, bool multicore);
350 + unsigned int len);
351 +void wg_packet_queue_free(struct crypt_queue *queue);
352 struct multicore_worker __percpu *
353 wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr);
354
355 @@ -135,8 +136,31 @@ static inline int wg_cpumask_next_online
356 return cpu;
357 }
358
359 +void wg_prev_queue_init(struct prev_queue *queue);
360 +
361 +/* Multi producer */
362 +bool wg_prev_queue_enqueue(struct prev_queue *queue, struct sk_buff *skb);
363 +
364 +/* Single consumer */
365 +struct sk_buff *wg_prev_queue_dequeue(struct prev_queue *queue);
366 +
367 +/* Single consumer */
368 +static inline struct sk_buff *wg_prev_queue_peek(struct prev_queue *queue)
369 +{
370 + if (queue->peeked)
371 + return queue->peeked;
372 + queue->peeked = wg_prev_queue_dequeue(queue);
373 + return queue->peeked;
374 +}
375 +
376 +/* Single consumer */
377 +static inline void wg_prev_queue_drop_peeked(struct prev_queue *queue)
378 +{
379 + queue->peeked = NULL;
380 +}
381 +
382 static inline int wg_queue_enqueue_per_device_and_peer(
383 - struct crypt_queue *device_queue, struct crypt_queue *peer_queue,
384 + struct crypt_queue *device_queue, struct prev_queue *peer_queue,
385 struct sk_buff *skb, struct workqueue_struct *wq, int *next_cpu)
386 {
387 int cpu;
388 @@ -145,8 +169,9 @@ static inline int wg_queue_enqueue_per_d
389 /* We first queue this up for the peer ingestion, but the consumer
390 * will wait for the state to change to CRYPTED or DEAD before.
391 */
392 - if (unlikely(ptr_ring_produce_bh(&peer_queue->ring, skb)))
393 + if (unlikely(!wg_prev_queue_enqueue(peer_queue, skb)))
394 return -ENOSPC;
395 +
396 /* Then we queue it up in the device queue, which consumes the
397 * packet as soon as it can.
398 */
399 @@ -157,9 +182,7 @@ static inline int wg_queue_enqueue_per_d
400 return 0;
401 }
402
403 -static inline void wg_queue_enqueue_per_peer(struct crypt_queue *queue,
404 - struct sk_buff *skb,
405 - enum packet_state state)
406 +static inline void wg_queue_enqueue_per_peer_tx(struct sk_buff *skb, enum packet_state state)
407 {
408 /* We take a reference, because as soon as we call atomic_set, the
409 * peer can be freed from below us.
410 @@ -167,14 +190,12 @@ static inline void wg_queue_enqueue_per_
411 struct wg_peer *peer = wg_peer_get(PACKET_PEER(skb));
412
413 atomic_set_release(&PACKET_CB(skb)->state, state);
414 - queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu,
415 - peer->internal_id),
416 - peer->device->packet_crypt_wq, &queue->work);
417 + queue_work_on(wg_cpumask_choose_online(&peer->serial_work_cpu, peer->internal_id),
418 + peer->device->packet_crypt_wq, &peer->transmit_packet_work);
419 wg_peer_put(peer);
420 }
421
422 -static inline void wg_queue_enqueue_per_peer_napi(struct sk_buff *skb,
423 - enum packet_state state)
424 +static inline void wg_queue_enqueue_per_peer_rx(struct sk_buff *skb, enum packet_state state)
425 {
426 /* We take a reference, because as soon as we call atomic_set, the
427 * peer can be freed from below us.
428 --- a/drivers/net/wireguard/receive.c
429 +++ b/drivers/net/wireguard/receive.c
430 @@ -444,7 +444,6 @@ packet_processed:
431 int wg_packet_rx_poll(struct napi_struct *napi, int budget)
432 {
433 struct wg_peer *peer = container_of(napi, struct wg_peer, napi);
434 - struct crypt_queue *queue = &peer->rx_queue;
435 struct noise_keypair *keypair;
436 struct endpoint endpoint;
437 enum packet_state state;
438 @@ -455,11 +454,10 @@ int wg_packet_rx_poll(struct napi_struct
439 if (unlikely(budget <= 0))
440 return 0;
441
442 - while ((skb = __ptr_ring_peek(&queue->ring)) != NULL &&
443 + while ((skb = wg_prev_queue_peek(&peer->rx_queue)) != NULL &&
444 (state = atomic_read_acquire(&PACKET_CB(skb)->state)) !=
445 PACKET_STATE_UNCRYPTED) {
446 - __ptr_ring_discard_one(&queue->ring);
447 - peer = PACKET_PEER(skb);
448 + wg_prev_queue_drop_peeked(&peer->rx_queue);
449 keypair = PACKET_CB(skb)->keypair;
450 free = true;
451
452 @@ -508,7 +506,7 @@ void wg_packet_decrypt_worker(struct wor
453 enum packet_state state =
454 likely(decrypt_packet(skb, PACKET_CB(skb)->keypair)) ?
455 PACKET_STATE_CRYPTED : PACKET_STATE_DEAD;
456 - wg_queue_enqueue_per_peer_napi(skb, state);
457 + wg_queue_enqueue_per_peer_rx(skb, state);
458 if (need_resched())
459 cond_resched();
460 }
461 @@ -531,12 +529,10 @@ static void wg_packet_consume_data(struc
462 if (unlikely(READ_ONCE(peer->is_dead)))
463 goto err;
464
465 - ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue,
466 - &peer->rx_queue, skb,
467 - wg->packet_crypt_wq,
468 - &wg->decrypt_queue.last_cpu);
469 + ret = wg_queue_enqueue_per_device_and_peer(&wg->decrypt_queue, &peer->rx_queue, skb,
470 + wg->packet_crypt_wq, &wg->decrypt_queue.last_cpu);
471 if (unlikely(ret == -EPIPE))
472 - wg_queue_enqueue_per_peer_napi(skb, PACKET_STATE_DEAD);
473 + wg_queue_enqueue_per_peer_rx(skb, PACKET_STATE_DEAD);
474 if (likely(!ret || ret == -EPIPE)) {
475 rcu_read_unlock_bh();
476 return;
477 --- a/drivers/net/wireguard/send.c
478 +++ b/drivers/net/wireguard/send.c
479 @@ -239,8 +239,7 @@ void wg_packet_send_keepalive(struct wg_
480 wg_packet_send_staged_packets(peer);
481 }
482
483 -static void wg_packet_create_data_done(struct sk_buff *first,
484 - struct wg_peer *peer)
485 +static void wg_packet_create_data_done(struct wg_peer *peer, struct sk_buff *first)
486 {
487 struct sk_buff *skb, *next;
488 bool is_keepalive, data_sent = false;
489 @@ -262,22 +261,19 @@ static void wg_packet_create_data_done(s
490
491 void wg_packet_tx_worker(struct work_struct *work)
492 {
493 - struct crypt_queue *queue = container_of(work, struct crypt_queue,
494 - work);
495 + struct wg_peer *peer = container_of(work, struct wg_peer, transmit_packet_work);
496 struct noise_keypair *keypair;
497 enum packet_state state;
498 struct sk_buff *first;
499 - struct wg_peer *peer;
500
501 - while ((first = __ptr_ring_peek(&queue->ring)) != NULL &&
502 + while ((first = wg_prev_queue_peek(&peer->tx_queue)) != NULL &&
503 (state = atomic_read_acquire(&PACKET_CB(first)->state)) !=
504 PACKET_STATE_UNCRYPTED) {
505 - __ptr_ring_discard_one(&queue->ring);
506 - peer = PACKET_PEER(first);
507 + wg_prev_queue_drop_peeked(&peer->tx_queue);
508 keypair = PACKET_CB(first)->keypair;
509
510 if (likely(state == PACKET_STATE_CRYPTED))
511 - wg_packet_create_data_done(first, peer);
512 + wg_packet_create_data_done(peer, first);
513 else
514 kfree_skb_list(first);
515
516 @@ -306,16 +302,14 @@ void wg_packet_encrypt_worker(struct wor
517 break;
518 }
519 }
520 - wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first,
521 - state);
522 + wg_queue_enqueue_per_peer_tx(first, state);
523 if (need_resched())
524 cond_resched();
525 }
526 }
527
528 -static void wg_packet_create_data(struct sk_buff *first)
529 +static void wg_packet_create_data(struct wg_peer *peer, struct sk_buff *first)
530 {
531 - struct wg_peer *peer = PACKET_PEER(first);
532 struct wg_device *wg = peer->device;
533 int ret = -EINVAL;
534
535 @@ -323,13 +317,10 @@ static void wg_packet_create_data(struct
536 if (unlikely(READ_ONCE(peer->is_dead)))
537 goto err;
538
539 - ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue,
540 - &peer->tx_queue, first,
541 - wg->packet_crypt_wq,
542 - &wg->encrypt_queue.last_cpu);
543 + ret = wg_queue_enqueue_per_device_and_peer(&wg->encrypt_queue, &peer->tx_queue, first,
544 + wg->packet_crypt_wq, &wg->encrypt_queue.last_cpu);
545 if (unlikely(ret == -EPIPE))
546 - wg_queue_enqueue_per_peer(&peer->tx_queue, first,
547 - PACKET_STATE_DEAD);
548 + wg_queue_enqueue_per_peer_tx(first, PACKET_STATE_DEAD);
549 err:
550 rcu_read_unlock_bh();
551 if (likely(!ret || ret == -EPIPE))
552 @@ -393,7 +384,7 @@ void wg_packet_send_staged_packets(struc
553 packets.prev->next = NULL;
554 wg_peer_get(keypair->entry.peer);
555 PACKET_CB(packets.next)->keypair = keypair;
556 - wg_packet_create_data(packets.next);
557 + wg_packet_create_data(peer, packets.next);
558 return;
559
560 out_invalid: