c1e5858bce9bbb2ee23ee00016fc58d8c4fb52b9
[openwrt/openwrt.git] /
1 From ebd731dd71ec9728a5a87ec1cd695be15828c32c Mon Sep 17 00:00:00 2001
2 From: popcornmix <popcornmix@gmail.com>
3 Date: Mon, 28 Nov 2016 16:50:04 +0000
4 Subject: [PATCH] Improve __copy_to_user and __copy_from_user performance
5
6 Provide a __copy_from_user that uses memcpy. On BCM2708, use
7 optimised memcpy/memmove/memcmp/memset implementations.
8
9 arch/arm: Add mmiocpy/set aliases for memcpy/set
10
11 See: https://github.com/raspberrypi/linux/issues/1082
12
13 copy_from_user: CPU_SW_DOMAIN_PAN compatibility
14
15 The downstream copy_from_user acceleration must also play nice with
16 CONFIG_CPU_SW_DOMAIN_PAN.
17
18 See: https://github.com/raspberrypi/linux/issues/1381
19
20 Signed-off-by: Phil Elwell <phil@raspberrypi.org>
21 ---
22 arch/arm/include/asm/string.h | 5 +
23 arch/arm/include/asm/uaccess.h | 3 +
24 arch/arm/lib/Makefile | 15 +-
25 arch/arm/lib/arm-mem.h | 159 ++++++++++++
26 arch/arm/lib/copy_from_user.S | 4 +-
27 arch/arm/lib/exports_rpi.c | 37 +++
28 arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++
29 arch/arm/lib/memcpy_rpi.S | 61 +++++
30 arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++
31 arch/arm/lib/memmove_rpi.S | 61 +++++
32 arch/arm/lib/memset_rpi.S | 123 +++++++++
33 arch/arm/lib/uaccess_with_memcpy.c | 120 ++++++++-
34 arch/arm/mach-bcm/Kconfig | 7 +
35 13 files changed, 1380 insertions(+), 6 deletions(-)
36 create mode 100644 arch/arm/lib/arm-mem.h
37 create mode 100644 arch/arm/lib/exports_rpi.c
38 create mode 100644 arch/arm/lib/memcmp_rpi.S
39 create mode 100644 arch/arm/lib/memcpy_rpi.S
40 create mode 100644 arch/arm/lib/memcpymove.h
41 create mode 100644 arch/arm/lib/memmove_rpi.S
42 create mode 100644 arch/arm/lib/memset_rpi.S
43
44 --- a/arch/arm/include/asm/string.h
45 +++ b/arch/arm/include/asm/string.h
46 @@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
47 #define __HAVE_ARCH_MEMSET
48 extern void * memset(void *, int, __kernel_size_t);
49
50 +#ifdef CONFIG_BCM2835_FAST_MEMCPY
51 +#define __HAVE_ARCH_MEMCMP
52 +extern int memcmp(const void *, const void *, size_t);
53 +#endif
54 +
55 extern void __memzero(void *ptr, __kernel_size_t n);
56
57 #define memset(p,v,n) \
58 --- a/arch/arm/include/asm/uaccess.h
59 +++ b/arch/arm/include/asm/uaccess.h
60 @@ -489,6 +489,9 @@ do { \
61 extern unsigned long __must_check
62 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
63
64 +extern unsigned long __must_check
65 +__copy_from_user_std(void *to, const void __user *from, unsigned long n);
66 +
67 static inline unsigned long __must_check
68 __arch_copy_from_user(void *to, const void __user *from, unsigned long n)
69 {
70 --- a/arch/arm/lib/Makefile
71 +++ b/arch/arm/lib/Makefile
72 @@ -6,9 +6,8 @@
73
74 lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
75 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
76 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
77 - memmove.o memset.o memzero.o setbit.o \
78 - strchr.o strrchr.o \
79 + delay.o delay-loop.o findbit.o memchr.o memzero.o \
80 + setbit.o strchr.o strrchr.o \
81 testchangebit.o testclearbit.o testsetbit.o \
82 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
83 ucmpdi2.o lib1funcs.o div64.o \
84 @@ -18,6 +17,16 @@ lib-y := backtrace.o changebit.o csumip
85 mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
86 copy_from_user.o copy_to_user.o
87
88 +# Choose optimised implementations for Raspberry Pi: with CONFIG_BCM2835_FAST_MEMCPY=y build the *_rpi memcpy/memmove/memset/memcmp objects (exporting memcmp via exports_rpi.o when modules are enabled) and pass the copy thresholds to uaccess_with_memcpy.o; otherwise keep the generic memcpy/memmove/memset objects.
89 +ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
90 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
91 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
92 + obj-$(CONFIG_MODULES) += exports_rpi.o
93 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
94 +else
95 + lib-y += memcpy.o memmove.o memset.o
96 +endif
97 +
98 # using lib_ here won't override already available weak symbols
99 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
100
101 --- /dev/null
102 +++ b/arch/arm/lib/arm-mem.h
103 @@ -0,0 +1,159 @@
104 +/*
105 +Copyright (c) 2013, Raspberry Pi Foundation
106 +Copyright (c) 2013, RISC OS Open Ltd
107 +All rights reserved.
108 +
109 +Redistribution and use in source and binary forms, with or without
110 +modification, are permitted provided that the following conditions are met:
111 + * Redistributions of source code must retain the above copyright
112 + notice, this list of conditions and the following disclaimer.
113 + * Redistributions in binary form must reproduce the above copyright
114 + notice, this list of conditions and the following disclaimer in the
115 + documentation and/or other materials provided with the distribution.
116 + * Neither the name of the copyright holder nor the
117 + names of its contributors may be used to endorse or promote products
118 + derived from this software without specific prior written permission.
119 +
120 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
121 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
122 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
123 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
124 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
125 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
126 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
127 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
129 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130 +*/
131 +
132 +.macro myfunc fname /* Open a function named fname: emits .func and .global for it, then defines the label itself */
133 + .func fname
134 + .global fname
135 +fname:
136 +.endm
137 +
138 +.macro preload_leading_step1 backwards, ptr, base
139 +/* If the destination is already 16-byte aligned, then we need to preload
140 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
141 + * are no gaps when the inner loop starts.
142 + * Arguments: backwards = non-zero to walk downwards from base; ptr = register that receives the 32-byte-aligned starting line; base = address being preloaded (left unmodified). */
143 + .if backwards
144 + sub ptr, base, #1 /* going down, the first byte touched is base-1 */
145 + bic ptr, ptr, #31 /* round down to the start of its 32-byte line */
146 + .else
147 + bic ptr, base, #31 /* round base down to the start of its 32-byte line */
148 + .endif
149 + .set OFFSET, 0 /* assembly-time offset, stepped by 32 per preload */
150 + .rept prefetch_distance+1 /* unrolled: one pld per line, prefetch_distance+1 in total */
151 + pld [ptr, #OFFSET]
152 + .if backwards
153 + .set OFFSET, OFFSET-32
154 + .else
155 + .set OFFSET, OFFSET+32
156 + .endif
157 + .endr
158 +.endm
159 +
160 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
161 +/* However, if the destination is not 16-byte aligned, we may need to
162 + * preload one more cache line than that. The question we need to ask is:
163 + * are the leading bytes more than the amount by which the source
164 + * pointer will be rounded down for preloading, and if so, by how many
165 + * cache lines?
166 + * ptr is used as the base of the extra preload; the memcmp call sites pass the aligned address left by preload_leading_step1 (NOTE(review): confirm for other callers). tmp is scratch; only the low five bits of base and leading_bytes take part. */
167 + .if backwards
168 +/* Here we compare against how many bytes we are into the
169 + * cache line, counting down from the highest such address.
170 + * Effectively, we want to calculate
171 + * leading_bytes = dst&15
172 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
173 + * extra_needed = leading_bytes - cacheline_offset
174 + * and test if extra_needed is <= 0, or rearranging:
175 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
176 + */
177 + mov tmp, base, lsl #32-5 /* keep base mod 32 in the top five bits */
178 + sbc tmp, tmp, leading_bytes, lsl #32-5 /* NOTE(review): sbc also subtracts !C, and nothing in this macro sets C first, so it uses the carry left by the preceding code; the "-1" in the formula above suggests C is expected to be clear - confirm at call sites */
179 + adds tmp, tmp, leading_bytes, lsl #32-5 /* carry out means the five-bit sum exceeded 31 */
180 + bcc 61f /* no carry: no extra cache line needed */
181 + pld [ptr, #-32*(prefetch_distance+1)] /* one line beyond those covered by step1 */
182 + .else
183 +/* Effectively, we want to calculate
184 + * leading_bytes = (-dst)&15
185 + * cacheline_offset = (src+leading_bytes)&31
186 + * extra_needed = leading_bytes - cacheline_offset
187 + * and test if extra_needed is <= 0.
188 + */
189 + mov tmp, base, lsl #32-5 /* keep base mod 32 in the top five bits */
190 + add tmp, tmp, leading_bytes, lsl #32-5 /* tmp = cacheline_offset, still in the top five bits */
191 + rsbs tmp, tmp, leading_bytes, lsl #32-5 /* set flags for leading_bytes - cacheline_offset (both mod 32) */
192 + bls 61f /* unsigned lower-or-same: nothing extra to preload */
193 + pld [ptr, #32*(prefetch_distance+1)] /* one line beyond those covered by step1 */
194 + .endif
195 +61:
196 +.endm
197 +
198 +.macro preload_trailing backwards, base, remain, tmp /* base = current pointer, remain = byte count (only its low five bits are used), tmp = scratch register */
199 + /* We need either 0, 1 or 2 extra preloads */
200 + .if backwards
201 + rsb tmp, base, #0 /* negate so the offset is measured downwards */
202 + mov tmp, tmp, lsl #32-5
203 + .else
204 + mov tmp, base, lsl #32-5 /* base mod 32 in the top five bits */
205 + .endif
206 + adds tmp, tmp, remain, lsl #32-5
207 + adceqs tmp, tmp, #0
208 + /* The instruction above has two effects: ensures Z is only
209 + * set if C was clear (so Z indicates that both shifted quantities
210 + * were 0), and clears C if Z was set (so C indicates that the sum
211 + * of the shifted quantities was greater and not equal to 32) */
212 + beq 82f /* Z set: no extra preloads */
213 + .if backwards
214 + sub tmp, base, #1 /* sub/bic below do not set flags, so C from above survives to the bcc */
215 + bic tmp, tmp, #31
216 + .else
217 + bic tmp, base, #31 /* bic does not set flags, so C from above survives to the bcc */
218 + .endif
219 + bcc 81f /* C clear: a single extra preload */
220 + .if backwards
221 + pld [tmp, #-32*(prefetch_distance+1)] /* reached only when two extra preloads are needed */
222 +81:
223 + pld [tmp, #-32*prefetch_distance]
224 + .else
225 + pld [tmp, #32*(prefetch_distance+2)] /* reached only when two extra preloads are needed */
226 +81:
227 + pld [tmp, #32*(prefetch_distance+1)]
228 + .endif
229 +82:
230 +.endm
231 +
232 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 /* Preload every 32-byte line of a buffer of (remain << shift) bytes at base; tmp0 and tmp1 are scratch */
233 + .if backwards
234 + sub tmp0, base, #1
235 + bic tmp0, tmp0, #31 /* tmp0 = line holding the highest byte (base-1) */
236 + pld [tmp0]
237 + sub tmp1, base, remain, lsl #shift /* lowest address of the buffer */
238 + .else
239 + bic tmp0, base, #31 /* tmp0 = line holding the first byte */
240 + pld [tmp0]
241 + add tmp1, base, remain, lsl #shift
242 + sub tmp1, tmp1, #1 /* address of the last byte of the buffer */
243 + .endif
244 + bic tmp1, tmp1, #31 /* tmp1 = line holding the far end of the buffer */
245 + cmp tmp1, tmp0
246 + beq 92f /* both ends share one line: already preloaded */
247 + .if narrow_case
248 + /* In this case, all the data fits in either 1 or 2 cache lines */
249 + pld [tmp1]
250 + .else
251 +91:
252 + .if backwards
253 + sub tmp0, tmp0, #32
254 + .else
255 + add tmp0, tmp0, #32
256 + .endif
257 + cmp tmp0, tmp1
258 + pld [tmp0] /* pld leaves the flags from the cmp intact for the bne */
259 + bne 91b /* step one line at a time until the far-end line has been preloaded */
260 + .endif
261 +92:
262 +.endm
263 --- a/arch/arm/lib/copy_from_user.S
264 +++ b/arch/arm/lib/copy_from_user.S
265 @@ -89,7 +89,8 @@
266
267 .text
268
269 -ENTRY(arm_copy_from_user)
270 +ENTRY(__copy_from_user_std)
271 +WEAK(arm_copy_from_user)
272 #ifdef CONFIG_CPU_SPECTRE
273 get_thread_info r3
274 ldr r3, [r3, #TI_ADDR_LIMIT]
275 @@ -102,7 +103,7 @@ ENTRY(arm_copy_from_user)
276
277 #include "copy_template.S"
278
279 -ENDPROC(arm_copy_from_user)
280 +ENDPROC(__copy_from_user_std)
281
282 .pushsection .fixup,"ax"
283 .align 0
284 --- /dev/null
285 +++ b/arch/arm/lib/exports_rpi.c
286 @@ -0,0 +1,37 @@
287 +/**
288 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
289 + *
290 + * Redistribution and use in source and binary forms, with or without
291 + * modification, are permitted provided that the following conditions
292 + * are met:
293 + * 1. Redistributions of source code must retain the above copyright
294 + * notice, this list of conditions, and the following disclaimer,
295 + * without modification.
296 + * 2. Redistributions in binary form must reproduce the above copyright
297 + * notice, this list of conditions and the following disclaimer in the
298 + * documentation and/or other materials provided with the distribution.
299 + * 3. The names of the above-listed copyright holders may not be used
300 + * to endorse or promote products derived from this software without
301 + * specific prior written permission.
302 + *
303 + * ALTERNATIVELY, this software may be distributed under the terms of the
304 + * GNU General Public License ("GPL") version 2, as published by the Free
305 + * Software Foundation.
306 + *
307 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
308 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
309 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
310 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
311 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
312 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
313 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
314 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
315 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
316 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
317 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
318 + */
319 +
320 +#include <linux/kernel.h>
321 +#include <linux/module.h>
322 +
323 +EXPORT_SYMBOL(memcmp); /* export the memcmp symbol for use by loadable modules */
324 --- /dev/null
325 +++ b/arch/arm/lib/memcmp_rpi.S
326 @@ -0,0 +1,285 @@
327 +/*
328 +Copyright (c) 2013, Raspberry Pi Foundation
329 +Copyright (c) 2013, RISC OS Open Ltd
330 +All rights reserved.
331 +
332 +Redistribution and use in source and binary forms, with or without
333 +modification, are permitted provided that the following conditions are met:
334 + * Redistributions of source code must retain the above copyright
335 + notice, this list of conditions and the following disclaimer.
336 + * Redistributions in binary form must reproduce the above copyright
337 + notice, this list of conditions and the following disclaimer in the
338 + documentation and/or other materials provided with the distribution.
339 + * Neither the name of the copyright holder nor the
340 + names of its contributors may be used to endorse or promote products
341 + derived from this software without specific prior written permission.
342 +
343 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
344 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
345 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
346 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
347 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
348 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
349 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
350 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
351 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
352 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
353 +*/
354 +
355 +#include <linux/linkage.h>
356 +#include "arm-mem.h"
357 +
358 +/* Prevent the stack from becoming executable */
359 +#if defined(__linux__) && defined(__ELF__)
360 +.section .note.GNU-stack,"",%progbits
361 +#endif
362 +
363 + .text
364 + .arch armv6
365 + .object_arch armv4
366 + .arm
367 + .altmacro
368 + .p2align 2
369 +
370 +.macro memcmp_process_head unaligned /* Load the next 16 bytes of each buffer: S_1 into DAT0-DAT3, S_2 into DAT4-DAT7, advancing both pointers */
371 + .if unaligned
372 + ldr DAT0, [S_1], #4 /* unaligned variant: single-word loads instead of ldmia for S_1 */
373 + ldr DAT1, [S_1], #4
374 + ldr DAT2, [S_1], #4
375 + ldr DAT3, [S_1], #4
376 + .else
377 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
378 + .endif
379 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7} /* S_2 is always read with ldmia */
380 +.endm
381 +
382 +.macro memcmp_process_tail /* Compare the four word pairs loaded by memcmp_process_head, in address order */
383 + cmp DAT0, DAT4
384 + cmpeq DAT1, DAT5 /* each later pair is compared only while all earlier pairs were equal */
385 + cmpeq DAT2, DAT6
386 + cmpeq DAT3, DAT7
387 + bne 200f /* mismatch: branch to local label 200 with the flags of the first differing pair */
388 +.endm
389 +
390 +.macro memcmp_leading_31bytes /* Compare the leading bytes selected by the low five bits of OFF (1, 2, 4, 8 and 16 byte steps), advancing S_1/S_2 and reducing N */
391 + movs DAT0, OFF, lsl #31 /* N flag = bit 0 of OFF (one byte), C flag = bit 1 (one halfword) */
392 + ldrmib DAT0, [S_1], #1
393 + ldrcsh DAT1, [S_1], #2
394 + ldrmib DAT4, [S_2], #1
395 + ldrcsh DAT5, [S_2], #2
396 + movpl DAT0, #0 /* zero whatever was not loaded so it compares equal */
397 + movcc DAT1, #0
398 + movpl DAT4, #0
399 + movcc DAT5, #0
400 + submi N, N, #1
401 + subcs N, N, #2
402 + cmp DAT0, DAT4
403 + cmpeq DAT1, DAT5
404 + bne 200f
405 + movs DAT0, OFF, lsl #29 /* N flag = bit 2 of OFF (one word), C flag = bit 3 (two words) */
406 + ldrmi DAT0, [S_1], #4
407 + ldrcs DAT1, [S_1], #4 /* S_1 uses single-word loads here; S_2 uses ldmia below */
408 + ldrcs DAT2, [S_1], #4
409 + ldrmi DAT4, [S_2], #4
410 + ldmcsia S_2!, {DAT5, DAT6}
411 + movpl DAT0, #0 /* zero whatever was not loaded so it compares equal */
412 + movcc DAT1, #0
413 + movcc DAT2, #0
414 + movpl DAT4, #0
415 + movcc DAT5, #0
416 + movcc DAT6, #0
417 + submi N, N, #4
418 + subcs N, N, #8
419 + cmp DAT0, DAT4
420 + cmpeq DAT1, DAT5
421 + cmpeq DAT2, DAT6
422 + bne 200f
423 + tst OFF, #16 /* bit 4 of OFF: one further 16-byte block */
424 + beq 105f
425 + memcmp_process_head 1 /* always the single-word-load variant for S_1 */
426 + sub N, N, #16
427 + memcmp_process_tail
428 +105:
429 +.endm
430 +
431 +.macro memcmp_trailing_15bytes unaligned /* Compare the final bytes selected by the low four bits of N (8, 4, 2 and 1 byte steps); N is destroyed */
432 + movs N, N, lsl #29 /* C flag = bit 3 of N (two words), N flag = bit 2 (one word) */
433 + .if unaligned
434 + ldrcs DAT0, [S_1], #4 /* unaligned variant: single-word loads instead of ldmia for S_1 */
435 + ldrcs DAT1, [S_1], #4
436 + .else
437 + ldmcsia S_1!, {DAT0, DAT1}
438 + .endif
439 + ldrmi DAT2, [S_1], #4
440 + ldmcsia S_2!, {DAT4, DAT5}
441 + ldrmi DAT6, [S_2], #4
442 + movcc DAT0, #0 /* zero whatever was not loaded so it compares equal */
443 + movcc DAT1, #0
444 + movpl DAT2, #0
445 + movcc DAT4, #0
446 + movcc DAT5, #0
447 + movpl DAT6, #0
448 + cmp DAT0, DAT4
449 + cmpeq DAT1, DAT5
450 + cmpeq DAT2, DAT6
451 + bne 200f
452 + movs N, N, lsl #2 /* 31 bits shifted in total: C flag = original bit 1 (halfword), N flag = original bit 0 (byte) */
453 + ldrcsh DAT0, [S_1], #2
454 + ldrmib DAT1, [S_1] /* last access: no pointer write-back */
455 + ldrcsh DAT4, [S_2], #2
456 + ldrmib DAT5, [S_2]
457 + movcc DAT0, #0 /* zero whatever was not loaded so it compares equal */
458 + movpl DAT1, #0
459 + movcc DAT4, #0
460 + movpl DAT5, #0
461 + cmp DAT0, DAT4
462 + cmpeq DAT1, DAT5
463 + bne 200f
464 +.endm
465 +
466 +.macro memcmp_long_inner_loop unaligned /* Long-case body: 32 bytes per iteration with preloads, then 16-byte blocks, then the tail; ends by returning 0 when no difference was found */
467 +110:
468 + memcmp_process_head unaligned
469 + pld [S_2, #prefetch_distance*32 + 16] /* preload ahead on the second buffer */
470 + memcmp_process_tail
471 + memcmp_process_head unaligned
472 + pld [S_1, OFF] /* preload ahead on the first buffer, at the offset the caller placed in OFF */
473 + memcmp_process_tail
474 + subs N, N, #32
475 + bhs 110b /* keep going until the subtraction borrows */
476 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
477 + * deal with final preloads */
478 + preload_trailing 0, S_1, N, DAT0
479 + preload_trailing 0, S_2, N, DAT0
480 + add N, N, #(prefetch_distance+2)*32 - 16 /* add back all but 16 of the bias removed at entry, so the next loop can again stop on a borrow */
481 +120:
482 + memcmp_process_head unaligned
483 + memcmp_process_tail
484 + subs N, N, #16
485 + bhs 120b
486 + /* Trailing words and bytes */
487 + tst N, #15
488 + beq 199f
489 + memcmp_trailing_15bytes unaligned
490 +199: /* Reached end without detecting a difference */
491 + mov a1, #0
492 + setend le /* undo the setend be done at function entry */
493 + pop {DAT1-DAT6, pc}
494 +.endm
495 +
496 +.macro memcmp_short_inner_loop unaligned /* Short-case body: 16-byte blocks without preloading, then the tail; ends by returning 0 when no difference was found */
497 + subs N, N, #16 /* simplifies inner loop termination */
498 + blo 122f /* fewer than 16 bytes: go straight to the tail */
499 +120:
500 + memcmp_process_head unaligned
501 + memcmp_process_tail
502 + subs N, N, #16
503 + bhs 120b /* keep going until the subtraction borrows */
504 +122: /* Trailing words and bytes */
505 + tst N, #15
506 + beq 199f
507 + memcmp_trailing_15bytes unaligned
508 +199: /* Reached end without detecting a difference */
509 + mov a1, #0
510 + setend le /* undo the setend be done at function entry */
511 + pop {DAT1-DAT6, pc}
512 +.endm
513 +
514 +/*
515 + * int memcmp(const void *s1, const void *s2, size_t n);
516 + * On entry:
517 + * a1 = pointer to buffer 1
518 + * a2 = pointer to buffer 2
519 + * a3 = number of bytes to compare (as unsigned chars)
520 + * On exit:
521 + * a1 = >0/=0/<0 if s1 >/=/< s2 (this implementation returns exactly 1, 0 or -1)
522 + */
523 +
524 +.set prefetch_distance, 2
525 +
526 +ENTRY(memcmp)
527 + S_1 .req a1
528 + S_2 .req a2
529 + N .req a3
530 + DAT0 .req a4
531 + DAT1 .req v1
532 + DAT2 .req v2
533 + DAT3 .req v3
534 + DAT4 .req v4
535 + DAT5 .req v5
536 + DAT6 .req v6
537 + DAT7 .req ip
538 + OFF .req lr
539 +
540 + push {DAT1-DAT6, lr} /* lr is saved because it doubles as OFF; every exit pops it into pc */
541 + setend be /* lowest-addressed bytes are most significant */
542 +
543 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks; the threshold below adds 31 more bytes, which matches the most memcmp_leading_31bytes can consume */
544 + cmp N, #(prefetch_distance+3)*32 - 1
545 + blo 170f
546 +
547 + /* Long case */
548 + /* Adjust N so that the decrement instruction can also test for
549 + * inner loop termination. We want it to stop when there are
550 + * (prefetch_distance+1) complete blocks to go. */
551 + sub N, N, #(prefetch_distance+2)*32
552 + preload_leading_step1 0, DAT0, S_1
553 + preload_leading_step1 0, DAT1, S_2
554 + tst S_2, #31
555 + beq 154f /* second buffer already 32-byte aligned: no leading bytes */
556 + rsb OFF, S_2, #0 /* no need to AND with 15 here */
557 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
558 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
559 + memcmp_leading_31bytes
560 +154: /* Second source now cacheline (32-byte) aligned; we have at
561 + * least one prefetch to go. */
562 + /* Prefetch offset is best selected such that it lies in the
563 + * first 8 of each 32 bytes - but it's just as easy to aim for
564 + * the first one */
565 + and OFF, S_1, #31
566 + rsb OFF, OFF, #32*prefetch_distance /* OFF = 32*prefetch_distance - (S_1 mod 32), used by pld [S_1, OFF] in the long loop */
567 + tst S_1, #3
568 + bne 140f
569 + memcmp_long_inner_loop 0 /* word-aligned S_1; the macro ends with a return, so there is no fall-through into 140 */
570 +140: memcmp_long_inner_loop 1 /* S_1 not word aligned */
571 +
572 +170: /* Short case */
573 + teq N, #0
574 + beq 199f /* nothing to compare: take the return-0 path */
575 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
576 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
577 + tst S_2, #3
578 + beq 174f
579 +172: subs N, N, #1 /* compare one byte at a time until S_2 is word aligned or N runs out */
580 + blo 199f
581 + ldrb DAT0, [S_1], #1
582 + ldrb DAT4, [S_2], #1
583 + cmp DAT0, DAT4
584 + bne 200f
585 + tst S_2, #3
586 + bne 172b
587 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
588 + tst S_1, #3
589 + bne 140f
590 + memcmp_short_inner_loop 0 /* word-aligned S_1; the macro ends with a return */
591 +140: memcmp_short_inner_loop 1 /* S_1 not word aligned */
592 +
593 +200: /* Difference found: determine sign. */
594 + movhi a1, #1 /* flags come from the unsigned cmp of the first differing pair */
595 + movlo a1, #-1
596 + setend le /* undo the setend be done at function entry */
597 + pop {DAT1-DAT6, pc}
598 +
599 + .unreq S_1
600 + .unreq S_2
601 + .unreq N
602 + .unreq DAT0
603 + .unreq DAT1
604 + .unreq DAT2
605 + .unreq DAT3
606 + .unreq DAT4
607 + .unreq DAT5
608 + .unreq DAT6
609 + .unreq DAT7
610 + .unreq OFF
611 +ENDPROC(memcmp)
612 --- /dev/null
613 +++ b/arch/arm/lib/memcpy_rpi.S
614 @@ -0,0 +1,61 @@
615 +/*
616 +Copyright (c) 2013, Raspberry Pi Foundation
617 +Copyright (c) 2013, RISC OS Open Ltd
618 +All rights reserved.
619 +
620 +Redistribution and use in source and binary forms, with or without
621 +modification, are permitted provided that the following conditions are met:
622 + * Redistributions of source code must retain the above copyright
623 + notice, this list of conditions and the following disclaimer.
624 + * Redistributions in binary form must reproduce the above copyright
625 + notice, this list of conditions and the following disclaimer in the
626 + documentation and/or other materials provided with the distribution.
627 + * Neither the name of the copyright holder nor the
628 + names of its contributors may be used to endorse or promote products
629 + derived from this software without specific prior written permission.
630 +
631 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
632 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
633 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
634 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
635 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
636 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
637 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
638 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
639 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
640 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
641 +*/
642 +
643 +#include <linux/linkage.h>
644 +#include "arm-mem.h"
645 +#include "memcpymove.h"
646 +
647 +/* Prevent the stack from becoming executable */
648 +#if defined(__linux__) && defined(__ELF__)
649 +.section .note.GNU-stack,"",%progbits
650 +#endif
651 +
652 + .text
653 + .arch armv6
654 + .object_arch armv4
655 + .arm
656 + .altmacro
657 + .p2align 2
658 +
659 +/*
660 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
661 + * On entry:
662 + * a1 = pointer to destination
663 + * a2 = pointer to source
664 + * a3 = number of bytes to copy
665 + * On exit:
666 + * a1 preserved
667 + * mmiocpy is a second entry label on the same code.
668 +
669 +.set prefetch_distance, 3
670 +
671 +ENTRY(mmiocpy)
672 +ENTRY(memcpy)
673 + memcpy 0 /* expand the memcpy macro from memcpymove.h with backwards=0 */
674 +ENDPROC(memcpy)
675 +ENDPROC(mmiocpy)
676 --- /dev/null
677 +++ b/arch/arm/lib/memcpymove.h
678 @@ -0,0 +1,506 @@
679 +/*
680 +Copyright (c) 2013, Raspberry Pi Foundation
681 +Copyright (c) 2013, RISC OS Open Ltd
682 +All rights reserved.
683 +
684 +Redistribution and use in source and binary forms, with or without
685 +modification, are permitted provided that the following conditions are met:
686 + * Redistributions of source code must retain the above copyright
687 + notice, this list of conditions and the following disclaimer.
688 + * Redistributions in binary form must reproduce the above copyright
689 + notice, this list of conditions and the following disclaimer in the
690 + documentation and/or other materials provided with the distribution.
691 + * Neither the name of the copyright holder nor the
692 + names of its contributors may be used to endorse or promote products
693 + derived from this software without specific prior written permission.
694 +
695 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
696 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
697 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
698 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
699 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
700 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
701 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
702 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
703 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
704 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
705 +*/
706 +
707 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 /* Copy 1, 2, 4 or 8 words from S to D, building each output word from two adjacent source words shifted by align*8 bits. A previously loaded source word is carried in (forwards: in the highest-numbered register used; backwards: in r0) and the newest one is left there for the next call. */
708 + .if words == 1
709 + .if backwards
710 + mov r1, r0, lsl #32-align*8 /* carried-in word supplies the upper part */
711 + ldr r0, [S, #-4]!
712 + orr r1, r1, r0, lsr #align*8 /* newly loaded word supplies the lower part */
713 + str r1, [D, #-4]!
714 + .else
715 + mov r0, r1, lsr #align*8 /* carried-in word supplies the lower part */
716 + ldr r1, [S, #4]!
717 + orr r0, r0, r1, lsl #32-align*8 /* newly loaded word supplies the upper part */
718 + str r0, [D], #4
719 + .endif
720 + .elseif words == 2
721 + .if backwards
722 + ldr r1, [S, #-4]!
723 + mov r2, r0, lsl #32-align*8
724 + ldr r0, [S, #-4]!
725 + orr r2, r2, r1, lsr #align*8
726 + mov r1, r1, lsl #32-align*8
727 + orr r1, r1, r0, lsr #align*8
728 + stmdb D!, {r1, r2}
729 + .else
730 + ldr r1, [S, #4]!
731 + mov r0, r2, lsr #align*8 /* r2 carries the previous source word in */
732 + ldr r2, [S, #4]!
733 + orr r0, r0, r1, lsl #32-align*8
734 + mov r1, r1, lsr #align*8
735 + orr r1, r1, r2, lsl #32-align*8
736 + stmia D!, {r0, r1}
737 + .endif
738 + .elseif words == 4
739 + .if backwards
740 + ldmdb S!, {r2, r3}
741 + mov r4, r0, lsl #32-align*8
742 + ldmdb S!, {r0, r1}
743 + orr r4, r4, r3, lsr #align*8
744 + mov r3, r3, lsl #32-align*8
745 + orr r3, r3, r2, lsr #align*8
746 + mov r2, r2, lsl #32-align*8
747 + orr r2, r2, r1, lsr #align*8
748 + mov r1, r1, lsl #32-align*8
749 + orr r1, r1, r0, lsr #align*8
750 + stmdb D!, {r1, r2, r3, r4}
751 + .else
752 + ldmib S!, {r1, r2}
753 + mov r0, r4, lsr #align*8 /* r4 carries the previous source word in */
754 + ldmib S!, {r3, r4}
755 + orr r0, r0, r1, lsl #32-align*8
756 + mov r1, r1, lsr #align*8
757 + orr r1, r1, r2, lsl #32-align*8
758 + mov r2, r2, lsr #align*8
759 + orr r2, r2, r3, lsl #32-align*8
760 + mov r3, r3, lsr #align*8
761 + orr r3, r3, r4, lsl #32-align*8
762 + stmia D!, {r0, r1, r2, r3}
763 + .endif
764 + .elseif words == 8
765 + .if backwards
766 + ldmdb S!, {r4, r5, r6, r7}
767 + mov r8, r0, lsl #32-align*8
768 + ldmdb S!, {r0, r1, r2, r3}
769 + .if use_pld
770 + pld [S, OFF] /* only the 8-word form issues a preload, and only when use_pld is non-zero */
771 + .endif
772 + orr r8, r8, r7, lsr #align*8
773 + mov r7, r7, lsl #32-align*8
774 + orr r7, r7, r6, lsr #align*8
775 + mov r6, r6, lsl #32-align*8
776 + orr r6, r6, r5, lsr #align*8
777 + mov r5, r5, lsl #32-align*8
778 + orr r5, r5, r4, lsr #align*8
779 + mov r4, r4, lsl #32-align*8
780 + orr r4, r4, r3, lsr #align*8
781 + mov r3, r3, lsl #32-align*8
782 + orr r3, r3, r2, lsr #align*8
783 + mov r2, r2, lsl #32-align*8
784 + orr r2, r2, r1, lsr #align*8
785 + mov r1, r1, lsl #32-align*8
786 + orr r1, r1, r0, lsr #align*8
787 + stmdb D!, {r5, r6, r7, r8}
788 + stmdb D!, {r1, r2, r3, r4}
789 + .else
790 + ldmib S!, {r1, r2, r3, r4}
791 + mov r0, r8, lsr #align*8 /* r8 carries the previous source word in */
792 + ldmib S!, {r5, r6, r7, r8}
793 + .if use_pld
794 + pld [S, OFF] /* only the 8-word form issues a preload, and only when use_pld is non-zero */
795 + .endif
796 + orr r0, r0, r1, lsl #32-align*8
797 + mov r1, r1, lsr #align*8
798 + orr r1, r1, r2, lsl #32-align*8
799 + mov r2, r2, lsr #align*8
800 + orr r2, r2, r3, lsl #32-align*8
801 + mov r3, r3, lsr #align*8
802 + orr r3, r3, r4, lsl #32-align*8
803 + mov r4, r4, lsr #align*8
804 + orr r4, r4, r5, lsl #32-align*8
805 + mov r5, r5, lsr #align*8
806 + orr r5, r5, r6, lsl #32-align*8
807 + mov r6, r6, lsr #align*8
808 + orr r6, r6, r7, lsl #32-align*8
809 + mov r7, r7, lsr #align*8
810 + orr r7, r7, r8, lsl #32-align*8
811 + stmia D!, {r0, r1, r2, r3}
812 + stmia D!, {r4, r5, r6, r7}
813 + .endif
814 + .endif
815 +.endm
816 +
817 +.macro memcpy_leading_15bytes backwards, align /* Copy the leading bytes selected by the low four bits of DAT2 (1, 2, 4 and 8 byte steps) and subtract DAT2 from N */
818 + movs DAT1, DAT2, lsl #31 /* N flag = bit 0 of DAT2 (one byte), C flag = bit 1 (one halfword) */
819 + sub N, N, DAT2
820 + .if backwards
821 + ldrmib DAT0, [S, #-1]!
822 + ldrcsh DAT1, [S, #-2]!
823 + strmib DAT0, [D, #-1]!
824 + strcsh DAT1, [D, #-2]!
825 + .else
826 + ldrmib DAT0, [S], #1
827 + ldrcsh DAT1, [S], #2
828 + strmib DAT0, [D], #1
829 + strcsh DAT1, [D], #2
830 + .endif
831 + movs DAT1, DAT2, lsl #29 /* N flag = bit 2 of DAT2 (one word), C flag = bit 3 (two words) */
832 + .if backwards
833 + ldrmi DAT0, [S, #-4]!
834 + .if align == 0
835 + ldmcsdb S!, {DAT1, DAT2} /* DAT2 may be overwritten: its count has already been consumed by the movs above */
836 + .else
837 + ldrcs DAT2, [S, #-4]! /* non-zero align: single-word loads instead of ldm */
838 + ldrcs DAT1, [S, #-4]!
839 + .endif
840 + strmi DAT0, [D, #-4]!
841 + stmcsdb D!, {DAT1, DAT2}
842 + .else
843 + ldrmi DAT0, [S], #4
844 + .if align == 0
845 + ldmcsia S!, {DAT1, DAT2} /* DAT2 may be overwritten: its count has already been consumed by the movs above */
846 + .else
847 + ldrcs DAT1, [S], #4 /* non-zero align: single-word loads instead of ldm */
848 + ldrcs DAT2, [S], #4
849 + .endif
850 + strmi DAT0, [D], #4
851 + stmcsia D!, {DAT1, DAT2}
852 + .endif
853 +.endm
854 +
855 +.macro memcpy_trailing_15bytes backwards, align /* Copy the final bytes selected by the low four bits of N (8, 4, 2 and 1 byte steps); N is destroyed */
856 + movs N, N, lsl #29 /* C flag = bit 3 of N (two words), N flag = bit 2 (one word) */
857 + .if backwards
858 + .if align == 0
859 + ldmcsdb S!, {DAT0, DAT1}
860 + .else
861 + ldrcs DAT1, [S, #-4]! /* non-zero align: single-word loads instead of ldm */
862 + ldrcs DAT0, [S, #-4]!
863 + .endif
864 + ldrmi DAT2, [S, #-4]!
865 + stmcsdb D!, {DAT0, DAT1}
866 + strmi DAT2, [D, #-4]!
867 + .else
868 + .if align == 0
869 + ldmcsia S!, {DAT0, DAT1}
870 + .else
871 + ldrcs DAT0, [S], #4 /* non-zero align: single-word loads instead of ldm */
872 + ldrcs DAT1, [S], #4
873 + .endif
874 + ldrmi DAT2, [S], #4
875 + stmcsia D!, {DAT0, DAT1}
876 + strmi DAT2, [D], #4
877 + .endif
878 + movs N, N, lsl #2 /* 31 bits shifted in total: C flag = original bit 1 (halfword), N flag = original bit 0 (byte) */
879 + .if backwards
880 + ldrcsh DAT0, [S, #-2]!
881 + ldrmib DAT1, [S, #-1] /* final byte: no pointer write-back */
882 + strcsh DAT0, [D, #-2]!
883 + strmib DAT1, [D, #-1]
884 + .else
885 + ldrcsh DAT0, [S], #2
886 + ldrmib DAT1, [S] /* final byte: no pointer write-back */
887 + strcsh DAT0, [D], #2
888 + strmib DAT1, [D]
889 + .endif
890 +.endm
891 +
892 +.macro memcpy_long_inner_loop backwards, align /* Long-case body: 32 bytes per iteration with preloads, then 32-byte blocks without, an optional 16-byte block and the tail; ends by popping the saved registers and returning */
893 + .if align != 0
894 + .if backwards
895 + ldr DAT0, [S, #-align]! /* step S back by align and prime the carried-in word that unaligned_words expects in its r0 */
896 + .else
897 + ldr LAST, [S, #-align]! /* step S back by align and prime the carried-in word that unaligned_words expects in its last register */
898 + .endif
899 + .endif
900 +110:
901 + .if align == 0
902 + .if backwards
903 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
904 + pld [S, OFF]
905 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
906 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
907 + .else
908 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
909 + pld [S, OFF]
910 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
911 + stmia D!, {DAT4, DAT5, DAT6, LAST}
912 + .endif
913 + .else
914 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
915 + .endif
916 + subs N, N, #32
917 + bhs 110b /* keep going until the subtraction borrows */
918 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
919 + preload_trailing backwards, S, N, OFF /* OFF is passed as the scratch register, so its old value is lost here */
920 + add N, N, #(prefetch_distance+2)*32 - 32
921 +120:
922 + .if align == 0
923 + .if backwards
924 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
925 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
926 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
927 + .else
928 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
929 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
930 + stmia D!, {DAT4, DAT5, DAT6, LAST}
931 + .endif
932 + .else
933 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
934 + .endif
935 + subs N, N, #32
936 + bhs 120b
937 + tst N, #16 /* one more 16-byte block if bit 4 of N is set */
938 + .if align == 0
939 + .if backwards
940 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
941 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
942 + .else
943 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
944 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
945 + .endif
946 + .else
947 + beq 130f
948 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
949 +130:
950 + .endif
951 + /* Trailing words and bytes */
952 + tst N, #15
953 + beq 199f
954 + .if align != 0
955 + add S, S, #align /* undo the -align adjustment made to S at the top of the macro */
956 + .endif
957 + memcpy_trailing_15bytes backwards, align
958 +199:
959 + pop {DAT3, DAT4, DAT5, DAT6, DAT7} /* NOTE(review): the matching push is not in this macro; presumably done by the caller before entering the long path - confirm */
960 + pop {D, DAT1, DAT2, pc}
961 +.endm
962 +
963 +.macro memcpy_medium_inner_loop backwards, align /* Medium-case body: 16 bytes per iteration without preloading, then the tail; ends by popping D, DAT1, DAT2 and returning */
964 +120:
965 + .if backwards
966 + .if align == 0
967 + ldmdb S!, {DAT0, DAT1, DAT2, LAST}
968 + .else
969 + ldr LAST, [S, #-4]! /* non-zero align: single-word loads instead of ldm */
970 + ldr DAT2, [S, #-4]!
971 + ldr DAT1, [S, #-4]!
972 + ldr DAT0, [S, #-4]!
973 + .endif
974 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
975 + .else
976 + .if align == 0
977 + ldmia S!, {DAT0, DAT1, DAT2, LAST}
978 + .else
979 + ldr DAT0, [S], #4 /* non-zero align: single-word loads instead of ldm */
980 + ldr DAT1, [S], #4
981 + ldr DAT2, [S], #4
982 + ldr LAST, [S], #4
983 + .endif
984 + stmia D!, {DAT0, DAT1, DAT2, LAST}
985 + .endif
986 + subs N, N, #16
987 + bhs 120b /* keep going until the subtraction borrows */
988 + /* Trailing words and bytes */
989 + tst N, #15
990 + beq 199f
991 + memcpy_trailing_15bytes backwards, align
992 +199:
993 + pop {D, DAT1, DAT2, pc}
994 +.endm
995 +
996 +.macro memcpy_short_inner_loop backwards, align /* short-case tail: at most one 16-byte block, then the trailing bytes */
997 + tst N, #16 /* NE below means bit 4 of N is set, i.e. one 16-byte block is copied */
998 + .if backwards
999 + .if align == 0
1000 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
1001 + .else
1002 + ldrne LAST, [S, #-4]! /* align != 0: four single-word loads replace the ldm */
1003 + ldrne DAT2, [S, #-4]!
1004 + ldrne DAT1, [S, #-4]!
1005 + ldrne DAT0, [S, #-4]!
1006 + .endif
1007 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
1008 + .else
1009 + .if align == 0
1010 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
1011 + .else
1012 + ldrne DAT0, [S], #4 /* align != 0: four single-word loads replace the ldm */
1013 + ldrne DAT1, [S], #4
1014 + ldrne DAT2, [S], #4
1015 + ldrne LAST, [S], #4
1016 + .endif
1017 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
1018 + .endif
1019 + memcpy_trailing_15bytes backwards, align
1020 +199: /* also the exit target for the short-case branches in the memcpy macro */
1021 + pop {D, DAT1, DAT2, pc} /* restore the registers pushed on entry to memcpy; loading pc returns */
1022 +.endm
1023 +
1024 +.macro memcpy backwards /* whole copy routine body; backwards=1 copies from the end of the buffers downwards */
1025 + D .req a1 /* destination pointer */
1026 + S .req a2 /* source pointer */
1027 + N .req a3 /* number of bytes still to copy */
1028 + DAT0 .req a4
1029 + DAT1 .req v1
1030 + DAT2 .req v2
1031 + DAT3 .req v3
1032 + DAT4 .req v4
1033 + DAT5 .req v5
1034 + DAT6 .req v6
1035 + DAT7 .req sl
1036 + LAST .req ip
1037 + OFF .req lr /* prefetch offset; lr is reusable because it is pushed below */
1038 +
1039 + .cfi_startproc
1040 +
1041 + push {D, DAT1, DAT2, lr} /* D is saved so the final pop hands the original a1 back to the caller */
1042 +
1043 + .cfi_def_cfa_offset 16
1044 + .cfi_rel_offset D, 0
1045 + .cfi_undefined S
1046 + .cfi_undefined N
1047 + .cfi_undefined DAT0
1048 + .cfi_rel_offset DAT1, 4
1049 + .cfi_rel_offset DAT2, 8
1050 + .cfi_undefined LAST
1051 + .cfi_rel_offset lr, 12
1052 +
1053 + .if backwards
1054 + add D, D, N /* backwards copy starts one past the end of each buffer */
1055 + add S, S, N
1056 + .endif
1057 +
1058 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1059 + cmp N, #31
1060 + blo 170f
1061 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1062 + cmp N, #(prefetch_distance+3)*32 - 1
1063 + blo 160f
1064 +
1065 + /* Long case */
1066 + push {DAT3, DAT4, DAT5, DAT6, DAT7} /* extra data registers are only needed by the long inner loop */
1067 +
1068 + .cfi_def_cfa_offset 36
1069 + .cfi_rel_offset D, 20
1070 + .cfi_rel_offset DAT1, 24
1071 + .cfi_rel_offset DAT2, 28
1072 + .cfi_rel_offset DAT3, 0
1073 + .cfi_rel_offset DAT4, 4
1074 + .cfi_rel_offset DAT5, 8
1075 + .cfi_rel_offset DAT6, 12
1076 + .cfi_rel_offset DAT7, 16
1077 + .cfi_rel_offset lr, 32
1078 +
1079 + /* Adjust N so that the decrement instruction can also test for
1080 + * inner loop termination. We want it to stop when there are
1081 + * (prefetch_distance+1) complete blocks to go. */
1082 + sub N, N, #(prefetch_distance+2)*32
1083 + preload_leading_step1 backwards, DAT0, S
1084 + .if backwards
1085 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1086 + * ands DAT2, D, #60, 2
1087 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
1088 + */
1089 + .word 0xE210513C
1090 + beq 154f
1091 + .else
1092 + ands DAT2, D, #15
1093 + beq 154f
1094 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1095 + .endif
1096 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1097 + memcpy_leading_15bytes backwards, 1
1098 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1099 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1100 + .if backwards
1101 + rsb OFF, S, #3
1102 + and OFF, OFF, #28
1103 + sub OFF, OFF, #32*(prefetch_distance+1)
1104 + .else
1105 + and OFF, S, #28
1106 + rsb OFF, OFF, #32*prefetch_distance
1107 + .endif
1108 + movs DAT0, S, lsl #31 /* C = bit 1 of S, N flag = bit 0 of S: dispatch on source word alignment */
1109 + bhi 157f /* bits 1 and 0 both set: S & 3 == 3 */
1110 + bcs 156f /* only bit 1 set: S & 3 == 2 */
1111 + bmi 155f /* only bit 0 set: S & 3 == 1 */
1112 + memcpy_long_inner_loop backwards, 0
1113 +155: memcpy_long_inner_loop backwards, 1
1114 +156: memcpy_long_inner_loop backwards, 2
1115 +157: memcpy_long_inner_loop backwards, 3
1116 +
1117 + .cfi_def_cfa_offset 16
1118 + .cfi_rel_offset D, 0
1119 + .cfi_rel_offset DAT1, 4
1120 + .cfi_rel_offset DAT2, 8
1121 + .cfi_same_value DAT3
1122 + .cfi_same_value DAT4
1123 + .cfi_same_value DAT5
1124 + .cfi_same_value DAT6
1125 + .cfi_same_value DAT7
1126 + .cfi_rel_offset lr, 12
1127 +
1128 +160: /* Medium case */
1129 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1130 + sub N, N, #16 /* simplifies inner loop termination */
1131 + .if backwards
1132 + ands DAT2, D, #15
1133 + beq 164f
1134 + .else
1135 + ands DAT2, D, #15
1136 + beq 164f
1137 + rsb DAT2, DAT2, #16
1138 + .endif
1139 + memcpy_leading_15bytes backwards, align /* NOTE(review): 'align' is not a parameter of this macro - confirm what the callee receives here */
1140 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1141 + tst S, #3
1142 + bne 140f /* source not word-aligned: use the align=1 variant */
1143 + memcpy_medium_inner_loop backwards, 0
1144 +140: memcpy_medium_inner_loop backwards, 1
1145 +
1146 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1147 + teq N, #0
1148 + beq 199f /* nothing to copy */
1149 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1150 + tst D, #3
1151 + beq 174f
1152 +172: subs N, N, #1 /* byte-at-a-time until D is word-aligned or N runs out */
1153 + blo 199f
1154 + .if backwards
1155 + ldrb DAT0, [S, #-1]!
1156 + strb DAT0, [D, #-1]!
1157 + .else
1158 + ldrb DAT0, [S], #1
1159 + strb DAT0, [D], #1
1160 + .endif
1161 + tst D, #3
1162 + bne 172b
1163 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1164 + tst S, #3
1165 + bne 140f /* source not word-aligned: use the align=1 variant */
1166 + memcpy_short_inner_loop backwards, 0
1167 +140: memcpy_short_inner_loop backwards, 1
1168 +
1169 + .cfi_endproc
1170 +
1171 + .unreq D
1172 + .unreq S
1173 + .unreq N
1174 + .unreq DAT0
1175 + .unreq DAT1
1176 + .unreq DAT2
1177 + .unreq DAT3
1178 + .unreq DAT4
1179 + .unreq DAT5
1180 + .unreq DAT6
1181 + .unreq DAT7
1182 + .unreq LAST
1183 + .unreq OFF
1184 +.endm
1185 --- /dev/null
1186 +++ b/arch/arm/lib/memmove_rpi.S
1187 @@ -0,0 +1,61 @@
1188 +/*
1189 +Copyright (c) 2013, Raspberry Pi Foundation
1190 +Copyright (c) 2013, RISC OS Open Ltd
1191 +All rights reserved.
1192 +
1193 +Redistribution and use in source and binary forms, with or without
1194 +modification, are permitted provided that the following conditions are met:
1195 + * Redistributions of source code must retain the above copyright
1196 + notice, this list of conditions and the following disclaimer.
1197 + * Redistributions in binary form must reproduce the above copyright
1198 + notice, this list of conditions and the following disclaimer in the
1199 + documentation and/or other materials provided with the distribution.
1200 + * Neither the name of the copyright holder nor the
1201 + names of its contributors may be used to endorse or promote products
1202 + derived from this software without specific prior written permission.
1203 +
1204 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1205 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1206 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1207 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1208 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1209 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1210 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1211 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1212 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1213 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1214 +*/
1215 +
1216 +#include <linux/linkage.h>
1217 +#include "arm-mem.h"
1218 +#include "memcpymove.h"
1219 +
1220 +/* Prevent the stack from becoming executable */
1221 +#if defined(__linux__) && defined(__ELF__)
1222 +.section .note.GNU-stack,"",%progbits
1223 +#endif
1224 +
1225 + .text
1226 + .arch armv6
1227 + .object_arch armv4
1228 + .arm
1229 + .altmacro
1230 + .p2align 2
1231 +
1232 +/*
1233 + * void *memmove(void *s1, const void *s2, size_t n);
1234 + * On entry:
1235 + * a1 = pointer to destination
1236 + * a2 = pointer to source
1237 + * a3 = number of bytes to copy
1238 + * On exit:
1239 + * a1 preserved
1240 + */
1241 +
1242 +.set prefetch_distance, 3
1243 +
1244 +ENTRY(memmove)
1245 + cmp a2, a1 /* flags from source - destination */
1246 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
1247 + memcpy 1 /* otherwise fall into the copy macro expanded with backwards=1 */
1248 +ENDPROC(memmove)
1249 --- /dev/null
1250 +++ b/arch/arm/lib/memset_rpi.S
1251 @@ -0,0 +1,123 @@
1252 +/*
1253 +Copyright (c) 2013, Raspberry Pi Foundation
1254 +Copyright (c) 2013, RISC OS Open Ltd
1255 +All rights reserved.
1256 +
1257 +Redistribution and use in source and binary forms, with or without
1258 +modification, are permitted provided that the following conditions are met:
1259 + * Redistributions of source code must retain the above copyright
1260 + notice, this list of conditions and the following disclaimer.
1261 + * Redistributions in binary form must reproduce the above copyright
1262 + notice, this list of conditions and the following disclaimer in the
1263 + documentation and/or other materials provided with the distribution.
1264 + * Neither the name of the copyright holder nor the
1265 + names of its contributors may be used to endorse or promote products
1266 + derived from this software without specific prior written permission.
1267 +
1268 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1269 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1270 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1271 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1272 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1273 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1274 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1275 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1276 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1277 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1278 +*/
1279 +
1280 +#include <linux/linkage.h>
1281 +#include "arm-mem.h"
1282 +
1283 +/* Prevent the stack from becoming executable */
1284 +#if defined(__linux__) && defined(__ELF__)
1285 +.section .note.GNU-stack,"",%progbits
1286 +#endif
1287 +
1288 + .text
1289 + .arch armv6
1290 + .object_arch armv4
1291 + .arm
1292 + .altmacro
1293 + .p2align 2
1294 +
1295 +/*
1296 + * void *memset(void *s, int c, size_t n);
1297 + * On entry:
1298 + * a1 = pointer to buffer to fill
1299 + * a2 = byte pattern to fill with (caller-narrowed)
1300 + * a3 = number of bytes to fill
1301 + * On exit:
1302 + * a1 preserved
1303 + */
1304 +ENTRY(mmioset) /* mmioset and memset share this one body */
1305 +ENTRY(memset)
1306 + S .req a1 /* write pointer, advanced as the fill proceeds */
1307 + DAT0 .req a2 /* fill pattern */
1308 + N .req a3 /* bytes still to fill */
1309 + DAT1 .req a4
1310 + DAT2 .req ip
1311 + DAT3 .req lr /* usable as data once lr has been pushed */
1312 +
1313 + orr DAT0, DAT0, lsl #8 /* copy the fill byte into bits 8-15 */
1314 + push {S, lr} /* keep the original a1 so the final pop returns it */
1315 + orr DAT0, DAT0, lsl #16 /* copy the low halfword into the high halfword */
1316 + mov DAT1, DAT0
1317 +
1318 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1319 + cmp N, #31
1320 + blo 170f
1321 +
1322 +161: sub N, N, #16 /* simplifies inner loop termination */
1323 + /* Leading words and bytes */
1324 + tst S, #15
1325 + beq 164f
1326 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1327 + movs DAT2, DAT3, lsl #31 /* N flag = bit 0 (one byte), C = bit 1 (one halfword) */
1328 + submi N, N, #1
1329 + strmib DAT0, [S], #1
1330 + subcs N, N, #2
1331 + strcsh DAT0, [S], #2
1332 + movs DAT2, DAT3, lsl #29 /* N flag = bit 2 (one word), C = bit 3 (two words) */
1333 + submi N, N, #4
1334 + strmi DAT0, [S], #4
1335 + subcs N, N, #8
1336 + stmcsia S!, {DAT0, DAT1}
1337 +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
1338 + mov DAT2, DAT0
1339 + mov DAT3, DAT0
1340 + /* Now the inner loop of 16-byte stores */
1341 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1342 + subs N, N, #16
1343 + bhs 165b /* keep looping while the subtraction did not borrow */
1344 +166: /* Trailing words and bytes */
1345 + movs N, N, lsl #29 /* C = bit 3 of N (8 bytes), N flag = bit 2 (4 bytes) */
1346 + stmcsia S!, {DAT0, DAT1}
1347 + strmi DAT0, [S], #4
1348 + movs N, N, lsl #2 /* C = original bit 1 (2 bytes), N flag = original bit 0 (1 byte) */
1349 + strcsh DAT0, [S], #2
1350 + strmib DAT0, [S]
1351 +199: pop {S, pc} /* reload the saved a1 and return */
1352 +
1353 +170: /* Short case */
1354 + mov DAT2, DAT0
1355 + mov DAT3, DAT0
1356 + tst S, #3
1357 + beq 174f
1358 +172: subs N, N, #1 /* byte stores until S is word-aligned or N runs out */
1359 + blo 199b
1360 + strb DAT0, [S], #1
1361 + tst S, #3
1362 + bne 172b
1363 +174: tst N, #16 /* bit 4 of N set: one 16-byte store before the shared tail */
1364 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
1365 + b 166b
1366 +
1367 + .unreq S
1368 + .unreq DAT0
1369 + .unreq N
1370 + .unreq DAT1
1371 + .unreq DAT2
1372 + .unreq DAT3
1373 +ENDPROC(memset)
1374 +ENDPROC(mmioset)
1375 --- a/arch/arm/lib/uaccess_with_memcpy.c
1376 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1377 @@ -22,6 +22,14 @@
1378 #include <asm/current.h>
1379 #include <asm/page.h>
1380
1381 +#ifndef COPY_FROM_USER_THRESHOLD /* default size cut-over; a prior definition takes precedence */
1382 +#define COPY_FROM_USER_THRESHOLD 64
1383 +#endif
1384 +
1385 +#ifndef COPY_TO_USER_THRESHOLD /* arm_copy_to_user sends copies shorter than this to __copy_to_user_std */
1386 +#define COPY_TO_USER_THRESHOLD 64
1387 +#endif
1388 +
1389 static int
1390 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1391 {
1392 @@ -84,7 +92,44 @@ pin_page_for_write(const void __user *_a
1393 return 1;
1394 }
1395
1396 -static unsigned long noinline
1397 +static int /* 1: PTE found, still mapped and locked, handed back via ptep and ptlp; 0: nothing is held */
1398 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1399 +{
1400 + unsigned long addr = (unsigned long)_addr;
1401 + pgd_t *pgd;
1402 + pmd_t *pmd;
1403 + pte_t *pte;
1404 + pud_t *pud;
1405 + spinlock_t *ptl;
1406 +
1407 + pgd = pgd_offset(current->mm, addr); /* walk the current task's page tables: pgd, pud, pmd, then pte */
1408 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1409 + {
1410 + return 0;
1411 + }
1412 + pud = pud_offset(pgd, addr);
1413 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1414 + {
1415 + return 0;
1416 + }
1417 +
1418 + pmd = pmd_offset(pud, addr);
1419 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1420 + return 0;
1421 +
1422 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
1423 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) { /* only a present, young PTE is accepted */
1424 + pte_unmap_unlock(pte, ptl);
1425 + return 0;
1426 + }
1427 +
1428 + *ptep = pte; /* success: the caller must pte_unmap_unlock() these */
1429 + *ptlp = ptl;
1430 +
1431 + return 1;
1432 +}
1433 +
1434 +unsigned long noinline
1435 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1436 {
1437 unsigned long ua_flags;
1438 @@ -137,6 +182,57 @@ out:
1439 return n;
1440 }
1441
1442 +unsigned long noinline /* returns the number of bytes that were NOT copied (0 on full success) */
1443 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
1444 +{
1445 + unsigned long ua_flags;
1446 + int atomic;
1447 +
1448 + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { /* kernel segment: copy directly, nothing to pin */
1449 + memcpy(to, (const void *)from, n);
1450 + return 0;
1451 + }
1452 +
1453 + /* the mmap semaphore is taken only if not in an atomic context */
1454 + atomic = in_atomic();
1455 +
1456 + if (!atomic)
1457 + down_read(&current->mm->mmap_sem);
1458 + while (n) { /* one iteration per source page */
1459 + pte_t *pte;
1460 + spinlock_t *ptl;
1461 + int tocopy;
1462 +
1463 + while (!pin_page_for_read(from, &pte, &ptl)) { /* not pinnable yet: touch the page, then retry */
1464 + char temp;
1465 + if (!atomic)
1466 + up_read(&current->mm->mmap_sem);
1467 + if (__get_user(temp, (char __user *)from)) /* one-byte read; non-zero means the access faulted */
1468 + goto out; /* mmap_sem was already dropped just above */
1469 + if (!atomic)
1470 + down_read(&current->mm->mmap_sem);
1471 + }
1472 +
1473 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1; /* bytes from 'from' to the end of its page */
1474 + if (tocopy > n)
1475 + tocopy = n;
1476 +
1477 + ua_flags = uaccess_save_and_enable(); /* user access is enabled only around the memcpy */
1478 + memcpy(to, (const void *)from, tocopy);
1479 + uaccess_restore(ua_flags);
1480 + to += tocopy;
1481 + from += tocopy;
1482 + n -= tocopy;
1483 +
1484 + pte_unmap_unlock(pte, ptl); /* release what pin_page_for_read() took */
1485 + }
1486 + if (!atomic)
1487 + up_read(&current->mm->mmap_sem);
1488 +
1489 +out:
1490 + return n;
1491 +}
1492 +
1493 unsigned long
1494 arm_copy_to_user(void __user *to, const void *from, unsigned long n)
1495 {
1496 @@ -147,7 +243,7 @@ arm_copy_to_user(void __user *to, const
1497 * With frame pointer disabled, tail call optimization kicks in
1498 * as well making this test almost invisible.
1499 */
1500 - if (n < 64) {
1501 + if (n < COPY_TO_USER_THRESHOLD) {
1502 unsigned long ua_flags = uaccess_save_and_enable();
1503 n = __copy_to_user_std(to, from, n);
1504 uaccess_restore(ua_flags);
1505 @@ -156,6 +252,26 @@ arm_copy_to_user(void __user *to, const
1506 }
1507 return n;
1508 }
1509 +
1510 +unsigned long __must_check
1511 +arm_copy_from_user(void *to, const void __user *from, unsigned long n)
1512 +{
1513 + /*
1514 + * Copies shorter than COPY_FROM_USER_THRESHOLD bytes use the
1515 + * standard routine; the test is stubbed out of the main function
1516 + * above to avoid a large register dump on the stack just to
1517 + * reload it right away. With frame pointer disabled, tail call
1518 + * optimization kicks in as well making this test almost invisible.
1519 + */
1520 + if (n < COPY_FROM_USER_THRESHOLD) {
1521 + unsigned long ua_flags = uaccess_save_and_enable();
1522 + n = __copy_from_user_std(to, from, n);
1523 + uaccess_restore(ua_flags);
1524 + } else {
1525 + n = __copy_from_user_memcpy(to, from, n);
1526 + }
1527 + return n;
1528 +}
1529
1530 static unsigned long noinline
1531 __clear_user_memset(void __user *addr, unsigned long n)
1532 --- a/arch/arm/mach-bcm/Kconfig
1533 +++ b/arch/arm/mach-bcm/Kconfig
1534 @@ -174,6 +174,13 @@ config ARCH_BCM_53573
1535 The base chip is BCM53573 and there are some packaging modifications
1536 like BCM47189 and BCM47452.
1537
1538 +config BCM2835_FAST_MEMCPY
1539 + bool "Enable optimized __copy_to_user and __copy_from_user"
1540 + depends on ARCH_BCM2835 && ARCH_MULTI_V6
1541 + default y
1542 + help
1543 + Optimized versions of __copy_to_user and __copy_from_user for Pi1.
1544 +
1545 config ARCH_BCM_63XX
1546 bool "Broadcom BCM63xx DSL SoC"
1547 depends on ARCH_MULTI_V7