e549ce963d892a1f696f98e2e3ec7e7d37307368
[openwrt/staging/stintel.git] /
1 From a013e78cd135c415124496edd439b1031102c33c Mon Sep 17 00:00:00 2001
2 From: popcornmix <popcornmix@gmail.com>
3 Date: Mon, 28 Nov 2016 16:50:04 +0000
4 Subject: [PATCH] Improve __copy_to_user and __copy_from_user
5 performance
6
7 Provide a __copy_from_user that uses memcpy. On BCM2708, use
8 optimised memcpy/memmove/memcmp/memset implementations.
9
10 arch/arm: Add mmiocpy/set aliases for memcpy/set
11
12 See: https://github.com/raspberrypi/linux/issues/1082
13
14 copy_from_user: CPU_SW_DOMAIN_PAN compatibility
15
16 The downstream copy_from_user acceleration must also play nice with
17 CONFIG_CPU_SW_DOMAIN_PAN.
18
19 See: https://github.com/raspberrypi/linux/issues/1381
20
21 Signed-off-by: Phil Elwell <phil@raspberrypi.org>
22
23 Fix copy_from_user if BCM2835_FAST_MEMCPY=n
24
25 The change which introduced CONFIG_BCM2835_FAST_MEMCPY unconditionally
26 changed the behaviour of arm_copy_from_user. The page pinning code
27 is not safe on ARMv7 if LPAE & high memory is enabled and causes
28 crashes which look like PTE corruption.
29
30 Make __copy_from_user_memcpy conditional on CONFIG_BCM2835_FAST_MEMCPY=y
31 which is really an ARMv6 / Pi1 optimization and not necessary on newer
32 ARM processors.
33
34 arm: fix mmap unlocks in uaccess_with_memcpy.c
35
36 This is a regression that was added with the commit 192a4e923ef092924dd013e7326f2ec520ee4783 as of rpi-5.8.y, since that is when the move to the mmap locking API was introduced - d8ed45c5dcd455fc5848d47f86883a1b872ac0d0
37
38 The issue is that when the patch to improve performance of the __copy_to_user and __copy_from_user functions was added for the Raspberry Pi, some of the mmap unlocks were incorrectly mapped to write instead of read. This would cause a variety of issues and, in my case, prevent the booting of a squashfs filesystem on rpi-5.8.y and above. An example of the resulting panic can be seen at https://pastebin.com/raw/jBz5xCzL
39
40 Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
41 Signed-off-by: Christopher Blake <chrisrblake93@gmail.com>
42 ---
43 arch/arm/include/asm/string.h | 5 +
44 arch/arm/include/asm/uaccess.h | 3 +
45 arch/arm/lib/Makefile | 14 +-
46 arch/arm/lib/arm-mem.h | 159 +++++++++
47 arch/arm/lib/copy_from_user.S | 4 +-
48 arch/arm/lib/exports_rpi.c | 37 +++
49 arch/arm/lib/memcmp_rpi.S | 285 ++++++++++++++++
50 arch/arm/lib/memcpy_rpi.S | 61 ++++
51 arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++
52 arch/arm/lib/memmove_rpi.S | 61 ++++
53 arch/arm/lib/memset_rpi.S | 128 ++++++++
54 arch/arm/lib/uaccess_with_memcpy.c | 130 +++++++-
55 arch/arm/mach-bcm/Kconfig | 7 +
56 13 files changed, 1394 insertions(+), 6 deletions(-)
57 create mode 100644 arch/arm/lib/arm-mem.h
58 create mode 100644 arch/arm/lib/exports_rpi.c
59 create mode 100644 arch/arm/lib/memcmp_rpi.S
60 create mode 100644 arch/arm/lib/memcpy_rpi.S
61 create mode 100644 arch/arm/lib/memcpymove.h
62 create mode 100644 arch/arm/lib/memmove_rpi.S
63 create mode 100644 arch/arm/lib/memset_rpi.S
64
65 --- a/arch/arm/include/asm/string.h
66 +++ b/arch/arm/include/asm/string.h
67 @@ -39,4 +39,9 @@ static inline void *memset64(uint64_t *p
68 return __memset64(p, v, n * 8, v >> 32);
69 }
70
71 +#ifdef CONFIG_BCM2835_FAST_MEMCPY
72 +#define __HAVE_ARCH_MEMCMP
73 +extern int memcmp(const void *, const void *, size_t);
74 +#endif
75 +
76 #endif
77 --- a/arch/arm/include/asm/uaccess.h
78 +++ b/arch/arm/include/asm/uaccess.h
79 @@ -518,6 +518,9 @@ do { \
80 extern unsigned long __must_check
81 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
82
83 +extern unsigned long __must_check
84 +__copy_from_user_std(void *to, const void __user *from, unsigned long n);
85 +
86 static inline unsigned long __must_check
87 raw_copy_from_user(void *to, const void __user *from, unsigned long n)
88 {
89 --- a/arch/arm/lib/Makefile
90 +++ b/arch/arm/lib/Makefile
91 @@ -7,8 +7,8 @@
92
93 lib-y := changebit.o csumipv6.o csumpartial.o \
94 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
95 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
96 - memmove.o memset.o setbit.o \
97 + delay.o delay-loop.o findbit.o memchr.o \
98 + setbit.o \
99 strchr.o strrchr.o \
100 testchangebit.o testclearbit.o testsetbit.o \
101 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
102 @@ -25,6 +25,16 @@ else
103 lib-y += backtrace.o
104 endif
105
106 +# Choose optimised implementations for Raspberry Pi
107 +ifeq ($(CONFIG_BCM2835_FAST_MEMCPY),y)
108 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
109 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
110 + obj-$(CONFIG_MODULES) += exports_rpi.o
111 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
112 +else
113 + lib-y += memcpy.o memmove.o memset.o
114 +endif
115 +
116 # using lib_ here won't override already available weak symbols
117 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
118
119 --- /dev/null
120 +++ b/arch/arm/lib/arm-mem.h
121 @@ -0,0 +1,159 @@
122 +/*
123 +Copyright (c) 2013, Raspberry Pi Foundation
124 +Copyright (c) 2013, RISC OS Open Ltd
125 +All rights reserved.
126 +
127 +Redistribution and use in source and binary forms, with or without
128 +modification, are permitted provided that the following conditions are met:
129 + * Redistributions of source code must retain the above copyright
130 + notice, this list of conditions and the following disclaimer.
131 + * Redistributions in binary form must reproduce the above copyright
132 + notice, this list of conditions and the following disclaimer in the
133 + documentation and/or other materials provided with the distribution.
134 + * Neither the name of the copyright holder nor the
135 + names of its contributors may be used to endorse or promote products
136 + derived from this software without specific prior written permission.
137 +
138 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
139 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
140 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
141 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
142 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
143 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
144 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
145 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
146 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
147 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
148 +*/
149 +
150 +.macro myfunc fname /* open a function called fname: .func marker, global symbol, entry label */
151 + .func fname /* assembler function marker for fname */
152 + .global fname /* make the symbol visible outside this object */
153 +fname: /* the entry label itself */
154 +.endm
155 +
156 +.macro preload_leading_step1 backwards, ptr, base /* issue the first prefetch_distance+1 preloads for base; ptr receives the 32-byte-aligned start */
157 +/* If the destination is already 16-byte aligned, then we need to preload
158 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
159 + * are no gaps when the inner loop starts.
160 + */
161 + .if backwards
162 + sub ptr, base, #1 /* descending copy: base - 1 is used as the starting byte */
163 + bic ptr, ptr, #31 /* round down to a 32-byte boundary */
164 + .else
165 + bic ptr, base, #31 /* ascending copy: round base down to a 32-byte boundary */
166 + .endif
167 + .set OFFSET, 0 /* assembly-time offset, stepped by 32 per repetition */
168 + .rept prefetch_distance+1 /* unrolled at assembly time: one pld per repetition */
169 + pld [ptr, #OFFSET]
170 + .if backwards
171 + .set OFFSET, OFFSET-32 /* step towards lower addresses */
172 + .else
173 + .set OFFSET, OFFSET+32 /* step towards higher addresses */
174 + .endif
175 + .endr
176 +.endm
177 +
178 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp /* conditionally issue one extra preload; tmp is scratch */
179 +/* However, if the destination is not 16-byte aligned, we may need to
180 + * preload one more cache line than that. The question we need to ask is:
181 + * are the leading bytes more than the amount by which the source
182 + * pointer will be rounded down for preloading, and if so, by how many
183 + * cache lines?
184 + */
185 + .if backwards
186 +/* Here we compare against how many bytes we are into the
187 + * cache line, counting down from the highest such address.
188 + * Effectively, we want to calculate
189 + * leading_bytes = dst&15
190 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
191 + * extra_needed = leading_bytes - cacheline_offset
192 + * and test if extra_needed is <= 0, or rearranging:
193 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
194 + */
195 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
196 + sbc tmp, tmp, leading_bytes, lsl #32-5 /* subtract leading_bytes at the same scaling; NOTE(review): sbc also consumes the incoming carry flag - confirm callers rely on that for the "-1" above */
197 + adds tmp, tmp, leading_bytes, lsl #32-5 /* add leading_bytes back; carry is set when the scaled sum overflows 32 bits */
198 + bcc 61f /* no carry: no extra cache line needed */
199 + pld [ptr, #-32*(prefetch_distance+1)] /* one line below offsets 0..-32*prefetch_distance from ptr */
200 + .else
201 +/* Effectively, we want to calculate
202 + * leading_bytes = (-dst)&15
203 + * cacheline_offset = (src+leading_bytes)&31
204 + * extra_needed = leading_bytes - cacheline_offset
205 + * and test if extra_needed is <= 0.
206 + */
207 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
208 + add tmp, tmp, leading_bytes, lsl #32-5 /* tmp = cacheline_offset at the same scaling */
209 + rsbs tmp, tmp, leading_bytes, lsl #32-5 /* flags from leading_bytes - cacheline_offset (scaled) */
210 + bls 61f /* lower or same: extra_needed <= 0, skip the extra preload */
211 + pld [ptr, #32*(prefetch_distance+1)] /* one line above offsets 0..32*prefetch_distance from ptr */
212 + .endif
213 +61: /* common exit */
214 +.endm
215 +
216 +.macro preload_trailing backwards, base, remain, tmp /* issue the last 0, 1 or 2 preloads; tmp is scratch */
217 + /* We need either 0, 1 or 2 extra preloads */
218 + .if backwards
219 + rsb tmp, base, #0 /* tmp = -base */
220 + mov tmp, tmp, lsl #32-5 /* keep only its low 5 bits, moved to the top of tmp */
221 + .else
222 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
223 + .endif
224 + adds tmp, tmp, remain, lsl #32-5 /* add the low 5 bits of remain at the same scaling, setting C and Z */
225 + adceqs tmp, tmp, #0
226 + /* The instruction above has two effects: ensures Z is only
227 + * set if C was clear (so Z indicates that both shifted quantities
228 + * were 0), and clears C if Z was set (so C indicates that the sum
229 + * of the shifted quantities was greater and not equal to 32) */
230 + beq 82f /* Z set: no extra preload */
231 + .if backwards
232 + sub tmp, base, #1 /* sub/bic leave the flags alone, so C is still valid for the bcc below */
233 + bic tmp, tmp, #31 /* tmp = (base - 1) rounded down to 32 bytes */
234 + .else
235 + bic tmp, base, #31 /* tmp = base rounded down to 32 bytes; flags are left alone */
236 + .endif
237 + bcc 81f /* C clear: only one preload, enter at label 81 */
238 + .if backwards
239 + pld [tmp, #-32*(prefetch_distance+1)] /* reached only when two preloads are needed */
240 +81:
241 + pld [tmp, #-32*prefetch_distance] /* issued in both the one- and two-preload cases */
242 + .else
243 + pld [tmp, #32*(prefetch_distance+2)] /* reached only when two preloads are needed */
244 +81:
245 + pld [tmp, #32*(prefetch_distance+1)] /* issued in both the one- and two-preload cases */
246 + .endif
247 +82: /* common exit */
248 +.endm
249 +
250 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 /* preload every 32-byte line spanned by remain << shift bytes at base; tmp0 and tmp1 are scratch */
251 + .if backwards
252 + sub tmp0, base, #1
253 + bic tmp0, tmp0, #31 /* tmp0 = line holding byte base - 1 */
254 + pld [tmp0]
255 + sub tmp1, base, remain, lsl #shift /* tmp1 = base - (remain << shift), the lowest address */
256 + .else
257 + bic tmp0, base, #31 /* tmp0 = line holding the first byte */
258 + pld [tmp0]
259 + add tmp1, base, remain, lsl #shift
260 + sub tmp1, tmp1, #1 /* tmp1 = address of the final byte */
261 + .endif
262 + bic tmp1, tmp1, #31 /* tmp1 = line holding the far end */
263 + cmp tmp1, tmp0
264 + beq 92f /* both ends share one line, which is already preloaded */
265 + .if narrow_case
266 + /* In this case, all the data fits in either 1 or 2 cache lines */
267 + pld [tmp1]
268 + .else
269 +91: /* walk tmp0 one line at a time until it reaches tmp1 */
270 + .if backwards
271 + sub tmp0, tmp0, #32
272 + .else
273 + add tmp0, tmp0, #32
274 + .endif
275 + cmp tmp0, tmp1
276 + pld [tmp0] /* pld sits between cmp and bne, so the final line tmp1 is preloaded too */
277 + bne 91b
278 + .endif
279 +92: /* common exit */
280 +.endm
281 --- a/arch/arm/lib/copy_from_user.S
282 +++ b/arch/arm/lib/copy_from_user.S
283 @@ -107,7 +107,8 @@
284
285 .text
286
287 -ENTRY(arm_copy_from_user)
288 +ENTRY(__copy_from_user_std)
289 +WEAK(arm_copy_from_user)
290 #ifdef CONFIG_CPU_SPECTRE
291 get_thread_info r3
292 ldr r3, [r3, #TI_ADDR_LIMIT]
293 @@ -117,6 +118,7 @@ ENTRY(arm_copy_from_user)
294 #include "copy_template.S"
295
296 ENDPROC(arm_copy_from_user)
297 +ENDPROC(__copy_from_user_std)
298
299 .pushsection .text.fixup,"ax"
300 .align 0
301 --- /dev/null
302 +++ b/arch/arm/lib/exports_rpi.c
303 @@ -0,0 +1,37 @@
304 +/**
305 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
306 + *
307 + * Redistribution and use in source and binary forms, with or without
308 + * modification, are permitted provided that the following conditions
309 + * are met:
310 + * 1. Redistributions of source code must retain the above copyright
311 + * notice, this list of conditions, and the following disclaimer,
312 + * without modification.
313 + * 2. Redistributions in binary form must reproduce the above copyright
314 + * notice, this list of conditions and the following disclaimer in the
315 + * documentation and/or other materials provided with the distribution.
316 + * 3. The names of the above-listed copyright holders may not be used
317 + * to endorse or promote products derived from this software without
318 + * specific prior written permission.
319 + *
320 + * ALTERNATIVELY, this software may be distributed under the terms of the
321 + * GNU General Public License ("GPL") version 2, as published by the Free
322 + * Software Foundation.
323 + *
324 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
325 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
326 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
327 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
328 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
329 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
330 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
331 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
332 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
333 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
334 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
335 + */
336 +
337 +#include <linux/kernel.h>
338 +#include <linux/module.h>
339 +
340 +EXPORT_SYMBOL(memcmp);
341 --- /dev/null
342 +++ b/arch/arm/lib/memcmp_rpi.S
343 @@ -0,0 +1,285 @@
344 +/*
345 +Copyright (c) 2013, Raspberry Pi Foundation
346 +Copyright (c) 2013, RISC OS Open Ltd
347 +All rights reserved.
348 +
349 +Redistribution and use in source and binary forms, with or without
350 +modification, are permitted provided that the following conditions are met:
351 + * Redistributions of source code must retain the above copyright
352 + notice, this list of conditions and the following disclaimer.
353 + * Redistributions in binary form must reproduce the above copyright
354 + notice, this list of conditions and the following disclaimer in the
355 + documentation and/or other materials provided with the distribution.
356 + * Neither the name of the copyright holder nor the
357 + names of its contributors may be used to endorse or promote products
358 + derived from this software without specific prior written permission.
359 +
360 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
361 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
362 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
363 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
364 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
365 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
366 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
367 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
368 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
369 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
370 +*/
371 +
372 +#include <linux/linkage.h>
373 +#include "arm-mem.h"
374 +
375 +/* Prevent the stack from becoming executable */
376 +#if defined(__linux__) && defined(__ELF__)
377 +.section .note.GNU-stack,"",%progbits
378 +#endif
379 +
380 + .text
381 + .arch armv6
382 + .object_arch armv4
383 + .arm
384 + .altmacro
385 + .p2align 2
386 +
387 +.macro memcmp_process_head unaligned /* load the next 16 bytes of each buffer: S_1 into DAT0-DAT3, S_2 into DAT4-DAT7 */
388 + .if unaligned
389 + ldr DAT0, [S_1], #4 /* unaligned variant: four single-word loads with post-increment */
390 + ldr DAT1, [S_1], #4
391 + ldr DAT2, [S_1], #4
392 + ldr DAT3, [S_1], #4
393 + .else
394 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3} /* aligned variant: one load-multiple with writeback */
395 + .endif
396 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7} /* S_2 is loaded with load-multiple in both variants */
397 +.endm
398 +
399 +.macro memcmp_process_tail /* compare the four word pairs loaded by memcmp_process_head */
400 + cmp DAT0, DAT4
401 + cmpeq DAT1, DAT5 /* each cmpeq runs only while every earlier pair was equal */
402 + cmpeq DAT2, DAT6
403 + cmpeq DAT3, DAT7
404 + bne 200f /* flags belong to the first differing pair; label 200 in memcmp turns them into the sign */
405 +.endm
406 +
407 +.macro memcmp_leading_31bytes /* compare the leading bytes selected by bits 0-4 of OFF, reducing N to match */
408 + movs DAT0, OFF, lsl #31 /* MI <- OFF bit 0 (one byte), CS <- OFF bit 1 (one halfword); DAT0 is only scratch here */
409 + ldrmib DAT0, [S_1], #1
410 + ldrcsh DAT1, [S_1], #2
411 + ldrmib DAT4, [S_2], #1
412 + ldrcsh DAT5, [S_2], #2
413 + movpl DAT0, #0 /* zero whatever was not loaded so the compare below ignores it */
414 + movcc DAT1, #0
415 + movpl DAT4, #0
416 + movcc DAT5, #0
417 + submi N, N, #1
418 + subcs N, N, #2
419 + cmp DAT0, DAT4
420 + cmpeq DAT1, DAT5
421 + bne 200f /* mismatch: label 200 in memcmp derives the sign */
422 + movs DAT0, OFF, lsl #29 /* MI <- OFF bit 2 (one word), CS <- OFF bit 3 (two words) */
423 + ldrmi DAT0, [S_1], #4
424 + ldrcs DAT1, [S_1], #4
425 + ldrcs DAT2, [S_1], #4
426 + ldrmi DAT4, [S_2], #4
427 + ldmcsia S_2!, {DAT5, DAT6}
428 + movpl DAT0, #0 /* again zero the registers that were not loaded */
429 + movcc DAT1, #0
430 + movcc DAT2, #0
431 + movpl DAT4, #0
432 + movcc DAT5, #0
433 + movcc DAT6, #0
434 + submi N, N, #4
435 + subcs N, N, #8
436 + cmp DAT0, DAT4
437 + cmpeq DAT1, DAT5
438 + cmpeq DAT2, DAT6
439 + bne 200f
440 + tst OFF, #16 /* OFF bit 4: one further 16-byte chunk */
441 + beq 105f
442 + memcmp_process_head 1 /* S_1 is loaded word by word (unaligned variant) */
443 + sub N, N, #16
444 + memcmp_process_tail
445 +105: /* leading bytes done */
446 +.endm
447 +
448 +.macro memcmp_trailing_15bytes unaligned /* compare the final bytes selected by bits 0-3 of N; N is destroyed */
449 + movs N, N, lsl #29 /* CS <- N bit 3 (8 bytes), MI <- N bit 2 (4 bytes) */
450 + .if unaligned
451 + ldrcs DAT0, [S_1], #4 /* unaligned variant: single-word loads from S_1 */
452 + ldrcs DAT1, [S_1], #4
453 + .else
454 + ldmcsia S_1!, {DAT0, DAT1}
455 + .endif
456 + ldrmi DAT2, [S_1], #4
457 + ldmcsia S_2!, {DAT4, DAT5}
458 + ldrmi DAT6, [S_2], #4
459 + movcc DAT0, #0 /* zero whatever was not loaded so the compare below ignores it */
460 + movcc DAT1, #0
461 + movpl DAT2, #0
462 + movcc DAT4, #0
463 + movcc DAT5, #0
464 + movpl DAT6, #0
465 + cmp DAT0, DAT4
466 + cmpeq DAT1, DAT5
467 + cmpeq DAT2, DAT6
468 + bne 200f /* mismatch: label 200 in memcmp derives the sign */
469 + movs N, N, lsl #2 /* CS <- original N bit 1 (halfword), MI <- original N bit 0 (byte) */
470 + ldrcsh DAT0, [S_1], #2
471 + ldrmib DAT1, [S_1] /* last byte: loaded without advancing the pointer */
472 + ldrcsh DAT4, [S_2], #2
473 + ldrmib DAT5, [S_2]
474 + movcc DAT0, #0
475 + movpl DAT1, #0
476 + movcc DAT4, #0
477 + movpl DAT5, #0
478 + cmp DAT0, DAT4
479 + cmpeq DAT1, DAT5
480 + bne 200f
481 +.endm
482 +
483 +.macro memcmp_long_inner_loop unaligned /* main loop for long compares, then the tail; ends by returning 0 */
484 +110: /* 32 bytes per pass, as two 16-byte halves with one preload each */
485 + memcmp_process_head unaligned
486 + pld [S_2, #prefetch_distance*32 + 16] /* prefetch ahead in the second buffer */
487 + memcmp_process_tail
488 + memcmp_process_head unaligned
489 + pld [S_1, OFF] /* prefetch ahead in the first buffer; memcmp set OFF from the alignment of S_1 */
490 + memcmp_process_tail
491 + subs N, N, #32
492 + bhs 110b /* N was biased by memcmp, so this stops while blocks still remain */
493 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
494 + * deal with final preloads */
495 + preload_trailing 0, S_1, N, DAT0
496 + preload_trailing 0, S_2, N, DAT0
497 + add N, N, #(prefetch_distance+2)*32 - 16 /* give back the (prefetch_distance+2)*32 that memcmp subtracted, less 16 for the loop test below */
498 +120: /* remaining 16-byte chunks, with no further preloads */
499 + memcmp_process_head unaligned
500 + memcmp_process_tail
501 + subs N, N, #16
502 + bhs 120b
503 + /* Trailing words and bytes */
504 + tst N, #15
505 + beq 199f
506 + memcmp_trailing_15bytes unaligned
507 +199: /* Reached end without detecting a difference */
508 + mov a1, #0
509 + setend le /* back to little-endian data accesses before returning */
510 + pop {DAT1-DAT6, pc} /* restore the registers pushed by memcmp and return */
511 +.endm
512 +
513 +.macro memcmp_short_inner_loop unaligned /* loop for short compares (no preloads), then the tail; ends by returning 0 */
514 + subs N, N, #16 /* simplifies inner loop termination */
515 + blo 122f /* fewer than 16 bytes: go straight to the tail */
516 +120: /* 16 bytes per pass */
517 + memcmp_process_head unaligned
518 + memcmp_process_tail
519 + subs N, N, #16
520 + bhs 120b
521 +122: /* Trailing words and bytes */
522 + tst N, #15
523 + beq 199f
524 + memcmp_trailing_15bytes unaligned
525 +199: /* Reached end without detecting a difference */
526 + mov a1, #0
527 + setend le /* back to little-endian data accesses before returning */
528 + pop {DAT1-DAT6, pc} /* restore the registers pushed by memcmp and return */
529 +.endm
530 +
531 +/*
532 + * int memcmp(const void *s1, const void *s2, size_t n);
533 + * On entry:
534 + * a1 = pointer to buffer 1
535 + * a2 = pointer to buffer 2
536 + * a3 = number of bytes to compare (as unsigned chars)
537 + * On exit:
538 + * a1 = >0/=0/<0 if s1 >/=/< s2
539 + */
540 +
541 +.set prefetch_distance, 2
542 +
543 +ENTRY(memcmp)
544 + S_1 .req a1 /* first buffer pointer; also receives the return value */
545 + S_2 .req a2 /* second buffer pointer */
546 + N .req a3 /* bytes left to compare */
547 + DAT0 .req a4
548 + DAT1 .req v1
549 + DAT2 .req v2
550 + DAT3 .req v3
551 + DAT4 .req v4
552 + DAT5 .req v5
553 + DAT6 .req v6
554 + DAT7 .req ip
555 + OFF .req lr /* lr doubles as a work register; it is saved by the push below */
556 +
557 + push {DAT1-DAT6, lr}
558 + setend be /* lowest-addressed bytes are most significant */
559 +
560 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
561 + cmp N, #(prefetch_distance+3)*32 - 1
562 + blo 170f /* too short for the preloading loop: take the short case */
563 +
564 + /* Long case */
565 + /* Adjust N so that the decrement instruction can also test for
566 + * inner loop termination. We want it to stop when there are
567 + * (prefetch_distance+1) complete blocks to go. */
568 + sub N, N, #(prefetch_distance+2)*32
569 + preload_leading_step1 0, DAT0, S_1
570 + preload_leading_step1 0, DAT1, S_2
571 + tst S_2, #31
572 + beq 154f /* S_2 already 32-byte aligned: no leading bytes */
573 + rsb OFF, S_2, #0 /* no need to AND with 31 here: only bits 0-4 of OFF are examined below */
574 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
575 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
576 + memcmp_leading_31bytes
577 +154: /* Second source now cacheline (32-byte) aligned; we have at
578 + * least one prefetch to go. */
579 + /* Prefetch offset is best selected such that it lies in the
580 + * first 8 of each 32 bytes - but it's just as easy to aim for
581 + * the first one */
582 + and OFF, S_1, #31
583 + rsb OFF, OFF, #32*prefetch_distance /* OFF = 32*prefetch_distance - (S_1 & 31), used by pld [S_1, OFF] */
584 + tst S_1, #3
585 + bne 140f /* S_1 not word aligned: use the unaligned variant */
586 + memcmp_long_inner_loop 0 /* does not fall through: the macro ends by popping pc */
587 +140: memcmp_long_inner_loop 1
588 +
589 +170: /* Short case */
590 + teq N, #0
591 + beq 199f /* zero length: label 199 in the expansion below returns 0 */
592 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
593 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
594 + tst S_2, #3
595 + beq 174f
596 +172: subs N, N, #1 /* byte-at-a-time loop until S_2 is word aligned */
597 + blo 199f /* ran out of bytes with no difference */
598 + ldrb DAT0, [S_1], #1
599 + ldrb DAT4, [S_2], #1
600 + cmp DAT0, DAT4
601 + bne 200f
602 + tst S_2, #3
603 + bne 172b
604 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
605 + tst S_1, #3
606 + bne 140f /* S_1 not word aligned: use the unaligned variant */
607 + memcmp_short_inner_loop 0 /* does not fall through: the macro ends by popping pc */
608 +140: memcmp_short_inner_loop 1
609 +
610 +200: /* Difference found: determine sign. */
611 + movhi a1, #1 /* flags come from the cmp that detected the difference (unsigned compare) */
612 + movlo a1, #-1
613 + setend le /* back to little-endian data accesses before returning */
614 + pop {DAT1-DAT6, pc}
615 +
616 + .unreq S_1
617 + .unreq S_2
618 + .unreq N
619 + .unreq DAT0
620 + .unreq DAT1
621 + .unreq DAT2
622 + .unreq DAT3
623 + .unreq DAT4
624 + .unreq DAT5
625 + .unreq DAT6
626 + .unreq DAT7
627 + .unreq OFF
628 +ENDPROC(memcmp)
629 --- /dev/null
630 +++ b/arch/arm/lib/memcpy_rpi.S
631 @@ -0,0 +1,61 @@
632 +/*
633 +Copyright (c) 2013, Raspberry Pi Foundation
634 +Copyright (c) 2013, RISC OS Open Ltd
635 +All rights reserved.
636 +
637 +Redistribution and use in source and binary forms, with or without
638 +modification, are permitted provided that the following conditions are met:
639 + * Redistributions of source code must retain the above copyright
640 + notice, this list of conditions and the following disclaimer.
641 + * Redistributions in binary form must reproduce the above copyright
642 + notice, this list of conditions and the following disclaimer in the
643 + documentation and/or other materials provided with the distribution.
644 + * Neither the name of the copyright holder nor the
645 + names of its contributors may be used to endorse or promote products
646 + derived from this software without specific prior written permission.
647 +
648 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
649 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
650 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
651 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
652 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
653 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
654 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
655 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
656 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
657 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
658 +*/
659 +
660 +#include <linux/linkage.h>
661 +#include "arm-mem.h"
662 +#include "memcpymove.h"
663 +
664 +/* Prevent the stack from becoming executable */
665 +#if defined(__linux__) && defined(__ELF__)
666 +.section .note.GNU-stack,"",%progbits
667 +#endif
668 +
669 + .text
670 + .arch armv6
671 + .object_arch armv4
672 + .arm
673 + .altmacro
674 + .p2align 2
675 +
676 +/*
677 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
678 + * On entry:
679 + * a1 = pointer to destination
680 + * a2 = pointer to source
681 + * a3 = number of bytes to copy
682 + * On exit:
683 + * a1 preserved
684 + */
685 +
686 +.set prefetch_distance, 3
687 +
688 +ENTRY(mmiocpy) /* mmiocpy is an alias: both entry labels precede the same body */
689 +ENTRY(memcpy)
690 + memcpy 0 /* body is generated by the memcpy macro; NOTE(review): its definition is outside the visible part of memcpymove.h, and the 0 presumably selects the forwards copy - confirm */
691 +ENDPROC(memcpy)
692 +ENDPROC(mmiocpy)
693 --- /dev/null
694 +++ b/arch/arm/lib/memcpymove.h
695 @@ -0,0 +1,506 @@
696 +/*
697 +Copyright (c) 2013, Raspberry Pi Foundation
698 +Copyright (c) 2013, RISC OS Open Ltd
699 +All rights reserved.
700 +
701 +Redistribution and use in source and binary forms, with or without
702 +modification, are permitted provided that the following conditions are met:
703 + * Redistributions of source code must retain the above copyright
704 + notice, this list of conditions and the following disclaimer.
705 + * Redistributions in binary form must reproduce the above copyright
706 + notice, this list of conditions and the following disclaimer in the
707 + documentation and/or other materials provided with the distribution.
708 + * Neither the name of the copyright holder nor the
709 + names of its contributors may be used to endorse or promote products
710 + derived from this software without specific prior written permission.
711 +
712 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
713 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
714 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
715 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
716 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
717 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
718 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
719 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
720 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
721 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
722 +*/
723 +
724 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 /* copy 1, 2, 4 or 8 words from S to D, building each stored word from two adjacent loaded words shifted by align*8 and 32-align*8 bits */
725 + .if words == 1 /* forwards: r1 holds the previously loaded word on entry; backwards: r0 does */
726 + .if backwards
727 + mov r1, r0, lsl #32-align*8
728 + ldr r0, [S, #-4]!
729 + orr r1, r1, r0, lsr #align*8
730 + str r1, [D, #-4]!
731 + .else
732 + mov r0, r1, lsr #align*8
733 + ldr r1, [S, #4]!
734 + orr r0, r0, r1, lsl #32-align*8
735 + str r0, [D], #4
736 + .endif
737 + .elseif words == 2 /* forwards: r2 is the carried-in word; backwards: r0 */
738 + .if backwards
739 + ldr r1, [S, #-4]!
740 + mov r2, r0, lsl #32-align*8
741 + ldr r0, [S, #-4]!
742 + orr r2, r2, r1, lsr #align*8
743 + mov r1, r1, lsl #32-align*8
744 + orr r1, r1, r0, lsr #align*8
745 + stmdb D!, {r1, r2}
746 + .else
747 + ldr r1, [S, #4]!
748 + mov r0, r2, lsr #align*8
749 + ldr r2, [S, #4]!
750 + orr r0, r0, r1, lsl #32-align*8
751 + mov r1, r1, lsr #align*8
752 + orr r1, r1, r2, lsl #32-align*8
753 + stmia D!, {r0, r1}
754 + .endif
755 + .elseif words == 4 /* forwards: r4 is the carried-in word; backwards: r0 */
756 + .if backwards
757 + ldmdb S!, {r2, r3}
758 + mov r4, r0, lsl #32-align*8
759 + ldmdb S!, {r0, r1}
760 + orr r4, r4, r3, lsr #align*8
761 + mov r3, r3, lsl #32-align*8
762 + orr r3, r3, r2, lsr #align*8
763 + mov r2, r2, lsl #32-align*8
764 + orr r2, r2, r1, lsr #align*8
765 + mov r1, r1, lsl #32-align*8
766 + orr r1, r1, r0, lsr #align*8
767 + stmdb D!, {r1, r2, r3, r4}
768 + .else
769 + ldmib S!, {r1, r2}
770 + mov r0, r4, lsr #align*8
771 + ldmib S!, {r3, r4}
772 + orr r0, r0, r1, lsl #32-align*8
773 + mov r1, r1, lsr #align*8
774 + orr r1, r1, r2, lsl #32-align*8
775 + mov r2, r2, lsr #align*8
776 + orr r2, r2, r3, lsl #32-align*8
777 + mov r3, r3, lsr #align*8
778 + orr r3, r3, r4, lsl #32-align*8
779 + stmia D!, {r0, r1, r2, r3}
780 + .endif
781 + .elseif words == 8 /* forwards: r8 is the carried-in word; backwards: r0; the only case that consults use_pld */
782 + .if backwards
783 + ldmdb S!, {r4, r5, r6, r7}
784 + mov r8, r0, lsl #32-align*8
785 + ldmdb S!, {r0, r1, r2, r3}
786 + .if use_pld
787 + pld [S, OFF] /* prefetch at register offset OFF from the updated S */
788 + .endif
789 + orr r8, r8, r7, lsr #align*8
790 + mov r7, r7, lsl #32-align*8
791 + orr r7, r7, r6, lsr #align*8
792 + mov r6, r6, lsl #32-align*8
793 + orr r6, r6, r5, lsr #align*8
794 + mov r5, r5, lsl #32-align*8
795 + orr r5, r5, r4, lsr #align*8
796 + mov r4, r4, lsl #32-align*8
797 + orr r4, r4, r3, lsr #align*8
798 + mov r3, r3, lsl #32-align*8
799 + orr r3, r3, r2, lsr #align*8
800 + mov r2, r2, lsl #32-align*8
801 + orr r2, r2, r1, lsr #align*8
802 + mov r1, r1, lsl #32-align*8
803 + orr r1, r1, r0, lsr #align*8
804 + stmdb D!, {r5, r6, r7, r8} /* higher-addressed half is stored first when descending */
805 + stmdb D!, {r1, r2, r3, r4}
806 + .else
807 + ldmib S!, {r1, r2, r3, r4}
808 + mov r0, r8, lsr #align*8
809 + ldmib S!, {r5, r6, r7, r8}
810 + .if use_pld
811 + pld [S, OFF] /* prefetch at register offset OFF from the updated S */
812 + .endif
813 + orr r0, r0, r1, lsl #32-align*8
814 + mov r1, r1, lsr #align*8
815 + orr r1, r1, r2, lsl #32-align*8
816 + mov r2, r2, lsr #align*8
817 + orr r2, r2, r3, lsl #32-align*8
818 + mov r3, r3, lsr #align*8
819 + orr r3, r3, r4, lsl #32-align*8
820 + mov r4, r4, lsr #align*8
821 + orr r4, r4, r5, lsl #32-align*8
822 + mov r5, r5, lsr #align*8
823 + orr r5, r5, r6, lsl #32-align*8
824 + mov r6, r6, lsr #align*8
825 + orr r6, r6, r7, lsl #32-align*8
826 + mov r7, r7, lsr #align*8
827 + orr r7, r7, r8, lsl #32-align*8
828 + stmia D!, {r0, r1, r2, r3} /* r8 is left holding the last loaded word for the next invocation */
829 + stmia D!, {r4, r5, r6, r7}
830 + .endif
831 + .endif
832 +.endm
833 +
834 +.macro memcpy_leading_15bytes backwards, align /* copy the leading bytes selected by bits 0-3 of DAT2 and subtract DAT2 from N */
835 + movs DAT1, DAT2, lsl #31 /* MI <- DAT2 bit 0 (one byte), CS <- DAT2 bit 1 (one halfword); DAT1 is only scratch here */
836 + sub N, N, DAT2 /* no S suffix, so the flags above survive */
837 + .if backwards
838 + ldrmib DAT0, [S, #-1]!
839 + ldrcsh DAT1, [S, #-2]!
840 + strmib DAT0, [D, #-1]!
841 + strcsh DAT1, [D, #-2]!
842 + .else
843 + ldrmib DAT0, [S], #1
844 + ldrcsh DAT1, [S], #2
845 + strmib DAT0, [D], #1
846 + strcsh DAT1, [D], #2
847 + .endif
848 + movs DAT1, DAT2, lsl #29 /* MI <- DAT2 bit 2 (one word), CS <- DAT2 bit 3 (two words) */
849 + .if backwards
850 + ldrmi DAT0, [S, #-4]!
851 + .if align == 0
852 + ldmcsdb S!, {DAT1, DAT2} /* DAT2 may be overwritten now: its bits are already in the flags */
853 + .else
854 + ldrcs DAT2, [S, #-4]! /* align != 0: two single-word loads replace the load-multiple */
855 + ldrcs DAT1, [S, #-4]!
856 + .endif
857 + strmi DAT0, [D, #-4]!
858 + stmcsdb D!, {DAT1, DAT2}
859 + .else
860 + ldrmi DAT0, [S], #4
861 + .if align == 0
862 + ldmcsia S!, {DAT1, DAT2} /* DAT2 may be overwritten now: its bits are already in the flags */
863 + .else
864 + ldrcs DAT1, [S], #4 /* align != 0: two single-word loads replace the load-multiple */
865 + ldrcs DAT2, [S], #4
866 + .endif
867 + strmi DAT0, [D], #4
868 + stmcsia D!, {DAT1, DAT2}
869 + .endif
870 +.endm
871 +
872 +.macro memcpy_trailing_15bytes backwards, align /* copy the final bytes selected by bits 0-3 of N; N is destroyed */
873 + movs N, N, lsl #29 /* CS <- N bit 3 (8 bytes), MI <- N bit 2 (4 bytes) */
874 + .if backwards
875 + .if align == 0
876 + ldmcsdb S!, {DAT0, DAT1}
877 + .else
878 + ldrcs DAT1, [S, #-4]! /* align != 0: two single-word loads replace the load-multiple */
879 + ldrcs DAT0, [S, #-4]!
880 + .endif
881 + ldrmi DAT2, [S, #-4]!
882 + stmcsdb D!, {DAT0, DAT1}
883 + strmi DAT2, [D, #-4]!
884 + .else
885 + .if align == 0
886 + ldmcsia S!, {DAT0, DAT1}
887 + .else
888 + ldrcs DAT0, [S], #4 /* align != 0: two single-word loads replace the load-multiple */
889 + ldrcs DAT1, [S], #4
890 + .endif
891 + ldrmi DAT2, [S], #4
892 + stmcsia D!, {DAT0, DAT1}
893 + strmi DAT2, [D], #4
894 + .endif
895 + movs N, N, lsl #2 /* CS <- original N bit 1 (halfword), MI <- original N bit 0 (byte) */
896 + .if backwards
897 + ldrcsh DAT0, [S, #-2]!
898 + ldrmib DAT1, [S, #-1] /* last byte: no writeback on either pointer */
899 + strcsh DAT0, [D, #-2]!
900 + strmib DAT1, [D, #-1]
901 + .else
902 + ldrcsh DAT0, [S], #2
903 + ldrmib DAT1, [S] /* last byte: no writeback on either pointer */
904 + strcsh DAT0, [D], #2
905 + strmib DAT1, [D]
906 + .endif
907 +.endm
908 +
909 +.macro memcpy_long_inner_loop backwards, align /* main 32-byte loop with preloads, then the tail; ends by popping pc */
910 + .if align != 0
911 + .if backwards
912 + ldr DAT0, [S, #-align]! /* move S back by align bytes and prime DAT0, the carry word unaligned_words reads when descending */
913 + .else
914 + ldr LAST, [S, #-align]! /* move S back by align bytes and prime LAST, the carry word unaligned_words reads when ascending */
915 + .endif
916 + .endif
917 +110: /* 32 bytes per pass, with one preload */
918 + .if align == 0
919 + .if backwards
920 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
921 + pld [S, OFF]
922 + stmdb D!, {DAT4, DAT5, DAT6, LAST} /* higher-addressed half is stored first when descending */
923 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
924 + .else
925 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
926 + pld [S, OFF]
927 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
928 + stmia D!, {DAT4, DAT5, DAT6, LAST}
929 + .endif
930 + .else
931 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST /* use_pld = 1 */
932 + .endif
933 + subs N, N, #32
934 + bhs 110b
935 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
936 + preload_trailing backwards, S, N, OFF /* OFF is passed as the scratch register */
937 + add N, N, #(prefetch_distance+2)*32 - 32 /* NOTE(review): presumably gives back a bias of (prefetch_distance+2)*32 applied by the caller, less 32 for the loop test below - confirm */
938 +120: /* remaining 32-byte blocks, with no further preloads */
939 + .if align == 0
940 + .if backwards
941 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
942 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
943 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
944 + .else
945 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
946 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
947 + stmia D!, {DAT4, DAT5, DAT6, LAST}
948 + .endif
949 + .else
950 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST /* use_pld = 0 */
951 + .endif
952 + subs N, N, #32
953 + bhs 120b
954 + tst N, #16 /* one remaining 16-byte chunk? */
955 + .if align == 0
956 + .if backwards
957 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
958 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
959 + .else
960 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
961 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
962 + .endif
963 + .else
964 + beq 130f
965 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
966 +130:
967 + .endif
968 + /* Trailing words and bytes */
969 + tst N, #15
970 + beq 199f
971 + .if align != 0
972 + add S, S, #align /* undo the -align adjustment made before the loop */
973 + .endif
974 + memcpy_trailing_15bytes backwards, align
975 +199:
976 + pop {DAT3, DAT4, DAT5, DAT6, DAT7}
977 + pop {D, DAT1, DAT2, pc} /* loading pc returns to the caller */
978 +.endm
979 +
980 +.macro memcpy_medium_inner_loop backwards, align /* 16 bytes per pass, no pld inside the loop; align != 0 is used when S is not word-aligned */
981 +120:
982 + .if backwards
983 + .if align == 0
984 + ldmdb S!, {DAT0, DAT1, DAT2, LAST}
985 + .else
986 + ldr LAST, [S, #-4]! /* four single-word loads replace the ldm on this path */
987 + ldr DAT2, [S, #-4]!
988 + ldr DAT1, [S, #-4]!
989 + ldr DAT0, [S, #-4]!
990 + .endif
991 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
992 + .else
993 + .if align == 0
994 + ldmia S!, {DAT0, DAT1, DAT2, LAST}
995 + .else
996 + ldr DAT0, [S], #4 /* four single-word loads replace the ldm on this path */
997 + ldr DAT1, [S], #4
998 + ldr DAT2, [S], #4
999 + ldr LAST, [S], #4
1000 + .endif
1001 + stmia D!, {DAT0, DAT1, DAT2, LAST}
1002 + .endif
1003 + subs N, N, #16
1004 + bhs 120b /* N was lowered by 16 before the loop, so a borrow marks the last full chunk */
1005 + /* Trailing words and bytes */
1006 + tst N, #15 /* low four bits of N hold the remaining byte count */
1007 + beq 199f
1008 + memcpy_trailing_15bytes backwards, align
1009 +199:
1010 + pop {D, DAT1, DAT2, pc} /* restore the saved D, DAT1, DAT2 and return by loading pc */
1011 +.endm
1012 +
1013 +.macro memcpy_short_inner_loop backwards, align /* no loop: at most one conditional 16-byte chunk, then the sub-16-byte tail */
1014 + tst N, #16 /* NE below means bit 4 of N is set, i.e. one 16-byte chunk to copy */
1015 + .if backwards
1016 + .if align == 0
1017 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
1018 + .else
1019 + ldrne LAST, [S, #-4]! /* S not word-aligned: single-word loads replace the ldm */
1020 + ldrne DAT2, [S, #-4]!
1021 + ldrne DAT1, [S, #-4]!
1022 + ldrne DAT0, [S, #-4]!
1023 + .endif
1024 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
1025 + .else
1026 + .if align == 0
1027 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
1028 + .else
1029 + ldrne DAT0, [S], #4 /* S not word-aligned: single-word loads replace the ldm */
1030 + ldrne DAT1, [S], #4
1031 + ldrne DAT2, [S], #4
1032 + ldrne LAST, [S], #4
1033 + .endif
1034 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
1035 + .endif
1036 + memcpy_trailing_15bytes backwards, align
1037 +199: /* also the exit target of the short-case early-out branches */
1038 + pop {D, DAT1, DAT2, pc} /* restore the saved D, DAT1, DAT2 and return by loading pc */
1039 +.endm
1040 +
1041 +.macro memcpy backwards /* whole copy routine; a non-zero backwards selects the descending code paths */
1042 + D .req a1 /* destination pointer */
1043 + S .req a2 /* source pointer */
1044 + N .req a3 /* remaining byte count */
1045 + DAT0 .req a4
1046 + DAT1 .req v1
1047 + DAT2 .req v2
1048 + DAT3 .req v3
1049 + DAT4 .req v4
1050 + DAT5 .req v5
1051 + DAT6 .req v6
1052 + DAT7 .req sl
1053 + LAST .req ip
1054 + OFF .req lr /* preload offset; lr itself is saved by the push below */
1055 +
1056 + .cfi_startproc
1057 +
1058 + push {D, DAT1, DAT2, lr} /* D is saved so the exit pop can restore the original destination */
1059 +
1060 + .cfi_def_cfa_offset 16
1061 + .cfi_rel_offset D, 0
1062 + .cfi_undefined S
1063 + .cfi_undefined N
1064 + .cfi_undefined DAT0
1065 + .cfi_rel_offset DAT1, 4
1066 + .cfi_rel_offset DAT2, 8
1067 + .cfi_undefined LAST
1068 + .cfi_rel_offset lr, 12
1069 +
1070 + .if backwards
1071 + add D, D, N /* descending copy starts one past the end of each buffer */
1072 + add S, S, N
1073 + .endif
1074 +
1075 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1076 + cmp N, #31
1077 + blo 170f
1078 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1079 + cmp N, #(prefetch_distance+3)*32 - 1
1080 + blo 160f
1081 +
1082 + /* Long case */
1083 + push {DAT3, DAT4, DAT5, DAT6, DAT7} /* the long case needs five more data registers */
1084 +
1085 + .cfi_def_cfa_offset 36
1086 + .cfi_rel_offset D, 20
1087 + .cfi_rel_offset DAT1, 24
1088 + .cfi_rel_offset DAT2, 28
1089 + .cfi_rel_offset DAT3, 0
1090 + .cfi_rel_offset DAT4, 4
1091 + .cfi_rel_offset DAT5, 8
1092 + .cfi_rel_offset DAT6, 12
1093 + .cfi_rel_offset DAT7, 16
1094 + .cfi_rel_offset lr, 32
1095 +
1096 + /* Adjust N so that the decrement instruction can also test for
1097 + * inner loop termination. We want it to stop when there are
1098 + * (prefetch_distance+1) complete blocks to go. */
1099 + sub N, N, #(prefetch_distance+2)*32
1100 + preload_leading_step1 backwards, DAT0, S
1101 + .if backwards
1102 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1103 + * ands DAT2, D, #60, 2
1104 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
1105 + */
1106 + .word 0xE210513C
1107 + beq 154f
1108 + .else
1109 + ands DAT2, D, #15
1110 + beq 154f
1111 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1112 + .endif
1113 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1114 + memcpy_leading_15bytes backwards, 1
1115 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1116 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1117 + .if backwards
1118 + rsb OFF, S, #3
1119 + and OFF, OFF, #28
1120 + sub OFF, OFF, #32*(prefetch_distance+1)
1121 + .else
1122 + and OFF, S, #28
1123 + rsb OFF, OFF, #32*prefetch_distance
1124 + .endif
1125 + movs DAT0, S, lsl #31 /* CS means bit 1 of S is set, MI means bit 0 of S is set */
1126 + bhi 157f /* both bits set: source misaligned by 3 */
1127 + bcs 156f /* only bit 1 set: misaligned by 2 */
1128 + bmi 155f /* only bit 0 set: misaligned by 1 */
1129 + memcpy_long_inner_loop backwards, 0
1130 +155: memcpy_long_inner_loop backwards, 1
1131 +156: memcpy_long_inner_loop backwards, 2
1132 +157: memcpy_long_inner_loop backwards, 3
1133 +
1134 + .cfi_def_cfa_offset 16
1135 + .cfi_rel_offset D, 0
1136 + .cfi_rel_offset DAT1, 4
1137 + .cfi_rel_offset DAT2, 8
1138 + .cfi_same_value DAT3
1139 + .cfi_same_value DAT4
1140 + .cfi_same_value DAT5
1141 + .cfi_same_value DAT6
1142 + .cfi_same_value DAT7
1143 + .cfi_rel_offset lr, 12
1144 +
1145 +160: /* Medium case */
1146 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1147 + sub N, N, #16 /* simplifies inner loop termination */
1148 + .if backwards
1149 + ands DAT2, D, #15
1150 + beq 164f
1151 + .else
1152 + ands DAT2, D, #15
1153 + beq 164f
1154 + rsb DAT2, DAT2, #16
1155 + .endif
1156 + memcpy_leading_15bytes backwards, align
1157 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1158 + tst S, #3
1159 + bne 140f /* source not word-aligned: use the align=1 variant */
1160 + memcpy_medium_inner_loop backwards, 0
1161 +140: memcpy_medium_inner_loop backwards, 1
1162 +
1163 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1164 + teq N, #0
1165 + beq 199f /* nothing to copy: go straight to the exit pop */
1166 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1167 + tst D, #3
1168 + beq 174f
1169 +172: subs N, N, #1 /* byte loop: runs until D is word-aligned or the count is used up */
1170 + blo 199f
1171 + .if backwards
1172 + ldrb DAT0, [S, #-1]!
1173 + strb DAT0, [D, #-1]!
1174 + .else
1175 + ldrb DAT0, [S], #1
1176 + strb DAT0, [D], #1
1177 + .endif
1178 + tst D, #3
1179 + bne 172b
1180 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1181 + tst S, #3
1182 + bne 140f /* source not word-aligned: use the align=1 variant */
1183 + memcpy_short_inner_loop backwards, 0
1184 +140: memcpy_short_inner_loop backwards, 1
1185 +
1186 + .cfi_endproc
1187 +
1188 + .unreq D
1189 + .unreq S
1190 + .unreq N
1191 + .unreq DAT0
1192 + .unreq DAT1
1193 + .unreq DAT2
1194 + .unreq DAT3
1195 + .unreq DAT4
1196 + .unreq DAT5
1197 + .unreq DAT6
1198 + .unreq DAT7
1199 + .unreq LAST
1200 + .unreq OFF
1201 +.endm
1202 --- /dev/null
1203 +++ b/arch/arm/lib/memmove_rpi.S
1204 @@ -0,0 +1,61 @@
1205 +/*
1206 +Copyright (c) 2013, Raspberry Pi Foundation
1207 +Copyright (c) 2013, RISC OS Open Ltd
1208 +All rights reserved.
1209 +
1210 +Redistribution and use in source and binary forms, with or without
1211 +modification, are permitted provided that the following conditions are met:
1212 + * Redistributions of source code must retain the above copyright
1213 + notice, this list of conditions and the following disclaimer.
1214 + * Redistributions in binary form must reproduce the above copyright
1215 + notice, this list of conditions and the following disclaimer in the
1216 + documentation and/or other materials provided with the distribution.
1217 + * Neither the name of the copyright holder nor the
1218 + names of its contributors may be used to endorse or promote products
1219 + derived from this software without specific prior written permission.
1220 +
1221 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1222 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1223 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1224 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1225 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1226 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1227 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1228 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1229 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1230 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1231 +*/
1232 +
1233 +#include <linux/linkage.h>
1234 +#include "arm-mem.h"
1235 +#include "memcpymove.h"
1236 +
1237 +/* Prevent the stack from becoming executable */
1238 +#if defined(__linux__) && defined(__ELF__)
1239 +.section .note.GNU-stack,"",%progbits
1240 +#endif
1241 +
1242 + .text
1243 + .arch armv6
1244 + .object_arch armv4
1245 + .arm
1246 + .altmacro
1247 + .p2align 2
1248 +
1249 +/*
1250 + * void *memmove(void *s1, const void *s2, size_t n);
1251 + * On entry:
1252 + * a1 = pointer to destination
1253 + * a2 = pointer to source
1254 + * a3 = number of bytes to copy
1255 + * On exit:
1256 + * a1 preserved
1257 + */
1258 +
1259 +.set prefetch_distance, 3
1260 +
1261 +ENTRY(memmove)
1262 + cmp a2, a1 /* flags from source minus destination */
1263 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
1264 + memcpy 1 /* otherwise expand the descending (backwards) copy inline */
1265 +ENDPROC(memmove)
1266 --- /dev/null
1267 +++ b/arch/arm/lib/memset_rpi.S
1268 @@ -0,0 +1,128 @@
1269 +/*
1270 +Copyright (c) 2013, Raspberry Pi Foundation
1271 +Copyright (c) 2013, RISC OS Open Ltd
1272 +All rights reserved.
1273 +
1274 +Redistribution and use in source and binary forms, with or without
1275 +modification, are permitted provided that the following conditions are met:
1276 + * Redistributions of source code must retain the above copyright
1277 + notice, this list of conditions and the following disclaimer.
1278 + * Redistributions in binary form must reproduce the above copyright
1279 + notice, this list of conditions and the following disclaimer in the
1280 + documentation and/or other materials provided with the distribution.
1281 + * Neither the name of the copyright holder nor the
1282 + names of its contributors may be used to endorse or promote products
1283 + derived from this software without specific prior written permission.
1284 +
1285 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1286 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1287 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1288 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1289 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1290 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1291 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1292 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1293 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1294 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1295 +*/
1296 +
1297 +#include <linux/linkage.h>
1298 +#include "arm-mem.h"
1299 +
1300 +/* Prevent the stack from becoming executable */
1301 +#if defined(__linux__) && defined(__ELF__)
1302 +.section .note.GNU-stack,"",%progbits
1303 +#endif
1304 +
1305 + .text
1306 + .arch armv6
1307 + .object_arch armv4
1308 + .arm
1309 + .altmacro
1310 + .p2align 2
1311 +
1312 +/*
1313 + * void *memset(void *s, int c, size_t n);
1314 + * On entry:
1315 + * a1 = pointer to buffer to fill
1316 + * a2 = byte pattern to fill with (caller-narrowed)
1317 + * a3 = number of bytes to fill
1318 + * On exit:
1319 + * a1 preserved
1320 + */
1321 +ENTRY(mmioset)
1322 +ENTRY(memset)
1323 +ENTRY(__memset32) /* NOTE(review): shares the byte-replicating prologue below - confirm that suits 32-bit patterns */
1324 +ENTRY(__memset64) /* NOTE(review): same entry address as memset - confirm the 64-bit contract is met */
1325 +
1326 + S .req a1 /* buffer pointer, advanced as stores are made */
1327 + DAT0 .req a2 /* fill pattern */
1328 + N .req a3 /* remaining byte count */
1329 + DAT1 .req a4
1330 + DAT2 .req ip
1331 + DAT3 .req lr
1332 +
1333 + orr DAT0, DAT0, DAT0, lsl #8 /* replicate the fill byte into the low halfword */
1334 + push {S, lr} /* original pointer is popped back on exit; lr is reused as DAT3 */
1335 + orr DAT0, DAT0, DAT0, lsl #16 /* ...and into all four bytes of the word */
1336 + mov DAT1, DAT0
1337 +
1338 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1339 + cmp N, #31
1340 + blo 170f
1341 +
1342 +161: sub N, N, #16 /* simplifies inner loop termination */
1343 + /* Leading words and bytes */
1344 + tst S, #15
1345 + beq 164f
1346 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1347 + movs DAT2, DAT3, lsl #31 /* MI means bit 0 of the leading count is set, CS means bit 1 is set */
1348 + submi N, N, #1
1349 + strmib DAT0, [S], #1
1350 + subcs N, N, #2
1351 + strcsh DAT0, [S], #2
1352 + movs DAT2, DAT3, lsl #29 /* MI means bit 2 of the leading count is set, CS means bit 3 is set */
1353 + submi N, N, #4
1354 + strmi DAT0, [S], #4
1355 + subcs N, N, #8
1356 + stmcsia S!, {DAT0, DAT1}
1357 +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
1358 + mov DAT2, DAT0
1359 + mov DAT3, DAT0
1360 + /* Now the inner loop of 16-byte stores */
1361 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1362 + subs N, N, #16
1363 + bhs 165b
1364 +166: /* Trailing words and bytes */
1365 + movs N, N, lsl #29 /* CS means bit 3 of the count is set (8 bytes), MI means bit 2 is set (4 bytes) */
1366 + stmcsia S!, {DAT0, DAT1}
1367 + strmi DAT0, [S], #4
1368 + movs N, N, lsl #2 /* CS means original bit 1 is set (2 bytes), MI means original bit 0 is set (1 byte) */
1369 + strcsh DAT0, [S], #2
1370 + strmib DAT0, [S]
1371 +199: pop {S, pc} /* reload the pointer saved on entry and return by loading pc */
1372 +
1373 +170: /* Short case */
1374 + mov DAT2, DAT0
1375 + mov DAT3, DAT0
1376 + tst S, #3
1377 + beq 174f
1378 +172: subs N, N, #1 /* byte loop: runs until S is word-aligned or the count is used up */
1379 + blo 199b
1380 + strb DAT0, [S], #1
1381 + tst S, #3
1382 + bne 172b
1383 +174: tst N, #16 /* one 16-byte store when bit 4 of the count is set */
1384 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
1385 + b 166b /* the shared tail handles the low four bits of the count */
1386 +
1387 + .unreq S
1388 + .unreq DAT0
1389 + .unreq N
1390 + .unreq DAT1
1391 + .unreq DAT2
1392 + .unreq DAT3
1393 +ENDPROC(__memset64)
1394 +ENDPROC(__memset32)
1395 +ENDPROC(memset)
1396 +ENDPROC(mmioset)
1397 --- a/arch/arm/lib/uaccess_with_memcpy.c
1398 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1399 @@ -19,6 +19,14 @@
1400 #include <asm/current.h>
1401 #include <asm/page.h>
1402
1403 +#ifndef COPY_FROM_USER_THRESHOLD
1404 +#define COPY_FROM_USER_THRESHOLD 64
1405 +#endif
1406 +
1407 +#ifndef COPY_TO_USER_THRESHOLD
1408 +#define COPY_TO_USER_THRESHOLD 64
1409 +#endif
1410 +
1411 static int
1412 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1413 {
1414 @@ -43,7 +51,7 @@ pin_page_for_write(const void __user *_a
1415 return 0;
1416
1417 pmd = pmd_offset(pud, addr);
1418 - if (unlikely(pmd_none(*pmd)))
1419 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1420 return 0;
1421
1422 /*
1423 @@ -86,7 +94,46 @@ pin_page_for_write(const void __user *_a
1424 return 1;
1425 }
1426
1427 -static unsigned long noinline
1428 +static int
1429 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1430 +{ /* returns 1 with *ptep mapped and *ptlp held (caller must pte_unmap_unlock), 0 if the page cannot be pinned */
1431 + unsigned long addr = (unsigned long)_addr;
1432 + pgd_t *pgd;
1433 + p4d_t *p4d;
1434 + pmd_t *pmd;
1435 + pte_t *pte;
1436 + pud_t *pud;
1437 + spinlock_t *ptl;
1438 +
1439 + pgd = pgd_offset(current->mm, addr); /* walk the page tables of current->mm one level at a time */
1440 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1441 + return 0;
1442 +
1443 + p4d = p4d_offset(pgd, addr);
1444 + if (unlikely(p4d_none(*p4d) || p4d_bad(*p4d)))
1445 + return 0;
1446 +
1447 + pud = pud_offset(p4d, addr);
1448 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1449 + return 0;
1450 +
1451 + pmd = pmd_offset(pud, addr);
1452 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1453 + return 0;
1454 +
1455 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
1456 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) { /* not present or not marked young: refuse, the caller retries after __get_user() */
1457 + pte_unmap_unlock(pte, ptl);
1458 + return 0;
1459 + }
1460 +
1461 + *ptep = pte; /* success: hand the mapped PTE and its still-held lock to the caller */
1462 + *ptlp = ptl;
1463 +
1464 + return 1;
1465 +}
1466 +
1467 +unsigned long noinline
1468 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1469 {
1470 unsigned long ua_flags;
1471 @@ -139,6 +186,57 @@ out:
1472 return n;
1473 }
1474
1475 +unsigned long noinline
1476 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
1477 +{ /* copies one pinned source page at a time with memcpy(); returns the number of bytes left uncopied */
1478 + unsigned long ua_flags;
1479 + int atomic;
1480 +
1481 + if (unlikely(uaccess_kernel())) { /* uaccess_kernel() case: plain memcpy, always reports full success */
1482 + memcpy(to, (const void *)from, n);
1483 + return 0;
1484 + }
1485 +
1486 + /* the mmap semaphore is taken only if not in an atomic context */
1487 + atomic = in_atomic();
1488 +
1489 + if (!atomic)
1490 + mmap_read_lock(current->mm);
1491 + while (n) {
1492 + pte_t *pte;
1493 + spinlock_t *ptl;
1494 + int tocopy;
1495 +
1496 + while (!pin_page_for_read(from, &pte, &ptl)) { /* page not pinnable yet: drop the lock, touch one byte, retry */
1497 + char temp;
1498 + if (!atomic)
1499 + mmap_read_unlock(current->mm);
1500 + if (__get_user(temp, (char __user *)from))
1501 + goto out; /* the mmap lock is already released on this path */
1502 + if (!atomic)
1503 + mmap_read_lock(current->mm);
1504 + }
1505 +
1506 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1; /* bytes from 'from' up to the end of its page */
1507 + if (tocopy > n)
1508 + tocopy = n;
1509 +
1510 + ua_flags = uaccess_save_and_enable();
1511 + memcpy(to, (const void *)from, tocopy);
1512 + uaccess_restore(ua_flags);
1513 + to += tocopy;
1514 + from += tocopy;
1515 + n -= tocopy;
1516 +
1517 + pte_unmap_unlock(pte, ptl); /* releases the PTE lock taken by pin_page_for_read() */
1518 + }
1519 + if (!atomic)
1520 + mmap_read_unlock(current->mm);
1521 +
1522 +out:
1523 + return n; /* 0 after a complete copy, otherwise the remaining byte count */
1524 +}
1525 +
1526 unsigned long
1527 arm_copy_to_user(void __user *to, const void *from, unsigned long n)
1528 {
1529 @@ -149,7 +247,7 @@ arm_copy_to_user(void __user *to, const
1530 * With frame pointer disabled, tail call optimization kicks in
1531 * as well making this test almost invisible.
1532 */
1533 - if (n < 64) {
1534 + if (n < COPY_TO_USER_THRESHOLD) {
1535 unsigned long ua_flags = uaccess_save_and_enable();
1536 n = __copy_to_user_std(to, from, n);
1537 uaccess_restore(ua_flags);
1538 @@ -159,6 +257,32 @@ arm_copy_to_user(void __user *to, const
1539 }
1540 return n;
1541 }
1542 +
1543 +unsigned long __must_check
1544 +arm_copy_from_user(void *to, const void __user *from, unsigned long n)
1545 +{ /* returns n as reported by the selected copy routine (for __copy_from_user_memcpy: bytes left uncopied) */
1546 +#ifdef CONFIG_BCM2835_FAST_MEMCPY
1547 + /*
1548 + * This test is stubbed out of the main function above to keep
1549 + * the overhead for small copies low by avoiding a large
1550 + * register dump on the stack just to reload them right away.
1551 + * With frame pointer disabled, tail call optimization kicks in
1552 + * as well making this test almost invisible.
1553 + */
1554 + if (n < COPY_FROM_USER_THRESHOLD) { /* copy-from path: use its own threshold, not the copy-to one */
1555 + unsigned long ua_flags = uaccess_save_and_enable();
1556 + n = __copy_from_user_std(to, from, n);
1557 + uaccess_restore(ua_flags);
1558 + } else {
1559 + n = __copy_from_user_memcpy(to, from, n);
1560 + }
1561 +#else
1562 + unsigned long ua_flags = uaccess_save_and_enable(); /* fast path compiled out: always use the standard routine */
1563 + n = __copy_from_user_std(to, from, n);
1564 + uaccess_restore(ua_flags);
1565 +#endif
1566 + return n;
1567 +}
1568
1569 static unsigned long noinline
1570 __clear_user_memset(void __user *addr, unsigned long n)
1571 --- a/arch/arm/mach-bcm/Kconfig
1572 +++ b/arch/arm/mach-bcm/Kconfig
1573 @@ -184,6 +184,13 @@ config ARCH_BCM_53573
1574 The base chip is BCM53573 and there are some packaging modifications
1575 like BCM47189 and BCM47452.
1576
1577 +config BCM2835_FAST_MEMCPY
1578 + bool "Enable optimized __copy_to_user and __copy_from_user"
1579 + depends on ARCH_BCM2835 && ARCH_MULTI_V6
1580 + default y
1581 + help
1582 + Optimized versions of __copy_to_user and __copy_from_user for Pi1.
1583 +
1584 config ARCH_BCM_63XX
1585 bool "Broadcom BCM63xx DSL SoC"
1586 depends on ARCH_MULTI_V7