From: Thomas Gleixner Date: Thu, 11 Oct 2007 09:15:35 +0000 (+0200) Subject: x86_64: prepare shared lib/csum-copy.S X-Git-Url: http://git.cdn.openwrt.org/?a=commitdiff_plain;h=c9954d39e7d8b2f12517aa02d4d7b96eb4761a1d;p=openwrt%2Fstaging%2Fblogic.git x86_64: prepare shared lib/csum-copy.S Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index 30eede7e18fd..4ecc6181db4d 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -7,7 +7,7 @@ CFLAGS_csum-partial.o := -funroll-loops obj-y := io.o iomap_copy_64.o obj-$(CONFIG_SMP) += msr-on-cpu.o -lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ +lib-y := csum-partial.o csum-copy_64.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk_64.o clear_page.o copy_page_64.o bitstr_64.o bitops.o lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S deleted file mode 100644 index f0dba36578ea..000000000000 --- a/arch/x86_64/lib/csum-copy.S +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. No warranty for anything given at all. - */ -#include -#include -#include - -/* - * Checksum copy with exception handling. - * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the - * destination is zeroed. - * - * Input - * rdi source - * rsi destination - * edx len (32bit) - * ecx sum (32bit) - * r8 src_err_ptr (int) - * r9 dst_err_ptr (int) - * - * Output - * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. - * They also should align source or destination to 8 bytes. - */ - - .macro source -10: - .section __ex_table,"a" - .align 8 - .quad 10b,.Lbad_source - .previous - .endm - - .macro dest -20: - .section __ex_table,"a" - .align 8 - .quad 20b,.Lbad_dest - .previous - .endm - - .macro ignore L=.Lignore -30: - .section __ex_table,"a" - .align 8 - .quad 30b,\L - .previous - .endm - - -ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore - -.Lignore: - subq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 - - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx - - xorl %r9d,%r9d - movq %rcx,%r12 - - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ - - clc - - /* main loop. clear in 64 byte blocks */ - /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ - /* r11: temp3, rdx: temp4, r12 loopcnt */ - /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ - .p2align 4 -.Lloop: - source - movq (%rdi),%rbx - source - movq 8(%rdi),%r8 - source - movq 16(%rdi),%r11 - source - movq 24(%rdi),%rdx - - source - movq 32(%rdi),%r10 - source - movq 40(%rdi),%rbp - source - movq 48(%rdi),%r14 - source - movq 56(%rdi),%r13 - - ignore 2f - prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax - - decl %r12d - - dest - movq %rbx,(%rsi) - dest - movq %r8,8(%rsi) - dest - movq %r11,16(%rsi) - dest - movq %rdx,24(%rsi) - - dest - movq %r10,32(%rsi) - dest - movq %rbp,40(%rsi) - dest - movq %r14,48(%rsi) - dest - movq %r13,56(%rsi) - -3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - - jnz .Lloop - - adcq %r9,%rax - - /* do last upto 56 bytes */ -.Lhandle_tail: - /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold - clc - .p2align 4 -.Lloop_8: - source - movq (%rdi),%rbx - adcq %rbx,%rax - decl %ecx - dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi - jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ - -.Lfold: - /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax - - /* do last upto 6 bytes */ -.Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx - jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc - .p2align 4 -.Lloop_1: - source - movw (%rdi),%bx - adcl %ebx,%eax - decl %ecx - dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi - jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - - /* handle last odd byte */ -.Lhandle_1: - testl $1,%r10d - jz .Lende - xorl %ebx,%ebx - source - movb (%rdi),%bl - dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - - CFI_REMEMBER_STATE -.Lende: - movq 2*8(%rsp),%rbx - CFI_RESTORE rbx - movq 3*8(%rsp),%r12 - CFI_RESTORE r12 - movq 4*8(%rsp),%r14 - CFI_RESTORE r14 - movq 5*8(%rsp),%r13 - CFI_RESTORE r13 - movq 6*8(%rsp),%rbp - CFI_RESTORE rbp - addq $7*8,%rsp - CFI_ADJUST_CFA_OFFSET -7*8 - ret - CFI_RESTORE_STATE - - /* Exception handlers. Very simple, zeroing is done in the wrappers */ -.Lbad_source: - movq (%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - -.Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) - jmp .Lende - CFI_ENDPROC -ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/csum-copy_64.S b/arch/x86_64/lib/csum-copy_64.S new file mode 100644 index 000000000000..f0dba36578ea --- /dev/null +++ b/arch/x86_64/lib/csum-copy_64.S @@ -0,0 +1,249 @@ +/* + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. No warranty for anything given at all. + */ +#include +#include +#include + +/* + * Checksum copy with exception handling. + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * destination is zeroed. + * + * Input + * rdi source + * rsi destination + * edx len (32bit) + * ecx sum (32bit) + * r8 src_err_ptr (int) + * r9 dst_err_ptr (int) + * + * Output + * eax 64bit sum. undefined in case of exception. + * + * Wrappers need to take care of valid exception sum and zeroing. + * They also should align source or destination to 8 bytes. + */ + + .macro source +10: + .section __ex_table,"a" + .align 8 + .quad 10b,.Lbad_source + .previous + .endm + + .macro dest +20: + .section __ex_table,"a" + .align 8 + .quad 20b,.Lbad_dest + .previous + .endm + + .macro ignore L=.Lignore +30: + .section __ex_table,"a" + .align 8 + .quad 30b,\L + .previous + .endm + + +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC + cmpl $3*64,%edx + jle .Lignore + +.Lignore: + subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 + movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 + movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 + movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 + movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 + + movq %r8,(%rsp) + movq %r9,1*8(%rsp) + + movl %ecx,%eax + movl %edx,%ecx + + xorl %r9d,%r9d + movq %rcx,%r12 + + shrq $6,%r12 + jz .Lhandle_tail /* < 64 */ + + clc + + /* main loop. clear in 64 byte blocks */ + /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ + /* r11: temp3, rdx: temp4, r12 loopcnt */ + /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + .p2align 4 +.Lloop: + source + movq (%rdi),%rbx + source + movq 8(%rdi),%r8 + source + movq 16(%rdi),%r11 + source + movq 24(%rdi),%rdx + + source + movq 32(%rdi),%r10 + source + movq 40(%rdi),%rbp + source + movq 48(%rdi),%r14 + source + movq 56(%rdi),%r13 + + ignore 2f + prefetcht0 5*64(%rdi) +2: + adcq %rbx,%rax + adcq %r8,%rax + adcq %r11,%rax + adcq %rdx,%rax + adcq %r10,%rax + adcq %rbp,%rax + adcq %r14,%rax + adcq %r13,%rax + + decl %r12d + + dest + movq %rbx,(%rsi) + dest + movq %r8,8(%rsi) + dest + movq %r11,16(%rsi) + dest + movq %rdx,24(%rsi) + + dest + movq %r10,32(%rsi) + dest + movq %rbp,40(%rsi) + dest + movq %r14,48(%rsi) + dest + movq %r13,56(%rsi) + +3: + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Lloop + + adcq %r9,%rax + + /* do last upto 56 bytes */ +.Lhandle_tail: + /* ecx: count */ + movl %ecx,%r10d + andl $63,%ecx + shrl $3,%ecx + jz .Lfold + clc + .p2align 4 +.Lloop_8: + source + movq (%rdi),%rbx + adcq %rbx,%rax + decl %ecx + dest + movq %rbx,(%rsi) + leaq 8(%rsi),%rsi /* preserve carry */ + leaq 8(%rdi),%rdi + jnz .Lloop_8 + adcq %r9,%rax /* add in carry */ + +.Lfold: + /* reduce checksum to 32bits */ + movl %eax,%ebx + shrq $32,%rax + addl %ebx,%eax + adcl %r9d,%eax + + /* do last upto 6 bytes */ +.Lhandle_7: + movl %r10d,%ecx + andl $7,%ecx + shrl $1,%ecx + jz .Lhandle_1 + movl $2,%edx + xorl %ebx,%ebx + clc + .p2align 4 +.Lloop_1: + source + movw (%rdi),%bx + adcl %ebx,%eax + decl %ecx + dest + movw %bx,(%rsi) + leaq 2(%rdi),%rdi + leaq 2(%rsi),%rsi + jnz .Lloop_1 + adcl %r9d,%eax /* add in carry */ + + /* handle last odd byte */ +.Lhandle_1: + testl $1,%r10d + jz .Lende + xorl %ebx,%ebx + source + movb (%rdi),%bl + dest + movb %bl,(%rsi) + addl %ebx,%eax + adcl %r9d,%eax /* carry */ + + CFI_REMEMBER_STATE +.Lende: + movq 2*8(%rsp),%rbx + CFI_RESTORE rbx + movq 3*8(%rsp),%r12 + CFI_RESTORE r12 + movq 4*8(%rsp),%r14 + CFI_RESTORE r14 + movq 5*8(%rsp),%r13 + CFI_RESTORE r13 + movq 6*8(%rsp),%rbp + CFI_RESTORE rbp + addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 + ret + CFI_RESTORE_STATE + + /* Exception handlers. Very simple, zeroing is done in the wrappers */ +.Lbad_source: + movq (%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + +.Lbad_dest: + movq 8(%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic)