minix/lib/libc/arch/sparc64/string/memcpy.S
Ben Gras 2fe8fb192f Full switch to clang/ELF. Drop ack. Simplify.
There is important information about booting non-ack images in
docs/UPDATING. ack/aout-format images can't be built any more, and
booting clang/ELF-format ones is a little different. Updating to the
new boot monitor is recommended.

Changes in this commit:

	. drop boot monitor -> allowing dropping ack support
	. facility to copy ELF boot files to /boot so that old boot monitor
	  can still boot fairly easily, see UPDATING
	. no more ack-format libraries -> single-case libraries
	. some cleanup of OBJECT_FMT, COMPILER_TYPE, etc cases
	. drop several ack toolchain commands, but not all support
	  commands (e.g. aal is gone but acksize is not yet).
	. a few libc files moved to netbsd libc dir
	. new /bin/date as minix date used code in libc/
	. test compile fix
	. harmonize includes
	. /usr/lib is no longer special: without ack, /usr/lib plays no
	  kind of special bootstrapping role any more and bootstrapping
	  is done exclusively through packages, so releases depend even
	  less on the state of the machine making them now.
	. rename nbsd_lib* to lib*
	. reduce mtree
2012-02-14 14:52:02 +01:00

1959 lines
37 KiB
ArmAsm

/* $NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $ */
/*
* Copyright (c) 2001 Eduardo E. Horvath
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <machine/asm.h>
#ifndef _LOCORE
#define _LOCORE
#endif
#include <machine/ctlreg.h>
#include <machine/frame.h>
#include <machine/psl.h>
#if defined(LIBC_SCCS) && !defined(lint)
RCSID("$NetBSD: memcpy.S,v 1.2 2001/08/01 05:52:12 eeh Exp $")
#endif /* LIBC_SCCS and not lint */
#define EMPTY nop
#define NOTREACHED ta 1
#define BCOPY_SMALL 16
#define BLOCK_SIZE 64
#if 0
#define ASI_STORE ASI_BLK_COMMIT_P
#else
#define ASI_STORE ASI_BLK_P
#endif
#if 1
/*
* kernel bcopy/memcpy
* Assumes regions do not overlap; has no useful return value.
*
* Must not use %g7 (see copyin/copyout above).
*/
ENTRY(memcpy) /* dest, src, size */
/*
* Swap args for bcopy. Gcc generates calls to memcpy for
* structure assignments.
*/
mov %o0, %o3
mov %o1, %o0
mov %o3, %o1
#endif
ENTRY(bcopy) /* src, dest, size */
#ifdef DEBUG
set pmapdebug, %o4
ld [%o4], %o4
btst 0x80, %o4 ! PDB_COPY
bz,pt %icc, 3f
nop
save %sp, -CC64FSZ, %sp
mov %i0, %o1
set 2f, %o0
mov %i1, %o2
call printf
mov %i2, %o3
! ta 1; nop
restore
.data
2: .asciz "bcopy(%p->%p,%x)\n"
_ALIGN
.text
3:
#endif
/*
* Check for overlaps and punt.
*
* If src <= dest <= src+len we have a problem.
*/
sub %o1, %o0, %o3
cmp %o3, %o2
blu,pn %xcc, Lovbcopy
cmp %o2, BCOPY_SMALL
Lbcopy_start:
bge,pt %xcc, 2f ! if >= this many, go be fancy.
cmp %o2, 256
mov %o1, %o5 ! Save memcpy return value
/*
* Not much to copy, just do it a byte at a time.
*/
deccc %o2 ! while (--len >= 0)
bl 1f
EMPTY
0:
inc %o0
ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++;
stb %o4, [%o1]
deccc %o2
bge 0b
inc %o1
1:
retl
mov %o5, %o0
NOTREACHED
/*
* Overlapping bcopies -- punt.
*/
Lovbcopy:
/*
* Since src comes before dst, and the regions might overlap,
* we have to do the copy starting at the end and working backwards.
*
* We could optimize this, but it almost never happens.
*/
mov %o1, %o5 ! Retval
add %o2, %o0, %o0 ! src += len
add %o2, %o1, %o1 ! dst += len
deccc %o2
bl,pn %xcc, 1f
dec %o0
0:
dec %o1
ldsb [%o0], %o4
dec %o0
deccc %o2
bge,pt %xcc, 0b
stb %o4, [%o1]
1:
retl
mov %o5, %o0
/*
* Plenty of data to copy, so try to do it optimally.
*/
2:
#if 1
! If it is big enough, use VIS instructions
bge Lbcopy_block
nop
#endif
Lbcopy_fancy:
!!
!! First align the output to a 8-byte entity
!!
save %sp, -CC64FSZ, %sp
mov %i0, %o0
mov %i1, %o1
mov %i2, %o2
btst 1, %o1
bz,pt %icc, 4f
btst 2, %o1
ldub [%o0], %o4 ! Load 1st byte
deccc 1, %o2
ble,pn %xcc, Lbcopy_finish ! XXXX
inc 1, %o0
stb %o4, [%o1] ! Store 1st byte
inc 1, %o1 ! Update address
btst 2, %o1
4:
bz,pt %icc, 4f
btst 1, %o0
bz,a 1f
lduh [%o0], %o4 ! Load short
ldub [%o0], %o4 ! Load bytes
ldub [%o0+1], %o3
sllx %o4, 8, %o4
or %o3, %o4, %o4
1:
deccc 2, %o2
ble,pn %xcc, Lbcopy_finish ! XXXX
inc 2, %o0
sth %o4, [%o1] ! Store 1st short
inc 2, %o1
4:
btst 4, %o1
bz,pt %xcc, 4f
btst 3, %o0
bz,a,pt %xcc, 1f
lduw [%o0], %o4 ! Load word -1
btst 1, %o0
bz,a,pt %icc, 2f
lduh [%o0], %o4
ldub [%o0], %o4
lduh [%o0+1], %o3
sllx %o4, 16, %o4
or %o4, %o3, %o4
ldub [%o0+3], %o3
sllx %o4, 8, %o4
ba,pt %icc, 1f
or %o4, %o3, %o4
2:
lduh [%o0+2], %o3
sllx %o4, 16, %o4
or %o4, %o3, %o4
1:
deccc 4, %o2
ble,pn %xcc, Lbcopy_finish ! XXXX
inc 4, %o0
st %o4, [%o1] ! Store word
inc 4, %o1
4:
!!
!! We are now 32-bit aligned in the dest.
!!
Lbcopy__common:
and %o0, 7, %o4 ! Shift amount
andn %o0, 7, %o0 ! Source addr
brz,pt %o4, Lbcopy_noshift8 ! No shift version...
sllx %o4, 3, %o4 ! In bits
mov 8<<3, %o3
ldx [%o0], %l0 ! Load word -1
sub %o3, %o4, %o3 ! Reverse shift
deccc 16*8, %o2 ! Have enough room?
sllx %l0, %o4, %l0
bl,pn %xcc, 2f
and %o3, 0x38, %o3
Lbcopy_unrolled8:
/*
* This is about as close to optimal as you can get, since
* the shifts require EU0 and cannot be paired, and you have
* 3 dependent operations on the data.
*/
! ldx [%o0+0*8], %l0 ! Already done
! sllx %l0, %o4, %l0 ! Already done
ldx [%o0+1*8], %l1
ldx [%o0+2*8], %l2
ldx [%o0+3*8], %l3
ldx [%o0+4*8], %l4
ldx [%o0+5*8], %l5
ldx [%o0+6*8], %l6
#if 1
ba,pt %icc, 1f
ldx [%o0+7*8], %l7
.align 8
1:
srlx %l1, %o3, %g1
inc 8*8, %o0
sllx %l1, %o4, %l1
or %g1, %l0, %o5
ldx [%o0+0*8], %l0
stx %o5, [%o1+0*8]
srlx %l2, %o3, %g1
sllx %l2, %o4, %l2
or %g1, %l1, %o5
ldx [%o0+1*8], %l1
stx %o5, [%o1+1*8]
srlx %l3, %o3, %g1
sllx %l3, %o4, %l3
or %g1, %l2, %o5
ldx [%o0+2*8], %l2
stx %o5, [%o1+2*8]
srlx %l4, %o3, %g1
sllx %l4, %o4, %l4
or %g1, %l3, %o5
ldx [%o0+3*8], %l3
stx %o5, [%o1+3*8]
srlx %l5, %o3, %g1
sllx %l5, %o4, %l5
or %g1, %l4, %o5
ldx [%o0+4*8], %l4
stx %o5, [%o1+4*8]
srlx %l6, %o3, %g1
sllx %l6, %o4, %l6
or %g1, %l5, %o5
ldx [%o0+5*8], %l5
stx %o5, [%o1+5*8]
srlx %l7, %o3, %g1
sllx %l7, %o4, %l7
or %g1, %l6, %o5
ldx [%o0+6*8], %l6
stx %o5, [%o1+6*8]
srlx %l0, %o3, %g1
deccc 8*8, %o2 ! Have enough room?
sllx %l0, %o4, %l0 ! Next loop
or %g1, %l7, %o5
ldx [%o0+7*8], %l7
stx %o5, [%o1+7*8]
bge,pt %xcc, 1b
inc 8*8, %o1
Lbcopy_unrolled8_cleanup:
!!
!! Finished 8 byte block, unload the regs.
!!
srlx %l1, %o3, %g1
inc 7*8, %o0
sllx %l1, %o4, %l1
or %g1, %l0, %o5
stx %o5, [%o1+0*8]
srlx %l2, %o3, %g1
sllx %l2, %o4, %l2
or %g1, %l1, %o5
stx %o5, [%o1+1*8]
srlx %l3, %o3, %g1
sllx %l3, %o4, %l3
or %g1, %l2, %o5
stx %o5, [%o1+2*8]
srlx %l4, %o3, %g1
sllx %l4, %o4, %l4
or %g1, %l3, %o5
stx %o5, [%o1+3*8]
srlx %l5, %o3, %g1
sllx %l5, %o4, %l5
or %g1, %l4, %o5
stx %o5, [%o1+4*8]
srlx %l6, %o3, %g1
sllx %l6, %o4, %l6
or %g1, %l5, %o5
stx %o5, [%o1+5*8]
srlx %l7, %o3, %g1
sllx %l7, %o4, %l7
or %g1, %l6, %o5
stx %o5, [%o1+6*8]
inc 7*8, %o1
mov %l7, %l0 ! Save our unused data
dec 7*8, %o2
#else
/*
* This version also handles aligned copies at almost the
* same speed. It should take the same number of cycles
* as the previous version, but is slightly slower, probably
* due to i$ issues.
*/
ldx [%o0+7*8], %l7
ba,pt %icc, 1f
clr %g1
.align 32
1:
srlx %l1, %o3, %g1
bz,pn %xcc, 3f
inc 8*8, %o0
sllx %l1, %o4, %l1
or %g1, %l0, %o5
ba,pt %icc, 4f
ldx [%o0+0*8], %l0
nop
3:
mov %l0, %o5
ldx [%o0+0*8], %l0
4:
bz,pn %icc, 3f
stx %o5, [%o1+0*8]
srlx %l2, %o3, %g1
sllx %l2, %o4, %l2
3:
or %g1, %l1, %o5
ldx [%o0+1*8], %l1
bz,pn %icc, 3f
stx %o5, [%o1+1*8]
srlx %l3, %o3, %g1
sllx %l3, %o4, %l3
3:
or %g1, %l2, %o5
ldx [%o0+2*8], %l2
bz,pn %icc, 3f
stx %o5, [%o1+2*8]
srlx %l4, %o3, %g1
sllx %l4, %o4, %l4
3:
or %g1, %l3, %o5
ldx [%o0+3*8], %l3
bz,pn %icc, 3f
stx %o5, [%o1+3*8]
srlx %l5, %o3, %g1
sllx %l5, %o4, %l5
3:
or %g1, %l4, %o5
ldx [%o0+4*8], %l4
bz,pn %icc, 3f
stx %o5, [%o1+4*8]
srlx %l6, %o3, %g1
sllx %l6, %o4, %l6
3:
or %g1, %l5, %o5
ldx [%o0+5*8], %l5
bz,pn %icc, 3f
stx %o5, [%o1+5*8]
srlx %l7, %o3, %g1
sllx %l7, %o4, %l7
3:
or %g1, %l6, %o5
ldx [%o0+6*8], %l6
bz,pn %icc, 3f
stx %o5, [%o1+6*8]
srlx %l0, %o3, %g1
sllx %l0, %o4, %l0 ! Next loop
3:
or %g1, %l7, %o5
ldx [%o0+7*8], %l7
deccc 8*8, %o2 ! Have enough room?
stx %o5, [%o1+7*8]
inc 8*8, %o1
bge,pt %xcc, 1b
tst %o4
!!
!! Now unload all those regs
!!
Lbcopy_unrolled8_cleanup:
srlx %l1, %o3, %g1
bz,pn %xcc, 3f
inc 7*8, %o0 ! Point at the last load
sllx %l1, %o4, %l1
ba,pt %icc, 4f
or %g1, %l0, %o5
3:
mov %l0, %o5
4:
bz,pn %icc, 3f
stx %o5, [%o1+0*8]
srlx %l2, %o3, %g1
sllx %l2, %o4, %l2
3:
or %g1, %l1, %o5
bz,pn %icc, 3f
stx %o5, [%o1+1*8]
srlx %l3, %o3, %g1
sllx %l3, %o4, %l3
3:
or %g1, %l2, %o5
bz,pn %icc, 3f
stx %o5, [%o1+2*8]
srlx %l4, %o3, %g1
sllx %l4, %o4, %l4
3:
or %g1, %l3, %o5
bz,pn %icc, 3f
stx %o5, [%o1+3*8]
srlx %l5, %o3, %g1
sllx %l5, %o4, %l5
3:
or %g1, %l4, %o5
bz,pn %icc, 3f
stx %o5, [%o1+4*8]
srlx %l6, %o3, %g1
sllx %l6, %o4, %l6
3:
or %g1, %l5, %o5
bz,pn %icc, 3f
stx %o5, [%o1+5*8]
srlx %l7, %o3, %g1
sllx %l7, %o4, %l7
3:
or %g1, %l6, %o5
mov %l7, %l0 ! Shuffle to %l0
stx %o5, [%o1+6*8]
or %g1, %l7, %o5
dec 7*8, %o2
inc 7*8, %o1 ! Point at last store
#endif
2:
inccc 16*8, %o2
bz,pn %icc, Lbcopy_complete
!! Unrolled 8 times
Lbcopy_aligned8:
! ldx [%o0], %l0 ! Already done
! sllx %l0, %o4, %l0 ! Shift high word
deccc 8, %o2 ! Pre-decrement
bl,pn %xcc, Lbcopy_finish
1:
ldx [%o0+8], %l1 ! Load word 0
inc 8, %o0
srlx %l1, %o3, %o5
or %o5, %l0, %o5 ! Combine
stx %o5, [%o1] ! Store result
inc 8, %o1
deccc 8, %o2
bge,pn %xcc, 1b
sllx %l1, %o4, %l0
btst 7, %o2 ! Done?
bz,pt %xcc, Lbcopy_complete
!!
!! Loadup the last dregs into %l0 and shift it into place
!!
srlx %o3, 3, %o5 ! # bytes in %l0
dec 8, %o5 ! - 8
!! n-8 - (by - 8) -> n - by
subcc %o2, %o5, %g0 ! # bytes we need
ble,pt %icc, Lbcopy_finish
nop
ldx [%o0+8], %l1 ! Need another word
srlx %l1, %o3, %l1
ba,pt %icc, Lbcopy_finish
or %l0, %l1, %l0 ! All loaded up.
Lbcopy_noshift8:
deccc 8*8, %o2 ! Have enough room?
bl,pn %xcc, 2f
nop
ba,pt %icc, 1f
nop
.align 32
1:
ldx [%o0+0*8], %l0
ldx [%o0+1*8], %l1
ldx [%o0+2*8], %l2
ldx [%o0+3*8], %l3
stx %l0, [%o1+0*8]
stx %l1, [%o1+1*8]
stx %l2, [%o1+2*8]
stx %l3, [%o1+3*8]
ldx [%o0+4*8], %l4
ldx [%o0+5*8], %l5
ldx [%o0+6*8], %l6
ldx [%o0+7*8], %l7
inc 8*8, %o0
stx %l4, [%o1+4*8]
stx %l5, [%o1+5*8]
deccc 8*8, %o2
stx %l6, [%o1+6*8]
stx %l7, [%o1+7*8]
stx %l2, [%o1+2*8]
bge,pt %xcc, 1b
inc 8*8, %o1
2:
inc 8*8, %o2
1:
deccc 8, %o2
bl,pn %icc, 1f ! < 0 --> sub word
nop
ldx [%o0], %o5
inc 8, %o0
stx %o5, [%o1]
bg,pt %icc, 1b ! Exactly 0 --> done
inc 8, %o1
1:
btst 7, %o2 ! Done?
bz,pt %xcc, Lbcopy_complete
clr %o4
ldx [%o0], %l0
Lbcopy_finish:
brz,pn %o2, 2f ! 100% complete?
cmp %o2, 8 ! Exactly 8 bytes?
bz,a,pn %xcc, 2f
stx %l0, [%o1]
btst 4, %o2 ! Word store?
bz %xcc, 1f
srlx %l0, 32, %o5 ! Shift high word down
stw %o5, [%o1]
inc 4, %o1
mov %l0, %o5 ! Operate on the low bits
1:
btst 2, %o2
mov %o5, %l0
bz 1f
srlx %l0, 16, %o5
sth %o5, [%o1] ! Store short
inc 2, %o1
mov %l0, %o5 ! Operate on low bytes
1:
mov %o5, %l0
btst 1, %o2 ! Byte aligned?
bz 2f
srlx %l0, 8, %o5
stb %o5, [%o1] ! Store last byte
inc 1, %o1 ! Update address
2:
Lbcopy_complete:
#if 0
!!
!! verify copy success.
!!
mov %i0, %o2
mov %i1, %o4
mov %i2, %l4
0:
ldub [%o2], %o1
inc %o2
ldub [%o4], %o3
inc %o4
cmp %o3, %o1
bnz 1f
dec %l4
brnz %l4, 0b
nop
ba 2f
nop
1:
set 0f, %o0
call printf
sub %i2, %l4, %o5
set 1f, %o0
mov %i0, %o1
mov %i1, %o2
call printf
mov %i2, %o3
ta 1
.data
0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\n"
1: .asciz "bcopy(%p, %p, %lx)\n"
.align 8
.text
2:
#endif
ret
restore %i1, %g0, %o0
#if 1
/*
* Block copy. Useful for >256 byte copies.
*
* Benchmarking has shown this always seems to be slower than
* the integer version, so this is disabled. Maybe someone will
* figure out why sometime.
*/
Lbcopy_block:
#ifdef _KERNEL
/*
* Kernel:
*
* Here we use VIS instructions to do a block clear of a page.
* But before we can do that we need to save and enable the FPU.
* The last owner of the FPU registers is fpproc, and
* fpproc->p_md.md_fpstate is the current fpstate. If that's not
* null, call savefpstate() with it to store our current fp state.
*
* Next, allocate an aligned fpstate on the stack. We will properly
* nest calls on a particular stack so this should not be a problem.
*
* Now we grab either curproc (or if we're on the interrupt stack
* proc0). We stash its existing fpstate in a local register and
* put our new fpstate in curproc->p_md.md_fpstate. We point
* fpproc at curproc (or proc0) and enable the FPU.
*
* If we are ever preempted, our FPU state will be saved in our
* fpstate. Then, when we're resumed and we take an FPDISABLED
* trap, the trap handler will be able to fish our FPU state out
* of curproc (or proc0).
*
* On exiting this routine we undo the damage: restore the original
* pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable
* the MMU.
*
*
* Register usage, Kernel only (after save):
*
* %i0 src
* %i1 dest
* %i2 size
*
* %l0 XXXX DEBUG old fpstate
* %l1 fpproc (hi bits only)
* %l2 orig fpproc
* %l3 orig fpstate
* %l5 curproc
* %l6 old fpstate
*
* Register ussage, Kernel and user:
*
* %g1 src (retval for memcpy)
*
* %o0 src
* %o1 dest
* %o2 end dest
* %o5 last safe fetchable address
*/
#if 1
ENABLE_FPU(0)
#else
save %sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp ! Allocate an fpstate
sethi %hi(FPPROC), %l1
LDPTR [%l1 + %lo(FPPROC)], %l2 ! Load fpproc
add %sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
brz,pt %l2, 1f ! fpproc == NULL?
andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
LDPTR [%l2 + P_FPSTATE], %l3
brz,pn %l3, 1f ! Make sure we have an fpstate
mov %l3, %o0
call _C_LABEL(savefpstate) ! Save the old fpstate
set EINTSTACK-STKB, %l4 ! Are we on intr stack?
cmp %sp, %l4
bgu,pt %xcc, 1f
set INTSTACK-STKB, %l4
cmp %sp, %l4
blu %xcc, 1f
0:
sethi %hi(_C_LABEL(proc0)), %l4 ! Yes, use proc0
ba,pt %xcc, 2f ! XXXX needs to change to CPUs idle proc
or %l4, %lo(_C_LABEL(proc0)), %l5
1:
sethi %hi(CURPROC), %l4 ! Use curproc
LDPTR [%l4 + %lo(CURPROC)], %l5
brz,pn %l5, 0b ! If curproc is NULL need to use proc0
nop
2:
LDPTR [%l5 + P_FPSTATE], %l6 ! Save old fpstate
STPTR %l0, [%l5 + P_FPSTATE] ! Insert new fpstate
STPTR %l5, [%l1 + %lo(FPPROC)] ! Set new fpproc
wr %g0, FPRS_FEF, %fprs ! Enable FPU
#endif
mov %i0, %o0 ! Src addr.
mov %i1, %o1 ! Store our dest ptr here.
mov %i2, %o2 ! Len counter
#endif
!!
!! First align the output to a 64-bit entity
!!
mov %o1, %g1 ! memcpy retval
add %o0, %o2, %o5 ! End of source block
andn %o0, 7, %o3 ! Start of block
dec %o5
fzero %f0
andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr.
ldd [%o3], %f2 ! Load 1st word
dec 8, %o3 ! Move %o3 1 word back
btst 1, %o1
bz 4f
mov -7, %o4 ! Lowest src addr possible
alignaddr %o0, %o4, %o4 ! Base addr for load.
cmp %o3, %o4
be,pt %xcc, 1f ! Already loaded?
mov %o4, %o3
fmovd %f2, %f0 ! No. Shift
ldd [%o3+8], %f2 ! And load
1:
faligndata %f0, %f2, %f4 ! Isolate 1st byte
stda %f4, [%o1] ASI_FL8_P ! Store 1st byte
inc 1, %o1 ! Update address
inc 1, %o0
dec 1, %o2
4:
btst 2, %o1
bz 4f
mov -6, %o4 ! Calculate src - 6
alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
cmp %o3, %o4 ! Addresses same?
be,pt %xcc, 1f
mov %o4, %o3
fmovd %f2, %f0 ! Shuffle data
ldd [%o3+8], %f2 ! Load word 0
1:
faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
stda %f4, [%o1] ASI_FL16_P ! Store 1st short
dec 2, %o2
inc 2, %o1
inc 2, %o0
4:
brz,pn %o2, Lbcopy_blockfinish ! XXXX
btst 4, %o1
bz 4f
mov -4, %o4
alignaddr %o0, %o4, %o4 ! calculate shift mask and dest.
cmp %o3, %o4 ! Addresses same?
beq,pt %xcc, 1f
mov %o4, %o3
fmovd %f2, %f0 ! Shuffle data
ldd [%o3+8], %f2 ! Load word 0
1:
faligndata %f0, %f2, %f4 ! Move 1st short low part of f8
st %f5, [%o1] ! Store word
dec 4, %o2
inc 4, %o1
inc 4, %o0
4:
brz,pn %o2, Lbcopy_blockfinish ! XXXX
!!
!! We are now 32-bit aligned in the dest.
!!
Lbcopy_block_common:
mov -0, %o4
alignaddr %o0, %o4, %o4 ! base - shift
cmp %o3, %o4 ! Addresses same?
beq,pt %xcc, 1f
mov %o4, %o3
fmovd %f2, %f0 ! Shuffle data
ldd [%o3+8], %f2 ! Load word 0
1:
add %o3, 8, %o0 ! now use %o0 for src
!!
!! Continue until our dest is block aligned
!!
Lbcopy_block_aligned8:
1:
brz %o2, Lbcopy_blockfinish
btst BLOCK_ALIGN, %o1 ! Block aligned?
bz 1f
faligndata %f0, %f2, %f4 ! Generate result
deccc 8, %o2
ble,pn %icc, Lbcopy_blockfinish ! Should never happen
fmovd %f4, %f48
std %f4, [%o1] ! Store result
inc 8, %o1
fmovd %f2, %f0
inc 8, %o0
ba,pt %xcc, 1b ! Not yet.
ldd [%o0], %f2 ! Load next part
Lbcopy_block_aligned64:
1:
/*
* 64-byte aligned -- ready for block operations.
*
* Here we have the destination block aligned, but the
* source pointer may not be. Sub-word alignment will
* be handled by faligndata instructions. But the source
* can still be potentially aligned to 8 different words
* in our 64-bit block, so we have 8 different copy routines.
*
* Once we figure out our source alignment, we branch
* to the appropriate copy routine, which sets up the
* alignment for faligndata and loads (sets) the values
* into the source registers and does the copy loop.
*
* When were down to less than 1 block to store, we
* exit the copy loop and execute cleanup code.
*
* Block loads and stores are not properly interlocked.
* Stores save one reg/cycle, so you can start overwriting
* registers the cycle after the store is issued.
*
* Block loads require a block load to a different register
* block or a membar #Sync before accessing the loaded
* data.
*
* Since the faligndata instructions may be offset as far
* as 7 registers into a block (if you are shifting source
* 7 -> dest 0), you need 3 source register blocks for full
* performance: one you are copying, one you are loading,
* and one for interlocking. Otherwise, we would need to
* sprinkle the code with membar #Sync and lose the advantage
* of running faligndata in parallel with block stores. This
* means we are fetching a full 128 bytes ahead of the stores.
* We need to make sure the prefetch does not inadvertently
* cross a page boundary and fault on data that we will never
* store.
*
*/
#if 1
and %o0, BLOCK_ALIGN, %o3
srax %o3, 3, %o3 ! Isolate the offset
brz %o3, L100 ! 0->0
btst 4, %o3
bnz %xcc, 4f
btst 2, %o3
bnz %xcc, 2f
btst 1, %o3
ba,pt %xcc, L101 ! 0->1
nop /* XXX spitfire bug */
2:
bz %xcc, L102 ! 0->2
nop
ba,pt %xcc, L103 ! 0->3
nop /* XXX spitfire bug */
4:
bnz %xcc, 2f
btst 1, %o3
bz %xcc, L104 ! 0->4
nop
ba,pt %xcc, L105 ! 0->5
nop /* XXX spitfire bug */
2:
bz %xcc, L106 ! 0->6
nop
ba,pt %xcc, L107 ! 0->7
nop /* XXX spitfire bug */
#else
!!
!! Isolate the word offset, which just happens to be
!! the slot in our jump table.
!!
!! This is 6 insns, most of which cannot be paired,
!! which is about the same as the above version.
!!
rd %pc, %o4
1:
and %o0, 0x31, %o3
add %o3, (Lbcopy_block_jmp - 1b), %o3
jmpl %o4 + %o3, %g0
nop
!!
!! Jump table
!!
Lbcopy_block_jmp:
ba,a,pt %xcc, L100
nop
ba,a,pt %xcc, L101
nop
ba,a,pt %xcc, L102
nop
ba,a,pt %xcc, L103
nop
ba,a,pt %xcc, L104
nop
ba,a,pt %xcc, L105
nop
ba,a,pt %xcc, L106
nop
ba,a,pt %xcc, L107
nop
#endif
!!
!! Source is block aligned.
!!
!! Just load a block and go.
!!
L100:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L100"
.align 8
2:
#endif
fmovd %f0 , %f62
ldda [%o0] ASI_BLK_P, %f0
inc BLOCK_SIZE, %o0
cmp %o0, %o5
bleu,a,pn %icc, 3f
ldda [%o0] ASI_BLK_P, %f16
ba,pt %icc, 3f
membar #Sync
.align 32 ! ICache align.
3:
faligndata %f62, %f0, %f32
inc BLOCK_SIZE, %o0
faligndata %f0, %f2, %f34
dec BLOCK_SIZE, %o2
faligndata %f2, %f4, %f36
cmp %o0, %o5
faligndata %f4, %f6, %f38
faligndata %f6, %f8, %f40
faligndata %f8, %f10, %f42
faligndata %f10, %f12, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f12, %f14, %f46
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
stda %f32, [%o1] ASI_STORE
faligndata %f14, %f16, %f32
inc BLOCK_SIZE, %o0
faligndata %f16, %f18, %f34
inc BLOCK_SIZE, %o1
faligndata %f18, %f20, %f36
dec BLOCK_SIZE, %o2
faligndata %f20, %f22, %f38
cmp %o0, %o5
faligndata %f22, %f24, %f40
faligndata %f24, %f26, %f42
faligndata %f26, %f28, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f28, %f30, %f46
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
stda %f32, [%o1] ASI_STORE
faligndata %f30, %f48, %f32
inc BLOCK_SIZE, %o0
faligndata %f48, %f50, %f34
inc BLOCK_SIZE, %o1
faligndata %f50, %f52, %f36
dec BLOCK_SIZE, %o2
faligndata %f52, %f54, %f38
cmp %o0, %o5
faligndata %f54, %f56, %f40
faligndata %f56, %f58, %f42
faligndata %f58, %f60, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f60, %f62, %f46
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top
membar #Sync
2:
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+8
!!
!! We need to load almost 1 complete block by hand.
!!
L101:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L101"
.align 8
2:
#endif
! fmovd %f0, %f0 ! Hoist fmovd
ldd [%o0], %f2
inc 8, %o0
ldd [%o0], %f4
inc 8, %o0
ldd [%o0], %f6
inc 8, %o0
ldd [%o0], %f8
inc 8, %o0
ldd [%o0], %f10
inc 8, %o0
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 3f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
3:
faligndata %f0, %f2, %f32
inc BLOCK_SIZE, %o0
faligndata %f2, %f4, %f34
cmp %o0, %o5
faligndata %f4, %f6, %f36
dec BLOCK_SIZE, %o2
faligndata %f6, %f8, %f38
faligndata %f8, %f10, %f40
faligndata %f10, %f12, %f42
faligndata %f12, %f14, %f44
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
brlez,pn %o2, Lbcopy_blockdone
faligndata %f14, %f16, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f16, %f18, %f32
inc BLOCK_SIZE, %o0
faligndata %f18, %f20, %f34
inc BLOCK_SIZE, %o1
faligndata %f20, %f22, %f36
cmp %o0, %o5
faligndata %f22, %f24, %f38
dec BLOCK_SIZE, %o2
faligndata %f24, %f26, %f40
faligndata %f26, %f28, %f42
faligndata %f28, %f30, %f44
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
brlez,pn %o2, Lbcopy_blockdone
faligndata %f30, %f48, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f48, %f50, %f32
inc BLOCK_SIZE, %o0
faligndata %f50, %f52, %f34
inc BLOCK_SIZE, %o1
faligndata %f52, %f54, %f36
cmp %o0, %o5
faligndata %f54, %f56, %f38
dec BLOCK_SIZE, %o2
faligndata %f56, %f58, %f40
faligndata %f58, %f60, %f42
faligndata %f60, %f62, %f44
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
brlez,pn %o2, Lbcopy_blockdone
faligndata %f62, %f0, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+16
!!
!! We need to load 6 doubles by hand.
!!
L102:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L102"
.align 8
2:
#endif
ldd [%o0], %f4
inc 8, %o0
fmovd %f0, %f2 ! Hoist fmovd
ldd [%o0], %f6
inc 8, %o0
ldd [%o0], %f8
inc 8, %o0
ldd [%o0], %f10
inc 8, %o0
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 3f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
3:
faligndata %f2, %f4, %f32
inc BLOCK_SIZE, %o0
faligndata %f4, %f6, %f34
cmp %o0, %o5
faligndata %f6, %f8, %f36
dec BLOCK_SIZE, %o2
faligndata %f8, %f10, %f38
faligndata %f10, %f12, %f40
faligndata %f12, %f14, %f42
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f16, %f18, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f18, %f20, %f32
inc BLOCK_SIZE, %o0
faligndata %f20, %f22, %f34
inc BLOCK_SIZE, %o1
faligndata %f22, %f24, %f36
cmp %o0, %o5
faligndata %f24, %f26, %f38
dec BLOCK_SIZE, %o2
faligndata %f26, %f28, %f40
faligndata %f28, %f30, %f42
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f48, %f50, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f50, %f52, %f32
inc BLOCK_SIZE, %o0
faligndata %f52, %f54, %f34
inc BLOCK_SIZE, %o1
faligndata %f54, %f56, %f36
cmp %o0, %o5
faligndata %f56, %f58, %f38
dec BLOCK_SIZE, %o2
faligndata %f58, %f60, %f40
faligndata %f60, %f62, %f42
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f0, %f2, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+24
!!
!! We need to load 5 doubles by hand.
!!
L103:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L103"
.align 8
2:
#endif
fmovd %f0, %f4
ldd [%o0], %f6
inc 8, %o0
ldd [%o0], %f8
inc 8, %o0
ldd [%o0], %f10
inc 8, %o0
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
inc BLOCK_SIZE, %o0
3:
faligndata %f4, %f6, %f32
cmp %o0, %o5
faligndata %f6, %f8, %f34
dec BLOCK_SIZE, %o2
faligndata %f8, %f10, %f36
faligndata %f10, %f12, %f38
faligndata %f12, %f14, %f40
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f42
inc BLOCK_SIZE, %o0
faligndata %f16, %f18, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f18, %f20, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f20, %f22, %f32
cmp %o0, %o5
faligndata %f22, %f24, %f34
dec BLOCK_SIZE, %o2
faligndata %f24, %f26, %f36
inc BLOCK_SIZE, %o1
faligndata %f26, %f28, %f38
faligndata %f28, %f30, %f40
ble,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f42
inc BLOCK_SIZE, %o0
faligndata %f48, %f50, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f50, %f52, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f52, %f54, %f32
cmp %o0, %o5
faligndata %f54, %f56, %f34
dec BLOCK_SIZE, %o2
faligndata %f56, %f58, %f36
faligndata %f58, %f60, %f38
inc BLOCK_SIZE, %o1
faligndata %f60, %f62, %f40
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f42
inc BLOCK_SIZE, %o0
faligndata %f0, %f2, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f2, %f4, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+32
!!
!! We need to load 4 doubles by hand.
!!
L104:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L104"
.align 8
2:
#endif
fmovd %f0, %f6
ldd [%o0], %f8
inc 8, %o0
ldd [%o0], %f10
inc 8, %o0
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
inc BLOCK_SIZE, %o0
3:
faligndata %f6, %f8, %f32
cmp %o0, %o5
faligndata %f8, %f10, %f34
dec BLOCK_SIZE, %o2
faligndata %f10, %f12, %f36
faligndata %f12, %f14, %f38
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f40
faligndata %f16, %f18, %f42
inc BLOCK_SIZE, %o0
faligndata %f18, %f20, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f20, %f22, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f22, %f24, %f32
cmp %o0, %o5
faligndata %f24, %f26, %f34
faligndata %f26, %f28, %f36
inc BLOCK_SIZE, %o1
faligndata %f28, %f30, %f38
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f40
dec BLOCK_SIZE, %o2
faligndata %f48, %f50, %f42
inc BLOCK_SIZE, %o0
faligndata %f50, %f52, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f52, %f54, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f54, %f56, %f32
cmp %o0, %o5
faligndata %f56, %f58, %f34
faligndata %f58, %f60, %f36
inc BLOCK_SIZE, %o1
faligndata %f60, %f62, %f38
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f40
dec BLOCK_SIZE, %o2
faligndata %f0, %f2, %f42
inc BLOCK_SIZE, %o0
faligndata %f2, %f4, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f4, %f6, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+40
!!
!! We need to load 3 doubles by hand.
!!
L105:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L105"
.align 8
2:
#endif
fmovd %f0, %f8
ldd [%o0], %f10
inc 8, %o0
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
inc BLOCK_SIZE, %o0
3:
faligndata %f8, %f10, %f32
cmp %o0, %o5
faligndata %f10, %f12, %f34
faligndata %f12, %f14, %f36
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f38
dec BLOCK_SIZE, %o2
faligndata %f16, %f18, %f40
inc BLOCK_SIZE, %o0
faligndata %f18, %f20, %f42
faligndata %f20, %f22, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f22, %f24, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f24, %f26, %f32
cmp %o0, %o5
faligndata %f26, %f28, %f34
dec BLOCK_SIZE, %o2
faligndata %f28, %f30, %f36
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f38
inc BLOCK_SIZE, %o1
faligndata %f48, %f50, %f40
inc BLOCK_SIZE, %o0
faligndata %f50, %f52, %f42
faligndata %f52, %f54, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f54, %f56, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f56, %f58, %f32
cmp %o0, %o5
faligndata %f58, %f60, %f34
dec BLOCK_SIZE, %o2
faligndata %f60, %f62, %f36
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f38
inc BLOCK_SIZE, %o1
faligndata %f0, %f2, %f40
inc BLOCK_SIZE, %o0
faligndata %f2, %f4, %f42
faligndata %f4, %f6, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f6, %f8, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+48
!!
!! We need to load 2 doubles by hand.
!!
L106:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L106"
.align 8
2:
#endif
fmovd %f0, %f10
ldd [%o0], %f12
inc 8, %o0
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
inc BLOCK_SIZE, %o0
3:
faligndata %f10, %f12, %f32
cmp %o0, %o5
faligndata %f12, %f14, %f34
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f36
dec BLOCK_SIZE, %o2
faligndata %f16, %f18, %f38
inc BLOCK_SIZE, %o0
faligndata %f18, %f20, %f40
faligndata %f20, %f22, %f42
faligndata %f22, %f24, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f24, %f26, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f26, %f28, %f32
cmp %o0, %o5
faligndata %f28, %f30, %f34
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f36
dec BLOCK_SIZE, %o2
faligndata %f48, %f50, %f38
inc BLOCK_SIZE, %o1
faligndata %f50, %f52, %f40
faligndata %f52, %f54, %f42
inc BLOCK_SIZE, %o0
faligndata %f54, %f56, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f56, %f58, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f58, %f60, %f32
cmp %o0, %o5
faligndata %f60, %f62, %f34
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f36
dec BLOCK_SIZE, %o2
faligndata %f0, %f2, %f38
inc BLOCK_SIZE, %o1
faligndata %f2, %f4, %f40
faligndata %f4, %f6, %f42
inc BLOCK_SIZE, %o0
faligndata %f6, %f8, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f8, %f10, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
!!
!! Source at BLOCK_ALIGN+56
!!
!! We need to load 1 double by hand.
!!
L107:
#ifdef RETURN_NAME
sethi %hi(1f), %g1
ba,pt %icc, 2f
or %g1, %lo(1f), %g1
1:
.asciz "L107"
.align 8
2:
#endif
fmovd %f0, %f12
ldd [%o0], %f14
inc 8, %o0
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
inc BLOCK_SIZE, %o0
3:
faligndata %f12, %f14, %f32
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f48
membar #Sync
2:
faligndata %f14, %f16, %f34
dec BLOCK_SIZE, %o2
faligndata %f16, %f18, %f36
inc BLOCK_SIZE, %o0
faligndata %f18, %f20, %f38
faligndata %f20, %f22, %f40
faligndata %f22, %f24, %f42
faligndata %f24, %f26, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f26, %f28, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f28, %f30, %f32
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f0
membar #Sync
2:
faligndata %f30, %f48, %f34
dec BLOCK_SIZE, %o2
faligndata %f48, %f50, %f36
inc BLOCK_SIZE, %o1
faligndata %f50, %f52, %f38
faligndata %f52, %f54, %f40
inc BLOCK_SIZE, %o0
faligndata %f54, %f56, %f42
faligndata %f56, %f58, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f58, %f60, %f46
stda %f32, [%o1] ASI_STORE
faligndata %f60, %f62, %f32
cmp %o0, %o5
bleu,a,pn %icc, 2f
ldda [%o0] ASI_BLK_P, %f16
membar #Sync
2:
faligndata %f62, %f0, %f34
dec BLOCK_SIZE, %o2
faligndata %f0, %f2, %f36
inc BLOCK_SIZE, %o1
faligndata %f2, %f4, %f38
faligndata %f4, %f6, %f40
inc BLOCK_SIZE, %o0
faligndata %f6, %f8, %f42
faligndata %f8, %f10, %f44
brlez,pn %o2, Lbcopy_blockdone
faligndata %f10, %f12, %f46
stda %f32, [%o1] ASI_STORE
ba 3b
inc BLOCK_SIZE, %o1
Lbcopy_blockdone:
inc BLOCK_SIZE, %o2 ! Fixup our overcommit
membar #Sync ! Finish any pending loads
#define FINISH_REG(f) \
deccc 8, %o2; \
bl,a Lbcopy_blockfinish; \
fmovd f, %f48; \
std f, [%o1]; \
inc 8, %o1
FINISH_REG(%f32)
FINISH_REG(%f34)
FINISH_REG(%f36)
FINISH_REG(%f38)
FINISH_REG(%f40)
FINISH_REG(%f42)
FINISH_REG(%f44)
FINISH_REG(%f46)
FINISH_REG(%f48)
#undef FINISH_REG
!!
!! The low 3 bits have the sub-word bits needed to be
!! stored [because (x-8)&0x7 == x].
!!
Lbcopy_blockfinish:
brz,pn %o2, 2f ! 100% complete?
fmovd %f48, %f4
cmp %o2, 8 ! Exactly 8 bytes?
bz,a,pn %xcc, 2f
std %f4, [%o1]
btst 4, %o2 ! Word store?
bz %xcc, 1f
nop
st %f4, [%o1]
inc 4, %o1
1:
btst 2, %o2
fzero %f0
bz 1f
mov -6, %o4
alignaddr %o1, %o4, %g0
faligndata %f0, %f4, %f8
stda %f8, [%o1] ASI_FL16_P ! Store short
inc 2, %o1
1:
btst 1, %o2 ! Byte aligned?
bz 2f
mov -7, %o0 ! Calculate dest - 7
alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest.
faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8
stda %f8, [%o1] ASI_FL8_P ! Store 1st byte
inc 1, %o1 ! Update address
2:
membar #Sync
#if 0
!!
!! verify copy success.
!!
mov %i0, %o2
mov %i1, %o4
mov %i2, %l4
0:
ldub [%o2], %o1
inc %o2
ldub [%o4], %o3
inc %o4
cmp %o3, %o1
bnz 1f
dec %l4
brnz %l4, 0b
nop
ba 2f
nop
1:
set block_disable, %o0
stx %o0, [%o0]
set 0f, %o0
call prom_printf
sub %i2, %l4, %o5
set 1f, %o0
mov %i0, %o1
mov %i1, %o2
call prom_printf
mov %i2, %o3
ta 1
.data
_ALIGN
block_disable: .xword 0
0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\r\n"
1: .asciz "bcopy(%p, %p, %lx)\r\n"
_ALIGN
.text
2:
#endif
#ifdef _KERNEL
set 1f, %o0
mov %i0, %o1
mov %i1, %o2
call printf
mov %i2, %o3
.data
_ALIGN
1: .asciz "block exit (%p, %p, %d)\n"
_ALIGN
.text
/*
* Weve saved our possible fpstate, now disable the fpu
* and continue with life.
*/
#if 1
RESTORE_FPU
#else
#ifdef DEBUG
LDPTR [%l1 + %lo(FPPROC)], %l7
cmp %l7, %l5
! tnz 1 ! fpproc has changed!
LDPTR [%l5 + P_FPSTATE], %l7
cmp %l7, %l0
tnz 1 ! fpstate has changed!
#endif
andcc %l2, %l3, %g0 ! If (fpproc && fpstate)
STPTR %l2, [%l1 + %lo(FPPROC)] ! Restore old fproc
bz,pt %xcc, 1f ! Skip if no fpstate
STPTR %l6, [%l5 + P_FPSTATE] ! Restore old fpstate
call _C_LABEL(loadfpstate) ! Re-load orig fpstate
mov %l3, %o0
1:
#endif
set 1f, %o0
mov %i0, %o1
mov %i1, %o2
call printf
mov %i2, %o3
.data
_ALIGN
1: .asciz "block done (%p, %p, %d)\n"
_ALIGN
.text
ret
restore %g1, 0, %o0 ! Return DEST for memcpy
#endif
retl
mov %g1, %o0
#endif