/*
 * IP28 specific assembly routines; cpuid always 0, also make semaphore
 * macros a no-op.
 */
#ident	"$Revision: 1.58 $"

#include "ml/ml.h"
#include <sys/RACER/gda.h>
#include <sys/dump.h>

/* dummy routines whose return value is unimportant (or no return value).
   Some return reasonable values on other machines, but should never
   be called, or the return value should never be used on other machines.
 */
LEAF(dummy_func)
XLEAF(check_delay_tlbflush)
XLEAF(check_delay_iflush)
XLEAF(da_flush_tlb)
XLEAF(dma_mapinit)
XLEAF(apsfail)
XLEAF(disallowboot)
XLEAF(rmi_fixecc)
XLEAF(vme_init)
XLEAF(vme_ivec_init)
XLEAF(debug_stop_all_cpus)
XLEAF(bump_leds)
XLEAF(reset_enet_carrier)		/* for if_ec2.c */
	j	ra
	END(dummy_func)

LEAF(dcache_wb)
XLEAF(dcache_wbinval)
XLEAF(dki_dcache_wb)
XLEAF(dki_dcache_wbinval)
	LI	a2,CACH_DCACHE|CACH_INVAL|CACH_WBACK|CACH_IO_COHERENCY
	j	cache_operation
	END(dcache_wb)

LEAF(dki_dcache_inval)
	LI	a2,CACH_DCACHE|CACH_INVAL|CACH_IO_COHERENCY
	j	cache_operation
	END(dki_dcache_inval)

/* dummy routines that return 0 */
LEAF(dummyret0_func)

XLEAF(vme_adapter)
XLEAF(is_vme_space)
XLEAF(getcpuid)
XLEAF(disarm_threeway_trigger)
XLEAF(threeway_trigger_armed)
#ifdef DEBUG
XLEAF(getcyclecounter)
#endif	/* DEBUG */

/* Semaphore call stubs */
XLEAF(appsema)
XLEAF(apvsema)
XLEAF(apcvsema)
	move	v0,zero
	j	ra
	END(dummyret0_func)

/* dummy routines that return 1 */
LEAF(dummyret1_func)
XLEAF(apcpsema)			/* can always get on non-MP machines */
XLEAF(enet_carrier_on)		/* for if_ec2.c */
	li	v0, 1
	j	ra
	END(dummyret1_func)

/* unsigned int get_count(void)
 */
LEAF(get_count)
XLEAF(get_r4k_counter)		/* for compat; the R4800 is an r4k */
XLEAF(_get_timestamp)		/* return timestamp on SP */
	.set	noreorder
	MFC0(v0, C0_COUNT)
	.set	reorder
	j	ra
	END(get_count)
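
/* A minimal usage sketch (assumes a C prototype of unsigned int
 * get_count(void)): C0_COUNT increments at a fixed fraction of the CPU
 * clock, so an interval is measured by differencing two reads; unsigned
 * arithmetic keeps the delta correct even across counter wraparound.
 *
 *	unsigned int start, elapsed;
 *	start = get_count();
 *	do_something();
 *	elapsed = get_count() - start;	// ticks, even if COUNT wrapped
 */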

/* Clear the processor clock interrupt, unless a hardware performance
 * counter overflowed, in which case dispatch to hwperf_intr.
 */
LEAF(pcount_intr)
	.set	noreorder
	MFPC(t0,PRFCNT0)		# hardware performance counters
	bltz	t0,1f			# counter 0 overflow
	nop
	MFPC(t0,PRFCNT1)
	bltz	t0,1f			# counter 1 overflow
	nop				# not performance, fall thru & return
	j	ra
	mtc0	zero,C0_COMPARE		# BDSLOT: ack intr
1:					# performance
	j	hwperf_intr		# call hw performance interrupt
	nop				# BDSLOT
	.set	reorder
	END(pcount_intr)
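
/* The dispatch above as a hedged C sketch.  read_prfcnt() and
 * write_c0_compare() are hypothetical accessors standing in for the
 * MFPC/mtc0 instructions; overflow is signalled by the counter's sign
 * bit, which is what the bltz tests.
 *
 *	void pcount_intr(void)
 *	{
 *		if ((int)read_prfcnt(0) < 0 || (int)read_prfcnt(1) < 0) {
 *			hwperf_intr();		// counter overflow
 *			return;
 *		}
 *		write_c0_compare(0);		// ack the clock interrupt
 *	}
 */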

/*
 * writemcreg(reg, val)
 *
 * Basically this does *(volatile uint *)(PHYS_TO_COMPATK1(reg)) = val;
 *	a0 - physical register address
 *	a1 - value to write
 *
 * This was a workaround for a bug in the first rev MC chip, but IP28
 * has only rev D (or greater) MCs, so just do the actual operation.
 */
LEAF(writemcreg)
	or	a0,COMPAT_K1BASE	# a0 = PHYS_TO_COMPATK1(a0)
	sw	a1,0(a0)
	j	ra
	END(writemcreg)
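
/* The comment's C equivalent, spelled out (a sketch; PHYS_TO_COMPATK1 is
 * shown as the simple OR the assembly performs):
 *
 *	void writemcreg(unsigned long reg, unsigned int val)
 *	{
 *		*(volatile unsigned int *)(reg | COMPAT_K1BASE) = val;
 *	}
 */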

/*
 * Write the VDMA MEMADR, MODE, SIZE, STRIDE registers
 *
 *	write4vdma(buf, mode, size, stride);
 */
LEAF(write4vdma)
#if DMA_MEMADR & 0x8000
#error	DMA_MEMADR broken for IP28!
#endif
	LI	v0, (COMPAT_K1BASE | DMA_MEMADR) & (~0xffff)

	sw	a0,DMA_MEMADR & 0xffff(v0)
	sw	a1,DMA_MODE & 0xffff(v0)
	sw	a2,DMA_SIZE & 0xffff(v0)
	sw	a3,DMA_STRIDE & 0xffff(v0)

	j	ra
	END(write4vdma)
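
/* What the four stores amount to in C (a sketch; the base/offset split in
 * the assembly is just the LI-plus-16-bit-displacement idiom, and assumes
 * all four DMA_* registers share the same upper address bits):
 *
 *	void write4vdma(unsigned buf, unsigned mode,
 *			unsigned size, unsigned stride)
 *	{
 *		*(volatile unsigned *)(COMPAT_K1BASE | DMA_MEMADR) = buf;
 *		*(volatile unsigned *)(COMPAT_K1BASE | DMA_MODE)   = mode;
 *		*(volatile unsigned *)(COMPAT_K1BASE | DMA_SIZE)   = size;
 *		*(volatile unsigned *)(COMPAT_K1BASE | DMA_STRIDE) = stride;
 *	}
 */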

#define	MEMACC_XOR	(CPU_MEMACC_SLOW&0x3fff)
#define	CPU_MEMACC_OFFSET	CPU_MEMACC-CPUCTRL0
#define	MEMCFG1_OFFSET	MEMCFG1-CPUCTRL0
#define	LINESIZE	CACHE_SLINE_SIZE

/* Enable uncacheable writes via slow memory, returning the old state.
 *
 * Critical section kept within one cache line to prevent writebacks
 * during the mode switch.
 *
 * May be called from the ECC handler w/o an SP, so do not allow cache ops
 * here, as all stores are to constructed addresses.
 */
LEAF(ip28_enable_ucmem)
XLEAF(ip26_enable_ucmem)
	AUTO_CACHE_BARRIERS_DISABLE	# all stores have dependencies
	lw	a2,ip28_memacc_slow	# slow mode bits from memory
	LI	a0,K1BASE		# K1
	or	a4,a0,ECC_CTRL_REG	# ECC PAL ctrl reg.
	or	a0,a0,CPUCTRL0		# PHYS_TO_K1(CPUCTRL0)

	lw	t2,MEMCFG1_OFFSET(a0)	# set-up memory config
	and	t3,t2,0xffff0000	# save good side of register
	or	t3,ECC_MEMCFG		# add ECC register

	.set	noreorder
	mfc0	t0,C0_SR		# disable interrupts
	ori	t1,t0,SR_IE
	xori	t1,SR_IE

	.align	7
	mtc0	t1,C0_SR		# critical begin
	mfc0	zero,C0_SR		# barrier
	sw	t3,MEMCFG1_OFFSET(a0)	# map ecc part
	lw	t1,CPU_MEMACC_OFFSET(a0) # get MC memory config
	andi	v0,t1,0x3fff		# important bits
	xori	v0,MEMACC_XOR		# 0=slow, !0=normal

	sw	a2,CPU_MEMACC_OFFSET(a0) # go to slow mode on MC
	lw	zero,0(a0)		# flushbus
	sync
	lw	zero,0(a0)		# flushbus continued
	li	a2,ECC_CTRL_DISABLE	# disable ECC chk (uc writes ok)
	sd	a2,0(a4)		# Enter slow mode.
	lw	zero,0(a0)		# flushbus
	sync
	lw	zero,0(a0)		# flushbus continued

	sw	t2,MEMCFG1_OFFSET(a0)	# restore mapping
	lw	zero,0(a0)		# flushbus
	sync

	mtc0	t0,C0_SR		# restore C0_SR
	.set	reorder

	j	ra
	AUTO_CACHE_BARRIERS_ENABLE
	END(ip28_enable_ucmem)

/* Disable uncacheable writes via faster memory, returning the old state.
 *
 * Critical section kept within one cache line to prevent writebacks
 * during the mode switch.
 *
 * May be called from the ECC handler w/o an SP, so do not allow cache ops
 * here, as all stores are to constructed addresses.
 */
LEAF(ip28_disable_ucmem)
XLEAF(ip26_disable_ucmem)
	AUTO_CACHE_BARRIERS_DISABLE	# all stores have dependencies
	lw	a2,ip28_memacc_norm	# norm mode bits from memory
	LI	a0,K1BASE		# K1
	or	a4,a0,ECC_CTRL_REG	# ECC PAL ctrl reg.
	or	a0,a0,CPUCTRL0		# PHYS_TO_K1(CPUCTRL0)

	lw	t2,MEMCFG1_OFFSET(a0)	# set-up memory config
	and	t3,t2,0xffff0000	# save good side of register
	or	t3,ECC_MEMCFG		# add ECC register

	.set	noreorder
	mfc0	t0,C0_SR		# disable interrupts
	ori	t1,t0,SR_IE
	xori	t1,SR_IE

	.align	7
	mtc0	t1,C0_SR		# critical begin
	mfc0	zero,C0_SR		# barrier
	sw	t3,MEMCFG1_OFFSET(a0)	# map ecc part
	lw	t1,CPU_MEMACC_OFFSET(a0) # get MC memory config (flush)
	andi	v0,t1,0x3fff		# important bits
	xori	v0,MEMACC_XOR		# 0=slow, !0=normal

	sd	zero,0(a4)		# ECC_CTRL_ENABLE==0 (Fast)
	lw	zero,0(a0)		# flushbus
	sync
	lw	zero,0(a0)		# flushbus continued
	sw	a2,CPU_MEMACC_OFFSET(a0) # go to normal mode on MC
	lw	zero,0(a0)		# flushbus
	sync
	lw	zero,0(a0)		# flushbus continued

	sw	t2,MEMCFG1_OFFSET(a0)	# restore mapping
	lw	zero,0(a0)		# flushbus
	sync

	mtc0	t0,C0_SR		# restore C0_SR
	.set	reorder

	j	ra
	AUTO_CACHE_BARRIERS_ENABLE
	END(ip28_disable_ucmem)

LEAF(unmap_ecc)
	AUTO_CACHE_BARRIERS_DISABLE	# address dependency on k0
	CLI	t0,PHYS_TO_COMPATK1(MEMCFG1)
	lw	v0,0(t0)		# get current
	and	t3,v0,0xffff0000	# keep upper word (drop ECC)
	sw	t3,0(t0)		# write back
	lw	zero,0(t0)
	sync
	j	ra
	AUTO_CACHE_BARRIERS_ENABLE
	END(unmap_ecc)

/* Routine to map PAL, do write, then unmap PAL */
LEAF(ip28_write_pal)
	CLI	t0,PHYS_TO_COMPATK1(MEMCFG1)
	LI	t1,PHYS_TO_K1(ECC_CTRL_REG)

	.set	noreorder
	AUTO_CACHE_BARRIERS_DISABLE	# all stores are uncached
	mfc0	v1,C0_SR		# disable interrupts
	or	t2,v1,SR_IE
	xori	t2,SR_IE
	mtc0	t2,C0_SR

	/* Enable ECC bank */
	lw	v0,0(t0)		# MEMCFG1
	and	t3,v0,0xffff0000	# keep upper word
	or	t3,ECC_MEMCFG		# or in the ECC mapping
	sw	t3,0(t0)		# write new value back
	lw	zero,0(t0)		# flushbus
	sync
	lw	zero,0(t0)
	mfc0	zero,C0_SR		# barrier

	/* Do write and then flush */
	sd	a0,0(t1)		# write ECC PAL
	lw	zero,0(t0)		# flush
	sync
	mfc0	zero,C0_SR		# barrier

	/* Disable Bank and re-enable interrupts */
	sw	v0,0(t0)		# write MC
	lw	zero,0(t0)		# flush
	sync
	mfc0	zero,C0_SR		# barrier

	mtc0	v1,C0_SR		# restore C0_SR
	AUTO_CACHE_BARRIERS_ENABLE
	.set	reorder

	j	ra
	END(ip28_write_pal)
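
/* The map/write/unmap sequence above as a hedged C sketch.  The flushbus
 * reads and sync/barrier pairs are elided; mc_read/mc_write are
 * hypothetical names for the uncached MEMCFG1 accesses:
 *
 *	void ip28_write_pal(unsigned long long val)
 *	{
 *		unsigned cfg = mc_read(MEMCFG1);	// current mapping
 *		int s = splhi();			// block interrupts
 *		mc_write(MEMCFG1, (cfg & 0xffff0000) | ECC_MEMCFG);
 *		*(volatile unsigned long long *)
 *		    PHYS_TO_K1(ECC_CTRL_REG) = val;	// write ECC PAL
 *		mc_write(MEMCFG1, cfg);			// unmap again
 *		splx(s);
 *	}
 */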

/* Correct ECC errors in a line if possible (normal mode)
 */
LEAF(ip28_ecc_correct)
	dli	a0,ECC_DEFAULT

	/* Enable ECC bank */
	CLI	t0,PHYS_TO_COMPATK1(MEMCFG1)
	lw	v0,0(t0)		# MEMCFG1
	and	t3,v0,0xffff0000	# keep upper word
	or	t3,ECC_MEMCFG		# or in the ECC mapping
	sw	t3,0(t0)		# write new value back
	.set	noreorder
	lw	zero,0(t0)		# flushbus
	sync
	lw	zero,0(t0)
	mfc0	zero,C0_SR		# barrier
	.set	reorder

	/* Make sure ecc write address does not conflict with instructions */
	LI	a1,PHYS_TO_K0(ECC_CTRL_BASE)
	LA	a2,1f			# addr of instructions
	and	a2,0x3ff		# 8 line mask
	add	a2,4*CACHE_SLINE_SIZE	# split the difference
	PTR_ADDU a1,a2
	b	1f

	/* Cached write_chip0/1 and ctrl */
	LI	a1,PHYS_TO_K0(ECC_CTRL_BASE)
	.align	7
	.set	noreorder
1:	sd	a0,0(a1)		# Write 49C466 0
	sd	a0,8(a1)		# Write 49C466 1
	sd	a0,16(a1)		# Replicate the above
	sd	a0,24(a1)		# writes throughout the
	sd	a0,32(a1)		# cache line.  Each quad-
	sd	a0,40(a1)		# word actually writes
	sd	a0,48(a1)		# the ECC parts!
	sd	a0,56(a1)		#
	sd	a0,64(a1)		#
	sd	a0,72(a1)		#
	sd	a0,80(a1)		#
	sd	a0,88(a1)		#
	sd	a0,96(a1)		#
	sd	a0,104(a1)		#
	sd	a0,112(a1)		#
	sd	a0,120(a1)		#

	/* Write back and invalidate the line */
	cache	CACH_SD|C_HWBINV,0(t0)
	cache	CACH_BARRIER,0(t0)
	.set	reorder

	/* Disable Bank and re-enable interrupts */
	sw	v0,0(t0)		# write MC
	.set	noreorder
	lw	zero,0(t0)		# flush
	sync
	mfc0	zero,C0_SR		# barrier
	.set	reorder

	j	ra
	END(ip28_ecc_correct)

/* Return to previous memory system state.  1 == fast, 0 == slow.
 */
LEAF(ip28_return_ucmem)
XLEAF(ip26_return_ucmem)
	AUTO_CACHE_BARRIERS_DISABLE	# may be called from ECC hndlr
	beqz	a0,1f
	b	ip28_disable_ucmem	# going to normal mode
1:	b	ip28_enable_ucmem	# going to slow mode
	AUTO_CACHE_BARRIERS_ENABLE
	END(ip28_return_ucmem)
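
/* Save/restore idiom for the pair above (a sketch): ip28_enable_ucmem
 * returns the previous state (0 == slow, nonzero == normal), and
 * ip28_return_ucmem re-establishes it, so the caller restores whatever
 * mode it found:
 *
 *	int was_fast = ip28_enable_ucmem();	// slow, ECC checks off
 *	... uncached stores to memory ...
 *	ip28_return_ucmem(was_fast);		// back to previous mode
 */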

/* return the content of the R10000 C0 config register */
LEAF(get_r10k_config)
	.set	noreorder
	mfc0	v0,C0_CONFIG
	.set	reorder
	j	ra
	END(get_r10k_config)

/* Return size of secondary cache (really max cache size for start-up) */
LEAF(getcachesz)
	.set	noreorder
	mfc0	v1,C0_CONFIG
	and	v1,CONFIG_SS
	dsrl	v1,CONFIG_SS_SHFT
	dadd	v1,CONFIG_SCACHE_POW2_BASE
	li	v0,1
	j	ra
	dsll	v0,v1			# cache size in bytes
	.set	reorder
	END(getcachesz)
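
/* The computation above in C (a sketch; field names follow the macros the
 * assembly uses):
 *
 *	unsigned long getcachesz(void)
 *	{
 *		unsigned long ss = (get_r10k_config() & CONFIG_SS)
 *					>> CONFIG_SS_SHFT;
 *		return 1UL << (ss + CONFIG_SCACHE_POW2_BASE);
 *	}
 */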

/* Write back/invalidate one line from the cache.  This can be used by
 * drivers (enet uses it now) to have a lower overhead cacheflush when
 * getting around problems with the IP28 ECC baseboard.
 *
 * Accepts a full 64-bit phys, K0, or K1 address (enet tends to pass K1).
 */
LEAF(__dcache_line_wb_inval)
	.set	noreorder
	and	a0,TO_PHYS_MASK		# KDM_TO_K0(a)
	or	a0,K0BASE
	cache	CACH_SD|C_HWBINV,0(a0)	# write back line in a0
	cache	CACH_BARRIER,0(a0)	# make sure line is out
	.set	reorder
	j	ra
	END(__dcache_line_wb_inval)
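
/* A hedged sketch of the driver-side loop: flush a buffer one secondary
 * cache line at a time.  The prototype and names are assumptions; buf and
 * len are caller values, and CACHE_SLINE_SIZE is the line size used
 * throughout this file:
 *
 *	void flush_buf(__uint64_t buf, unsigned len)
 *	{
 *		__uint64_t p = buf & ~(__uint64_t)(CACHE_SLINE_SIZE - 1);
 *		while (p < buf + len) {
 *			__dcache_line_wb_inval(p);
 *			p += CACHE_SLINE_SIZE;
 *		}
 *	}
 */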

#define	NMI_ERREPC	0
#define	NMI_EPC		8
#define	NMI_SP		16
#define	NMI_RA		24
#define	NMI_SAVE_REGS	4

LEAF(nmi_dump)
	.set	noreorder
	li	k0,SR_KADDR|SR_DEFAULT	# make sure C0_SR is sane
	MTC0	(k0,C0_SR)

	# Save some registers in nmi_saveregs.
	LA	k0,nmi_saveregs
	DMFC0	(k1,C0_ERROR_EPC)
	CACHE_BARRIER_AT(NMI_ERREPC,k0)	# probably not needed
	sd	k1,NMI_ERREPC(k0)
	DMFC0	(k1,C0_EPC)
	sd	k1,NMI_EPC(k0)
	sd	sp,NMI_SP(k0)
	sd	ra,NMI_RA(k0)

	LA	sp,dumpstack		# move to dump stack
	LI	gp,DUMP_STACK_SIZE-16
	PTR_ADD	sp,gp			# Set our stack pointer.
	LA	gp,_gp			# reload gp
	jal	ip28_ecc_error		# error handler for NMI/IP28 ECC
	nop				# BDSLOT

	.set	reorder

	END(nmi_dump)

	.data

EXPORT(nmi_saveregs)
	.dword	0: NMI_SAVE_REGS

	.text

/* void delayloop(int count, int decr)
 *	- delay loop with a loop factoring; also sometimes helps with
 *	  messy compilers.
 *	- more scale is needed, as there are no uncached loads/cacheops
 *	  around this code like the _ticksper1024instr code.
 */
LEAF(delayloop)
	.set	noreorder
	sync				# force loads out of T5
#ifdef US_DELAY_DEBUG
	mfc0	t0,C0_COUNT
#endif
	sll	a0,1			# scale a bit more
1:
	nop;nop;nop;nop;nop;nop;nop
	bgt	a0,zero,1b
	subu	a0,a1			# BDSLOT
#ifdef US_DELAY_DEBUG
	mfc0	t1,C0_COUNT
	CACHE_BARRIER
	sw	t0,us_before
	sw	t1,us_after
#endif
	.set	reorder
	j	ra
	END(delayloop)

/* Time the Dallas clock for 8 hundredths of a second; callers scale the
 * result to 10ms.
 */
#define	MAXSPIN	0x1fffff		/* semi-arbitrary... */
LEAF(_ticksper80ms)
	LI	t0,RT_CLOCK_ADDR

	li	a6,MAXSPIN
1:	lw	a0,0(t0)		# wait for lower nibble of BCD == 0
	add	a6,-1
	blez	a6,9f			# loop limiter
	and	a0,0x0f
	bnez	a0,1b

	li	a1,1
	li	a6,MAXSPIN
1:	lb	a0,0(t0)		# wait for lower nibble of BCD == 1
	and	a0,0x0f			# in case already @ 0.
	add	a6,-1
	blez	a6,9f			# loop limiter
	bne	a0,a1,1b

	.set	noreorder
	mtc0	zero,C0_COUNT		# start @ 0.
	li	a1,9			# wait for 8/100ths of a second
	li	a6,MAXSPIN

1:	lw	a0,0(t0)		# get current ticker
	add	a6,-1
	blez	a6,9f			# loop limiter
	and	a0,0x0f			# BDSLOT: hundredths
	blt	a0,a1,1b		# spin for time
	nop				# BDSLOT

	mfc0	v0,C0_COUNT		# end time
	.set	reorder

	j	ra
9:
	li	v0,7800000		# gag, like at 195MHz

	j	ra
	END(_ticksper80ms)
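
/* Converting the measured interval (a sketch): the spin above runs from
 * the hundredths digit reading 1 until it reads 9, i.e. 8/100 sec, so a
 * caller derives the 10ms tick count, and from that the Count rate:
 *
 *	unsigned ticks80ms = _ticksper80ms();
 *	unsigned ticks10ms = ticks80ms / 8;	// ticks per 10ms
 *	unsigned counthz   = ticks10ms * 100;	// Count increments/sec
 */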

/* Code to allow crippled support of the IP26 baseboard to handle ECC
 * exceptions.  It requires the cache to work at CACHE_ERR_FRAME for a few
 * lines.  This will not be the case for IP28, which can do doubleword
 * stores in fast mode.
 *
 * Also must jump to the SEG0_BASE version of ecc_handler, as we cannot
 * jump from the alias to high memory w/o a jump register.
 */
LEAF(ecc_springboard)
	.set	noreorder
	.set	noat			# AT is not yet saved
	AUTO_CACHE_BARRIERS_DISABLE	# code runs uncached...
	/* Save 1st register in C0 by jumping through 3 hoops */
	MTC0(k1,C0_LLADDR)		# save the 32 LSBs of k1
	dsrl32	k1,0
	dsll	k1,3			# make sure the 3 LSBs are 0's
	MTC0(k1,C0_WATCHLO)		# save the middle 29 bits of k1
	dsrl32	k1,0
	MTC0(k1,C0_WATCHHI)		# save the 3 MSBs of k1

	/* Check board revision (IP26 and IP26+) to find how to do the eframe */
	CLI	k1,PHYS_TO_COMPATK1(HPC3_SYS_ID) # board revision info
	lw	k1,0(k1)
	andi	k1,BOARD_REV_MASK	# isolate board rev
	sub	k1,IP26_ECCSYSID	# IP26, IP26+
	bgtz	k1,1f			# skip if on IP28 bd
	nop				# BDSLOT
	/* Need to cache the eframe in a safe buffer range.  If we have
	 * more than one error, we won't know, but that's a panic anyway.
	 */
	mfc0	k1,C0_CACHE_ERR		# get cache err.
	and	k1,CACHE_TMP_EMASK	# mask our index
	sub	k1,CACHE_TMP_EFRAME1
	bnez	k1,77f			# if no match use frame1
	nop				# BDSLOT
	LI	k1,K0BASE|CACHE_TMP_EFRAME1
	b	2f			# skip IP28 case
	nop				# BDSLOT
77:					# else use frame2
	LI	k1,K0BASE|CACHE_TMP_EFRAME2
	b	2f			# skip IP28 case
	nop				# BDSLOT
1:	LA	k1,cacheErr_frames	# ptr to ECC frame
	dsll	k1,8			# convert to physical
	dsrl	k1,8			# so we avoid the cache
	PTR_L	k1,0(k1)		# get ECCF ptr
2:

	sreg	k0,EF_K0(k1)		# save K0/AT so we have
	sreg	AT,EF_AT(k1)		# some breathing room.

	/* reconstruct and save k1 in frame */
	MFC0(k0,C0_WATCHHI)
	dsll32	k0,29
	MFC0(AT,C0_WATCHLO)
	dsll32	AT,0			# remove sign extensions
	dsrl	AT,3
	or	k0,AT
	MFC0(AT,C0_LLADDR)
	dsll32	AT,0			# remove sign extensions
	dsrl32	AT,0
	or	k0,AT
	sreg	k0,EF_K1(k1)		# ah, we can save K1 value
	.set	at

	/* jump to direct mapped, high memory version of ecc_exception */
	LA	k0,ecc_exception	# base address
	and	k0,0x7fffffff		# KDM_TO_PHYS (enough of it)
	jr	k0			# call ecc_exception
	nop				# BDSLOT
	.set	reorder
	AUTO_CACHE_BARRIERS_ENABLE
	END(ecc_springboard)

LEAF(get_scache_tag)
	.set	noreorder
	cache	CACH_S|C_ILT,0(a0)
	mfc0	v1,C0_TAGHI
	mfc0	v0,C0_TAGLO
	.set	reorder
	dsll	v1,v1,32
	or	v0,v1,v0
	j	ra
	END(get_scache_tag)

LEAF(get_dcache_tag)
	.set	noreorder
	cache	CACH_PD|C_ILT,0(a0)
	mfc0	v1,C0_TAGHI
	mfc0	v0,C0_TAGLO
	.set	reorder
	dsll	v1,v1,32
	or	v0,v1,v0
	j	ra
	END(get_dcache_tag)

/* Specialized pacecar pagecopy routine that is unrolled a bit to avoid
 * d$ speculation workaround overhead except on the last bit.
 *
 * _pagecopy(void *src, void *dst, int len)
 *
 * Assumes src and dst are both cache line aligned and len is a multiple
 * of (n*2*CACHE_SLINE_SIZE)+2*CACHE_SLINE_SIZE, i.e. an even number of
 * cache lines greater than or equal to 4.
 *
 * The code does not copy the last two lines, to avoid the d$ speculation
 * on stores problem.  The T5 has a 16-deep address queue which has to
 * fill 4 times for us to do a speculative store past the end of our
 * buffer, so we are safe.  The trailer loop has an explicit cache
 * barrier.
 *
 * This is derived from the teton function of the same name.  We do not
 * use prefetch as I think we bog down the address queue enough so it
 * doesn't really become effective.  On some strides it helps.
 *
 * Copy registers: a4, a5, a6, a7
 */
LEAF(_pagecopy)
	AUTO_CACHE_BARRIERS_DISABLE	# ends 2 cache lines early (branches early)
	.set	noreorder

	CACHE_BARRIER			# ensure operands are known

	beqz	a2,2f			# skip zero length copies
	sltu	t1,a0,a1		# BDSLOT: if src < dst
	bnez	t1,_pagecopy_backwards	# then do backwards copy
	li	t0,2*CACHE_SLINE_SIZE	# BDSLOT: size of trailer
	addi	a2,-(2*CACHE_SLINE_SIZE) # do last 2 lines separately

1:	ld	a4,  0(a0) ; ld	a5, 32(a0)	# line 1 + 2
	addi	a2,-CACHE_SLINE_SIZE
	ld	a6, 80(a0) ; ld	a7,112(a0)	# line 3 + 4
	sd	a4,  0(a1) ; sd	a5, 32(a1)	# bank 1
	sd	a6, 80(a1) ; sd	a7,112(a1)	# bank 2
	ld	a4,  8(a0) ; ld	a5, 40(a0)	# bank 1
	ld	a6, 16(a0) ; ld	a7, 24(a0)	# bank 2
	sd	a4,  8(a1) ; sd	a5, 40(a1)
	sd	a6, 16(a1) ; sd	a7, 24(a1)
	ld	a4, 64(a0) ; ld	a5, 72(a0)	# bank 1
	ld	a6, 88(a0) ; ld	a7,120(a0)	# bank 2
	sd	a4, 64(a1) ; sd	a5, 72(a1)
	sd	a6, 88(a1) ; sd	a7,120(a1)
	ld	a4, 96(a0) ; ld	a5,104(a0)	# bank 1
	ld	a6, 48(a0) ; ld	a7, 56(a0)	# bank 2
	sd	a4, 96(a1) ; sd	a5,104(a1)
	sd	a6, 48(a1) ; sd	a7, 56(a1)

	daddiu	a0,CACHE_SLINE_SIZE

	bgtz	a2,1b			# keep going?
	daddiu	a1,CACHE_SLINE_SIZE	# BDSLOT: next dst cache line

1:	addi	t0,-32			# done with one chunk
	ld	a4, 0(a0) ; ld	a5, 8(a0)
	ld	a6,16(a0) ; ld	a7,24(a0)
	sd	a4, 0(a1) ; sd	a5, 8(a1)
	sd	a6,16(a1) ; sd	a7,24(a1)

	daddiu	a0,32			# next src chunk
	daddiu	a1,32			# next dst chunk
	CACHE_BARRIER_AT(-32,a0)	# quench store speculation
	bgtz	t0,1b			# keep going?
	nop				# BDSLOT

2:	j	ra
	nop				# BDSLOT

_pagecopy_backwards:
	daddu	a0,a2			# start with ending addresses
	daddu	a1,a2
	li	t0,2*CACHE_SLINE_SIZE	# size of trailer
	addi	a2,-(2*CACHE_SLINE_SIZE) # do last 2 lines separately

1:	ld	a4,  -8(a0) ; ld a5, -40(a0)	# line 1 + 2
	addi	a2,-CACHE_SLINE_SIZE
	ld	a6, -88(a0) ; ld a7,-120(a0)	# line 3 + 4
	sd	a4,  -8(a1) ; sd a5, -40(a1)	# bank 2
	sd	a6, -88(a1) ; sd a7,-120(a1)	# bank 1
	ld	a4, -16(a0) ; ld a5, -48(a0)
	ld	a6, -24(a0) ; ld a7, -32(a0)
	sd	a4, -16(a1) ; sd a5, -48(a1)	# bank 2
	sd	a6, -24(a1) ; sd a7, -32(a1)	# bank 1
	ld	a4, -72(a0) ; ld a5, -80(a0)
	ld	a6, -96(a0) ; ld a7,-128(a0)
	sd	a4, -72(a1) ; sd a5, -80(a1)	# bank 2
	sd	a6, -96(a1) ; sd a7,-128(a1)	# bank 1
	ld	a4,-104(a0) ; ld a5,-112(a0)
	ld	a6, -56(a0) ; ld a7, -64(a0)
	sd	a4,-104(a1) ; sd a5,-112(a1)	# bank 2
	sd	a6, -56(a1) ; sd a7, -64(a1)	# bank 1

	daddiu	a0,-CACHE_SLINE_SIZE

	bgtz	a2,1b			# keep going?
	daddiu	a1,-CACHE_SLINE_SIZE	# BDSLOT: next dst cache line

1:	ld	a4, -8(a0) ; ld	a5,-16(a0)
	ld	a6,-24(a0) ; ld	a7,-32(a0)
	addi	t0,-32			# done with one chunk
	sd	a4, -8(a1) ; sd	a5,-16(a1)
	sd	a6,-24(a1) ; sd	a7,-32(a1)

	daddiu	a0,-32			# next src chunk
	daddiu	a1,-32			# next dst chunk
	CACHE_BARRIER_AT(32-8,a0)	# quench store speculation
	bgtz	t0,1b			# keep going?
	nop				# BDSLOT

	AUTO_CACHE_BARRIERS_ENABLE

	j	ra
	nop				# BDSLOT
	.set	reorder
	END(_pagecopy)
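
/* A hedged wrapper sketch making the stated preconditions explicit
 * (names are illustrative; _pagecopy itself does no checking):
 *
 *	void checked_pagecopy(void *src, void *dst, int len)
 *	{
 *		ASSERT(((__psunsigned_t)src & (CACHE_SLINE_SIZE-1)) == 0);
 *		ASSERT(((__psunsigned_t)dst & (CACHE_SLINE_SIZE-1)) == 0);
 *		ASSERT(len >= 4*CACHE_SLINE_SIZE &&
 *		       len % (2*CACHE_SLINE_SIZE) == 0);
 *		_pagecopy(src, dst, len);
 *	}
 */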

/* Specialized pacecar pagezero routine that is unrolled a bit to help
 * with the d$ speculation problems.  It stops 3 lines early, which allows
 * us to zero at full speed, only slow down at the end, and still avoid
 * bogus cache dirtying past the buffer.
 *
 * _pagezero(void *dst, int len)
 *
 * The dst address must be cache line aligned, and the length must be
 * more than 4 cache lines long (1 then 3 trailers).
 *
 * This is derived from the teton function of the same name, but does
 * not use prefetch as, with non-blocking caches, we don't have enough
 * time to really hide the latency.
 */
LEAF(_pagezero)
	.set	noreorder
	AUTO_CACHE_BARRIERS_DISABLE	# loop ends 3 lines early.

	CACHE_BARRIER			# ensure operands are known

	beqz	a1,2f			# make sure length is non-zero
	li	t0,3*CACHE_SLINE_SIZE	# BDSLOT: size of secondary copy
	addi	a1,-(3*CACHE_SLINE_SIZE) # do last 3 lines separately

1:	sd	zero,  0(a0) ; sd zero, 32(a0)	# line 1 + 2
	addi	a1,-CACHE_SLINE_SIZE
	sd	zero, 80(a0) ; sd zero,112(a0)	# line 3 + 4
	sd	zero,  8(a0) ; sd zero, 40(a0)	# bank 1
	sd	zero, 16(a0) ; sd zero, 24(a0)	# bank 2
	sd	zero, 64(a0) ; sd zero, 72(a0)	# bank 1
	sd	zero, 88(a0) ; sd zero,120(a0)	# bank 2
	sd	zero, 96(a0) ; sd zero,104(a0)	# bank 1
	sd	zero, 48(a0) ; sd zero, 56(a0)	# bank 2
	bgtz	a1,1b			# loop more?
	daddiu	a0,CACHE_SLINE_SIZE	# BDSLOT: bump address

1:	sd	zero,  0(a0) ; sd zero, 32(a0)	# line 1 + 2
	addi	t0,-CACHE_SLINE_SIZE
	sd	zero, 80(a0) ; sd zero,112(a0)	# line 3 + 4
	sd	zero,  8(a0) ; sd zero, 40(a0)	# bank 1
	sd	zero, 16(a0) ; sd zero, 24(a0)	# bank 2
	sd	zero, 64(a0) ; sd zero, 72(a0)	# bank 1
	sd	zero, 88(a0) ; sd zero,120(a0)	# bank 2
	sd	zero, 96(a0) ; sd zero,104(a0)	# bank 1
	sd	zero, 48(a0) ; sd zero, 56(a0)	# bank 2
	CACHE_BARRIER_AT(0,a0)		# prevent speculation
	bgtz	t0,1b			# loop more?
	daddiu	a0,CACHE_SLINE_SIZE	# BDSLOT: bump address

	AUTO_CACHE_BARRIERS_ENABLE

2:	j	ra
	nop
	.set	reorder
	END(_pagezero)

/* void nowar_bcopy(from, to, count);
 *	unsigned char *from, *to;
 *	unsigned long count;
 *
 * Copied from usercopy.s, removed the #ifdefs, and turned off cache
 * barriers.
 */
#define	MINCOPY	12

#define	from	a0			/* registers used */
#define	to	a1
#define	count	a2

#define	LWS	lwl
#define	LWB	lwr
#define	LDS	ldl
#define	LDB	ldr
#define	SWS	swl
#define	SWB	swr
#define	SDS	sdl
#define	SDB	sdr

/* Use the backwards copying code if the from and to regions overlap.
 * Do not worry about zero-length or other silly copies; they are not
 * worth the time to optimize.
 */
LEAF(nowar_bcopy)
	AUTO_CACHE_BARRIERS_DISABLE
	LI	t0,1<<63		# bit 63 means a kernel addr
	and	t0,a1,t0		# kernel dst needs WAR
	beqz	t0,1f
	j	bcopy
1:
	ORD_CACHE_BARRIER_AT(0,sp)	# ensure above check is ok

	PTR_ADDU v0,from,count		# v0 := from + count
	ble	to,from,goforwards	# if to <= from, then copy forwards
	blt	to,v0,gobackwards	# backwards if from < to < from+count

/* Forward copy code.  Check for pointer alignment and try to get both
 * pointers aligned on a long boundary.
 */
goforwards:
	blt	count,MINCOPY,fbcopy
	/* If possible, align source & destination on a 64-bit boundary.
	 */
	and	v0,from,7
	and	v1,to,7
	li	a3,8
	bne	v0,v1,align32		# low bits are different

/* Pointers are 64-bit alignable (and may already be aligned).  Since
 * v0 == v1, we need only check what value v0 has to see how to get
 * aligned.  Also, since we have eliminated tiny copies, we know that the
 * count is large enough to encompass the alignment copies.
 */
	beq	v0,zero,1f		# if v0==0 then aligned
	subu	a3,a3,v1		# a3 = # bytes to get aligned
	LDS	v0,0(from)
	SDS	v0,0(to)		# copy partial word
	PTR_ADDU from,a3
	PTR_ADDU to,a3
	subu	count,a3
1:
/* When we get here, source and destination are 64-bit aligned.  Check if
 * we have at least 64 bytes to move.
 */
	and	a3,count,~(64-1)
	beq	a3,zero,forwards	# go do 32-bit copy
	PTR_ADDU a3,a3,to
64:
	ld	t0,0(from); ld	t1,8(from)
	ld	t2,16(from); ld	t3,24(from)
	ld	ta0,32(from); ld ta1,40(from); ld ta2,48(from); ld ta3,56(from)
	sd	t0,0(to); sd t1,8(to); sd t2,16(to); sd t3,24(to)
	sd	ta0,32(to); sd ta1,40(to); sd ta2,48(to); sd ta3,56(to)
	PTR_ADDU from,64
	PTR_ADDU to,64

	bne	a3,to,64b

	and	count,64-1	# still have to copy non-64-multiple bytes
	b	forwards	# complete with 32-bit copy

align32:
	and	v0,from,3
	and	v1,to,3
	li	a3,4
	bne	v0,v1,fmcopy		# low bits are different

/* Pointers are alignable and may be aligned.  Since v0 == v1, we need
 * only check what value v0 has to see how to get aligned.  Also, since we
 * have eliminated tiny copies, we know that the count is large enough to
 * encompass the alignment copies.
 */
	beq	v0,zero,forwards	# if v0==0 then aligned
	subu	a3,a3,v1		# a3 = # bytes to get aligned
	LWS	v0,0(from)
	SWS	v0,0(to)		# copy partial word
	PTR_ADDU from,a3
	PTR_ADDU to,a3
	subu	count,a3

/* Once we are here, the pointers are aligned on 32-bit boundaries.
 */
forwards:
	and	a3,count,~(32-1)
	beq	a3,zero,16f
	PTR_ADDU a3,a3,to
32:
	lw	t0,0(from); lw	t1,4(from); lw	t2,8(from); lw	t3,12(from)
	lw	ta0,16(from); lw ta1,20(from); lw ta2,24(from); lw ta3,28(from)
	sw	t0,0(to); sw t1,4(to); sw t2,8(to); sw t3,12(to)
	sw	ta0,16(to); sw ta1,20(to); sw ta2,24(to); sw ta3,28(to)
	PTR_ADDU from,32
	PTR_ADDU to,32
	bne	a3,to,32b

/* We know we have fewer than 32 bytes remaining, so we do no more
 * adjustments of the count.
 */
16:	and	v0,count,16
	beq	v0,zero,8f
	lw	t0,0(from); lw	t1,4(from); lw	t2,8(from); lw	t3,12(from)
	sw	t0,0(to); sw t1,4(to); sw t2,8(to); sw t3,12(to)
	PTR_ADDU from,16
	PTR_ADDU to,16

8:	and	v1,count,8
	beq	v1,zero,4f
	lw	t0,0(from)
	lw	t1,4(from)
	sw	t0,0(to)
	sw	t1,4(to)
	PTR_ADDU from,8
	PTR_ADDU to,8

4:	and	v0,count,4
	beq	v0,zero,3f
	lw	t0,0(from)
	sw	t0,0(to)
	PTR_ADDU from,4
	PTR_ADDU to,4

3:	and	v1,count,3
	PTR_ADDU from,v1
	beq	v1,zero,ret
	PTR_ADDU to,v1
	LWB	t0,-1(from)
	SWB	t0,-1(to)
	j	ra

fmcopy:
/* Misaligned, non-overlapping copy of many bytes.  This happens too
 * often.  Align the destination for machines with write-thru caches.
 *
 * This code is always for machines that prefer nops between stores.
 *
 * Here v1 = low bits of destination, a3 = 4.
 */
	beq	v1,zero,fmcopy4		# if v1==0 then destination is aligned
	subu	a3,a3,v1		# a3 = # bytes to align destination
	subu	count,a3
	PTR_ADDU a3,to
1:	lb	v0,0(from)
	PTR_ADDU from,1
	sb	v0,0(to)
	PTR_ADDU to,1
	bne	to,a3,1b

fmcopy4:
	and	a3,count,~(16-1)
	beq	a3,zero,8f
	PTR_ADDU a3,a3,to
16:	LWS	t0,0(from); LWB	t0,0+3(from)
	LWS	t1,4(from); LWB	t1,4+3(from); sw t0,0(to)
	LWS	t2,8(from); LWB	t2,8+3(from); sw t1,4(to)
	LWS	t3,12(from); LWB t3,12+3(from); sw t2,8(to)
	sw	t3,12(to)
	PTR_ADDU from,16
	PTR_ADDU to,16
	bne	a3,to,16b

8:	and	v1,count,8
	beq	v1,zero,4f
	LWS	t0,0(from); LWB	t0,0+3(from)
	LWS	t1,4(from); LWB	t1,4+3(from); sw t0,0(to)
	sw	t1,4(to)
	PTR_ADDU from,8
	PTR_ADDU to,8

4:	and	v0,count,4
	and	count,3
	beq	v0,zero,fbcopy
	LWS	t0,0(from); LWB	t0,0+3(from); sw t0,0(to)
	PTR_ADDU from,4
	PTR_ADDU to,4


/* Byte at a time copy code.  This is used when the byte count is small.
 */
fbcopy:
	PTR_ADDU a3,from,count		# a3 = end+1
	beq	count,zero,ret		# if count is zero, then we are done

1:	lb	v0,0(from)		# v0 = *from
	PTR_ADDU from,1			# advance pointer
	sb	v0,0(to)		# store byte
	PTR_ADDU to,1			# advance pointer
	bne	from,a3,1b		# loop until done
ret:	j	ra			# return to caller


/*****************************************************************************/

/*
 * Backward copy code.  Check for pointer alignment and try to get both
 * pointers aligned on a long boundary.
 */
gobackwards:
	PTR_ADDU from,count		# advance to end + 1
	PTR_ADDU to,count		# advance to end + 1

	/* small byte counts use byte at a time copy */
	blt	count,MINCOPY,backwards_bytecopy
	and	v0,from,3		# v0 := from & 3
	and	v1,to,3			# v1 := to & 3
	beq	v0,v1,backalignable	# low bits are identical
/*
 * Byte at a time copy code.  This is used when the pointers are not
 * alignable, when the byte count is small, or when cleaning up any
 * remaining bytes on a larger transfer.
 */
backwards_bytecopy:
	beq	count,zero,ret		# if count is zero, quit
	PTR_SUBU from,1			# reduce by one (point at byte)
	PTR_SUBU to,1			# reduce by one (point at byte)
	PTR_SUBU v1,from,count		# v1 := original from - 1

99:	lb	v0,0(from)		# v0 = *from
	PTR_SUBU from,1			# back up pointer
	sb	v0,0(to)		# store byte
	PTR_SUBU to,1			# back up pointer
	bne	from,v1,99b		# loop until done
	j	ra			# return to caller

/*
 * Pointers are alignable, and may be aligned.  Since v0 == v1, we need
 * only check what value v0 has to see how to get aligned.  Also, since we
 * have eliminated tiny copies, we know that the count is large enough to
 * encompass the alignment copies.
 */
backalignable:
	beq	v0,zero,backwards	# if v0==v1 && v0==0 then aligned
	beq	v0,3,back_copy3		# need to copy 3 bytes to get aligned
	beq	v0,2,back_copy2		# need to copy 2 bytes to get aligned

	/* need to copy 1 byte */
	lb	v0,-1(from)		# get one byte
	PTR_SUBU from,1			# back up pointer
	sb	v0,-1(to)		# store one byte
	PTR_SUBU to,1			# back up pointer
	subu	count,1			# and reduce count
	b	backwards		# now pointers are aligned

	/* need to copy 2 bytes */
back_copy2:
	lh	v0,-2(from)		# get one short
	PTR_SUBU from,2			# back up pointer
	sh	v0,-2(to)		# store one short
	PTR_SUBU to,2			# back up pointer
	subu	count,2			# and reduce count
	b	backwards

	/* need to copy 3 bytes */
back_copy3:
	lb	v0,-1(from)		# get one byte
	lh	v1,-3(from)		# and one short
	PTR_SUBU from,3			# back up pointer
	sb	v0,-1(to)		# store one byte
	sh	v1,-3(to)		# and one short
	PTR_SUBU to,3			# back up pointer
	subu	count,3			# and reduce count
	/* FALLTHROUGH */
/*
 * Once we are here, the pointers are aligned on long boundaries.
 * Begin copying in large chunks.
 */
backwards:

	/* 32 byte at a time loop */
backwards_32:
	blt	count,32,backwards_16	# do 16 bytes at a time
	lw	v0,-4(from)
	lw	v1,-8(from)
	lw	t0,-12(from)
	lw	t1,-16(from)
	lw	t2,-20(from)
	lw	t3,-24(from)
	lw	ta0,-28(from)
	lw	ta1,-32(from)		# fetch 8*4 bytes
	PTR_SUBU from,32		# back up from pointer now
	sw	v0,-4(to)
	sw	v1,-8(to)
	sw	t0,-12(to)
	sw	t1,-16(to)
	sw	t2,-20(to)
	sw	t3,-24(to)
	sw	ta0,-28(to)
	sw	ta1,-32(to)		# store 8*4 bytes
	PTR_SUBU to,32			# back up to pointer now
	subu	count,32		# reduce count
	b	backwards_32		# try some more

	/* 16 byte at a time loop */
backwards_16:
	blt	count,16,backwards_4	# do rest in words
	lw	v0,-4(from)
	lw	v1,-8(from)
	lw	t0,-12(from)
	lw	t1,-16(from)
	PTR_SUBU from,16		# back up from pointer now
	sw	v0,-4(to)
	sw	v1,-8(to)
	sw	t0,-12(to)
	sw	t1,-16(to)
	PTR_SUBU to,16			# back up to pointer now
	subu	count,16		# reduce count
	b	backwards_16		# try some more

	/* 4 byte at a time loop */
backwards_4:
	blt	count,4,backwards_bytecopy # do rest
	lw	v0,-4(from)
	PTR_SUBU from,4			# back up from pointer
	sw	v0,-4(to)
	PTR_SUBU to,4			# back up to pointer
	subu	count,4			# reduce count
	b	backwards_4
	AUTO_CACHE_BARRIERS_ENABLE
	END(nowar_bcopy)
#undef from
#undef to
#undef count