1
0
Files
irix-657m-src/irix/kern/ml/IP28asm.s
2022-09-29 17:59:04 +03:00

1135 lines
29 KiB
ArmAsm

/*
* IP28 specific assembly routines; cpuid always 0, also make semaphore
* macros a no-op.
*/
#ident "$Revision: 1.58 $"
#include "ml/ml.h"
#include <sys/RACER/gda.h>
#include <sys/dump.h>
/* dummy routines whose return value is unimportant (or no return value).
Some return reasonable values on other machines, but should never
be called, or the return value should never be used on other machines.
*/
LEAF(dummy_func)
XLEAF(check_delay_tlbflush)
XLEAF(check_delay_iflush)
XLEAF(da_flush_tlb)
XLEAF(dma_mapinit)
XLEAF(apsfail)
XLEAF(disallowboot)
XLEAF(rmi_fixecc)
XLEAF(vme_init)
XLEAF(vme_ivec_init)
XLEAF(debug_stop_all_cpus)
XLEAF(bump_leds)
XLEAF(reset_enet_carrier) /* for if_ec2.c */
j ra
END(dummy_func)
LEAF(dcache_wb)
XLEAF(dcache_wbinval)
XLEAF(dki_dcache_wb)
XLEAF(dki_dcache_wbinval)
LI a2,CACH_DCACHE|CACH_INVAL|CACH_WBACK|CACH_IO_COHERENCY
j cache_operation
END(dcache_wb)
LEAF(dki_dcache_inval)
LI a2,CACH_DCACHE|CACH_INVAL|CACH_IO_COHERENCY
j cache_operation
END(dki_dcache_inval)
/* dummy routines that return 0 */
LEAF(dummyret0_func)
XLEAF(vme_adapter)
XLEAF(is_vme_space)
XLEAF(getcpuid)
XLEAF(disarm_threeway_trigger)
XLEAF(threeway_trigger_armed)
#ifdef DEBUG
XLEAF(getcyclecounter)
#endif /* DEBUG */
/* Semaphore call stubs */
XLEAF(appsema)
XLEAF(apvsema)
XLEAF(apcvsema)
move v0,zero
j ra
END(dummyret0_func)
/* dummy routines that return 1 */
LEAF(dummyret1_func)
XLEAF(apcpsema) /* can always get on non-MP machines */
XLEAF(enet_carrier_on) /* for if_ec2.c */
li v0, 1
j ra
END(dummyret1_func)
/* unsigned int get_count(void)
*/
LEAF(get_count)
XLEAF(get_r4k_counter) /* for compat and R4800 is a r4k */
XLEAF(_get_timestamp) /* return timestamp on SP */
.set noreorder
MFC0(v0, C0_COUNT)
.set reorder
j ra
END(get_count)
/* clears processor clock interrupt and we continue */
LEAF(pcount_intr)
.set noreorder
MFPC(t0,PRFCNT0) # hardware performance counters
bltz t0,1f # counter 0 overflow
nop
MFPC(t0,PRFCNT1)
bltz t0,1f # counter 1 overflow
nop # not performance, fall thru & return
j ra
mtc0 zero,C0_COMPARE # BDSLOT: ack intr
1: # performance
j hwperf_intr # call hw performance interrupt
nop # BDSLOT
.set reorder
END(pcount_intr)
/*
*
* writemcreg (reg, val)
*
* Basically this does *(volatile uint *)(PHYS_TO_COMPATK1(reg)) = val;
* a0 - physical register address
* a1 - value to write
*
* This was a workaround for a bug in the first rev MC chip, but IP28
* has only rev D (or greater) MCs, so just do the actual operation.
*/
LEAF(writemcreg)
or a0,COMPAT_K1BASE # a0 = PHYS_TO_COMPATK1(a0)
sw a1,0(a0)
j ra
END(writemcreg)
/*
* Write the VDMA MEMADR, MODE, SIZE, STRIDE registers
*
* write4vdma (buf, mode, size, stride);
*/
LEAF(write4vdma)
#if DMA_MEMADR & 0x8000
#error DMA_MEMADDR broken for IP28!
#endif
LI v0, (COMPAT_K1BASE | DMA_MEMADR)& (~0xffff)
sw a0,DMA_MEMADR & 0xffff(v0)
sw a1,DMA_MODE & 0xffff(v0)
sw a2,DMA_SIZE & 0xffff(v0)
sw a3,DMA_STRIDE & 0xffff(v0)
j ra
END(write4vdma)
#define MEMACC_XOR (CPU_MEMACC_SLOW&0x3fff)
#define CPU_MEMACC_OFFSET CPU_MEMACC-CPUCTRL0
#define MEMCFG1_OFFSET MEMCFG1-CPUCTRL0
#define LINESIZE CACHE_SLINE_SIZE
/* Enable uncachedable writes via slow memory, returning the old state.
*
* Critical section on one cache line to prevent writebacks during
* the mode switch.
*
* May be called from ECC handler w/o an SP, so do not allow cache ops
* here as all stores are to constructed addresses.
*/
LEAF(ip28_enable_ucmem)
XLEAF(ip26_enable_ucmem)
AUTO_CACHE_BARRIERS_DISABLE # all stores have dependancies
lw a2,ip28_memacc_slow # slow mode bits from memory
LI a0,K1BASE # K1
or a4,a0,ECC_CTRL_REG # ECC PAL ctrl reg.
or a0,a0,CPUCTRL0 # PHYS_TO_K1(CPUCTRL0)
lw t2,MEMCFG1_OFFSET(a0) # set-up memory config
and t3,t2,0xffff0000 # save good side of register
or t3,ECC_MEMCFG # add ECC register
.set noreorder
mfc0 t0,C0_SR # disable interrupts
ori t1,t0,SR_IE
xori t1,SR_IE
.align 7
mtc0 t1,C0_SR # critical begin
mfc0 zero,C0_SR # barrier
sw t3,MEMCFG1_OFFSET(a0) # map ecc part
lw t1,CPU_MEMACC_OFFSET(a0) # get MC memory config
andi v0,t1,0x3fff # important bits
xori v0,MEMACC_XOR # 0=slow, !0=normal
sw a2,CPU_MEMACC_OFFSET(a0) # go to slow mode on MC
lw zero,0(a0) # flushbus
sync
lw zero,0(a0) # flushbus continued
li a2,ECC_CTRL_DISABLE # disable ECC chk (uc writes ok)
sd a2,0(a4) # Enter slow mode.
lw zero,0(a0) # flushbus
sync
lw zero,0(a0) # flushbus continued
sw t2,MEMCFG1_OFFSET(a0) # restore mapping
lw zero,0(a0) # flushbus
sync
mtc0 t0,C0_SR # restore C0_SR
.set reorder
j ra
AUTO_CACHE_BARRIERS_ENABLE
END(ip28_enable_ucmem)
/* Disable uncacheable writes via faster memory, returning the old state.
*
* Critical section on one cache line to prevent writebacks during
* the mode switch.
*
* May be called from ECC handler w/o an SP, so do not allow cache ops
* here as all stores are to constructed addresses.
*/
LEAF(ip28_disable_ucmem)
XLEAF(ip26_disable_ucmem)
AUTO_CACHE_BARRIERS_DISABLE # all stores have dependancies
lw a2,ip28_memacc_norm # norm mode bits from memory
LI a0,K1BASE # K1
or a4,a0,ECC_CTRL_REG # ECC PAL ctrl reg.
or a0,a0,CPUCTRL0 # PHYS_TO_K1(CPUCTRL0)
lw t2,MEMCFG1_OFFSET(a0) # set-up memory config
and t3,t2,0xffff0000 # save good side of register
or t3,ECC_MEMCFG # add ECC register
.set noreorder
mfc0 t0,C0_SR # disable interrupts
ori t1,t0,SR_IE
xori t1,SR_IE
.align 7
mtc0 t1,C0_SR # critical begin
mfc0 zero,C0_SR # barrier
sw t3,MEMCFG1_OFFSET(a0) # map ecc part
lw t1,CPU_MEMACC_OFFSET(a0) # get MC memory config (flush)
andi v0,t1,0x3fff # important bits
xori v0,MEMACC_XOR # 0=slow, !0=normal
sd zero,0(a4) # ECC_CTRL_ENABLE==0 (Fast)
lw zero,0(a0) # flushbus
sync
lw zero,0(a0) # flushbus continued
sw a2,CPU_MEMACC_OFFSET(a0) # go to normal mode on MC
lw zero,0(a0) # flushbus
sync
lw zero,0(a0) # flushbus continued
sw t2,MEMCFG1_OFFSET(a0) # restore mapping
lw zero,0(a0) # flushbus
sync
mtc0 t0,C0_SR # restore C0_SR
.set reorder
j ra
AUTO_CACHE_BARRIERS_ENABLE
END(ip28_disable_ucmem)
LEAF(unmap_ecc)
AUTO_CACHE_BARRIERS_DISABLE # address dependancy on k0
CLI t0,PHYS_TO_COMPATK1(MEMCFG1)
lw v0,0(t0) # get current
and t3,v0,0xffff0000 # keep upper word (drop ECC)
sw t3,0(t0) # write back
lw zero,0(t0)
sync
j ra
AUTO_CACHE_BARRIERS_ENABLE
END(unmap_ecc)
/* Routine to map PAL, do write, then unmap PAL */
LEAF(ip28_write_pal)
CLI t0,PHYS_TO_COMPATK1(MEMCFG1)
LI t1,PHYS_TO_K1(ECC_CTRL_REG)
.set noreorder
AUTO_CACHE_BARRIERS_DISABLE # all stores are uncached
mfc0 v1,C0_SR # disable interrupts
or t2,v1,SR_IE
xori t2,SR_IE
mtc0 t2,C0_SR
/* Enable ECC bank */
lw v0,0(t0) # MEMCFG1
and t3,v0,0xffff0000 # keep upper word
or t3,ECC_MEMCFG # or in the ECC mapping
sw t3,0(t0) # write new value back
lw zero,0(t0) # flushbus
sync
lw zero,0(t0)
mfc0 zero,C0_SR # barrier
/* Do write and then flush */
sd a0,0(t1) # write ECC PAL
lw zero,0(t0) # flush
sync
mfc0 zero,C0_SR # barrier
/* Disable Bank and re-enable interrupts */
sw v0,0(t0) # write MC
lw zero,0(t0) # flush
sync
mfc0 zero,C0_SR # barrier
mtc0 v1,C0_SR # restore C0_SR
AUTO_CACHE_BARRIERS_ENABLE
.set reorder
j ra
END(ip28_write_pal)
/* Do correct ECC errors in line if possible (normal mode)
*/
LEAF(ip28_ecc_correct)
dli a0,ECC_DEFAULT
/* Enable ECC bank */
CLI t0,PHYS_TO_COMPATK1(MEMCFG1)
lw v0,0(t0) # MEMCFG1
and t3,v0,0xffff0000 # keep upper word
or t3,ECC_MEMCFG # or in the ECC mapping
sw t3,0(t0) # write new value back
.set noreorder
lw zero,0(t0) # flushbus
sync
lw zero,0(t0)
mfc0 zero,C0_SR # barrier
.set reorder
/* Make sure ecc write address does not conflict with instructions */
LI a1,PHYS_TO_K0(ECC_CTRL_BASE)
LA a2,1f # addr of instructions
and a2,0x3ff # 8 line mask
add a2,4*CACHE_SLINE_SIZE # split the difference
PTR_ADDU a1,a2
b 1f
/* Cached write_chip0/1 and ctrl */
LI a1,PHYS_TO_K0(ECC_CTRL_BASE)
.align 7
.set noreorder
1: sd a0,0(a1) # Write 49C466 0
sd a0,8(a1) # Write 49C466 1
sd a0,16(a1) # Replicate the above
sd a0,24(a1) # writes throughout the
sd a0,32(a1) # cache line. Each quad-
sd a0,40(a1) # word actually writes
sd a0,48(a1) # the ECC parts!
sd a0,56(a1) #
sd a0,64(a1) #
sd a0,72(a1) #
sd a0,80(a1) #
sd a0,88(a1) #
sd a0,96(a1) #
sd a0,104(a1) #
sd a0,112(a1) #
sd a0,120(a1) #
/* Write back invalidate the line */
cache CACH_SD|C_HWBINV,0(t0)
cache CACH_BARRIER,0(t0)
.set reorder
/* Disable Bank and re-enable interrupts */
sw v0,0(t0) # write MC
.set noreorder
lw zero,0(t0) # flush
sync
mfc0 zero,C0_SR # barrier
.set reorder
j ra
END(ip28_ecc_correct)
/* Return to previous memory system state. 1 == fast, 0 == slow.
*/
LEAF(ip28_return_ucmem)
XLEAF(ip26_return_ucmem)
AUTO_CACHE_BARRIERS_DISABLE # may be called from ECC hndlr
beqz a0,1f
b ip28_disable_ucmem # going to normal mode
1: b ip28_enable_ucmem # going to slow mode
AUTO_CACHE_BARRIERS_ENABLE
END(ip28_return_ucmem)
/* return the content of the R10000 C0 config register */
LEAF(get_r10k_config)
.set noreorder
mfc0 v0,C0_CONFIG
.set reorder
j ra
END(get_r10k_config)
/* Return size of secondary cache (really max cache size for start-up) */
LEAF(getcachesz)
.set noreorder
mfc0 v1,C0_CONFIG
and v1,CONFIG_SS
dsrl v1,CONFIG_SS_SHFT
dadd v1,CONFIG_SCACHE_POW2_BASE
li v0,1
j ra
dsll v0,v1 # cache size in byte
.set reorder
END(getcachesz)
/* Write back/invalidate one line from the cache. This can be used by drivers
* (enet uses it now) to have a lower overhead cacheflush when getting around
* problems with IP28 ECC baseboard.
*
* Accepts a full 64-bit bit phys, K0, or K1 address (enet tends to pass K1).
*/
LEAF(__dcache_line_wb_inval)
.set noreorder
and a0,TO_PHYS_MASK # KDM_TO_K0(a)
or a0,K0BASE
cache CACH_SD|C_HWBINV,0(a0) # write back line in a0
cache CACH_BARRIER,0(a0) # make sure line is out
.set reorder
j ra
END(__dcache_line_wb_inval)
#define NMI_ERREPC 0
#define NMI_EPC 8
#define NMI_SP 16
#define NMI_RA 24
#define NMI_SAVE_REGS 4
LEAF(nmi_dump)
.set noreorder
li k0,SR_KADDR|SR_DEFAULT # make sure C0_SR is sane
MTC0 (k0,C0_SR)
# Save some registers in nmi_saveregs.
LA k0,nmi_saveregs
DMFC0 (k1,C0_ERROR_EPC)
CACHE_BARRIER_AT(NMI_ERREPC,k0) # probably not needed
sd k1,NMI_ERREPC(k0)
DMFC0 (k1,C0_EPC)
sd k1,NMI_EPC(k0)
sd sp,NMI_SP(k0)
sd ra,NMI_RA(k0)
LA sp,dumpstack # move to dump stack
LI gp,DUMP_STACK_SIZE-16
PTR_ADD sp,gp # Set our stack pointer.
LA gp,_gp # reload gp
jal ip28_ecc_error # error handler for NMI/IP28 ECC
nop # BDSLOT
.set reorder
END(nmi_dump)
.data
EXPORT(nmi_saveregs)
.dword 0: NMI_SAVE_REGS
.text
/* void delayloop(int count, int decr)
* - delay loop with a loop factoring, and also helps sometimes with
* messy compilers.
* - more scale is needed as there are no uncached loads/cacheops around
* this code like the _ticksper1024instr code.
*/
LEAF(delayloop)
.set noreorder
sync # force loads out of T5
#ifdef US_DELAY_DEBUG
mfc0 t0,C0_COUNT
#endif
sll a0,1 # scale a bit more
1:
nop;nop;nop;nop;nop;nop;nop
bgt a0,zero,1b
subu a0,a1 # BDSLOT
#ifdef US_DELAY_DEBUG
mfc0 t1,C0_COUNT
CACHE_BARRIER
sw t0,us_before
sw t1,us_after
#endif
.set reorder
j ra
END(delayloop)
/* time dallas clock for 8 hundreths of a second, then scale to 10ms */
#define MAXSPIN 0x1fffff /* semi arbitrary... */
LEAF(_ticksper80ms)
LI t0,RT_CLOCK_ADDR
li a6,MAXSPIN
1: lw a0,0(t0) # wait for lower nibble of BCD == 0
add a6,-1
blez a6,9f # loop limiter
and a0,0x0f
bnez a0,1b
li a1,1
li a6,MAXSPIN
1: lb a0,0(t0) # wait for lower nibble of BCD == 1
and a0,0x0f # incase already @ 0.
add a6,-1
blez a6,9f # loop limiter
bne a0,a1,1b
.set noreorder
mtc0 zero,C0_COUNT # start @ 0.
li a1,9 # wait for 8/100ths of a second
li a6,MAXSPIN
1: lw a0,0(t0) # get current ticker
add a6,-1
blez a6,9f # loop limiter
and a0,0x0f # BDSLOT: hundreths
blt a0,a1,1b # spin for time
nop # BDSLOT
mfc0 v0,C0_COUNT # end time
.set reorder
j ra
9:
li v0,7800000 # gag, like at 195Mhz
j ra
END(_ticksper80ms)
/* Code to allow crippled support of IP26 baseboard to handle ECC exceptions.
* It requires the cache to work at CACHE_ERR_FRAME for a few lines. This
* will not be the case for IP28, which can do double word stores in fast
* mode.
*
* Also must jump to SEG0_BASE version of ecc_handler as cannot jump from
* alias to high memory w/o a jump register.
*/
LEAF(ecc_springboard)
.set noreorder
.set noat # AT is not yet saved
AUTO_CACHE_BARRIERS_DISABLE # code runs uncached...
/* Save 1st register in C0 by jumping through 3 hoops */
MTC0(k1,C0_LLADDR) # save the 32 LSBs of k1
dsrl32 k1,0
dsll k1,3 # make sure the 3 LSBs are 0's
MTC0(k1,C0_WATCHLO) # save the middle 29 bits of k1
dsrl32 k1,0
MTC0(k1,C0_WATCHHI) # save the 3 MSBs of k1
/* Check board revision, IP26 and IP26+ to find how to do the eframe */
CLI k1,PHYS_TO_COMPATK1(HPC3_SYS_ID)# board revision info
lw k1,0(k1)
andi k1,BOARD_REV_MASK # isolate board rev
sub k1,IP26_ECCSYSID # IP26, IP26+
bgtz k1,1f # skip if on IP28 bd
nop # BDSLOT
/* Need to cache the in a safe buffer range. If we have more than
* one error, we won't know, but that's a panic anyway.
*/
mfc0 k1,C0_CACHE_ERR # get cache err.
and k1,CACHE_TMP_EMASK # mask our index
sub k1,CACHE_TMP_EFRAME1
bnez k1,77f # if no match use frame1
nop # BDSLOT
LI k1,K0BASE|CACHE_TMP_EFRAME1
b 2f # skip IP28 case
nop # BDSLOT
77: # else use frame2
LI k1,K0BASE|CACHE_TMP_EFRAME2
b 2f # skip IP28 case
nop # BDLOST
1: LA k1,cacheErr_frames # ptr to ECC frame
dsll k1,8 # convert to physical
dsrl k1,8 # so we avoid the cache
PTR_L k1,0(k1) # get ECCF ptr
2:
sreg k0,EF_K0(k1) # save K0/AT so we have
sreg AT,EF_AT(k1) # some breathing room.
/* reconstruct and save k1 in frame */
MFC0(k0,C0_WATCHHI)
dsll32 k0,29
MFC0(AT,C0_WATCHLO)
dsll32 AT,0 # remove sign extensions
dsrl AT,3
or k0,AT
MFC0(AT,C0_LLADDR)
dsll32 AT,0 # remove sign extensions
dsrl32 AT,0
or k0,AT
sreg k0,EF_K1(k1) # ah, we can save K1 value
.set at
/* jump to direct mapped, high memory version of ecc_exception */
LA k0,ecc_exception # base address
and k0,0x7fffffff # KDM_TO_PHYS (enough of it)
jr k0 # call ecc_exception
nop # BDSLOT
.set reorder
AUTO_CACHE_BARRIERS_ENABLE
END(ecc_springboard)
LEAF(get_scache_tag)
.set noreorder
cache CACH_S|C_ILT,0(a0)
mfc0 v1,C0_TAGHI
mfc0 v0,C0_TAGLO
.set reorder
dsll v1,v1,32
or v0,v1,v0
j ra
END(get_scache_tag)
LEAF(get_dcache_tag)
.set noreorder
cache CACH_PD|C_ILT,0(a0)
mfc0 v1,C0_TAGHI
mfc0 v0,C0_TAGLO
.set reorder
dsll v1,v1,32
or v0,v1,v0
j ra
END(get_dcache_tag)
/* Specialized pacecar pagecopy routine that is unrolled a bit to avoid
* d$ speculation workaround overhead execept on the last bit.
*
* _pagecopy(void *src, void *dst, int len)
*
* Assumes src and dst are both cache line aligned and len is a multiple
* of (n*2*CACHE_SLINE_SIZE)+2*CACHE_SLINE_SIZE, ie an even number of cache
* lines greater than or equal to 4.
*
* The code does not copy the last two lines to avoid the d$ speculation
* on stores problem. The T5 has a 16 deep Address queue which has to fill
* 4 times for us to do a speculative store past the end of our buffer so
* we are safe. The trailer loop has an explicit cache barrier.
*
* This is derived from the teton function of the same name. We do not
* use prefetch as I think we bog down the address queue enough so it
* doesn't really become effective. On some strides it helps.
*
* Copy registers: a4, a5, a6, a7
*/
LEAF(_pagecopy)
AUTO_CACHE_BARRIERS_DISABLE # ends 2 cache lines (branches early)
.set noreorder
CACHE_BARRIER # ensure operands are known
beqz a2,2f # skip zero length copies
sltu t1,a0,a1 # BDSLOT: if src < dst
bnez t1,_pagecopy_backwards # then do backwards copy
li t0,2*CACHE_SLINE_SIZE # BDSLOT: size of trailer
addi a2,-(2*CACHE_SLINE_SIZE)# do last 2 lines seperately
1: ld a4, 0(a0) ; ld a5, 32(a0) # line 1 + 2
addi a2,-CACHE_SLINE_SIZE # add
ld a6, 80(a0) ; ld a7,112(a0) # line 3 + 4
sd a4, 0(a1) ; sd a5, 32(a1) # bank 1
sd a6, 80(a1) ; sd a7,112(a1) # bank 2
ld a4, 8(a0) ; ld a5, 40(a0) # bank 1
ld a6, 16(a0) ; ld a7, 24(a0) # bank 2
sd a4, 8(a1) ; sd a5, 40(a1)
sd a6, 16(a1) ; sd a7, 24(a1)
ld a4, 64(a0) ; ld a5, 72(a0) # bank 1
ld a6, 88(a0) ; ld a7,120(a0) # bank 2
sd a4, 64(a1) ; sd a5, 72(a1)
sd a6, 88(a1) ; sd a7,120(a1)
ld a4, 96(a0) ; ld a5,104(a0) # bank 1
ld a6, 48(a0) ; ld a7, 56(a0) # bank 2
sd a4, 96(a1) ; sd a5,104(a1)
sd a6, 48(a1) ; sd a7, 56(a1)
daddiu a0,CACHE_SLINE_SIZE
bgtz a2,1b # keep going?
daddiu a1,CACHE_SLINE_SIZE # BDSLOT: next dst cache line
1: addi t0,-32 # done with one chunk
ld a4, 0(a0) ; ld a5, 8(a0)
ld a6,16(a0) ; ld a7,24(a0)
sd a4, 0(a1) ; sd a5, 8(a1)
sd a6,16(a1) ; sd a7,24(a1)
daddiu a0,32 # next src chunk
daddiu a1,32 # next dst chunk
CACHE_BARRIER_AT(-32,a0) # quench store speculation
bgtz t0,1b # keep going?
nop # BDSLOT
2: j ra
nop # BDSLOT
_pagecopy_backwards:
daddu a0,a2 # start with ending addresses
daddu a1,a2
li t0,2*CACHE_SLINE_SIZE # size of trailer
addi a2,-(2*CACHE_SLINE_SIZE)# do last 2 lines seperately
1: ld a4, -8(a0) ; ld a5, -40(a0) # line 1 + 2
addi a2,-CACHE_SLINE_SIZE;
ld a6, -88(a0) ; ld a7,-120(a0) # line 3 + 4
sd a4, -8(a1) ; sd a5, -40(a1) # bank 2
sd a6, -88(a1) ; sd a7,-120(a1) # bank 1
ld a4, -16(a0) ; ld a5, -48(a0)
ld a6, -24(a0) ; ld a7, -32(a0)
sd a4, -16(a1) ; sd a5, -48(a1) # bank 2
sd a6, -24(a1) ; sd a7, -32(a1) # bank 1
ld a4, -72(a0) ; ld a5, -80(a0)
ld a6, -96(a0) ; ld a7,-128(a0)
sd a4, -72(a1) ; sd a5, -80(a1) # bank 2
sd a6, -96(a1) ; sd a7,-128(a1) # bank 1
ld a4,-104(a0) ; ld a5,-112(a0)
ld a6, -56(a0) ; ld a7, -64(a0)
sd a4,-104(a1) ; sd a5,-112(a1) # bank 2
sd a6, -56(a1) ; sd a7, -64(a1) # bank 1
daddiu a0,-CACHE_SLINE_SIZE
bgtz a2,1b # keep going?
daddiu a1,-CACHE_SLINE_SIZE # BDSLOT: next dst cache line
1: ld a4, -8(a0) ; ld a5,-16(a0)
ld a6,-24(a0) ; ld a7,-32(a0)
addi t0,-32 # done with one chunk
sd a4, -8(a1) ; sd a5,-16(a1)
sd a6,-24(a1) ; sd a7,-32(a1)
daddiu a0,-32 # next src chunk
daddiu a1,-32 # next dst chunk
CACHE_BARRIER_AT(32-8,a0) # quench store speculation
bgtz t0,1b # keep going?
nop # BDSLOT
AUTO_CACHE_BARRIERS_ENABLE
j ra
nop # BDSLOT
.set reorder
END(_pagecopy)
/* Specialized pacecar pagezero routine that is unrolled a bit to help
* with the d$ speculation problems. It stops 3 lines early which allows
* us to zero at full speed, and only slow down at the end, and still
* avoid bogus cache dirtying past the buffer.
*
* _pagezero(void *dst, int len)
*
* The dst address must be page cache aligned, and the length must be
* of more than 4 cache lines long (1 then 3 trailers).
*
* This is derived from the teton function of the same name, but does
* not use prefetch as with non blocking caches, we don't have enough
* time to really hide the latency.
*/
LEAF(_pagezero)
.set noreorder
AUTO_CACHE_BARRIERS_DISABLE # loop ends 4 lines early.
CACHE_BARRIER # ensure operands are known
beqz a1,2f # make sure length is non-zero
li t0,3*CACHE_SLINE_SIZE # BDSLOT: size of secondary copy
addi a1,-(3*CACHE_SLINE_SIZE)# do last 4 lines seperately
1: sd zero, 0(a0) ; sd zero, 32(a0) # line 1 + 2
addi a1,-CACHE_SLINE_SIZE
sd zero, 80(a0) ; sd zero,112(a0) # line 3 + 4
sd zero, 8(a0) ; sd zero, 40(a0) # bank 1
sd zero, 16(a0) ; sd zero, 24(a0) # bank 2
sd zero, 64(a0) ; sd zero, 72(a0) # bank 1
sd zero, 88(a0) ; sd zero,120(a0) # bank 2
sd zero, 96(a0) ; sd zero,104(a0) # bank 1
sd zero, 48(a0) ; sd zero, 56(a0) # bank 2
bgtz a1,1b # loop more?
daddiu a0,CACHE_SLINE_SIZE # BDSLOT: bump address
1: sd zero, 0(a0) ; sd zero, 32(a0) # line 1 + 2
addi t0,-CACHE_SLINE_SIZE
sd zero, 80(a0) ; sd zero,112(a0) # line 3 + 4
sd zero, 8(a0) ; sd zero, 40(a0) # bank 1
sd zero, 16(a0) ; sd zero, 24(a0) # bank 2
sd zero, 64(a0) ; sd zero, 72(a0) # bank 1
sd zero, 88(a0) ; sd zero,120(a0) # bank 2
sd zero, 96(a0) ; sd zero,104(a0) # bank 1
sd zero, 48(a0) ; sd zero, 56(a0) # bank 2
CACHE_BARRIER_AT(0,a0) # prevent speculation
bgtz t0,1b # loop more?
daddiu a0,CACHE_SLINE_SIZE # BDSLOT: bump address
AUTO_CACHE_BARRIERS_ENABLE
2: j ra
nop
.set reorder
END(_pagezero)
/* void nowar_bcopy(from, to, count);
* unsigned char *from, *to;
* unsigned long count;
*
* Copied from usercopy.s, removed the #ifdefs, and turn off cache barriers.
*/
#define MINCOPY 12
#define from a0 /* registers used */
#define to a1
#define count a2
#define LWS lwl
#define LWB lwr
#define LDS ldl
#define LDB ldr
#define SWS swl
#define SWB swr
#define SDS sdl
#define SDB sdr
/* Use backwards copying code if the from and to regions overlap.
* Do not worry about zero-length or other silly copies. They are not
* worth the time to optimize.
*/
LEAF(nowar_bcopy)
AUTO_CACHE_BARRIERS_DISABLE
LI t0,1<<63 # bit 63 means a kernel addr
and t0,a1,t0 # kernel dst needs WAR
beqz t0,1f
j bcopy
1:
ORD_CACHE_BARRIER_AT(0,sp) # ensure above check is ok
PTR_ADDU v0,from,count # v0 := from + count
ble to,from,goforwards # If to <= from then copy forwards
blt to,v0,gobackwards # backwards if from<to<from+count
/* Forward copy code. Check for pointer alignment and try to get both
* pointers aligned on a long boundary.
*/
goforwards:
blt count,MINCOPY,fbcopy
/* If possible, align source & destination on 64-bit boundary.
*/
and v0,from,7
and v1,to,7
li a3,8
bne v0,v1,align32 # low bits are different
/* Pointers 64-bit alignable (may be aligned). Since v0 == v1, we need only
* check what value v0 has to see how to get aligned. Also, since we have
* eliminated tiny copies, we know that the count is large enough to
* encompass the alignment copies.
*/
beq v0,zero,1f # If v0==0 then aligned
subu a3,a3,v1 # a3 = # bytes to get aligned
LDS v0,0(from)
SDS v0,0(to) # copy partial word
PTR_ADDU from,a3
PTR_ADDU to,a3
subu count,a3
1:
/* When we get here, source and destination are 64-bit aligned. Check if
* we have at least 64 bytes to move.
*/
and a3,count,~(64-1)
beq a3,zero,forwards # go do 32-bit copy
PTR_ADDU a3,a3,to
64:
ld t0,0(from); ld t1,8(from)
ld t2,16(from); ld t3,24(from)
ld ta0,32(from); ld ta1,40(from); ld ta2,48(from); ld ta3,56(from)
sd t0,0(to); sd t1,8(to); sd t2,16(to); sd t3,24(to)
sd ta0,32(to); sd ta1,40(to); sd ta2,48(to); sd ta3,56(to)
PTR_ADDU from,64
PTR_ADDU to,64
bne a3,to,64b
and count,64-1 # still have to copy non-64 multiple bytes
b forwards # complete with 32-bit copy
align32:
and v0,from,3
and v1,to,3
li a3,4
bne v0,v1,fmcopy # low bits are different
/* Pointers are alignable and may be aligned. Since v0 == v1, we need only
* check what value v0 has to see how to get aligned. Also, since we have
* eliminated tiny copies, we know that the count is large enough to
* encompass the alignment copies.
*/
beq v0,zero,forwards # If v0==0 then aligned
subu a3,a3,v1 # a3 = # bytes to get aligned
LWS v0,0(from)
SWS v0,0(to) # copy partial word
PTR_ADDU from,a3
PTR_ADDU to,a3
subu count,a3
/* Once we are here, the pointers are aligned on 32-bit boundaries
*/
forwards:
and a3,count,~(32-1)
beq a3,zero,16f
PTR_ADDU a3,a3,to
32:
lw t0,0(from); lw t1,4(from); lw t2,8(from); lw t3,12(from)
lw ta0,16(from); lw ta1,20(from); lw ta2,24(from); lw ta3,28(from)
sw t0,0(to); sw t1,4(to); sw t2,8(to); sw t3,12(to)
sw ta0,16(to); sw ta1,20(to); sw ta2,24(to); sw ta3,28(to)
PTR_ADDU from,32
PTR_ADDU to,32
bne a3,to,32b
/* We know we have fewer than 32 bytes remaining, so we do no more
* adjustments of the count.
*/
16: and v0,count,16
beq v0,zero,8f
lw t0,0(from); lw t1,4(from); lw t2,8(from); lw t3,12(from)
sw t0,0(to); sw t1,4(to); sw t2,8(to); sw t3,12(to)
PTR_ADDU from,16
PTR_ADDU to,16
8: and v1,count,8
beq v1,zero,4f
lw t0,0(from)
lw t1,4(from)
sw t0,0(to)
sw t1,4(to)
PTR_ADDU from,8
PTR_ADDU to,8
4: and v0,count,4
beq v0,zero,3f
lw t0,0(from)
sw t0,0(to)
PTR_ADDU from,4
PTR_ADDU to,4
3: and v1,count,3
PTR_ADDU from,v1
beq v1,zero,ret
PTR_ADDU to,v1
LWB t0,-1(from)
SWB t0,-1(to)
j ra
fmcopy:
/* Missaligned, non-overlap copy of many bytes. This happens too often.
* Align the destination for machines with write-thru caches.
*
* This code is always for machines that prefer nops between stores.
*
* Here v1=low bits of destination, a3=4.
*/
beq v1,zero,fmcopy4 # If v1==0 then destination is aligned
subu a3,a3,v1 # a3 = # bytes to align destination
subu count,a3
PTR_ADDU a3,to
1: lb v0,0(from)
PTR_ADDU from,1
sb v0,0(to)
PTR_ADDU to,1
bne to,a3,1b
fmcopy4:
and a3,count,~(16-1)
beq a3,zero,8f
PTR_ADDU a3,a3,to
16: LWS t0,0(from); LWB t0,0+3(from)
LWS t1,4(from); LWB t1,4+3(from); sw t0,0(to)
LWS t2,8(from); LWB t2,8+3(from); sw t1,4(to)
LWS t3,12(from); LWB t3,12+3(from); sw t2,8(to)
sw t3,12(to)
PTR_ADDU from,16
PTR_ADDU to,16
bne a3,to,16b
8: and v1,count,8
beq v1,zero,4f
LWS t0,0(from); LWB t0,0+3(from)
LWS t1,4(from); LWB t1,4+3(from); sw t0,0(to)
sw t1,4(to)
PTR_ADDU from,8
PTR_ADDU to,8
4: and v0,count,4
and count,3
beq v0,zero,fbcopy
LWS t0,0(from); LWB t0,0+3(from); sw t0,0(to)
PTR_ADDU from,4
PTR_ADDU to,4
/* Byte at a time copy code. This is used when the byte count is small.
*/
fbcopy:
PTR_ADDU a3,from,count # a3 = end+1
beq count,zero,ret # If count is zero, then we are done
1: lb v0,0(from) # v0 = *from
PTR_ADDU from,1 # advance pointer
sb v0,0(to) # Store byte
PTR_ADDU to,1 # advance pointer
bne from,a3,1b # Loop until done
ret: j ra # return to caller
/*****************************************************************************/
/*
* Backward copy code. Check for pointer alignment and try to get both
* pointers aligned on a long boundary.
*/
gobackwards:
PTR_ADDU from,count # Advance to end + 1
PTR_ADDU to,count # Advance to end + 1
/* small byte counts use byte at a time copy */
blt count,MINCOPY,backwards_bytecopy
and v0,from,3 # v0 := from & 3
and v1,to,3 # v1 := to & 3
beq v0,v1,backalignable # low bits are identical
/*
* Byte at a time copy code. This is used when the pointers are not
* alignable, when the byte count is small, or when cleaning up any
* remaining bytes on a larger transfer.
*/
backwards_bytecopy:
beq count,zero,ret # If count is zero quit
PTR_SUBU from,1 # Reduce by one (point at byte)
PTR_SUBU to,1 # Reduce by one (point at byte)
PTR_SUBU v1,from,count # v1 := original from - 1
99: lb v0,0(from) # v0 = *from
PTR_SUBU from,1 # backup pointer
sb v0,0(to) # Store byte
PTR_SUBU to,1 # backup pointer
bne from,v1,99b # Loop until done
j ra # return to caller
/*
* Pointers are alignable, and may be aligned. Since v0 == v1, we need only
* check what value v0 has to see how to get aligned. Also, since we have
* eliminated tiny copies, we know that the count is large enough to
* encompass the alignment copies.
*/
backalignable:
beq v0,zero,backwards # If v0==v1 && v0==0 then aligned
beq v0,3,back_copy3 # Need to copy 3 bytes to get aligned
beq v0,2,back_copy2 # Need to copy 2 bytes to get aligned
/* need to copy 1 byte */
lb v0,-1(from) # get one byte
PTR_SUBU from,1 # backup pointer
sb v0,-1(to) # store one byte
PTR_SUBU to,1 # backup pointer
subu count,1 # and reduce count
b backwards # Now pointers are aligned
/* need to copy 2 bytes */
back_copy2:
lh v0,-2(from) # get one short
PTR_SUBU from,2 # backup pointer
sh v0,-2(to) # store one short
PTR_SUBU to,2 # backup pointer
subu count,2 # and reduce count
b backwards
/* need to copy 3 bytes */
back_copy3:
lb v0,-1(from) # get one byte
lh v1,-3(from) # and one short
PTR_SUBU from,3 # backup pointer
sb v0,-1(to) # store one byte
sh v1,-3(to) # and one short
PTR_SUBU to,3 # backup pointer
subu count,3 # and reduce count
/* FALLTHROUGH */
/*
* Once we are here, the pointers are aligned on long boundaries.
* Begin copying in large chunks.
*/
backwards:
/* 32 byte at a time loop */
backwards_32:
blt count,32,backwards_16 # do 16 bytes at a time
lw v0,-4(from)
lw v1,-8(from)
lw t0,-12(from)
lw t1,-16(from)
lw t2,-20(from)
lw t3,-24(from)
lw ta0,-28(from)
lw ta1,-32(from) # Fetch 8*4 bytes
PTR_SUBU from,32 # backup from pointer now
sw v0,-4(to)
sw v1,-8(to)
sw t0,-12(to)
sw t1,-16(to)
sw t2,-20(to)
sw t3,-24(to)
sw ta0,-28(to)
sw ta1,-32(to) # Store 8*4 bytes
PTR_SUBU to,32 # backup to pointer now
subu count,32 # Reduce count
b backwards_32 # Try some more
/* 16 byte at a time loop */
backwards_16:
blt count,16,backwards_4 # Do rest in words
lw v0,-4(from)
lw v1,-8(from)
lw t0,-12(from)
lw t1,-16(from)
PTR_SUBU from,16 # backup from pointer now
sw v0,-4(to)
sw v1,-8(to)
sw t0,-12(to)
sw t1,-16(to)
PTR_SUBU to,16 # backup to pointer now
subu count,16 # Reduce count
b backwards_16 # Try some more
/* 4 byte at a time loop */
backwards_4:
blt count,4,backwards_bytecopy # Do rest
lw v0,-4(from)
PTR_SUBU from,4 # backup from pointer
sw v0,-4(to)
PTR_SUBU to,4 # backup to pointer
subu count,4 # Reduce count
b backwards_4
AUTO_CACHE_BARRIERS_ENABLE
END(nowar_bcopy)
#undef from
#undef to
#undef count