/*
 * IP26 specific assembly routines; cpuid always 0, also make semaphore
 * macros a no-op.
 */
#ident	"$Revision: 1.46 $"

#include "ml/ml.h"

/* Dummy routines whose return value is unimportant (or that have no
 * return value).  Some return reasonable values on other machines, but
 * here they should never be called, or their return values never used.
 */
LEAF(dummy_func)
XLEAF(delay_calibrate)
XLEAF(check_delay_tlbflush)
XLEAF(check_delay_iflush)
XLEAF(da_flush_tlb)
XLEAF(dma_mapinit)
XLEAF(apsfail)
XLEAF(disallowboot)
XLEAF(vme_init)
XLEAF(vme_ivec_init)
XLEAF(debug_stop_all_cpus)
	j	ra
	END(dummy_func)

/* Data cache operation aliases.
 */
LEAF(dcache_wb)
XLEAF(dcache_wbinval)
XLEAF(dki_dcache_wb)
XLEAF(dki_dcache_wbinval)
	LI	a2,CACH_DCACHE|CACH_INVAL|CACH_WBACK|CACH_IO_COHERENCY
	j	cache_operation
	END(dcache_wb)

LEAF(dki_dcache_inval)
	LI	a2,CACH_DCACHE|CACH_INVAL|CACH_IO_COHERENCY
	j	cache_operation
	END(dki_dcache_inval)

/* dummy routines that return 0 */
LEAF(dummyret0_func)

XLEAF(vme_adapter)
XLEAF(is_vme_space)
XLEAF(getcpuid)
#ifdef DEBUG
XLEAF(getcyclecounter)
#endif	/* DEBUG */

/* Semaphore call stubs */
XLEAF(appsema)
XLEAF(apvsema)
XLEAF(apcvsema)
XLEAF(cache_preempt_limit)
	move	v0,zero
	j	ra
	END(dummyret0_func)

/* dummy routines that return 1 */
LEAF(dummyret1_func)
XLEAF(apcpsema)			/* can always get the semaphore on non-MP machines */
	li	v0,1
	j	ra
	END(dummyret1_func)

/* writemcreg(caddr_t reg, int val)
 *
 * Basically this does *(volatile uint *)(PHYS_TO_K1(reg)) = val;
 *	a0 - physical register address
 *	a1 - value to write
 *
 * This was a workaround for a bug in the first rev MC chip, but IP26
 * has only rev C (or greater) MCs, so just do the actual operation.
 */
LEAF(writemcreg)
	or	a0,K1BASE		# a0 = PHYS_TO_K1(a0)
	sw	a1,(a0)
	j	ra
	END(writemcreg)

/* Write the VDMA MEMADR, MODE, SIZE, STRIDE registers
 *
 *	write4vdma(buf, mode, size, stride);
 */
LEAF(write4vdma)
	/* XXX Only works because bit 15 not set in DMA_MEMADR */
	LI	v0,(K1BASE | DMA_MEMADR) & (~0xffff)
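	/* MIPS load/store offsets are signed 16-bit immediates, so the
	 * low 16 bits of each DMA_* register address must have bit 15
	 * clear; otherwise the "& 0xffff" offsets below would sign
	 * extend and subtract 0x10000 from the base in v0.
	 */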

	sw	a0,DMA_MEMADR & 0xffff(v0)
	sw	a1,DMA_MODE & 0xffff(v0)
	sw	a2,DMA_SIZE & 0xffff(v0)
	sw	a3,DMA_STRIDE & 0xffff(v0)

	j	ra
	END(write4vdma)

/* Index based writeback or writeback-and-invalidate routines to flush the
 * GCache (and the IU's DCache) using the built-in TCC cache ops.  Address
 * must be physical, K0, or K1 (cannot handle a mapped address since
 * cacheops take a physical address).
 *
 * We must operate on all 4 sets to make sure we get the address in question.
 * The DCache (unlike the ICache) is a proper subset of the GCache, which
 * ensures proper flushing.
 *
 * NOTE: This code assumes a 2MB, 4 way set associative cache.  TCC can
 *	 support a 4MB (max) cache in theory, but in practice the extra
 *	 index bit is unused.
 *
 * We cannot support invalidate-only index based ops with a set
 * associative cache, as we would invalidate other processes' data.
 *
 *	void __cache_wb_inval(void *, int length);
 *	void __cache_wb(void *, int length);
 */
LEAF(__cache_wb)
	dli	t3,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INDEX_OP
	j	1f
XLEAF(__cache_wb_inval)
	dli	t3,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INVALIDATE|TCC_INDEX_OP

	/* Invalidate prefetch buffers on invalidation cache ops */
	dli	t2,K1BASE|TCC_PREFETCH		# prefetch control register
	ld	t1,0(t2)
	ori	t1,PRE_INV			# set invalidate bit
	sd	t1,0(t2)			# do invalidation op
	/*FALLSTHROUGH*/

1:
#if DEBUG	/* ensure we do not have a K2 address */
	LI	t1,K2BASE
	and	t2,t1,a0
	bne	t1,t2,1f			# branch if not K2
	move	a1,a0				# address
	LA	a0,msg				# string
	jal	panic
	.data
msg:	.asciiz	"IP26 low-level cache flushing got K2 address: 0x%x"
	.text
1:
#endif
	# Calculate the number of indices to invalidate (a1).
	#
	addi	a1,TCC_LINESIZE-1		# round up to next cache line
	srl	a1,TCC_CACHE_INDEX_SHIFT	# drop insignificant bits

	# Mask to pull index out of address.
	#
	li	t1,TCC_CACHE_INDEX

	# Make sure number of indices is not bigger than the maximum index.
	#
	li	t0,TCC_CACHE_INDEX>>TCC_CACHE_INDEX_SHIFT
	blt	a1,t0,1f
	move	a1,t0

1:	and	t0,a0,t1			# isolate current index
	or	t0,t0,t3			# make cache op addr
	addi	a1,-1				# decrement line count

	lw	zero,0(t0)			# set 0
	lw	zero,1<<TCC_CACHE_SET_SHIFT(t0)	# set 1
	lw	zero,2<<TCC_CACHE_SET_SHIFT(t0)	# set 2
	lw	zero,3<<TCC_CACHE_SET_SHIFT(t0)	# set 3

	addi	a0,TCC_LINESIZE			# next index (use word add)

	bgez	a1,1b				# loop if more indexes

	j	ra
	END(__cache_wb)

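/* In rough C terms the index loop above is (a hypothetical sketch built
 * on the same TCC_* constants):
 *
 *	for (; nlines >= 0; nlines--, addr += TCC_LINESIZE) {
 *		long idx = addr & TCC_CACHE_INDEX;
 *		for (set = 0; set < 4; set++)
 *			(void)*(volatile int *)
 *			    (op | idx | (long)set << TCC_CACHE_SET_SHIFT);
 *	}
 */
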
/* Hit based invalidate, write back, or write back and invalidate the
 * GCache (and the IU's DCache) using the built-in TCC cache ops.  Address
 * must be physical, K0, or K1 (cannot handle a mapped address since
 * cacheops take a physical address).
 *
 *	void __dcache_inval(void *address, int length)
 *	void __dcache_wb(void *address, int length)
 *	void __dcache_wb_inval(void *address, int length)
 */
LEAF(__dcache_inval)
	dli	t2,K1BASE|TCC_CACHE_OP|TCC_INVALIDATE
	andi	t3,a0,(TCC_LINESIZE-1)	# non-aligned address
	andi	a5,a1,(TCC_LINESIZE-1)	# non-blocked length
	or	a6,t3,a5
	beqz	a6,2f			# do full block invalidation

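	/* A partial first or last line cannot simply be invalidated,
	 * since other data may share that cache line; such lines are
	 * written back and invalidated individually below, before the
	 * aligned middle of the range is processed.
	 */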
	li	a7,TCC_PHYSADDR		# significant address bits
	dli	a6,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INVALIDATE

	beqz	t3,1f			# WB first block?
	and	t1,a0,a7		# get phys address
	or	t1,a6			# construct address
	lw	zero,0(t1)		# wb inval line
	li	t1,TCC_LINESIZE
	sub	t3,t1,t3		# size of flushed block
	subu	a1,t3			# update length
	addu	a0,t3			# update starting point
	andi	a5,a1,(TCC_LINESIZE-1)	# new non-blocked length

1:	beqz	a5,3f			# WB last block?
	addu	t1,a0,a1		# ending address
	subu	a1,a5			# update length
	and	t1,a7			# get phys address
	or	t1,a6			# construct address
	lw	zero,0(t1)		# wb inval line

3:	bnez	a1,2f			# continue if length != 0

	b	7f			# go inval prefetch + return

XLEAF(__dcache_wb)
	dli	t2,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB
	j	2f

XLEAF(__dcache_wb_inval)
	dli	t2,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INVALIDATE

2:	andi	t0,a0,TCC_LINESIZE-1	# get rounding amount
	add	a1,a1,t0		# add rounding amount to length
	li	t0,TCC_PHYSADDR		# significant address bits
	and	a0,a0,t0		# mask off un-needed bits
	or	a0,a0,t2		# calc cache-op addr

	.align	4			# loop is one quad
	.set	noreorder
1:	lw	zero,0(a0)		# cache op on line 'a0'
	addi	a1,a1,-TCC_LINESIZE	# decr length
	bgtz	a1,1b			# on to next cache line
	daddiu	a0,a0,TCC_LINESIZE	# BDSLOT: incr cache op addr
	.set	reorder

	/* Invalidate prefetch buffers on invalidation cache ops */
7:	dli	t3,K1BASE|TCC_PREFETCH	# prefetch control register
	ld	t1,0(t3)		# get current state
	ori	t1,PRE_INV		# set invalidate bit
	sd	t1,0(t3)		# do invalidation op

	j	ra
	END(__dcache_inval)

/* Write back/invalidate one line from the cache.  This can be used by
 * drivers (enet uses it now) for a lower overhead cache flush when working
 * around problems with the IP26 ECC baseboard.  Assumes this cache line
 * will not have been prefetched (and hence the prefetch buffers need not
 * be invalidated).
 */
LEAF(__dcache_line_wb_inval)
	dli	t2,K1BASE|TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INVALIDATE

	li	t0,TCC_PHYSADDR		# significant address bits
	and	a0,a0,t0		# mask off un-needed bits
	or	a0,a0,t2		# calc cache-op addr
	lw	zero,0(a0)		# cache op on line 'a0'
	j	ra
	END(__dcache_line_wb_inval)

/* Various routines to get and set TFP C0 registers.
 */
LEAF(get_config)
	.set	noreorder
	dmfc0	v0,C0_CONFIG
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(get_config)

LEAF(set_config)
	.set	noreorder
	dmtc0	a0,C0_CONFIG
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(set_config)

LEAF(_get_timestamp)
XLEAF(get_r4k_counter)			/* for compat */
	.set	noreorder
	dmfc0	v0,C0_COUNT
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(_get_timestamp)

LEAF(get_trapbase)
	.set	noreorder
	dmfc0	v0,C0_TRAPBASE
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(get_trapbase)

LEAF(set_trapbase)
	.set	noreorder
	dmtc0	a0,C0_TRAPBASE
	j	ra
	nada
	.set	reorder
	END(set_trapbase)

/* Clear the TFP count interrupt by clearing CAUSE_IP11 (don't zero
 * count, as this is now fatal since we do not enable the counter
 * interrupt).
 */
LEAF(_tfpcount_intr)
	.set	noreorder
	dmfc0	t0,C0_CAUSE
	ssnop
	dli	t1,~CAUSE_IP11
	and	t0,t1
	dmtc0	t0,C0_CAUSE
	ssnop
	ssnop
	j	ra
	nada
	.set	reorder
	END(_tfpcount_intr)

/* Count the number of TCC FIFO cycles against X SysAD cycles; the
 * larger the number of SysAD cycles, the smaller the percentage error
 * the caller gets.
 */
LEAF(CountTccVsSysAdCycles)
	.set	noreorder
	LI	t0,PHYS_TO_K1(TCC_COUNT)	# TCC counter == SysAD freq
	sd	zero,0(t0)
	move	t2,zero			# initialize counter
	dmtc0	zero,C0_COUNT		# zero TFP counter
	ssnop				# C0 hazard
	ssnop				# C0 hazard #2
1:	dmfc0	v0,C0_COUNT		# get current count before ld
	ssnop				# C0 hazard
	ld	t2,0(t0)		# get SysAD count
	blt	t2,a0,1b		# enough cycles?
	nada				# BDSLOT
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(CountTccVsSysAdCycles)

/* Specialized Teton pagecopy routine that is software pipelined (SWP'ed)
 * to take advantage of the hardware support for prefetching, to hide
 * memory latency.
 *
 *	_pagecopy(void *src, void *dst, int len)
 *
 * Assumes src and dst are both cache line aligned and len is a multiple
 * of (n*2*TCC_LINESIZE)+2*TCC_LINESIZE, i.e. an even number of cache
 * lines greater than or equal to 4.
 *
 * The code does not copy the last two lines, to avoid prefetching past the
 * end of the page, which can cause an MC bus error when done on the last
 * page of memory or the last page before the hole.
 *
 * Be very careful, as the code here is carefully quad aligned.  Losing
 * proper alignment will reduce performance!
 *
 * Caller saves the dwong patch state.
 */
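/* In rough C terms the forward loop below does (a hypothetical sketch;
 * prefetch() stands in for the PREF macro, copy_line for the 16
 * ldc1/sdc1 pairs):
 *
 *	while (len > 0) {
 *		prefetch(src + 2*TCC_LINESIZE);
 *		prefetch(dst + 2*TCC_LINESIZE);
 *		copy_line(dst, src);
 *		src += TCC_LINESIZE; dst += TCC_LINESIZE;
 *		len -= TCC_LINESIZE;
 *	}
 *	... then the final 2*TCC_LINESIZE bytes are copied without
 *	prefetching.
 */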
#define	PREF(offset,reg)	pref 0,offset(reg)
LEAF(_pagecopy)
	.align	4
	.set	noreorder
	beqz	a2,2f			# skip zero length copies
	sltu	t1,a0,a1		# BDSLOT: if src < dst
	bnez	t1,_pagecopy_backwards	# then do backwards copy
	nada
	PREF(TCC_LINESIZE,a0)		# request second src line
	nada
	nada
	addi	a2,-(2*TCC_LINESIZE)	# do last 2 lines separately
	PREF(TCC_LINESIZE,a1)		# second dst line
	li	t0,2*TCC_LINESIZE	# size of trailer
	nada
	nada
1:
	ldc1	$f4,0(a0)	; sdc1	$f4,0(a1)
	PREF(2*TCC_LINESIZE,a0)	; ldc1	$f4,8(a0)
	PREF(2*TCC_LINESIZE,a1)	; sdc1	$f4,8(a1)
	ldc1	$f4,16(a0)	; sdc1	$f4,16(a1)
	ldc1	$f4,24(a0)	; sdc1	$f4,24(a1)
	ldc1	$f4,32(a0)	; sdc1	$f4,32(a1)
	ldc1	$f4,40(a0)	; sdc1	$f4,40(a1)
	ldc1	$f4,48(a0)	; sdc1	$f4,48(a1)
	ldc1	$f4,56(a0)	; sdc1	$f4,56(a1)
	ldc1	$f4,64(a0)	; sdc1	$f4,64(a1)
	ldc1	$f4,72(a0)	; sdc1	$f4,72(a1)
	ldc1	$f4,80(a0)	; sdc1	$f4,80(a1)
	ldc1	$f4,88(a0)	; sdc1	$f4,88(a1)
	ldc1	$f4,96(a0)	; sdc1	$f4,96(a1)
	ldc1	$f4,104(a0)	; sdc1	$f4,104(a1)
	ldc1	$f4,112(a0)	; sdc1	$f4,112(a1)
	ldc1	$f4,120(a0)	; sdc1	$f4,120(a1)
	addi	a2,-TCC_LINESIZE; daddiu a0,TCC_LINESIZE

	daddiu	a1,TCC_LINESIZE		# BDSLOT: next dst cache line
	bgtz	a2,1b			# keep going?
	nada				# BDSLOT keep alignment
	nada				# keep alignment

1:	ldc1	$f4,0(a0)	; sdc1	$f4,0(a1)
	ldc1	$f4,8(a0)	; sdc1	$f4,8(a1)
	ldc1	$f4,16(a0)	; sdc1	$f4,16(a1)
	ldc1	$f4,24(a0)	; sdc1	$f4,24(a1)

	addi	t0,-32			# done with one chunk
	daddiu	a0,32			# next src chunk
	bgtz	t0,1b			# keep going?
	daddiu	a1,32			# BDSLOT: next dst chunk

2:	j	ra
	nada
	nada				# keep alignment
	nada

_pagecopy_backwards:
	daddu	a0,a2			# start with ending addresses
	daddu	a1,a2
	PREF(-2*TCC_LINESIZE,a0)	# request second src line
	nada
	nada
	addi	a2,-(2*TCC_LINESIZE)	# do last 2 lines separately
	PREF(-2*TCC_LINESIZE,a1)	# second dst line
	li	t0,2*TCC_LINESIZE	# size of trailer
1:
	ldc1	$f4,-8(a0)	; sdc1	$f4,-8(a1)
	PREF(-3*TCC_LINESIZE,a0); ldc1	$f4,-16(a0)
	PREF(-3*TCC_LINESIZE,a1); sdc1	$f4,-16(a1)
	ldc1	$f4,-24(a0)	; sdc1	$f4,-24(a1)
	ldc1	$f4,-32(a0)	; sdc1	$f4,-32(a1)
	ldc1	$f4,-40(a0)	; sdc1	$f4,-40(a1)
	ldc1	$f4,-48(a0)	; sdc1	$f4,-48(a1)
	ldc1	$f4,-56(a0)	; sdc1	$f4,-56(a1)
	ldc1	$f4,-64(a0)	; sdc1	$f4,-64(a1)
	ldc1	$f4,-72(a0)	; sdc1	$f4,-72(a1)
	ldc1	$f4,-80(a0)	; sdc1	$f4,-80(a1)
	ldc1	$f4,-88(a0)	; sdc1	$f4,-88(a1)
	ldc1	$f4,-96(a0)	; sdc1	$f4,-96(a1)
	ldc1	$f4,-104(a0)	; sdc1	$f4,-104(a1)
	ldc1	$f4,-112(a0)	; sdc1	$f4,-112(a1)
	ldc1	$f4,-120(a0)	; sdc1	$f4,-120(a1)
	ldc1	$f4,-128(a0)	; sdc1	$f4,-128(a1)
	addi	a2,-TCC_LINESIZE; daddiu a0,-TCC_LINESIZE

	daddiu	a1,-TCC_LINESIZE	# BDSLOT: next dst cache line
	bgtz	a2,1b			# keep going?
	nada				# BDSLOT keep alignment
	nada				# keep alignment

1:	ldc1	$f4,-8(a0)	; sdc1	$f4,-8(a1)
	ldc1	$f4,-16(a0)	; sdc1	$f4,-16(a1)
	ldc1	$f4,-24(a0)	; sdc1	$f4,-24(a1)
	ldc1	$f4,-32(a0)	; sdc1	$f4,-32(a1)

	addi	t0,-32			# done with one chunk
	daddiu	a0,-32			# next src chunk
	bgtz	t0,1b			# keep going?
	daddiu	a1,-32			# BDSLOT: next dst chunk

	j	ra
	nada
	.set	reorder
	END(_pagecopy)

/* Specialized Teton pagezero routine that is software pipelined (SWP'ed)
 * to take advantage of the hardware support for prefetching, to hide
 * memory latency.
 *
 *	_pagezero(void *dst, int len)
 *
 * The dst address must be page cache aligned, and the length must be
 * (n*4*TCC_LINESIZE)+4*TCC_LINESIZE.  Since this is a bit odd, it's
 * probably best to assume len is some multiple of 8*TCC_LINESIZE (2KB).
 *
 * The loop prefetches four lines ahead.  The last four cache lines are
 * done separately to avoid fetching past the end of the page (and they
 * have already been prefetched).
 *
 * The main loop takes 9 cycles once the cache line has been fetched.  This
 * is one additional cycle per cache line for lines that are in the GCache,
 * but overall we should save more cycles when prefetching helps.
 *
 * Caller saves the dwong patch state.
 */
LEAF(_pagezero)
	.set	noreorder
	beqz	a1,2f			# make sure length is non-zero
	addi	a1,-(4*TCC_LINESIZE)	# do last 4 lines separately
	PREF(0,a0)			# first line
	PREF(TCC_LINESIZE,a0)		# second line
	dmtc1	zero,$f4		# get those zero bits ready
	PREF(2*TCC_LINESIZE,a0)		# third line
	li	t0,4*TCC_LINESIZE	# size of secondary copy
	PREF(3*TCC_LINESIZE,a0)		# fourth line

1:	PREF(4*TCC_LINESIZE,a0)	; nada			# [0]
	nada			; addi zero,zero,0	# [0]
	sdc1	$f4,0(a0)	; sdc1	$f4,8(a0)	# [1]
	sdc1	$f4,16(a0)	; sdc1	$f4,24(a0)	# [2]
	sdc1	$f4,32(a0)	; sdc1	$f4,40(a0)	# [3]
	sdc1	$f4,48(a0)	; sdc1	$f4,56(a0)	# [4]
	sdc1	$f4,64(a0)	; sdc1	$f4,72(a0)	# [5]
	sdc1	$f4,80(a0)	; sdc1	$f4,88(a0)	# [6]
	sdc1	$f4,96(a0)	; sdc1	$f4,104(a0)	# [7]
	addi	zero,zero,0	; addi	a1,-TCC_LINESIZE # [7]
	sdc1	$f4,112(a0)	; sdc1	$f4,120(a0)	# [8]
	bgtz	a1,1b			# loop more? [8]
	daddiu	a0,TCC_LINESIZE		# BDSLOT: bump address [8]

1:	sdc1	$f4,0(a0)	; sdc1	$f4,8(a0)	# [0]
	sdc1	$f4,16(a0)	; sdc1	$f4,24(a0)	# [1]
	sdc1	$f4,32(a0)	; sdc1	$f4,40(a0)	# [2]
	addi	t0,-TCC_LINESIZE; addi	zero,zero,0	# [2]
	sdc1	$f4,48(a0)	; sdc1	$f4,56(a0)	# [3]
	sdc1	$f4,64(a0)	; sdc1	$f4,72(a0)	# [4]
	sdc1	$f4,80(a0)	; sdc1	$f4,88(a0)	# [5]
	sdc1	$f4,96(a0)	; sdc1	$f4,104(a0)	# [6]
	sdc1	$f4,112(a0)	; sdc1	$f4,120(a0)	# [7]
	bgtz	t0,1b			# loop more? [7]
	daddiu	a0,TCC_LINESIZE		# BDSLOT: next address [7]

2:	j	ra
	nada
	.set	reorder
	END(_pagezero)

/* Save stuff in the C0_WORK1 register.
 */
LEAF(set_work1)
	.set	noreorder
	DMTC0(a0,C0_WORK1)
	.set	reorder
	j	ra
	END(set_work1)

/* Calculate the speed ratio between the TFP side and the SysAD side of
 * Teton, which we use to set cache refill characteristics.  We handle
 * ratios (TFP/SysAD) from 0/1 to 2/1.  The value to use for fast
 * mode is returned to the caller for later use.
 */
LEAF(tune_tcc_gcache)
	.set	noreorder
	LI	t0,PHYS_TO_K1(TCC_COUNT)	# TCC counter == SysAD freq
	sd	zero,0(t0)
	move	t2,zero			# initialize counter
	dmtc0	zero,C0_COUNT		# zero TFP counter
	ssnop				# C0 hazard
	li	t3,2000			# count 2000 SysAD cycles
1:	dmfc0	a0,C0_COUNT		# get current count before ld
	ld	t2,0(t0)		# get SysAD count
	blt	t2,t3,1b		# enough cycles?
	nada
	.set	reorder

	srl	a0,1			# divide down to 1000's
	sub	a0,1000			# get ratios >= 1.0
	blt	a0,zero,1f		# use default if ratio < 1.0
	li	t0,10			# divide down to 100's
	div	a0,t0
	mflo	a0
	andi	a0,0xfe			# even indices only
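	/* At this point a0 holds (ratio - 1.0) in hundredths, forced even
	 * so it can be used directly as a byte offset into the halfword
	 * table tcc_fiforatios below (one entry per 0.02 of ratio).
	 */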

	li	t0,100			# check range of index
	ble	a0,t0,1f
	move	a0,zero			# use default if > 2.0

1:	LA	t0,tcc_fiforatios
	daddu	t0,a0			# index short array
	lh	a0,0(t0)
	sll	a0,GCACHE_RR_FULL_SHIFT	# shift into position

	LI	t0,PHYS_TO_K1(TCC_GCACHE)
	ld	v0,0(t0)
	dli	t2,~(GCACHE_RR_FULL|GCACHE_WB_RESTART|GCACHE_REL_DELAY)
	and	v0,t2
	or	v0,a0			# set our new bits

	j	ra			# return new setting

	END(tune_tcc_gcache)

#define	MEMACC_XOR		(CPU_MEMACC_SLOW&0x3fff)
#define	CPU_MEMACC_OFFSET	(CPU_MEMACC-CPUCTRL0)
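
/* The "old state" returned by the two routines below is
 * (CPU_MEMACC & 0x3fff) ^ MEMACC_XOR: 0 when memory was already in slow
 * mode, non-zero when it was in normal (fast) mode.  This matches the
 * 1 == fast, 0 == slow convention of ip26_return_ucmem().
 */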

/* Enable uncacheable writes via slow memory, returning the old state.
 *
 * Critical section on one cache line to prevent writebacks during
 * the mode switch.
 *
 * The read from K1BASE below is to force a read from main memory before
 * entering slow mode.  This will check for any bad (uncached) writes
 * done previously, which have gone undetected until here.  (The bus error
 * can't be raised for an uncached write until the next read cycle.)
 *
 * GCache indices used:
 *	TCC, MC, PAL PIO	0
 *	critical cache line	odd
 *	write flush cache line	2
 */
LEAF(ip26_enable_ucmem)
	LI	a0,K1BASE		# K1
	LI	a6,K0BASE		# K0
	lw	zero,0(a0)		# Force mem read to flush errs.
	li	a2,CPU_MEMACC_SLOW	# slow memory setting
	or	a1,a0,TCC_BASE		# PHYS_TO_K1(TCC_BASE)
	or	a5,a0,HPC3_SYS_ID	# PHYS_TO_K1(HPC3_SYS_ID)
	or	a4,a0,ECC_CTRL_REG	# ECC PAL ctrl reg.
	or	a7,a0,TCC_CACHE_OP|TCC_DIRTY_WB|TCC_INVALIDATE
	or	a0,a0,CPUCTRL0		# PHYS_TO_K1(CPUCTRL0)
	ld	a3,tcc_gcache_slow	# the slow value to write
	li	t3,PRE_INV		# prefetch off + invalidated

	beqz	a3,2f			# not initialized yet

	lw	t1,0(a5)		# get system id
	andi	t1,BOARD_REV_MASK	# isolate board rev
	sltiu	a5,t1,IP26_ECCSYSID	# 1=IP22, 0=IP26

	.set	noreorder
	dmfc0	t0,C0_SR		# disable interrupts
	ssnop
	ssnop
	ori	t1,t0,SR_IE
	xori	t1,SR_IE
	.align	8
	/* The critical section fits on one cache line.  However, we want
	 * the GCache index of this cache line to be odd so it is not
	 * affected by the PIOs or the flush cache line.
	 */
	b	1f; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada

1:	dmtc0	t1,C0_SR		# critical begin
	ssnop; ssnop; ssnop; ssnop	# drain the pipe

	sd	t3,TCC_PREFETCH-TCC_BASE(a1)	# invalidate prefetch buffers
	lw	t1,CPU_MEMACC_OFFSET(a0)	# get MC memory config
	andi	v0,t1,0x3fff		# important bits
	xori	v0,MEMACC_XOR		# 0=slow, !0=normal

	sd	a3,TCC_GCACHE-TCC_BASE(a1)	# go to slow mode on TCC
	lw	zero,0(a0)		# flushbus
	sd	zero,2*TCC_LINESIZE(a6)	# bring cache line in
	sw	a2,CPU_MEMACC_OFFSET(a0)	# go to slow mode on MC
	lw	zero,0(a0)		# flushbus
	bnez	a5,1f			# skip ECC on IP22
	nada				# BDSLOT
	li	a2,ECC_CTRL_DISABLE	# disable ECC chk (uc writes ok)
	sd	a2,0(a4)		# Enter slow mode.
	ssnop				# break super scalar dispatch
	lw	zero,0(a0)		# flushbus
	lw	zero,2*TCC_LINESIZE(a7)	# flush cache line
					# end of critical

1:	dmtc0	t0,C0_SR		# restore C0_SR
	ssnop
	ssnop
	.set	reorder

2:	j	ra
	END(ip26_enable_ucmem)

/* Disable uncacheable writes, returning to faster memory and returning
 * the old state.
 *
 * Critical section on one cache line to prevent writebacks during
 * the mode switch.
 */
LEAF(ip26_disable_ucmem)
	LI	a0,K1BASE		# K1
	li	a2,CPU_MEMACC_NORMAL	# normal memory setting
	or	a1,a0,TCC_BASE		# PHYS_TO_K1(TCC_BASE)
	or	a5,a0,HPC3_SYS_ID	# PHYS_TO_K1(HPC3_SYS_ID)
	or	a4,a0,ECC_CTRL_REG	# ECC PAL ctrl reg.
	or	a0,a0,CPUCTRL0		# PHYS_TO_K1(CPUCTRL0)
	ld	a3,tcc_gcache_normal	# the normal value to write
	li	t3,PRE_DEFAULT|PRE_INV	# re-enable prefetch + inval

	beqz	a3,2f			# not initialized yet

	lw	t1,0(a5)		# get system id
	andi	t1,BOARD_REV_MASK	# isolate board rev
	sltiu	a5,t1,IP26_ECCSYSID	# 1=IP22, 0=IP26

	.set	noreorder
	dmfc0	t0,C0_SR		# disable interrupts
	ssnop
	ssnop
	ori	t1,t0,SR_IE
	xori	t1,SR_IE

	.align	8
	/* The critical section fits on one cache line.  However, we want
	 * the GCache index of this cache line to be odd so it is not
	 * affected by the PIOs or the flush cache line.
	 */
	b	1f; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada
	nada; nada; nada; nada; nada; nada; nada; nada

1:	dmtc0	t1,C0_SR		# critical begin
	ssnop; ssnop; ssnop; ssnop	# drain the pipe

	sd	t3,TCC_PREFETCH-TCC_BASE(a1)	# invalidate prefetch buffers
	lw	t1,CPU_MEMACC_OFFSET(a0)	# get MC memory config
	andi	v0,t1,0x3fff		# important bits
	xori	v0,MEMACC_XOR		# 0=slow, !0=normal

	bnez	a5,1f			# skip ECC on IP22
	nada				# BDSLOT
	sd	zero,0(a4)		# ECC_CTRL_ENABLE==0 (Fast)
	lw	zero,0(a0)		# flushbus
1:	sw	a2,CPU_MEMACC_OFFSET(a0)	# go to normal mode on MC
	lw	zero,0(a0)		# flushbus
	sd	a3,TCC_GCACHE-TCC_BASE(a1)	# go to normal mode on TCC
	lw	zero,0(a0)		# flushbus - end of critical

	dmtc0	t0,C0_SR		# restore C0_SR
	ssnop
	.set	reorder

2:	j	ra
	END(ip26_disable_ucmem)

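/* A typical caller pairs these with ip26_return_ucmem() below
 * (hypothetical usage sketch):
 *
 *	int ostate = ip26_enable_ucmem();
 *	...perform the uncached writes...
 *	ip26_return_ucmem(ostate);
 */
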
/* Return to the previous memory system state.  1 == fast, 0 == slow.
 */
LEAF(ip26_return_ucmem)
	beqz	a0,1f
	b	ip26_disable_ucmem	# going to normal mode
1:	b	ip26_enable_ucmem	# going to slow mode
	END(ip26_return_ucmem)

	.data
/* R = proc_freq/sysad_freq (usually 75.0/50.0)
 *
 * Parity memory timing (DDx):
 *	Read Response Level	= floor[(47R - 28)/3R]
 *	Release Delay		= max{floor[17 - 27.5R], floor[10 - 3R]}
 *	Writeback Restart Level	= ceiling[15 - 19/R]
 *
 * ECC memory timing (DDxx):
 *	Read Response Level	= floor[(31.5R - 15)/2R]
 *	Release Delay		= max{floor[17 - 34.5R], floor[10 - 3R]}
 *	Writeback Restart Level	= ceiling[15 - 19/R]
 */
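/* For example, parity memory at R = 1.5 gives:
 *	Read Response Level	= floor[(47*1.5 - 28)/(3*1.5)]
 *				= floor[42.5/4.5] = 9
 *	Writeback Restart Level	= ceiling[15 - 19/1.5] = ceiling[2.33] = 3
 */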
tcc_fiforatios:
#if _MC_MEMORY_PARITY
	.half	0x0706 0x0606 0x0606 0x0606 0x0607	/* 1.0 - 1.08 */
	.half	0x0607 0x0607 0x0607 0x0607 0x0607	/* 1.1 - 1.18 */
	.half	0x0607 0x0608 0x0608 0x0608 0x0618	/* 1.2 - 1.28 */
	.half	0x0618 0x0618 0x0518 0x0528 0x0528	/* 1.3 - 1.38 */
	.half	0x0528 0x0529 0x0529 0x0529 0x0539	/* 1.4 - 1.48 */
	.half	0x0539 0x0539 0x0539 0x0539 0x0539	/* 1.5 - 1.58 */
	.half	0x0549 0x0549 0x0549 0x054a 0x044a	/* 1.6 - 1.68 */
	.half	0x044a 0x044a 0x045a 0x045a 0x045a	/* 1.7 - 1.78 */
	.half	0x045a 0x045a 0x045a 0x045a 0x045a	/* 1.8 - 1.88 */
	.half	0x045a 0x046a 0x046a 0x046a 0x046a	/* 1.9 - 1.98 */
	.half	0x046a					/* 2.0 */
#else
	.half	0x00ff 0x00ff 0x00ff 0x00ff 0x00ff	/* 1.0 - 1.08 */
	.half	0x00ff 0x00ff 0x00ff 0x00ff 0x0609	/* 1.1 - 1.18 */
	.half	0x0609 0x0609 0x0609 0x0609 0x0619	/* 1.2 - 1.28 */
	.half	0x0619 0x061a 0x051a 0x052a 0x052a	/* 1.3 - 1.38 */
	.half	0x052a 0x052a 0x052a 0x052a 0x053a	/* 1.4 - 1.48 */
	.half	0x053a 0x053a 0x053a 0x053a 0x053b	/* 1.5 - 1.58 */
	.half	0x054b 0x054b 0x054b 0x054b 0x044b	/* 1.6 - 1.68 */
	.half	0x044b 0x044b 0x045b 0x045b 0x045b	/* 1.7 - 1.78 */
	.half	0x045b 0x045b 0x045b 0x045b 0x045b	/* 1.8 - 1.88 */
	.half	0x045b 0x046b 0x046b 0x046b 0x046b	/* 1.9 - 1.98 */
	.half	0x046c					/* 2.0 */
#endif

	.text

#ifdef _IP26_SYNC_WAR
/* Since we cannot handle sync instructions well, we cheat the mongoose
 * __synchronize() built-in to call ___synchronize() instead.  Since the
 * TFP integer side is well ordered, we really do not have to do anything.
 * If we needed to flush the FP load/store side we might be in trouble, but
 * the kernel should only care about integer in the synchronization spots.
 *
 * A function call should ensure the compiler performs loads/stores to
 * aliased or global items before the next function call, and breaks up
 * problems with the compiler re-ordering critical loads/stores.
 *
 * Another option would have been a "nosync" option to the compiler to
 * order stores but just drop the sync, or to de-sync the kernel via post
 * processing (sync -> nada).
 */
	.align	4
LEAF(___synchronize)
	.set	noreorder
	j	ra
	nada				# BDSLOT
	.set	reorder
	END(___synchronize)
#endif