1
0
Files
irix-657m-src/irix/kern/ml/delayasm.s
2022-09-29 17:59:04 +03:00

413 lines
10 KiB
ArmAsm

/*
* Assembly-language source for calibrated delay.
*/
#ident "$Revision: 1.30 $"
#include <ml/ml.h>
#ifdef R10000
/* based on trying on a few parts, and tuning to scalar dispatch */
#define LOOP_FACTOR nop;nop;nop;nop;nop;nop;nop
#else
#define LOOP_FACTOR
#endif
#if defined(PT_CLOCK_ADDR) || defined(PT_CLOCK_OFFSET)
/*
* Calculate the number of ticks at 3.6864MHz to execute 1024 instructions,
* using counter 2 on the 8254 programmable interval timer.
*
* This routine should *not* be called on early revision Indy systems,
* specifically p0 systems and p1 systems with IOC1.1. This is because
* the 8254 did not count correctly. On IOC1.2 (or IOC2), the 8254
* does count correctly, it just doesn't interrupt correctly. So this
* routine can be used in that case.
*
* The routine attempts to discover if it has been erroneously called
* and if so, returns an answer consistent with a 50MHz R4000SC (19).
*/
#define SB(src,dst) sb src,dst; wbflushm
#define COUNT 10000
/*
* This routine should *not* be called on early revision Indy
* systems, specifically p0 systems.
*/
LEAF(_ticksper1024inst)
/*
 * Times a fixed-length instruction loop against counter 2 of the 8254.
 * Takes no arguments.  Returns in v0 the starting count (COUNT) minus
 * the count read back after the loop.  On IP22 the routine may instead
 * return the constant 19 without touching the timer (see below).
 * Registers written: v0, t0-t3, ta0, ta1.
 */
#if IP22
LI t1,(K1BASE|HPC3_INT2_ADDR+PT_CLOCK_OFFSET) # assume INT2
lw t2,(K1BASE|HPC3_SYS_ID) # t2 = system ID word, tested field by field below
andi t3, t2, CHIP_IOC1
beqz t3, 4f # not IOC1 - use INT2's address
andi t3, t2, BOARD_IP22
bnez t3, 5f # IP22 w/IOC1 - use INT3's address
andi t3, t2, BOARD_REV_MASK
sub t3, 2 << BOARD_REV_SHIFT # t3 >= 0 iff the board rev field is at least 2
bgez t3, 5f # At least a P1 board - use INT3's adrs
li v0, 19 # PTC doesn't work - use 50Mhz constant
j ra # early exit: the timer is never programmed on this path
5: LI t1,(K1BASE|HPC3_INT3_ADDR+PT_CLOCK_OFFSET)
4:
#elif IP26
LI t1,(K1BASE|HPC3_INT2_ADDR+PT_CLOCK_OFFSET)
#elif IP28
CLI t1,(COMPAT_K1BASE32|HPC3_INT2_ADDR+PT_CLOCK_OFFSET)
#else
li t1,PT_CLOCK_ADDR
#endif
2: # the whole measurement restarts here when the read-back is implausible
li t0,2 # loop twice in case of I-cache
1: # start of one timed pass
#ifdef R10000
li ta1,1024 # scale for super-scalar
#else
li ta1,512 # iteration counter for the timed loop
#endif
li t2,PTCW_SC(2)|PTCW_16B|PTCW_MODE(2) # control word written before the count is loaded
li t3,PTCW_SC(2)|PTCW_CLCMD # command written immediately after the timed loop
SB(t2, PT_CONTROL(t1))
li t2,COUNT&0xff # lsb
SB(t2, PT_COUNTER2(t1))
li t2,COUNT>>8 # msb
SB(t2, PT_COUNTER2(t1))
/* 1024 inst: the loop below is the code being timed */
.set noreorder
3: LOOP_FACTOR
bgt ta1,zero,3b
subu ta1,1 # branch delay slot: the decrement runs on every pass
.set reorder
SB(t3, PT_CONTROL(t1)) # stop counting now!
.set noreorder
lbu t3,PT_COUNTER2(t1) # lsb
nop # filler after the load (original label: BDSLOT)
lbu ta0,PT_COUNTER2(t1) # msb
nop # filler after the load (original label: BDSLOT)
.set reorder
sll ta0,8
or t3,ta0 # t3 contains ending count
subu t0,1 # loop again from primed cache
bnez t0,1b
li t2,PTCW_SC(2)|PTCW_16B|PTCW_MODE(MODE_STS)
SB(t2, PT_CONTROL(t1)) # final control write, selecting MODE_STS
li v0,COUNT
blt v0,t3,2b # XXX sometimes get msb before lsb!
subu v0,t3 # v0 = COUNT - ending count
j ra
END(_ticksper1024inst)
#if defined(IP20) || defined(IP22) || defined(IP26) || defined(IP28)
#define COUNT0 0x164
/*
* find out the r4000 timer count delta in ~100 i8254 ticks
* THIS IS FOR R4000 INDIGO and INDIGO2 (and >P1 INDY) ONLY
*
* This routine should *not* be called on early revision Indy systems,
* specifically p0 systems and p1 systems with IOC1.1. This is because
* the 8254 did not count correctly. On IOC1.2 (or IOC2), the 8254
* does count correctly, it just doesn't interrupt correctly. So this
* routine can be used in that case.
*
* The routine attempts to discover if it has been erroneously called
* and if so, returns an answer consistent with a 50MHz R4000SC.
*
* The early logic here determines which type of running system we have:
*
* IP22 w/o IOC - use address of INT2 + 8254 offset
* IP22 w IOC - use address of INT3 + 8254 offset
* IP24 rev >= 2 - use address of INT3 + 8254 offset
* IP24 rev < 2 - use R4000SC constant (5153)
*
* (The number 5153 was an average of a few test runs of this routine.
* The range was from 5040 to 5207.)
*
*/
LEAF(_cpuclkper100ticks)
/*
 * In:  a0 = address of a word that receives the number of 8254 ticks
 *           actually measured (COUNT0 minus the final counter lsb).
 * Out: v0 = C0_COUNT delta between the start of counting and the last
 *           C0_COUNT sample taken in the polling loop.
 * Registers written: v0, v1, a1, a2, t1-t3.
 */
#if IP22
LI t1,(K1BASE|HPC3_INT2_ADDR+PT_CLOCK_OFFSET) # assume INT2
lw t2,(K1BASE|HPC3_SYS_ID) # t2 = system ID word, tested field by field below
andi t3, t2, CHIP_IOC1
beqz t3, 2f # not IOC1 - use INT2's address
andi t3, t2, BOARD_IP22
bnez t3, 3f # IP22 w/IOC1 - use INT3's address
andi t3, t2, BOARD_REV_MASK
sub t3, 2 << BOARD_REV_SHIFT # t3 >= 0 iff the board rev field is at least 2
bgez t3, 3f # At least a P1 board - use INT3's adrs
li v0, 5153 # PTC doesn't work - use 50MHz constant
j ra # NOTE(review): this path returns without storing anything to 0(a0)
3: LI t1,(K1BASE|HPC3_INT3_ADDR+PT_CLOCK_OFFSET)
2:
#elif IP26 || IP28
.set noreorder
#ifdef TFP
dmtc0 zero,C0_COUNT # ensure no interrupt
#endif
.set reorder
LI t1,(K1BASE|HPC3_INT2_ADDR|PT_CLOCK_OFFSET)
#else
LI t1,PT_CLOCK_ADDR
#endif
li t2,PTCW_SC(2)|PTCW_16B|PTCW_MODE(2) # control word written before the count is loaded
li t3,PTCW_SC(2)|PTCW_CLCMD # latch command, reissued on every poll below
SB(t2, PT_CONTROL(t1))
li t2,COUNT0&0xff # lsb
SB(t2, PT_COUNTER2(t1))
li t2,COUNT0>>8 # msb
SB(t2, PT_COUNTER2(t1))
.set noreorder
#if TFP
dmfc0 v0,C0_COUNT # initial TFP timer count
#else
mfc0 v0,C0_COUNT # initial r4000 timer count
#endif
.set reorder
1: SB(t3, PT_CONTROL(t1)) # latch counter
.set noreorder
#if TFP
dmfc0 v1,C0_COUNT # current TFP timer count
#else
mfc0 v1,C0_COUNT # current r4000 timer count
#endif
lbu a1,PT_COUNTER2(t1) # lsb
lbu a2,PT_COUNTER2(t1) # msb
nop # filler after the loads (original label: BDSLOT)
.set reorder
bnez a2,1b # poll until the msb reads zero, i.e. the count is below 0x100
li t2,PTCW_SC(2)|PTCW_16B|PTCW_MODE(MODE_STS)
SB(t2, PT_CONTROL(t1)) # stop counting
li a2,COUNT0
subu a1,a2,a1 # a1 = COUNT0 - lsb (msb is zero here, so lsb is the whole count)
sw a1,0(a0) # actual number of i8254 timer ticks
subu v0,v1,v0 # r4000 timer count delta
j ra
END(_cpuclkper100ticks)
#endif /* IP20 || IP22 || IP26 || IP28 */
#endif /* defined(PT_CLOCK_ADDR) */
#if IP32
/*
* this value is chosen because the crime counter has an input
* frequency of 66Mhz while the 8254 on IP22 has an input frequency
* of 1Mhz.
*/
#define COUNT (0x164 * 66)
#define TRIAL_COUNT 4
/*
* _ticksper1024inst() -- returns number of CRM_TIME ticks in the
* time it takes to execute 1024 instructions.
*
* register usage:
* v0 -- number of ticks
* t0 -- Address of CRM_TIME register
* t1 -- Count of instructions to execute
* t2 -- Starting number of ticks
* t3 -- scratch
*/
LEAF(_ticksper1024inst)
/*
 * Two passes over the same code: a very short one that only warms the
 * i-cache, then the real one.  v0 is overwritten by each pass, so the
 * value returned is the CRM_TIME delta of the second pass.
 */
	la	t0, CRM_TIME|K1BASE		# t0 -> CRM_TIME register
	li	t1, 1				# warm-up pass uses a tiny loop count
	li	t3, 2				# passes left to run
8:
	ld	t2, 0(t0)			# t2 = CRM_TIME at start of pass
	.set	noreorder
9:
	bgt	t1, zero, 9b			# spin while the counter is positive
	subu	t1, t1, 1			# BDSLOT: decrement on every trip
	.set	reorder
	ld	v0, 0(t0)			# v0 = CRM_TIME at end of pass
	dsubu	v0, v0, t2			# v0 = ticks consumed by the pass
	li	t1, 512				# loop count for the measured pass
	subu	t3, t3, 1			# one pass done
	bgt	t3, zero, 8b			# run the measured pass if any remain
	j	ra
END(_ticksper1024inst)
/*
* _cpuclkper100ticks(&tick_count) - returns the number of cpu clocks
* required for COUNT ticks of the CRIME time base register. tick_count
* will contain the actual number of ticks of the CRIME timer (which will
* not be 100).
*
* We prime the cache before beginning measurement and then take the
* average of TRIAL_COUNT (4) trials.
*
* register usage:
* t0 -- limit timer count
* t1 -- address of crime time register
* t2 -- current value of CRM_TIME
* t3 -- loop count register
* ta0 -- scratch
* ta1 -- total of CRM_TIME ticks
* ta2 -- total of C0_COUNT ticks
*
* We make no provision for checking for wrap around on the CRM_TIME
* register or C0_COUNT since each of these registers will take several
* minutes from boot to wrap around (about 30 for C0_COUNT, much much
* longer for CRM_TIME -- 71079 minutes).
*
* NB: it is OK to write C0_COUNT here since we have yet to initialize
* the clock.
*/
LEAF(_cpuclkper100ticks)
/*
 * In:  a0 = address of a 32-bit word that receives the average number
 *           of CRM_TIME ticks per trial.
 * Out: v0 = average number of C0_COUNT ticks per trial.
 *
 * Runs TRIAL_COUNT trials.  Each trial zeroes C0_COUNT and CRM_TIME and
 * then spins until the low 32 bits of CRM_TIME reach COUNT, sampling
 * C0_COUNT on every poll.  The per-trial readings are summed in ta1
 * (CRM_TIME) and ta2 (C0_COUNT) and divided by TRIAL_COUNT at the end.
 * Registers written: v0, t0-t3, ta0-ta2, HI/LO.
 */
	.set	noreorder
	mtc0	zero,C0_COUNT
	.set	reorder
	la	t1,CRM_TIME|K1BASE
	li	t3,TRIAL_COUNT-1	# trials still to run after the current one
	la	ta0,1f			# start of the code to pre-load
	li	ta1,0			# running total of CRM_TIME ticks
	li	ta2,0			# running total of C0_COUNT ticks
	/*
	 * now suck in the code we are about to execute into the
	 * cache so that we can avoid cache fill induced delays
	 * during the execution.
	 */
	.set	noreorder
	cache	CACH_PI|C_FILL,0(ta0)
	cache	CACH_PI|C_FILL,32(ta0)
	cache	CACH_PI|C_FILL,64(ta0)
	cache	CACH_PI|C_FILL,96(ta0)
	cache	CACH_PI|C_FILL,128(ta0)
1:
	mtc0	zero,C0_COUNT		# restart both time bases for this trial
	sd	zero,0(t1)
	li	t0,COUNT		# t0 = CRM_TIME value that ends the trial
2:
	ld	t2,0(t1)		# t2 = current CRM_TIME
	dsll32	t2,0
	dsra32	t2,0			# keep only the low 32 bits, sign-extended
	mfc0	v0,C0_COUNT		# sample C0_COUNT alongside it
	sltu	ta0,t2,t0
	bne	ta0,zero,2b		# keep polling while CRM_TIME < COUNT
	nop				# BDSLOT
	.set	reorder
	addu	ta1,t2			# total CRM_TIME ticks for all runs
	addu	ta2,v0			# total C0_COUNT ticks for all runs
	.set	noreorder
	bne	t3,zero,1b		# another trial if any remain
	subu	t3,1			# BDSLOT: plain decrement (was addi t3,0xffff)
	.set	reorder
	li	t3,TRIAL_COUNT
	divu	ta2,t3
	mflo	v0			# average of runs
	divu	ta1,t3
	mflo	ta0			# average of runs
	sw	ta0,0(a0)
	j	ra
END(_cpuclkper100ticks)
#define CALIBRATION_LOOP_CNT (100000)
/*
* delay_calibrate(void) --
* calculates number of CRM_TIME ticks in the
* time it takes to execute the CALIBRATION_LOOP_CNT
* and sets the us_delay decrementer in the pda
* XXX there is a possible counter roll-over but
* the crime counter is 48 bits and would take 150 days to
* roll over.
*
* register usage:
* v0 -- number of ticks
* t0 -- Address of CRM_TIME register
* t1 -- Count of calibration loop
* t2 -- Starting number of ticks
* t3 -- counter for i-cache fill
*/
LEAF(delay_calibrate)
/*
 * Runs the spin loop twice (a minimal pass to warm the i-cache, then the
 * measured pass), converts the CRM_TIME delta of the measured pass and
 * stores the result in the pda.  Takes no arguments.
 */
	la	t0, CRM_TIME|K1BASE		# t0 -> CRM_TIME register
	li	t1, 1				# warm-up pass uses a tiny loop count
	li	t3, 2				# passes left to run
8:
	.set	noreorder
	ld	t2, 0(t0)			# t2 = CRM_TIME at start of pass
9:
	LOOP_FACTOR
	bgt	t1, zero, 9b			# spin while the counter is positive
	subu	t1, t1, 1			# BDSLOT: decrement on every trip
	.set	reorder
	ld	v0, 0(t0)			# v0 = CRM_TIME at end of pass
	dsubu	v0, v0, t2			# v0 = ticks consumed by the pass
	li	t1, (CALIBRATION_LOOP_CNT-1)	# loop count for the measured pass
	subu	t3, t3, 1			# one pass done
	bgt	t3, zero, 8b			# run the measured pass if any remain
	/*
	 * v0 now holds the master_freq ticks taken by the measured pass.
	 * On IP32 the MASTER_FREQ is 66.666500 MHz, so one tick lasts
	 * DNS_PER_TICK (15) nanoseconds.  us_delay wants the nanoseconds
	 * spent per loop iteration as its decrementer, i.e.
	 *
	 *	(v0 * DNS_PER_TICK) / CALIBRATION_LOOP_CNT
	 */
	mul	v0, DNS_PER_TICK
	divu	v0, CALIBRATION_LOOP_CNT
	sw	v0, VPDA_DECINSPERLOOP(zero)	# pda.decinsperloop
	j	ra
END(delay_calibrate)
/* void delayloop(uint count, uint decr)
* - delay loop with a loop factoring, and also helps sometimes with
* messy compilers.
*
* register usage:
* a0 -- nano secs to spin
* a1 -- nano secs to decrement per iteration
*/
.align 6
LEAF(delayloop)
/*
 * Spin, subtracting a1 from a0 once per trip, until a0 is no longer
 * positive (signed compare).  LOOP_FACTOR pads each trip.
 */
	sync
	.set	noreorder
9:
	LOOP_FACTOR
	bgt	a0, zero, 9b			# keep going while a0 > 0
	subu	a0, a0, a1			# BDSLOT: a0 -= a1 on every trip
	.set	reorder
	j	ra
END(delayloop)
#endif /* IP32 */
#if defined(IP20) || defined(IP22) || defined(IP26) || defined(IPMHSIM)
/* void delayloop(int count, int decr) */
LEAF(delayloop)
/*
 * Spin, subtracting a1 from a0 once per trip, until a0 is no longer
 * positive (signed compare).
 */
	.set	noreorder
9:
	bgt	a0, zero, 9b			# keep going while a0 > 0
	subu	a0, a0, a1			# BDSLOT: a0 -= a1 on every trip
	j	ra
	nop					# BDSLOT of the return
	.set	reorder
END(delayloop)
#endif