mirror of
https://github.com/torvalds/linux.git
synced 2024-12-30 14:52:05 +00:00
92 lines
2.1 KiB
ArmAsm
92 lines
2.1 KiB
ArmAsm
/*
   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.

   This file is subject to the terms and conditions of the GNU General Public
   License. See the file "COPYING" in the main directory of this archive
   for more details.

   Tight version of memcpy for the case of just copying a page.
   Prefetch strategy empirically optimised against RTL simulations
   of SH5-101 cut2 eval chip with Cayman board DDR memory.

   Parameters:
   r2 : source effective address (start of page)
   r3 : destination effective address (start of page)

   Always copies 4096 bytes.

   Points to review.
   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
     It seems like the prefetch needs to be at least 4 lines ahead to get
     the data into the cache in time, and the allocos contend with outstanding
     prefetches for the same cache set, so it's better to have the numbers
     different.
   */

/*
 * sh64_page_copy - copy one 4096-byte page.
 *
 * In:
 *   r2  = source address (start of page); only read, never modified
 *   r3  = destination address (start of page)
 *   r18 = return address (moved into tr0 and used by the final blink)
 *
 * Out:
 *   No value is returned.  r3 is left pointing one page past the
 *   destination start.
 *
 * Clobbers: r3, r6-r8, r22, r23, r36-r39, r60-r62, tr0-tr3.
 *
 * The loop moves 32 bytes per iteration (one line, going by the existing
 * "2 lines ahead" comment on the 0x40 alloco).  Source data is addressed as
 * r3 + (r2 - r3), so r3 is the only pointer that is advanced.  Destination
 * lines are allocated two lines ahead of the stores.  The source prefetches
 * (loads into r63) are compiled out with #if 0.
 */
	.section .text..SHmedia32,"ax"
	.little

	.balign 8
	.global sh64_page_copy
sh64_page_copy:

	/* Copy 4096 bytes worth of data from r2 to r3.
	   Do prefetches 4 lines ahead.
	   Do alloco 2 lines ahead */

	pta 1f, tr1		! tr1 -> loop head
	pta 2f, tr2		! tr2 -> alloco step (only targeted by the disabled prefetch code)
	pta 3f, tr3		! tr3 -> copy step
	ptabs r18, tr0		! tr0 -> return address taken from r18

#if 0
	/* TAKum03020 */
	ld.q r2, 0x00, r63
	ld.q r2, 0x20, r63
	ld.q r2, 0x40, r63
	ld.q r2, 0x60, r63
#endif
	alloco r3, 0x00		! allocate the first destination line
	synco			! TAKum03020
	alloco r3, 0x20		! allocate the second destination line
	synco			! TAKum03020

	movi 3968, r6		! 3968 = 4096 - 4 * 32
	add r3, r6, r6		! r6 = dst + 3968: prefetch cutoff (only read by the disabled code)
	addi r6, 64, r7		! r7 = dst + 4032: alloco cutoff
	addi r7, 64, r8		! r8 = dst + 4096: end of the destination page
	sub r2, r3, r60		! r60 = src - dst
	addi r60, 8, r61	! r61 = src - dst + 8
	addi r61, 8, r62	! r62 = src - dst + 16
	addi r62, 8, r23	! r23 = src - dst + 24
	addi r60, 0x80, r22	! r22 = src - dst + 128: prefetch offset (only read by the disabled code)

/* Minimal code size. The extra branches inside the loop don't cost much
   because they overlap with the time spent waiting for prefetches to
   complete. */
1:
#if 0
	/* TAKum03020 */
	bge/u r3, r6, tr2	! skip prefetch for last 4 lines
	ldx.q r3, r22, r63	! prefetch 4 lines hence
#endif
2:
	bge/u r3, r7, tr3	! skip alloco for last 2 lines
	alloco r3, 0x40		! alloc destination line 2 lines ahead
	synco			! TAKum03020
3:
	ldx.q r3, r60, r36	! load 32 bytes from the source: r3 + r60 is the current source address
	ldx.q r3, r61, r37
	ldx.q r3, r62, r38
	ldx.q r3, r23, r39
	st.q r3, 0, r36		! store the same 32 bytes to the destination
	st.q r3, 8, r37
	st.q r3, 16, r38
	st.q r3, 24, r39
	addi r3, 32, r3		! advance the destination pointer by 32 bytes
	bgt/l r8, r3, tr1	! loop while r3 is below the end of the page

	blink tr0, r63		! return