mirror of
https://github.com/torvalds/linux.git
synced 2024-11-16 17:12:06 +00:00
1da177e4c3
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
92 lines
2.1 KiB
ArmAsm
92 lines
2.1 KiB
ArmAsm
/*
|
|
Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
|
|
|
|
This file is subject to the terms and conditions of the GNU General Public
|
|
License. See the file "COPYING" in the main directory of this archive
|
|
for more details.
|
|
|
|
Tight version of memcpy for the case of just copying a page.
|
|
Prefetch strategy empirically optimised against RTL simulations
|
|
of SH5-101 cut2 eval chip with Cayman board DDR memory.
|
|
|
|
Parameters:
|
|
r2 : source effective address (start of page)
|
|
r3 : destination effective address (start of page)
|
|
|
|
Always copies 4096 bytes.
|
|
|
|
Points to review.
|
|
* Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
|
|
It seems like the prefetch needs to be at least 4 lines ahead to get
|
|
the data into the cache in time, and the allocos contend with outstanding
|
|
prefetches for the same cache set, so it's better to have the numbers
|
|
different.
|
|
*/
|
|
|
|
/* Place the routine in the executable, allocatable ("ax") section named
   .text..SHmedia32. */
.section .text..SHmedia32,"ax"
|
|
/* NOTE(review): presumably selects little-endian assembly for what
   follows; confirm against the GNU as SH64 documentation. */
.little
|
|
|
|
/* Align the entry point on an 8-byte boundary. */
.balign 8
|
|
/* Export the entry point so that other objects can link against it. */
.global sh64_page_copy
|
|
/* sh64_page_copy: copy one 4096-byte page.

   In:     r2  = source address
           r3  = destination address
           r18 = return address (moved into tr0 and used by the final blink)
   Out:    nothing; on return r3 has been advanced to destination + 4096.
   Writes: r3, r6, r7, r8, r22, r23, r36-r39, r60-r62, tr0-tr3.

   The page is moved 32 bytes (four 8-byte quadwords) per loop iteration,
   i.e. 128 iterations.  The alloco offsets below (0x40 == "2 lines")
   imply a 32-byte line, so one iteration handles one destination line.

   Both source prefetch sequences are currently compiled out with "#if 0"
   under the TAKum03020 tag; only the alloco of destination lines is
   active.  NOTE(review): TAKum03020 is not explained in this file;
   presumably a hardware erratum/workaround identifier - confirm. */
sh64_page_copy:
|
|
|
|
/* Copy 4096 bytes worth of data from r2 to r3.
|
|
Do prefetches 4 lines ahead.
|
|
Do alloco 2 lines ahead */
|
|
|
|
/* Preload the branch target registers: tr1 = loop head (label 1),
   tr2 = label 2, tr3 = label 3, tr0 = return address taken from r18.
   tr2 is only branched to from inside the disabled "#if 0" code. */
pta 1f, tr1
|
|
pta 2f, tr2
|
|
pta 3f, tr3
|
|
ptabs r18, tr0
|
|
|
|
/* Disabled: loads of the first four source lines into r63, apparently
   used purely as prefetches (the loaded value is never read). */
#if 0
|
|
/* TAKum03020 */
|
|
ld.q r2, 0x00, r63
|
|
ld.q r2, 0x20, r63
|
|
ld.q r2, 0x40, r63
|
|
ld.q r2, 0x60, r63
|
|
#endif
|
|
/* Allocate the first two destination lines up front.  The in-loop alloco
   always runs two lines ahead of r3, so it never covers offsets 0x00 and
   0x20.  Each alloco is followed by a synco. */
alloco r3, 0x00
|
|
synco ! TAKum03020
|
|
alloco r3, 0x20
|
|
synco ! TAKum03020
|
|
|
|
/* Loop limits, all derived from the initial destination pointer:
     r6 = dst + 3968 (4096 - 128): prefetch cut-off, last 4 lines
     r7 = dst + 4032 (4096 - 64):  alloco cut-off, last 2 lines
     r8 = dst + 4096:              end of the destination page */
movi 3968, r6
|
|
add r3, r6, r6
|
|
addi r6, 64, r7
|
|
addi r7, 64, r8
|
|
/* r60 = source - destination, so "r3 + r60" addresses the source byte
   that corresponds to destination pointer r3; only r3 has to be advanced
   in the loop.  r61, r62 and r23 are the same delta plus 8, 16 and 24,
   one per quadword of the line.  r22 = delta + 0x80 (4 lines ahead) is
   only consumed by the disabled prefetch below. */
sub r2, r3, r60
|
|
addi r60, 8, r61
|
|
addi r61, 8, r62
|
|
addi r62, 8, r23
|
|
addi r60, 0x80, r22
|
|
|
|
/* Minimal code size. The extra branches inside the loop don't cost much
|
|
because they overlap with the time spent waiting for prefetches to
|
|
complete. */
|
|
/* Loop head: one pass per 32-byte destination line. */
1:
|
|
#if 0
|
|
/* TAKum03020 */
|
|
bge/u r3, r6, tr2 ! skip prefetch for last 4 lines
|
|
ldx.q r3, r22, r63 ! prefetch 4 lines hence
|
|
#endif
|
|
/* Once r3 >= r7 the line at r3 + 0x40 would lie at or beyond the end of
   the destination page, so the alloco is skipped for the final two
   iterations.  NOTE(review): the /u and /l branch suffixes are presumably
   the unlikely/likely prediction hints - confirm against the SHmedia
   instruction set manual. */
2:
|
|
bge/u r3, r7, tr3 ! skip alloco for last 2 lines
|
|
alloco r3, 0x40 ! alloc destination line 2 lines ahead
|
|
synco ! TAKum03020
|
|
/* Copy one line: issue all four quadword loads from the source
   (r3 + delta) before the four stores to the destination (r3). */
3:
|
|
ldx.q r3, r60, r36
|
|
ldx.q r3, r61, r37
|
|
ldx.q r3, r62, r38
|
|
ldx.q r3, r23, r39
|
|
st.q r3, 0, r36
|
|
st.q r3, 8, r37
|
|
st.q r3, 16, r38
|
|
st.q r3, 24, r39
|
|
/* Advance the destination pointer by one line and loop while the end of
   the page (r8) is still above it. */
addi r3, 32, r3
|
|
bgt/l r8, r3, tr1
|
|
|
|
/* Branch to the return address held in tr0; the link value goes to r63,
   i.e. it is not kept. */
blink tr0, r63 ! return
|
|
|
|
|