[lib][libc][arm] optimize memcpy/memmove a bit for cortex-a9
- Prefetch 64 bytes ahead.
- Copy a full 32 bytes at once in a single ldm/stm instruction.

This increased performance in the best case on a Zybo board from ~250MB/sec to ~680MB/sec.
This commit is contained in:
@@ -76,20 +76,19 @@ FUNCTION(memcpy)
|
||||
// and we need at least 32 bytes remaining to copy
|
||||
|
||||
// save the extra scratch registers used by the big copy
|
||||
stmfd sp!, {r6-r7}
|
||||
stmfd sp!, {r6-r11}
|
||||
|
||||
sub r2, r2, #32 // subtract an extra 32 from the len so we can avoid an extra compare
|
||||
|
||||
.L_bigcopy_loop:
|
||||
ldmia r1!, {r4, r5, r6, r7}
|
||||
stmia r0!, {r4, r5, r6, r7}
|
||||
ldmia r1!, {r4, r5, r6, r7}
|
||||
pld [r1, #64]
|
||||
ldmia r1!, {r4-r11}
|
||||
subs r2, r2, #32
|
||||
stmia r0!, {r4, r5, r6, r7}
|
||||
stmia r0!, {r4-r11}
|
||||
bge .L_bigcopy_loop
|
||||
|
||||
// restore the scratch registers saved for the big copy
|
||||
ldmfd sp!, {r6-r7}
|
||||
ldmfd sp!, {r6-r11}
|
||||
|
||||
// see if we are done
|
||||
adds r2, r2, #32
|
||||
|
||||
Reference in New Issue
Block a user