[lib][libc][arm] optimize memcpy/memmove a bit for cortex-a9

-Prefetch 64 bytes ahead
-Copy a full 32 bytes at once in a single ldm/stm instruction

In the best case, this increased copy throughput on a Zybo board from
~250 MB/sec to ~680 MB/sec.
This commit is contained in:
Travis Geiselbrecht
2014-08-04 21:16:08 -07:00
parent 40f22ac60a
commit e81ac4155a

View File

@@ -76,20 +76,19 @@ FUNCTION(memcpy)
// and we need at least 32 bytes remaining to copy
// save the extra registers the big copy loop uses
stmfd sp!, {r6-r7}
stmfd sp!, {r6-r11}
sub r2, r2, #32 // subtract an extra 32 to the len so we can avoid an extra compare
.L_bigcopy_loop:
ldmia r1!, {r4, r5, r6, r7}
stmia r0!, {r4, r5, r6, r7}
ldmia r1!, {r4, r5, r6, r7}
pld [r1, #64]
ldmia r1!, {r4-r11}
subs r2, r2, #32
stmia r0!, {r4, r5, r6, r7}
stmia r0!, {r4-r11}
bge .L_bigcopy_loop
// restore the registers saved for the big copy loop
ldmfd sp!, {r6-r7}
ldmfd sp!, {r6-r11}
// see if we are done
adds r2, r2, #32