diff --git a/app/tests/benchmarks.c b/app/tests/benchmarks.c
index 1a8b7384..8e47e9d4 100644
--- a/app/tests/benchmarks.c
+++ b/app/tests/benchmarks.c
@@ -5,21 +5,32 @@
  * license that can be found in the LICENSE file or at
  * https://opensource.org/licenses/MIT
  */
-#include
-#include
-#include
-#include
-#include
-#include
 #include
-#include
+#include
+#include
 #include
 #include
-#include
+#include
+#include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 
-const size_t BUFSIZE = (1024*1024);
-const uint ITER = 1024;
+// quickly guess how big of a buffer we can try to allocate
+#if !defined(MEMSIZE) || MEMSIZE > (1024 * 1024)
+static const size_t BUFSIZE = (size_t)1024 * 1024;
+static const uint ITER = 1024;
+#else
+static const size_t BUFSIZE = (4 * 1024);
+static const uint ITER = 1024 * 32;
+#endif
+// Have to use a define to work around gcc 7.x bug where it thinks
+// BUFSIZE is not constant.
+#define TOTAL_SIZE ((uint64_t)BUFSIZE * ITER)
 
 __NO_INLINE static void bench_set_overhead(void) {
     uint32_t *buf = malloc(BUFSIZE);
@@ -51,44 +62,50 @@ __NO_INLINE static void bench_memset(void) {
         memset(buf, 0, BUFSIZE);
     }
     count = arch_cycle_count() - count;
+    if (count == 0) { // keep the division below well-defined
+        count = 1;
+    }
 
-    size_t total_bytes = BUFSIZE * ITER;
-    double bytes_cycle = total_bytes / (double)count;
-    printf_float("took %lu cycles to memset a buffer of size %zu %d times (%zu bytes), %f bytes/cycle\n",
-                 count, BUFSIZE, ITER, total_bytes, bytes_cycle);
+    uint64_t bytes_cycle = (TOTAL_SIZE * 1000) / count; // bytes/cycle scaled by 1000 (fixed point)
+    printf("took %lu cycles to memset a buffer of size %zu %u times "
+           "(%" PRIu64 " bytes), %" PRIu64 ".%03" PRIu64 " bytes/cycle\n",
+           count, BUFSIZE, ITER, TOTAL_SIZE, bytes_cycle / 1000, bytes_cycle % 1000);
 
     free(buf);
 }
 
-#define bench_cset(type) \
-__NO_INLINE static void bench_cset_##type(void) \
-{ \
-    type *buf = malloc(BUFSIZE); \
-    if (!buf) { \
-        printf("failed to allocate buffer\n"); \
-        return; \
-    } \
- \
-    ulong count = arch_cycle_count(); \
-    for (uint i = 0; i < ITER; i++) { \
-        for (uint j = 0; j < BUFSIZE / sizeof(*buf); j++) { \
-            buf[j] = 0; \
-        } \
-    } \
-    count = arch_cycle_count() - count; \
- \
-    size_t total_bytes = BUFSIZE * ITER; \
-    double bytes_cycle = total_bytes / (double)count; \
-    printf_float("took %lu cycles to manually clear a buffer using wordsize %zu of size %zu %u times (%zu bytes), %f bytes/cycle\n", \
-                 count, sizeof(*buf), BUFSIZE, ITER, total_bytes, bytes_cycle); \
- \
-    free(buf); \
-}
+#define bench_cset(type) \
+    __NO_INLINE static void bench_cset_##type(void) { \
+        type *buf = malloc(BUFSIZE); \
+        if (!buf) { \
+            printf("failed to allocate buffer\n"); \
+            return; \
+        } \
+ \
+        ulong count = arch_cycle_count(); \
+        for (uint i = 0; i < ITER; i++) { \
+            for (uint j = 0; j < BUFSIZE / sizeof(*buf); j++) { \
+                buf[j] = 0; \
+            } \
+        } \
+        count = arch_cycle_count() - count; \
+        if (count == 0) { \
+            count = 1; \
+        } \
+ \
+        uint64_t bytes_cycle = (TOTAL_SIZE * 1000) / count; \
+        printf("took %lu cycles to manually clear a buffer using wordsize %zu of size %zu %u times " \
+               "(%" PRIu64 " bytes), %" PRIu64 ".%03" PRIu64 " bytes/cycle\n", \
+               count, sizeof(*buf), BUFSIZE, ITER, TOTAL_SIZE, bytes_cycle / 1000, bytes_cycle % 1000); \
+        free(buf); \
+    }
 
+// clang-format off
 bench_cset(uint8_t)
 bench_cset(uint16_t)
 bench_cset(uint32_t)
 bench_cset(uint64_t)
+// clang-format on
 
 __NO_INLINE static void bench_cset_wide(void) {
     uint32_t *buf = malloc(BUFSIZE);
@@ -99,23 +116,26 @@ __NO_INLINE static void bench_cset_wide(void) {
 
     ulong count = arch_cycle_count();
     for (uint i = 0; i < ITER; i++) {
-        for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
-            buf[j*8] = 0;
-            buf[j*8+1] = 0;
-            buf[j*8+2] = 0;
-            buf[j*8+3] = 0;
-            buf[j*8+4] = 0;
-            buf[j*8+5] = 0;
-            buf[j*8+6] = 0;
-            buf[j*8+7] = 0;
+        for (size_t j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
+            buf[j * 8] = 0;
+            buf[j * 8 + 1] = 0;
+            buf[j * 8 + 2] = 0;
+            buf[j * 8 + 3] = 0;
+            buf[j * 8 + 4] = 0;
+            buf[j * 8 + 5] = 0;
+            buf[j * 8 + 6] = 0;
+            buf[j * 8 + 7] = 0;
         }
     }
     count = arch_cycle_count() - count;
+    if (count == 0) { // keep the division below well-defined
+        count = 1;
+    }
 
-    size_t total_bytes = BUFSIZE * ITER;
-    double bytes_cycle = total_bytes / (double)count;
-    printf_float("took %lu cycles to manually clear a buffer of size %zu %d times 8 words at a time (%zu bytes), %f bytes/cycle\n",
-                 count, BUFSIZE, ITER, total_bytes, bytes_cycle);
+    uint64_t bytes_cycle = (TOTAL_SIZE * 1000) / count; // bytes/cycle scaled by 1000 (fixed point)
+    printf("took %lu cycles to manually clear a buffer of size %zu %u times 8 words at a time "
+           "(%" PRIu64 " bytes), %" PRIu64 ".%03" PRIu64 " bytes/cycle\n",
+           count, BUFSIZE, ITER, TOTAL_SIZE, bytes_cycle / 1000, bytes_cycle % 1000);
 
     free(buf);
 }
@@ -132,11 +152,15 @@ __NO_INLINE static void bench_memcpy(void) {
         memcpy(buf, buf + BUFSIZE / 2, BUFSIZE / 2);
     }
     count = arch_cycle_count() - count;
+    if (count == 0) { // keep the division below well-defined
+        count = 1;
+    }
 
-    size_t total_bytes = (BUFSIZE / 2) * ITER;
-    double bytes_cycle = total_bytes / (double)count;
-    printf_float("took %lu cycles to memcpy a buffer of size %zu %d times (%zu source bytes), %f source bytes/cycle\n",
-                 count, BUFSIZE / 2, ITER, total_bytes, bytes_cycle);
+    uint64_t total_bytes = TOTAL_SIZE / 2; // only half of the buffer is the copy source
+    uint64_t bytes_cycle = (total_bytes * 1000) / count; // source bytes/cycle scaled by 1000
+    printf("took %lu cycles to memcpy a buffer of size %zu %u times (%" PRIu64 " source bytes), "
+           "%" PRIu64 ".%03" PRIu64 " source bytes/cycle\n",
+           count, BUFSIZE / 2, ITER, total_bytes, bytes_cycle / 1000, bytes_cycle % 1000);
 
     free(buf);
 }
@@ -153,22 +177,23 @@ __NO_INLINE static void arm_bench_cset_stm(void) {
     for (uint i = 0; i < ITER; i++) {
         for (uint j = 0; j < BUFSIZE / sizeof(*buf) / 8; j++) {
             __asm__ volatile(
-                "stm    %0, {r0-r7};"
-                :: "r" (&buf[j*8])
-            );
+                "stm    %0, {r0-r7};" ::"r"(&buf[j * 8]));
         }
     }
     count = arch_cycle_count() - count;
+    if (count == 0) { // keep the division below well-defined
+        count = 1;
+    }
 
-    size_t total_bytes = BUFSIZE * ITER;
-    double bytes_cycle = total_bytes / (float)count;
-    printf_float("took %lu cycles to manually clear a buffer of size %zu %d times 8 words at a time using stm (%zu bytes), %f bytes/cycle\n",
-                 count, BUFSIZE, ITER, total_bytes, bytes_cycle);
+    uint64_t bytes_cycle = (TOTAL_SIZE * 1000) / count; // bytes/cycle scaled by 1000 (fixed point)
+    printf("took %lu cycles to manually clear a buffer of size %zu %u times 8 words at a time using stm "
+           "(%" PRIu64 " bytes), %" PRIu64 ".%03" PRIu64 " bytes/cycle\n",
+           count, BUFSIZE, ITER, TOTAL_SIZE, bytes_cycle / 1000, bytes_cycle % 1000);
 
     free(buf);
 }
 
-#if (__CORTEX_M >= 0x03)
+#if (__CORTEX_M >= 0x03)
 __NO_INLINE static void arm_bench_multi_issue(void) {
     ulong cycles;
     uint32_t a = 0, b = 0, c = 0, d = 0, e = 0, f = 0, g = 0, h = 0;
@@ -176,20 +201,21 @@ __NO_INLINE static void arm_bench_multi_issue(void) {
     uint count = ITER;
     cycles = arch_cycle_count();
     while (count--) {
-        asm volatile ("");
-        asm volatile ("add %0, %0, %0" : "=r" (a) : "r" (a));
-        asm volatile ("add %0, %0, %0" : "=r" (b) : "r" (b));
-        asm volatile ("and %0, %0, %0" : "=r" (c) : "r" (c));
-        asm volatile ("mov %0, %0" : "=r" (d) : "r" (d));
-        asm volatile ("orr %0, %0, %0" : "=r" (e) : "r" (e));
-        asm volatile ("add %0, %0, %0" : "=r" (f) : "r" (f));
-        asm volatile ("and %0, %0, %0" : "=r" (g) : "r" (g));
-        asm volatile ("mov %0, %0" : "=r" (h) : "r" (h));
+        asm volatile("");
+        asm volatile("add %0, %0, %0" : "=r"(a) : "r"(a));
+        asm volatile("add %0, %0, %0" : "=r"(b) : "r"(b));
+        asm volatile("and %0, %0, %0" : "=r"(c) : "r"(c));
+        asm volatile("mov %0, %0" : "=r"(d) : "r"(d));
+        asm volatile("orr %0, %0, %0" : "=r"(e) : "r"(e));
+        asm volatile("add %0, %0, %0" : "=r"(f) : "r"(f));
+        asm volatile("and %0, %0, %0" : "=r"(g) : "r"(g));
+        asm volatile("mov %0, %0" : "=r"(h) : "r"(h));
     }
     cycles = arch_cycle_count() - cycles;
 
-    double cycles_iter = (float)cycles / ITER;
-    printf_float("took %lu cycles to issue 8 integer ops (%f cycles/iteration)\n", cycles, cycles_iter);
+    uint64_t cycles_per_iter = ((uint64_t)cycles * 1000) / ITER; // widen first: cycles * 1000 wraps a 32-bit ulong
+    printf("took %lu cycles to issue 8 integer ops (%" PRIu64 ".%03" PRIu64 " cycles/iteration)\n", cycles,
+           cycles_per_iter / 1000, cycles_per_iter % 1000);
 #undef ITER
 }
 #endif // __CORTEX_M
@@ -251,7 +277,7 @@ int benchmarks(int argc, const console_cmd_args *argv) {
 
 #if ARCH_ARM
     arm_bench_cset_stm();
-#if (__CORTEX_M >= 0x03)
+#if (__CORTEX_M >= 0x03)
     arm_bench_multi_issue();
 #endif
 #endif
@@ -261,4 +287,3 @@ int benchmarks(int argc, const console_cmd_args *argv) {
 
     return NO_ERROR;
 }
-
diff --git a/app/tests/rules.mk b/app/tests/rules.mk
index 4fd5c4dd..ea30e888 100644
--- a/app/tests/rules.mk
+++ b/app/tests/rules.mk
@@ -8,11 +8,11 @@ MODULE_SRCS := \
 	$(LOCAL_DIR)/clock_tests.c \
 	$(LOCAL_DIR)/fibo.c \
 	$(LOCAL_DIR)/mem_tests.c \
+	$(LOCAL_DIR)/port_tests.c \
 	$(LOCAL_DIR)/tests.c \
 	$(LOCAL_DIR)/thread_tests.c \
-	$(LOCAL_DIR)/port_tests.c \
-	$(LOCAL_DIR)/v9p_tests.c \
 	$(LOCAL_DIR)/v9fs_tests.c \
+	$(LOCAL_DIR)/v9p_tests.c \
 
 MODULE_FLOAT_SRCS := \
 	$(LOCAL_DIR)/benchmarks.c \