#include "modp_ascii.h" #include "modp_ascii_data.h" #include #include #include #include //extern const char gsToUpperMap[256]; /** * This is standard clib implementation of uppercasing a string. * It has an unfair advantage since it's inside the test file * so the optimizer can inline it. */ static void toupper_copy1(char* dest, const char* str, size_t len) { size_t i; for (i = 0; i < len; ++i) { // toupper is defined in *dest++ = (char)toupper((int)str[i]); } *dest = 0; } /** * Skipping ctype, and doing the compare directly * */ static void toupper_copy2(char* dest, const char* str, size_t len) { size_t i; char c; for (i = 0; i < len; ++i) { c = str[i]; *dest++ = (char)((c >= 'a' && c <= 'z') ? c : (c - 32)); } *dest = 0; } /** * Sequential table lookup */ static void toupper_copy3(char* dest, const char* str, size_t len) { size_t i; unsigned char c; for (i = 0; i < len; ++i) { c = (unsigned char)str[i]; *dest++ = (char)gsToUpperMap[c]; } *dest = 0; } /** \brief toupper Version 4 -- parallel table lookup * * */ static void toupper_copy4(char* dest, const char* str, size_t len) { /* * size_t i; * for (i = 0; i < len; ++i) { * char c = str[i]; * *dest++ = (c >= 'a' && c <= 'z') ? c - 32 : c; * } */ size_t i; uint8_t c1, c2, c3, c4; const size_t leftover = len % 4; const size_t imax = len - leftover; const uint8_t* s = (const uint8_t*)str; for (i = 0; i != imax; i += 4) { /* * it's important to make these variables * it helps the optimizer to figure out what to do */ c1 = s[i], c2 = s[i + 1], c3 = s[i + 2], c4 = s[i + 3]; dest[0] = (char)gsToUpperMap[c1]; dest[1] = (char)gsToUpperMap[c2]; dest[2] = (char)gsToUpperMap[c3]; dest[3] = (char)gsToUpperMap[c4]; dest += 4; } switch (leftover) { case 3: *dest++ = (char)gsToUpperMap[s[i++]]; case 2: *dest++ = (char)gsToUpperMap[s[i++]]; case 1: *dest++ = (char)gsToUpperMap[s[i]]; case 0: *dest = '\0'; } } /** \brief toupper Versions 5 -- hsieh alternate * Based code from Paul Hsieh * http://www.azillionmonkeys.com/qed/asmexample.html * * This was his "improved" version, but it appears to either run just * as fast, or a bit slower than his original version */ static void toupper_copy5(char* dest, const char* str, size_t len) { size_t i; uint32_t eax, ebx, ecx, edx; const uint8_t* ustr = (const uint8_t*)str; const int leftover = len % 4; const size_t imax = len / 4; const uint32_t* s = (const uint32_t*)str; uint32_t* d = (uint32_t*)dest; for (i = 0; i != imax; ++i) { eax = s[i]; ebx = 0x80808080u | eax; ecx = ebx - 0x61616161u; edx = ~(ebx - 0x7b7b7b7bu); ebx = (ecx & edx) & (~eax & 0x80808080u); *d++ = eax - (ebx >> 2); } i = imax * 4; dest = (char*)d; switch (leftover) { case 3: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 2: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 1: *dest++ = (char)gsToUpperMap[ustr[i]]; case 0: *dest = '\0'; } } /** \brief ToUpper Version 6 -- Hsieh original, ASM style * Based code from Paul Hsieh * http://www.azillionmonkeys.com/qed/asmexample.html * * This is almost a direct port of the original ASM code, on some * platforms/compilers it does run faster then the "de-asm'ed" version * used in the modp library. * */ static void toupper_copy6(char* dest, const char* str, size_t len) { size_t i = 0; uint32_t eax, ebx, ecx, edx; const uint8_t* ustr = (const uint8_t*)str; const int leftover = len % 4; const size_t imax = len / 4; const uint32_t* s = (const uint32_t*)str; uint32_t* d = (uint32_t*)dest; for (i = 0; i != imax; ++i) { #if 1 /* * as close to original asm code as possible */ eax = s[i]; ebx = 0x7f7f7f7f; edx = 0x7f7f7f7f; ebx = ebx & eax; ebx = ebx + 0x05050505; ecx = eax; ecx = ~ecx; ebx = ebx & edx; ebx = ebx + 0x1a1a1a1a; ebx = ebx & ecx; ebx = ebx >> 2; ebx = ebx & 0x20202020; eax = eax - ebx; *d++ = eax; #else /* * "de-asm'ed" version, this is what is used in the modp library */ eax = s[i]; ebx = (0x7f7f7f7ful & eax) + 0x05050505ul; ebx = (0x7f7f7f7ful & ebx) + 0x1a1a1a1aul; ebx = ((ebx & ~eax) >> 2) & 0x20202020ul; *d++ = eax - ebx; #endif } i = imax * 4; dest = (char*)d; switch (leftover) { case 3: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 2: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 1: *dest++ = (char)gsToUpperMap[ustr[i]]; case 0: *dest = '\0'; } } static void modp_toupper_copy_a2(char* dest, const char* str, size_t len) { size_t i = 0; uint32_t eax, ebx; const uint8_t* ustr = (const uint8_t*)str; const size_t leftover = len % 4; const size_t imax = len / 4; const uint32_t* s = (const uint32_t*)str; uint32_t* d = (uint32_t*)dest; for (i = 0; i != imax; ++i) { eax = s[i]; ebx = (0x7f7f7f7fu & eax) + 0x05050505u; ebx = (0x7f7f7f7fu & ebx) + 0x1a1a1a1au; ebx = ((ebx & ~eax) >> 2) & 0x20202020u; *d++ = eax - ebx; } i = imax * 4; dest = (char*)d; switch (leftover) { case 3: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 2: *dest++ = (char)gsToUpperMap[ustr[i++]]; case 1: *dest++ = (char)gsToUpperMap[ustr[i]]; case 0: *dest = '\0'; } } int main(void) { double last = 0.0; size_t i = 0; char buf[256]; char obuf[300]; for (i = 0; i < 256; ++i) { buf[i] = (char)i; } uint32_t max = 1000000; clock_t t0, t1; printf("%s", "type\tclib\tdirect\tmap\tpara\thsieh1\thsieh2\tAlt\tFinal\timprovement\n"); printf("toupper\t"); fflush(stdout); /** ** V1 **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy1(obuf, buf, sizeof(buf)); } t1 = clock(); last = (double)(t1 - t0); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** V2 **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy2(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** V3 **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy3(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** V4 -- Parallel Table Lookup **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy4(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** V5 -- Hsieh Alternate **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy5(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** HSEIH -- asm style **/ t0 = clock(); for (i = 0; i < max; ++i) { toupper_copy6(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** MODP ALT **/ t0 = clock(); for (i = 0; i < max; ++i) { modp_toupper_copy_a2(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); /** ** MODP FINAL **/ t0 = clock(); for (i = 0; i < max; ++i) { modp_toupper_copy(obuf, buf, sizeof(buf)); } t1 = clock(); printf("%lu\t", (t1 - t0)); fflush(stdout); printf("%.1fx\n", last / ((double)(t1 - t0))); fflush(stdout); return 0; }