4 in 1

うっかり glibc の strlen を覗いたら面白いことになっていたので、実験。

$ cat test.c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Reference implementation: count bytes one at a time until the
 * terminating NUL is found.  Returns the number of bytes before it.
 */
size_t strlen_naive(const char *str)
{
    size_t len = 0;

    for (;;) {
        if (str[len] == '\0')
            break;
        len++;
    }
    return len;
}

/*
 * Word-at-a-time strlen.
 *
 * After stepping byte by byte up to a word boundary, the string is scanned
 * one unsigned long at a time.  For a word w, (w - 0x01..01) & 0x80..80 is
 * non-zero whenever w contains a zero byte; it can also be non-zero for
 * bytes >= 0x81, so every hit is confirmed by checking the bytes of that
 * word individually before returning.
 *
 * Returns the number of bytes before the terminating NUL.
 *
 * NOTE: the word load may read bytes located after the terminator (never
 * beyond the aligned word that contains it) and accesses char data through
 * an unsigned long lvalue.  Both fall outside strict ISO C; they are the
 * technique this function exists to demonstrate.
 */
size_t strlen_fast(const char *str)
{
    const char *head = str;
    const unsigned long *wordp;
    /* 0x0101..01 and 0x8080..80, built for whatever width unsigned long has
     * (avoids shifting a 32-bit long by 32). */
    const unsigned long ones = ~0UL / 0xFFUL;
    const unsigned long highs = ones * 0x80UL;
    size_t i;

    /* Byte-wise prologue until str sits on a word boundary.  uintptr_t keeps
     * the pointer-to-integer conversion lossless on 64-bit targets. */
    while ((uintptr_t)str & (sizeof(unsigned long) - 1)) {
        if (!*str)
            return str - head;
        str++;
    }

    wordp = (const unsigned long *)str;
    for (;;) {
        if ((*wordp - ones) & highs) {
            /* Possible zero byte in this word: locate it, or fall through
             * to the next word if the hit was a false positive. */
            str = (const char *)wordp;
            for (i = 0; i < sizeof(unsigned long); i++) {
                if (!str[i])
                    return (size_t)(str - head) + i;
            }
        }
        wordp++;
    }
}

/*
 * Benchmark driver.  argv[1] selects the implementation to time:
 *   '0' = library strlen, '1' = strlen_naive, '2' = strlen_fast.
 * Each is called 9999 times on a string of 1024*1024 - 1 'A' bytes.
 * Returns EXIT_FAILURE when the selector is missing or unknown.
 */
int main(int argc, char *argv[])
{
    /* static: keeps the 1 MiB buffer off the stack. */
    static char buffer[1024 * 1024];
    int i;
    /* Sum of the returned lengths; size_t because the total (about 1e10)
     * does not fit in an int.  It is deliberately never read, so an
     * optimizing compiler is free to drop calls it knows to be pure. */
    size_t j = 0;

    if (argc < 2)
        return EXIT_FAILURE;

    memset(buffer, 'A', sizeof(buffer) - 1);
    buffer[sizeof(buffer) - 1] = 0;

    switch (*argv[1]) {
        case '0': for (i = 10000; --i; ) j += strlen      (buffer); break;
        case '1': for (i = 10000; --i; ) j += strlen_naive(buffer); break;
        case '2': for (i = 10000; --i; ) j += strlen_fast (buffer); break;
        default:  return EXIT_FAILURE;
    }
    return 0;
}
$ gcc -w test.c
$ time ./a.out 0
       21.04 real        21.02 user         0.00 sys
$ time ./a.out 1
       24.55 real        24.54 user         0.00 sys
$ time ./a.out 2
        5.81 real         5.81 user         0.00 sys

速っ。

ちなみに -O を付けると

$ gcc -w -O test.c
$ time ./a.out 0
        0.00 real         0.00 user         0.00 sys
$ time ./a.out 1
        4.42 real         4.42 user         0.00 sys
$ time ./a.out 2
        1.75 real         1.75 user         0.00 sys

組み込みの strlen が超速になってるのはもちろん最適化のせい（結果の j がどこにも使われておらず、gcc は組み込みの strlen に副作用がないことを知っているので、呼び出しごと消される）。ソースの頭に

/* pure (GCC function attribute): declares that the function has no observable
 * effect other than its return value, so the compiler may omit calls whose
 * result is never used. */
size_t strlen_naive(const char *str) __attribute__((pure));
size_t strlen_fast(const char *str) __attribute__((pure));

を付ければ

$ gcc -w -O test.c
$ time ./a.out 1
        0.00 real         0.00 user         0.00 sys
$ time ./a.out 2
        0.00 real         0.00 user         0.00 sys