[ create a new paste ] login | about

Link: http://codepad.org/Ot3oQyRV    [ raw code | output | fork ]

C, pasted on Sep 8:
/* Number of floats in each buffer (32 Ki elements). */
#define N (32 << 10)

/*
 * Input (a) and output (b) buffers shared by all kernels.
 * The SSE kernels access them with movaps, which requires 16-byte aligned
 * memory operands, so the alignment is requested explicitly instead of
 * relying on whatever the toolchain happens to pick for a float array.
 */
float a[N] __attribute__((aligned(16)));
float b[N] __attribute__((aligned(16)));

/*
 * Scalar reference kernel: stores the square of every element of a[] into
 * the matching slot of b[].
 */
void fs(void)
{
        int i;

        for (i = 0; i < N; i++)
                b[i] = a[i] * a[i];
}

/*
 * SSE kernel, one vector per loop iteration: loads four floats from a[] with
 * movaps, squares them with mulps and stores the result to b[].
 *
 * Each iteration touches 16 bytes at a + i and b + i, so N must be a multiple
 * of 4 and both arrays must be 16-byte aligned (movaps requirement).
 */
void fv_a(void)
{
        int i;

        for (i = 0; i < N; i += 4) {
                __asm__ volatile (
                        "movaps (%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, (%1);"
                        :
                        : "r"(a + i), "r"(b + i)
                        /* xmm0 is overwritten; declare it so the compiler
                         * never keeps a live value there across the asm. */
                        : "memory", "xmm0"
                );
        }
}

/*
 * SSE kernel, unrolled eight times: every loop iteration squares 32 floats
 * (128 bytes) of a[] into b[], reusing xmm0 for each 16-byte vector, and
 * issues a non-temporal prefetch hint for the address 2048 bytes past the
 * current read position.
 *
 * N must be a multiple of 32 and both arrays must be 16-byte aligned
 * (movaps requirement). Near the end of the array the prefetch address lies
 * beyond a[]; prefetch instructions are hints and are not expected to fault.
 */
void fv_b(void)
{
        int i;

        for (i = 0; i < N; i += 32) {
                __asm__ volatile (
                        "prefetchnta 2048(%0);"

                        "movaps   0(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,   0(%1);"
                        "movaps  16(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  16(%1);"
                        "movaps  32(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  32(%1);"
                        "movaps  48(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  48(%1);"
                        "movaps  64(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  64(%1);"
                        "movaps  80(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  80(%1);"
                        "movaps  96(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0,  96(%1);"
                        "movaps 112(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 112(%1);"
                        :
                        : "r"(a + i), "r"(b + i)
                        /* xmm0 is overwritten; declare it so the compiler
                         * never keeps a live value there across the asm. */
                        : "memory", "xmm0"
                );
        }
}

/*
 * SSE kernel, unrolled eight times with the work grouped by phase: per loop
 * iteration it first loads eight vectors (32 floats) of a[] into xmm0..xmm7,
 * then squares all eight registers, then stores all eight to b[]. A
 * non-temporal prefetch hint is issued for the address 2048 bytes past the
 * current read position.
 *
 * N must be a multiple of 32 and both arrays must be 16-byte aligned
 * (movaps requirement). Near the end of the array the prefetch address lies
 * beyond a[]; prefetch instructions are hints and are not expected to fault.
 */
void fv_c(void)
{
        int i;

        for (i = 0; i < N; i += 32) {
                __asm__ volatile (
                        "prefetchnta 2048(%0);"

                        /* load phase */
                        "movaps   0(%0), %%xmm0;"
                        "movaps  16(%0), %%xmm1;"
                        "movaps  32(%0), %%xmm2;"
                        "movaps  48(%0), %%xmm3;"
                        "movaps  64(%0), %%xmm4;"
                        "movaps  80(%0), %%xmm5;"
                        "movaps  96(%0), %%xmm6;"
                        "movaps 112(%0), %%xmm7;"

                        /* square phase */
                        "mulps %%xmm0, %%xmm0;"
                        "mulps %%xmm1, %%xmm1;"
                        "mulps %%xmm2, %%xmm2;"
                        "mulps %%xmm3, %%xmm3;"
                        "mulps %%xmm4, %%xmm4;"
                        "mulps %%xmm5, %%xmm5;"
                        "mulps %%xmm6, %%xmm6;"
                        "mulps %%xmm7, %%xmm7;"

                        /* store phase */
                        "movaps %%xmm0,   0(%1);"
                        "movaps %%xmm1,  16(%1);"
                        "movaps %%xmm2,  32(%1);"
                        "movaps %%xmm3,  48(%1);"
                        "movaps %%xmm4,  64(%1);"
                        "movaps %%xmm5,  80(%1);"
                        "movaps %%xmm6,  96(%1);"
                        "movaps %%xmm7, 112(%1);"
                        :
                        : "r"(a + i), "r"(b + i)
                        /* every register written above must be declared, or
                         * the compiler may keep live values in them. */
                        : "memory",
                          "xmm0", "xmm1", "xmm2", "xmm3",
                          "xmm4", "xmm5", "xmm6", "xmm7"
                );
        }
}



/* test */

#include <stdio.h>
#include <string.h>

/*
 * Returns the current value of the CPU time-stamp counter.
 *
 * rdtsc delivers the counter in edx:eax. The two halves are read through
 * separate "=a"/"=d" outputs because the "A" constraint only denotes the
 * edx:eax pair on 32-bit x86; on x86-64 it would pick a single register.
 */
static unsigned long long read_tsc(void)
{
        unsigned int lo, hi;

        __asm__ volatile ("rdtsc" : "=a"(lo), "=d"(hi));
        return ((unsigned long long)hi << 32) | lo;
}

/*
 * Benchmark driver: fills a[] with test data, runs every kernel once as a
 * warm-up, then times 1000 rounds of each kernel and prints the accumulated
 * cycle counts together with the speed of each kernel relative to fs().
 *
 * Returns 0.
 */
int main(void)
{
        enum { ROUNDS = 1000 };
        unsigned long long ts[4] = {0};  /* accumulated ticks per kernel */
        double total[4];
        int i;

        for (i = 0; i < N; i++)
                a[i] = 0.1 * i;

        /* warm-up pass, not timed */
        fs();
        fv_a();
        fv_b();
        fv_c();

        for (i = 0; i < ROUNDS; i++) {
                unsigned long long t0;

                /* b[] is cleared before every measurement so each kernel
                 * starts from the same output-buffer contents. */
                memset(b, 0, sizeof b);
                t0 = read_tsc();
                fs();
                ts[0] += read_tsc() - t0;

                memset(b, 0, sizeof b);
                t0 = read_tsc();
                fv_a();
                ts[1] += read_tsc() - t0;

                memset(b, 0, sizeof b);
                t0 = read_tsc();
                fv_b();
                ts[2] += read_tsc() - t0;

                memset(b, 0, sizeof b);
                t0 = read_tsc();
                fv_c();
                ts[3] += read_tsc() - t0;
        }

        for (i = 0; i < 4; i++)
                total[i] = (double)ts[i];

        /* The ratio column is fs time divided by the kernel's time, so a
         * value above 1 means the kernel was faster than the scalar loop. */
        printf(
                "fs:   %16llu clock \t\t(%f fs/fx)\n"
                "fv_a: %16llu clock \t\t(%f fs/fx)\n"
                "fv_b: %16llu clock \t\t(%f fs/fx)\n"
                "fv_c: %16llu clock \t\t(%f fs/fx)\n",
                ts[0], total[0] / total[0],
                ts[1], total[0] / total[1],
                ts[2], total[0] / total[2],
                ts[3], total[0] / total[3]
        );

        return 0;
}


Output:
1
2
3
4
5
6
fs:          101783872 clock 		(1.000000 fs/fx)
fv_a:        232093920 clock 		(0.438546 fs/fx)
fv_b:        159881373 clock 		(0.636621 fs/fx)
fv_c:         23414852 clock 		(4.346979 fs/fx)

Exited: ExitFailure 192


Create a new paste based on this one


Comments: