/* Number of floats in each buffer: 32 * 1024 = 32768 elements (128 KiB). */
#define N (32 << 10)

/*
 * Input (a) and output (b) buffers shared by every kernel in this file.
 *
 * The SSE kernels access them with movaps, which raises a fault when the
 * memory operand is not 16-byte aligned.  A plain float array only
 * guarantees the alignment of float, so the requirement is spelled out
 * explicitly instead of relying on what the compiler/ABI happens to do.
 */
float a[N] __attribute__((aligned(16)));
float b[N] __attribute__((aligned(16)));
/*
 * Scalar reference kernel: b[i] = a[i] * a[i] for every i in [0, N).
 *
 * Declared void explicitly: the function never returned a value, and the
 * implicit-int form it used before is not valid C99 or later.
 */
void fs(void)
{
	int i;

	for (i = 0; i < N; i++) {
		b[i] = a[i] * a[i];
	}
}
/*
 * SSE kernel, one vector per iteration: squares four packed floats of a[]
 * at a time and stores them to the same position in b[].
 *
 * a and b must be 16-byte aligned because movaps faults on unaligned
 * addresses; the loop steps by 4 floats (16 bytes), so alignment of the
 * array base is sufficient.
 */
void fv_a(void)
{
	int i;

	for (i = 0; i < N; i += 4) {
		__asm__ volatile (
			"movaps (%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, (%1);"
			: /* no register outputs; b[] is written through %1 */
			: "r"(a + i), "r"(b + i)
			/*
			 * xmm0 is overwritten by the asm, so the compiler must be told
			 * not to keep a live value there.  "memory" covers the load
			 * from a[] and the store to b[], which are only passed as
			 * addresses.
			 */
			: "xmm0", "memory"
		);
	}
}
/*
 * SSE kernel, unrolled 8x: each iteration squares 32 floats (128 bytes) of
 * a[] into b[], reusing xmm0 for every load/multiply/store triple, and
 * issues a non-temporal prefetch hint 2048 bytes ahead of the current
 * input position.
 *
 * a and b must be 16-byte aligned because movaps faults on unaligned
 * addresses.  During the last iterations the prefetch address lies past
 * the end of a[]; prefetchnta is only a hint and performs no architectural
 * load, so this does not fault.
 */
void fv_b(void)
{
	int i;

	for (i = 0; i < N; i += 32) {
		__asm__ volatile (
			"prefetchnta 2048(%0);"
			"movaps 0(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 0(%1);"
			"movaps 16(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 16(%1);"
			"movaps 32(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 32(%1);"
			"movaps 48(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 48(%1);"
			"movaps 64(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 64(%1);"
			"movaps 80(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 80(%1);"
			"movaps 96(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 96(%1);"
			"movaps 112(%0), %%xmm0; mulps %%xmm0, %%xmm0; movaps %%xmm0, 112(%1);"
			: /* no register outputs; b[] is written through %1 */
			: "r"(a + i), "r"(b + i)
			/*
			 * xmm0 is overwritten by the asm and must be declared so the
			 * compiler does not keep a live value in it.  "memory" covers
			 * the accesses to a[] and b[] made through the pointers.
			 */
			: "xmm0", "memory"
		);
	}
}
/*
 * SSE kernel, unrolled 8x with separate registers: each iteration loads
 * 32 floats (128 bytes) of a[] into xmm0..xmm7, squares all eight
 * registers, then stores them to b[].  Grouping the loads, multiplies and
 * stores keeps the eight multiplies independent of one another.  A
 * non-temporal prefetch hint is issued 2048 bytes ahead of the current
 * input position.
 *
 * a and b must be 16-byte aligned because movaps faults on unaligned
 * addresses.  During the last iterations the prefetch address lies past
 * the end of a[]; prefetchnta is only a hint and performs no architectural
 * load, so this does not fault.
 */
void fv_c(void)
{
	int i;

	for (i = 0; i < N; i += 32) {
		__asm__ volatile (
			"prefetchnta 2048(%0);"
			/* load eight vectors */
			"movaps 0(%0), %%xmm0;"
			"movaps 16(%0), %%xmm1;"
			"movaps 32(%0), %%xmm2;"
			"movaps 48(%0), %%xmm3;"
			"movaps 64(%0), %%xmm4;"
			"movaps 80(%0), %%xmm5;"
			"movaps 96(%0), %%xmm6;"
			"movaps 112(%0), %%xmm7;"
			/* square each vector in place */
			"mulps %%xmm0, %%xmm0;"
			"mulps %%xmm1, %%xmm1;"
			"mulps %%xmm2, %%xmm2;"
			"mulps %%xmm3, %%xmm3;"
			"mulps %%xmm4, %%xmm4;"
			"mulps %%xmm5, %%xmm5;"
			"mulps %%xmm6, %%xmm6;"
			"mulps %%xmm7, %%xmm7;"
			/* store eight vectors */
			"movaps %%xmm0, 0(%1);"
			"movaps %%xmm1, 16(%1);"
			"movaps %%xmm2, 32(%1);"
			"movaps %%xmm3, 48(%1);"
			"movaps %%xmm4, 64(%1);"
			"movaps %%xmm5, 80(%1);"
			"movaps %%xmm6, 96(%1);"
			"movaps %%xmm7, 112(%1);"
			: /* no register outputs; b[] is written through %1 */
			: "r"(a + i), "r"(b + i)
			/*
			 * Every xmm register the asm overwrites must be declared so
			 * the compiler does not keep live values in them.  "memory"
			 * covers the accesses to a[] and b[] made through the pointers.
			 */
			: "xmm0", "xmm1", "xmm2", "xmm3",
			  "xmm4", "xmm5", "xmm6", "xmm7", "memory"
		);
	}
}
/* test */
#include <stdio.h>
#include <string.h>
/*
 * Read the CPU time-stamp counter.
 *
 * rdtsc returns the 64-bit counter split across EDX:EAX.  The two halves
 * are requested separately with the "a" and "d" constraints and combined
 * by hand: the "A" constraint only means the EDX:EAX pair on 32-bit x86,
 * while on x86-64 it selects a single register, which would drop half of
 * the value and leave the other register undeclared.
 */
static unsigned long long read_tsc(void)
{
	unsigned int lo, hi;

	__asm__ volatile ("rdtsc" : "=a"(lo), "=d"(hi));
	return ((unsigned long long)hi << 32) | lo;
}

/*
 * Benchmark driver.
 *
 * Fills a[] with 0.1 * i, runs every kernel once as a warm-up, then times
 * each kernel over 1000 rounds (clearing b[] before every timed call) and
 * prints the accumulated TSC ticks per kernel together with the speed-up
 * relative to the scalar version fs().
 *
 * Returns 0.
 */
int main(void)
{
	unsigned long long ts[4] = {0};	/* accumulated ticks: fs, fv_a, fv_b, fv_c */
	double cycles[4];
	int rounds = 1000;
	int i = N;

	while (i--) {
		a[i] = 0.1 * i;
	}

	/* Warm-up pass so the first timed round does not pay one-time costs. */
	fs();
	fv_a();
	fv_b();
	fv_c();

	while (rounds--) {
		unsigned long long t0;

		memset(b, 0, sizeof b);
		t0 = read_tsc();
		fs();
		ts[0] += read_tsc() - t0;

		memset(b, 0, sizeof b);
		t0 = read_tsc();
		fv_a();
		ts[1] += read_tsc() - t0;

		memset(b, 0, sizeof b);
		t0 = read_tsc();
		fv_b();
		ts[2] += read_tsc() - t0;

		memset(b, 0, sizeof b);
		t0 = read_tsc();
		fv_c();
		ts[3] += read_tsc() - t0;
	}

	/* Floating-point copies for the ratio column. */
	for (i = 0; i < 4; i++) {
		cycles[i] = (double)ts[i];
	}

	/*
	 * %16llu: the '+' flag has no meaning for an unsigned conversion and a
	 * ".0" precision would print nothing at all for a zero value, so both
	 * were dropped from the format.
	 */
	printf(
		"fs: %16llu clock \t\t(%f fs/fx)\n"
		"fv_a: %16llu clock \t\t(%f fs/fx)\n"
		"fv_b: %16llu clock \t\t(%f fs/fx)\n"
		"fv_c: %16llu clock \t\t(%f fs/fx)\n",
		ts[0], cycles[0] / cycles[0],
		ts[1], cycles[0] / cycles[1],
		ts[2], cycles[0] / cycles[2],
		ts[3], cycles[0] / cycles[3]
	);

	return 0;
}