#include <cstdlib>
#include <iostream>
#include <ctime>
#include <xmmintrin.h>
#include <emmintrin.h>
typedef __m128 sse_t;
namespace {

/// Provides `type`: an array of `i` floats whose storage is aligned to
/// 16 bytes, the alignment required by aligned SSE loads and stores.
template <size_t i> struct afloat_array {
#if defined(__GNUC__)
    // GCC/Clang (including MinGW) do not accept __declspec by default;
    // use their native attribute to request the same 16-byte alignment.
    typedef float type[i] __attribute__((aligned(16)));
#else
    // Original MSVC spelling.
    typedef __declspec(align(16)) float type[i];
#endif
};

/// Structure-of-arrays storage for `len` four-component vectors: component
/// k of vector n lives in a[n], b[n], c[n] and d[n] respectively.
template <size_t len>
struct Vector2 {
    typename afloat_array<len>::type a, b, c, d;
};

// Number of vectors in each data set. main() steps through the data four
// floats at a time without a tail loop, so keep this a multiple of 4.
const size_t size = 1000000;
// Number of times each benchmark pass over the data is repeated.
const size_t more = 1000;

// Output array: one dot product per vector pair.
afloat_array<size>::type dots;
// The two input vector sets whose element-wise dot products are computed.
Vector2<size> b, c;

}
int main () {
// spread randomness to prevent folding actions
srand (0);
for (size_t i=0; i<size; ++i) {
dots [i] = rand () / (RAND_MAX+1.0f);
b.a [i]= rand () / (RAND_MAX+1.0f);
b.b [i]= rand () / (RAND_MAX+1.0f);
b.c [i]= rand () / (RAND_MAX+1.0f);
b.d [i]= rand () / (RAND_MAX+1.0f);
c.a [i]= rand () / (RAND_MAX+1.0f);
c.b [i]= rand () / (RAND_MAX+1.0f);
c.c [i]= rand () / (RAND_MAX+1.0f);
c.d [i]= rand () / (RAND_MAX+1.0f);
}
const clock_t cpu_begin = clock();
// do some number crunching
for (size_t m=0; m<more; ++m) {
for (size_t i=0; i<size; ++i) {
dots [i] = b.a[i]*c.a[i]
+ b.b[i]*c.b[i]
+ b.c[i]*c.c[i]
+ b.d[i]*c.d[i];
}}
const clock_t cpu_end = clock();
const sse_t *lhs_a, *lhs_b, *lhs_c, *lhs_d;
const sse_t *rhs_a, *rhs_b, *rhs_c, *rhs_d;
sse_t* d;
const clock_t sse_begin = clock();
// do some SSE number crunching
for( size_t m=0; m<more; ++m )
{
for( size_t i=0; i<size; i+=4 )
{
lhs_a = reinterpret_cast< const sse_t* >( &b.a[i] );
lhs_b = reinterpret_cast< const sse_t* >( &b.b[i] );
lhs_c = reinterpret_cast< const sse_t* >( &b.c[i] );
lhs_d = reinterpret_cast< const sse_t* >( &b.d[i] );
rhs_a = reinterpret_cast< const sse_t* >( &c.a[i] );
rhs_b = reinterpret_cast< const sse_t* >( &c.b[i] );
rhs_c = reinterpret_cast< const sse_t* >( &c.c[i] );
rhs_d = reinterpret_cast< const sse_t* >( &c.c[i] );
d = reinterpret_cast< sse_t* >( &dots[i] );
*d = _mm_add_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( *lhs_a, *rhs_a )
, _mm_mul_ps( *lhs_b, *rhs_b )
)
, _mm_mul_ps( *lhs_c, *rhs_c )
)
, _mm_mul_ps( *lhs_d, *rhs_d )
);
}
}
const clock_t sse_end = clock();
std::cout << "\ntime for " << (long)more*size << " dp4's: "
<< (cpu_end-cpu_begin)/(float)(CLOCKS_PER_SEC) << '\n';
std::cout << "\ntime for " << (long)more*size << " dp4's: "
<< (sse_end-sse_begin)/(float)(CLOCKS_PER_SEC) << '\n';
}