#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers
#include <windows.h>
#include <stdio.h>
#include <tchar.h>
#include <stdio.h>
#include <tchar.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <malloc.h>
#include <algorithm>
#include <iostream>
#include <limits>
#include <numeric>
// Benchmark configuration. These values are fixed for the lifetime of the
// program, so they are declared const to keep them from being modified by
// accident.

// Byte alignment requested for every buffer. The SSE test accesses the buffers
// through sse_t pointers, which requires 16-byte aligned addresses.
static const size_t alignment = 16; // 16 byte

// Number of vertices in each data set. Both test loops advance four vertices
// per iteration, so this value must stay a multiple of 4.
static const size_t num_vertices = 1000000;

// Number of times each test repeats a full pass over all vertices.
static const size_t num_loops = 1000;

// One SSE register: four packed single-precision floats.
typedef __m128 sse_t;

using namespace std;
// Returns the next rand() value scaled into the range [0, 1].
// The sequence depends on the current srand() seed.
inline float random_float()
{
    const float sample = static_cast< float >( rand() );
    const float upper_bound = static_cast< float >( RAND_MAX );
    return sample / upper_bound;
}
// SOA - Structure of Arrays
// A set of 3D vertices stored as one separate float array per component, so
// that element i of _x, _y and _z together form vertex i.
// The struct has no constructor or destructor: the pointers start out
// uninitialized and the code that allocates the arrays must also free them.
struct vertices
{
float* _x; // x component of every vertex
float* _y; // y component of every vertex
float* _z; // z component of every vertex
};
// Scalar reference kernel: computes four independent 3-component dot products.
// For each lane k in 0..3:
//   dest[k] = lhs_x[k]*rhs_x[k] + lhs_y[k]*rhs_y[k] + lhs_z[k]*rhs_z[k]
// Every pointer must refer to at least four readable (dest: writable) floats.
__forceinline void dot_product4( const float* lhs_x
                               , const float* lhs_y
                               , const float* lhs_z
                               , const float* rhs_x
                               , const float* rhs_y
                               , const float* rhs_z
                               , float* dest
                               )
{
    // Lanes are processed in ascending order, one complete dot product at a time.
    for( int lane = 0; lane < 4; ++lane )
    {
        dest[lane] = lhs_x[lane] * rhs_x[lane]
                   + lhs_y[lane] * rhs_y[lane]
                   + lhs_z[lane] * rhs_z[lane];
    }
}
// SSE kernel: computes four 3-component dot products at once.
// Each input points at one sse_t holding the same component of four
// consecutive vertices; the four results are written to *dest as
//   (lhs_x*rhs_x + lhs_y*rhs_y) + lhs_z*rhs_z   (per lane).
// The pointers are dereferenced as sse_t, so they must be 16-byte aligned.
__forceinline void dot_product4_sse( const sse_t* lhs_x
                                   , const sse_t* lhs_y
                                   , const sse_t* lhs_z
                                   , const sse_t* rhs_x
                                   , const sse_t* rhs_y
                                   , const sse_t* rhs_z
                                   , sse_t* dest
                                   )
{
    // Per-component products for all four lanes.
    const sse_t prod_x = _mm_mul_ps( *lhs_x, *rhs_x );
    const sse_t prod_y = _mm_mul_ps( *lhs_y, *rhs_y );
    const sse_t prod_z = _mm_mul_ps( *lhs_z, *rhs_z );

    // Sum in the order (x + y) + z, then store all four results together.
    const sse_t sum_xy = _mm_add_ps( prod_x, prod_y );
    *dest = _mm_add_ps( sum_xy, prod_z );
}
int _tmain(int argc, _TCHAR* argv[])
{
cout.setf( std::ios_base::fixed, std::ios_base::floatfield );
vertices v_1, v_2;
v_1._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
v_1._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
v_1._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
v_2._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
v_2._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
v_2._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
generate_n( v_1._x, num_vertices, random_float );
generate_n( v_1._y, num_vertices, random_float );
generate_n( v_1._z, num_vertices, random_float );
generate_n( v_2._x, num_vertices, random_float );
generate_n( v_2._y, num_vertices, random_float );
generate_n( v_2._z, num_vertices, random_float );
float* dest = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
fill( dest, dest + num_vertices, 0.f );
// Tests
{
// CPU
fill( dest, dest + num_vertices, 0.f );
DWORD start = GetTickCount();
for( size_t k = 0; k < num_loops; ++k )
{
for( size_t i = 0; i < num_vertices; i += 4 )
{
dot_product4( &v_1._x[i]
, &v_1._y[i]
, &v_1._z[i]
, &v_2._x[i]
, &v_2._y[i]
, &v_2._z[i]
, &dest[i]
);
}
}
DWORD end = GetTickCount();
cout << "CPU Test: " << end - start << " ms" << endl;
cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl;
}
{
// SSE
fill( dest, dest + num_vertices, 0.f );
DWORD start = GetTickCount();
sse_t* lhs_x;
sse_t* lhs_y;
sse_t* lhs_z;
sse_t* rhs_x;
sse_t* rhs_y;
sse_t* rhs_z;
sse_t* d;
for( size_t k = 0; k < num_loops; ++k )
{
for( size_t i = 0; i < num_vertices; i += 4 )
{
lhs_x = reinterpret_cast< sse_t* >( &v_1._x[i] );
lhs_y = reinterpret_cast< sse_t* >( &v_1._y[i] );
lhs_z = reinterpret_cast< sse_t* >( &v_1._z[i] );
rhs_x = reinterpret_cast< sse_t* >( &v_2._x[i] );
rhs_y = reinterpret_cast< sse_t* >( &v_2._y[i] );
rhs_z = reinterpret_cast< sse_t* >( &v_2._z[i] );
d = reinterpret_cast< sse_t* >( &dest[i] );
dot_product4_sse( lhs_x
, lhs_y
, lhs_z
, rhs_x
, rhs_y
, rhs_z
, d
);
}
}
DWORD end = GetTickCount();
cout << "SSE Test: " << end - start << " ms" << endl;
cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl;
}
_aligned_free( v_1._x );
_aligned_free( v_1._y );
_aligned_free( v_1._z );
_aligned_free( v_2._x );
_aligned_free( v_2._y );
_aligned_free( v_2._z );
return 0;
}