codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers #include <windows.h> #include <stdio.h> #include <tchar.h> #include <stdio.h> #include <tchar.h> #include <xmmintrin.h> #include <emmintrin.h> #include <malloc.h> #include <algorithm> #include <iostream> #include <limits> #include <numeric> static size_t alignment = 16; // 16 byte static size_t num_vertices = 1000000; static size_t num_loops = 1000; typedef __m128 sse_t; using namespace std; inline float random_float() { return static_cast< float >( rand() ) / static_cast< float >( RAND_MAX ); } // SOA - Structure of Arrays struct vertices { float* _x; float* _y; float* _z; }; __forceinline void dot_product4( const float* lhs_x , const float* lhs_y , const float* lhs_z , const float* rhs_x , const float* rhs_y , const float* rhs_z , float* dest ) { dest[0] = lhs_x[0] * rhs_x[0] + lhs_y[0] * rhs_y[0] + lhs_z[0] * rhs_z[0]; dest[1] = lhs_x[1] * rhs_x[1] + lhs_y[1] * rhs_y[1] + lhs_z[1] * rhs_z[1]; dest[2] = lhs_x[2] * rhs_x[2] + lhs_y[2] * rhs_y[2] + lhs_z[2] * rhs_z[2]; dest[3] = lhs_x[3] * rhs_x[3] + lhs_y[3] * rhs_y[3] + lhs_z[3] * rhs_z[3]; } __forceinline void dot_product4_sse( const sse_t* lhs_x , const sse_t* lhs_y , const sse_t* lhs_z , const sse_t* rhs_x , const sse_t* rhs_y , const sse_t* rhs_z , sse_t* dest ) { *dest = _mm_add_ps( _mm_add_ps( _mm_mul_ps( *lhs_x, *rhs_x ) , _mm_mul_ps( *lhs_y, *rhs_y ) ) , _mm_mul_ps( *lhs_z, *rhs_z ) ); } int _tmain(int argc, _TCHAR* argv[]) { cout.setf( std::ios_base::fixed, std::ios_base::floatfield ); vertices v_1, v_2; v_1._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); v_1._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); v_1._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); v_2._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); v_2._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); v_2._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); generate_n( v_1._x, num_vertices, random_float ); generate_n( v_1._y, num_vertices, random_float ); generate_n( v_1._z, num_vertices, random_float ); generate_n( v_2._x, num_vertices, random_float ); generate_n( v_2._y, num_vertices, random_float ); generate_n( v_2._z, num_vertices, random_float ); float* dest = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment )); fill( dest, dest + num_vertices, 0.f ); // Tests { // CPU fill( dest, dest + num_vertices, 0.f ); DWORD start = GetTickCount(); for( size_t k = 0; k < num_loops; ++k ) { for( size_t i = 0; i < num_vertices; i += 4 ) { dot_product4( &v_1._x[i] , &v_1._y[i] , &v_1._z[i] , &v_2._x[i] , &v_2._y[i] , &v_2._z[i] , &dest[i] ); } } DWORD end = GetTickCount(); cout << "CPU Test: " << end - start << " ms" << endl; cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl; } { // SSE fill( dest, dest + num_vertices, 0.f ); DWORD start = GetTickCount(); sse_t* lhs_x; sse_t* lhs_y; sse_t* lhs_z; sse_t* rhs_x; sse_t* rhs_y; sse_t* rhs_z; sse_t* d; for( size_t k = 0; k < num_loops; ++k ) { for( size_t i = 0; i < num_vertices; i += 4 ) { lhs_x = reinterpret_cast< sse_t* >( &v_1._x[i] ); lhs_y = reinterpret_cast< sse_t* >( &v_1._y[i] ); lhs_z = reinterpret_cast< sse_t* >( &v_1._z[i] ); rhs_x = reinterpret_cast< sse_t* >( &v_2._x[i] ); rhs_y = reinterpret_cast< sse_t* >( &v_2._y[i] ); rhs_z = reinterpret_cast< sse_t* >( &v_2._z[i] ); d = reinterpret_cast< sse_t* >( &dest[i] ); dot_product4_sse( lhs_x , lhs_y , lhs_z , rhs_x , rhs_y , rhs_z , d ); } } DWORD end = GetTickCount(); cout << "SSE Test: " << end - start << " ms" << endl; cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl; } _aligned_free( v_1._x ); _aligned_free( v_1._y ); _aligned_free( v_1._z ); _aligned_free( v_2._x ); _aligned_free( v_2._y ); _aligned_free( v_2._z ); return 0; }
Private
[
?
]
Run code