codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#include <cstdlib> #include <iostream> #include <ctime> #include <xmmintrin.h> #include <emmintrin.h> typedef __m128 sse_t; namespace { template <size_t i> struct afloat_array { typedef __declspec(align(16)) float type[i]; }; template <size_t len> struct Vector2 { typename afloat_array<len>::type a, b, c, d; }; const size_t size = 1000000; const size_t more = 1000; afloat_array<size>::type dots; Vector2<size> b, c; } int main () { // spread randomness to prevent folding actions srand (0); for (size_t i=0; i<size; ++i) { dots [i] = rand () / (RAND_MAX+1.0f); b.a [i]= rand () / (RAND_MAX+1.0f); b.b [i]= rand () / (RAND_MAX+1.0f); b.c [i]= rand () / (RAND_MAX+1.0f); b.d [i]= rand () / (RAND_MAX+1.0f); c.a [i]= rand () / (RAND_MAX+1.0f); c.b [i]= rand () / (RAND_MAX+1.0f); c.c [i]= rand () / (RAND_MAX+1.0f); c.d [i]= rand () / (RAND_MAX+1.0f); } const clock_t cpu_begin = clock(); // do some number crunching for (size_t m=0; m<more; ++m) { for (size_t i=0; i<size; ++i) { dots [i] = b.a[i]*c.a[i] + b.b[i]*c.b[i] + b.c[i]*c.c[i] + b.d[i]*c.d[i]; }} const clock_t cpu_end = clock(); const sse_t *lhs_a, *lhs_b, *lhs_c, *lhs_d; const sse_t *rhs_a, *rhs_b, *rhs_c, *rhs_d; sse_t* d; const clock_t sse_begin = clock(); // do some SSE number crunching for( size_t m=0; m<more; ++m ) { for( size_t i=0; i<size; i+=4 ) { lhs_a = reinterpret_cast< const sse_t* >( &b.a[i] ); lhs_b = reinterpret_cast< const sse_t* >( &b.b[i] ); lhs_c = reinterpret_cast< const sse_t* >( &b.c[i] ); lhs_d = reinterpret_cast< const sse_t* >( &b.d[i] ); rhs_a = reinterpret_cast< const sse_t* >( &c.a[i] ); rhs_b = reinterpret_cast< const sse_t* >( &c.b[i] ); rhs_c = reinterpret_cast< const sse_t* >( &c.c[i] ); rhs_d = reinterpret_cast< const sse_t* >( &c.c[i] ); d = reinterpret_cast< sse_t* >( &dots[i] ); *d = _mm_add_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( *lhs_a, *rhs_a ) , _mm_mul_ps( *lhs_b, *rhs_b ) ) , _mm_mul_ps( *lhs_c, *rhs_c ) ) , _mm_mul_ps( *lhs_d, *rhs_d ) ); } } const clock_t sse_end = clock(); std::cout << "\ntime for " << (long)more*size << " dp4's: " << (cpu_end-cpu_begin)/(float)(CLOCKS_PER_SEC) << '\n'; std::cout << "\ntime for " << (long)more*size << " dp4's: " << (sse_end-sse_begin)/(float)(CLOCKS_PER_SEC) << '\n'; }
Private
[
?
]
Run code