[ create a new paste ] login | about

Link: http://codepad.org/nBhb3C23    [ raw code | fork ]

C++, pasted on Nov 3:
#include <cstdlib>
#include <iostream>
#include <ctime>

#include <xmmintrin.h>
#include <emmintrin.h>

// Short alias for the SSE register type holding four packed single-precision floats.
typedef __m128 sse_t;

namespace {
        // Maps an element count `i` to a float array type that requests
        // 16-byte alignment, which the SSE code in main() relies on when it
        // reads and writes four floats at a time.
        template <size_t i> struct afloat_array {
#if defined(_MSC_VER)
                // MSVC spelling of the alignment request.
                typedef __declspec(align(16)) float type[i];
#else
                // GCC/Clang spelling; __declspec is not available there.
                typedef float type[i] __attribute__((aligned(16)));
#endif
        };

        // Four parallel aligned arrays (structure-of-arrays layout): element k
        // of a, b, c and d together form the k-th four-component vector.
        template <size_t len>
        struct Vector2 {
                typename afloat_array<len>::type a, b, c, d;
        };

        // Number of vectors processed per pass. The SSE loop in main() steps
        // by 4, so this must stay a multiple of 4.
        const size_t size = 1000000;
        // Number of passes over the data in each timed section.
        const size_t more = 1000;

        // Output buffer: one dot product per vector.
        afloat_array<size>::type dots;
        // The two operand sets whose element-wise dot products are computed.
        Vector2<size> b, c;
}

int main () {

        // spread randomness to prevent folding actions
        srand (0);
        for (size_t i=0; i<size; ++i) {
                dots [i] = rand () / (RAND_MAX+1.0f);
                b.a [i]= rand () / (RAND_MAX+1.0f);
                b.b [i]= rand () / (RAND_MAX+1.0f);
                b.c [i]= rand () / (RAND_MAX+1.0f);
                b.d [i]= rand () / (RAND_MAX+1.0f);
                c.a [i]= rand () / (RAND_MAX+1.0f);
                c.b [i]= rand () / (RAND_MAX+1.0f);
                c.c [i]= rand () / (RAND_MAX+1.0f);
                c.d [i]= rand () / (RAND_MAX+1.0f);
        }

        const clock_t cpu_begin = clock();

        // do some number crunching
        for (size_t m=0; m<more; ++m) {
        for (size_t i=0; i<size; ++i) {
                dots [i] = b.a[i]*c.a[i]
                         + b.b[i]*c.b[i]
                         + b.c[i]*c.c[i]
                         + b.d[i]*c.d[i];
        }}

        const clock_t cpu_end = clock();


        const sse_t *lhs_a, *lhs_b, *lhs_c, *lhs_d;
        const sse_t *rhs_a, *rhs_b, *rhs_c, *rhs_d;

        sse_t* d;

        const clock_t sse_begin = clock();

        // do some SSE number crunching
        for( size_t m=0; m<more; ++m )
        {
            for( size_t i=0; i<size; i+=4 )
            {
                lhs_a = reinterpret_cast< const sse_t* >( &b.a[i] );
                lhs_b = reinterpret_cast< const sse_t* >( &b.b[i] );
                lhs_c = reinterpret_cast< const sse_t* >( &b.c[i] );
                lhs_d = reinterpret_cast< const sse_t* >( &b.d[i] );

                rhs_a = reinterpret_cast< const sse_t* >( &c.a[i] );
                rhs_b = reinterpret_cast< const sse_t* >( &c.b[i] );
                rhs_c = reinterpret_cast< const sse_t* >( &c.c[i] );
                rhs_d = reinterpret_cast< const sse_t* >( &c.c[i] );

                d = reinterpret_cast< sse_t* >( &dots[i] );

                *d = _mm_add_ps( _mm_add_ps( _mm_add_ps( _mm_mul_ps( *lhs_a, *rhs_a )
                                                       , _mm_mul_ps( *lhs_b, *rhs_b )
                                           )
                                           , _mm_mul_ps( *lhs_c, *rhs_c )
                                           )
                               , _mm_mul_ps( *lhs_d, *rhs_d )
                               );
            }
        }

        const clock_t sse_end = clock();

        std::cout << "\ntime for " << (long)more*size << " dp4's: "
                  << (cpu_end-cpu_begin)/(float)(CLOCKS_PER_SEC) << '\n';

        std::cout << "\ntime for " << (long)more*size << " dp4's: "
                  << (sse_end-sse_begin)/(float)(CLOCKS_PER_SEC) << '\n';
}


Create a new paste based on this one


Comments: