[ create a new paste ] login | about

Link: http://codepad.org/plz5aEW6    [ raw code | fork ]

C++, pasted on Oct 20:
#define WIN32_LEAN_AND_MEAN		// Exclude rarely-used stuff from Windows headers
#include <windows.h>

#include <stdio.h>
#include <tchar.h>

#include <stdio.h>
#include <tchar.h>

#include <xmmintrin.h>
#include <emmintrin.h>

#include <malloc.h>

#include <algorithm>
#include <iostream>
#include <limits>
#include <numeric>

// Byte alignment passed to every _aligned_malloc call below. 16 bytes is the
// size of one __m128, which the SSE test dereferences directly.
static size_t alignment = 16; // 16 byte
// Number of floats in each component array; also the bound of the inner
// benchmark loops, which advance four elements at a time.
static size_t num_vertices = 1000000;
// Number of times each benchmark repeats the full pass over the arrays.
static size_t num_loops = 1000;

// Shorthand for the SSE register type holding four packed floats.
typedef __m128 sse_t;

// The code below uses cout, endl, generate_n, fill and accumulate unqualified.
using namespace std;

// Returns the next rand() sample scaled by RAND_MAX, i.e. a float in [0, 1].
// The sequence depends on the current rand() seed.
inline float random_float()
{
    const float sample = static_cast< float >( rand() );
    const float range  = static_cast< float >( RAND_MAX );

    return sample / range;
}

// SOA - Structure of Arrays.
// One array per coordinate instead of one struct per vertex: vertex i is
// ( _x[i], _y[i], _z[i] ). The struct only stores the three pointers; the
// code that fills it in is responsible for allocating and freeing them.
struct vertices
{
    float *_x;  // x coordinates of all vertices
    float *_y;  // y coordinates of all vertices
    float *_z;  // z coordinates of all vertices
};

// Scalar kernel: computes four 3-component dot products at once.
//
// Each lhs_*/rhs_* pointer addresses at least four consecutive floats holding
// one coordinate of four vertices. For lane k in [0, 4):
//     dest[k] = lhs_x[k]*rhs_x[k] + lhs_y[k]*rhs_y[k] + lhs_z[k]*rhs_z[k]
// Lanes are written in ascending order.
__forceinline void dot_product4( const float* lhs_x
                               , const float* lhs_y
                               , const float* lhs_z
                               , const float* rhs_x
                               , const float* rhs_y
                               , const float* rhs_z
                               , float*       dest
                               )
{
    for( int lane = 0; lane < 4; ++lane )
    {
        dest[lane] = lhs_x[lane] * rhs_x[lane]
                   + lhs_y[lane] * rhs_y[lane]
                   + lhs_z[lane] * rhs_z[lane];
    }
}

// SSE kernel: computes four 3-component dot products with packed arithmetic.
//
// Every argument points at one sse_t (four packed floats) holding a single
// coordinate of four vertices. The three per-coordinate products are summed as
// (x + y) + z and stored to *dest. The pointers are dereferenced directly, so
// they must satisfy the alignment the compiler requires for sse_t.
__forceinline void dot_product4_sse( const sse_t* lhs_x
                                   , const sse_t* lhs_y
                                   , const sse_t* lhs_z
                                   , const sse_t* rhs_x
                                   , const sse_t* rhs_y
                                   , const sse_t* rhs_z
                                   , sse_t*       dest
                                   )
{
    const sse_t prod_x = _mm_mul_ps( *lhs_x, *rhs_x );
    const sse_t prod_y = _mm_mul_ps( *lhs_y, *rhs_y );
    const sse_t prod_z = _mm_mul_ps( *lhs_z, *rhs_z );

    const sse_t sum_xy = _mm_add_ps( prod_x, prod_y );

    *dest = _mm_add_ps( sum_xy, prod_z );
}


int _tmain(int argc, _TCHAR* argv[])
{
    cout.setf( std::ios_base::fixed, std::ios_base::floatfield );

    vertices v_1, v_2;

    v_1._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    v_1._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    v_1._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    v_2._x = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    v_2._y = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    v_2._z = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));

    generate_n( v_1._x, num_vertices, random_float );
    generate_n( v_1._y, num_vertices, random_float );
    generate_n( v_1._z, num_vertices, random_float );

    generate_n( v_2._x, num_vertices, random_float );
    generate_n( v_2._y, num_vertices, random_float );
    generate_n( v_2._z, num_vertices, random_float );

    float* dest = reinterpret_cast< float* >( _aligned_malloc( sizeof( float ) * num_vertices, alignment ));
    fill( dest, dest + num_vertices, 0.f );

    // Tests
    {
        // CPU
        fill( dest, dest + num_vertices, 0.f );

        DWORD start = GetTickCount();

        for( size_t k = 0; k < num_loops; ++k  )
        {
            for( size_t i = 0; i < num_vertices; i += 4 )
            {
                dot_product4( &v_1._x[i]
                            , &v_1._y[i]
                            , &v_1._z[i]
                            , &v_2._x[i]
                            , &v_2._y[i]
                            , &v_2._z[i]
                            , &dest[i]
                            );
            }
        }

        DWORD end = GetTickCount();

        cout << "CPU Test: " << end  - start << " ms" << endl;
        cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl;
    }

    {
        // SSE
        fill( dest, dest + num_vertices, 0.f );

        DWORD start = GetTickCount();

        sse_t* lhs_x;
        sse_t* lhs_y;
        sse_t* lhs_z;

        sse_t* rhs_x;
        sse_t* rhs_y;
        sse_t* rhs_z;

        sse_t* d;

        for( size_t k = 0; k < num_loops; ++k  )
        {
            for( size_t i = 0; i < num_vertices; i += 4 )
            {
                lhs_x = reinterpret_cast< sse_t* >( &v_1._x[i] );
                lhs_y = reinterpret_cast< sse_t* >( &v_1._y[i] );
                lhs_z = reinterpret_cast< sse_t* >( &v_1._z[i] );

                rhs_x = reinterpret_cast< sse_t* >( &v_2._x[i] );
                rhs_y = reinterpret_cast< sse_t* >( &v_2._y[i] );
                rhs_z = reinterpret_cast< sse_t* >( &v_2._z[i] );

                d = reinterpret_cast< sse_t* >( &dest[i] );

                dot_product4_sse( lhs_x
                                , lhs_y
                                , lhs_z
                                , rhs_x
                                , rhs_y
                                , rhs_z
                                , d
                                );
            }
        }

        DWORD end = GetTickCount();

        cout << "SSE Test: " << end  - start << " ms" << endl;
        cout << "Sum of results: " << accumulate( dest, dest + num_vertices, 0.f ) << endl;
    }

   _aligned_free( v_1._x );
   _aligned_free( v_1._y );
   _aligned_free( v_1._z );

   _aligned_free( v_2._x );
   _aligned_free( v_2._y );
   _aligned_free( v_2._z );

   return 0;
}


Create a new paste based on this one


Comments: