[ create a new paste ] login | about

Link: http://codepad.org/r9u82dOh    [ raw code | fork ]

C++, pasted on Oct 16:
#include <mmintrin.h>
#include <vector>
#include <string>
#include <chrono>
#include <iostream>
#include <random>
#include <algorithm>
#include <boost/align/aligned_allocator.hpp>

using namespace std;
using boost::alignment::aligned_allocator;

struct float2 {
    float x, y;
};

struct float3 {
    float x, y, z;

    friend float3 operator-( const float3 & lhs, const float3 & rhs ) {
        return float3{ lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z };
    }
    friend float3 operator+( const float3 & lhs, const float3 & rhs ) {
        return float3{ lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z };
    }
    friend float3 operator*( const float3 & lhs, float rhs ) {
        return float3{ lhs.x * rhs, lhs.y * rhs, lhs.z * rhs };
    }
};

struct float4 {
    float x, y, z, w;
};

struct particle_t {
    float3 position;
    float3 velocity;
    float3 acceleration;
    float2 size;
    float4 color;
    float energy;
    bool alive;
};

uniform_real_distribution<float> pdist( -10.f, 10.f );
uniform_real_distribution<float> vxzdist( -5.f, 5.f );
uniform_real_distribution<float> vydist( 4.f, 20.f );
uniform_real_distribution<float> adist( -1.f, 1.f );
uniform_real_distribution<float> sdist( 0.1f, 5.f );
uniform_real_distribution<float> cdist(  0.f, 1.f );
uniform_real_distribution<float> edist( 10.f, 1000.f );

constexpr float gravity = -9.8f;
constexpr float dt = 1.0f;

struct aos_emitter_t {
    vector<particle_t> particles;

    void generate( size_t n, mt19937 & rng ) {
        particles.resize( n );


        for ( size_t i = 0; i < n; ++i ) {
            particles[i].position = float3{ pdist( rng ), pdist( rng ), pdist( rng ) };
            particles[i].velocity = float3{ vxzdist( rng ), vydist( rng ), vxzdist( rng ) };
            particles[i].acceleration = float3{ adist( rng ), adist( rng ), adist( rng ) };
            particles[i].size = float2{ sdist( rng ), sdist( rng ) };
            particles[i].color = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) };
            particles[i].energy = edist( rng );
            particles[i].alive = true;
        }
    }

    void update() {
        for ( auto & p : particles ) {
            auto & v = p.velocity;
            v = v + (p.acceleration * dt);
            v.y += gravity * dt;

            p.position = p.position + (v * dt);
            p.energy -= dt;
            if ( p.energy <= 0 ) {
                p.alive = false;
            }
        }
    }
};

struct soa_emitter_t {
    vector<float3> position;
    vector<float3> velocity;
    vector<float3> acceleration;
    vector<float2> size;
    vector<float4> color;
    vector<float>  energy;
    vector<char>   alive;

    void generate( size_t n, mt19937 & rng ) {
        position.resize( n );
        velocity.resize( n );
        acceleration.resize( n );
        size.resize( n );
        color.resize( n );
        energy.resize( n );
        alive.resize( n, true );

        for ( size_t i = 0; i < n; ++i ) {
            position[i] = float3{ pdist( rng ), pdist( rng ), pdist( rng ) };
            velocity[i] = float3{ vxzdist( rng ), vydist( rng ), vxzdist( rng ) };
            acceleration[i] = float3{ adist( rng ), adist( rng ), adist( rng ) };
            size[i] = float2{ sdist( rng ), sdist( rng ) };
            color[i] = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) };
            energy[i] = edist( rng );
        }
    }

    void update() {

        size_t n = position.size();
        for ( size_t i = 0; i < n; ++i ) {
            auto & v = velocity[i];
            v = v + (acceleration[i] * dt);
            v.y += gravity * dt;

            position[i] = position[i] + (v * dt);
            energy[i] -= dt;
            if ( energy[i] <= 0 ) {
                alive[i] = false;
            }
        }
    }
};


template< typename T >
using sse_vector = vector<T, aligned_allocator<T,16> >;

struct soa_emitter_sse_t {
    sse_vector<float> position_x;
    sse_vector<float> position_y;
    sse_vector<float> position_z;
    sse_vector<float> velocity_x;
    sse_vector<float> velocity_y;
    sse_vector<float> velocity_z;
    sse_vector<float> acceleration_x;
    sse_vector<float> acceleration_y;
    sse_vector<float> acceleration_z;
    vector<float2> size;
    vector<float4> color;
    sse_vector<float>  energy;
    vector<char>   alive;

    void generate( size_t n, mt19937 & rng ) {
        position_x.resize( n );
        position_y.resize( n );
        position_z.resize( n );
        velocity_x.resize( n );
        velocity_y.resize( n );
        velocity_z.resize( n );
        acceleration_x.resize( n );
        acceleration_y.resize( n );
        acceleration_z.resize( n );
        size.resize( n );
        color.resize( n );
        energy.resize( n );
        alive.resize( n, true );

        for ( size_t i = 0; i < n; ++i ) {
            position_x[i] = pdist( rng );
            position_y[i] = pdist( rng );
            position_z[i] = pdist( rng );
            velocity_x[i] = vxzdist( rng );
            velocity_y[i] = vxzdist( rng );
            velocity_z[i] = vxzdist( rng );
            acceleration_x[i] = adist( rng );
            acceleration_y[i] = adist( rng );
            acceleration_z[i] = adist( rng );
            size[i] = float2{ sdist( rng ), sdist( rng ) };
            color[i] = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) };
            energy[i] = edist( rng );
        }
    }
    void update() {
        size_t n = position_x.size();

        __m128 vx, vy, vz;
        __m128 t = _mm_set1_ps( dt );
        __m128 g = _mm_set1_ps( gravity * dt );
        __m128 zero = _mm_setzero_ps();
        for ( size_t i = 0; i < n; i += 4 ) {
            vx = _mm_add_ps( _mm_load_ps( velocity_x.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_x.data()+i ) ) );
            vy = _mm_add_ps( _mm_load_ps( velocity_y.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_y.data()+i ) ) );
            vz = _mm_add_ps( _mm_load_ps( velocity_z.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_z.data()+i ) ) );

            vy = _mm_add_ps( vy, g );

            _mm_store_ps( position_x.data()+i, _mm_add_ps(_mm_load_ps(position_x.data()+i), _mm_mul_ps(vx, t)) );
            _mm_store_ps( position_y.data()+i, _mm_add_ps(_mm_load_ps(position_y.data()+i), _mm_mul_ps(vy, t)) );
            _mm_store_ps( position_z.data()+i, _mm_add_ps(_mm_load_ps(position_z.data()+i), _mm_mul_ps(vz, t)) );

            _mm_store_ps( velocity_x.data()+i, vx );
            _mm_store_ps( velocity_y.data()+i, vy );
            _mm_store_ps( velocity_z.data()+i, vz );

            _mm_store_ps( energy.data()+i, _mm_sub_ps(_mm_load_ps(energy.data()+i), t) );

            auto a = _mm_movemask_ps( _mm_cmple_ps( _mm_load_ps( energy.data() + i ), zero ) );
            for ( int j = 0; j < 4; ++j ) {
                alive[i+j] = (a & (1 << j));
            }
        }
    }
};

template< typename emitter_t >
chrono::duration<double> run_test() {
    using clock_t = chrono::high_resolution_clock;

    constexpr size_t count = 250'000 * 4;
    constexpr size_t frames = 1'000;

    const auto seed_val = mt19937::default_seed;
    mt19937 rng( seed_val );

    emitter_t emitter;
    emitter.generate( count, rng );

    auto start = clock_t::now();

    for ( size_t i = 0; i < frames; ++i ) {
        emitter.update();
    }

    auto finish = clock_t::now();
    return finish - start;
}

int main() {
    auto aos_duration = run_test<aos_emitter_t>();
    auto soa_duration = run_test<soa_emitter_t>();
    auto soa_sse_duration = run_test<soa_emitter_sse_t>();

    cout << "AoS in " << aos_duration.count() << " seconds" << std::endl;
    cout << "SoA in " << soa_duration.count() << " seconds" << std::endl;
    cout << "SoA SSE in " << soa_sse_duration.count() << " seconds" << std::endl;

    return 0;
}


Create a new paste based on this one


Comments: