codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#include <mmintrin.h> #include <vector> #include <string> #include <chrono> #include <iostream> #include <random> #include <algorithm> #include <boost/align/aligned_allocator.hpp> using namespace std; using boost::alignment::aligned_allocator; struct float2 { float x, y; }; struct float3 { float x, y, z; friend float3 operator-( const float3 & lhs, const float3 & rhs ) { return float3{ lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z }; } friend float3 operator+( const float3 & lhs, const float3 & rhs ) { return float3{ lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z }; } friend float3 operator*( const float3 & lhs, float rhs ) { return float3{ lhs.x * rhs, lhs.y * rhs, lhs.z * rhs }; } }; struct float4 { float x, y, z, w; }; struct particle_t { float3 position; float3 velocity; float3 acceleration; float2 size; float4 color; float energy; bool alive; }; uniform_real_distribution<float> pdist( -10.f, 10.f ); uniform_real_distribution<float> vxzdist( -5.f, 5.f ); uniform_real_distribution<float> vydist( 4.f, 20.f ); uniform_real_distribution<float> adist( -1.f, 1.f ); uniform_real_distribution<float> sdist( 0.1f, 5.f ); uniform_real_distribution<float> cdist( 0.f, 1.f ); uniform_real_distribution<float> edist( 10.f, 1000.f ); constexpr float gravity = -9.8f; constexpr float dt = 1.0f; struct aos_emitter_t { vector<particle_t> particles; void generate( size_t n, mt19937 & rng ) { particles.resize( n ); for ( size_t i = 0; i < n; ++i ) { particles[i].position = float3{ pdist( rng ), pdist( rng ), pdist( rng ) }; particles[i].velocity = float3{ vxzdist( rng ), vydist( rng ), vxzdist( rng ) }; particles[i].acceleration = float3{ adist( rng ), adist( rng ), adist( rng ) }; particles[i].size = float2{ sdist( rng ), sdist( rng ) }; particles[i].color = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) }; particles[i].energy = edist( rng ); particles[i].alive = true; } } void update() { for ( auto & p : particles ) { auto & v = p.velocity; v = v + (p.acceleration * dt); v.y += gravity * dt; p.position = p.position + (v * dt); p.energy -= dt; if ( p.energy <= 0 ) { p.alive = false; } } } }; struct soa_emitter_t { vector<float3> position; vector<float3> velocity; vector<float3> acceleration; vector<float2> size; vector<float4> color; vector<float> energy; vector<char> alive; void generate( size_t n, mt19937 & rng ) { position.resize( n ); velocity.resize( n ); acceleration.resize( n ); size.resize( n ); color.resize( n ); energy.resize( n ); alive.resize( n, true ); for ( size_t i = 0; i < n; ++i ) { position[i] = float3{ pdist( rng ), pdist( rng ), pdist( rng ) }; velocity[i] = float3{ vxzdist( rng ), vydist( rng ), vxzdist( rng ) }; acceleration[i] = float3{ adist( rng ), adist( rng ), adist( rng ) }; size[i] = float2{ sdist( rng ), sdist( rng ) }; color[i] = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) }; energy[i] = edist( rng ); } } void update() { size_t n = position.size(); for ( size_t i = 0; i < n; ++i ) { auto & v = velocity[i]; v = v + (acceleration[i] * dt); v.y += gravity * dt; position[i] = position[i] + (v * dt); energy[i] -= dt; if ( energy[i] <= 0 ) { alive[i] = false; } } } }; template< typename T > using sse_vector = vector<T, aligned_allocator<T,16> >; struct soa_emitter_sse_t { sse_vector<float> position_x; sse_vector<float> position_y; sse_vector<float> position_z; sse_vector<float> velocity_x; sse_vector<float> velocity_y; sse_vector<float> velocity_z; sse_vector<float> acceleration_x; sse_vector<float> acceleration_y; sse_vector<float> acceleration_z; vector<float2> size; vector<float4> color; sse_vector<float> energy; vector<char> alive; void generate( size_t n, mt19937 & rng ) { position_x.resize( n ); position_y.resize( n ); position_z.resize( n ); velocity_x.resize( n ); velocity_y.resize( n ); velocity_z.resize( n ); acceleration_x.resize( n ); acceleration_y.resize( n ); acceleration_z.resize( n ); size.resize( n ); color.resize( n ); energy.resize( n ); alive.resize( n, true ); for ( size_t i = 0; i < n; ++i ) { position_x[i] = pdist( rng ); position_y[i] = pdist( rng ); position_z[i] = pdist( rng ); velocity_x[i] = vxzdist( rng ); velocity_y[i] = vxzdist( rng ); velocity_z[i] = vxzdist( rng ); acceleration_x[i] = adist( rng ); acceleration_y[i] = adist( rng ); acceleration_z[i] = adist( rng ); size[i] = float2{ sdist( rng ), sdist( rng ) }; color[i] = float4{ cdist( rng ), cdist( rng ), cdist( rng ), cdist( rng ) }; energy[i] = edist( rng ); } } void update() { size_t n = position_x.size(); __m128 vx, vy, vz; __m128 t = _mm_set1_ps( dt ); __m128 g = _mm_set1_ps( gravity * dt ); __m128 zero = _mm_setzero_ps(); for ( size_t i = 0; i < n; i += 4 ) { vx = _mm_add_ps( _mm_load_ps( velocity_x.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_x.data()+i ) ) ); vy = _mm_add_ps( _mm_load_ps( velocity_y.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_y.data()+i ) ) ); vz = _mm_add_ps( _mm_load_ps( velocity_z.data()+i ), _mm_mul_ps( t, _mm_load_ps( acceleration_z.data()+i ) ) ); vy = _mm_add_ps( vy, g ); _mm_store_ps( position_x.data()+i, _mm_add_ps(_mm_load_ps(position_x.data()+i), _mm_mul_ps(vx, t)) ); _mm_store_ps( position_y.data()+i, _mm_add_ps(_mm_load_ps(position_y.data()+i), _mm_mul_ps(vy, t)) ); _mm_store_ps( position_z.data()+i, _mm_add_ps(_mm_load_ps(position_z.data()+i), _mm_mul_ps(vz, t)) ); _mm_store_ps( velocity_x.data()+i, vx ); _mm_store_ps( velocity_y.data()+i, vy ); _mm_store_ps( velocity_z.data()+i, vz ); _mm_store_ps( energy.data()+i, _mm_sub_ps(_mm_load_ps(energy.data()+i), t) ); auto a = _mm_movemask_ps( _mm_cmple_ps( _mm_load_ps( energy.data() + i ), zero ) ); for ( int j = 0; j < 4; ++j ) { alive[i+j] = (a & (1 << j)); } } } }; template< typename emitter_t > chrono::duration<double> run_test() { using clock_t = chrono::high_resolution_clock; constexpr size_t count = 250'000 * 4; constexpr size_t frames = 1'000; const auto seed_val = mt19937::default_seed; mt19937 rng( seed_val ); emitter_t emitter; emitter.generate( count, rng ); auto start = clock_t::now(); for ( size_t i = 0; i < frames; ++i ) { emitter.update(); } auto finish = clock_t::now(); return finish - start; } int main() { auto aos_duration = run_test<aos_emitter_t>(); auto soa_duration = run_test<soa_emitter_t>(); auto soa_sse_duration = run_test<soa_emitter_sse_t>(); cout << "AoS in " << aos_duration.count() << " seconds" << std::endl; cout << "SoA in " << soa_duration.count() << " seconds" << std::endl; cout << "SoA SSE in " << soa_sse_duration.count() << " seconds" << std::endl; return 0; }
Private
[
?
]
Run code
Submit