codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
__m128 zero = _mm_setzero_ps(); auto block_ptr = alive.data(); auto e_ptr = energy.data(); assert( n % 4 == 0 ); const size_t bits_per_block = 64; // handle full blocks (64 element chunks) const size_t num_blocks = n / bits_per_block; for ( size_t i = 0; i < num_blocks; ++i ) { uint64_t block = 0; for ( size_t j = 0; j < bits_per_block; j += 4 ) { _mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) ); block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << j; e_ptr += 4; } *block_ptr++ = block; } // handle final partial block auto remaining = n - num_blocks * bits_per_block; if ( remaining != 0 ) { uint64_t block = 0; for ( size_t i = 0; i < remaining; i += 4 ) { _mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) ); block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << i; e_ptr += 4; } *block_ptr = block; }
Private
[
?
]
Run code
Submit