__m128 zero = _mm_setzero_ps();
auto block_ptr = alive.data();
auto e_ptr = energy.data();
assert( n % 4 == 0 );
const size_t bits_per_block = 64;
// handle full blocks (64 element chunks)
const size_t num_blocks = n / bits_per_block;
for ( size_t i = 0; i < num_blocks; ++i )
{
uint64_t block = 0;
for ( size_t j = 0; j < bits_per_block; j += 4 )
{
_mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) );
block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << j;
e_ptr += 4;
}
*block_ptr++ = block;
}
// handle final partial block
auto remaining = n - num_blocks * bits_per_block;
if ( remaining != 0 ) {
uint64_t block = 0;
for ( size_t i = 0; i < remaining; i += 4 )
{
_mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) );
block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << i;
e_ptr += 4;
}
*block_ptr = block;
}