// Subtracts `t` from the first `n` floats behind energy.data(), four lanes at a
// time, and rebuilds the bitmask behind alive.data(): bit k of a 64-bit word is
// set when the matching updated energy value compares greater than zero.
// Aligned SSE loads/stores are used, so the energy buffer must be 16-byte
// aligned. NOTE(review): presumably `alive` holds ceil(n / 64) 64-bit words and
// `t` is an __m128 with the decrement in every lane -- confirm against the
// declarations outside this fragment.
__m128 zero = _mm_setzero_ps();
auto block_ptr = alive.data();
auto e_ptr = energy.data();
{
    // Elements are consumed in groups of four; a ragged tail is not supported.
    assert( n % 4 == 0 );
    const size_t bits_per_block = 64;

    // Updates the four floats starting at `offset` in place and returns their
    // "greater than zero" flags as a 4-bit value in the low bits.
    auto update_lanes = [&]( size_t offset ) -> uint64_t {
        const __m128 updated = _mm_sub_ps( _mm_load_ps( e_ptr + offset ), t );
        _mm_store_ps( e_ptr + offset, updated );
        return uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( updated, zero ) ) );
    };

    // Index one past the last element that belongs to a complete 64-bit block.
    const auto full_limit = ( n / bits_per_block ) * bits_per_block;
    size_t pos = 0;

    // Complete blocks: sixteen groups of four fill one 64-bit word each.
    while ( pos < full_limit ) {
        uint64_t mask = 0;
        for ( size_t shift = 0; shift < bits_per_block; shift += 4, pos += 4 ) {
            mask |= update_lanes( pos ) << shift;
        }
        *block_ptr++ = mask;
    }

    // Trailing partial block: the remaining groups land in the low bits and
    // the unused high bits stay zero. block_ptr is not advanced afterwards.
    if ( pos < n ) {
        uint64_t mask = 0;
        for ( size_t shift = 0; pos < n; shift += 4, pos += 4 ) {
            mask |= update_lanes( pos ) << shift;
        }
        *block_ptr = mask;
    }
}