[ create a new paste ] login | about

Link: http://codepad.org/iRwgxAkW    [ raw code | fork ]

C++, pasted on Nov 6:
// Subtract `t` from every energy value, four floats per SSE step, and rebuild
// the packed bitmask behind `alive`: bit (k % 64) of word (k / 64) is set iff
// the updated energy at index k is strictly greater than zero.
// _mm_load_ps / _mm_store_ps require 16-byte aligned addresses, so
// energy.data() is assumed to be suitably aligned -- TODO confirm at the
// allocation site.
__m128 zero    = _mm_setzero_ps();
auto block_ptr = alive.data();
auto e_ptr     = energy.data();
{
    // Every step consumes exactly four floats, so n must be a multiple of 4.
    assert( n % 4 == 0 );

    const size_t lanes     = 4;   // floats handled per SSE register
    const size_t word_bits = 64;  // mask bits stored per output word
    const size_t full_end  = ( n / word_bits ) * word_bits;

    // Updates the four floats starting at `pos` in place and returns their
    // 4-bit "greater than zero" mask, already shifted to its position inside
    // the 64-bit word that covers `pos`.
    const auto step = [&]( size_t pos ) -> uint64_t {
        const __m128 updated = _mm_sub_ps( _mm_load_ps( e_ptr + pos ), t );
        _mm_store_ps( e_ptr + pos, updated );
        const auto lane_bits =
            static_cast<uint64_t>( _mm_movemask_ps( _mm_cmpgt_ps( updated, zero ) ) );
        return lane_bits << ( pos % word_bits );
    };

    // Full words: 64 elements (16 steps) each, one output word per chunk.
    size_t pos = 0;
    while ( pos < full_end ) {
        uint64_t word = 0;
        for ( const size_t chunk_end = pos + word_bits; pos < chunk_end; pos += lanes ) {
            word |= step( pos );
        }
        *block_ptr++ = word;
    }

    // Trailing partial word: fewer than 64 elements remain. The word is
    // written but block_ptr is deliberately left pointing at it.
    if ( pos < n ) {
        uint64_t word = 0;
        for ( ; pos < n; pos += lanes ) {
            word |= step( pos );
        }
        *block_ptr = word;
    }
}


Create a new paste based on this one


Comments: