[ create a new paste ] login | about

Link: http://codepad.org/DjqVHac8    [ raw code | fork ]

C++, pasted on Nov 6:
        __m128 zero    = _mm_setzero_ps();
        auto block_ptr = alive.data();
        auto e_ptr     = energy.data();
        assert( n % 4 == 0 );
        const size_t bits_per_block = 64;
        // handle full blocks (64 element chunks)
        const size_t num_blocks = n / bits_per_block;
        for ( size_t i = 0; i < num_blocks; ++i )
        {
            uint64_t block = 0;
            for ( size_t j = 0; j < bits_per_block; j += 4 )
            {
                _mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) );
                block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << j;
                e_ptr += 4;
            }
            *block_ptr++ = block;
        }

        // handle final partial block
        auto remaining = n - num_blocks * bits_per_block;
        if ( remaining != 0 ) {
            uint64_t block = 0;
            for ( size_t i = 0; i < remaining; i += 4 )
            {
                _mm_store_ps( e_ptr, _mm_sub_ps( _mm_load_ps( e_ptr ), t ) );
                block |= uint64_t( _mm_movemask_ps( _mm_cmpgt_ps( _mm_load_ps( e_ptr ), zero ) ) ) << i;
                e_ptr += 4;
            }
            *block_ptr = block;
        }


Create a new paste based on this one


Comments: