C++ code - 208 lines

Link:
C++, pasted on Oct 21:
/*
Run on (1 X 4008 MHz CPU )
10/20/16 22:54:59
Benchmark                               Time(ns)    CPU(ns) Iterations
----------------------------------------------------------------------
BM_fixed_sqrt_poly<fixed_t>/64              1499       1444     497778                42.2706M items/s
BM_fixed_sqrt_poly<fixed_t>/512             9409       9521      64000                51.2821M items/s
BM_fixed_sqrt_poly<fixed_t>/4k             71531      71875      10000                54.3478M items/s
BM_fixed_sqrt_poly<fixed_t>/24.4141k      432971     462123       1792                 51.592M items/s
BM_fixed_sqrt_iter1<fixed_t>/64             5875       5625     100000                10.8507M items/s
BM_fixed_sqrt_iter1<fixed_t>/512           43885      48132      14933                10.1447M items/s
BM_fixed_sqrt_iter1<fixed_t>/4k           351318     369699       2240                 10.566M items/s
BM_fixed_sqrt_iter1<fixed_t>/24.4141k    2143884    2083333        345                11.4441M items/s
BM_fixed_sqrt_iter2<fixed_t>/64             2786       2825     298667                 21.605M items/s
BM_fixed_sqrt_iter2<fixed_t>/512           19889      17090      32000                28.5714M items/s
BM_fixed_sqrt_iter2<fixed_t>/4k           157305     154869       3733                 25.223M items/s
BM_fixed_sqrt_iter2<fixed_t>/24.4141k     964814     962182        747                24.7789M items/s
*/


#include <random>
#include <vector>
#include <intrin.h>
#include <benchmark/benchmark.h>
#include <boost/fixed_point/fixed_point.hpp>
#include <boost/fixed_point/fixed_point_negatable_cmath.hpp>

const int min_size = 64;
const int max_size = 25'000;

using boost::fixed_point::negatable;

// Computes the fixed-point square root using a shift-and-add algorithm.
// Adapted from sqrt found here: http://codepad.org/CqvPx5u0
// And this has beem modified from Ken Turkowski's algorithm.
// Despite the somewhat confusing description of the result in the white paper,
// the algorithm builds a M.F fixed-point square root.
// @see http://www.realitypixels.com/turk/computergraphics/FixedSqrt.pdf

template < const int I, const int F, typename R, typename O >
negatable<I, F, R, O> sqrt_iter1( negatable<I,F,R,O> x )
{
    using fixed_t = negatable<I, F, R, O>;
    typedef fixed_t::unsigned_small_type local_unsigned_small_type;
    typedef fixed_t::nothing             local_nothing;

    constexpr std::uint_fast8_t total_bits      = static_cast< std::uint_fast8_t >(fixed_t::all_bits );
    constexpr std::uint_fast8_t fractional_bits = static_cast< std::uint_fast8_t >(fixed_t::radix_split );

    local_unsigned_small_type root = static_cast< local_unsigned_small_type >( 0U ); // Clear the root.
    local_unsigned_small_type remHi =
        static_cast< local_unsigned_small_type >( 0U ); // Clear the high part of the partial remainder.
    local_unsigned_small_type remLo = static_cast< local_unsigned_small_type >(
        x.crepresentation() ); // Get the argument into the low part of the partial remainder.

    for ( std::uint_fast8_t count = ( total_bits >> 1 ) + ( fractional_bits >> 1 ); count > 0U; --count ) {
        // Get 2 bits of the argument.
        remHi = ( remHi << 2 ) | ( remLo >> ( total_bits - 2 ) );
        remLo <<= 2;

        // Get ready for the next bit in the root.
        root <<= 1;

        // Test the radical.
        const local_unsigned_small_type testDiv = ( root << 1 ) + 1;

        if ( remHi >= testDiv ) {
            remHi = remHi - testDiv;

            ++root;
        }
    }

    return fixed_t( local_nothing(), root );
}

template < const int I, const int F, typename R, typename O >
negatable<I, F, R, O> sqrt_iter2( negatable<I, F, R, O> x )
{
    using fixed_t = negatable<I, F, R, O>;
    typedef fixed_t::unsigned_small_type local_unsigned_small_type;
    typedef fixed_t::nothing             local_nothing;

    constexpr std::uint_fast8_t total_bits = static_cast< std::uint_fast8_t >(fixed_t::all_bits);
    constexpr std::uint_fast8_t fractional_bits = static_cast< std::uint_fast8_t >(fixed_t::radix_split);

    local_unsigned_small_type root = static_cast< local_unsigned_small_type >(0U); // Clear the root.
    local_unsigned_small_type remHi = static_cast< local_unsigned_small_type >(0U); // Clear the high part of the partial remainder.
    local_unsigned_small_type remLo = static_cast< local_unsigned_small_type >(x.crepresentation()); // Get the argument into the low part of the partial remainder.
    
    int count = (total_bits >> 1) + (fractional_bits >> 1);
    auto hlz = __lzcnt( remLo ) >> 1;
    count -= hlz;
    remLo <<= (hlz << 1);
    
    for ( ; count > 0; --count ) {
        // Get 2 bits of the argument.
        remHi = (remHi << 2) | (remLo >> (total_bits - 2));
        remLo <<= 2;

        // Get ready for the next bit in the root.
        root <<= 1;

        // Test the radical.
        const local_unsigned_small_type testDiv = (root << 1) + 1;

        auto newRemHi = remHi - testDiv;
        auto newRoot = root + 1;
        if ( remHi >= testDiv ) {
            // force compiler to use cmov instead of a jump
            remHi = newRemHi;
        }
        if ( remHi >= testDiv ) {
            // the compiler isn't smart enough to merge these if statements
            // it will also refuse to use cmov if both assignments are in the same if block
            root = newRoot;
        }
    }

    return fixed_t( local_nothing(), root );
}

template< typename FixedT >
void generate_data( std::vector<FixedT> & data, size_t n, std::mt19937 & rng ) {
    typedef fixed_t::unsigned_small_type local_unsigned_small_type;
    typedef fixed_t::nothing             local_nothing;

    std::uniform_int_distribution<local_unsigned_small_type> dist( 0, INT_MAX );

    data.resize( n );
    for ( auto & f : data ) {
        f = FixedT( local_nothing(), dist( rng ) );
    }
}

template< typename FixedT >
void BM_fixed_sqrt_poly( benchmark::State& state ) {
    const auto seed_val = std::mt19937::default_seed;
    std::mt19937 rng( seed_val );

    std::vector<FixedT> data;    

    while ( state.KeepRunning() ) {
        state.PauseTiming();
        const auto n = state.range_x();
        generate_data( data, n, rng );

        state.ResumeTiming();
        for ( auto & f : data ) {
            f = sqrt( f );
        }

        const size_t items_processed = state.iterations() * n;
        state.SetItemsProcessed( items_processed );
    }
}

template< typename FixedT >
void BM_fixed_sqrt_iter1( benchmark::State& state ) {
    const auto seed_val = std::mt19937::default_seed;
    std::mt19937 rng( seed_val );

    std::vector<FixedT> data;

    while ( state.KeepRunning() ) {
        state.PauseTiming();
        const auto n = state.range_x();
        generate_data( data, n, rng );
        
        state.ResumeTiming();
        for ( auto & f : data ) {
            f = sqrt_iter1( f );
        }

        const size_t items_processed = state.iterations() * n;
        state.SetItemsProcessed( items_processed );
    }
}

template< typename FixedT >
void BM_fixed_sqrt_iter2( benchmark::State& state ) {
    const auto seed_val = std::mt19937::default_seed;
    std::mt19937 rng( seed_val );

    std::vector<FixedT> data;

    while ( state.KeepRunning() ) {
        state.PauseTiming();
        const auto n = state.range_x();
        generate_data( data, n, rng );

        state.ResumeTiming();
        for ( auto & f : data ) {
            f = sqrt_iter2( f );
        }

        const size_t items_processed = state.iterations() * n;
        state.SetItemsProcessed( items_processed );
    }
}

using fixed_t = negatable<7, -24>;

BENCHMARK_TEMPLATE( BM_fixed_sqrt_poly, fixed_t )->Range( min_size, max_size );
BENCHMARK_TEMPLATE( BM_fixed_sqrt_iter1, fixed_t )->Range( min_size, max_size );
BENCHMARK_TEMPLATE( BM_fixed_sqrt_iter2, fixed_t )->Range( min_size, max_size );

BENCHMARK_MAIN()
Create a new paste based on this one

Comments: