#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/count.hpp>
#include <boost/accumulators/statistics/mean.hpp>
#include <boost/accumulators/statistics/variance.hpp>
#include <boost/math/distributions/students_t.hpp>
//OS X-specific high-resolution timer and headers.
//Replace with your own high-resolution timer if desired; a portable
//sketch follows the Mach version below.
#include <stdint.h>
#include <mach/mach_time.h>
using namespace boost::accumulators;
using boost::math::students_t;
double wall_time()
{
static uint64_t start = 0;
static double conversion = 0.0;
if( conversion == 0.0 )
{
mach_timebase_info_data_t info;
kern_return_t err = mach_timebase_info( &info );
//Bail out loudly if the timebase query fails; otherwise every
//timing below would silently be garbage.
assert( err == 0 );
//Conversion factor from Mach absolute time units to seconds
conversion = 1e-9 * (double) info.numer / (double) info.denom;
start = mach_absolute_time();
}
uint64_t elapsed = mach_absolute_time() - start;
return conversion*(double)elapsed;
}
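//A minimal sketch of a portable replacement, assuming a POSIX system
//that provides clock_gettime with CLOCK_MONOTONIC (e.g. Linux). It
//returns absolute monotonic seconds, which works identically when
//only differences are taken. Enable it (and drop the Mach version)
//when building off OS X.
#if 0
#include <time.h>
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + 1e-9*(double)ts.tv_nsec;
}
#endif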
//We fill with roughly equal counts of positive and negative numbers to prevent
//overflow.
void fill( double *p, int n )
{
for( int i = 0; i < n; i++ )
{
double val = (double) rand() / RAND_MAX;
double sign_num = (double) rand() / RAND_MAX;
double sign = (sign_num > 0.5) ? +1.0 : -1.0;
p[i] = sign*val;
}
}
//A pretty basic linear algebra kernel for calibration: computes y <- a*x + y
void daxpy(double a,double* x,double* y,int n)
{
for(int i = 0; i < n; ++i)
{
y[i] += a*x[i];
}
}
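//Per call daxpy reads 2n doubles and writes n back, so its runtime is
//dominated by whether x and y are cache resident; that is exactly the
//difference the two experiments in main() compare.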
//The CLFLUSH based flushing routine
//Flush all cache lines associated with the array from all levels of cache
#define FLUSH_CACHE_LINE(mem) __asm__ __volatile__\
("clflush %0"::"m"(*((char *)(mem))))
//The "memory" clobber keeps the compiler from reordering memory
//accesses across the fence; mfence alone only orders the hardware.
#define MFENCE __asm__ __volatile__\
("mfence":::"memory")
#define CACHE_LINE_SIZE 64
template<typename T>
void flush_array(T* array,long len)
{
//Want pointer operations to increment by one byte
long byte_len = len*sizeof(T);
char* byte_ptr = (char*) array;
MFENCE;
for(long off = 0; off < byte_len; off += CACHE_LINE_SIZE)
{
FLUSH_CACHE_LINE(byte_ptr+off);
}
//If the array is not cache-line aligned, the line holding its final
//bytes can start after the last offset above; flush it explicitly.
if(byte_len > 0)
{
FLUSH_CACHE_LINE(byte_ptr+byte_len-1);
}
MFENCE;
}
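//An equivalent sketch using compiler intrinsics instead of inline
//assembly, assuming an SSE2-capable compiler that ships
//<immintrin.h> (gcc, clang, icc); it does the same work as
//flush_array, including the unaligned-tail flush.
#if 0
#include <immintrin.h>
template<typename T>
void flush_array_intrin(T* array,long len)
{
long byte_len = len*sizeof(T);
char* byte_ptr = (char*) array;
_mm_mfence();
for(long off = 0; off < byte_len; off += CACHE_LINE_SIZE)
{
_mm_clflush(byte_ptr+off);
}
if(byte_len > 0)
{
_mm_clflush(byte_ptr+byte_len-1);
}
_mm_mfence();
}
#endif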
#define KB 1024
int main()
{
//Adjust to your CPU's L1 data cache size; 32 KB is typical on x86
int cache_size = 32*KB;
double alpha = 42.5;
//Both operands together should fit in cache
int operand_size = cache_size/(sizeof(double)*2);
double* X = new double[operand_size];
double* Y = new double[operand_size];
//95% confidence interval
double max_risk = 0.05;
//Interval half width
double w;
int n_iterations = 100;
students_t dist(n_iterations-1);
double T = boost::math::quantile(complement(dist,max_risk/2));
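//T is the two-sided critical value; the half width reported below is
//w = T * s / sqrt(n), where s is the sample standard deviation and n
//the number of timed iterations.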
accumulator_set<double, stats<tag::mean,tag::variance> > unflushed_acc;
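//fill() writes both arrays right before each timed run, so the
//operands start out cache resident in this loop.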
for(int i = 0; i < n_iterations; ++i)
{
fill(X,operand_size);
fill(Y,operand_size);
double seconds = wall_time();
daxpy(alpha,X,Y,operand_size);
seconds = wall_time() - seconds;
unflushed_acc(seconds);
}
w = T*sqrt(variance(unflushed_acc))/sqrt(count(unflushed_acc));
printf("Without flush: time=%g +/- %g ns\n",mean(unflushed_acc)*1e9,w*1e9);
//Using the clflush instruction: fill() puts the operands back in
//cache on every iteration, so we flush them out before each timed run.
accumulator_set<double, stats<tag::mean,tag::variance> > clflush_acc;
for(int i = 0; i < n_iterations; ++i)
{
fill(X,operand_size);
fill(Y,operand_size);
flush_array(X,operand_size);
flush_array(Y,operand_size);
double seconds = wall_time();
daxpy(alpha,X,Y,operand_size);
seconds = wall_time() - seconds;
clflush_acc(seconds);
}
w = T*sqrt(variance(clflush_acc))/sqrt(count(clflush_acc));
printf("With clflush: time=%g +/- %g ns\n",mean(clflush_acc)*1e9,w*1e9);
delete[] X;
delete[] Y;
return 0;
}
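//Build sketch (file name, flags, and Boost path are assumptions;
//adjust for your setup):
// g++ -O2 benchmark.cc -I/path/to/boost -o benchmark
//Caveat: an aggressive optimizer may elide daxpy entirely since Y is
//never read afterwards; printing a checksum of Y is a cheap guard.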