#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <math.h>
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/count.hpp>
#include <boost/accumulators/statistics/mean.hpp>
#include <boost/accumulators/statistics/variance.hpp>
#include <boost/math/distributions/students_t.hpp>
//OS X-specific high-resolution timer and headers.
//Replace with your own high-resolution timer if desired; a portable
//sketch follows the Mach version below.
#include <stdint.h>
#include <mach/mach_time.h>
using namespace boost::accumulators;
using boost::math::students_t;
double wall_time()
{
static uint64_t start = 0;
static double conversion = 0.0;
if( conversion == 0.0 )
{
mach_timebase_info_data_t info;
kern_return_t err = mach_timebase_info( &info );
//Bail out loudly if the timebase query fails; otherwise every
//timing below would silently be garbage.
assert( err == 0 );
//Conversion factor from Mach absolute time units to seconds
conversion = 1e-9 * (double) info.numer / (double) info.denom;
start = mach_absolute_time();
}
uint64_t elapsed = mach_absolute_time() - start;
return conversion*(double)elapsed;
}
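//A minimal sketch of a portable replacement, assuming a POSIX system
//that provides clock_gettime with CLOCK_MONOTONIC (e.g. Linux). It
//returns absolute monotonic seconds, which works identically when
//only differences are taken. Enable it (and drop the Mach version)
//when building off OS X.
#if 0
#include <time.h>
double wall_time()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + 1e-9*(double)ts.tv_nsec;
}
#endif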
//We fill with roughly equal counts of positive and negative numbers to prevent
//overflow.
void fill( double *p, int n )
{
for( int i = 0; i < n; i++ )
{
double val = (double) rand() / RAND_MAX;
double sign_num = (double) rand() / RAND_MAX;
double sign = (sign_num > 0.5) ? +1.0 : -1.0;
p[i] = sign*val;
}
}
//A pretty basic linear algebra kernel for calibration: computes y <- a*x + y
void daxpy(double a,double* x,double* y,int n)
{
for(int i = 0; i < n; ++i)
{
y[i] += a*x[i];
}
}
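//Per call daxpy reads 2n doubles and writes n back, so its runtime is
//dominated by whether x and y are cache resident; that is exactly the
//difference the two experiments in main() compare.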
//The CLFLUSH based flushing routine
//Flush all cache lines associated with the array from all levels of cache
#define FLUSH_CACHE_LINE(mem) __asm__ __volatile__\
("clflush %0"::"m"(*((char *)(mem))))
//The "memory" clobber keeps the compiler from reordering memory
//accesses across the fence; mfence alone only orders the hardware.
#define MFENCE __asm__ __volatile__\
("mfence":::"memory")
#define CACHE_LINE_SIZE 64
template<typename T>
void flush_array(T* array,long len)
{
//Want pointer operations to increment by one byte
long byte_len = len*sizeof(T);
char* byte_ptr = (char*) array;
MFENCE;
for(long off = 0; off < byte_len; off += CACHE_LINE_SIZE)
{
FLUSH_CACHE_LINE(byte_ptr+off);
}
//If the array is not cache-line aligned, the line holding its final
//bytes can start after the last offset above; flush it explicitly.
if(byte_len > 0)
{
FLUSH_CACHE_LINE(byte_ptr+byte_len-1);
}
MFENCE;
}
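//An equivalent sketch using compiler intrinsics instead of inline
//assembly, assuming an SSE2-capable compiler that ships
//<immintrin.h> (gcc, clang, icc); it does the same work as
//flush_array, including the unaligned-tail flush.
#if 0
#include <immintrin.h>
template<typename T>
void flush_array_intrin(T* array,long len)
{
long byte_len = len*sizeof(T);
char* byte_ptr = (char*) array;
_mm_mfence();
for(long off = 0; off < byte_len; off += CACHE_LINE_SIZE)
{
_mm_clflush(byte_ptr+off);
}
if(byte_len > 0)
{
_mm_clflush(byte_ptr+byte_len-1);
}
_mm_mfence();
}
#endif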
#define KB 1024
int main()
{
//Adjust to your CPU's L1 data cache size; 32 KB is typical on x86
int cache_size = 32*KB;
double alpha = 42.5;
//Both operands together should fit in cache
int operand_size = cache_size/(sizeof(double)*2);
double* X = new double[operand_size];
double* Y = new double[operand_size];
//95% confidence interval
double max_risk = 0.05;
//Interval half width
double w;
int n_iterations = 100;
students_t dist(n_iterations-1);
double T = boost::math::quantile(complement(dist,max_risk/2));
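//T is the two-sided critical value; the half width reported below is
//w = T * s / sqrt(n), where s is the sample standard deviation and n
//the number of timed iterations.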
accumulator_set<double, stats<tag::mean,tag::variance> > unflushed_acc;
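//fill() writes both arrays right before each timed run, so the
//operands start out cache resident in this loop.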
for(int i = 0; i < n_iterations; ++i)
{
fill(X,operand_size);
fill(Y,operand_size);
double seconds = wall_time();
daxpy(alpha,X,Y,operand_size);
seconds = wall_time() - seconds;
unflushed_acc(seconds);
}
w = T*sqrt(variance(unflushed_acc))/sqrt(count(unflushed_acc));
printf("Without flush: time=%g +/- %g ns\n",mean(unflushed_acc)*1e9,w*1e9);
//Using the clflush instruction: fill() puts the operands back in
//cache on every iteration, so we flush them out before each timed run.
accumulator_set<double, stats<tag::mean,tag::variance> > clflush_acc;
for(int i = 0; i < n_iterations; ++i)
{
fill(X,operand_size);
fill(Y,operand_size);
flush_array(X,operand_size);
flush_array(Y,operand_size);
double seconds = wall_time();
daxpy(alpha,X,Y,operand_size);
seconds = wall_time() - seconds;
clflush_acc(seconds);
}
w = T*sqrt(variance(clflush_acc))/sqrt(count(clflush_acc));
printf("With clflush: time=%g +/- %g ns\n",mean(clflush_acc)*1e9,w*1e9);
delete[] X;
delete[] Y;
return 0;
}
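//Build sketch (file name, flags, and Boost path are assumptions;
//adjust for your setup):
// g++ -O2 benchmark.cc -I/path/to/boost -o benchmark
//Caveat: an aggressive optimizer may elide daxpy entirely since Y is
//never read afterwards; printing a checksum of Y is a cheap guard.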