[ create a new paste ] login | about

Link: http://codepad.org/hNJpQxTv    [ raw code | fork ]

C++, pasted on Mar 29:
#include <stdio.h>
#include <stdlib.h> 
#include <assert.h>
#include <string.h>
#include <math.h>

//OS X Specific hi res timer + headers
//Replace with your own high res timer if desired
#include <stdint.h>
#include <mach/mach_time.h>
double timer()
{
    static uint64_t start = -1;
    static double conversion = -1;
    if( start == -1 )
    {
		if(conversion == -1)
		{
			mach_timebase_info_data_t info;
			kern_return_t err = mach_timebase_info( &info );
			if( err == 0  )
			{
				conversion = 1e-9 * (double) info.numer / (double) info.denom;
			}
		}
	 	start = mach_absolute_time();
    }
	uint64_t elapsed = mach_absolute_time() - start;
	return conversion*(double)elapsed;
}

//We fill with roughly equal counts of positive and negative numbers to prevent
//overflow.
void fill( double *p, int n )
{
    for( int i = 0; i < n; i++ )
    {
        double val = (double)rand() / RAND_MAX;
        double sign_num = (double) rand()/RAND_MAX;
        int sign;
        if(sign_num > 0.5)
        {
            sign = +1;
        }
        else
        {
            sign = -1;
        }
        p[i] = sign*val;
    }
}

//A pretty basic linear algebra kernel for calibration
void daxpy(double a,double* x,double* y,int n)
{
    for(int i =0; i < n; ++i) 
    {
        y[i] += a*x[i]; 
    }
}

//The CLFLUSH based flushing routine
//Flush all cache lines associated with the array from all levels of cache
#define FLUSH_CACHE_LINE(mem) __asm__ __volatile__\
    ("clflush %0"::"m"(*((char *)(mem))))
#define MFENCE __asm__ __volatile__\
    ("mfence":::)

#define CACHE_LINE_SIZE 64
template<typename T>
void flush_array(T* array,long len)
{
    //Want pointer operations to increment by one byte
    long byte_len = len*sizeof(T);
    char* byte_ptr =  (char*) array;

    MFENCE;
    for(long off = 0; off < byte_len; off += CACHE_LINE_SIZE)
    {
        FLUSH_CACHE_LINE(byte_ptr+off);
    }
    MFENCE;
}

//This is a primitive used in the manual flush routine below
//Reads the flush array and does an assert() at the end so that the compiler can't
//optimize away the code
void _read_flush_array(char* flush_array,long size)
{
    char total = 0;
    for(long i = 0; i < size; ++i)
    {
        total += flush_array[i];
    }
    
    //Since our array always contains exclusively NULL bytes
    //this should evaluate true.
    assert(total < 1e-16);
}

//This flushes our cache by reading in a very large array
//We keep on making the array bigger until our calibration timing has converged 
//within 5%, indicating that we should basically be reading from memory
void flush_cache(int cache_size)
{
        //Operands for the calibration routine
        int operand_size = cache_size/(2*sizeof(double));
        double alpha = 43.5; //arbitrary
        double* X = new double[operand_size];
        double* Y = new double[operand_size];

        //Converge difference betweeen most recent iterations to within 5%
        double last_duration = 0; 

        //Arbitrary large value so that the loop runs once
        double percent_change = 1000;

        long flush_size = 100*cache_size;
        int iter_counter = 0; 
        while(percent_change > 5)
        {
            //We start at the size of the cache, which would be adequate if the replacement policy was LRU.
            //Then we double the flush size every iteration 
            char* flush_array = new char[flush_size];

            //Set them all to NULL so that the sum will add up to zero
            memset(flush_array,0,flush_size*sizeof(char));

            //Make sure our operands are in cache
            fill(X,operand_size);
            fill(Y,operand_size);

            //Now flush
            _read_flush_array(flush_array,flush_size);

            double seconds = timer();
            daxpy(alpha,X,Y,operand_size);
            seconds = timer() - seconds;
            printf("Iteration %d: %g s\n",iter_counter,seconds);
            percent_change = (fabs(last_duration - seconds)/double(seconds))*100;
            last_duration = seconds;
            ++iter_counter;
            delete [] flush_array;
            flush_size *= 2;
        }
}

#define KB 1024
#define MB 1048576

int main()
{
    int cache_size = 32*KB;
    double alpha = 42.5;
	
	int operand_size = cache_size/(sizeof(double)*2);
	double* X = new double[operand_size];
	double* Y = new double[operand_size]; 

    fill(X,operand_size);
    fill(Y,operand_size);

	double seconds = timer();
	daxpy(alpha,X,Y,operand_size);
	seconds = timer() - seconds;

	//We print the value so code is not optimized away
	printf("Without flush: time=%g ns\n",seconds*1e9);

	
	//Using clflush instruction
    //We need to put the operands back in cache
    fill(X,operand_size);
    fill(Y,operand_size);

	flush_array(X,operand_size);
	flush_array(Y,operand_size);
	seconds = timer();
	daxpy(alpha,X,Y,operand_size);
	seconds = timer() - seconds;
	printf("With clflush: time=%g ns\n",seconds*1e9);

	//Using manual flush
    //We need to put the operands back in cache
    fill(X,operand_size);
    fill(Y,operand_size);

    flush_cache(cache_size);
    seconds = timer();
	daxpy(alpha,X,Y,operand_size);
    seconds = timer() - seconds;
    printf("With copy flush: time=%g ns\n",seconds*1e9);
	return 0;
}


Create a new paste based on this one


Comments: