# include <stdio.h>
#include <stdlib.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <time.h>
/*
STREAM_L is a port of the original STREAM code from John McCalpin and 
Joe Zagar to the Intel(r) Architecture and Linux platform. 
It addresses the known problem of the 8253
timer used on PCs by re-writing the timer routine using the RDTSC assembler
instruction. It further refines the original benchmark such that memory is 
repeatedly de-allocated and re-allocated between individual test runs, 
capturing benchmark performance dependence on OS virtual memory allocation.

Compile with full optimization for speed.
Compile with: g++ -O3 -march=pentiumpro stream_l.cpp -funroll-loops 
Use egcs to get Pentium(r) Pro Processor floating point optimizations 

This code is provided "as is" with no representations or warranties of any kind, 
including non-infringement.


thanks, 
mason.cabot@intel.com
Platform Architecture Lab
Intel Corporation
mason.cabot@intel.com
April 1999.
*/

/*
 * Program: Stream
 * Programmer: Joe R. Zagar
 * Revision: 4.0-BETA, October 24, 1995
 * Original code developed by John D. McCalpin
 *
 * This program measures memory transfer rates in MB/s for simple 
 * computational kernels coded in C.  These numbers reveal the quality
 * of code generation for simple uncacheable kernels as well as showing
 * the cost of floating-point operations relative to memory accesses.
 *
 * INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'N' (below) to give a 'timing calibration' of 
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */
#define N 999936
#define CACHEBLOW 131072
# define OFFSET	0
/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_d.c second.c -o stream_d -lm
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif
static double	rmstime[4] = {0}, maxtime[4] = {0},
		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static char	*label[4] = {"Copy:      ", "Scale:     ",
    "Add:       ", "Triad:     "};

double second(double core_freq);

int main(int argc, char **argv) {
	double *a, *b, *c, *d, *e;
	double *big; 
	int NTIMES;
    int	quantum, checktick(double);
    int	BytesPerWord;
    register int j, k;
	FILE *time_file=NULL;
	int ti=0;
	double core=0.0;

	if (argc != 3) {
		printf ("\n usage: %s <core_frequency(MHz)> <iterations>\n", argv[0]);
		return 1;
	}
	core=1000000*atof(argv[1]);
	NTIMES=atoi(argv[2]);

	double	bytes[4] = {
	    2 * sizeof(double) * N,
	    2 * sizeof(double) * N,
	    3 * sizeof(double) * N,
	    3 * sizeof(double) * N
    };
    double		scalar, t;
	double  *times[4];
	for (ti=0; ti<4; ti++) {
		times[ti]=new double[NTIMES];
		if (times[ti]==NULL) {
			printf ("\nError allocating arrays. Quitting.\n");
			exit(0);
		}
	}

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    printf("Total memory required = %.1f MB.\n",
	((3*N + 2*CACHEBLOW) * BytesPerWord) / 1048576.0);
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

    /* Get initial value for system clock. */

    printf(HLINE);

    if  ( (quantum = checktick(core)) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");

    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
	scalar = 3.0;
	for (k=0; k<NTIMES; k++) {
		a=new double[N];
		b=new double[N];
		c=new double[N];
		d=new double[CACHEBLOW]; 
		e=new double[CACHEBLOW]; 
		if (a==NULL || b==NULL || c==NULL || d==NULL || e==NULL) 
			break;	

         for (j=0; j<N; j++) {
		a[j] = 1.0;
		b[j] = 2.0;
		c[j] = 0.0;
	 }
         for (j=0; j<CACHEBLOW; j++) {
             d[j]=1.0;
             e[j]=3.0;
         }
		// dirty up the cache:mbc
		for (j=0; j<CACHEBLOW; j++)
		    d[j] = e[j];

		times[0][k] = second(core);
		for (j=0; j<N; j++){
		    c[j] = a[j];
		}
		times[0][k] = second(core) - times[0][k];

		// dirty up the caches:mbc
		for (j=0; j<CACHEBLOW; j++)
		    d[j] = 5.0E0*e[j];

		times[1][k] = second(core);
		for (j=0; j<N; j++) {
		    b[j] = scalar*c[j];
		}
		times[1][k] = second(core) - times[1][k];

		// dirty up the caches:mbc
		for (j=0; j<CACHEBLOW; j++)
		    e[j] = 1.5E0*d[j];

		times[2][k] = second(core);
		for (j=0; j<N; j++) {
		    c[j] = a[j]+b[j];
		}
		times[2][k] = second(core) - times[2][k];
		// dirty up the caches:mbc
		for (j=0; j<CACHEBLOW; j++)
		    e[j] = 0.5E0*d[j];

		times[3][k] = second(core);
		for (j=0; j<N; j++) {
		    a[j] = b[j]+scalar*c[j];
		}
		times[3][k] = second(core) - times[3][k];
	delete [] a;
	delete [] b;
	delete [] c;
	delete [] d;
	delete [] e;

	}

    /*	--- SUMMARY --- */


    for (k=0; k<NTIMES; k++)
	{
		for (j=0; j<4; j++)
	    {
			rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
   
	time_file=fopen("times.txt", "w");
	fprintf(time_file, "copy\tscale\tadd\ttriad\n");
	for (ti=0; ti<NTIMES; ti++) {
		fprintf(time_file, "%f\t%f\t%f\t%f\n", 
			times[0][ti],
			times[1][ti],
			times[2][ti],
			times[3][ti]);
	}
	for (ti=0; ti<4; ti++) {
		delete [] times[ti];
	}
	fclose (time_file);
    printf("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);

		printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       rmstime[j],
	       mintime[j],
	       maxtime[j]);
    }
    return 0;
}

# define	M	20

int checktick(double core_freq)
{
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = second(core_freq);
	while( ((t2=second(core_freq)) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, MAX(Delta,0));
	}

    return(minDelta);
}

#include <time.h>
#pragma optimize( "", off )

double second(double core_freq)
{
	unsigned long clk_time_hi=0;
	unsigned long clk_time_lo=0;
	double clk_time = 0.0;
	
	//__asm {
	//	_emit 0x0F
	//	_emit 0x31
	//	mov clk_time_lo, eax
	//	mov clk_time_hi, edx
//	}
asm ("rdtsc" 
	: "=d"(clk_time_hi), "=a"(clk_time_lo)
	: /* no inputs */);
	
	clk_time = (double)clk_time_hi * 4294967295.0; // upshift by 32bits
	clk_time += (double)clk_time_lo;
	return (clk_time / core_freq);

}


#pragma optimize( "", on ) 
