#include <stdio.h>
#include <conio.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <time.h>
#include <windows.h>

/*
WSTREAM is a port of the original STREAM code from John McCalpin and 
Joe Zagar to the Intel(r) Architecture and Win32 platform. 
It addresses the known problem of the 8253
timer used on PCs by re-writing the timer routine using the RDTSC assembler
instruction. It further refines the original benchmark such that memory is 
repeatedly de-allocated and re-allocated between individual test runs, 
eliminating benchmark performance dependence on OS virtual memory allocation.

Compile with full optimization for speed: I've seen best reults with 
the Intel(r) C/C++ compiler available in VTune(tm) analyzer 4.0.

This code is provided "as is" with no representations or warranties of any kind, 
including non-infringement.


thanks, 
Mason Cabot
Platform Architecture Lab
Intel Corporation
mason.cabot@intel.com
April 1999.
*/


/*
 * Program: Stream
 * Programmer: Joe R. Zagar
 * Revision: 4.0-BETA, October 24, 1995
 * Original code developed by John D. McCalpin
 *
 * This program measures memory transfer rates in MB/s for simple 
 * computational kernels coded in C.  These numbers reveal the quality
 * of code generation for simple uncacheable kernels as well as showing
 * the cost of floating-point operations relative to memory accesses.
 *
 * INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'NSIZE' (below) to give a 'timing calibration' of 
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */
#define NSIZE 999936
/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_d.c second.c -o stream_d -lm
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */
#define CACHEBLOW 131072
#define HLINE "-------------------------------------------------------------\n"

#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
	
static double	rmstime[4] = {0}, maxtime[4] = {0},
		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static char	*label[4] = {"Copy:      ", "Scale:     ",
    "Add:       ", "Triad:     "};

double second(double core_freq);

int main(int argc, char **argv) {

	double *a, *b, *c, *d, *e;
	int NTIMES;
    int	quantum, checktick(double);
    int	BytesPerWord;
    register int j, k;
	FILE *time_file=NULL;
	int ti=0;
	double core=0.0;

	if (argc != 3) {
		printf ("\n usage: %s <core_frequency(MHz)> <iterations>\n", argv[0]);
		return 1;
	}
	core=1000000*atof(argv[1]);
	NTIMES=atoi(argv[2]);

	double	bytes[4] = {
	    2 * sizeof(double) * NSIZE,
	    2 * sizeof(double) * NSIZE,
	    3 * sizeof(double) * NSIZE,
	    3 * sizeof(double) * NSIZE
    };
    double	t;
	const double scalar=3.0;
	double  *times[4];
	for (ti=0; ti<4; ti++) {
		times[ti]=new double[NTIMES];
		if (times[ti]==NULL) {
			printf ("\nError allocating arrays. Quitting.\n");
			exit(0);
		}
	}

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d\n" , NSIZE);
    printf("Total memory required = %.1f MB.\n",
	((2*CACHEBLOW + 3*NSIZE)*BytesPerWord) / 1048576.0);
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

    /* Get initial value for system clock. */

    printf(HLINE);

    if  ( (quantum = checktick(core)) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");

	a=new double [NSIZE];
	if (a==NULL) {
		printf ("Error allocating arrays\n");
		return 1;
	}
	for (j=0; j<NSIZE; j++) a[j]=3.0;
    t = second(core);
	for (j = 0; j < NSIZE; j++)
		a[j] = 2.0E0 * a[j];
	t = 1.0E6 * (second(core) - t);
	delete [] a;

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

	// bump process priority to real-time
	SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
    for (k=0; k<NTIMES; k++)
	{
		// allocate arrays for testing and clearing the cache
		a=new double[NSIZE]; 
		b=new double[NSIZE]; 
		c=new double[NSIZE];
		d=new double[CACHEBLOW];
		e=new double[CACHEBLOW];

		if (a==NULL || b==NULL || c==NULL || d==NULL || e==NULL) 
			break;
		// initialize arrays to some value
		for (j=0; j<NSIZE; j++) {
			a[j] = 1.0;
			b[j] = 2.0;
			c[j] = 0.0;
		}
		for (j=0; j<CACHEBLOW; j++) {
			d[j] = 5.0;
			e[j] = 1.0;
		}
		// dirty up the cache
		for (j=0; j<CACHEBLOW; j++)
		    d[j] = e[j];

		times[0][k] = second(core);
		for (j=0; j<NSIZE; j++){
		    c[j] = a[j];
	//		j++;
	//		c[j] = a[j];
		}
		times[0][k] = second(core) - times[0][k];
		// dirty up the caches:mbc
		for (j=0; j<CACHEBLOW; j++)
		    d[j] = 5.0E0*e[j];

		times[1][k] = second(core);
		for (j=0; j<NSIZE; j++) {
			// the compiler may not be smart enough to unroll loops
			// so explicitly unroll a few iterations just in case
			b[j] = scalar*c[j];
	//		j++;
	//		b[j] = scalar*c[j];
		}
		times[1][k] = second(core) - times[1][k];
		// dirty up the caches
		for (j=0; j<CACHEBLOW; j++)
		    e[j] = 1.5E0*d[j];

		times[2][k] = second(core);
		for (j=0; j<NSIZE; j++) {
		    c[j] = a[j]+b[j];
	//		j++;
	//	    c[j] = a[j]+b[j];
		}
		times[2][k] = second(core) - times[2][k];
		// dirty up the caches
		for (j=0; j<CACHEBLOW; j++)
		    e[j] = 0.5E0*d[j];

		times[3][k] = second(core);
		for (j=0; j<NSIZE; j++) {
		    a[j] = b[j]+scalar*c[j];
	//		j++;
	//	    a[j] = b[j]+scalar*c[j];
		}
		times[3][k] = second(core) - times[3][k];

		// deallocate the arrays: they will be reallocated on the next iteration
		// this de- and re-alloc captures the random allocation of NT virtual memory
		delete [] a;
		delete [] b;
		delete [] c;
		delete [] d;
		delete [] e;
	}

   	SetPriorityClass(GetCurrentProcess(), NORMAL_PRIORITY_CLASS);
	SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_NORMAL);

	if (a==NULL || b==NULL || c==NULL || d==NULL || e==NULL) {
		printf ("\nerror allocating arrays\n");
		return 1;
	}

    /*	--- SUMMARY --- */
    for (k=0; k<NTIMES; k++)
	{
		for (j=0; j<4; j++)
	    {
			rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
   
	time_file=fopen("times.txt", "w");
	if (time_file==NULL) 
		printf ("\ncouldn't open times.txt\n");
	else {
		fprintf(time_file, "copy\tscale\tadd\ttriad\n");
		for (ti=0; ti<NTIMES; ti++) {
			fprintf(time_file, "%f\t%f\t%f\t%f\n", 
				times[0][ti],
				times[1][ti],
				times[2][ti],
				times[3][ti]);
		}
		fclose (time_file);
	}
	for (ti=0; ti<4; ti++) {
		delete [] times[ti];
	}
    printf("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);

		printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       rmstime[j],
	       mintime[j],
	       maxtime[j]);
    }
    return 0;
}

# define	M	20

int checktick(double core_freq)
{
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = second(core_freq);
	while( ((t2=second(core_freq)) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, MAX(Delta,0));
	}

    return(minDelta);
}

#include <time.h>
// some compilers read this routine as NULL and optimize it out.
// prevent optimization of this routine with the pragma
#pragma optimize( "", off )

double second(double core_freq)
{
	unsigned long clk_time_hi=0;
	unsigned long clk_time_lo=0;
	double clk_time = 0.0;
	
	__asm {
		_emit 0x0F
		_emit 0x31
		mov clk_time_lo, eax
		mov clk_time_hi, edx
	}
	
	clk_time = (double)clk_time_hi * 4294967295.0; // upshift by 32bits
	clk_time += (double)clk_time_lo;
	return (clk_time / core_freq);
}


#pragma optimize( "", on ) 
