# include <stdio.h>
#include <stdlib.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <time.h>
# include <string.h>

/*
This version of stream_l is based on the code by Mason Cabot. 

It makes use of MPI to run the benchmark on all nodes of a cluster and display 
results on the node with rank 0. The CPU speed of each node is determined by 
reading /proc/cpuinfo. It doesn't treat SMP nodes any differently (from single 
processor ones) and in such a case, picks up the speed of the first processor 
listed in /proc/cpuinfo

Compilation: using mpiCC
egcs (2.91) with (-march=pentiumpro -funroll-loops) options seems to give best results.
When compiled with 2.96, the code does not seem to be optimized enough and hence the 
results suffer

This code is provided "as is" with no representations or warranties of any kind, 
including non-infringement. The code may be modified, and distributed under BSD license

Thanks

G. Venkateswara Rao
Engineer, Singapore Computer Systems
mail: gattameni@scs.com.sg
02-October-2002
*/

/*
STREAM_L is a port of the original STREAM code from John McCalpin and 
Joe Zagar to the Intel(r) Architecture and Linux platform. 
It addresses the known problem of the 8253
timer used on PCs by re-writing the timer routine using the RDTSC assembler
instruction. It further refines the original benchmark such that memory is 
repeatedly de-allocated and re-allocated between individual test runs, 
capturing benchmark performance dependence on OS virtual memory allocation.

Compile with full optimization for speed.
Compile with: g++ -O3 -march=pentiumpro stream_l.cpp -funroll-loops 
Use egcs to get Pentium(r) Pro Processor floating point optimizations 

This code is provided "as is" with no representations or warranties of any kind, 
including non-infringement.


thanks, 
mason.cabot@intel.com
Platform Architecture Lab
Intel Corporation
mason.cabot@intel.com
April 1999.
*/

/*
 * Program: Stream
 * Programmer: Joe R. Zagar
 * Revision: 4.0-BETA, October 24, 1995
 * Original code developed by John D. McCalpin
 *
 * This program measures memory transfer rates in MB/s for simple 
 * computational kernels coded in C.  These numbers reveal the quality
 * of code generation for simple uncacheable kernels as well as showing
 * the cost of floating-point operations relative to memory accesses.
 *
 * INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'N' (below) to give a 'timing calibration' of 
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */
#define N 999936
#define CACHEBLOW 131072
# define OFFSET	0
/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_d.c second.c -o stream_d -lm
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif
static double	rmstime[4] = {0}, maxtime[4] = {0},
		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static char	*label[4] = {"Copy:      ", "Scale:     ",
    "Add:       ", "Triad:     "};

double second(double core_freq);
void print_results(const char*, ...);
int stream(char *argv, int notests);

int stream(char *argv, int notests) {
	/*
	 * Run the four STREAM kernels (Copy, Scale, Add, Triad) and report
	 * bandwidth statistics through print_results().
	 *
	 * argv    - CPU core clock in MHz as a decimal string (taken from
	 *           /proc/cpuinfo); converted to Hz for the RDTSC-based
	 *           second() timer.
	 * notests - number of repetitions of each kernel.
	 *
	 * The work arrays are freed and re-allocated on every repetition on
	 * purpose: the benchmark deliberately captures the cost of OS
	 * virtual-memory allocation.  Raw per-repetition times are dumped to
	 * "times.txt".  Returns 0.
	 */
	double *a, *b, *c, *d, *e;
	int NTIMES;
	int quantum, checktick(double);
	int BytesPerWord;
	int j, k;
	FILE *time_file = NULL;
	int ti = 0;
	double core = 0.0;
	int completed = 0;	/* repetitions that actually ran (allocation may fail) */

	core = 1000000 * atof(argv);	/* MHz -> Hz */
	NTIMES = notests;

	/* Bytes moved per kernel pass: 2 arrays touched for Copy/Scale,
	   3 arrays for Add/Triad. */
	double bytes[4] = {
	    2 * sizeof(double) * N,
	    2 * sizeof(double) * N,
	    3 * sizeof(double) * N,
	    3 * sizeof(double) * N
	};
	double scalar;
	double *times[4];
	for (ti = 0; ti < 4; ti++) {
		times[ti] = new double[NTIMES];
		/* NULL check only fires with pre-standard (nothrow) operator new,
		   which the old compilers named in the header comments provided. */
		if (times[ti] == NULL) {
			print_results("\nError allocating arrays. Quitting.\n");
			exit(0);
		}
	}

	/* --- SETUP --- determine precision and check timing --- */

	print_results(HLINE);
	BytesPerWord = sizeof(double);
	print_results("This system uses %d bytes per DOUBLE PRECISION word.\n",
		BytesPerWord);

	print_results(HLINE);
	print_results("Array size = %d, Offset = %d\n", N, OFFSET);
	print_results("Total memory required = %.1f MB.\n",
		((3*N + 2*CACHEBLOW) * BytesPerWord) / 1048576.0);
	print_results("Each test is run %d times, but only\n", NTIMES);
	print_results("the *best* time for each is used.\n");

	/* Get initial value for system clock. */

	print_results(HLINE);

	if ((quantum = checktick(core)) >= 1)
		print_results("Your clock granularity/precision appears to be "
			"%d microseconds.\n", quantum);
	else
		print_results("Your clock granularity appears to be "
			"less than one microsecond.\n");

	/* --- MAIN LOOP --- repeat test cases NTIMES times --- */
	scalar = 3.0;
	for (k = 0; k < NTIMES; k++) {
		a = new double[N];
		b = new double[N];
		c = new double[N];
		d = new double[CACHEBLOW];	/* cache-flush scratch arrays */
		e = new double[CACHEBLOW];
		if (a == NULL || b == NULL || c == NULL || d == NULL || e == NULL) {
			/* Release whatever was obtained before bailing out
			   (the original leaked the successful allocations here).
			   delete[] on NULL is a no-op. */
			delete [] a;
			delete [] b;
			delete [] c;
			delete [] d;
			delete [] e;
			break;
		}

		for (j = 0; j < N; j++) {
			a[j] = 1.0;
			b[j] = 2.0;
			c[j] = 0.0;
		}
		for (j = 0; j < CACHEBLOW; j++) {
			d[j] = 1.0;
			e[j] = 3.0;
		}
		/* Dirty up the cache so each kernel starts cold. */
		for (j = 0; j < CACHEBLOW; j++)
			d[j] = e[j];

		times[0][k] = second(core);
		for (j = 0; j < N; j++)
			c[j] = a[j];			/* Copy */
		times[0][k] = second(core) - times[0][k];

		/* dirty up the caches again between kernels */
		for (j = 0; j < CACHEBLOW; j++)
			d[j] = 5.0E0 * e[j];

		times[1][k] = second(core);
		for (j = 0; j < N; j++)
			b[j] = scalar * c[j];		/* Scale */
		times[1][k] = second(core) - times[1][k];

		for (j = 0; j < CACHEBLOW; j++)
			e[j] = 1.5E0 * d[j];

		times[2][k] = second(core);
		for (j = 0; j < N; j++)
			c[j] = a[j] + b[j];		/* Add */
		times[2][k] = second(core) - times[2][k];

		for (j = 0; j < CACHEBLOW; j++)
			e[j] = 0.5E0 * d[j];

		times[3][k] = second(core);
		for (j = 0; j < N; j++)
			a[j] = b[j] + scalar * c[j];	/* Triad */
		times[3][k] = second(core) - times[3][k];

		delete [] a;
		delete [] b;
		delete [] c;
		delete [] d;
		delete [] e;
		completed++;
	}

	/* --- SUMMARY --- only over repetitions that actually ran; the
	   original iterated to NTIMES and read uninitialized slots after an
	   allocation failure. */
	for (k = 0; k < completed; k++) {
		for (j = 0; j < 4; j++) {
			rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
		}
	}

	/* Dump raw times; the original wrote through a NULL FILE* when the
	   open failed. */
	time_file = fopen("times.txt", "w");
	if (time_file != NULL) {
		fprintf(time_file, "copy\tscale\tadd\ttriad\n");
		for (ti = 0; ti < completed; ti++) {
			fprintf(time_file, "%f\t%f\t%f\t%f\n",
				times[0][ti],
				times[1][ti],
				times[2][ti],
				times[3][ti]);
		}
		fclose(time_file);
	}
	else {
		print_results("Warning: could not open times.txt for writing.\n");
	}
	for (ti = 0; ti < 4; ti++) {
		delete [] times[ti];
	}

	if (completed == 0) {
		print_results("\nError allocating test arrays; no results.\n");
		return 0;
	}
	print_results("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
	for (j = 0; j < 4; j++) {
		rmstime[j] = sqrt(rmstime[j] / (double)completed);

		print_results("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
			1.0E-06 * bytes[j] / mintime[j],
			rmstime[j],
			mintime[j],
			maxtime[j]);
	}
	return 0;
}

# define	M	20

int checktick(double core_freq)
{
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = second(core_freq);
	while( ((t2=second(core_freq)) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, MAX(Delta,0));
	}

    return(minDelta);
}

#include <time.h>
#pragma optimize( "", off )

double second(double core_freq)
{
	/*
	 * Return a timestamp in seconds derived from the CPU time-stamp
	 * counter (RDTSC).  core_freq is the core clock frequency in Hz,
	 * used to convert raw ticks to seconds.  x86/x86-64 only.
	 *
	 * __volatile__ keeps the compiler from hoisting or merging the TSC
	 * reads -- the old "#pragma optimize" guard is MSVC-only and does
	 * nothing under gcc.
	 */
	unsigned long clk_time_hi = 0;
	unsigned long clk_time_lo = 0;
	double clk_time = 0.0;

	__asm__ __volatile__ ("rdtsc"
		: "=d"(clk_time_hi), "=a"(clk_time_lo)
		: /* no inputs */);

	/* Combine the halves: the high word is worth 2^32 = 4294967296 low
	   ticks.  The original multiplied by 4294967295.0 (2^32 - 1), which
	   skews every timestamp once the counter crosses a 32-bit boundary. */
	clk_time = (double)clk_time_hi * 4294967296.0;
	clk_time += (double)clk_time_lo;
	return (clk_time / core_freq);
}

#include "mpi.h"
#include <stdarg.h>
int rank, linenum;
char results[100][128];

int main(int argc, char **argv){
  char line[128];
  int notests = 10;
  char speed[20];
  int size, rank, partner, namelen, i;
  char name[MPI_MAX_PROCESSOR_NAME];
  MPI_Status stat;

  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Get_processor_name(name, &namelen);

  linenum = 0;
//   Just open /proc/info and determine clock speed
//   In case of an SMP, we'll just take the first CPU speed

  FILE *fptr=fopen("/proc/cpuinfo","r");  

  if (fptr != NULL) {
    int i;
    for(i=0; i<128; i++){
      results[i][0]='\0';
    }
    while (!feof(fptr)){
      fgets(line, 128, fptr);
      if(strncmp("cpu MHz", line, 7) == 0){
	strtok(line, ":");
	sprintf(speed,"%s",strtok(NULL,": \n"));
	break;
      }
    }

    print_results(HLINE);
    print_results(HLINE);
    print_results("**** Rank: %d, Name: %s, Speed (in MHz): %s ****\n",rank,name,speed);
    stream(speed,notests);
  }
  else {
    print_results("Unable to open file /proc/cpuinfo\n");
  }
  
  if (rank == 0){
    for(partner=1;partner<size;partner++){
      char rcv_str[128];
      fflush(stdout);
      rcv_str[0]='\0';
      for(i=0; i<128; i++){
	MPI_Recv(rcv_str, 128, MPI_BYTE, partner,1,
		 MPI_COMM_WORLD, &stat);
	fputs(rcv_str,stdout);
      }
    }
  }
  else{
    // Send over results array to display on node with rank 0
    // Easiest way is to send each line separately
    for(i=0; i<128; i++) {
      char *str = results[i];
      MPI_Send(str, strlen(str)+1, MPI_BYTE, 0,1,
	       MPI_COMM_WORLD);
    }
  }
  MPI_Finalize();
  return 0;
}


void print_results(const char *fmt, ...){
  // This function must be treated like printf except that it prints to results array
  // The idea here is to simply keep printing all output/errors to an array which
  // will be sent to rank 0 in the end for display

  va_list ap;
  va_start(ap, fmt);
  // if this is rank 0 print to screen else print to results
  if (rank == 0){
    vprintf (fmt, ap);
  }
  // Bound by the actual row count of results (100); the original allowed
  // linenum up to 126, writing past the end of the 100-row array.
  else if (linenum < (int)(sizeof(results) / sizeof(results[0]))){
    // vsnprintf always NUL-terminates within the 128-byte row
    vsnprintf (results[linenum], sizeof(results[linenum]), fmt, ap);
    linenum++;
  }
  va_end(ap);
}

#pragma optimize( "", on ) 
