# include #include # include # include # include # include # include /* This version of stream_l is based on the code by Mason Cabot. It makes use of MPI to run the benchmark on all nodes of a cluster and display results on the node with rank 0. The CPU speed of each node is determined by reading /proc/cpuinfo. It doesn't treat SMP nodes any differently (from single processor ones) and in such a case, picks up the speed of the first processor listed in /proc/cpuinfo. Compilation: using mpiCC egcs (2.91) with (-march=pentiumpro -funroll-loops) options seems to give best results. When compiled with 2.96, the code does not seem to be optimized enough and hence the results suffer. This code is provided "as is" with no representations or warranties of any kind, including non-infringement. The code may be modified, and distributed under BSD license Thanks G. Venkateswara Rao Engineer, Singapore Computer Systems mail: gattameni@scs.com.sg 02-October-2002 */ /* STREAM_L is a port of the original STREAM code from John McCalpin and Joe Zagar to the Intel(r) Architecture and Linux platform. It addresses the known problem of the 8253 timer used on PCs by re-writing the timer routine using the RDTSC assembler instruction. It further refines the original benchmark such that memory is repeatedly de-allocated and re-allocated between individual test runs, capturing benchmark performance dependence on OS virtual memory allocation. Compile with full optimization for speed. Compile with: g++ -O3 -march=pentiumpro stream_l.cpp -funroll-loops Use egcs to get Pentium(r) Pro Processor floating point optimizations This code is provided "as is" with no representations or warranties of any kind, including non-infringement. thanks, mason.cabot@intel.com Platform Architecture Lab Intel Corporation mason.cabot@intel.com April 1999. */ /* * Program: Stream * Programmer: Joe R. Zagar * Revision: 4.0-BETA, October 24, 1995 * Original code developed by John D. 
McCalpin * * This program measures memory transfer rates in MB/s for simple * computational kernels coded in C. These numbers reveal the quality * of code generation for simple uncacheable kernels as well as showing * the cost of floating-point operations relative to memory accesses. * * INSTRUCTIONS: * * 1) Stream requires a good bit of memory to run. Adjust the * value of 'N' (below) to give a 'timing calibration' of * at least 20 clock-ticks. This will provide rate estimates * that should be good to about 5% precision. */ #define N 999936 #define CACHEBLOW 131072 # define OFFSET 0 /* * 3) Compile the code with full optimization. Many compilers * generate unreasonably bad code before the optimizer tightens * things up. If the results are unreasonably good, on the * other hand, the optimizer might be too smart for me! * * Try compiling with: * cc -O stream_d.c second.c -o stream_d -lm * * This is known to work on Cray, SGI, IBM, and Sun machines. * * * 4) Mail the results to mccalpin@cs.virginia.edu * Be sure to include: * a) computer hardware model number and software revision * b) the compiler flags * c) all of the output from the test case. * Thanks! 
* */ # define HLINE "-------------------------------------------------------------\n" # ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) # endif # ifndef MAX # define MAX(x,y) ((x)>(y)?(x):(y)) # endif static double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; static char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; double second(double core_freq); void print_results(const char*, ...); int stream(char *argv, int notests); int stream(char *argv, int notests) { double *a, *b, *c, *d, *e; double *big; int NTIMES; int quantum, checktick(double); int BytesPerWord; register int j, k; FILE *time_file=NULL; int ti=0; double core=0.0; core=1000000*atof(argv); NTIMES=notests; double bytes[4] = { 2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N }; double scalar, t; double *times[4]; for (ti=0; ti<4; ti++) { times[ti]=new double[NTIMES]; if (times[ti]==NULL) { print_results ("\nError allocating arrays. Quitting.\n"); exit(0); } } /* --- SETUP --- determine precision and check timing --- */ print_results(HLINE); BytesPerWord = sizeof(double); print_results("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); print_results(HLINE); print_results("Array size = %d, Offset = %d\n" , N, OFFSET); print_results("Total memory required = %.1f MB.\n", ((3*N + 2*CACHEBLOW) * BytesPerWord) / 1048576.0); print_results("Each test is run %d times, but only\n", NTIMES); print_results("the *best* time for each is used.\n"); /* Get initial value for system clock. 
*/ print_results(HLINE); if ( (quantum = checktick(core)) >= 1) print_results("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else print_results("Your clock granularity appears to be " "less than one microsecond.\n"); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k #pragma optimize( "", off ) double second(double core_freq) { unsigned long clk_time_hi=0; unsigned long clk_time_lo=0; double clk_time = 0.0; //__asm { // _emit 0x0F // _emit 0x31 // mov clk_time_lo, eax // mov clk_time_hi, edx // } asm ("rdtsc" : "=d"(clk_time_hi), "=a"(clk_time_lo) : /* no inputs */); clk_time = (double)clk_time_hi * 4294967295.0; // upshift by 32bits clk_time += (double)clk_time_lo; return (clk_time / core_freq); } #include "mpi.h" #include int rank, linenum; char results[100][128]; int main(int argc, char **argv){ char line[128]; int notests = 10; char speed[20]; int size, rank, partner, namelen, i; char name[MPI_MAX_PROCESSOR_NAME]; MPI_Status stat; MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Get_processor_name(name, &namelen); linenum = 0; // Just open /proc/info and determine clock speed // In case of an SMP, we'll just take the first CPU speed FILE *fptr=fopen("/proc/cpuinfo","r"); if (fptr != NULL) { int i; for(i=0; i<128; i++){ results[i][0]='\0'; } while (!feof(fptr)){ fgets(line, 128, fptr); if(strncmp("cpu MHz", line, 7) == 0){ strtok(line, ":"); sprintf(speed,"%s",strtok(NULL,": \n")); break; } } print_results(HLINE); print_results(HLINE); print_results("**** Rank: %d, Name: %s, Speed (in MHz): %s ****\n",rank,name,speed); stream(speed,notests); } else { print_results("Unable to open file /proc/cpuinfo\n"); } if (rank == 0){ for(partner=1;partner