# include #include # include # include # include # include /* STREAM_L is a port of the original STREAM code from John McCalpin and Joe Zagar to the Intel(r) Architecture and Linux platform. It addresses the known problem of the 8253 timer used on PCs by re-writing the timer routine using the RDTSC assembler instruction. It further refines the original benchmark such that memory is repeatedly de-allocated and re-allocated between individual test runs, capturing benchmark performance dependence on OS virtual memory allocation. Compile with full optimization for speed. Compile with: g++ -O3 -march=pentiumpro stream_l.cpp -funroll-loops Use egcs to get Pentium(r) Pro Processor floating point optimizations This code is provided "as is" with no representations or warranties of any kind, including non-infringement. thanks, mason.cabot@intel.com Platform Architecture Lab Intel Corporation mason.cabot@intel.com April 1999. */ /* * Program: Stream * Programmer: Joe R. Zagar * Revision: 4.0-BETA, October 24, 1995 * Original code developed by John D. McCalpin * * This program measures memory transfer rates in MB/s for simple * computational kernels coded in C. These numbers reveal the quality * of code generation for simple uncacheable kernels as well as showing * the cost of floating-point operations relative to memory accesses. * * INSTRUCTIONS: * * 1) Stream requires a good bit of memory to run. Adjust the * value of 'N' (below) to give a 'timing calibration' of * at least 20 clock-ticks. This will provide rate estimates * that should be good to about 5% precision. */ #define N 999936 #define CACHEBLOW 131072 # define OFFSET 0 /* * 3) Compile the code with full optimization. Many compilers * generate unreasonably bad code before the optimizer tightens * things up. If the results are unreasonably good, on the * other hand, the optimizer might be too smart for me! 
* * Try compiling with: * cc -O stream_d.c second.c -o stream_d -lm * * This is known to work on Cray, SGI, IBM, and Sun machines. * * * 4) Mail the results to mccalpin@cs.virginia.edu * Be sure to include: * a) computer hardware model number and software revision * b) the compiler flags * c) all of the output from the test case. * Thanks! * */ # define HLINE "-------------------------------------------------------------\n" # ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) # endif # ifndef MAX # define MAX(x,y) ((x)>(y)?(x):(y)) # endif static double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; static char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; double second(double core_freq); int main(int argc, char **argv) { double *a, *b, *c, *d, *e; double *big; int NTIMES; int quantum, checktick(double); int BytesPerWord; register int j, k; FILE *time_file=NULL; int ti=0; double core=0.0; if (argc != 3) { printf ("\n usage: %s \n", argv[0]); return 1; } core=1000000*atof(argv[1]); NTIMES=atoi(argv[2]); double bytes[4] = { 2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N }; double scalar, t; double *times[4]; for (ti=0; ti<4; ti++) { times[ti]=new double[NTIMES]; if (times[ti]==NULL) { printf ("\nError allocating arrays. Quitting.\n"); exit(0); } } /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", ((3*N + 2*CACHEBLOW) * BytesPerWord) / 1048576.0); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); /* Get initial value for system clock. 
*/
    printf(HLINE);
    if ( (quantum = checktick(core)) >= 1)
        printf("Your clock granularity/precision appears to be "
               "%d microseconds.\n", quantum);
    else
        printf("Your clock granularity appears to be "
               "less than one microsecond.\n");

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k
/*
 * NOTE(review): main() breaks off at this point in this copy of the file.
 * The rest of the loop header and everything that followed it inside main()
 * is missing (presumably also checktick(), which is called above but is not
 * defined anywhere in the visible text).  Nothing has been reconstructed;
 * the fragment above is carried through unchanged.
 */

#pragma optimize( "", off )
/*
 * second -- read the processor time-stamp counter and scale it.
 *
 * core_freq: divisor applied to the raw tick count.  When it is the counter
 *            rate in ticks per second the result is in seconds.  It is not
 *            validated here; a value of 0 yields a division by zero.
 *
 * Returns the 64-bit TSC value (EDX:EAX after RDTSC) converted to double and
 * divided by core_freq.
 *
 * Notes:
 *  - RDTSC delivers the counter as two 32-bit halves, so the high half must
 *    be scaled by 2^32 = 4294967296.  The previous factor, 4294967295
 *    (2^32 - 1), made the result one tick short for every wrap of the low
 *    half.
 *  - The asm statement is marked volatile.  It has no inputs, so without the
 *    qualifier GCC is allowed to treat two reads as the same value or move a
 *    read out of a timed region.  The "#pragma optimize" lines around this
 *    function are an MSVC directive and do not protect the GCC build.
 *  - The MSVC inline-assembly variant (emitting bytes 0x0F 0x31 and copying
 *    eax/edx) was only ever present as commented-out code and is not kept.
 */
double second(double core_freq)
{
    unsigned int tsc_hi = 0;   /* upper 32 bits, returned in EDX */
    unsigned int tsc_lo = 0;   /* lower 32 bits, returned in EAX */

    __asm__ __volatile__ ("rdtsc"
                          : "=d"(tsc_hi), "=a"(tsc_lo)
                          : /* no inputs */);

    /* Recombine the halves: hi * 2^32 + lo. */
    double ticks = (double)tsc_hi * 4294967296.0;
    ticks += (double)tsc_lo;

    return ticks / core_freq;
}
#pragma optimize( "", on )