/*
 * NOTE(review): this file had five #include directives whose header names
 * were lost. <stdio.h> (printf) and <float.h> (FLT_MAX) are restored because
 * code in this file uses them. The remaining three could not be recovered;
 * the upstream STREAM benchmark is believed to use <math.h>, <limits.h> and
 * <sys/time.h> -- TODO confirm against the OS-9000 port and restore them.
 */
# include <stdio.h>
# include <float.h>

/* Timer used by the benchmark; declared here, defined outside this section. */
double second(void);

/*
 * Program: Stream
 * Programmer: Joe R. Zagar; ported to OS-9000: Ray Van De Walker
 * Revision: 4.0-BETA, October 24, 1995
 * Original code developed by John D. McCalpin
 *
 * This program measures memory transfer rates in MB/s for simple
 * computational kernels coded in C.  These numbers reveal the quality
 * of code generation for simple uncacheable kernels as well as showing
 * the cost of floating-point operations relative to memory accesses.
 *
 * INSTRUCTIONS:
 *
 *      1) Stream requires a good bit of memory to run.  Adjust the
 *         value of 'N' (below) to give a 'timing calibration' of
 *         at least 20 clock-ticks.  This will provide rate estimates
 *         that should be good to about 5% precision.
 */

# define N       10000          /* was a million; the i860 data cache is 4k */
# define TOMBYTE (1000000/N)    /* integer quotient: 100 when N is 10000 */
# define NTIMES  10             /* done 10 times */
# define OFFSET  0

/*
 *      3) Compile the code with full optimization.  Many compilers
 *         generate unreasonably bad code before the optimizer tightens
 *         things up.  If the results are unreasonably good, on the
 *         other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_d.c second.c -o stream_d -lm
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *      4) Mail the results to mccalpin@cs.virginia.edu
 *         Be sure to include:
 *              a) computer hardware model number and software revision
 *              b) the compiler flags
 *              c) all of the output from the test case.
 *         Thanks!
* */ # define HLINE "-------------------------------------------------------------\n" # ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) # endif # ifndef MAX # define MAX(x,y) ((x)>(y)?(x):(y)) # endif static double a[N+OFFSET], b[N+OFFSET], c[N+OFFSET]; static double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; static char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "}; static double bytes[4] = { 2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N }; int main(void) { int quantum, checktick(void); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); /* Get initial value for system clock. */ for (j=0; j= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else printf("Your clock granularity appears to be " "less than one microsecond.\n"); t = second(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (second() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k