Re: stream_d entry for Pentium 100

From: Russell L. Carter (rcarter@geli.com)
Date: Thu May 04 1995 - 11:23:05 CDT


John,
You were right about there being problems, so I rewrote the stream_d.c
code a bit, and corrected several glitches. But the upshot is that the
results I initially reported were essentially correct. The rate is
now slightly slower because stream_d.c was calculating MBytes as
Bytes/1e6, rather than Bytes/(1024*1024).

Here is the output:

Timing calibration 1: utime overhead = 48.995018 usec.
Timing calibration 2: array init time = 469684.004784 usec.
Increase the size of the arrays if this is < 300000
and your clock precision is =< 1/100 second.
---------------------------------------------------
Function Rate (MB/s) RMS time Min time Max time
Assignment: 84.349 0.091 0.090 0.091
Scaling : 68.201 0.112 0.112 0.112
Summing : 76.163 0.150 0.150 0.151
SAXPYing : 73.129 0.157 0.156 0.157

I've appended the precise code used to generate this output. It
was compiled with "gcc-2.6.3 -m486 -O4 -funroll-loops -ffast-math"

Regards,
Russell

/*
* Program: Stream
* Programmer: John D. McCalpin
* Revision: 2.0, September 30,1991
*
* Modified slightly to account for timing overhead and correct
* calculation of MBytes by R. L. Carter 5/4/95.
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran. These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
* INSTRUCTIONS:
* 1) (fortran-specific, omitted.)
* 2) Stream requires a good bit of memory to run.
* Adjust the '#define N' near the top of this file
* to give a 'timing calibration' of at least 20 clicks.
* This will provide rate estimates that should be good to
* about 5% precision.
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me!
* 4) Mail the results to mccalpin@perelandra.cms.udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
* Thanks!
*
* this version was ported from fortran to c by mark hahn, hahn+@pitt.edu.
*/

/* Number of double elements in each of the work arrays a, b and c. */
#define N 500000
/* Number of times main() repeats the set of four timed kernels. */
#define NTIMES 20

#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/*
 * MIN / MAX - smaller / larger of two values.
 *
 * Only defined here when no earlier header has already provided them.
 * These are plain function-like macros, so the selected argument is
 * evaluated twice; do not pass expressions with side effects.
 */
#ifndef MIN
#define MIN(lhs, rhs) (((lhs) < (rhs)) ? (lhs) : (rhs))
#endif
#ifndef MAX
#define MAX(lhs, rhs) (((lhs) > (rhs)) ? (lhs) : (rhs))
#endif

/*
 * utime - elapsed wall-clock time, in seconds, since the previous call.
 *
 * The very first call only records the current gettimeofday() reading
 * and returns 0.  Every later call returns the difference between the
 * current reading and the one taken by the previous call, then
 * remembers the current reading for the next call.
 *
 * If gettimeofday() fails, a message is written to stderr and the
 * program exits with status 1.
 *
 * The previous reading lives in function-local statics, so all callers
 * share one timer.
 */
double utime(void)
{
  static int started = 0;      /* explicit "first call" flag */
  static double oldtime = 0;   /* reading taken by the previous call */
  struct timeval tp;
  double newtime, deltatime;

  /*
   * The timezone argument of gettimeofday() is obsolete, so pass NULL
   * rather than defining a struct timezone object just to discard it.
   */
  if (gettimeofday(&tp, NULL) == -1) {
    fprintf(stderr,"Error in gettimeofday. Exiting\n");
    exit(1);
  }
  newtime = (double)tp.tv_sec + ((double)tp.tv_usec)/1e6;

  if (!started) {
    started = 1;
    oldtime = newtime;
    return 0;
  }

  deltatime = newtime - oldtime;
  oldtime = newtime;
  return deltatime;
}

/*
 * Work arrays for the kernels in main(), N doubles each.  They have
 * static storage duration and internal linkage.  In the timed loops
 * a and b are read and c is written.
 */
static double a[N],b[N],c[N];

int main() {
    int j,k;
    double times[4][NTIMES];
    static double t_overhead,init_time;
    static double rmstime[4] = {0};
    static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
    static double maxtime[4] = {0};
    static char *label[4] = {"Assignment:",
                             "Scaling :",
                             "Summing :",
                             "SAXPYing :"};
    static double bytes[4] = { 2 * sizeof(double) * (double)N,
                              2 * sizeof(double) * (double)N,
                              3 * sizeof(double) * (double)N,
                              3 * sizeof(double) * (double)N};

    /* --- SETUP --- determine precision and check timing --- */
    /* utimeStart(); */
    utime();
    t_overhead=utime();
    printf("Timing calibration 1: utime overhead = %f usec.\n",t_overhead*1e6);
    utime();
    for (j=0; j<N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }
    init_time=utime()-t_overhead;
    printf("Timing calibration 2: array init time = %f usec.\n",init_time*1e6);
    printf("Increase the size of the arrays if this is < 300000\n"
           "and your clock precision is =< 1/100 second.\n");
    printf("---------------------------------------------------\n");
    
    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
    for (k=0; k<NTIMES; k++) {
        utime();
        for (j=0; j<N; j++)
            c[j] = a[j];
        times[0][k] = utime()-t_overhead;
        
        utime();
        for (j=0; j<N; j++)
            c[j] = 3.0e0*a[j];
        times[1][k] = utime()-t_overhead;
        
        utime();
        for (j=0; j<N; j++)
            c[j] = a[j]+b[j];
        times[2][k] = utime()-t_overhead;
        
        utime();
        for (j=0; j<N; j++)
            c[j] = a[j]+3.0e0*b[j];
        times[3][k] = utime()-t_overhead;
        for (j=0; j<N; j++) {
            a[j] = c[j];
            b[j] = 1.1*a[j];
          }
        if (a[1]<0) {
          printf("Making a dependency: %f\n",c[N-1]);
        }

    }
    
    /* --- SUMMARY --- */
    for (k=0; k<NTIMES; k++) {
        for (j=0; j<4; j++) {
            rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
        }
    }
    
    printf("Function Rate (MB/s) RMS time Min time Max time\n");
    for (j=0; j<4; j++) {
        rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);

        printf("%s%11.3f %11.3f %11.3f %11.3f\n",
               label[j],
               bytes[j]/mintime[j]/(1024*1024),
               rmstime[j],
               mintime[j],
               maxtime[j]);
    }
    return 0;
}



This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:04 CDT