Tweaked benchmark code

From: John, dtn 381-0378 24-May-1996 1722 (henning@perfom.ENET.dec.com)
Date: Fri May 24 1996 - 16:32:20 CDT


* Program: Stream
* Revised 5/3/96 - F90/HPF version - Jonathan P. Harris, Digital Equipment Corporation
*
* Programmer: John D. McCalpin
* Revision: 4.0, Aug 30, 1995
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran. These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
*=========================================================================
* INSTRUCTIONS:
* 1) Stream requires a cpu timing function called second().
* A sample is shown below. This is unfortunately rather
* system dependent. The code attempts to determine the
* granularity of the clock to help interpret the results.
* For dedicated or parallel runs, you might want to comment
* these out and compile/link with "wallclock.c".
* 2) Stream requires a good bit of memory to run.
* Adjust the Parameter 'N' in the main program to give
* a 'timing calibration' of at least 20 clock ticks.
* This will provide rate estimates that should be good to
* about 5% precision.
* ------------------------------------------------------------
* Note that you are free to use any array length and offset
* that makes each array larger than the last-level cache.
* The intent is to determine the *best* sustainable bandwidth
* available with this simple coding. Of course, lower values
* are usually fairly easy to obtain on cached machines, but
* by keeping the test to the *best* results, the answers are
* easier to interpret.
* You may put the arrays in common or not, at your discretion.
* There is a commented-out COMMON statement below.
* ------------------------------------------------------------
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me.
* Please let me know if this happens.
* 4) Mail the results to mccalpin@udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
*
* Thanks
*=========================================================================
*
      PROGRAM stream
* IMPLICIT NONE
C .. Parameters ..
      INTEGER n,offset,ndim,ntimes
      PARAMETER (n=32000000,offset=0,ndim=n+offset,ntimes=10)
C ..
C .. Local Scalars ..
      DOUBLE PRECISION dummy,scalar,t
      INTEGER j,k,nbpw,quantum
C ..
C .. Local Arrays ..
      DOUBLE PRECISION maxtime(4),mintime(4),rmstime(4),
     $ times(4,ntimes)
      INTEGER bytes(4)
      CHARACTER label(4)*11
C ..
C .. External Functions ..
      DOUBLE PRECISION second
      INTEGER checktick,realsize
      EXTERNAL second,checktick,realsize
C ..
C .. Intrinsic Functions ..
C
      INTRINSIC dble,nint,sqrt
C ..
C .. Arrays in Common ..
      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
CHPF$ distribute (block) :: a,b,c
C ..
C .. Common blocks ..
* COMMON a,b,c
C ..
C .. Data statements ..
      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
      DATA label/'Assignment:','Scaling :','Summing :',
     $ 'SAXPYing :'/
      DATA bytes/2,2,3,3/,dummy/0.0d0/
C ..

* --- SETUP --- determine precision and check timing ---

      nbpw = realsize()

      WRITE (*,FMT=9010) 'Array size = ',n
      WRITE (*,FMT=9010) 'Offset = ',offset
      WRITE (*,FMT=9020) 'The total memory requirement is ',
     $ 3*nbpw*n/ (1024*1024),' MB'
      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
      WRITE (*,FMT=9030) 'The *best* time for each test is used'

      a = 1.0d0
      b = 2.0d0
      c = 0.0d0

      t = second(dummy)
      a = 2.0d0*a
      t = second(dummy) - t

      PRINT *,'----------------------------------------------------'

      quantum = checktick()

      WRITE (*,FMT=9000)
     $ 'Your clock granularity/precision appears to be ',quantum,
     $ ' microseconds'
      PRINT *,'The tests below will each take a time on the order '
      PRINT *,'of ',nint(t*1d6),' microseconds'
      PRINT *,' (= ',nint((t*1d6)/quantum),' clock ticks)'
      PRINT *,'Increase the size of the arrays if this shows that'
      PRINT *,'you are not getting at least 20 clock ticks per test.'
      PRINT *,'----------------------------------------------------'
      PRINT *,'WARNING: The above is only a rough guideline.'
      PRINT *,'For best results, please be sure you know the'
      PRINT *,'precision of your system timer.'
      PRINT *,'----------------------------------------------------'

* --- MAIN LOOP --- repeat test cases NTIMES times ---
      scalar = 1.5d0*a(1)
      DO 70 k = 1,ntimes

          t = second(dummy)
          c=a
          t = second(dummy) - t
          times(1,k) = t

          t = second(dummy)
          b = scalar*c
          t = second(dummy) - t
          times(2,k) = t

          t = second(dummy)
          c = a + b
          t = second(dummy) - t
          times(3,k) = t

          t = second(dummy)
          a = b + scalar*c
          t = second(dummy) - t
          times(4,k) = t

   70 CONTINUE

* --- SUMMARY ---

      rmstime = sqrt( sum(times**2, 2) / dble(ntimes-1))
      mintime = minval(times , 2)
      maxtime = maxval(times , 2)

      WRITE (*,FMT=9040)
      DO 100 j = 1,4
          WRITE (*,FMT=9050) label(j),n*nbpw*bytes(j)/mintime(j)/1.0D6,
     $ rmstime(j),mintime(j),maxtime(j)
  100 CONTINUE

      PRINT *, ""
      PRINT *,'Average bandwidth : ',
     $ n*nbpw*(sum(bytes/mintime)/4.0D0)/1.0D6

      PRINT *, ""
      PRINT *,'Sum of a is : ',sum(a)
      PRINT *,'Sum of b is : ',sum(b)
      PRINT *,'Sum of c is : ',sum(c)

 9000 FORMAT (1x,a,i6,a)
 9010 FORMAT (1x,a,i10)
 9020 FORMAT (1x,a,i4,a)
 9030 FORMAT (1x,a,i3,a,a)
 9040 FORMAT ('Function',5x,'Rate (MB/s) RMS time Min time Max time'
     $ )
 9050 FORMAT (a,4 (f10.4,2x))
      END

*-------------------------------------
* INTEGER FUNCTION realsize()
*
* Guess how many bytes of storage a DOUBLE PRECISION
* number occupies.
*
      INTEGER FUNCTION realsize()
* IMPLICIT NONE

C .. Local Scalars ..
      DOUBLE PRECISION test
      INTEGER j,ndigits
C ..
C .. Intrinsic Functions
      INTRINSIC precision
C ..

      ndigits = precision(test)

   40 WRITE (*,FMT='(a)')
     $ '----------------------------------------------'
      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision has ',
     $ ndigits,' digits of accuracy'
      IF (ndigits.LE.8) THEN
          realsize = 4
      ELSE
          realsize = 8
      END IF
      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
     $ ' bytes per DOUBLE PRECISION word'
      WRITE (*,FMT='(a)')
     $ '----------------------------------------------'
      RETURN
      END

* A semi-portable way to determine the clock granularity
* Adapted from a code by John Henning of Digital Equipment Corporation
*
      INTEGER FUNCTION checktick()
* IMPLICIT NONE

C .. Parameters ..
      INTEGER n
      PARAMETER (n=20)
C ..
C .. Local Scalars ..
      DOUBLE PRECISION dummy,t1,t2
      INTEGER i,j,jmin
C ..
C .. Local Arrays ..
      DOUBLE PRECISION timesfound(n)
C ..
C .. External Functions ..
      DOUBLE PRECISION second
      EXTERNAL second
C ..
C .. Intrinsic Functions ..
      INTRINSIC max,min,nint
C ..

      i = 0
      dummy = 0.0d0
      t1 = second(dummy)

   10 t2 = second(dummy)
      IF (t2.EQ.t1) GO TO 10

      t1 = t2
      i = i + 1
      timesfound(i) = t1
      IF (i.LT.n) GO TO 10

      jmin = 1000000
      DO 20 i = 2,n
          j = nint((timesfound(i)-timesfound(i-1))*1d6)
          jmin = min(jmin,max(j,0))
   20 CONTINUE

      IF (jmin.GT.0) THEN
          checktick = jmin
      ELSE
          PRINT *,'Your clock granularity appears to be less ',
     $ 'than one microsecond'
          checktick = 1
      END IF
      RETURN

      END



This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:06 CDT