! Program: Stream ! Programmer: John D. McCalpin ! Revision: 4.1, June 4, 1996 ! Translated to Fortran 90 June 2, 1997 ! ! This program measures memory transfer rates in MB/s for simple ! computational kernels coded in Fortran. These numbers reveal the ! quality of code generation for simple uncacheable kernels as well ! as showing the cost of floating-point operations relative to memory ! accesses. ! !========================================================================= ! INSTRUCTIONS: ! 1) Stream requires a cpu timing function called second(). ! A sample is shown below. This is unfortunately rather ! system dependent. The code attempts to determine the ! granularity of the clock to help interpret the results. ! For dedicated or parallel runs, you might want to comment ! these out and compile/link with "wallclock.c". ! 2) Stream requires a good bit of memory to run. ! Adjust the Parameter 'N' in the main program to give ! a 'timing calibration' of at least 20 clicks. ! This will provide rate estimates that should be good to ! about 5% precision. ! ------------------------------------------------------------ ! Note that you are free to use any array length and offset ! that makes each array larger than the last-level cache. ! The intent is to determine the *best* sustainable bandwidth ! available with this simple coding. Of course, lower values ! are usually fairly easy to obtain on cached machines, but ! by keeping the test to the *best* results, the answers are ! easier to interpret. ! You may put the arrays in common or not, at your discretion. ! There is a commented-out COMMON statement below. ! ------------------------------------------------------------ ! 3) Compile the code with full optimization. Many compilers ! generate unreasonably bad code before the optimizer tightens ! things up. If the results are unreasonably good, on the ! other hand, the optimizer might be too smart for me ! Please let me know if this happens. ! 4) Mail the results to john@mccalpin.com ! Be sure to include: ! a) computer hardware model number and software revision ! b) the compiler flags ! c) all of the output from the test case. ! ! Thanks !========================================================================= ! PROGRAM stream IMPLICIT NONE ! .. Parameters .. INCLUDE 'size.h' INTEGER, PARAMETER:: npages=n/2048+1 ! .. ! .. Local Scalars .. DOUBLE PRECISION dummy,scalar,t INTEGER j,k,nbpw,quantum,iflag,tflag ! .. ! .. Local Arrays .. DOUBLE PRECISION maxtime(4),mintime(4),rmstime(4),sum(3),times(4,ntimes),mean(4),median(4) INTEGER bytes(4),apages(npages),bpages(npages),cpages(npages) CHARACTER label(4)*11 ! .. ! .. External Functions .. DOUBLE PRECISION second INTEGER checktick,realsize,addr2node EXTERNAL second,checktick,realsize,addr2node ! .. ! .. Intrinsic Functions .. ! INTRINSIC dble,max,min,nint,sqrt ! .. ! .. Arrays in Common .. DOUBLE PRECISION a(ndim),b(ndim),c(ndim) !********************************$DISTRIBUTE a(BLOCK), b(BLOCK), c(BLOCK) ! .. ! .. Common blocks .. COMMON a,b,c ! .. ! .. Data statements .. DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/ DATA label/'Copy: ','Scale: ','Add: ','Triad: '/ DATA bytes/2,2,3,3/,dummy/0.0d0/,tflag/-1/ ! .. ! --- SETUP --- determine precision and check timing --- nbpw = realsize() WRITE (*,FMT=9010) 'Array size = ',n WRITE (*,FMT=9010) 'Offset = ',offset WRITE (*,FMT=9020) 'The total memory requirement is ',3*nbpw*n/ (1024*1024),' MB' WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times' WRITE (*,FMT=9030) 'The *best* time for each test is used' a(:) = 1.0d0 ! define everything b(:) = 2.0d0 c(:) = 0.0d0 t = second(tflag) a(1:n) = 2.0d0*a(1:n) ! test basic timing t = second(tflag) - t PRINT *,'----------------------------------------------------' quantum = checktick() WRITE (*,FMT=9000) 'Your clock granularity/precision appears to be ',quantum,' microseconds' PRINT *,'The tests below will each take a time on the order ' PRINT *,'of ',nint(t*1d6),' microseconds' PRINT *,' (= ',nint((t*1d6)/quantum),' clock ticks)' PRINT *,'Increase the size of the arrays if this shows that' PRINT *,'you are not getting at least 20 clock ticks per test.' PRINT *,'----------------------------------------------------' PRINT *,'WARNING -- The above is only a rough guideline.' PRINT *,'For best results, please be sure you know the' PRINT *,'precision of your system timer.' PRINT *,'----------------------------------------------------' ! --- MAIN LOOP --- repeat test cases NTIMES times --- scalar = 1.5d0*a(1) DO 70 k = 1,ntimes t = second(tflag) c(1:n) = a(1:n) ! COPY t = second(tflag) - t times(1,k) = t t = second(tflag) b(1:n) = scalar*c(1:n) ! SCALE t = second(tflag) - t times(2,k) = t t = second(tflag) c(1:n) = a(1:n) + b(1:n) ! ADD t = second(tflag) - t times(3,k) = t t = second(tflag) a(1:n) = b(1:n) + scalar*c(1:n) ! TRIAD t = second(tflag) - t times(4,k) = t 70 CONTINUE ! --- SUMMARY --- DO k = 1,ntimes DO j = 1,4 rmstime(j) = rmstime(j) + times(j,k)**2 mintime(j) = min(mintime(j),times(j,k)) maxtime(j) = max(maxtime(j),times(j,k)) mean(j) = mean(j) + times(j,k) END DO END DO WRITE (*,FMT=9040) DO 100 j = 1,4 rmstime(j) = sqrt(rmstime(j)/dble(ntimes)) mean(j) = mean(j)/dble(ntimes) IF (mod(ntimes,2).eq.0) THEN median(j) = 0.5d0*(times(j,ntimes/2)+times(j,ntimes/2+1)) ELSE median(j) = times(j,ntimes/2+1) END IF WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6, & mintime(j),maxtime(j),mean(j),rmstime(j),median(j) 100 CONTINUE WRITE (*,FMT='(a)') '-------------------------------------------------------------------------------' PRINT *,'All times are' DO k=1,ntimes WRITE (*,'(4(f10.4,2x))') (times(j,k),j=1,4) END DO WRITE (*,FMT='(a)') '-------------------------------------------------------------------------------' sum(1) = 0.0d0 sum(2) = 0.0d0 sum(3) = 0.0d0 scalar = 1.0d0/dble(n)/dble(ntimes) DO 110 j = 1,n sum(1) = sum(1) + scalar*a(j) sum(2) = sum(2) + scalar*b(j) sum(3) = sum(3) + scalar*c(j) 110 CONTINUE PRINT *,'Sum of a is = ',sum(1) PRINT *,'Sum of b is = ',sum(2) PRINT *,'Sum of c is = ',sum(3) 9000 FORMAT (1x,a,i6,a) 9010 FORMAT (1x,a,i10) 9020 FORMAT (1x,a,i8,a) 9030 FORMAT (1x,a,i8,a,a) 9040 FORMAT ('Function',5x,'Rate (MB/s) Min time Max time',' Mean time RMS time Median') 9050 FORMAT (a,f10.2,2x,6(f9.4,2x)) END !------------------------------------- ! INTEGER FUNCTION dblesize() ! ! A semi-portable way to determine the precision of DOUBLE PRECISION ! in Fortran. ! Here used to guess how many bytes of storage a DOUBLE PRECISION ! number occupies. ! INTEGER FUNCTION realsize() ! IMPLICIT NONE ! .. Local Scalars .. DOUBLE PRECISION result,test INTEGER j,ndigits ! .. ! .. Local Arrays .. DOUBLE PRECISION ref(30) ! .. ! .. External Subroutines .. EXTERNAL confuse ! .. ! .. Intrinsic Functions .. INTRINSIC abs,acos,log10,sqrt ! .. ! Test #1 - compare single(1.0d0+delta) to 1.0d0 10 DO 20 j = 1,30 ref(j) = 1.0d0 + 10.0d0** (-j) 20 CONTINUE DO 30 j = 1,30 test = ref(j) ndigits = j CALL confuse(test,result) IF (test.EQ.1.0D0) THEN GO TO 40 END IF 30 CONTINUE GO TO 50 40 WRITE (*,FMT='(a)') '----------------------------------------------' WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',ndigits,' digits of accuracy' IF (ndigits.LE.8) THEN realsize = 4 ELSE realsize = 8 END IF WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,' bytes per DOUBLE PRECISION word' WRITE (*,FMT='(a)') '----------------------------------------------' RETURN 50 PRINT *,'Hmmmm. I am unable to determine the size.' PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION number : ' READ (*,FMT=*) realsize IF (realsize.NE.4 .AND. realsize.NE.8) THEN PRINT *,'Your answer ',realsize,' does not make sense.' PRINT *,'Try again.' PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION number : ' READ (*,FMT=*) realsize END IF PRINT *,'You have manually entered a size of ',realsize,' bytes per DOUBLE PRECISION number' WRITE (*,FMT='(a)') '----------------------------------------------' END SUBROUTINE confuse(q,r) ! IMPLICIT NONE ! .. Scalar Arguments .. DOUBLE PRECISION q,r ! .. ! .. Intrinsic Functions .. INTRINSIC cos ! .. r = cos(q) RETURN END ! A semi-portable way to determine the clock granularity ! Adapted from a code by John Henning of Digital Equipment Corporation ! INTEGER FUNCTION checktick() ! IMPLICIT NONE ! .. Parameters .. INTEGER n PARAMETER (n=20) ! .. ! .. Local Scalars .. DOUBLE PRECISION dummy,t1,t2 INTEGER i,j,jmin ! .. ! .. Local Arrays .. DOUBLE PRECISION timesfound(n) ! .. ! .. External Functions .. DOUBLE PRECISION second EXTERNAL second ! .. ! .. Intrinsic Functions .. INTRINSIC max,min,nint ! .. i = 0 dummy = 0.0d0 t1 = second(dummy) 10 t2 = second(dummy) IF (t2.EQ.t1) GO TO 10 t1 = t2 i = i + 1 timesfound(i) = t1 IF (i.LT.n) GO TO 10 jmin = 1000000 DO 20 i = 2,n j = nint((timesfound(i)-timesfound(i-1))*1d6) jmin = min(jmin,max(j,0)) 20 CONTINUE IF (jmin.GT.0) THEN checktick = jmin ELSE PRINT *,'Your clock granularity appears to be less than one microsecond' checktick = 1 END IF RETURN ! PRINT 14, timesfound(1)*1d6 ! DO 20 i=2,n ! PRINT 14, timesfound(i)*1d6, ! & nint((timesfound(i)-timesfound(i-1))*1d6) ! 14 FORMAT (1X, F18.4, 1X, i8) ! 20 CONTINUE END