stream results for T3E

From: Charles Grassl (cmg@cray.com)
Date: Fri Apr 18 1997 - 14:02:53 CDT


John;

For your information, below are "stream" results for a 450 MHz CRAY T3E with
512 PEs.

A copy of the program is also included below.

Regards,
Charles Grassl

 Number of iterations: 10
 Size of Arrays: 501 Kwords

                              Bandwidth (Mbyte/s)
   PEs Copy Scale Sum Triad Saxpy 1 Load 1 Store
 ------- --------- --------- --------- --------- --------- --------- ---------
     1 484. 482. 571. 568. 672. 435. 306.
     2 935. 935. 1088. 1118. 1333. 859. 604.
     4 1877. 1869. 2147. 2230. 2668. 1719. 1185.
     8 3747. 3739. 4167. 4471. 5305. 3476. 2193.
    16 7497. 7477. 8042. 8828. 10655. 6863. 4218.
    32 14968. 14940. 15935. 17148. 21323. 13726. 8394.
    64 30066. 29694. 31051. 34168. 42399. 27431. 16803.
   128 60175. 58707. 57177. 64952. 84065. 54710. 33363.
   256 119483. 116861. 107532. 126061. 165733. 109631. 66738.
   512 240428. 233501. 243368. 265803. 331126. 210628. 133439.

      program Stream
      integer ofst,aoff,boff,coff
      parameter (N = 501*1024, NTIMES = 10)
      parameter (ofst = 8*1024)

      real*8 a(N+8*1024),b(N+8*1024),c(N+8*1024)
      common /acom/a,b,c

      real times(7)
      real avetime(7),rmstime(7),mintime(7),maxtime(7)
      character*10 label(7)
      integer bytes(7),flops(7)
      data avetime/7*0.0/
      data rmstime/7*0.0/
      data mintime/7*1.0e+36/
      data maxtime/7*0.0/

      data label/' Copy',
     . ' Scale',
     . ' Sum',
     . ' Triad',
     . ' Saxpy',
     . ' 1 Load',
     . '1 Store'/

      parameter (nbpw = 8 )
      data bytes/2,2,3,3,3,1,1/
      data flops/0,1,1,2,2,1,0/
      integer shmem_my_pe,shmem_n_pes
      round_up(ix,index) = ((ix + (index-1))/index)*index

      me = shmem_my_pe()
      numpes = shmem_n_pes()

      istart_a = loc(a)
      istart_b = loc(b)
      istart_c = loc(c)

      istart_a = (round_up(istart_a,8092) - istart_a)/8 + 1
      istart_b = (round_up(istart_b,8092) - istart_b)/8 + 1
      istart_c = (round_up(istart_c,8092) - istart_c)/8 + 1

      if ( me .eq. 0 ) then
        write(6,9020) numpes,ntimes,n/1024
        write(6,9030) (label(j),j=1,7)
      end if

      do 1000 k=1,NTIMES

        call bandwidth(n,a(istart_a), b(istart_b), c(istart_c), times)

        avetime = avetime + times
        rmstime = rmstime + times**2
        mintime = min( mintime, times )
        maxtime = max( maxtime, times )

        if ( me .eq. 0) then
        write(6,9040) k,(N*bytes(j)*nbpw*numpes/times(j)/1.0e6,j=1,7)
        end if

 1000 continue

      if ( me .eq. 0) then
      avetime = avetime/NTIMES
      write(6,9050) (N*bytes(j)*nbpw*numpes/mintime(j)/1.0e6,j=1,7),
     . (N*bytes(j)*nbpw*numpes/avetime(j)/1.0e6,j=1,7),
     . (N*bytes(j)*nbpw*numpes/maxtime(j)/1.0e6,j=1,7)
      end if

      call exit()

 9020 format( /' *** STREAM benchmark ***'
     . /' Number of PEs: ',i8
     . /' Number of iterations: ',i8
     . /' Size of Arrays: ',i8,' Kwords')
 9030 format(// 30(' '),'Bandwidth (Mbyte/s)'
     . /' Iterat. ',7a10,
     . / (' -------'),7(' ---------'))
 9040 format ( i6,2x,7f10.0)
 9050 format (/' Max: ',7f10.0/
     . ' Ave: ',7f10.0/
     . ' Min: ',7f10.0/)
      end

      subroutine dummysub(a,b,c,n)
      return
      end

      subroutine bandwidth(n,a,b,c,times)

      real*8 a(n),b(n),c(n)
      real*8 times(7)
      data scalar /3.0/
 
      common /bandcom/ sum

      timer()=0.001*timef()
       timer()=3.333e-9*rtc()

      t = timer()
      do j=1,N
          c(j) = a(j)
      end do
      call barrier
      times(1) = timer()-t
 
      call dummysub(a,b,c,n)

      t = timer()
      do j=1,N
          c(j) = scalar*a(j)
      end do
      call barrier
      times(2) = timer()-t
 
      call dummysub(a,b,c,n)

      t = timer()
      do j=1,N
          c(j) = a(j) + b(j)
      end do
      call barrier
      times(3) = timer()-t
 
      call dummysub(a,b,c,n)

      t = timer()
      do j=1,N
        c(j) = a(j)+scalar*b(j)
      end do
      call barrier
      times(4) = timer()-t
 
      call dummysub(a,b,c,n)

      t = timer()
      do j=1,N
        c(j) = c(j)+scalar*b(j)
      end do
      call barrier
      times(5) = timer()-t
 
      call dummysub(a,b,c,n)

      sum = 0.
      t = timer()
      do j=1,N
        sum = sum + a(j)
      end do
      call barrier
      times(6) = timer()-t
 
      call dummysub(a,b,c,n)

      t = timer()
      do j=1,N
        c(j) = 0.
      end do
      call barrier
      times(7) = timer()-t
 
      call dummysub(a,b,c,n)

      return
      end



This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:06 CDT