* Program: Stream
* Revised 5/3/96 - F90/HPF version - Jonathan P. Harris, Digital Equipment Corporation
*
* Programmer: John D. McCalpin
* Revision: 4.0, Aug 30, 1995
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran. These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
*=========================================================================
* INSTRUCTIONS:
* 1) Stream requires a cpu timing function called second().
* A sample is shown below. This is unfortunately rather
* system dependent. The code attempts to determine the
* granularity of the clock to help interpret the results.
* For dedicated or parallel runs, you might want to comment
* these out and compile/link with "wallclock.c".
* 2) Stream requires a good bit of memory to run.
* Adjust the Parameter 'N' in the main program to give
* a 'timing calibration' of at least 20 clicks.
* This will provide rate estimates that should be good to
* about 5% precision.
* ------------------------------------------------------------
* Note that you are free to use any array length and offset
* that makes each array larger than the last-level cache.
* The intent is to determine the *best* sustainable bandwidth
* available with this simple coding. Of course, lower values
* are usually fairly easy to obtain on cached machines, but
* by keeping the test to the *best* results, the answers are
* easier to interpret.
* You may put the arrays in common or not, at your discretion.
* There is a commented-out COMMON statement below.
* ------------------------------------------------------------
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me.
* Please let me know if this happens.
* 4) Mail the results to mccalpin@udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
*
* Thanks
*=========================================================================
*
*     STREAM, F90/HPF array-syntax version.
*
*     Times four whole-array kernels (copy, scale, sum, triad) NTIMES
*     times each and prints, per kernel, the transfer rate in MB/s
*     derived from the fastest run together with the RMS, minimum
*     and maximum times.  The sums of the three arrays are printed
*     last so that the results of the kernels are actually used.
*
*     Routines called:
*       second(dummy) - timer; its result is treated as seconds.
*                       Not supplied in this file.
*       checktick()   - timer granularity in microseconds.
*       realsize()    - assumed bytes per DOUBLE PRECISION word.
*
*     NOTE(review): the kernels act on all ndim elements of a, b, c
*     while the rates are computed from n.  The two agree only when
*     offset = 0 -- confirm before using a non-zero offset.
      PROGRAM stream
      IMPLICIT NONE
C     .. Parameters ..
C     n      - array length used for the size and rate reports
C     offset - extra elements added to each declared array length
C     ndim   - declared length of a, b and c
C     ntimes - number of timed repetitions of each kernel
      INTEGER n,offset,ndim,ntimes
      PARAMETER (n=32000000,offset=0,ndim=n+offset,ntimes=10)
C     ..
C     .. Local Scalars ..
C     dbytes holds n*nbpw in DOUBLE PRECISION so that byte counts
C     cannot overflow a default INTEGER when n is made large.
      DOUBLE PRECISION dbytes,dummy,scalar,t
      INTEGER j,k,nbpw,quantum
C     ..
C     .. Local Arrays ..
C     times(j,k) is the time of kernel j in repetition k; bytes(j)
C     is the number of arrays kernel j reads plus writes.
      DOUBLE PRECISION maxtime(4),mintime(4),rmstime(4),
     $                 times(4,ntimes)
      INTEGER bytes(4)
      CHARACTER label(4)*11
C     ..
C     .. External Functions ..
      DOUBLE PRECISION second
      INTEGER checktick,realsize
      EXTERNAL second,checktick,realsize
C     ..
C     .. Intrinsic Functions ..
C
      INTRINSIC dble,int,nint,sqrt
C     ..
C     .. Work arrays (the COMMON statement below is disabled) ..
      DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
CHPF$ distribute (block) :: a,b,c
C     ..
C     .. Common blocks ..
*     COMMON a,b,c
C     ..
C     .. Data statements ..
C     rmstime, mintime and maxtime need no initial values: each is
C     assigned in full by an array expression in the summary.
      DATA label/'Assignment:','Scaling :','Summing :',
     $     'SAXPYing :'/
      DATA bytes/2,2,3,3/,dummy/0.0d0/
C     ..
*     --- SETUP --- determine precision and check timing ---
      nbpw = realsize()
      dbytes = dble(n)*dble(nbpw)
      WRITE (*,FMT=9010) 'Array size = ',n
      WRITE (*,FMT=9010) 'Offset = ',offset
*     Truncated to whole MB, as the original integer division did.
      WRITE (*,FMT=9020) 'The total memory requirement is ',
     $  int(3.0d0*dbytes/(1024.0d0*1024.0d0)),' MB'
      WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
      WRITE (*,FMT=9030) 'The *best* time for each test is used'
      a = 1.0d0
      b = 2.0d0
      c = 0.0d0
*     Time one pass over a single array as a rough estimate of how
*     long each kernel will take.
      t = second(dummy)
      a = 2.0d0*a
      t = second(dummy) - t
      PRINT *,'----------------------------------------------------'
      quantum = checktick()
      WRITE (*,FMT=9000)
     $  'Your clock granularity/precision appears to be ',quantum,
     $  ' microseconds'
      PRINT *,'The tests below will each take a time on the order '
      PRINT *,'of ',nint(t*1d6),' microseconds'
      PRINT *,' (= ',nint((t*1d6)/quantum),' clock ticks)'
      PRINT *,'Increase the size of the arrays if this shows that'
      PRINT *,'you are not getting at least 20 clock ticks per test.'
      PRINT *,'----------------------------------------------------'
      PRINT *,'WARNING: The above is only a rough guideline.'
      PRINT *,'For best results, please be sure you know the'
      PRINT *,'precision of your system timer.'
      PRINT *,'----------------------------------------------------'
*     --- MAIN LOOP --- repeat test cases NTIMES times ---
      scalar = 1.5d0*a(1)
      DO k = 1,ntimes
*         Kernel 1: copy
          t = second(dummy)
          c = a
          t = second(dummy) - t
          times(1,k) = t
*         Kernel 2: scale
          t = second(dummy)
          b = scalar*c
          t = second(dummy) - t
          times(2,k) = t
*         Kernel 3: sum
          t = second(dummy)
          c = a + b
          t = second(dummy) - t
          times(3,k) = t
*         Kernel 4: triad
          t = second(dummy)
          a = b + scalar*c
          t = second(dummy) - t
          times(4,k) = t
      END DO
*     --- SUMMARY ---
*     Reductions run along dimension 2 (the repetitions).  The sum
*     of squares covers all ntimes samples, so the mean divides by
*     ntimes; dividing by ntimes-1 would overstate the RMS and
*     divide by zero when ntimes = 1.
      rmstime = sqrt(sum(times**2,2)/dble(ntimes))
      mintime = minval(times,2)
      maxtime = maxval(times,2)
      WRITE (*,FMT=9040)
      DO j = 1,4
          WRITE (*,FMT=9050) label(j),
     $      dbytes*bytes(j)/mintime(j)/1.0D6,
     $      rmstime(j),mintime(j),maxtime(j)
      END DO
      PRINT *, ""
      PRINT *,'Average bandwidth : ',
     $  dbytes*(sum(bytes/mintime)/4.0D0)/1.0D6
      PRINT *, ""
      PRINT *,'Sum of a is : ',sum(a)
      PRINT *,'Sum of b is : ',sum(b)
      PRINT *,'Sum of c is : ',sum(c)
 9000 FORMAT (1x,a,i6,a)
 9010 FORMAT (1x,a,i10)
*     i6 rather than i4 so totals of 10000 MB and above still print.
 9020 FORMAT (1x,a,i6,a)
 9030 FORMAT (1x,a,i3,a,a)
 9040 FORMAT ('Function',5x,'Rate (MB/s) RMS time Min time Max time')
 9050 FORMAT (a,4 (f10.4,2x))
      END
*-------------------------------------
* INTEGER FUNCTION realsize()
*
* Guess how many bytes of storage a DOUBLE PRECISION
* number occupies.
*
* The guess is made from the decimal precision reported by the
* PRECISION intrinsic: 8 digits or fewer is taken to mean a 4-byte
* word, anything more an 8-byte word.  The precision found and the
* size assumed are also written to the default output unit.
*
* Returns: 4 or 8.
*
      INTEGER FUNCTION realsize()
      IMPLICIT NONE
C     .. Local Scalars ..
      DOUBLE PRECISION test
      INTEGER ndigits
C     ..
C     .. Intrinsic Functions
      INTRINSIC precision
C     ..
*     test is never assigned; PRECISION inquires only about its type.
      ndigits = precision(test)
      WRITE (*,FMT='(a)')
     $  '----------------------------------------------'
      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision has ',
     $  ndigits,' digits of accuracy'
      IF (ndigits.LE.8) THEN
          realsize = 4
      ELSE
          realsize = 8
      END IF
      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
     $  ' bytes per DOUBLE PRECISION word'
      WRITE (*,FMT='(a)')
     $  '----------------------------------------------'
      RETURN
      END
* checktick: estimate the granularity of the second() timer.
*
* A semi-portable way to determine the clock granularity
* Adapted from a code by John Henning of Digital Equipment Corporation
*
* The timer is polled until n distinct consecutive readings have
* been recorded.  The smallest gap between neighbouring readings,
* scaled by 1d6 and rounded to the nearest integer, is returned
* (the caller reports it as microseconds).  If that gap rounds to
* zero a message is printed and 1 is returned instead.
*
      INTEGER FUNCTION checktick()
*     IMPLICIT NONE
C     .. Parameters ..
      INTEGER n
      PARAMETER (n=20)
C     ..
C     .. Local Scalars ..
      DOUBLE PRECISION dummy,tlast,tnow
      INTEGER gap,k,usmin
C     ..
C     .. Local Arrays ..
      DOUBLE PRECISION timesfound(n)
C     ..
C     .. External Functions ..
      DOUBLE PRECISION second
      EXTERNAL second
C     ..
C     .. Intrinsic Functions ..
      INTRINSIC nint
C     ..
      dummy = 0.0d0
*     Collect n readings, spinning on the timer until each new
*     reading differs from the one before it.
      tlast = second(dummy)
      DO k = 1,n
          DO
              tnow = second(dummy)
              IF (tnow.NE.tlast) EXIT
          END DO
          tlast = tnow
          timesfound(k) = tnow
      END DO
*     Smallest gap between neighbouring readings; negative gaps
*     count as zero and the result is capped at 1000000.
      usmin = 1000000
      DO k = 2,n
          gap = nint((timesfound(k)-timesfound(k-1))*1d6)
          IF (gap.LT.0) gap = 0
          IF (gap.LT.usmin) usmin = gap
      END DO
      IF (usmin.LE.0) THEN
          PRINT *,'Your clock granularity appears to be less ',
     $      'than one microsecond'
          checktick = 1
      ELSE
          checktick = usmin
      END IF
      RETURN
      END
* This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:06 CDT