Date: Tue, 8 Oct 91 08:51:11 EDT
From: "John D. McCalpin" <mccalpin@perelandra.cms.udel.edu>
>Here is the CM-2 data for running the stream code.
Would you mind sharing the code that you used for this?
I am still learning about the CM-2 and would like to
see how you translated this....
Thanks!
--
John D. McCalpin mccalpin@perelandra.cms.udel.edu
Assistant Professor mccalpin@brahms.udel.edu
College of Marine Studies, U. Del. DELOCN::MCCALPIN (SPAN)
Here is the code. I made the following changes to your original source:
1. Replaced calls to the function second() with calls to the CM timers.
2. Rewrote the realsize() routine to just return constants; there is no
need for this function to do any real work. The CM is an IEEE machine,
so the double precision values are 8 bytes long.
3. Changed the value of n.
4. Wrapped another loop around all expressions; otherwise the code executes
too fast to get accurate timings.
Then I pushed this code through a vectorizer and then into the CM Fortran
compiler.
The first code is the non-vectorized source; the second code is the output of
the vectorizer.
xxxxxxxxxxxxxxxxxx code 1 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
* Program: Stream
* Programmer: John D. McCalpin
* Revision: 2.0, September 30,1991
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran. These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
* INSTRUCTIONS:
* 1) Stream requires a cpu timing function called second().
* A sample is shown below. This is unfortunately rather
* system dependent. It helps to know the granularity of the
* timing. The code below assumes that the granularity is
* 1/100 seconds.
* 2) Stream requires a good bit of memory to run.
* Adjust the Parameter 'N' in the second line of the main
* program to give a 'timing calibration' of at least 20 clicks.
* This will provide rate estimates that should be good to
* about 5% precision.
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me!
* 4) Mail the results to mccalpin@perelandra.cms.udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
*
* Thanks!
*
* Stream benchmark driver, CM-2 port, scalar-loop form.
*
* Times four kernels (copy, scale, sum, SAXPY) over arrays of N
* DOUBLE PRECISION words.  Each kernel is repeated NREPS times
* inside one timed region, and the whole set is repeated NTIMES
* times.  For every kernel the program prints a rate in MB/s
* (from the minimum time of one pass) and the RMS, minimum and
* maximum time of one pass.
*
* Differences from the listing as posted:
*  - P is declared INTEGER.  It was implicitly REAL, which made
*    N = 40000*P a REAL expression.
*  - Timer 0 is started before the calibration loop; it was only
*    cleared and stopped.
*  - The repetition count 100 is the named constant NREPS, used
*    for the loops and for scaling the measured times.
*  - LABEL is 12 characters long, so the 12-character first label
*    keeps its trailing colon.
*  - The byte count is formed in DOUBLE PRECISION, so a larger N
*    cannot overflow a default INTEGER.
*  - The unused scalar T0 is removed and I is declared.
*
* NOTE(review): CM_timer_read_cm_busy has no type declaration and
* is therefore implicitly REAL here -- confirm the result type
* against the CM Fortran timer documentation.
      PROGRAM stream
C     .. Parameters ..
      INTEGER p,n,ntimes,nreps
      PARAMETER (p=256,n=40000*p,ntimes=10,nreps=100)
C     ..
C     .. Local Scalars ..
      DOUBLE PRECISION t,tpass,rate
      INTEGER i,j,k,nbpw
C     ..
C     .. Local Arrays ..
      DOUBLE PRECISION a(n),b(n),c(n),maxtime(4),mintime(4),
     $                 rmstime(4),times(4,ntimes)
      INTEGER bytes(4)
      CHARACTER label(4)*12
C     ..
C     .. External Functions ..
      INTEGER realsize
      EXTERNAL CM_timer_read_cm_busy,realsize
C     ..
C     .. Intrinsic Functions ..
      INTRINSIC dble,max,min,sqrt
C     ..
C     .. Data statements ..
      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
      DATA label/' Assignment:',' Scaling :',' Summing :',
     $     ' SAXPYing :'/
C     Words moved per element by each kernel (reads + writes).
      DATA bytes/2,2,3,3/
C     ..
*     --- SETUP --- determine precision and check timing ---
      nbpw = realsize()
      call CM_timer_clear(0)
      call CM_timer_start(0)
      DO 10 j = 1,n
          a(j) = 1.0D0
          b(j) = 2.0D0
          c(j) = 0.0D0
   10 CONTINUE
      call CM_timer_stop(0)
      t = CM_timer_read_cm_busy(0)
      PRINT *,'Timing calibration ; time = ',t*100,' hundredths',
     $  ' of a second'
      PRINT *,'Increase the size of the arrays if this is <30 ',
     $  ' and your clock precision is =<1/100 second'
      PRINT *,'---------------------------------------------------'
*     --- MAIN LOOP --- repeat test cases NTIMES times ---
      DO 60 k = 1,ntimes
*         Kernel 1: copy
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 25 i = 1,nreps
              DO 20 j = 1,n
                  c(j) = a(j)
   20         CONTINUE
   25     CONTINUE
          call CM_timer_stop(0)
          times(1,k) = CM_timer_read_cm_busy(0)
*         Kernel 2: scale
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 35 i = 1,nreps
              DO 30 j = 1,n
                  c(j) = 3.0D0*a(j)
   30         CONTINUE
   35     CONTINUE
          call CM_timer_stop(0)
          times(2,k) = CM_timer_read_cm_busy(0)
*         Kernel 3: sum
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 45 i = 1,nreps
              DO 40 j = 1,n
                  c(j) = a(j) + b(j)
   40         CONTINUE
   45     CONTINUE
          call CM_timer_stop(0)
          times(3,k) = CM_timer_read_cm_busy(0)
*         Kernel 4: SAXPY
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 55 i = 1,nreps
              DO 50 j = 1,n
                  c(j) = a(j) + 3.0D0*b(j)
   50         CONTINUE
   55     CONTINUE
          call CM_timer_stop(0)
          times(4,k) = CM_timer_read_cm_busy(0)
   60 CONTINUE
*     --- SUMMARY ---
*     Each stored time covers NREPS passes; reduce it to one pass
*     before accumulating the statistics.
C*$*NOVECTORIZE
      DO 80 k = 1,ntimes
          DO 70 j = 1,4
              tpass = times(j,k)/dble(nreps)
              rmstime(j) = rmstime(j) + tpass**2
              mintime(j) = min(mintime(j),tpass)
              maxtime(j) = max(maxtime(j),tpass)
   70     CONTINUE
   80 CONTINUE
      WRITE (*,FMT=9000)
      DO 90 j = 1,4
          rmstime(j) = sqrt(rmstime(j)/dble(ntimes))
C         Bytes moved in one pass divided by the best time, in MB/s.
          rate = dble(n)*dble(bytes(j)*nbpw)/mintime(j)/1.0D6
          WRITE (*,FMT=9010) label(j),rate,rmstime(j),mintime(j),
     $      maxtime(j)
   90 CONTINUE
 9000 FORMAT (' Function',5x,'Rate (MB/s) RMS time Min time Max time')
 9010 FORMAT (a,4 (f10.4,2x))
      END
*-------------------------------------
* INTEGER FUNCTION realsize()
*
*
* Report the assumed size, in bytes, of a DOUBLE PRECISION word.
* Nothing is measured: the digit count is the constant 16, so the
* function always returns 8.  It writes a rule line, the digit
* message, the size message and a closing rule line to unit *.
      INTEGER FUNCTION realsize()
C     .. Parameters ..
      INTEGER digits
      PARAMETER (digits=16)
      CHARACTER*(*) rule
      PARAMETER (rule='--------------------------------------')
C     ..
C     Default to 8 bytes; fall back to 4 only for 8 digits or fewer.
      realsize = 8
      IF (digits.LE.8) realsize = 4
      WRITE (*,FMT='(a)') rule
      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
     $  digits,' digits of accuracy'
      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
     $  ' bytes per DOUBLEPRECISION word'
      WRITE (*,FMT='(a)') rule
      RETURN
      END
xxxxxxxxxxxxxxxxxx code 2 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
C KAP/CAF 6.23 ( 7-Dec-88) o3,r3,d3 8 Oct 1991 10:25:08
* Program: Stream
* Programmer: John D. McCalpin
* Revision: 2.0, September 30,1991
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in Fortran. These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
* INSTRUCTIONS:
* 1) Stream requires a cpu timing function called second().
* A sample is shown below. This is unfortunately rather
* system dependent. It helps to know the granularity of the
* timing. The code below assumes that the granularity is
* 1/100 seconds.
* 2) Stream requires a good bit of memory to run.
* Adjust the Parameter 'N' in the second line of the main
* program to give a 'timing calibration' of at least 20 clicks.
* This will provide rate estimates that should be good to
* about 5% precision.
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me!
* 4) Mail the results to mccalpin@perelandra.cms.udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
*
* Thanks!
*
* Stream benchmark driver, CM-2 port, array-syntax form.
*
* Times four whole-array kernels (copy, scale, sum, SAXPY) over
* arrays of N DOUBLE PRECISION words.  Each kernel is repeated
* NREPS times inside one timed region, and the whole set is
* repeated NTIMES times.  For every kernel the program prints a
* rate in MB/s (from the minimum time of one pass) and the RMS,
* minimum and maximum time of one pass.
*
* Differences from the listing as posted:
*  - The measured times are divided by NREPS before the
*    statistics are formed.  The posted summary used the raw time
*    of all 100 passes while the rate formula counts the bytes of
*    a single pass, so the MB/s figure was 100 times too low.
*  - P is declared INTEGER.  It was implicitly REAL, which made
*    N = 40000*P a REAL expression.
*  - Timer 0 is started before the calibration assignments; it
*    was only cleared and stopped.
*  - The repetition count 100 is the named constant NREPS.
*  - LABEL is 12 characters long, so the 12-character first label
*    keeps its trailing colon.
*  - The byte count is formed in DOUBLE PRECISION, so a larger N
*    cannot overflow a default INTEGER.
*  - The unused scalar T0 is removed and I is declared.
*
* NOTE(review): CM_timer_read_cm_busy has no type declaration and
* is therefore implicitly REAL here -- confirm the result type
* against the CM Fortran timer documentation.
      PROGRAM stream
C     .. Parameters ..
      INTEGER p,n,ntimes,nreps
      PARAMETER (p=256,n=40000*p,ntimes=10,nreps=100)
C     ..
C     .. Local Scalars ..
      DOUBLE PRECISION t,tpass,rate
      INTEGER i,j,k,nbpw
C     ..
C     .. Local Arrays ..
      DOUBLE PRECISION a(n),b(n),c(n),maxtime(4),mintime(4),
     $                 rmstime(4),times(4,ntimes)
      INTEGER bytes(4)
      CHARACTER label(4)*12
C     ..
C     .. External Functions ..
      INTEGER realsize
      EXTERNAL CM_timer_read_cm_busy,realsize
C     ..
C     .. Intrinsic Functions ..
      INTRINSIC dble,max,min,sqrt
C     ..
C     .. Data statements ..
      DATA rmstime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
      DATA label/' Assignment:',' Scaling :',' Summing :',
     $     ' SAXPYing :'/
C     Words moved per element by each kernel (reads + writes).
      DATA bytes/2,2,3,3/
C     ..
*     --- SETUP --- determine precision and check timing ---
      nbpw = realsize()
      call CM_timer_clear(0)
      call CM_timer_start(0)
      a = 1.0D0
      b = 2.0D0
      c = 0.0D0
      call CM_timer_stop(0)
      t = CM_timer_read_cm_busy(0)
      PRINT *,'Timing calibration ; time = ',t*100,' hundredths',
     $  ' of a second'
      PRINT *,'Increase the size of the arrays if this is <30 ',
     $  ' and your clock precision is =<1/100 second'
      PRINT *,'---------------------------------------------------'
*     --- MAIN LOOP --- repeat test cases NTIMES times ---
      DO 60 k = 1,ntimes
*         Kernel 1: copy
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 20 i = 1,nreps
              c = a
   20     CONTINUE
          call CM_timer_stop(0)
          times(1,k) = CM_timer_read_cm_busy(0)
*         Kernel 2: scale
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 30 i = 1,nreps
              c = 3.0D0*a
   30     CONTINUE
          call CM_timer_stop(0)
          times(2,k) = CM_timer_read_cm_busy(0)
*         Kernel 3: sum
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 40 i = 1,nreps
              c = a + b
   40     CONTINUE
          call CM_timer_stop(0)
          times(3,k) = CM_timer_read_cm_busy(0)
*         Kernel 4: SAXPY
          call CM_timer_clear(0)
          call CM_timer_start(0)
          DO 50 i = 1,nreps
              c = a + 3.0D0*b
   50     CONTINUE
          call CM_timer_stop(0)
          times(4,k) = CM_timer_read_cm_busy(0)
   60 CONTINUE
*     --- SUMMARY ---
*     Each stored time covers NREPS passes; reduce it to one pass
*     before accumulating the statistics.
C*$*NOVECTORIZE
      DO 80 k = 1,ntimes
          DO 70 j = 1,4
              tpass = times(j,k)/dble(nreps)
              rmstime(j) = rmstime(j) + tpass**2
              mintime(j) = min(mintime(j),tpass)
              maxtime(j) = max(maxtime(j),tpass)
   70     CONTINUE
   80 CONTINUE
      WRITE (*,FMT=9000)
      DO 90 j = 1,4
          rmstime(j) = sqrt(rmstime(j)/dble(ntimes))
C         Bytes moved in one pass divided by the best time, in MB/s.
          rate = dble(n)*dble(bytes(j)*nbpw)/mintime(j)/1.0D6
          WRITE (*,FMT=9010) label(j),rate,rmstime(j),mintime(j),
     $      maxtime(j)
   90 CONTINUE
 9000 FORMAT (' Function',5x,'Rate (MB/s) RMS time Min time Max time')
 9010 FORMAT (a,4 (f10.4,2x))
      END
C KAP/CAF 6.23 ( 7-Dec-88) o3,r3,d3 8 Oct 1991 10:25:08
*-------------------------------------
* INTEGER FUNCTION realsize()
*
*
* Report the assumed size, in bytes, of a DOUBLE PRECISION word.
* Nothing is measured: the digit count is the constant 16, so the
* function always returns 8.  It writes a rule line, the digit
* message, the size message and a closing rule line to unit *.
      INTEGER FUNCTION realsize()
C     .. Parameters ..
      INTEGER digits
      PARAMETER (digits=16)
      CHARACTER*(*) rule
      PARAMETER (rule='--------------------------------------')
C     ..
C     Default to 8 bytes; fall back to 4 only for 8 digits or fewer.
      realsize = 8
      IF (digits.LE.8) realsize = 4
      WRITE (*,FMT='(a)') rule
      WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
     $  digits,' digits of accuracy'
      WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
     $  ' bytes per DOUBLEPRECISION word'
      WRITE (*,FMT='(a)') rule
      RETURN
      END
This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:01 CDT