John-
Shown below are "stream" results for a 600 MHz CRAY T3E-1200
with 512 PEs.
A copy of the program used (based on the one C. Grassl used for
earlier T3E systems) is also included.
Regards,
-Joe Glenski.
Benchmarking Group
Cray Inc., 655F Lone Oak Drive, Eagan, MN 55121 USA
glenski@cray.com
------------------------------------------------------------------------
Number of iterations: 10
Size of Arrays: 501 Kwords
STREAM Memory Bandwidth for Cray T3E-1200 4/28/2000
All results are in MB/s --- 1 MB=10^6 B, *not* 2^20 B
---------------------------------------------------------
Machine ID ncpus COPY SCALE ADD TRIAD Saxpy 1-Load 1-Store
---------------------------------------------------------
Cray_T3E-1200 1 474. 480. 566. 558. 446. 397. 327.
Cray_T3E-1200 2 951. 962. 1131. 1115. 894. 794. 645.
Cray_T3E-1200 4 1902. 1918. 2262. 2230. 1788. 1587. 1284.
Cray_T3E-1200 8 3800. 3849. 4523. 4458. 3577. 3173. 2376.
Cray_T3E-1200 16 7616. 7696. 9046. 8913. 7153. 6347. 4733.
Cray_T3E-1200 32 15194. 15386. 18088. 17829. 14304. 12710. 9348.
Cray_T3E-1200 64 30459. 30764. 36170. 35646. 28607. 25385. 18473.
Cray_T3E-1200 128 60882. 61554. 72335. 71283. 57209. 50765. 36516.
Cray_T3E-1200 256 121503. 123064. 144679. 142580. 114418. 101525. 72635.
Cray_T3E-1200 512 242981. 246031. 288345. 284920. 228289. 203031. 145731.
COPY, SCALE, ADD, and TRIAD are the standard STREAM benchmark kernels.
Saxpy, 1-Load, and 1-Store are additional Cray-only tests.
-----------------------------------------------------------------------
! STREAM memory-bandwidth benchmark driver (SHMEM version).
!
! Based on a version from: C. Grassl, Apr 18 1997
!
! Extended by Grassl to report results for three additional
! tests, "Saxpy", "1 Load", and "1 Store".
!
! Updated 4/28/2000 J. Glenski to avoid compiler warnings.
!
! Every PE calls BANDWIDTH NTIMES times.  After each pass PE 0
! prints the bandwidth of the seven kernels summed over all PEs,
! and at the end the best, average and worst figures.
      program Stream
      implicit none
      integer N, NTIMES, ofst, nbpw
      parameter (N = 501*1024, NTIMES = 10)
! Extra words at the end of each array so that the starting
! element can be slid forward without running off the end.
      parameter (ofst = 8*1024)
! Bytes per array word (the arrays are real*8).
      parameter (nbpw = 8)
! Boundary, in address units, that each array start is rounded up
! to.  NOTE(review): 8092 is probably a typo for 8192 (8*1024);
! it is kept unchanged because changing it shifts the array
! starts and may perturb the published figures - confirm.
      integer*8 ialign
      parameter (ialign = 8092)
      real*8 a(N+ofst),b(N+ofst),c(N+ofst)
      common /acom/a,b,c
! Timing data is real*8 to match the TIMES dummy of BANDWIDTH.
      real*8 times(7)
      real*8 avetime(7),mintime(7),maxtime(7)
! traffic(j) = bytes moved by kernel j per pass, over all PEs.
      real*8 traffic(7)
      character*10 label(7)
      integer bytes(7)
      integer me,numpes,j,k
! Addresses are held in 64-bit integers so that they are neither
! truncated nor rounded through a floating-point type.
      integer*8 istart_a,istart_b,istart_c
      integer*8 round_up,ix,imult
      integer shmem_my_pe,shmem_n_pes
      data avetime/7*0.0d0/
      data mintime/7*1.0d+36/
      data maxtime/7*0.0d0/
      data label/' Copy',
     &           ' Scale',
     &           ' Sum',
     &           ' Triad',
     &           ' Saxpy',
     &           ' 1 Load',
     &           '1 Store'/
! Array words read plus written per loop iteration of each kernel.
      data bytes/2,2,3,3,3,1,1/
! Smallest multiple of imult that is >= ix (integer arithmetic).
      round_up(ix,imult) = ((ix + (imult-1))/imult)*imult

      me     = shmem_my_pe()
      numpes = shmem_n_pes()

! Turn each array address into the index of the first element at
! or beyond the next ialign boundary.  The division by 8 assumes
! loc() returns a byte address - TODO confirm for the target.
      istart_a = loc(a)
      istart_b = loc(b)
      istart_c = loc(c)
      istart_a = (round_up(istart_a,ialign) - istart_a)/8 + 1
      istart_b = (round_up(istart_b,ialign) - istart_b)/8 + 1
      istart_c = (round_up(istart_c,ialign) - istart_c)/8 + 1

! Give the operands defined values before any kernel reads them
! (same initial values as the reference STREAM code).
      a = 1.0d0
      b = 2.0d0
      c = 0.0d0

! Computed in real*8: the integer product N*bytes*nbpw*numpes
! exceeds 2**31 on large PE counts.
      do j = 1,7
         traffic(j) = ((dble(N)*bytes(j))*nbpw)*numpes
      end do

      if ( me .eq. 0 ) then
         write(6,9020) numpes,ntimes,n/1024
         write(6,9030) (label(j),j=1,7)
      end if

      do k = 1,NTIMES
         call bandwidth(n,a(istart_a),b(istart_b),c(istart_c),times)
         avetime = avetime + times
         mintime = min( mintime, times )
         maxtime = max( maxtime, times )
         if ( me .eq. 0 ) then
            write(6,9040) k,(traffic(j)/times(j)/1.0d6,j=1,7)
         end if
      end do

! The shortest time gives the "Max" bandwidth line and the longest
! time gives the "Min" line.
      if ( me .eq. 0 ) then
         avetime = avetime/NTIMES
         write(6,9050) (traffic(j)/mintime(j)/1.0d6,j=1,7),
     &                 (traffic(j)/avetime(j)/1.0d6,j=1,7),
     &                 (traffic(j)/maxtime(j)/1.0d6,j=1,7)
      end if
      call exit()

 9020 format( /' *** STREAM benchmark ***'
     &        /' Number of PEs: ',i8
     &        /' Number of iterations: ',i8
     &        /' Size of Arrays: ',i8,' Kwords')
 9030 format(// 30(' '),'Bandwidth (Mbyte/s)'
     &        /' Iterat. ',7a10,
     &        / (' -------'),7(' ---------'))
 9040 format ( i6,2x,7f10.0)
 9050 format (/' Max: ',7f10.0/
     &         ' Ave: ',7f10.0/
     &         ' Min: ',7f10.0/)
      end
! No-op routine that BANDWIDTH calls after every timed kernel.
! It never reads or writes its arguments.
! NOTE(review): presumably it exists so that the compiler must
! assume the arrays are used after each loop and therefore cannot
! drop or fuse the kernels - confirm.
!
!   a, b, c   work arrays, declared real*8 to match the caller
!   n         array length (unused)
      subroutine dummysub(a,b,c,n)
      implicit none
      integer n
      real*8 a(*),b(*),c(*)
      return
      end
! Run each of the seven bandwidth kernels once and time it.
!
!   n       (in)     number of elements processed per kernel
!   a, b    (in)     source arrays
!   c       (inout)  destination array (also a source for Saxpy)
!   times   (out)    time taken by each kernel, in the order
!                    Copy, Scale, Sum, Triad, Saxpy, 1 Load,
!                    1 Store
!
! Each timed region is the loop plus the following "call barrier".
! The time is timef() scaled by 0.001.
! NOTE(review): this assumes timef() returns elapsed milliseconds,
! giving seconds - confirm against the target library.
! DUMMYSUB is called after every kernel with all three arrays.
      subroutine bandwidth(n,a,b,c,times)
      implicit none
      integer n,j
      real*8 a(n),b(n),c(n)
      real*8 times(7)
! All floating-point scalars are real*8, the same as the arrays,
! instead of relying on the size of default REAL.
      real*8 scalar,sum,t,timer,timef
! The "1 Load" reduction result lives in a common block.
! NOTE(review): presumably so the compiler must keep the
! reduction loop - confirm.
      common /bandcom/ sum
      data scalar /3.0d0/
      timer()=0.001d0*timef()

! Kernel 1 - Copy: one load, one store.
      t = timer()
      do j=1,n
         c(j) = a(j)
      end do
      call barrier
      times(1) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 2 - Scale.
      t = timer()
      do j=1,n
         c(j) = scalar*a(j)
      end do
      call barrier
      times(2) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 3 - Sum (STREAM "ADD").
      t = timer()
      do j=1,n
         c(j) = a(j) + b(j)
      end do
      call barrier
      times(3) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 4 - Triad.
      t = timer()
      do j=1,n
         c(j) = a(j)+scalar*b(j)
      end do
      call barrier
      times(4) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 5 - Saxpy: like Triad, but c is both read and written.
      t = timer()
      do j=1,n
         c(j) = c(j)+scalar*b(j)
      end do
      call barrier
      times(5) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 6 - 1 Load: reduction over a, no stores to the arrays.
      sum = 0.0d0
      t = timer()
      do j=1,n
         sum = sum + a(j)
      end do
      call barrier
      times(6) = timer()-t
      call dummysub(a,b,c,n)

! Kernel 7 - 1 Store: fill c, no loads from the arrays.
      t = timer()
      do j=1,n
         c(j) = 0.0d0
      end do
      call barrier
      times(7) = timer()-t
      call dummysub(a,b,c,n)
      return
      end
------------------------------------------------------
This archive was generated by hypermail 2b29 : Thu May 04 2000 - 03:42:16 CDT