John;
For your information, below are "stream" results for a 450 MHz CRAY T3E with
512 PEs.
A copy of the program is also included below.
Regards,
Charles Grassl
Number of iterations: 10
Size of Arrays: 501 Kwords
Bandwidth (Mbyte/s)
PEs Copy Scale Sum Triad Saxpy 1 Load 1 Store
------- --------- --------- --------- --------- --------- --------- ---------
1 484. 482. 571. 568. 672. 435. 306.
2 935. 935. 1088. 1118. 1333. 859. 604.
4 1877. 1869. 2147. 2230. 2668. 1719. 1185.
8 3747. 3739. 4167. 4471. 5305. 3476. 2193.
16 7497. 7477. 8042. 8828. 10655. 6863. 4218.
32 14968. 14940. 15935. 17148. 21323. 13726. 8394.
64 30066. 29694. 31051. 34168. 42399. 27431. 16803.
128 60175. 58707. 57177. 64952. 84065. 54710. 33363.
256 119483. 116861. 107532. 126061. 165733. 109631. 66738.
512 240428. 233501. 243368. 265803. 331126. 210628. 133439.
program Stream
  ! Driver for a STREAM-style memory bandwidth benchmark on a SHMEM machine.
  !
  ! Every PE obtains its PE number and the PE count from shmem_my_pe /
  ! shmem_n_pes, picks a starting element inside each of the padded arrays
  ! a, b and c, and calls bandwidth() NTIMES times.  Per-kernel minimum,
  ! accumulated and maximum times are kept; PE 0 prints the bandwidth of
  ! every iteration followed by a Max/Ave/Min summary, all in Mbyte/s.
  implicit none

  integer, parameter :: N      = 501*1024   ! elements processed per array
  integer, parameter :: NTIMES = 10         ! number of timed repetitions
  integer, parameter :: ofst   = 8*1024     ! padding elements per array
  integer, parameter :: nbpw   = 8          ! bytes per array element
  ! Byte boundary used to choose the starting element of each array.
  ! NOTE(review): the original code used 8092, which is not a multiple of
  ! the 8-byte element size; 8192 (= 8*1024) is assumed to be the intended
  ! value -- confirm.
  integer, parameter :: align_bytes = 8192
  ! Integer kind wide enough to hold an address returned by loc().
  integer, parameter :: ik = selected_int_kind(18)

  real*8 a(N+ofst), b(N+ofst), c(N+ofst)
  common /acom/ a, b, c

  ! times must be real*8 because bandwidth() declares its argument real*8.
  real*8 times(7)
  real*8 avetime(7), mintime(7), maxtime(7)
  character*10 label(7)
  integer bytes(7)
  integer shmem_my_pe, shmem_n_pes
  integer me, numpes, j, k
  integer istart_a, istart_b, istart_c

  data avetime /7*0.0d0/
  data mintime /7*1.0d+36/
  data maxtime /7*0.0d0/
  data label /' Copy',   &
              ' Scale',  &
              ' Sum',    &
              ' Triad',  &
              ' Saxpy',  &
              ' 1 Load', &
              '1 Store'/
  ! Multiplier applied to N*nbpw when converting kernel j's time into bytes.
  data bytes /2,2,3,3,3,1,1/

  me     = shmem_my_pe()
  numpes = shmem_n_pes()

  ! First element of each array that lies on an align_bytes boundary
  ! (loc() is a non-standard intrinsic; its result is treated as a byte
  ! address, as in the original arithmetic).
  istart_a = aligned_start(int(loc(a), ik))
  istart_b = aligned_start(int(loc(b), ik))
  istart_c = aligned_start(int(loc(c), ik))

  ! The kernels read a and b, so give every element a defined value before
  ! anything is timed.
  a = 1.0d0
  b = 2.0d0
  c = 0.0d0

  if (me == 0) then
    write(6,9020) numpes, NTIMES, N/1024
    write(6,9030) (label(j), j=1,7)
  end if

  do k = 1, NTIMES
    call bandwidth(N, a(istart_a), b(istart_b), c(istart_c), times)
    avetime = avetime + times
    mintime = min(mintime, times)
    maxtime = max(maxtime, times)
    if (me == 0) then
      write(6,9040) k, (mbytes_per_sec(j, times(j)), j=1,7)
    end if
  end do

  if (me == 0) then
    avetime = avetime/NTIMES
    ! The shortest time yields the "Max" bandwidth line and the longest
    ! time the "Min" line.
    write(6,9050) (mbytes_per_sec(j, mintime(j)), j=1,7), &
                  (mbytes_per_sec(j, avetime(j)), j=1,7), &
                  (mbytes_per_sec(j, maxtime(j)), j=1,7)
  end if

  call exit()

9020 format( /' *** STREAM benchmark ***' &
             /' Number of PEs: ',i8 &
             /' Number of iterations: ',i8 &
             /' Size of Arrays: ',i8,' Kwords')
9030 format(// 30(' '),'Bandwidth (Mbyte/s)' &
            /' Iterat. ',7a10, &
            / (' -------'),7(' ---------'))
9040 format ( i6,2x,7f10.0)
9050 format (/' Max: ',7f10.0/ &
              ' Ave: ',7f10.0/ &
              ' Min: ',7f10.0/)

contains

  ! Index (1-based) of the first element at or after byte address addr
  ! that falls on an align_bytes boundary.
  function aligned_start(addr)
    integer aligned_start
    integer(ik), intent(in) :: addr
    integer(ik) :: boundary

    boundary = ((addr + (align_bytes - 1))/align_bytes)*align_bytes
    aligned_start = int((boundary - addr)/nbpw) + 1
  end function aligned_start

  ! Aggregate bandwidth over all PEs, in units of 1.0e6 bytes per second,
  ! for kernel kern given its elapsed time.  The byte count is formed in
  ! double precision so that it cannot overflow a 32-bit default integer
  ! when numpes is large.
  function mbytes_per_sec(kern, seconds)
    real*8 mbytes_per_sec
    integer, intent(in) :: kern
    real*8, intent(in) :: seconds

    mbytes_per_sec = dble(N)*bytes(kern)*nbpw*numpes/seconds/1.0d6
  end function mbytes_per_sec

end program Stream
subroutine dummysub(a,b,c,n)
  ! Deliberate no-op: receives the three work arrays and their length and
  ! returns without touching them.  bandwidth() calls it after every timed
  ! kernel -- presumably so the compiler must assume the arrays may be used
  ! or changed between kernels and cannot drop or merge the loops.
  !
  ! The dummies are declared to match what bandwidth() passes (real*8
  ! arrays of length n).  No intent is given on purpose: an intent(in)
  ! would tell the compiler the arrays are left unmodified.
  implicit none
  integer n
  real*8 a(n), b(n), c(n)
end subroutine dummysub
subroutine bandwidth(n,a,b,c,times)
  ! Time seven simple memory kernels over the first n elements of a, b, c.
  !
  ! Arguments:
  !   n      number of elements each kernel processes
  !   a, b   source arrays; this routine only reads them
  !   c      destination array, written by every kernel except "1 Load"
  !   times  on return, the elapsed time of each kernel, in the order
  !          copy, scale, sum, triad, saxpy, 1 load, 1 store.  The unit is
  !          seconds provided timef() reports milliseconds (see timer()).
  !
  ! Each timed interval runs from just before the loop until "call barrier"
  ! has returned, so it includes the barrier (external routine; presumably
  ! a synchronisation across PEs).  dummysub is called after every kernel,
  ! outside the timed interval.
  !
  ! The kernels are kept as explicit do loops because the loops themselves
  ! are what is being measured.  No intent is declared on a, b, c because
  ! they are handed on to dummysub, which has an implicit interface.
  implicit none
  integer n
  real*8 a(n), b(n), c(n)
  real*8 times(7)

  integer j
  real*8 t
  real*8 timef
  external timef
  ! total lives in a common block so the "1 Load" reduction has a visible
  ! destination outside this routine.  It is kept default real, as it was
  ! implicitly typed in the original, so the layout of /bandcom/ does not
  ! change.
  real scalar, total
  common /bandcom/ total
  data scalar /3.0/

  ! Copy: c = a
  t = timer()
  do j = 1, n
    c(j) = a(j)
  end do
  call barrier
  times(1) = timer() - t
  call dummysub(a,b,c,n)

  ! Scale: c = scalar*a
  t = timer()
  do j = 1, n
    c(j) = scalar*a(j)
  end do
  call barrier
  times(2) = timer() - t
  call dummysub(a,b,c,n)

  ! Sum: c = a + b
  t = timer()
  do j = 1, n
    c(j) = a(j) + b(j)
  end do
  call barrier
  times(3) = timer() - t
  call dummysub(a,b,c,n)

  ! Triad: c = a + scalar*b
  t = timer()
  do j = 1, n
    c(j) = a(j) + scalar*b(j)
  end do
  call barrier
  times(4) = timer() - t
  call dummysub(a,b,c,n)

  ! Saxpy: c = c + scalar*b (c still holds the Triad result)
  t = timer()
  do j = 1, n
    c(j) = c(j) + scalar*b(j)
  end do
  call barrier
  times(5) = timer() - t
  call dummysub(a,b,c,n)

  ! 1 Load: reduction over a
  total = 0.
  t = timer()
  do j = 1, n
    total = total + a(j)
  end do
  call barrier
  times(6) = timer() - t
  call dummysub(a,b,c,n)

  ! 1 Store: c = 0
  t = timer()
  do j = 1, n
    c(j) = 0.
  end do
  call barrier
  times(7) = timer() - t
  call dummysub(a,b,c,n)

contains

  ! Current time, obtained by scaling the external timef() by 0.001 as the
  ! original code did (i.e. timef() is taken to return milliseconds).
  !
  ! NOTE(review): the original text defined timer() twice, which cannot
  ! compile.  The second definition was
  !     timer() = 3.333e-9*rtc()
  ! and hard-codes a clock period; the timef() form is kept because its
  ! scaling does not depend on the machine clock.  Confirm which timer was
  ! intended before comparing against published numbers.
  function timer()
    real*8 timer
    timer = 0.001d0*timef()
  end function timer

end subroutine bandwidth
This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:06 CDT