John,
You were right about there being problems, so I rewrote the stream_d.c
code a bit, and corrected several glitches. But the upshot is that the
results I initially reported were essentially correct. The rate is
now slightly slower because stream_d.c was calculating MBytes as
Bytes/1e6, rather than Bytes/(1024*1024).
Here is the output:
Timing calibration 1: utime overhead = 48.995018 usec.
Timing calibration 2: array init time = 469684.004784 usec.
Increase the size of the arrays if this is < 300000
and your clock precision is =< 1/100 second.
---------------------------------------------------
Function Rate (MB/s) RMS time Min time Max time
Assignment: 84.349 0.091 0.090 0.091
Scaling : 68.201 0.112 0.112 0.112
Summing : 76.163 0.150 0.150 0.151
SAXPYing : 73.129 0.157 0.156 0.157
I've appended the precise code used to generate this output. It
was compiled with "gcc-2.6.3 -m486 -O4 -funroll-loops -ffast-math"
Regards,
Russell
/*
* Program: Stream
* Programmer: John D. McCalpin
* Revision: 2.0, September 30,1991
*
* Modified slightly to account for timing overhead and correct
* calculation of MBytes by R. L. Carter 5/4/95.
*
* This program measures memory transfer rates in MB/s for simple
* computational kernels coded in C (this version is a port of the
* original Fortran code). These numbers reveal the
* quality of code generation for simple uncacheable kernels as well
* as showing the cost of floating-point operations relative to memory
* accesses.
*
* INSTRUCTIONS:
* 1) (fortran-specific, omitted.)
* 2) Stream requires a good bit of memory to run.
* Adjust the parameter 'N' (the #define at the top of this file)
* to give a 'timing calibration' of at least 20 clicks.
* This will provide rate estimates that should be good to
* about 5% precision.
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me!
* 4) Mail the results to mccalpin@perelandra.cms.udel.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
* Thanks!
*
* this version was ported from fortran to c by mark hahn, hahn+@pitt.edu.
*/
/* Number of elements in each of the work arrays a, b and c. */
#define N 500000
/* Number of times the full set of four kernels is timed. */
#define NTIMES 20
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/*
 * Two-argument minimum/maximum helpers, defined only when the including
 * environment has not already provided macros of the same name.
 * Each argument may be evaluated more than once, so do not pass
 * expressions with side effects.
 */
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
/*
 * utime -- interval timer built on gettimeofday().
 *
 * Returns the wall-clock time, in seconds, that has elapsed since the
 * previous call.  The very first call only records the starting timestamp
 * and returns 0.
 *
 * If gettimeofday() fails, a message is written to stderr and the program
 * terminates with exit status 1.
 */
double utime(void)
{
    /* Timestamp of the previous call; 0 means "not called yet". */
    static double oldtime = 0;
    struct timeval tp;
    double newtime, deltatime;

    /*
     * The timezone argument of gettimeofday() is obsolete and its struct
     * is not visible under strict ISO C feature settings, so pass NULL.
     */
    if (gettimeofday(&tp, NULL) == -1) {
        fprintf(stderr, "Error in gettimeofday. Exiting\n");
        exit(1);
    }

    /* Combine seconds and microseconds into fractional seconds. */
    newtime = (double)tp.tv_sec + ((double)tp.tv_usec) / 1e6;

    if (!oldtime) {
        /* First call: establish the reference point only. */
        oldtime = newtime;
        return 0;
    }

    deltatime = newtime - oldtime;
    oldtime = newtime;
    return deltatime;
}
/* Work arrays used by the kernels; file-scope statics of N doubles each. */
static double a[N],b[N],c[N];
/*
 * main -- run the four kernels (copy, scale, sum, a + 3*b) NTIMES times
 * each, timing every pass with utime(), then print for each kernel the
 * transfer rate derived from the fastest pass together with the RMS,
 * minimum and maximum pass times.
 *
 * Returns 0.
 */
int main(void) {
    int j, k;
    double times[4][NTIMES];            /* times[kernel][repetition], seconds */
    static double t_overhead, init_time;
    static double rmstime[4] = {0};
    /* Seed the running minimum with the largest double (the values are double). */
    static double mintime[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};
    static double maxtime[4] = {0};
    static const char *label[4] = {"Assignment:",
                                   "Scaling :",
                                   "Summing :",
                                   "SAXPYing :"};
    /*
     * Bytes counted per pass: the first two kernels read one array and
     * write one; the last two read two arrays and write one.
     */
    static double bytes[4] = { 2 * sizeof(double) * (double)N,
                               2 * sizeof(double) * (double)N,
                               3 * sizeof(double) * (double)N,
                               3 * sizeof(double) * (double)N};

    /* --- SETUP --- determine precision and check timing --- */
    /* First call primes the timer; the second measures back-to-back cost. */
    utime();
    t_overhead = utime();
    printf("Timing calibration 1: utime overhead = %f usec.\n",t_overhead*1e6);

    utime();
    for (j = 0; j < N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }
    init_time = utime() - t_overhead;
    printf("Timing calibration 2: array init time = %f usec.\n",init_time*1e6);
    printf("Increase the size of the arrays if this is < 300000\n"
           "and your clock precision is =< 1/100 second.\n");
    printf("---------------------------------------------------\n");

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
    for (k = 0; k < NTIMES; k++) {
        /* Each kernel: reset the timer, run, record elapsed minus overhead. */
        utime();
        for (j = 0; j < N; j++)
            c[j] = a[j];
        times[0][k] = utime() - t_overhead;

        utime();
        for (j = 0; j < N; j++)
            c[j] = 3.0e0*a[j];
        times[1][k] = utime() - t_overhead;

        utime();
        for (j = 0; j < N; j++)
            c[j] = a[j]+b[j];
        times[2][k] = utime() - t_overhead;

        utime();
        for (j = 0; j < N; j++)
            c[j] = a[j]+3.0e0*b[j];
        times[3][k] = utime() - t_overhead;

        /* Untimed: feed the results back into the inputs for the next pass. */
        for (j = 0; j < N; j++) {
            a[j] = c[j];
            b[j] = 1.1*a[j];
        }
        /*
         * Reads a result element so the computed values are used;
         * presumably intended to stop the optimizer from discarding
         * the kernels (see the message text).
         */
        if (a[1] < 0) {
            printf("Making a dependency: %f\n",c[N-1]);
        }
    }

    /* --- SUMMARY --- */
    for (k = 0; k < NTIMES; k++) {
        for (j = 0; j < 4; j++) {
            rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
        }
    }

    printf("Function Rate (MB/s) RMS time Min time Max time\n");
    for (j = 0; j < 4; j++) {
        rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);
        /* Rate uses the fastest pass and 1 MB = 1024*1024 bytes. */
        printf("%s%11.3f %11.3f %11.3f %11.3f\n",
               label[j],
               bytes[j]/mintime[j]/(1024*1024),
               rmstime[j],
               mintime[j],
               maxtime[j]);
    }
    return 0;
}
This archive was generated by hypermail 2b29 : Tue Apr 18 2000 - 05:23:04 CDT