for( int i=0; i<N1; i++ ) // repeat inner loop, to
make runtime longer
for(int j=min; j<max; j++) // striped per CPU, still much
larger than all caches
a[j] = b[j]+scalar*c[j]; // original loop body
/*-----------------------------------------------------------------------*/
/* Program: Stream */
/* Revision: $Id: stream.c,v 5.8 2007/02/19 23:57:39 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin */
/* Programmers: John D. McCalpin */
/* Joe R. Zagar */
/* */
/* This program measures memory transfer rates in MB/s for simple */
/* computational kernels coded in C. */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin */
/*-----------------------------------------------------------------------*/
/* License: */
/* 1. You are free to use this program and/or to redistribute */
/* this program. */
/* 2. You are free to modify this program for your own use, */
/* including commercial use, subject to the publication */
/* restrictions in item 3. */
/* 3. You are free to publish results obtained from running this */
/* program, or from works that you derive from this program, */
/* with the following limitations: */
/* 3a. In order to be referred to as "STREAM benchmark results", */
/* published results must be in conformance to the STREAM */
/* Run Rules, (briefly reviewed below) published at */
/* http://www.cs.virginia.edu/stream/ref.html */
/* and incorporated herein by reference. */
/* As the copyright holder, John McCalpin retains the */
/* right to determine conformity with the Run Rules. */
/* 3b. Results based on modified source code or on runs not in */
/* accordance with the STREAM Run Rules must be clearly */
/* labelled whenever they are published. Examples of */
/* proper labelling include: */
/* "tuned STREAM benchmark results" */
/* "based on a variant of the STREAM benchmark code" */
/* Other comparable, clear and reasonable labelling is */
/* acceptable. */
/* 3c. Submission of results to the STREAM benchmark web site */
/* is encouraged, but not required. */
/* 4. Use of this program or creation of derived works based on this */
/* program constitutes acceptance of these licensing restrictions. */
/* 5. Absolutely no warranty is expressed or implied. */
/*-----------------------------------------------------------------------*/
import java.util.concurrent.*;
/* INSTRUCTIONS:
*
* 1) Stream requires a good bit of memory to run. Adjust the
* value of 'N' (below) to give a 'timing calibration' of
* at least 20 clock-ticks. This will provide rate estimates
* that should be good to about 5% precision.
*/
public abstract class stream {
static final long N = 10000000000L;
// Because Java is limited to 31bit array indices, I loop over the same
// array data enough times to reach 'N' total array elements. The actual
// array is sized below, and must exceed any cache by a substantial amount.
static final int N0 = 10000000;
static final int N1 = (int)(N/N0);
static final int NTIMES = 20;
static final int OFFSET = 125000;
/*
* 3) Compile the code with full optimization. Many compilers
* generate unreasonably bad code before the optimizer tightens
* things up. If the results are unreasonably good, on the
* other hand, the optimizer might be too smart for me!
*
* Try compiling with:
* cc -O stream_omp.c -o stream_omp
*
* This is known to work on Cray, SGI, IBM, and Sun machines.
*
*
* 4) Mail the results to mccalpin@cs.virginia.edu
* Be sure to include:
* a) computer hardware model number and software revision
* b) the compiler flags
* c) all of the output from the test case.
* Thanks!
*
*/
static final String HLINE = "-------------------------------------------------------------\n";
static double a[] = new double[N0+OFFSET];
static double b[] = new double[N0+OFFSET];
static double c[] = new double[N0+OFFSET];
static double avgtime[] = {0.0, 0.0, 0.0, 0.0};
static double maxtime[] = {0.0, 0.0, 0.0, 0.0};
static double mintime[] = {Double.MAX_VALUE,Double.MAX_VALUE,Double.MAX_VALUE,Double.MAX_VALUE};
static final String label[] = {"Copy: ", "Scale: ", "Add: ", "Triad: "};
static final double BytesPerWord = 8.0;
static final double bytes[] = {
2 * BytesPerWord * N,
2 * BytesPerWord * N,
3 * BytesPerWord * N,
3 * BytesPerWord * N
};
// Worker threads
static Thread workers[];
static volatile stream _work;
static CyclicBarrier _barrier_start;
static CyclicBarrier _barrier_stop;
abstract boolean work( final int min, final int max );
void do_par( ) throws InterruptedException, BrokenBarrierException {
_work = this; // Inform workers of what to do
_barrier_start.await(); // Start workers
_barrier_stop .await(); // Main continues when work is done
}
static public void main( String args[] ) throws InterruptedException, BrokenBarrierException {
/* --- SETUP --- determine precision and check timing --- */
System.out.printf(HLINE);
System.out.printf("STREAM version $Revision: 5.8 $\n");
System.out.printf(HLINE);
System.out.printf("This system uses %d bytes per DOUBLE PRECISION word.\n", (int)BytesPerWord);
System.out.printf(HLINE);
System.out.printf("Array size = %d, Offset = %d\n" , N, OFFSET);
System.out.printf("Total memory required = %.1f MB.\n",
(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
System.out.printf("Each test is run %d times, but only\n", NTIMES);
System.out.printf("the *best* time for each is used.\n");
System.out.printf(HLINE);
final int num_threads = args.length > 0 ? Integer.parseInt(args[0]) : Runtime.getRuntime().availableProcessors();
System.out.printf ("Number of Threads requested = %d\n",num_threads);
// Classic cyclic-barriers. Number of threads is +1, so the
// workers will await for the main thread before proceeding.
_barrier_start = new CyclicBarrier(num_threads+1);
_barrier_stop = new CyclicBarrier(num_threads+1);
// Array of worker threads
workers = new Thread[num_threads];
final int NSLICE = N0/num_threads;
for( int i=0; i<num_threads; i++ ) {
final int pid = i;
final int min = pid*NSLICE;
final int max = (i == num_threads-1) ? N0 :(pid+1)*NSLICE;
workers[i] = new Thread() { public void run() {
// Workers do_forever: await barrier, then do "_work"
try {
_barrier_start.await();
while( _work.work(min,max) ) {
_barrier_stop .await();
_barrier_start.await();
}
}
catch( InterruptedException e ) { return; }
catch( BrokenBarrierException e ) { return; }
} };
workers[i].start();
}
System.out.printf(HLINE);
System.out.printf ("Printing one line per active thread....\n");
/* Get initial value for system clock. */
new stream() { public boolean work( final int min, final int max ) {
//System.out.println(" min="+min+" max="+max);
for( int j=min; j<max; j++ ) {
a[j] = 1.0;
b[j] = 2.0;
c[j] = 0.0;
}
return true; // do not kill worker
} }.do_par();
System.out.printf(HLINE);
int quantum = checktick();
if( quantum >= 1 )
System.out.printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum);
else {
System.out.printf("Your clock granularity appears to be less than one microsecond.\n");
quantum = 1;
}
double t = mysecond();
new stream() { public boolean work( final int min, final int max ) {
for( int j=min; j<max; j++ )
a[j] = 2.0E0 * a[j];
return true; // do not kill worker
} }.do_par();
t = 1.0E6 * (mysecond() - t);
System.out.printf("Each test below will take on the order of %d microseconds.\n", (int) t );
System.out.printf(" (= %d clock ticks)\n", (int) (t/quantum) );
System.out.printf("Increase the size of the arrays if this shows that\n");
System.out.printf("you are not getting at least 20 clock ticks per test.\n");
System.out.printf(HLINE);
System.out.printf("WARNING -- The above is only a rough guideline.\n");
System.out.printf("For best results, please be sure you know the\n");
System.out.printf("precision of your system timer.\n");
System.out.printf(HLINE);
/* --- MAIN LOOP --- repeat test cases NTIMES times --- */
double times[][] = new double[4][NTIMES];
final double scalar = 3.0;
for( int k=0; k<NTIMES; k++ ) {
times[0][k] = mysecond();
new stream() { public boolean work( final int min, final int max ) {
for( int i=0; i<N1; i++ )
for(int j=min; j<max; j++)
c[j] = a[j];
return true; // do not kill worker
} }.do_par();
times[0][k] = mysecond() - times[0][k];
times[1][k] = mysecond();
new stream() { public boolean work( final int min, final int max ) {
for( int i=0; i<N1; i++ )
for(int j=min; j<max; j++)
b[j] = scalar*c[j];
return true; // do not kill worker
} }.do_par();
times[1][k] = mysecond() - times[1][k];
times[2][k] = mysecond();
new stream() { public boolean work( final int min, final int max ) {
for( int i=0; i<N1; i++ )
for(int j=min; j<max; j++)
c[j] = a[j]+b[j];
return true; // do not kill worker
} }.do_par();
times[2][k] = mysecond() - times[2][k];
times[3][k] = mysecond();
new stream() { public boolean work( final int min, final int max ) {
for( int i=0; i<N1; i++ )
for(int j=min; j<max; j++)
a[j] = b[j]+scalar*c[j];
return true; // do not kill worker
} }.do_par();
times[3][k] = mysecond() - times[3][k];
}
// Shutdown workers
_work = new stream() { public boolean work( final int min, final int max ) { return false; } };
_barrier_start.await();
/* --- SUMMARY --- */
for( int k=3; k<NTIMES; k++) { /* note -- skip first three iterations as warmup */
for(int j=0; j<4; j++) {
avgtime[j] = avgtime[j] + times[j][k];
mintime[j] = Math.min(mintime[j], times[j][k]);
maxtime[j] = Math.max(maxtime[j], times[j][k]);
}
}
System.out.printf("Function Rate (MB/s) Avg time Min time Max time\n");
for(int j=0; j<4; j++) {
avgtime[j] = avgtime[j]/(double)(NTIMES-1);
System.out.printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
1.0E-06 * bytes[j]/mintime[j],
avgtime[j],
mintime[j],
maxtime[j]);
}
System.out.printf(HLINE);
/* --- Check Results --- */
checkSTREAMresults();
System.out.printf(HLINE);
}
static int checktick() {
final int M = 20;
double timesfound[] = new double[M];
/* Collect a sequence of M unique time values from the system. */
for( int i = 0; i < timesfound.length; i++) {
final double t1 = mysecond();
double t2;
while( ((t2=mysecond()) - t1) < 1.0E-6 )
;
timesfound[i] = t2;
}
/*
* Determine the minimum difference between these M values.
* This result will be our estimate (in microseconds) for the
* clock granularity.
*/
double minDelta = 1000000;
for( int i = 1; i < timesfound.length; i++ ) {
final double Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
minDelta = Math.min(minDelta, Math.max(Delta,0));
}
return (int)minDelta;
}
/* A gettimeofday routine to give access to the wall
clock timer on most UNIX-like systems. */
static double mysecond() {
return ((double)System.nanoTime())/ 1.0e9;
}
static void checkSTREAMresults () {
/* reproduce initialization */
double aj = 1.0;
double bj = 2.0;
double cj = 0.0;
/* a[] is modified during timing check */
aj = 2.0E0 * aj;
/* now execute timing loop */
final double scalar = 3.0;
for( int k=0; k<NTIMES; k++) {
cj = aj;
bj = scalar*cj;
cj = aj+bj;
aj = bj+scalar*cj;
}
aj = aj * (double) (N);
bj = bj * (double) (N);
cj = cj * (double) (N);
double asum = 0.0;
double bsum = 0.0;
double csum = 0.0;
for(int j=0; j<N0; j++) {
asum += a[j]*N1;
bsum += b[j]*N1;
csum += c[j]*N1;
}
//System.out.printf ("Results Comparison: \n");
//System.out.printf (" Expected : %f %f %f \n",aj,bj,cj);
//System.out.printf (" Observed : %f %f %f \n",asum,bsum,csum);
final double epsilon = 1.e-8;
if (Math.abs(aj-asum)/asum > epsilon) {
System.out.printf ("Failed Validation on array a[]\n");
System.out.printf (" Expected : %f \n",aj);
System.out.printf (" Observed : %f \n",asum);
}
else if (Math.abs(bj-bsum)/bsum > epsilon) {
System.out.printf ("Failed Validation on array b[]\n");
System.out.printf (" Expected : %f \n",bj);
System.out.printf (" Observed : %f \n",bsum);
}
else if (Math.abs(cj-csum)/csum > epsilon) {
System.out.printf ("Failed Validation on array c[]\n");
System.out.printf (" Expected : %f \n",cj);
System.out.printf (" Observed : %f \n",csum);
}
else {
System.out.printf ("Solution Validates\n");
}
}
}
Received on Wed Jan 30 10:41:39 2008
This archive was generated by hypermail 2.1.8 : Tue Apr 15 2008 - 10:01:44 CDT