----------
X-Sun-Data-Type: text
X-Sun-Data-Description: text
X-Sun-Data-Name: text
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 89


I stole a few minutes from our OS group's 16-way UE6000 (with 16-way
interleaved memory) to get the following results using block stores:

---------------------------------------------
                 SUSTAINED BANDWIDTH (MB/s)
   cpus    COPY      SCALE     ADD      TRIAD
----------------------------------------------
    16    2551.0    2449.9    2434.6    2434.9
    15    2547.6    2441.9    2381.2    2385.4
    12    2517.9    2371.8    2119.1    2118.7
     8    2202.5    1814.7    1551.7    1537.3
     4    1206.0     995.7     842.7     825.3
     2     641.8     534.1     451.2     439.3
     1     366.8     285.2     241.4     238.2
----------------------------------------------


I included the number for 15 CPUs to give you a rough idea of the trend
at around 16 CPUs.  As before, for the runs with up to 8 CPUs I did not
bother to make sure that the CPUs are all on different boards.

I've attached the code that I used for these numbers.  I think that
they can be improved upon, especially for single-processor performance,
but that is not high on my list of priorities right now. My aim was to
get pretty close to our peak of 2600 MB/s and I think these results are
close enough for me (besides, we've more than doubled your
1200 MB/s barrier on the triad :-)

Attachments:

stream_p.c: The original stream_d.c, modified to call the new parallel
	functions instead of the loops.

second_wall.c: The original file (included here for completeness).

ploops.c: Contains code for creating parallel threads (I did not
	use the auto-parallel option) and handling the
	unaligned beginning and end of blocked loops.  The assignment
	function just calls memcpy, the others call assembly for the
	block aligned parts.

blocks.s: Assembly code for doing 8-element, 64-byte aligned chunks
	for each operation.

Here is the makefile line I used:

stream_p: stream_p.c second_wall.c ploops.c blocks.s
	cc -fast -xO5 -dalign -xarch=v8plusa -xchip=ultra stream_p.c second_wall.c ploops.c blocks.s -o stream_p -lthread

The C compiler was from SunPro:

cc: SC4.0 18 Oct 1995 C 4.0


Here is the system info:

System Configuration:  Sun Microsystems  sun4u 16-slot Ultra Enterprise 6000
System clock frequency: 83 MHz
Memory size: 4096Mb
       CPU Units: Frequency Cache-Size Version
            A: MHz  MB  Impl. Mask  B: MHz  MB  Impl. Mask
            ----------  ----- ----  ----------  ----- ----
Board 0:       167 0.5   10    2.2     167 0.5   10    2.2  
Board 2:       167 0.5   10    2.2     167 0.5   10    2.2  
Board 4:       167 0.5   10    2.2     167 0.5   10    2.2  
Board 6:       167 0.5   10    2.2     167 0.5   10    2.2  
Board 8:       167 0.5   10    2.2     167 0.5   10    2.2  
Board10:       167 0.5   10    2.2     167 0.5   10    2.2  
Board12:       167 0.5   10    2.2     167 0.5   10    2.2  
Board14:       167 0.5   10    2.2     167 0.5   10    2.2  
          Memory Units: Size, Interleave Factor, Interleave With
          0: MB  Factor:  With:  1: MB  Factor:  With:
          -----  -------  -----  -----  -------  -----
Board 0:   256   16-way     A      256   16-way    A
Board 2:   256   16-way     A      256   16-way    A
Board 4:   256   16-way     A      256   16-way    A
Board 6:   256   16-way     A      256   16-way    A
Board 8:   256   16-way     A      256   16-way    A
Board10:   256   16-way     A      256   16-way    A
Board12:   256   16-way     A      256   16-way    A
Board14:   256   16-way     A      256   16-way    A


Thanks,

Ashok Singhal
Sun Microsystems

----------
X-Sun-Data-Type: c-file
X-Sun-Data-Name: stream_p.c
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 234

# include <stdio.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <sys/time.h>

/*
 * Program: Stream
 * Programmer: Joe R. Zagar
 * Revision: 4.0-BETA, October 24, 1995
 * Original code developed by John D. McCalpin
 *
 * This program measures memory transfer rates in MB/s for simple 
 * computational kernels coded in C.  These numbers reveal the quality
 * of code generation for simple uncacheable kernels as well as showing
 * the cost of floating-point operations relative to memory accesses.
 *
 * INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'N' (below) to give a 'timing calibration' of 
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */

/* Number of elements in each of the three test arrays a[], b[] and c[]. */
# define N	16000000
/* How many times the set of four kernels is timed; the minimum time is
   the one used for the reported rate. */
# define NTIMES	10
/* Extra elements added to the declared length of each array (N+OFFSET). */
# define OFFSET	0

/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_d.c second.c -o stream_d -lm
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@udel.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */

/* Separator line printed between the sections of the report. */
# define HLINE "-------------------------------------------------------------\n"

/* Minimum / maximum of two values.  Each argument is evaluated twice, so
   do not pass expressions with side effects. */
# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The three arrays the kernels operate on. */
static double	a[N+OFFSET],
		b[N+OFFSET],
		c[N+OFFSET];

/* Per-kernel timing statistics, indexed 0..3 in the order of label[].
   mintime starts at a large value so the first sample always replaces it. */
static double	rmstime[4] = {0}, maxtime[4] = {0},
		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

/* Row labels for the results table, one per kernel. */
static char	*label[4] = {"Assignment:", "Scaling   :",
    "Summing   :", "SAXPYing  :"};

/* Bytes counted per kernel invocation: two arrays of N doubles for the
   first two kernels, three arrays for the last two.  Divided by the
   minimum time to obtain the reported rate. */
static double	bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
    };

/* Wall-clock timer in seconds (second_wall.c). */
extern double second();

/*
 * main: STREAM benchmark driver.
 *
 * Prints the configuration, initializes the three arrays, estimates the
 * timer granularity with checktick(), times one calibration pass over a[],
 * then runs the four parallel kernels (passign, pscale, padd, ptriad)
 * NTIMES times each.  For every kernel it reports the rate derived from
 * the minimum time, together with the RMS, minimum and maximum times.
 *
 * Returns 0.
 */
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, t, times[4][NTIMES];

    /* The parallel kernels are defined in ploops.c.  Declare them here so
       the calls below are checked against a prototype instead of relying
       on implicit declarations. */
    extern void		*passign(double *c, double *a, int n);
    extern void		*pscale(double *b, double *c, double scalar, int n);
    extern void		*padd(double *c, double *a, double *b, int n);
    extern void		*ptriad(double *a, double *b, double *c,
				double scalar, int n);

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    printf("Total memory required = %.1f MB.\n",
	(3 * N * BytesPerWord) / 1048576.0);
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

    /* Give every array element a defined starting value. */

    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1)
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	/* Avoid dividing by zero in the clock-tick estimate below. */
	quantum = 1;
	}

    /* Calibration pass: time one sweep over a[] to estimate how long a
       single kernel invocation takes. */
    t = second();
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (second() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING: The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	/* Assignment: c[j] = a[j] */
	times[0][k] = second();
	passign(c ,a, N);
	times[0][k] = second() - times[0][k];

	/* Scaling: b[j] = scalar*c[j] */
	times[1][k] = second();
	pscale(b, c, scalar, N);
	times[1][k] = second() - times[1][k];

	/* Summing: c[j] = a[j]+b[j] */
	times[2][k] = second();
	padd(c, a, b, N);
	times[2][k] = second() - times[2][k];

	/* Triad: a[j] = b[j]+scalar*c[j] */
	times[3][k] = second();
	ptriad (a, b, c, scalar, N);
	times[3][k] = second() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=0; k<NTIMES; k++)
	{
	for (j=0; j<4; j++)
	    {
	    rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}

    printf("Function      Rate (MB/s)   RMS time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	rmstime[j] = sqrt(rmstime[j]/(double)NTIMES);

	/* The rate is based on the best (minimum) time of the NTIMES runs. */
	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       rmstime[j],
	       mintime[j],
	       maxtime[j]);
    }

    return 0;
    }

# define	M	20

/*
 * checktick: estimate the granularity of the second() timer.
 *
 * Gathers M timestamps, each taken once the clock has moved at least one
 * microsecond past a fresh reading, and returns the smallest gap between
 * consecutive timestamps in whole microseconds (never negative, and at
 * most 1000000).
 */
int
checktick()
    {
    double	stamps[M];
    double	start, now;
    int		idx, gap, smallest;

    /* Collect M distinct readings of the clock. */
    for (idx = 0; idx < M; idx++) {
	start = second();
	do {
	    now = second();
	} while ((now - start) < 1.0E-6);
	stamps[idx] = now;
	}

    /* The smallest step between neighbouring readings is the estimate. */
    smallest = 1000000;
    for (idx = 1; idx < M; idx++) {
	gap = (int)( 1.0E6 * (stamps[idx] - stamps[idx-1]));
	if (gap < 0)
	    gap = 0;
	if (gap < smallest)
	    smallest = gap;
	}

    return smallest;
    }

----------
X-Sun-Data-Type: c-file
X-Sun-Data-Name: second_wall.c
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 25

/* A Fortran-callable gettimeofday routine to give access
   to the wall clock timer.

   This subroutine may need to be modified slightly to get
   it to link with Fortran on your computer.
*/

#include <sys/time.h>
/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */

/*
 * second: current wall-clock time in seconds, with the microsecond part
 * of gettimeofday() as the fraction.
 *
 * Returns 0.0 if gettimeofday() reports an error, so the caller never
 * sees a value built from an uninitialized timeval.
 */
double second()
{
	struct timeval tp;

	/* The timezone argument is obsolete; a null pointer is the only
	   value with specified behaviour, and it avoids depending on
	   struct timezone being declared. */
	if (gettimeofday(&tp, (void *) 0) != 0)
		return 0.0;

	return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}
----------
X-Sun-Data-Type: c-file
X-Sun-Data-Name: ploops.c
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 273

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread.h>

/* Size of the shared argument table below and of the thread-id arrays
   declared in each of the parallel driver functions. */
#define MAXPROCS 32

/* Work description handed to one worker: the array pointers for its slice,
   the scale factor (read only by the scale and triad workers) and the
   number of elements in the slice.  Workers read only the fields their
   kernel needs. */
typedef struct ar_t {
	double *a;
	double *b;
	double *c;
	double scalar;
	int n;
} ar_t;

/* One argument slot per worker; slot p is filled by the driver before
   worker p is started. */
ar_t argmnts[MAXPROCS];

/* Assembly kernels from blocks.s.  They are called with n set to the
   number of 8-element chunks, after the destination pointer has been
   advanced to a 64-byte boundary; the scalar is passed by address. */
extern void b_scale(double *b, double *c, double *scalar, int n);
extern void b_add(double *c, double *a, double *b, int n);
extern void b_triad(double *a, double *b, double *c, double *scalar, int n);

/*
 * p_assign: worker for the assignment kernel (c[j] = a[j]).
 *
 * ar points to an ar_t whose a, c and n fields describe the slice to
 * copy.  Has the signature of a thread start routine; always returns NULL.
 */
void *p_assign(void *ar)
{
	ar_t *arg = (ar_t *) ar;

	memcpy((void *) arg->c, (void *) arg->a, arg->n * sizeof(double));

	/* A start routine must return a value; nothing is reported back. */
	return NULL;
}

/*
 * p_scale: worker for the scaling kernel (b[j] = scalar * c[j]).
 *
 * ar points to an ar_t whose b, c, scalar and n fields describe the slice.
 * The slice is processed in three parts: single elements until the
 * destination is 64-byte aligned, whole 8-element chunks via the assembly
 * routine b_scale, then any remaining elements one at a time.
 * Has the signature of a thread start routine; always returns NULL.
 */
void *p_scale(void *ar)
{
	ar_t *arg = (ar_t *) ar;
	double *b = arg->b;
	double *c = arg->c;
	double scalar = arg->scalar;
	int n = arg->n;
	int nb;

	/* Head: advance until the destination sits on a 64-byte boundary.
	   Only the low six address bits matter; unsigned long is used so the
	   cast is not narrower than a pointer on LP64. */
	while ((((unsigned long) b & 0x3f) != 0) && (n > 0)) {
		*b++ = scalar * (*c++);
		n--;
	}

	/* Body: whole 8-element (64-byte) chunks. */
	nb = n / 8;
	b_scale(b, c, &scalar, nb);
	b += 8 * nb;
	c += 8 * nb;
	n -= 8 * nb;

	/* Tail: fewer than eight elements left. */
	while (n > 0) {
		*b++ = scalar * (*c++);
		n--;
	}

	return NULL;
}

/*
 * p_add: worker for the sum kernel (c[j] = a[j] + b[j]).
 *
 * ar points to an ar_t whose a, b, c and n fields describe the slice.
 * The slice is processed in three parts: single elements until the
 * destination is 64-byte aligned, whole 8-element chunks via the assembly
 * routine b_add, then any remaining elements one at a time.
 * Has the signature of a thread start routine; always returns NULL.
 */
void *p_add(void *ar)
{
	ar_t *arg = (ar_t *) ar;
	double *a = arg->a;
	double *b = arg->b;
	double *c = arg->c;
	int n = arg->n;
	int nb;

	/* Head: advance until the destination sits on a 64-byte boundary.
	   Only the low six address bits matter; unsigned long is used so the
	   cast is not narrower than a pointer on LP64. */
	while ((((unsigned long) c & 0x3f) != 0) && (n > 0)) {
		*c++ = (*a++) + (*b++);
		n--;
	}

	/* Body: whole 8-element (64-byte) chunks. */
	nb = n / 8;
	b_add(c, a, b, nb);
	a += 8 * nb;
	b += 8 * nb;
	c += 8 * nb;
	n -= 8 * nb;

	/* Tail: fewer than eight elements left. */
	while (n > 0) {
		*c++ = (*a++) + (*b++);
		n--;
	}

	return NULL;
}

/*
 * p_triad: worker for the triad kernel (a[j] = b[j] + scalar * c[j]).
 *
 * ar points to an ar_t whose a, b, c, scalar and n fields describe the
 * slice.  The slice is processed in three parts: single elements until the
 * destination is 64-byte aligned, whole 8-element chunks via the assembly
 * routine b_triad, then any remaining elements one at a time.
 * Has the signature of a thread start routine; always returns NULL.
 */
void *p_triad(void *ar)
{
	ar_t *arg = (ar_t *) ar;
	double *a = arg->a;
	double *b = arg->b;
	double *c = arg->c;
	double scalar = arg->scalar;
	int n = arg->n;
	int nb;

	/* Head: advance until the destination sits on a 64-byte boundary.
	   Only the low six address bits matter; unsigned long is used so the
	   cast is not narrower than a pointer on LP64. */
	while ((((unsigned long) a & 0x3f) != 0) && (n > 0)) {
		*a++ = (*b++) + (scalar * (*c++));
		n--;
	}

	/* Body: whole 8-element (64-byte) chunks. */
	nb = n / 8;
	b_triad(a, b, c, &scalar, nb);
	a += 8 * nb;
	b += 8 * nb;
	c += 8 * nb;
	n -= 8 * nb;

	/* Tail: fewer than eight elements left. */
	while (n > 0) {
		*a++ = (*b++) + (scalar * (*c++));
		n--;
	}

	return NULL;
}

/*
 * passign: parallel assignment, c[j] = a[j] for j in [0, n).
 *
 * The number of workers is read from the PARALLEL environment variable.
 * A missing, non-numeric or non-positive value means 1, and values above
 * MAXPROCS are reduced to MAXPROCS so the argument and thread-id tables
 * cannot be overrun.  The first par-1 slices of n/par elements each run
 * in bound threads; the calling thread handles the final slice, which
 * also absorbs the remainder.  Returns after every worker has been joined.
 *
 * Prints a message and exits with status -1 if a thread cannot be created.
 * Always returns NULL.
 */
void *passign( double *c, double *a, int n)
{
	int par, p, p_n;
	long req;
	const char *env;
	thread_t tids[MAXPROCS];

	env = getenv("PARALLEL");
	req = (env != NULL) ? strtol(env, NULL, 10) : 1;
	if (req < 1)
		req = 1;
	else if (req > MAXPROCS)
		req = MAXPROCS;
	par = (int) req;

	p_n = n / par;
	for (p = 0; p < par - 1; p++) {
		argmnts[p].c = c;
		argmnts[p].a = a;
		argmnts[p].n = p_n;
		c += p_n;
		a += p_n;
		n -= p_n;
		/* Default stack: null base and a size of 0. */
		if (thr_create(NULL, 0, p_assign, (void *) (argmnts + p),
			       THR_BOUND, tids + p) != 0) {
			printf("thread_create failed\n");
			exit (-1);
		}
	}

	/* The caller does the last slice itself instead of idling. */
	argmnts[p].c = c;
	argmnts[p].a = a;
	argmnts[p].n = n;
	p_assign((void *) (argmnts + p));

	for (p = 0; p < par - 1; p++)
		thr_join(tids[p], NULL, NULL);

	return NULL;
}

/*
 * pscale: parallel scaling, b[j] = scalar * c[j] for j in [0, n).
 *
 * The number of workers is read from the PARALLEL environment variable.
 * A missing, non-numeric or non-positive value means 1, and values above
 * MAXPROCS are reduced to MAXPROCS so the argument and thread-id tables
 * cannot be overrun.  The first par-1 slices of n/par elements each run
 * in bound threads; the calling thread handles the final slice, which
 * also absorbs the remainder.  Returns after every worker has been joined.
 *
 * Prints a message and exits with status -1 if a thread cannot be created.
 * Always returns NULL.
 */
void *pscale( double *b, double *c, double scalar, int n)
{
	int par, p, p_n;
	long req;
	const char *env;
	thread_t tids[MAXPROCS];

	env = getenv("PARALLEL");
	req = (env != NULL) ? strtol(env, NULL, 10) : 1;
	if (req < 1)
		req = 1;
	else if (req > MAXPROCS)
		req = MAXPROCS;
	par = (int) req;

	p_n = n / par;
	for (p = 0; p < par - 1; p++) {
		argmnts[p].b = b;
		argmnts[p].c = c;
		argmnts[p].scalar = scalar;
		argmnts[p].n = p_n;
		b += p_n;
		c += p_n;
		n -= p_n;
		/* Default stack: null base and a size of 0. */
		if (thr_create(NULL, 0, p_scale, (void *) (argmnts + p),
			       THR_BOUND, tids + p) != 0) {
			printf("thread_create failed\n");
			exit (-1);
		}
	}

	/* The caller does the last slice itself instead of idling. */
	argmnts[p].b = b;
	argmnts[p].c = c;
	argmnts[p].scalar = scalar;
	argmnts[p].n = n;
	p_scale((void *) (argmnts + p));

	for (p = 0; p < par - 1; p++)
		thr_join(tids[p], NULL, NULL);

	return NULL;
}


/*
 * padd: parallel sum, c[j] = a[j] + b[j] for j in [0, n).
 *
 * The number of workers is read from the PARALLEL environment variable.
 * A missing, non-numeric or non-positive value means 1, and values above
 * MAXPROCS are reduced to MAXPROCS so the argument and thread-id tables
 * cannot be overrun.  The first par-1 slices of n/par elements each run
 * in bound threads; the calling thread handles the final slice, which
 * also absorbs the remainder.  Returns after every worker has been joined.
 *
 * Prints a message and exits with status -1 if a thread cannot be created.
 * Always returns NULL.
 */
void *padd( double *c, double *a, double *b, int n)
{
	int par, p, p_n;
	long req;
	const char *env;
	thread_t tids[MAXPROCS];

	env = getenv("PARALLEL");
	req = (env != NULL) ? strtol(env, NULL, 10) : 1;
	if (req < 1)
		req = 1;
	else if (req > MAXPROCS)
		req = MAXPROCS;
	par = (int) req;

	p_n = n / par;
	for (p = 0; p < par - 1; p++) {
		argmnts[p].a = a;
		argmnts[p].b = b;
		argmnts[p].c = c;
		argmnts[p].n = p_n;
		a += p_n;
		b += p_n;
		c += p_n;
		n -= p_n;
		/* Default stack: null base and a size of 0. */
		if (thr_create(NULL, 0, p_add, (void *) (argmnts + p),
			       THR_BOUND, tids + p) != 0) {
			printf("thread_create failed\n");
			exit (-1);
		}
	}

	/* The caller does the last slice itself instead of idling. */
	argmnts[p].a = a;
	argmnts[p].b = b;
	argmnts[p].c = c;
	argmnts[p].n = n;
	p_add((void *) (argmnts + p));

	for (p = 0; p < par - 1; p++)
		thr_join(tids[p], NULL, NULL);

	return NULL;
}

/*
 * ptriad: parallel triad, a[j] = b[j] + scalar * c[j] for j in [0, n).
 *
 * The number of workers is read from the PARALLEL environment variable.
 * A missing, non-numeric or non-positive value means 1, and values above
 * MAXPROCS are reduced to MAXPROCS so the argument and thread-id tables
 * cannot be overrun.  The first par-1 slices of n/par elements each run
 * in bound threads; the calling thread handles the final slice, which
 * also absorbs the remainder.  Returns after every worker has been joined.
 *
 * Prints a message and exits with status -1 if a thread cannot be created.
 * Always returns NULL.
 */
void *ptriad( double *a, double *b, double *c, double scalar, int n)
{
	int par, p, p_n;
	long req;
	const char *env;
	thread_t tids[MAXPROCS];

	env = getenv("PARALLEL");
	req = (env != NULL) ? strtol(env, NULL, 10) : 1;
	if (req < 1)
		req = 1;
	else if (req > MAXPROCS)
		req = MAXPROCS;
	par = (int) req;

	p_n = n / par;
	for (p = 0; p < par - 1; p++) {
		argmnts[p].a = a;
		argmnts[p].b = b;
		argmnts[p].c = c;
		argmnts[p].scalar = scalar;
		argmnts[p].n = p_n;
		a += p_n;
		b += p_n;
		c += p_n;
		n -= p_n;
		/* Default stack: null base and a size of 0. */
		if (thr_create(NULL, 0, p_triad, (void *) (argmnts + p),
			       THR_BOUND, tids + p) != 0) {
			printf("thread_create failed\n");
			exit (-1);
		}
	}

	/* The caller does the last slice itself instead of idling. */
	argmnts[p].a = a;
	argmnts[p].b = b;
	argmnts[p].c = c;
	argmnts[p].scalar = scalar;
	argmnts[p].n = n;
	p_triad((void *) (argmnts + p));

	for (p = 0; p < par - 1; p++)
		thr_join(tids[p], NULL, NULL);

	return NULL;
}

----------
X-Sun-Data-Type: default
X-Sun-Data-Name: blocks.s
X-Sun-Charset: us-ascii
X-Sun-Content-Lines: 133

	.seg	"text"			! [internal]
! b_scale: chunked part of the scale kernel, dst[i] = scalar * src[i].
! Declared in ploops.c as b_scale(b, c, &scalar, n).  As used below:
!   %o0 = destination, %o1 = source, %o2 = address of the scalar,
!   %o3 = number of 64-byte (8-double) chunks.
! Does nothing to the arrays when the count is <= 0.
	.proc	16
	.align	4
	.global	b_scale
b_scale:
		tst		%o3
! The scalar is loaded into %f32 before the branch, so this load executes
! whether or not the branch is taken.
		ldd [%o2], %f32
		ble		1f
		nop
2:		
! Load one chunk of eight source doubles into %f0..%f14.
		ldd	[%o1], %f0
		ldd [%o1+8], %f2
		ldd [%o1+16], %f4
		ldd [%o1+24], %f6
		ldd [%o1+32], %f8
		ldd [%o1+40], %f10
		ldd [%o1+48], %f12
		ldd [%o1+56], %f14
! Multiply each by the scalar; the products land in %f16..%f30.
		fmuld %f0, %f32, %f16
		fmuld %f2, %f32, %f18
		fmuld %f4, %f32, %f20
		fmuld %f6, %f32, %f22
		fmuld %f8, %f32, %f24
		fmuld %f10, %f32, %f26
		fmuld %f12, %f32, %f28
		fmuld %f14, %f32, %f30
! Count down (setting the condition codes for bg) and step the source.
		subcc %o3, 1, %o3
		add %o1, 64, %o1
! Store starting at %f16 to the destination through ASI 0xf0 -- presumably
! the 64-byte block store the accompanying notes describe; confirm the ASI
! name and its alignment requirement against the UltraSPARC manual.
! NOTE(review): no membar follows the block store in this routine; confirm
! that ordering is guaranteed elsewhere before the data is read back.
		stda %f16, [%o0] 0xf0
! The destination pointer is advanced in the branch delay slot.
		bg		2b
		add %o0, 64, %o0
1:				
	retl
	nop

! b_add: chunked part of the add kernel, dst[i] = x[i] + y[i].
! Declared in ploops.c as b_add(c, a, b, n).  As used below:
!   %o0 = destination, %o1 = first source, %o2 = second source,
!   %o3 = number of 64-byte (8-double) chunks.
! Does nothing to the arrays when the count is <= 0.
	.proc	16
	.align	4
	.global	b_add
b_add:
		tst		%o3
		ble		1f
		nop
2:		
! Eight doubles from the first source into %f0..%f14.
		ldd	[%o1], %f0
		ldd [%o1+8], %f2
		ldd [%o1+16], %f4
		ldd [%o1+24], %f6
		ldd [%o1+32], %f8
		ldd [%o1+40], %f10
		ldd [%o1+48], %f12
		ldd [%o1+56], %f14

! Eight doubles from the second source into %f16..%f30.
		ldd	[%o2], %f16
		ldd [%o2+8], %f18
		ldd [%o2+16], %f20
		ldd [%o2+24], %f22
		ldd [%o2+32], %f24
		ldd [%o2+40], %f26
		ldd [%o2+48], %f28
		ldd [%o2+56], %f30

! Element-wise sums into %f32..%f46.
		faddd %f0, %f16, %f32
		faddd %f2, %f18, %f34
		faddd %f4, %f20, %f36
		faddd %f6, %f22, %f38
		faddd %f8, %f24, %f40
		faddd %f10, %f26, %f42
		faddd %f12, %f28, %f44
		faddd %f14, %f30, %f46
! Count down (setting the condition codes for bg) and step both sources.
		subcc %o3, 1, %o3
		add %o1, 64, %o1
		add %o2, 64, %o2
! Store starting at %f32 to the destination through ASI 0xf0 -- presumably
! the 64-byte block store the accompanying notes describe; confirm the ASI
! name and its alignment requirement against the UltraSPARC manual.
! NOTE(review): no membar follows the block store in this routine; confirm
! that ordering is guaranteed elsewhere before the data is read back.
		stda %f32, [%o0] 0xf0
! The destination pointer is advanced in the branch delay slot.
		bg		2b
		add %o0, 64, %o0
1:				
	retl
	nop

! b_triad: chunked part of the triad kernel, dst[i] = x[i] + scalar * y[i].
! Declared in ploops.c as b_triad(a, b, c, &scalar, n).  As used below:
!   %o0 = destination, %o1 = addend source (x), %o2 = scaled source (y),
!   %o3 = address of the scalar, %o4 = number of 64-byte (8-double) chunks.
! Does nothing to the arrays when the count is <= 0.
	.proc	16
	.align	4
	.global	b_triad
b_triad:
		tst		%o4
! The scalar is loaded into %f48 before the branch, so this load executes
! whether or not the branch is taken.
		ldd [%o3], %f48
		ble		1f
		nop
2:		
! Eight doubles of the scaled source into %f16..%f30.
		ldd	[%o2], %f16
		ldd [%o2+8], %f18
		ldd [%o2+16], %f20
		ldd [%o2+24], %f22
		ldd [%o2+32], %f24
		ldd [%o2+40], %f26
		ldd [%o2+48], %f28
		ldd [%o2+56], %f30

! Eight doubles of the addend source into %f0..%f14.
		ldd	[%o1], %f0
		ldd [%o1+8], %f2
		ldd [%o1+16], %f4
		ldd [%o1+24], %f6
		ldd [%o1+32], %f8
		ldd [%o1+40], %f10
		ldd [%o1+48], %f12
		ldd [%o1+56], %f14
		
! Scale %f16..%f30 in place by the scalar.
		fmuld %f16, %f48, %f16
		fmuld %f18, %f48, %f18
		fmuld %f20, %f48, %f20
		fmuld %f22, %f48, %f22
		fmuld %f24, %f48, %f24
		fmuld %f26, %f48, %f26
		fmuld %f28, %f48, %f28
		fmuld %f30, %f48, %f30

! Add the addend chunk; results go to %f32..%f46.
		faddd %f0, %f16, %f32
		faddd %f2, %f18, %f34
		faddd %f4, %f20, %f36
		faddd %f6, %f22, %f38
		faddd %f8, %f24, %f40
		faddd %f10, %f26, %f42
		faddd %f12, %f28, %f44
		faddd %f14, %f30, %f46
! Count down (setting the condition codes for bg) and step both sources.
		subcc %o4, 1, %o4
		add %o1, 64, %o1
		add %o2, 64, %o2
! Store starting at %f32 to the destination through ASI 0xf0 -- presumably
! the 64-byte block store the accompanying notes describe; confirm the ASI
! name and its alignment requirement against the UltraSPARC manual.
! NOTE(review): no membar follows the block store in this routine; confirm
! that ordering is guaranteed elsewhere before the data is read back.
		stda %f32, [%o0] 0xf0
! The destination pointer is advanced in the branch delay slot.
		bg		2b
		add %o0, 64, %o0
1:				
	retl
	nop

