/*
 * Copyright 1995 N.M. Maclaren
 * Copyright 1995 The University of Cambridge
 *
 * This may be used and distributed freely, provided that due
 * acknowledgement is given and that the copyright holders are not held
 * responsible for any errors.  Please send comments and corrections to
 * nmm1@cam.ac.uk.
 *
 * This program gives an indication of the memory performance of a system,
 * which may be some help in deducing how it will perform under a heavy
 * interactive load.  Note that the indication will be unreasonably low for
 * systems with CPUs that are slow relative to their memory capacity, or for
 * poor compilers, because of the loop overheads; this variation may be as
 * much as a factor of two.  The reason that a complicated loop is used is
 * to avoid certain common cache optimisations.
 *
 * For investigating a system in more detail, a more complete test suite
 * (such as Larry McVoy's lmbench) should be used.  John McCalpin has a set
 * of CPU and bandwidth comparisons, using his stream benchmark, which is
 * extremely useful.
 *
 * To use this program, follow these steps:
 *
 * 1) Compile it with a suitable C compiler under full optimisation.  It
 *    should compile without trouble under any reasonable compiler, ANSI or
 *    not, though it may be necessary to tweak CLOCKS_PER_SEC for non-UNIX,
 *    non-ANSI compilers, or to disable the use of volatile.  There are
 *    pre-processor statements to do both of these.
 *
 * 2) Run it on a near-idle system with one parameter, which should be set
 *    to about half the maximum real memory available to a single process.
 *    For very fast systems with a small amount of store, a second parameter
 *    can be supplied to increase the loop length.  The symptom of an
 *    inadequate loop length is that totally ridiculous values are printed
 *    for the access rate and bandwidth.
 *
 * 3) It will print out the access rates and access bandwidth for store
 *    increasing from 1K (2K on systems with 8-byte longs) by factors of 2
 *    up to the maximum specified.  This should show the various cache
 *    boundaries etc.
 *
 * In 1995, a fast workstation's main memory will have a rate of about 2.5
 * million/sec. and a bandwidth of 50 MB/sec.  Figures of 1.0 million/sec
 * and 20 MB/sec are more typical.
 */

/*
 * Maintenance note: this revision uses prototype-style function
 * definitions, so unlike the original it needs an ISO C compiler.  The
 * pre-processor fallbacks below are retained as harmless tweak points.
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0          /* For non-ANSI systems, like SunOS! */
#define EXIT_FAILURE 1
#endif

#ifndef CLOCKS_PER_SEC
#ifdef CLK_TCK
#define CLOCKS_PER_SEC CLK_TCK  /* Some systems still use this older form */
#endif
#endif

#ifndef CLOCKS_PER_SEC
#define CLOCKS_PER_SEC 1000000  /* For non-ANSI systems, like SunOS! */
#endif

#ifdef __STDC__
#define VOLATILE volatile       /* Unset this if volatile is not accepted */
#else
#define VOLATILE                /* Most non-ANSI compilers won't like it */
#endif

VOLATILE long junk = 0;         /* Used to prevent over-optimisation */

/*
 * Return the processor time consumed since `start`, in clock() ticks.
 * The result is clamped to at least 1 so callers can divide by it.
 */
static long elapsed_ticks(clock_t start)
{
    clock_t ticks = clock() - start;

    return ticks > 0 ? (long)ticks : 1;
}

/* Set the first `length` elements of `array` to zero. */
static void clear_store(long *array, long length)
{
    long i;

    for (i = 0; i < length; ++i)
        array[i] = 0;
}

/*
 * Strided read test.
 *
 * For each of the 8 entries in `offsets`, starts at
 * array[offsets[i] + limit] and steps downwards 128 elements at a time,
 * summing each element visited, until the pointer drops below array[128].
 * The whole sweep is repeated 16 * count times.  When `offsets` is a
 * permutation of the multiples of 16 below 128 (as in main) and `limit` is
 * a multiple of 128, this performs count * limit reads in total.
 *
 * The sum is stored in the global `junk` so the reads cannot be discarded.
 *
 * The first access is at (offsets[i] + limit - 128), so `limit` must be at
 * least 128 and `array` must extend to the largest offsets[i] + limit;
 * main passes powers of two from 256 and allocates 128 spare elements.
 *
 * Returns the elapsed clock() ticks, never less than 1.
 */
long read_a(const long *array, const long *offsets, long limit, long count)
{
    long i;
    register const long *ptr, *lwb;
    register long n;
    clock_t t;

    t = clock();
    n = 0;
    count *= 16;
    lwb = &array[128];
    while (--count >= 0) {
        for (i = 0; i < 8; ++i) {
            ptr = &array[offsets[i] + limit];
            do {
                n += *(ptr -= 128);
            } while (ptr >= lwb);
        }
    }
    junk = n;
    return elapsed_ticks(t);
}

/*
 * Strided write test.
 *
 * Visits exactly the same elements in the same order as read_a(), but
 * stores the current value of the global `junk` into each one instead of
 * reading it.  The same requirements on `limit` and `array` apply.
 *
 * Returns the elapsed clock() ticks, never less than 1.
 */
long write_a(long *array, const long *offsets, long limit, long count)
{
    long i;
    register long *ptr, *lwb;
    register long n;
    clock_t t;

    t = clock();
    n = junk;
    count *= 16;
    lwb = &array[128];
    while (--count >= 0) {
        for (i = 0; i < 8; ++i) {
            ptr = &array[offsets[i] + limit];
            do {
                *(ptr -= 128) = n;
            } while (ptr >= lwb);
        }
    }
    return elapsed_ticks(t);
}

/*
 * Sequential read test.
 *
 * Sums array[0] .. array[limit-1] in ascending order, `count` times over,
 * and stores the sum in the global `junk`.  Because the inner loop is a
 * do/while, at least one element is read per pass even if limit < 1.
 *
 * Returns the elapsed clock() ticks, never less than 1.
 */
long read_b(const long *array, long limit, long count)
{
    register const long *ptr, *upb;
    register long n;
    clock_t t;

    t = clock();
    n = 0;
    upb = &array[limit];
    while (--count >= 0) {
        ptr = array;
        do {
            n += *ptr++;
        } while (ptr < upb);
    }
    junk = n;
    return elapsed_ticks(t);
}

/*
 * Sequential write test.
 *
 * Stores the current value of the global `junk` into array[0] ..
 * array[limit-1] in ascending order, `count` times over.  At least one
 * element is written per pass (do/while loop).
 *
 * Returns the elapsed clock() ticks, never less than 1.
 */
long write_b(long *array, long limit, long count)
{
    register long *ptr, *upb;
    register long n;
    clock_t t;

    t = clock();
    n = junk;
    upb = &array[limit];
    while (--count >= 0) {
        ptr = array;
        do {
            *ptr++ = n;
        } while (ptr < upb);
    }
    return elapsed_ticks(t);
}

/*
 * Entry point.
 *
 * argv[1] is the store size as an integer followed by one of k, K, m or M;
 * the optional argv[2] is a repetition count.  For every power-of-two test
 * size from 256 longs up to the requested store size, the four tests are
 * timed and one line of access rates (millions/sec) and bandwidths
 * (MB/sec) is printed.
 *
 * Exits with EXIT_SUCCESS after printing the syntax when called without
 * arguments, and with EXIT_FAILURE on invalid arguments or if the store
 * cannot be allocated.
 */
int main(int argc, char *argv[])
{
    static const long offsets[8] = {112, 48, 80, 16, 96, 32, 64, 0};
    long *array, limit = 0, count = 1, repeat, k, n, bytes;
    int error = 0;
    /* c1 is initialised because it is inspected even when parsing fails */
    char c1 = '\0', c2;
    double a_read, a_write, b_read, b_write, scale;

    if (argc <= 1) {
        printf("Syntax: store [ nk | nK | nm | nM ] k\n");
        printf(" the store size to test, and the repetition count\n");
        exit(EXIT_SUCCESS);
    } else if (argc > 3
               || sscanf(argv[1], "%ld%c%c", &limit, &c1, &c2) != 2
               || (c1 != 'k' && c1 != 'K' && c1 != 'm' && c1 != 'M')) {
        /* Exactly "<number><unit>" is accepted; anything trailing fails */
        error = 1;
    } else if (argc == 3
               && (sscanf(argv[2], "%ld%c", &count, &c2) != 1
                   || count < 1 || count > LONG_MAX / 1000)) {
        error = 1;
    }

    k = (c1 == 'm' || c1 == 'M') ? 1048576L : 1024L;

    /*
     * Reject sizes whose byte count, or whose largest repeat count
     * (count * limit / 256), would not fit in a long.  `error` is tested
     * first so that the division by count is only reached once count has
     * been validated as positive.
     */
    if (error || limit < 1 || limit > LONG_MAX / k
        || limit * k / (256 * (long)sizeof(long)) > LONG_MAX / count) {
        fprintf(stderr, "Invalid arguments specified\n");
        exit(EXIT_FAILURE);
    }

    /* Convert to a number of longs, rounded down to a multiple of 128 */
    limit = (limit * k / (128 * (long)sizeof(long))) * 128;

    /*
     * The 128 spare elements let the strided tests form their start
     * pointers (up to limit + 112) inside the allocation.  The size is
     * computed in size_t; going through int would truncate large stores.
     */
    array = malloc(((size_t)limit + 128) * sizeof *array);
    if (array == NULL) {
        fprintf(stderr, "Unable to get store\n");
        exit(EXIT_FAILURE);
    }

    printf("Testing the store performance in up to %ld %s\n",
           limit * (long)sizeof(long) / k, (k > 10000 ? "MB" : "KB"));
    printf("Accesses in millions/sec, bandwidth in MB/sec\n\n");
    printf(" Accesses Bandwidth\n");
    printf(" Read Write Read Write\n");

    for (n = 256; n <= limit; n <<= 1) {
        repeat = count * (limit / n);

        /*
         * Each test performs repeat * n accesses.  Scaling by that figure
         * (rather than count * limit) keeps the rates correct when limit
         * is not an exact multiple of n.
         */
        scale = (double)repeat * (double)n * 1.0e-6 * (double)CLOCKS_PER_SEC;

        clear_store(array, limit);
        a_write = scale / write_a(array, offsets, n, repeat);
        a_read = scale / read_a(array, offsets, n, repeat);

        /* Same access count, expressed in bytes for the bandwidth figures */
        scale *= (double)sizeof(long);

        clear_store(array, limit);
        b_write = scale / write_b(array, n, repeat);
        b_read = scale / read_b(array, n, repeat);

        bytes = n * (long)sizeof(long);
        if (bytes > 1000000)
            printf("%4d MB: ", (int)(bytes / 1048576));
        else
            printf("%4d KB: ", (int)(bytes / 1024));
        printf("%7.2f %7.2f %7.2f %7.2f\n", a_read, a_write, b_read, b_write);
    }

    free(array);
    return 0;
}