/* Copyright 1995 N.M. Maclaren
   Copyright 1995 The University of Cambridge

This may be used and distributed freely, provided that due acknowledgement is
given and that the copyright holders are not held responsible for any errors.
Please send comments and corrections to nmm1@cam.ac.uk.


This program gives an indication of the memory performance of a system, which
may be some help in deducing how it will perform under a heavy interactive
load.  Note that the indication will be unreasonably low for systems with CPUs
that are slow relative to their memory capacity, or for poor compilers, because
of the loop overheads; this variation may be as much as a factor of two.  The
reason that a complicated loop is used is to avoid certain common cache
optimisations.  For investigating a system in more detail, a more complete test
suite (such as Larry McVoy's lmbench) should be used.  John McCalpin has a set
of CPU and bandwidth comparisons, using his stream benchmark, which is extremely
useful.

To use this program, follow these steps:

    1) Compile it with a suitable C compiler under full optimisation.  It
should compile without trouble under any reasonable compiler, ANSI or not,
though it may be necessary to tweak CLOCKS_PER_SEC for non-UNIX, non-ANSI
compilers, or to disable the use of volatile.  There are pre-processor
statements to do both of these.

    2) Run it on a near-idle system with one parameter, which should be set to
about half the maximum real memory available to a single process.  For very
fast systems with a small amount of store, a second parameter can be supplied
to increase the loop length.  The symptom of an inadequate loop length is that
totally ridiculous values are printed for the access rate and bandwidth.

    3) It will print out the access rates and access bandwidth for store
increasing from 1K (2K on systems with 8-byte longs) by factors of 2 up to
the maximum specified.  This should show the various cache boundaries etc.
In 1995, a fast workstation's main memory will have a rate of about 2.5
million/sec. and a bandwidth of 50 MB/sec.  Figures of 1.0 million/sec and 20
MB/sec are more typical.

*/


#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Exit codes, supplied here only when <stdlib.h> did not define them. */
#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0            /* For non-ANSI systems, like SunOS! */
#define EXIT_FAILURE 1
#endif
/* Tick rate of clock().  The library's CLOCKS_PER_SEC is used when present;
   otherwise the older CLK_TCK name is tried; failing both, microsecond ticks
   are assumed. */
#ifndef CLOCKS_PER_SEC
#ifdef CLK_TCK
#define CLOCKS_PER_SEC CLK_TCK   /* Some systems still use this older form */
#endif
#endif
#ifndef CLOCKS_PER_SEC
#define CLOCKS_PER_SEC 1000000   /* Last resort: assume microsecond ticks */
#endif
/* VOLATILE expands to the volatile qualifier whenever __STDC__ is defined and
   to nothing otherwise. */
#ifdef __STDC__
#define VOLATILE volatile        /* Unset this if volatile is not accepted */
#else
#define VOLATILE                 /* Most non-ANSI compilers won't like it */
#endif


/* Sink/source shared with the timing kernels: the read kernels store their
   running sum here and the write kernels fetch their fill value from here,
   so the loops cannot be discarded as having no observable effect. */
VOLATILE long junk = 0;          /* Used to prevent over-optimisation */


/*
 * Time strided reads of the first `limit' longs of `array'.
 *
 * One pass makes eight downward sweeps, one per entry of `offsets'.  A sweep
 * starts at array[offsets[lane]+limit] and steps down 128 longs per access
 * until it has dropped below array[128].  `count'*16 passes are made.  The
 * accumulated sum is stored in `junk' so that the loads cannot be removed.
 *
 * Returns the elapsed clock() ticks, or 1 if the measured difference is not
 * positive (the caller divides by the result).
 */
long read_a (array, offsets, limit, count)
long *array, *offsets, limit, count;
{
    register long *cursor, *base, sum;
    long pass, lane;
    clock_t started, elapsed;

    started = clock();
    sum = 0;
    base = array+128;
    for (pass = count*16; pass > 0; --pass) {
        for (lane = 0; lane < 8; ++lane) {
            cursor = array+(offsets[lane]+limit);
            /* Step down first, then load; stop once below array[128]. */
            do {
                cursor -= 128;
                sum += *cursor;
            } while (cursor >= base);
        }
    }
    junk = sum;

    elapsed = clock()-started;
    if (elapsed > 0) return elapsed;
    return 1;
}


/*
 * Time strided writes to the first `limit' longs of `array'.
 *
 * The access pattern is the same as the strided read test: `count'*16
 * passes, each made of eight downward sweeps that start at
 * array[offsets[lane]+limit] and step down 128 longs per store until the
 * pointer has dropped below array[128].  The value stored is fetched once
 * from `junk' before the loops begin.
 *
 * Returns the elapsed clock() ticks, or 1 if the measured difference is not
 * positive (the caller divides by the result).
 */
long write_a (array, offsets, limit, count)
long *array, *offsets, limit, count;
{
    register long *cursor, *base, fill;
    long pass, lane;
    clock_t started, elapsed;

    started = clock();
    fill = junk;
    base = array+128;
    for (pass = count*16; pass > 0; --pass) {
        for (lane = 0; lane < 8; ++lane) {
            cursor = array+(offsets[lane]+limit);
            /* Step down first, then store; stop once below array[128]. */
            do {
                cursor -= 128;
                *cursor = fill;
            } while (cursor >= base);
        }
    }

    elapsed = clock()-started;
    if (elapsed > 0) return elapsed;
    return 1;
}


/*
 * Time sequential reads of array[0] .. array[limit-1].
 *
 * The range is walked upwards one long at a time, `count' times over.  The
 * accumulated sum is stored in `junk' so that the loads cannot be removed.
 *
 * Returns the elapsed clock() ticks, or 1 if the measured difference is not
 * positive (the caller divides by the result).
 */
long read_b (array, limit, count)
long *array, limit, count;
{
    register long *cursor, *end, sum;
    clock_t started, elapsed;

    started = clock();
    sum = 0;
    end = array+limit;
    for (; count > 0; --count) {
        cursor = array;
        do {
            sum += *cursor;
            ++cursor;
        } while (cursor < end);
    }
    junk = sum;

    elapsed = clock()-started;
    if (elapsed > 0) return elapsed;
    return 1;
}


/*
 * Time sequential writes to array[0] .. array[limit-1].
 *
 * The range is walked upwards one long at a time, `count' times over.  The
 * value stored is fetched once from `junk' before the loops begin.
 *
 * Returns the elapsed clock() ticks, or 1 if the measured difference is not
 * positive (the caller divides by the result).
 */
long write_b (array, limit, count)
long *array, limit, count;
{
    register long *cursor, *end, fill;
    clock_t started, elapsed;

    started = clock();
    fill = junk;
    end = array+limit;
    for (; count > 0; --count) {
        cursor = array;
        do {
            *cursor = fill;
            ++cursor;
        } while (cursor < end);
    }

    elapsed = clock()-started;
    if (elapsed > 0) return elapsed;
    return 1;
}


/*
 * Parse the command line, allocate the test array and print the table of
 * access rates and bandwidths.
 *
 *   argv[1]  store size: a decimal number followed by exactly one of
 *            k, K (units of 1024 bytes) or m, M (units of 1048576 bytes)
 *   argv[2]  optional repetition multiplier, 1 .. LONG_MAX/1000 (default 1)
 *
 * With no arguments the syntax summary is printed and the program exits with
 * EXIT_SUCCESS.  Invalid arguments or an allocation failure are reported on
 * stderr and the program exits with EXIT_FAILURE.  Otherwise one line is
 * printed per test size, doubling from 256 longs up to the requested size,
 * and 0 is returned.
 */
int main (argc, argv)
int argc;
char *argv[];
{
    /* Start offsets of the eight strided sweeps: every multiple of 16 below
       128, in scrambled order. */
    static long offsets[8] = {112, 48, 80, 16, 96, 32, 64, 0};
    long *array, limit, count, repeat, i, k, n;
    int error = 0;
    unsigned char c1, c2;
    double a_read, a_write, b_read, b_write, scale, accesses;

    /* Give everything sscanf may fail to store a defined value, so that the
       unit selection and range checks below never read an indeterminate
       object. */
    count = 1;
    limit = 0;
    c1 = c2 = 0;

    if (argc <= 1) {
        printf("Syntax:  store [ nk | nK | nm | nM ] k\n");
        printf("    the store size to test, and the repetition count\n");
        exit(EXIT_SUCCESS);
    } else if (argc > 3 ||
            sscanf(argv[1],"%ld%c%c",&limit,&c1,&c2) != 2 ||
            (c1 != 'k' && c1 != 'K' && c1 != 'm' && c1 != 'M'))
        error = 1;      /* Need a number, one unit letter and nothing more */
    else if (argc == 3 &&
            (sscanf(argv[2],"%ld%c",&count,&c2) != 1 ||
                count < 1 || count > LONG_MAX/1000))
        error = 1;

    /* Reject sizes whose byte count, or whose repetition count for the
       smallest test size, would not fit in a long. */
    k = (c1 == 'm' || c1 == 'M' ? 1048576 : 1024);
    if (error || limit < 1 || limit > LONG_MAX/k ||
            limit*k/(256*sizeof(long)) > LONG_MAX/count) {
        fprintf(stderr,"Invalid arguments specified\n");
        exit(EXIT_FAILURE);
    }

    /* Convert to a count of longs, rounded down to a multiple of 128.  The
       allocation carries 128 spare longs because the strided kernels form
       pointers up to array[limit+112] before stepping down.  The size is
       computed in size_t so that large requests are not truncated. */
    limit = (limit*k/(128*sizeof(long)))*128;
    array = (long *)malloc((size_t)(limit+128)*sizeof(long));
    if (array == NULL) {
        fprintf(stderr,"Unable to get store\n");
        exit(EXIT_FAILURE);
    }

    printf("Testing the store performance in up to %ld %s\n",
        (long)(limit*sizeof(long)/k),(k > 10000 ? "MB" : "KB"));
    printf("Accesses in millions/sec, bandwidth in MB/sec\n\n");
    printf("               Accesses          Bandwidth\n");
    printf("             Read    Write     Read    Write\n");

    for (n = 256; n <= limit; n <<= 1) {
        /* Fewer repetitions for larger sizes, so that every row does about
           the same amount of work. */
        repeat = count*(limit/n);

        /* Every kernel call makes exactly repeat*n accesses.  That is less
           than count*limit whenever n does not divide limit, so the rates
           are scaled by the work actually done. */
        accesses = (double)repeat*(double)n;

        /* Strided kernels: millions of accesses per second. */
        scale = accesses*1.0e-6*(double)CLOCKS_PER_SEC;
        for (i = 0; i < limit; ++i) array[i] = 0;
        a_write = scale/write_a(array,offsets,n,repeat);
        a_read = scale/read_a(array,offsets,n,repeat);

        /* Sequential kernels: megabytes per second. */
        scale = accesses*sizeof(long)*1.0e-6*CLOCKS_PER_SEC;
        for (i = 0; i < limit; ++i) array[i] = 0;
        b_write = scale/write_b(array,n,repeat);
        b_read = scale/read_b(array,n,repeat);

        /* k is reused here as the byte size of the current row. */
        if ((k = n*sizeof(long)) > 1000000)
            printf("%4d MB:  ",(int)(k/1048576));
        else
            printf("%4d KB:  ",(int)(k/1024));
        printf("%7.2f  %7.2f  %7.2f  %7.2f\n",a_read,a_write,b_read,b_write);
    }

    free(array);
    return 0;
}