/*
 * Hand-scheduled PowerPC/AltiVec memory kernels: copy, scale, add and triad
 * over arrays of doubles.
 *
 * Every kernel shares the same structure:
 *   - an AltiVec data-stream touch (vec_dstt) is started on each input array
 *     and restarted periodically from inside the loop;
 *   - the loop handles UNROLL_A (8) doubles, i.e. 64 bytes, per iteration;
 *   - "dcbz" is issued on the two destination cache blocks before they are
 *     stored to;
 *   - the source/destination pointers are biased back by one double so that
 *     the last load/store of each iteration can use the update form
 *     (lfdu/stfdu) to advance the pointer by 64 bytes.
 *
 * Preconditions visible from the code (callers must guarantee them):
 *   - The loop always processes a full group of UNROLL_A elements, so when N
 *     is not a multiple of UNROLL_A it reads and writes up to the next
 *     multiple of 8 elements.
 *   - dcbz zeroes the whole cache block containing its effective address
 *     before the inputs of that iteration are loaded, so c must not overlap
 *     a or b.
 *   - NOTE(review): the 8/40 byte dcbz offsets look like they assume 32-byte
 *     cache blocks and a 32-byte aligned c -- confirm against the target CPU.
 */
#include "pitsTT6lib.h"

#define UNROLL_A 8
#define BREAKb 4
#define BREAKc 4
#define BREAKd 4

#define PARAM ((16 << 24) | (32 << 16) | 64)

/* Bit positions of the fields in a vec_dst*() control word. */
#define AV_SIZE 24
#define AV_COUNT 16
#define AV_STRIDE 0

/*
 * Control word used by every vec_dstt() call in this file:
 * block size 2, block count 255, block stride 32.
 * (Per the AltiVec data-stream encoding the size is in 16-byte vectors and
 * the stride in bytes -- see the AltiVec programming interface manual.)
 */
#define DST_CONTROL ((2 << AV_SIZE) | (255 << AV_COUNT) | (32 << AV_STRIDE))

#include "trace.h"

#if TRACE
/*
 * Start a trace labelled with `name`.
 *
 * The label is copied into a local, writable buffer before being handed to
 * startTrace().  The copy is bounded: labels longer than 15 characters are
 * truncated instead of overrunning the buffer.  `name` must be a valid
 * NUL-terminated string.
 */
void start_trace(char *name)
{
    char buf[16];
    unsigned int len;

    for (len = 0; len + 1 < sizeof buf && name[len] != '\0'; len++)
        buf[len] = name[len];
    buf[len] = '\0';

    startTrace(buf);
}

/* Stop the trace started by start_trace(). */
void stop_trace(void)
{
    stopTrace();
}
#else
/* Tracing disabled: both calls compile away to nothing. */
#define start_trace(name)
#define stop_trace()
#endif

/*
 * do_copy: c[i] = a[i].
 *
 * a       source array
 * b       unused (kept so all kernels share one signature)
 * c       destination array
 * scalar  unused
 * N       element count; see the file header for the multiple-of-8 caveat
 */
void do_copy(double *a, double *b, double *c, double scalar, int N)
{
    int i;
    register double *ai, *ci;
    register int k8 = 8;    /* dcbz offset of the first destination block  */
    register int k40 = 40;  /* dcbz offset of the second destination block */

    start_trace("copy");
    vec_dstt((__vector float *) &a[0], DST_CONTROL, 0);

    ai = &a[-1]; // bias backwards by 8 bytes; allows use of address-update below
    ci = &c[-1];

    for (i = 0; i < N; i += UNROLL_A) {
        register double t0, t1, t2, t3;

        /* Zero both destination blocks, then move 8 doubles in two groups of
           four.  The final lfdu/stfdu advance ai/ci by 64 bytes. */
        asm {
            dcbz    k8, ci
            dcbz    k40, ci
            lfd     t0, 8(ai)
            lfd     t1, 16(ai)
            lfd     t2, 24(ai)
            lfd     t3, 32(ai)
            stfd    t0, 8(ci)
            stfd    t1, 16(ci)
            stfd    t2, 24(ci)
            stfd    t3, 32(ci)
            lfd     t0, 40(ai)
            lfd     t1, 48(ai)
            lfd     t2, 56(ai)
            lfdu    t3, 64(ai)
            stfd    t0, 40(ci)
            stfd    t1, 48(ci)
            stfd    t2, 56(ci)
            stfdu   t3, 64(ci)
        }

        /* On the last iteration of every 256-element span, restart the data
           stream at a[i+128]. */
        if ((i & 255) == 256-UNROLL_A)
            vec_dstt((__vector float *) &a[i+128], DST_CONTROL, 0);
    }

    vec_dssall();   /* stop all data streams */
    stop_trace();
}

/*
 * do_scale: c[i] = a[i] * scalar.
 *
 * a       source array
 * b       unused (kept so all kernels share one signature)
 * c       destination array
 * scalar  multiplier; declared register because the asm block names it
 * N       element count; see the file header for the multiple-of-8 caveat
 */
void do_scale(double *a, double *b, double *c, register double scalar, int N)
{
    int i;
    register double *ai, *ci;
    register int k8 = 8;    /* dcbz offset of the first destination block  */
    register int k40 = 40;  /* dcbz offset of the second destination block */

    start_trace("scale");
    vec_dstt((__vector float *) &a[0], DST_CONTROL, 0);

    ai = &a[-1]; // bias backwards by 8 bytes; allows use of address-update below
    ci = &c[-1];

    for (i = 0; i < N; i += UNROLL_A) {
        register double t0, t1, t2, t3;

        /* Loads, multiplies and stores are interleaved by hand; the final
           lfdu/stfdu advance ai/ci by 64 bytes. */
        asm {
            dcbz    k8, ci
            dcbz    k40, ci
            lfd     t0, 8(ai)
            lfd     t1, 16(ai)
            fmul    t0, t0, scalar
            lfd     t2, 24(ai)
            lfd     t3, 32(ai)
            fmul    t1, t1, scalar
            stfd    t0, 8(ci)
            fmul    t2, t2, scalar
            stfd    t1, 16(ci)
            fmul    t3, t3, scalar
            stfd    t2, 24(ci)
            stfd    t3, 32(ci)
            lfd     t0, 40(ai)
            lfd     t1, 48(ai)
            fmul    t0, t0, scalar
            lfd     t2, 56(ai)
            lfdu    t3, 64(ai)
            fmul    t1, t1, scalar
            stfd    t0, 40(ci)
            fmul    t2, t2, scalar
            stfd    t1, 48(ci)
            fmul    t3, t3, scalar
            stfd    t2, 56(ci)
            stfdu   t3, 64(ci)
        }

        /* On the last iteration of every 256-element span, restart the data
           stream at a[i+128]. */
        if ((i & 255) == 256-UNROLL_A)
            vec_dstt((__vector float *) &a[i+128], DST_CONTROL, 0);
    }

    vec_dssall();   /* stop all data streams */
    stop_trace();
}

/*
 * do_add: c[i] = a[i] + b[i].
 *
 * a, b    source arrays
 * c       destination array
 * scalar  unused (kept so all kernels share one signature)
 * N       element count; see the file header for the multiple-of-8 caveat
 */
void do_add(double *a, double *b, double *c, double scalar, int N)
{
    int i;
    register double *ai, *bi, *ci;
    register int k8 = 8;    /* dcbz offset of the first destination block  */
    register int k40 = 40;  /* dcbz offset of the second destination block */

    start_trace("add");
    /* One data stream per input array: stream 0 for a, stream 1 for b. */
    vec_dstt((__vector float *) &a[0], DST_CONTROL, 0);
    vec_dstt((__vector float *) &b[0], DST_CONTROL, 1);

    ai = &a[-1]; // bias backwards by 8 bytes; allows use of address-update below
    bi = &b[-1];
    ci = &c[-1];

    for (i = 0; i < N; i += UNROLL_A) {
        register double t0, t1, t2, t3, u0, u1, u2, u3;

        /* t* hold elements of a, u* hold elements of b.  The final
           lfdu/stfdu advance ai/bi/ci by 64 bytes. */
        asm {
            dcbz    k8, ci
            dcbz    k40, ci
            lfd     t0, 8(ai)
            lfd     t1, 16(ai)
            lfd     u0, 8(bi)
            lfd     u1, 16(bi)
            fadd    t0, t0, u0
            lfd     t2, 24(ai)
            lfd     t3, 32(ai)
            fadd    t1, t1, u1
            lfd     u2, 24(bi)
            lfd     u3, 32(bi)
            stfd    t0, 8(ci)
            fadd    t2, t2, u2
            stfd    t1, 16(ci)
            fadd    t3, t3, u3
            stfd    t2, 24(ci)
            stfd    t3, 32(ci)
            lfd     t0, 40(ai)
            lfd     t1, 48(ai)
            lfd     u0, 40(bi)
            lfd     u1, 48(bi)
            fadd    t0, t0, u0
            lfd     t2, 56(ai)
            lfdu    t3, 64(ai)
            fadd    t1, t1, u1
            lfd     u2, 56(bi)
            lfdu    u3, 64(bi)
            stfd    t0, 40(ci)
            fadd    t2, t2, u2
            stfd    t1, 48(ci)
            fadd    t3, t3, u3
            stfd    t2, 56(ci)
            stfdu   t3, 64(ci)
        }

        /* On the last iteration of every 256-element span, restart both data
           streams at element i+128 of their arrays. */
        if ((i & 255) == 256-UNROLL_A) {
            vec_dstt((__vector float *) &a[i+128], DST_CONTROL, 0);
            vec_dstt((__vector float *) &b[i+128], DST_CONTROL, 1);
        }
    }

    vec_dssall();   /* stop all data streams */
    stop_trace();
}

/*
 * do_triad: c[i] = scalar * a[i] + b[i], computed with one fused
 * multiply-add (fmadd) per element.
 *
 * a, b    source arrays
 * c       destination array
 * scalar  multiplier; declared register because the asm block names it
 * N       element count; see the file header for the multiple-of-8 caveat
 */
void do_triad(double *a, double *b, double *c, register double scalar, int N)
{
    int i;
    register double *ai, *bi, *ci;
    register int k8 = 8;    /* dcbz offset of the first destination block  */
    register int k40 = 40;  /* dcbz offset of the second destination block */

    start_trace("triad");
    /* One data stream per input array: stream 0 for a, stream 1 for b. */
    vec_dstt((__vector float *) &a[0], DST_CONTROL, 0);
    vec_dstt((__vector float *) &b[0], DST_CONTROL, 1);

    ai = &a[-1]; // bias backwards by 8 bytes; allows use of address-update below
    bi = &b[-1];
    ci = &c[-1];

    for (i = 0; i < N; i += UNROLL_A) {
        register double t0, t1, t2, t3, u0, u1, u2, u3;

        /* t* hold elements of a, u* hold elements of b;
           "fmadd t, scalar, t, u" yields t = scalar * t + u.
           The final lfdu/stfdu advance ai/bi/ci by 64 bytes. */
        asm {
            dcbz    k8, ci
            dcbz    k40, ci
            lfd     t0, 8(ai)
            lfd     t1, 16(ai)
            lfd     u0, 8(bi)
            lfd     u1, 16(bi)
            fmadd   t0, scalar, t0, u0
            lfd     t2, 24(ai)
            lfd     t3, 32(ai)
            fmadd   t1, scalar, t1, u1
            lfd     u2, 24(bi)
            lfd     u3, 32(bi)
            stfd    t0, 8(ci)
            fmadd   t2, scalar, t2, u2
            stfd    t1, 16(ci)
            fmadd   t3, scalar, t3, u3
            stfd    t2, 24(ci)
            stfd    t3, 32(ci)
            lfd     t0, 40(ai)
            lfd     t1, 48(ai)
            lfd     u0, 40(bi)
            lfd     u1, 48(bi)
            fmadd   t0, scalar, t0, u0
            lfd     t2, 56(ai)
            lfdu    t3, 64(ai)
            fmadd   t1, scalar, t1, u1
            lfd     u2, 56(bi)
            lfdu    u3, 64(bi)
            stfd    t0, 40(ci)
            fmadd   t2, scalar, t2, u2
            stfd    t1, 48(ci)
            fmadd   t3, scalar, t3, u3
            stfd    t2, 56(ci)
            stfdu   t3, 64(ci)
        }

        /* On the last iteration of every 256-element span, restart both data
           streams at element i+128 of their arrays. */
        if ((i & 255) == 256-UNROLL_A) {
            vec_dstt((__vector float *) &a[i+128], DST_CONTROL, 0);
            vec_dstt((__vector float *) &b[i+128], DST_CONTROL, 1);
        }
    }

    vec_dssall();   /* stop all data streams */
    stop_trace();
}