/* Author: David Tarjan; * Description: This file defines an improved hashed percepetron predictor. */ #ifndef PREDICTOR_H_SEEN #define PREDICTOR_H_SEEN #include #include #include #include "op_state.h" // defines op_state_c (architectural state) class #include "tread.h" // defines branch_record_c class class PREDICTOR { public: typedef uint32_t address_t; private: typedef uint32_t history_t; typedef uint64_t lhist_t; typedef int8_t counter_t; static const std::size_t PHT_SIZE = 1024; static const std::size_t PHT_SIZE_LOG = 10; static const std::size_t PHT_INDEX_MASK = (PHT_SIZE - 1); static const std::size_t PHT_INDEX_MASK_LARGE = (int)(2*PHT_SIZE - 1); static const std::size_t PHT_INDEX_MASK_LARGER = (int)(4*PHT_SIZE - 1); static const std::size_t PHT_INDEX_MASK_SMALL = (int)(0.5*PHT_SIZE - 1); static const std::size_t PHT_INDEX_MASK_SMALLER = (int)(0.25*PHT_SIZE - 1); static const counter_t PHT_INIT = /* weakly taken */ 0; static const std::size_t PURE_PATH = 3; static const std::size_t NR_TABLES = 10; static const std::size_t BITWIDTH = 4; static const signed int MAX_COUNTER = 15; static const signed int MAX_COUNTER_TAIL = 7; static const signed int THRESFIT = 64; static const signed int MAX_THETA = 4*NR_TABLES - 1; static const signed int MIN_THETA = NR_TABLES; static const signed int MAX_UPDOWN = 256; static const signed int MAX_START_WEIGHT = (int)(NR_TABLES*0.25); static const signed int TOTAL_SIZE = /*note that when we talk about weights, we talk about unsigned numbers, since the sign bits are stored separately*/ /*bias weights, array needs to be large, especially for SERV and MM*/(int)(BITWIDTH*1.875*PHT_SIZE*1) + /*bias sign bits, high ratio because lots of branches are highly biased*/(int)(1*7.5*PHT_SIZE*1) + /*2 tables with path indexed weights*/BITWIDTH*PHT_SIZE*2 + /*the first table is highly dealiased (4 sign bits for each weight) because it has the most recent path history*/1*4*PHT_SIZE*1 + /*the second table is normally dealiased (2 sign bits per weight)*/1*2*PHT_SIZE*1 + /*4 normal tables with gshare indexed weights*/BITWIDTH*PHT_SIZE*4 + /*the first table is again highly dealiased (4 sign bits) because it has the most recent global history*/1*4*PHT_SIZE*1 + /*the other 3 are normally dealiased (2 sign bits)*/1*2*PHT_SIZE*3 + /*the last 3 tables (all gshare indexed) are half the normal size, because few branches need old history, but INT needs it*/(int)(BITWIDTH*0.5*PHT_SIZE*3) + /*all 3 are normally dealiased (2 sign bits)*/1*1*PHT_SIZE*3; int weight_distr_table[NR_TABLES + 10][NR_TABLES + 10]; int total_cc_branches; int total_updates; int total_out; int total_theta; int total_sweight; int TC; int THETA; int START_WEIGHT; int updowncounter; std::vector pht,upper_pht,direction_pht; std::vector spec_past_addr; history_t bhr[8]; void update_bhr(bool taken) { int i; for (i = 7; i > 0; i--) { bhr[i] = (bhr[i] << 1) + !!(bhr[i-1] & (1<<31))/*(bhr[i - 1] < 0)*/; } bhr[0] = bhr[0] << 1; if (taken) bhr[0] |= 1; } history_t constr_hist_vector(unsigned int c) { int i,exact, diff; history_t upper, lower, result; i = exact = 0; diff = c; if(diff == 0) { return bhr[0]; } while(diff>=32) { diff -= 32; i++; } if(diff==0) { return bhr[i]; } else { upper = bhr[i+1] & ((1<> diff; result = (upper|lower); return result; } } static std::size_t pht_index(address_t pc, history_t bhr) { return (static_cast(pc ^(bhr << 1)) & PHT_INDEX_MASK); } static std::size_t pht_index_large(address_t pc, history_t bhr) { return (static_cast(pc ^(bhr << 1)) & PHT_INDEX_MASK_LARGE); } static std::size_t pht_index_larger(address_t pc, history_t bhr) { return (static_cast(pc ^(bhr << 1)) & PHT_INDEX_MASK_LARGER); } static std::size_t pht_index_small(address_t pc, history_t bhr) { return (static_cast(pc ^(bhr << 1)) & PHT_INDEX_MASK_SMALL); } static std::size_t pht_index_smaller(address_t pc, history_t bhr) { return (static_cast(pc ^(bhr << 1)) & PHT_INDEX_MASK_SMALLER); } static bool counter_msb(/* 2-bit counter */ counter_t cnt) { return (cnt >= 2); } static counter_t counter_inc(/* 8-bit counter */ counter_t cnt) { if (cnt < MAX_COUNTER) cnt++; return cnt; } static counter_t counter_dec(/* 8-bit counter */ counter_t cnt) { if (cnt > (-MAX_COUNTER + 1)) cnt--; return cnt; } static counter_t counter_inc_orig(/* 2-bit counter */ counter_t cnt) { if (cnt < 4) cnt++; return cnt; } static counter_t counter_dec_orig(/* 2-bit counter */ counter_t cnt) { if (cnt > 0) cnt--; return cnt; } static counter_t counter_inc_tail(/* 8-bit counter */ counter_t cnt) { if (cnt < MAX_COUNTER_TAIL) cnt++; return cnt; } static counter_t counter_dec_tail(/* 8-bit counter */ counter_t cnt) { if (cnt > (-MAX_COUNTER_TAIL + 1)) cnt--; return cnt; } void update_path(address_t pc) { int i; for(i=20;i>0;i--) { spec_past_addr[i] = spec_past_addr[i-1]; } spec_past_addr[0] = pc; } void classify_weight(counter_t w,int index) { int j,mask; int bits = abs(w); j=0;mask=1; while((bits > mask) && (j < 8)){ j++; mask=2*mask + 1; } weight_distr_table[j][index]++; } public: PREDICTOR(void) : pht((NR_TABLES+2)*PHT_SIZE*16 , counter_t(PHT_INIT)),upper_pht((NR_TABLES+2)*PHT_SIZE*32 , counter_t(PHT_INIT)), direction_pht((NR_TABLES+2)*PHT_SIZE*128 , counter_t(PHT_INIT)),spec_past_addr(25, int32_t(0)) { unsigned int i,j; for(i=0;i<(NR_TABLES + 10);i++) { for(j=0;j<(NR_TABLES + 10);j++) { weight_distr_table[i][j] = 0; } } total_cc_branches = 0; total_updates = 0; total_out = 0; total_theta = 0; THETA = (int)(1.3*NR_TABLES); START_WEIGHT = (int)(-1*NR_TABLES); updowncounter = 0; total_sweight = 0; printf("totbal bits = %d\n",TOTAL_SIZE); } ~PREDICTOR(void) { unsigned int i,j; printf("avg out is %f and avg theta is %f and avg sweight is %f\n",float(total_out)/(float(total_cc_branches)),float(total_theta)/(float(total_cc_branches)),float(total_sweight)/(float(total_cc_branches))); printf("Weight distribution follows, in ascending order of magnitude for both weights and tables \n"); for(i=0;iis_conditional) { address_t pc = br->instruction_addr; std::size_t index = pht_index(pc,0); counter_t cnt = pht[index]; prediction = counter_msb(cnt); } return prediction; // true for taken, false for not taken } bool get_prediction(branch_record_c* br, const op_state_c* os) { bool prediction = false; if (/* conditional branch */ br->is_conditional) { address_t pc = br->instruction_addr; int path_bits; int y = START_WEIGHT; history_t gshare_bits; unsigned int i,j; std::size_t path_addr, path_addr_large; //std::size_t path_addr_small; total_cc_branches++; /* leave recent path alone because it's the most important information for the branch prediction */ path_addr = pc % (int)(1.875*PHT_SIZE); path_addr_large = pc % (int)(7.5*PHT_SIZE); y += 2*direction_pht[path_addr_large + 16*0*PHT_SIZE]*pht[path_addr + 8*0*PHT_SIZE]; for (i = 1, j = 0; i < PURE_PATH; i++,j+=4) { path_bits = pc^((spec_past_addr[j]^(spec_past_addr[j+1] << 1))^(spec_past_addr[j+2] << 2))^(spec_past_addr[j+3] << 3); path_bits = (path_bits >> PHT_SIZE_LOG) ^ path_bits; path_addr = pht_index(path_bits,0); if(i == 1) { path_addr_large = pht_index_larger(path_bits,0); y +=(int) 2*direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } else { path_addr_large = pht_index_large(path_bits,0); y +=(int) direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } } /* XOR global history with branch address, which gives better results than branches leading up to the branch */ for (i = PURE_PATH, j=0; i < NR_TABLES - 3; i++,j+=PHT_SIZE_LOG) { gshare_bits = constr_hist_vector(j); path_addr = pht_index(gshare_bits,pc); if(i == PURE_PATH) { path_addr_large = pht_index_larger(gshare_bits,pc); y +=(int) 2*direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } else { path_addr_large = pht_index_large(gshare_bits,pc); y +=(int) direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } } for (i = NR_TABLES - 3,j+=PHT_SIZE_LOG - 1; i < NR_TABLES; i++,j+=PHT_SIZE_LOG-1) { gshare_bits = constr_hist_vector(j); path_addr = pht_index_small(gshare_bits,pc); path_addr_large = pht_index(gshare_bits,pc); y +=(int) direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } br->perc_out = y; total_out += abs(y); total_theta += THETA; total_sweight +=START_WEIGHT; prediction = y >=0; } return prediction; // true for taken, false for not taken } // Update the predictor after a prediction has been made. This should accept // the branch record (br) and architectural state (os), as well as a third // argument (taken) indicating whether or not the branch was taken. void update_predictor_orig(const branch_record_c* br, const op_state_c* os, bool taken) { if (/* conditional branch */ br->is_conditional) { address_t pc = br->instruction_addr; std::size_t index = pht_index(pc, 0); counter_t cnt = pht[index]; if (taken) cnt = counter_inc(cnt); else cnt = counter_dec(cnt); pht[index] = cnt; update_bhr(taken); } } void update_predictor(const branch_record_c* br, const op_state_c* os, bool taken) { if (/* conditional branch */ br->is_conditional) { address_t pc = br->instruction_addr; int path_bits, gshare_bits; int y; unsigned int i,j; counter_t weight; bool pred_taken, correct, strong; int dir = 0; std::size_t path_addr, path_addr_large; y = br->perc_out; if(y >= 0) { pred_taken = true; } else { pred_taken = false; } if(abs(y) >= THETA) { strong = true; } else { strong = false; } if(pred_taken == taken) { correct = true; } else { correct = false; } /* leave recent path alone because it's the most important information for the branch prediction */ if(taken && ((correct && !strong) || !correct)) { dir = 1; } else if(!taken && ((correct && !strong) || !correct)) { dir = -1; } if(dir != 0) { total_updates++; path_addr = pc % (int)(1.875*PHT_SIZE); path_addr_large = pc % (int)(7.5*PHT_SIZE); weight = direction_pht[path_addr_large + 0*16*PHT_SIZE]*pht[path_addr + 0*8*PHT_SIZE]; if (dir == 1) { weight = counter_inc(weight); } else { weight = counter_dec(weight); } pht[path_addr + 0*8*PHT_SIZE] = abs(weight); direction_pht[path_addr_large + 0*16*PHT_SIZE] = (weight>-1)?1:-1; for (i = 1, j = 0; i < PURE_PATH; i++,j+=4) { path_bits = pc^((spec_past_addr[j]^(spec_past_addr[j+1] << 1))^(spec_past_addr[j+2] << 2))^(spec_past_addr[j+3] << 3); path_bits = (path_bits >> PHT_SIZE_LOG) ^ path_bits; path_addr = pht_index(path_bits,0); if(i == 1) { path_addr_large = pht_index_larger(path_bits,0); weight = direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } else { path_addr_large = pht_index_large(path_bits,0); weight = direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } if (dir == 1) { weight = counter_inc(weight); } else { weight = counter_dec(weight); } pht[path_addr + i*8*PHT_SIZE] = abs(weight); direction_pht[path_addr_large + i*16*PHT_SIZE] = (weight>-1)?1:-1; } /* XOR older path information with all the history information */ for (i = PURE_PATH,j=0; i < NR_TABLES - 3; i++,j+=PHT_SIZE_LOG) { gshare_bits = constr_hist_vector(j); path_addr = pht_index(gshare_bits,pc); if(i == PURE_PATH) { path_addr_large = pht_index_larger(gshare_bits,pc); weight = direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } else { path_addr_large = pht_index_large(gshare_bits,pc); weight = direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; } if (dir == 1) { weight = counter_inc(weight); } else { weight = counter_dec(weight); } pht[path_addr + i*8*PHT_SIZE] = abs(weight); direction_pht[path_addr_large + i*16*PHT_SIZE] = (weight>-1)?1:-1; } for (i = NR_TABLES - 3,j+=PHT_SIZE_LOG-1 ; i < NR_TABLES; i++,j+=PHT_SIZE_LOG-1) { gshare_bits = constr_hist_vector(j); path_addr = pht_index_small(gshare_bits,pc); path_addr_large = pht_index(gshare_bits,pc); weight = direction_pht[path_addr_large + i*16*PHT_SIZE]*pht[path_addr + i*8*PHT_SIZE]; if (dir == 1) { weight = counter_inc(weight); } else { weight = counter_dec(weight); } pht[path_addr + i*8*PHT_SIZE] = abs(weight); direction_pht[path_addr_large + i*16*PHT_SIZE] = (weight>-1)?1:-1; } } update_path(pc); update_bhr(taken); } } }; #endif // PREDICTOR_H_SEEN