typical memory access — less than cache block
what if two processors modify different parts same cache block?
typically how caches work — write instructions happen one at a time:
void *sum_up(void *raw_dest) {
int *dest = (int *) raw_dest;
for (int i = 0; i < 64 * 1024 * 1024; ++i) {
*dest += data[i];
}
}
__attribute__((aligned(4096)))
int dests[1024]; /* aligned = address is mult. of 4096 */
void sum_twice(int distance) {
pthread_t threads[2];
pthread_create(&threads[0], NULL, sum_up, &dests[0]);
pthread_create(&threads[1], NULL, sum_up, &dests[distance]);
pthread_join(threads[0], NULL);
pthread_join(threads[1], NULL);
}(assuming sum_up compiled to not omit memory accesses)
int values[1024]; int results[2];
void *sum_front(void *ignored_argument) {
results[0] = 0;
for (int i = 0; i < 512; ++i)
results[0] += values[i];
return NULL;
}
void *sum_back(void *ignored_argument) {
results[1] = 0;
for (int i = 512; i < 1024; ++i)
results[1] += values[i];
return NULL;
}
int sum_all() {
pthread_t sum_front_thread, sum_back_thread;
pthread_create(&sum_front_thread, NULL, sum_front, NULL);
pthread_create(&sum_back_thread, NULL, sum_back, NULL);
pthread_join(sum_front_thread, NULL);
pthread_join(sum_back_thread, NULL);
return results[0] + results[1];
}Where is false sharing likely to occur? How to fix?
struct ThreadInfo { int *values; int start; int end; int result };
void *sum_thread(void *argument) {
ThreadInfo *my_info = (ThreadInfo *) argument;
int sum = 0;
for (int i = my_info->start; i < my_info->end; ++i) {
my_info->result += my_info->values[i];
}
return NULL;
}
int sum_all(int *values) {
ThreadInfo info[2]; pthread_t thread[2];
for (int i = 0; i < 2; ++i) {
info[i].values = values; info[i].start = i*512; info[i].end = (i+1)*512;
pthread_create(&threads[i], NULL, sum_thread, (void *) &info[i]);
}
for (int i = 0; i < 2; ++i)
pthread_join(threads[i], NULL);
return info[0].result + info[1].result;
}Where is false sharing likely to occur now?