/**
* Parallel Code Count
* parallel.h
* Parallel (OpenMP) code count implementation.
*/
#pragma once
#include <memory>
#include <omp.h>
#include "reference.h"
/**
 * Functor that counts the characters of a string with an OpenMP thread team.
 *
 * The work is done in two phases:
 *   1) every thread counts its share of the text into a private slot,
 *   2) the private slots are summed sequentially into the result.
 * Phase 2 is kept sequential on purpose: the slots are small, so a parallel
 * reduction would mostly suffer from false sharing (SIMD could still speed up
 * the summation up to the memory read/write limit).
 */
struct ParallelCountCodes
{
    /**
     * Counts the characters of `text`.
     *
     * @param text    input string; each character is read as an unsigned char
     *                and used as an index into the counters.
     * @param amount  output array of COUNTERS ints. Its previous contents are
     *                overwritten, so the caller does not have to zero-fill it.
     *                COUNTERS is assumed to cover every unsigned char value
     *                (it is defined outside this file - confirm there).
     */
    void operator()(const std::string &text, int amount[COUNTERS]) const
    {
        // One slot per thread; the padding keeps the counters of different
        // threads from sitting next to each other in memory.
        struct Amount
        {
            int amount[COUNTERS];
            char padding[128];
        };

        // omp_get_max_threads() is only an upper bound: the team that actually
        // runs may be smaller. Value-initialise ("()") the slots so that a slot
        // whose thread never runs contributes zeroes instead of garbage.
        const int threads = omp_get_max_threads();
        std::unique_ptr<Amount[]> amounts(new Amount[threads]());

        const std::ptrdiff_t size = static_cast<std::ptrdiff_t>(text.size());

        // phase 1: parallel
        // num_threads(threads) guarantees the team never exceeds the slot count.
        #pragma omp parallel num_threads(threads)
        {
            int * const myAmount = amounts[omp_get_thread_num()].amount;

            // count this thread's part of the text
            #pragma omp for
            for (std::ptrdiff_t i = 0; i < size; ++i)
                myAmount[static_cast<unsigned char>(text[i])]++;
        }

        // phase 2: sequential
        // copy the first slot (this is what overwrites the caller's array)
        std::memcpy(amount, amounts[0].amount, sizeof(int) * COUNTERS);
        // add the rest
        for (int t = 1; t < threads; ++t)
        {
            const int * const added = amounts[t].amount;
            for (int c = 0; c < COUNTERS; ++c)
                amount[c] += added[c];
        }
    }
};