// bucket sort 4
// (based on variant 2) -- no merge, each thread has its own buckets
template <class RanIt>
void parBucketSort4 (RanIt begin, RanIt end)
{
using namespace Detail::BucketSort;
typedef typename std::iterator_traits <RanIt>::value_type T;
const Index sz = end - begin;
if (sz < 2) return; // 0 or 1 element: already sorted
// find global min and max to map values onto bucket indices
T minv, maxv;
parallel_min_and_max (begin, end, minv, maxv);
if (minv == maxv) return; // all elements equal: already sorted
// Bucket count is a multiple of the thread count, so every thread owns an
// equal, contiguous group of buckets and no merge step is needed.
const int threads = omp_get_max_threads();
const Index bsz =
std::max (static_cast <Index> (threads),
static_cast <Index> ((sz / BucketSize <T>::value / threads) * threads));
// Scale factor mapping [minv, maxv] onto bucket indices [0, bsz-1].
// NOTE(review): for integral T this division can truncate to 0, which puts
// everything in bucket 0 (still correct, but serial) -- presumably
// BucketSize<T> is tuned so the range stays fine-grained; confirm.
const T inv = T (bsz - 1) / (maxv - minv);
const Index groups = bsz / threads; // buckets per thread; bsz % threads == 0 by construction
std::unique_ptr <T[]> B (new T[sz]); // temporary store
// offset[i+1] corresponds to end of the i-th bucket
// pos[i] corresponds current writing position (size) of i-th bucket
std::unique_ptr <Index[]> offset (new Index[bsz + 1]),
pos (new Index[bsz]);
offset[0] = 0;
// Pass 1: each thread scans the WHOLE input but counts only elements that
// fall into its own bucket group [base, bend). Each thread touches a
// disjoint slice of offset[], so the unsynchronized increments are safe.
#pragma omp parallel for
for (int tid = 0; tid < threads; ++tid)
{
const Index base = tid * groups, bend = base + groups;
std::fill (offset.get() + base + 1, offset.get() + bend + 1, 0);
// calculate sizes
for (RanIt p = begin; p != end; ++p)
{
const Index index = static_cast <Index> ((*p - minv) * inv);
// FIX: was "base <= index && (index - groups) < base", which wraps
// for index < groups if Index is an unsigned type, silently dropping
// elements from thread 0's low buckets; "index < bend" is the
// algebraically equivalent, underflow-free form.
if (base <= index && index < bend)
offset[index + 1]++;
}
}
// Serial prefix sum: turn per-bucket counts into [start, end) offsets and
// seed each bucket's write cursor (pos) at its start.
for (Index i = 0; i < bsz; ++i)
{
pos[i] = offset[i];
offset[i + 1] += offset[i];
}
// Pass 2: each thread rescans the input, scatters its own buckets' elements
// into B (again via disjoint pos/offset slices), then sorts each bucket.
#pragma omp parallel for
for (int tid = 0; tid < threads; ++tid)
{
const Index base = tid * groups, bend = base + groups;
// scatter
for (RanIt p = begin; p != end; ++p)
{
const Index index = static_cast <Index> ((*p - minv) * inv);
if (base <= index && index < bend) // same underflow-free test as pass 1
B[pos[index]++] = *p;
}
// sort
for (Index b = base; b < bend; ++b)
std::sort (B.get() + offset[b], B.get() + offset[b + 1]);
}
// copy data back, it seems today we have no profit from multithreaded copy, so
std::copy (B.get(), B.get() + sz, begin); // just memcpy (rep movs) behind
}