// bucket sort 3/p
// (based on variant 1) -- no merge, each thread has its own buckets
template <class RanIt>
void parBucketSort3 (RanIt begin, RanIt end)
{
using namespace Detail::BucketSort;
typedef typename std::iterator_traits <RanIt>::value_type T;
const Index sz = end - begin;
if (sz < 2) return;
// find min and max
T minv, maxv;
parallel_min_and_max (begin, end, minv, maxv);
if (minv == maxv) return;
// prepare buckets
typedef std::vector <T> Bucket;
typedef std::vector <Bucket> Buckets;
const int threads = omp_get_max_threads();
const Index bsz =
std::max (static_cast <Index> (threads),
static_cast <Index> ((sz / BucketSize <T>::value / threads) * threads));
const T inv = T (bsz - 1) / (maxv - minv);
const Index groups = bsz / threads;
Buckets B (bsz);
#pragma omp parallel for
for (int tid = 0; tid < threads; ++tid)
{
const Index base = tid * groups;
for (Index b = base, bend = base + groups; b < bend; ++b)
B[b].reserve (BucketSize <T>::value + BucketSize <T>::value / 2);
// scatter
for (RanIt p = begin; p != end; ++p)
{
const Index index = static_cast <Index> ((*p - minv) * inv);
if (base <= index && (index - groups) < base)
B[index].push_back (*p);
}
// sort
for (Index b = base, bend = base + groups; b < bend; ++b)
std::sort (B[b].begin(), B[b].end());
}
// calculate offsets
std::unique_ptr <Index[]> offset (new Index[bsz + 1]);
offset[0] = 0;
for (Index i = 1; i <= bsz; ++i)
offset[i] = B[i - 1].size() + offset[i - 1];
// gather
#pragma omp parallel for
for (Index b = 0; b < bsz; ++b)
std::copy (B[b].begin(), B[b].end(), begin + offset[b]);
}