/// Reverses the bit order of every 32-bit word in [begin, begin + count),
/// in place, using the "halves" method vectorised with SSE2 intrinsics.
///
/// Words in front of the first 16-byte boundary and the leftover tail are
/// processed one at a time through rbH5() (defined elsewhere; presumably the
/// scalar counterpart of the vector path -- confirm). Everything in between
/// goes through aligned SSE2 loads/stores, eight words per iteration.
///
/// @param begin  first word to process; the alignment is computed in whole
///               words, so the pointer must be at least 4-byte aligned for
///               the aligned loads/stores below to be valid
/// @param count  number of words to process
void prb1 (unsigned *begin, unsigned count)
{
    // Convert the pointer through an integer type wide enough for 64-bit
    // addresses; casting straight to `unsigned` is ill-formed there.
    const unsigned long long address = reinterpret_cast<unsigned long long>(begin);

    // Number of leading words before the next 16-byte boundary (0..3).
    // The trailing "& 3" makes an already aligned buffer skip the scalar
    // prologue instead of pushing four words through it.
    unsigned alignx = (4u - static_cast<unsigned>((address >> 2) & 3u)) & 3u;
    if (count < alignx)
        alignx = count;

    // scalar prologue: advance to the 16-byte boundary
    for (unsigned i = alignx; i; --i, ++begin)
        *begin = rbH5 (*begin);

    if ((count -= alignx) != 0)
    {
        // masks: "a" selects the low half of each bit group, "b" the high half
        const __m128i ma0 = _mm_set1_epi32 (0x55555555);
        const __m128i mb0 = _mm_set1_epi32 (static_cast<int>(0xAAAAAAAAu));
        const __m128i ma1 = _mm_set1_epi32 (0x33333333);
        const __m128i mb1 = _mm_set1_epi32 (static_cast<int>(0xCCCCCCCCu));
        const __m128i ma2 = _mm_set1_epi32 (0x0F0F0F0F);
        const __m128i mb2 = _mm_set1_epi32 (static_cast<int>(0xF0F0F0F0u));
        const __m128i ma3 = _mm_set1_epi32 (0x00FF00FF);
        const __m128i mb3 = _mm_set1_epi32 (static_cast<int>(0xFF00FF00u));

        // main loop: two vectors (8 words) per iteration, interleaved so the
        // two dependency chains can overlap
        for (; count > 7; begin += 8, count -= 8)
        {
            // prefetch is only a hint for data well ahead of the cursor
            _mm_prefetch (reinterpret_cast<const char*>(begin + 96), _MM_HINT_NTA);

            __m128i a = _mm_load_si128 (reinterpret_cast<const __m128i*>(begin));
            __m128i b = _mm_load_si128 (reinterpret_cast<const __m128i*>(begin + 4));

            // swap adjacent bits
            a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (a, 1), ma0),
                              _mm_and_si128 (_mm_slli_epi32 (a, 1), mb0));
            b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (b, 1), ma0),
                              _mm_and_si128 (_mm_slli_epi32 (b, 1), mb0));

            // swap adjacent bit pairs
            a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (a, 2), ma1),
                              _mm_and_si128 (_mm_slli_epi32 (a, 2), mb1));
            b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (b, 2), ma1),
                              _mm_and_si128 (_mm_slli_epi32 (b, 2), mb1));

            // swap adjacent nibbles
            a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (a, 4), ma2),
                              _mm_and_si128 (_mm_slli_epi32 (a, 4), mb2));
            b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (b, 4), ma2),
                              _mm_and_si128 (_mm_slli_epi32 (b, 4), mb2));

            // swap adjacent bytes
            a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (a, 8), ma3),
                              _mm_and_si128 (_mm_slli_epi32 (a, 8), mb3));
            b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32 (b, 8), ma3),
                              _mm_and_si128 (_mm_slli_epi32 (b, 8), mb3));

            // swap the 16-bit halves of every word (0xB1 exchanges neighbouring
            // 16-bit lanes) and write the result back
            _mm_store_si128 (reinterpret_cast<__m128i*>(begin),
                             _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (a, 0xB1), 0xB1));
            _mm_store_si128 (reinterpret_cast<__m128i*>(begin + 4),
                             _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (b, 0xB1), 0xB1));
        }

        // scalar epilogue: at most 7 words remain
        while (count--)
        {
            *begin = rbH5 (*begin);
            ++begin;
        }
    }
}