Link: http://codepad.org/wNL3ir9t

Evetro - C++, pasted on Jul 11:
#include <cstdint>      // std::uintptr_t
#include <emmintrin.h>  // SSE2 integer intrinsics

unsigned rbH5 (unsigned x);  // scalar halves-method bit reversal of one word (not shown here; a sketch follows the paste)

/// halves method with SSE2 intrinsics:
/// reverses the bits of every 32-bit word in [begin, begin + count)
void prb1 (unsigned *begin, unsigned count)
{
  // handle leading words one at a time until begin is 16-byte aligned
  // (begin itself is assumed to be at least 4-byte aligned)
  unsigned alignx = (4 - ((reinterpret_cast<std::uintptr_t>(begin) >> 2) & 3)) & 3;
  if (count < alignx)
    alignx = count;

  for (unsigned i = alignx; i; --i, ++begin)
    *begin = rbH5 (*begin);

  if ((count -= alignx) != 0)
  {
    // bit-selection masks for the four shift-and-mask stages
    const __m128i ma0 = _mm_set_epi32 (0x55555555, 0x55555555, 0x55555555, 0x55555555)
                , mb0 = _mm_set_epi32 (0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA)
                , ma1 = _mm_set_epi32 (0x33333333, 0x33333333, 0x33333333, 0x33333333)
                , mb1 = _mm_set_epi32 (0xCCCCCCCC, 0xCCCCCCCC, 0xCCCCCCCC, 0xCCCCCCCC)
                , ma2 = _mm_set_epi32 (0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F)
                , mb2 = _mm_set_epi32 (0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0)
                , ma3 = _mm_set_epi32 (0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF)
                , mb3 = _mm_set_epi32 (0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00)
                ; // these constants occupy 8 of the 16 XMM registers

    // main loop: two aligned 16-byte blocks (8 words) per iteration
    for (; count > 7; begin += 8, count -= 8)
    {
      // prefetch 96 words (384 bytes) ahead with a non-temporal hint
      _mm_prefetch ((const char*)(begin + 96), _MM_HINT_NTA);

      __m128i a = _mm_load_si128 ((const __m128i*)(begin));

      // stage 1: swap adjacent bits
      a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 1), ma0)
                      , _mm_and_si128 (_mm_slli_epi32(a, 1), mb0));

      // second block, interleaved with the first to hide instruction latency
      __m128i b = _mm_load_si128 ((const __m128i*)(begin + 4));

      b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 1), ma0)
                      , _mm_and_si128 (_mm_slli_epi32(b, 1), mb0));

      // stage 2: swap bit pairs
      a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 2), ma1)
                      , _mm_and_si128 (_mm_slli_epi32(a, 2), mb1));

      b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 2), ma1)
                      , _mm_and_si128 (_mm_slli_epi32(b, 2), mb1));

      // stage 3: swap nibbles
      a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 4), ma2)
                      , _mm_and_si128 (_mm_slli_epi32(a, 4), mb2));

      b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 4), ma2)
                      , _mm_and_si128 (_mm_slli_epi32(b, 4), mb2));

      // stage 4: swap bytes
      a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 8), ma3)
                      , _mm_and_si128 (_mm_slli_epi32(a, 8), mb3));

      // stage 5: swap the 16-bit halves of each word (shuffle control 0xB1
      // exchanges adjacent 16-bit lanes) and store the finished block
      _mm_store_si128 ((__m128i*)begin
                      , _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, 0xB1), 0xB1));

      b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 8), ma3)
                      , _mm_and_si128 (_mm_slli_epi32(b, 8), mb3));

      _mm_store_si128 ((__m128i*)(begin + 4)
                      , _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, 0xB1), 0xB1));
    }

    // scalar tail: at most 7 words remain
    while (count--)
    {
      *begin = rbH5 (*begin);
      ++begin;
    }

  }
}
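
The function calls rbH5 for the unaligned head and the scalar tail, but that helper is not part of this paste. A minimal sketch of what it presumably looks like (the classic 5-step halves bit reversal) and a small, hypothetical driver showing how prb1 might be called are given below; the buffer size and test values are made up for illustration.

#include <cstdio>

// Assumed implementation of the scalar helper: reverse the bits of one
// 32-bit word by swapping progressively larger halves.
unsigned rbH5 (unsigned x)
{
  x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1);  // bits
  x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2);  // bit pairs
  x = ((x >> 4) & 0x0F0F0F0Fu) | ((x & 0x0F0F0F0Fu) << 4);  // nibbles
  x = ((x >> 8) & 0x00FF00FFu) | ((x & 0x00FF00FFu) << 8);  // bytes
  return (x >> 16) | (x << 16);                             // 16-bit halves
}

// Hypothetical usage: reverse every word of a small buffer and spot-check
// the first element against the scalar routine.
int main ()
{
  unsigned buf[37];
  for (unsigned i = 0; i != 37; ++i)
    buf[i] = i * 0x01010101u + 7u;

  prb1 (buf, 37);

  std::printf ("%08X %08X\n", buf[0], rbH5 (7u));  // both print E0000000
  return 0;
}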

