codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
/// halves method with iSSE2 with intrinsics void prb1 (unsigned *begin, unsigned count) { unsigned alignx = 4 - ((reinterpret_cast<unsigned>(begin) >> 2) & 3); if (count < alignx) alignx = count; for (unsigned i = alignx; i; --i, ++begin) *begin = rbH5 (*begin); if ((count -= alignx) != 0) { // masks const __m128i ma0 = _mm_set_epi32 (0x55555555, 0x55555555, 0x55555555, 0x55555555) , mb0 = _mm_set_epi32 (0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA) , ma1 = _mm_set_epi32 (0x33333333, 0x33333333, 0x33333333, 0x33333333) , mb1 = _mm_set_epi32 (0xCCCCCCCC, 0xCCCCCCCC, 0xCCCCCCCC, 0xCCCCCCCC) , ma2 = _mm_set_epi32 (0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F) , mb2 = _mm_set_epi32 (0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0) , ma3 = _mm_set_epi32 (0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF) , mb3 = _mm_set_epi32 (0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00) ; // 8 registers of 16 // loop for (; count > 7; begin += 8, count -= 8) { _mm_prefetch ((const char*)(begin + 96), _MM_HINT_NTA); __m128i a = _mm_load_si128 ((const __m128i*)(begin)); a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 1), ma0) , _mm_and_si128 (_mm_slli_epi32(a, 1), mb0)); __m128i b = _mm_load_si128 ((const __m128i*)(begin + 4)); b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 1), ma0) , _mm_and_si128 (_mm_slli_epi32(b, 1), mb0)); a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 2), ma1) , _mm_and_si128 (_mm_slli_epi32(a, 2), mb1)); b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 2), ma1) , _mm_and_si128 (_mm_slli_epi32(b, 2), mb1)); a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 4), ma2) , _mm_and_si128 (_mm_slli_epi32(a, 4), mb2)); b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 4), ma2) , _mm_and_si128 (_mm_slli_epi32(b, 4), mb2)); a = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(a, 8), ma3) , _mm_and_si128 (_mm_slli_epi32(a, 8), mb3)); _mm_store_si128 ((__m128i*)begin , _mm_shufflehi_epi16(_mm_shufflelo_epi16(a, 0xB1), 0xB1)); b = _mm_or_si128 (_mm_and_si128 (_mm_srli_epi32(b, 8), ma3) , _mm_and_si128 (_mm_slli_epi32(b, 8), mb3)); _mm_store_si128 ((__m128i*)(begin + 4) , _mm_shufflehi_epi16(_mm_shufflelo_epi16(b, 0xB1), 0xB1)); } while (count--) { *begin = rbH5 (*begin); ++begin; } } }
Private
[
?
]
Run code
Submit