codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
/// halves method with SSSE3 (pshufb used as bswap) void prb2 (unsigned *begin, unsigned count) { unsigned alignx = 4 - ((reinterpret_cast<unsigned>(begin) >> 2) & 3); if (count < alignx) alignx = count; for (unsigned i = alignx; i; --i, ++begin) *begin = rbH5 (*begin); if ((count -= alignx) != 0) { __declspec(align(16)) unsigned char shm[16] = { '\x3', '\x2', '\x1', '\x0', '\x7', '\x6', '\x5', '\x4' , '\xB', '\xA', '\x9', '\x8', '\xF', '\xE', '\xD', '\xC' }; __asm { // prepare masks mov eax, 55555555H mov edx, 33333333H movd xmm2, eax // xmm2 stores (5) == not (A) movd xmm3, edx // xmm3 stores (3) == not (C) pshufd xmm2, xmm2, 0 // broadcast pshufd xmm3, xmm3, 0 // broadcast mov eax, 0F0F0F0FH movd xmm4, eax // xmm4 stores (0F) == not (F0) pshufd xmm4, xmm4, 0 // shuffle mask movdqa xmm5, OWORD PTR [shm] // begin loop mov edx, begin mov ecx, count align 16 LOOP1: prefetchnta [edx+384] movdqa xmm0, OWORD PTR [edx] // load next four integers movdqa xmm7, xmm2 // 0-1-7 movdqa xmm1, xmm0 psrld xmm0, 1 pslld xmm1, 1 pand xmm0, xmm2 // mask (5) pandn xmm7, xmm1 // mask (A) por xmm0, xmm7 // now adjacent bits are swapped movdqa xmm1, xmm3 // 0-6-1 movdqa xmm6, xmm0 psrld xmm0, 2 pslld xmm6, 2 pand xmm0, xmm3 // mask (3) pandn xmm1, xmm6 // mask (C) por xmm0, xmm1 // now adjacent bit pairs are swapped movdqa xmm7, xmm4 // 0-1-7 movdqa xmm1, xmm0 psrld xmm0, 4 pslld xmm1, 4 pand xmm0, xmm4 // mask (0F) pandn xmm7, xmm1 // mask (F0) por xmm0, xmm7 // adjacent bit groups of four are swapped pshufb xmm0, xmm5 movdqa OWORD PTR [edx], xmm0 // store resulting four integers add edx, 16 sub ecx, 4 cmp ecx, 3 ja LOOP1 // while ecx > 3 // end loop mov begin, edx mov count, ecx } while (count--) { *begin = rbH5 (*begin); ++begin; } } }
Private
[
?
]
Run code
Submit