/// halves-optimized, 32-bit bit reversal using intrinsics for bswap + rotation
///
/// Returns `arg` with its 32 bits in reverse order (bit 0 <-> bit 31,
/// bit 1 <-> bit 30, ...). Assumes `unsigned` is 32 bits wide, as the masks do.
///
/// How it works: three masked rotate-and-merge steps shift selected bits left
/// (cyclically) by 2, 4 and 8 positions. A bit that started at bit k of byte B
/// ends up at position 8*B + 14 - k (mod 32). Rotating right by 7 then places
/// it at bit 7 - k of the same byte, i.e. every byte is bit-reversed in place.
/// The final byte swap reverses the byte order, completing the reversal.
///
/// With MSVC-compatible compilers the `_rotl` / `_rotr` / `_byteswap_ulong`
/// intrinsics are used (their declarations are expected to come from a header
/// this file already includes). Other compilers get equivalent shift/or
/// helpers, so the function no longer depends on MSVC-only names.
#if defined(_MSC_VER)
#define RBH5_ROTL(v, n) _rotl((v), (n))
#define RBH5_ROTR(v, n) _rotr((v), (n))
#define RBH5_BSWAP(v)   _byteswap_ulong(v)
#else
/* Rotate a 32-bit value left by n bits; n must be in 1..31. */
static unsigned rbH5_rotl(unsigned v, int n)
{
    return (v << n) | (v >> (32 - n));
}

/* Rotate a 32-bit value right by n bits; n must be in 1..31. */
static unsigned rbH5_rotr(unsigned v, int n)
{
    return (v >> n) | (v << (32 - n));
}

/* Reverse the order of the four bytes of a 32-bit value. */
static unsigned rbH5_bswap(unsigned v)
{
    return (v << 24)
         | ((v & 0x0000FF00u) << 8)
         | ((v >> 8) & 0x0000FF00u)
         | (v >> 24);
}

#define RBH5_ROTL(v, n) rbH5_rotl((v), (n))
#define RBH5_ROTR(v, n) rbH5_rotr((v), (n))
#define RBH5_BSWAP(v)   rbH5_bswap(v)
#endif

unsigned rbH5 (const unsigned arg)
{
    /* Even-numbered bits advance by 2 (cyclically); odd-numbered bits stay. */
    const unsigned a = (RBH5_ROTL(arg, 2) & 0x55555555) | (arg & 0xAAAAAAAA);

    /* The middle two bits of every nibble advance by 4; the outer two stay. */
    const unsigned b = (RBH5_ROTL(a, 4) & 0x66666666) | (a & 0x99999999);

    /* Bits 3..6 of every byte advance by 8; bits 0..2 and 7 stay. */
    const unsigned c = (RBH5_ROTL(b, 8) & 0x78787878) | (b & 0x87878787);

    /* Rotating right by 7 leaves each byte bit-reversed; then swap the bytes. */
    return RBH5_BSWAP(RBH5_ROTR(c, 7));
}

#undef RBH5_ROTL
#undef RBH5_ROTR
#undef RBH5_BSWAP