[ create a new paste ] login | about

Link: http://codepad.org/NV5aixkz    [ raw code | fork ]

mohit_at_codepad - C++, pasted on Sep 4:
/* This program emulates population count for GPUs */
/* Memory look up is very costly and so is the integer arithmetic */
/* Supports parallel processing and there are plenty of optimizations */
/* possible in this algo making it suitable for the requirement */

/* Number of constants required = logn for masks + logn for shifts */
/* Number of temps required = 1 to keep ui >> k value */
/* Number of shifts = logn */
/* Number of addition = logn */
/* Number of andings = 2logn */

int algo1(unsigned int ui)
{
  return static_cast<int>( __builtin_popcount(ui) );
}

int algo2(unsigned int ui)
{
  int masks[] = {0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff};
  ui = (ui & masks[0]) + ((ui >> 1)  & masks[0]);
  ui = (ui & masks[1]) + ((ui >> 2)  & masks[1]);
  ui = (ui & masks[2]) + ((ui >> 4)  & masks[2]);
  ui = (ui & masks[3]) + ((ui >> 8)  & masks[3]);
  ui = (ui & masks[4]) + ((ui >> 16) & masks[4]);
  return static_cast<int>(ui);
}

int main()
{
  for(int i = 0; i < 1000; ++i)
  {
    const unsigned int ui = (rand() % 0xFFFF) << 16 | rand() % 0xFFFF;
    const int m1 = algo1(ui);
    const int m2 = algo2(ui);
    if(m1 != m2) cout << "Fails for " << ui << endl;
  }
}


Output:
No errors or program output.


Create a new paste based on this one


Comments: