Efficient Algorithm for Bit Reversal (from MSB->LSB to LSB->MSB) in C

C Problem Overview

What is the most efficient algorithm to achieve the following:

0010 0000 => 0000 0100

The conversion is from MSB->LSB to LSB->MSB. All bits must be reversed; that is, this is not endianness-swapping.

C Solutions

Solution 1 - C

NOTE: All algorithms below are in C, but should be portable to your language of choice (just don't look at me when they're not as fast :)

Options

Low Memory (32-bit int, 32-bit machine)(from here):

unsigned int
reverse(register unsigned int x)
{
	x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
	x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
	x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
	x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
	return((x >> 16) | (x << 16));

}

From the famous Bit Twiddling Hacks page:

Fastest (lookup table):

static const unsigned char BitReverseTable256[] = 
{
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

unsigned int v; // reverse 32-bit value, 8 bits at time
unsigned int c; // c will get v reversed

// Option 1:
c = (BitReverseTable256[v & 0xff] << 24) | 
    (BitReverseTable256[(v >> 8) & 0xff] << 16) | 
    (BitReverseTable256[(v >> 16) & 0xff] << 8) |
    (BitReverseTable256[(v >> 24) & 0xff]);

// Option 2:
unsigned char * p = (unsigned char *) &v;
unsigned char * q = (unsigned char *) &c;
q[3] = BitReverseTable256[p[0]]; 
q[2] = BitReverseTable256[p[1]]; 
q[1] = BitReverseTable256[p[2]]; 
q[0] = BitReverseTable256[p[3]];

You can extend this idea to 64-bit ints, or trade off memory for speed (assuming your L1 Data Cache is large enough), and reverse 16 bits at a time with a 64K-entry lookup table.

Others

Simple

unsigned int v;     // input bits to be reversed
unsigned int r = v & 1; // r will be reversed bits of v; first get LSB of v
int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end

for (v >>= 1; v; v >>= 1)
{   
  r <<= 1;
  r |= v & 1;
  s--;
}
r <<= s; // shift when v's highest bits are zero

Faster (32-bit processor)

unsigned char b = x;
b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;

Faster (64-bit processor)

unsigned char b; // reverse this (8-bit) byte
b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;

If you want to do this on a 32-bit int, just reverse the bits in each byte, and reverse the order of the bytes. That is:

unsigned int toReverse;
unsigned int reversed;
unsigned char inByte0 = (toReverse & 0xFF);
unsigned char inByte1 = (toReverse & 0xFF00) >> 8;
unsigned char inByte2 = (toReverse & 0xFF0000) >> 16;
unsigned char inByte3 = (toReverse & 0xFF000000) >> 24;
reversed = (reverseBits(inByte0) << 24) | (reverseBits(inByte1) << 16) | (reverseBits(inByte2) << 8) | (reverseBits(inByte3);

Results

I benchmarked the two most promising solutions, the lookup table, and bitwise-AND (the first one). The test machine is a laptop w/ 4GB of DDR2-800 and a Core 2 Duo T7500 @ 2.4GHz, 4MB L2 Cache; YMMV. I used gcc 4.3.2 on 64-bit Linux. OpenMP (and the GCC bindings) were used for high-resolution timers.

reverse.c

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

unsigned int
reverse(register unsigned int x)
{
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2));
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4));
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8));
    return((x >> 16) | (x << 16));

}

int main()
{
    unsigned int *ints = malloc(100000000*sizeof(unsigned int));
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
    for(unsigned int i = 0; i < 100000000; i++)
      ints[i] = rand();
    
    unsigned int *inptr = ints;
    unsigned int *outptr = ints2;
    unsigned int *endptr = ints + 100000000;
    // Starting the time measurement
    double start = omp_get_wtime();
    // Computations to be measured
    while(inptr != endptr)
    {
      (*outptr) = reverse(*inptr);
      inptr++;
      outptr++;
    }
    // Measuring the elapsed time
    double end = omp_get_wtime();
    // Time calculation (in seconds)
    printf("Time: %f seconds\n", end-start);
    
    free(ints);
    free(ints2);
    
    return 0;
}

reverse_lookup.c

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

static const unsigned char BitReverseTable256[] = 
{
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

int main()
{
    unsigned int *ints = malloc(100000000*sizeof(unsigned int));
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int));
    for(unsigned int i = 0; i < 100000000; i++)
      ints[i] = rand();
    
    unsigned int *inptr = ints;
    unsigned int *outptr = ints2;
    unsigned int *endptr = ints + 100000000;
    // Starting the time measurement
    double start = omp_get_wtime();
    // Computations to be measured
    while(inptr != endptr)
    {
    unsigned int in = *inptr;  
    
    // Option 1:
    //*outptr = (BitReverseTable256[in & 0xff] << 24) | 
    //    (BitReverseTable256[(in >> 8) & 0xff] << 16) | 
    //    (BitReverseTable256[(in >> 16) & 0xff] << 8) |
    //    (BitReverseTable256[(in >> 24) & 0xff]);
    
    // Option 2:
    unsigned char * p = (unsigned char *) &(*inptr);
    unsigned char * q = (unsigned char *) &(*outptr);
    q[3] = BitReverseTable256[p[0]]; 
    q[2] = BitReverseTable256[p[1]]; 
    q[1] = BitReverseTable256[p[2]]; 
    q[0] = BitReverseTable256[p[3]];
    
      inptr++;
      outptr++;
    }
    // Measuring the elapsed time
    double end = omp_get_wtime();
    // Time calculation (in seconds)
    printf("Time: %f seconds\n", end-start);
    
    free(ints);
    free(ints2);
    
    return 0;
}

I tried both approaches at several different optimizations, ran 3 trials at each level, and each trial reversed 100 million random unsigned ints. For the lookup table option, I tried both schemes (options 1 and 2) given on the bitwise hacks page. Results are shown below.

Bitwise AND

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 2.000593 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 1.938893 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 1.936365 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 0.942709 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.991104 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.947203 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse reverse.c
mrj10@mjlap:~/code$ ./reverse
Time: 0.922639 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.892372 seconds
mrj10@mjlap:~/code$ ./reverse
Time: 0.891688 seconds

Lookup Table (option 1)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.201127 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.196129 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.235972 seconds              
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.633042 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.655880 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.633390 seconds              
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.652322 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.631739 seconds              
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 0.652431 seconds

Lookup Table (option 2)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.671537 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.688173 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.664662 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.049851 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.048403 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.085086 seconds
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.082223 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.053431 seconds
mrj10@mjlap:~/code$ ./reverse_lookup
Time: 1.081224 seconds

Conclusion

Use the lookup table, with option 1 (byte addressing is unsurprisingly slow) if you're concerned about performance. If you need to squeeze every last byte of memory out of your system (and you might, if you care about the performance of bit reversal), the optimized versions of the bitwise-AND approach aren't too shabby either.

Caveat

Yes, I know the benchmark code is a complete hack. Suggestions on how to improve it are more than welcome. Things I know about:

I don't have access to ICC. This may be faster (please respond in a comment if you can test this out).
A 64K lookup table may do well on some modern microarchitectures with large L1D.
-mtune=native didn't work for -O2/-O3 (ld blew up with some crazy symbol redefinition error), so I don't believe the generated code is tuned for my microarchitecture.
There may be a way to do this slightly faster with SSE. I have no idea how, but with fast replication, packed bitwise AND, and swizzling instructions, there's got to be something there.
I know only enough x86 assembly to be dangerous; here's the code GCC generated on -O3 for option 1, so somebody more knowledgable than myself can check it out:

32-bit

.L3:
movl    (%r12,%rsi), %ecx
movzbl  %cl, %eax
movzbl  BitReverseTable256(%rax), %edx
movl    %ecx, %eax
shrl    $24, %eax
mov     %eax, %eax
movzbl  BitReverseTable256(%rax), %eax
sall    $24, %edx
orl     %eax, %edx
movzbl  %ch, %eax
shrl    $16, %ecx
movzbl  BitReverseTable256(%rax), %eax
movzbl  %cl, %ecx
sall    $16, %eax
orl     %eax, %edx
movzbl  BitReverseTable256(%rcx), %eax
sall    $8, %eax
orl     %eax, %edx
movl    %edx, (%r13,%rsi)
addq    $4, %rsi
cmpq    $400000000, %rsi
jne     .L3

EDIT: I also tried using uint64_t types on my machine to see if there was any performance boost. Performance was about 10% faster than 32-bit, and was nearly identical whether you were just using 64-bit types to reverse bits on two 32-bit int types at a time, or whether you were actually reversing bits in half as many 64-bit values. The assembly code is shown below (for the former case, reversing bits for two 32-bit int types at a time):

.L3:
movq    (%r12,%rsi), %rdx
movq    %rdx, %rax
shrq    $24, %rax
andl    $255, %eax
movzbl  BitReverseTable256(%rax), %ecx
movzbq  %dl,%rax
movzbl  BitReverseTable256(%rax), %eax
salq    $24, %rax
orq     %rax, %rcx
movq    %rdx, %rax
shrq    $56, %rax
movzbl  BitReverseTable256(%rax), %eax
salq    $32, %rax
orq     %rax, %rcx
movzbl  %dh, %eax
shrq    $16, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $16, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $16, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $8, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $8, %rdx
movzbl  BitReverseTable256(%rax), %eax
salq    $56, %rax
orq     %rax, %rcx
movzbq  %dl,%rax
shrq    $8, %rdx
movzbl  BitReverseTable256(%rax), %eax
andl    $255, %edx
salq    $48, %rax
orq     %rax, %rcx
movzbl  BitReverseTable256(%rdx), %eax
salq    $40, %rax
orq     %rax, %rcx
movq    %rcx, (%r13,%rsi)
addq    $8, %rsi
cmpq    $400000000, %rsi
jne     .L3

Solution 2 - C

This thread caught my attention since it deals with a simple problem that requires a lot of work (CPU cycles) even for a modern CPU. And one day I also stood there with the same ¤#%"#" problem. I had to flip millions of bytes. However I know all my target systems are modern Intel-based so let's start optimizing to the extreme!!!

So I used Matt J's lookup code as the base. the system I'm benchmarking on is a i7 haswell 4700eq.

Matt J's lookup bitflipping 400 000 000 bytes: Around 0.272 seconds.

I then went ahead and tried to see if Intel's ISPC compiler could vectorise the arithmetics in the reverse.c.

I'm not going to bore you with my findings here since I tried a lot to help the compiler find stuff, anyhow I ended up with performance of around 0.15 seconds to bitflip 400 000 000 bytes. It's a great reduction but for my application that's still way way too slow..

So people let me present the fastest Intel based bitflipper in the world. Clocked at:

Time to bitflip 400000000 bytes: 0.050082 seconds !!!!!

// Bitflip using AVX2 - The fastest Intel based bitflip in the world!!
// Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>

using namespace std;

#define DISPLAY_HEIGHT	4
#define DISPLAY_WIDTH 	32
#define NUM_DATA_BYTES	400000000

// Constants (first we got the mask, then the high order nibble look up table and last we got the low order nibble lookup table)
__attribute__ ((aligned(32))) static unsigned char k1[32*3]={
		0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
		0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
		0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
};

// The data to be bitflipped (+32 to avoid the quantization out of memory problem)
__attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};

extern "C" {
void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);
}

int main()
{

	for(unsigned int i = 0; i < NUM_DATA_BYTES; i++)
	{
		data[i] = rand();
	}

	printf ("\r\nData in(start):\r\n");
	for (unsigned int j = 0; j < 4; j++)
	{
		for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
		{
			printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
		}
		printf ("\r\n");
	}

	printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",(unsigned int)ceil(NUM_DATA_BYTES/32.0));

	double start_time = omp_get_wtime();
	bitflipbyte(data,(unsigned int)ceil(NUM_DATA_BYTES/32.0),k1);
	double end_time = omp_get_wtime();

	printf ("\r\nData out:\r\n");
	for (unsigned int j = 0; j < 4; j++)
	{
		for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
		{
			printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
		}
		printf ("\r\n");
	}
	printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);

	// return with no errors
	return 0;
}

The printf's are for debugging..

Here is the workhorse:

bits 64
global bitflipbyte

bitflipbyte:	
		vmovdqa 	ymm2, [rdx]
		add 		rdx, 20h
		vmovdqa 	ymm3, [rdx]
		add 		rdx, 20h
		vmovdqa 	ymm4, [rdx]
bitflipp_loop:
		vmovdqa 	ymm0, [rdi] 
		vpand 		ymm1, ymm2, ymm0 
		vpandn 		ymm0, ymm2, ymm0 
		vpsrld 		ymm0, ymm0, 4h 
		vpshufb 	ymm1, ymm4, ymm1 
		vpshufb 	ymm0, ymm3, ymm0		 
		vpor 		ymm0, ymm0, ymm1
		vmovdqa 	[rdi], ymm0
		add		rdi, 20h
		dec		rsi
		jnz		bitflipp_loop
    	ret

The code takes 32 bytes then masks out the nibbles. The high nibble gets shifted right by 4. Then I use vpshufb and ymm4 / ymm3 as lookup tables. I could use a single lookup table but then I would have to shift left before ORing the nibbles together again.

There are even faster ways of flipping the bits. But I'm bound to single thread and CPU so this was the fastest I could achieve. Can you make a faster version?

Please make no comments about using the Intel C/C++ Compiler Intrinsic Equivalent commands...

Solution 3 - C

This is another solution for folks who love recursion.

The idea is simple. Divide up input by half and swap the two halves, continue until it reaches single bit.

Illustrated in the example below.

Ex : If Input is 00101010   ==> Expected output is 01010100

1. Divide the input into 2 halves 
    0010 --- 1010
    
2. Swap the 2 Halves
    1010     0010

3. Repeat the same for each half.
    10 -- 10 ---  00 -- 10
    10    10      10    00

    1-0 -- 1-0 --- 1-0 -- 0-0
    0 1    0 1     0 1    0 0

Done! Output is 01010100

Here is a recursive function to solve it. (Note I have used unsigned ints, so it can work for inputs up to sizeof(unsigned int)*8 bits.

> The recursive function takes 2 parameters - The value whose bits need > to be reversed and the number of bits in the value.

int reverse_bits_recursive(unsigned int num, unsigned int numBits)
{
    unsigned int reversedNum;;
    unsigned int mask = 0;
   
    mask = (0x1 << (numBits/2)) - 1;

    if (numBits == 1) return num;
    reversedNum = reverse_bits_recursive(num >> numBits/2, numBits/2) |
                   reverse_bits_recursive((num & mask), numBits/2) << numBits/2;
    return reversedNum;
}

int main()
{
    unsigned int reversedNum;
    unsigned int num;

    num = 0x55;
    reversedNum = reverse_bits_recursive(num, 8);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0xabcd;
    reversedNum = reverse_bits_recursive(num, 16);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0x123456;
    reversedNum = reverse_bits_recursive(num, 24);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);

    num = 0x11223344;
    reversedNum = reverse_bits_recursive(num,32);
    printf ("Bit Reversal Input = 0x%x Output = 0x%x\n", num, reversedNum);
}

This is the output:

Bit Reversal Input = 0x55 Output = 0xaa
Bit Reversal Input = 0xabcd Output = 0xb3d5
Bit Reversal Input = 0x123456 Output = 0x651690
Bit Reversal Input = 0x11223344 Output = 0x22cc4488

Solution 4 - C

Well this certainly won't be an answer like Matt J's but hopefully it will still be useful.

size_t reverse(size_t n, unsigned int bytes)
{
    __asm__("BSWAP %0" : "=r"(n) : "0"(n));
    n >>= ((sizeof(size_t) - bytes) * 8);
    n = ((n & 0xaaaaaaaaaaaaaaaa) >> 1) | ((n & 0x5555555555555555) << 1);
    n = ((n & 0xcccccccccccccccc) >> 2) | ((n & 0x3333333333333333) << 2);
    n = ((n & 0xf0f0f0f0f0f0f0f0) >> 4) | ((n & 0x0f0f0f0f0f0f0f0f) << 4);
    return n;
}

This is exactly the same idea as Matt's best algorithm except that there's this little instruction called BSWAP which swaps the bytes (not the bits) of a 64-bit number. So b7,b6,b5,b4,b3,b2,b1,b0 becomes b0,b1,b2,b3,b4,b5,b6,b7. Since we are working with a 32-bit number we need to shift our byte-swapped number down 32 bits. This just leaves us with the task of swapping the 8 bits of each byte which is done and voila! we're done.

Timing: on my machine, Matt's algorithm ran in ~0.52 seconds per trial. Mine ran in about 0.42 seconds per trial. 20% faster is not bad I think.

If you're worried about the availability of the instruction BSWAP Wikipedia lists the instruction BSWAP as being added with 80846 which came out in 1989. It should be noted that Wikipedia also states that this instruction only works on 32 bit registers which is clearly not the case on my machine, it very much works only on 64-bit registers.

This method will work equally well for any integral datatype so the method can be generalized trivially by passing the number of bytes desired:

    size_t reverse(size_t n, unsigned int bytes)
    {
        __asm__("BSWAP %0" : "=r"(n) : "0"(n));
        n >>= ((sizeof(size_t) - bytes) * 8);
        n = ((n & 0xaaaaaaaaaaaaaaaa) >> 1) | ((n & 0x5555555555555555) << 1);
        n = ((n & 0xcccccccccccccccc) >> 2) | ((n & 0x3333333333333333) << 2);
        n = ((n & 0xf0f0f0f0f0f0f0f0) >> 4) | ((n & 0x0f0f0f0f0f0f0f0f) << 4);
        return n;
    }

which can then be called like:

    n = reverse(n, sizeof(char));//only reverse 8 bits
    n = reverse(n, sizeof(short));//reverse 16 bits
    n = reverse(n, sizeof(int));//reverse 32 bits
    n = reverse(n, sizeof(size_t));//reverse 64 bits

The compiler should be able to optimize the extra parameter away (assuming the compiler inlines the function) and for the sizeof(size_t) case the right-shift would be removed completely. Note that GCC at least is not able to remove the BSWAP and right-shift if passed sizeof(char).

Solution 5 - C

Anders Cedronius's answer provides a great solution for people that have an x86 CPU with AVX2 support. For x86 platforms without AVX support or non-x86 platforms, either of the following implementations should work well.

The first code is a variant of the classic binary partitioning method, coded to maximize the use of the shift-plus-logic idiom useful on various ARM processors. In addition, it uses on-the-fly mask generation which could be beneficial for RISC processors that otherwise require multiple instructions to load each 32-bit mask value. Compilers for x86 platforms should use constant propagation to compute all masks at compile time rather than run time.

/* Classic binary partitioning algorithm */
inline uint32_t brev_classic (uint32_t a)
{
    uint32_t m;
    a = (a >> 16) | (a << 16);                            // swap halfwords
    m = 0x00ff00ff; a = ((a >> 8) & m) | ((a << 8) & ~m); // swap bytes
    m = m^(m << 4); a = ((a >> 4) & m) | ((a << 4) & ~m); // swap nibbles
    m = m^(m << 2); a = ((a >> 2) & m) | ((a << 2) & ~m);
    m = m^(m << 1); a = ((a >> 1) & m) | ((a << 1) & ~m);
    return a;
}

In volume 4A of "The Art of Computer Programming", D. Knuth shows clever ways of reversing bits that somewhat surprisingly require fewer operations than the classical binary partitioning algorithms. One such algorithm for 32-bit operands, that I cannot find in TAOCP, is shown in this document on the Hacker's Delight website.

/* Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf. Retrieved 8/19/2015 */
inline uint32_t brev_knuth (uint32_t a)
{
    uint32_t t;
    a = (a << 15) | (a >> 17);
    t = (a ^ (a >> 10)) & 0x003f801f; 
    a = (t + (t << 10)) ^ a;
    t = (a ^ (a >>  4)) & 0x0e038421; 
    a = (t + (t <<  4)) ^ a;
    t = (a ^ (a >>  2)) & 0x22488842; 
    a = (t + (t <<  2)) ^ a;
    return a;
}

Using the Intel compiler C/C++ compiler 13.1.3.198, both of the above functions auto-vectorize nicely targetting XMM registers. They could also be vectorized manually without a lot of effort.

On my IvyBridge Xeon E3 1270v2, using the auto-vectorized code, 100 million uint32_t words were bit-reversed in 0.070 seconds using brev_classic(), and 0.068 seconds using brev_knuth(). I took care to ensure that my benchmark was not limited by system memory bandwidth.

Solution 6 - C

Presuming that you have an array of bits, how about this:

Starting from MSB, push bits into a stack one by one.
Pop bits from this stack into another array (or the same array if you want to save space), placing the first popped bit into MSB and going on to less significant bits from there.

Stack stack = new Stack(); Bit[] bits = new Bit[] { 0, 0, 1, 0, 0, 0, 0, 0 };

for (int i = 0; i < bits.Length; i++) { stack.push(bits[i]); }

for (int i = 0; i < bits.Length; i++) { bits[i] = stack.pop(); }

Solution 7 - C

This ain't no job for a human! ... but perfect for a machine

This is 2015, 6 years from when this question was first asked. Compilers have since become our masters, and our job as humans is only to help them. So what's the best way to give our intentions to the machine?

Bit-reversal is so common that you have to wonder why the x86's ever growing ISA doesn't include an instruction to do it one go.

The reason: if you give your true concise intent to the compiler, bit reversal should only take ~20 CPU cycles. Let me show you how to craft reverse() and use it:

#include <inttypes.h>
#include <stdio.h>

uint64_t reverse(const uint64_t n,
                 const uint64_t k)
{
        uint64_t r, i;
        for (r = 0, i = 0; i < k; ++i)
                r |= ((n >> i) & 1) << (k - i - 1);
        return r;
}

int main()
{
        const uint64_t size = 64;
        uint64_t sum = 0;
        uint64_t a;
        for (a = 0; a < (uint64_t)1 << 30; ++a)
                sum += reverse(a, size);
        printf("%" PRIu64 "\n", sum);
        return 0;
}

Compiling this sample program with Clang version >= 3.6, -O3, -march=native (tested with Haswell), gives artwork-quality code using the new AVX2 instructions, with a runtime of 11 seconds processing ~1 billion reverse()s. That's ~10 ns per reverse(), with .5 ns CPU cycle assuming 2 GHz puts us at the sweet 20 CPU cycles.

You can fit 10 reverse()s in the time it takes to access RAM once for a single large array!
You can fit 1 reverse() in the time it takes to access an L2 cache LUT twice.

Caveat: this sample code should hold as a decent benchmark for a few years, but it will eventually start to show its age once compilers are smart enough to optimize main() to just printf the final result instead of really computing anything. But for now it works in showcasing reverse().

Solution 8 - C

Native ARM instruction "rbit" can do it with 1 cpu cycle and 1 extra cpu register, impossible to beat.

Solution 9 - C

Of course the obvious source of bit-twiddling hacks is here: http://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious

Solution 10 - C

Implementation with low memory and fastest.

private Byte  BitReverse(Byte bData)
    {
        Byte[] lookup = { 0, 8,  4, 12, 
                          2, 10, 6, 14 , 
                          1, 9,  5, 13,
                          3, 11, 7, 15 };
        Byte ret_val = (Byte)(((lookup[(bData & 0x0F)]) << 4) + lookup[((bData & 0xF0) >> 4)]);
        return ret_val;
    }

Solution 11 - C

Well, this is basically the same as the first "reverse()" but it is 64 bit and only needs one immediate mask to be loaded from the instruction stream. GCC creates code without jumps, so this should be pretty fast.

#include <stdio.h>

static unsigned long long swap64(unsigned long long val)
{
#define ZZZZ(x,s,m) (((x) >>(s)) & (m)) | (((x) & (m))<<(s));
/* val = (((val) >>16) & 0xFFFF0000FFFF) | (((val) & 0xFFFF0000FFFF)<<16); */

val = ZZZZ(val,32,  0x00000000FFFFFFFFull );
val = ZZZZ(val,16,  0x0000FFFF0000FFFFull );
val = ZZZZ(val,8,   0x00FF00FF00FF00FFull );
val = ZZZZ(val,4,   0x0F0F0F0F0F0F0F0Full );
val = ZZZZ(val,2,   0x3333333333333333ull );
val = ZZZZ(val,1,   0x5555555555555555ull );

return val;
#undef ZZZZ
}

int main(void)
{
unsigned long long val, aaaa[16] =
 { 0xfedcba9876543210,0xedcba9876543210f,0xdcba9876543210fe,0xcba9876543210fed
 , 0xba9876543210fedc,0xa9876543210fedcb,0x9876543210fedcba,0x876543210fedcba9
 , 0x76543210fedcba98,0x6543210fedcba987,0x543210fedcba9876,0x43210fedcba98765
 , 0x3210fedcba987654,0x210fedcba9876543,0x10fedcba98765432,0x0fedcba987654321
 };
unsigned iii;

for (iii=0; iii < 16; iii++) {
    val = swap64 (aaaa[iii]);
    printf("A[]=%016llX Sw=%016llx\n", aaaa[iii], val);
    }
return 0;
}

Solution 12 - C

I was curious how fast would be the obvious raw rotation. On my machine (i7@2600), the average for 1,500,150,000 iterations was 27.28 ns (over a a random set of 131,071 64-bit integers).

Advantages: the amount of memory needed is little and the code is simple. I would say it is not that large, either. The time required is predictable and constant for any input (128 arithmetic SHIFT operations + 64 logical AND operations + 64 logical OR operations).

I compared to the best time obtained by @Matt J - who has the accepted answer. If I read his answer correctly, the best he has got was 0.631739 seconds for 1,000,000 iterations, which leads to an average of 631 ns per rotation.

The code snippet I used is this one below:

unsigned long long reverse_long(unsigned long long x)
{
    return (((x >> 0) & 1) << 63) |
           (((x >> 1) & 1) << 62) |
           (((x >> 2) & 1) << 61) |
           (((x >> 3) & 1) << 60) |
           (((x >> 4) & 1) << 59) |
           (((x >> 5) & 1) << 58) |
           (((x >> 6) & 1) << 57) |
           (((x >> 7) & 1) << 56) |
           (((x >> 8) & 1) << 55) |
           (((x >> 9) & 1) << 54) |
           (((x >> 10) & 1) << 53) |
           (((x >> 11) & 1) << 52) |
           (((x >> 12) & 1) << 51) |
           (((x >> 13) & 1) << 50) |
           (((x >> 14) & 1) << 49) |
           (((x >> 15) & 1) << 48) |
           (((x >> 16) & 1) << 47) |
           (((x >> 17) & 1) << 46) |
           (((x >> 18) & 1) << 45) |
           (((x >> 19) & 1) << 44) |
           (((x >> 20) & 1) << 43) |
           (((x >> 21) & 1) << 42) |
           (((x >> 22) & 1) << 41) |
           (((x >> 23) & 1) << 40) |
           (((x >> 24) & 1) << 39) |
           (((x >> 25) & 1) << 38) |
           (((x >> 26) & 1) << 37) |
           (((x >> 27) & 1) << 36) |
           (((x >> 28) & 1) << 35) |
           (((x >> 29) & 1) << 34) |
           (((x >> 30) & 1) << 33) |
           (((x >> 31) & 1) << 32) |
           (((x >> 32) & 1) << 31) |
           (((x >> 33) & 1) << 30) |
           (((x >> 34) & 1) << 29) |
           (((x >> 35) & 1) << 28) |
           (((x >> 36) & 1) << 27) |
           (((x >> 37) & 1) << 26) |
           (((x >> 38) & 1) << 25) |
           (((x >> 39) & 1) << 24) |
           (((x >> 40) & 1) << 23) |
           (((x >> 41) & 1) << 22) |
           (((x >> 42) & 1) << 21) |
           (((x >> 43) & 1) << 20) |
           (((x >> 44) & 1) << 19) |
           (((x >> 45) & 1) << 18) |
           (((x >> 46) & 1) << 17) |
           (((x >> 47) & 1) << 16) |
           (((x >> 48) & 1) << 15) |
           (((x >> 49) & 1) << 14) |
           (((x >> 50) & 1) << 13) |
           (((x >> 51) & 1) << 12) |
           (((x >> 52) & 1) << 11) |
           (((x >> 53) & 1) << 10) |
           (((x >> 54) & 1) << 9) |
           (((x >> 55) & 1) << 8) |
           (((x >> 56) & 1) << 7) |
           (((x >> 57) & 1) << 6) |
           (((x >> 58) & 1) << 5) |
           (((x >> 59) & 1) << 4) |
           (((x >> 60) & 1) << 3) |
           (((x >> 61) & 1) << 2) |
           (((x >> 62) & 1) << 1) |
           (((x >> 63) & 1) << 0);
}

Solution 13 - C

You might want to use the standard template library. It might be slower than the above mentioned code. However, it seems to me clearer and easier to understand.

 #include<bitset>
 #include<iostream>


 template<size_t N>
 const std::bitset<N> reverse(const std::bitset<N>& ordered)
 {
      std::bitset<N> reversed;
      for(size_t i = 0, j = N - 1; i < N; ++i, --j)
           reversed[j] = ordered[i];
      return reversed;
 };


 // test the function
 int main()
 {
      unsigned long num; 
      const size_t N = sizeof(num)*8;

      std::cin >> num;
      std::cout << std::showbase << std::hex;
      std::cout << "ordered  = " << num << std::endl;
      std::cout << "reversed = " << reverse<N>(num).to_ulong()  << std::endl;
      std::cout << "double_reversed = " << reverse<N>(reverse<N>(num)).to_ulong() << std::endl;  
 }

Solution 14 - C

Generic

C code. Using 1 byte input data num as example.

    unsigned char num = 0xaa;   // 1010 1010 (aa) -> 0101 0101 (55)
    int s = sizeof(num) * 8;    // get number of bits
    int i, x, y, p;
    int var = 0;                // make var data type to be equal or larger than num

    for (i = 0; i < (s / 2); i++) {
        // extract bit on the left, from MSB
        p = s - i - 1;
        x = num & (1 << p);
        x = x >> p;
        printf("x: %d\n", x);

        // extract bit on the right, from LSB
        y = num & (1 << i);
        y = y >> i;
        printf("y: %d\n", y);

        var = var | (x << i);       // apply x
        var = var | (y << p);       // apply y
    }

    printf("new: 0x%x\n", new);

Solution 15 - C

How about the following:

    uint reverseMSBToLSB32ui(uint input)
    {
        uint output = 0x00000000;
        uint toANDVar = 0;
        int places = 0;

        for (int i = 1; i < 32; i++)
        {
            places = (32 - i);
            toANDVar = (uint)(1 << places);
            output |= (uint)(input & (toANDVar)) >> places;

        }


        return output;
    }

Small and easy (though, 32 bit only).

Solution 16 - C

I thought this is one of the simplest way to reverse the bit. please let me know if there is any flaw in this logic. basically in this logic, we check the value of the bit in position. set the bit if value is 1 on reversed position.

void bit_reverse(ui32 *data)
{
  ui32 temp = 0;    
  ui32 i, bit_len;    
  {    
   for(i = 0, bit_len = 31; i <= bit_len; i++)   
   {    
    temp |= (*data & 1 << i)? (1 << bit_len-i) : 0;    
   }    
   *data = temp;    
  }    
  return;    
}

Solution 17 - C

unsigned char ReverseBits(unsigned char data)
{
    unsigned char k = 0, rev = 0;

    unsigned char n = data;

    while(n)

    {
        k = n & (~(n - 1));
        n &= (n - 1);
        rev |= (128 / k);
    }
    return rev;
}

Solution 18 - C

I think the simplest method I know follows. MSB is input and LSB is 'reversed' output:

unsigned char rev(char MSB) {
    unsigned char LSB=0;  // for output
    _FOR(i,0,8) {
        LSB= LSB << 1;
        if(MSB&1) LSB = LSB | 1;
        MSB= MSB >> 1;
    }
    return LSB;
}
    
//    It works by rotating bytes in opposite directions. 
//    Just repeat for each byte.

Solution 19 - C

// Purpose: to reverse bits in an unsigned short integer 
// Input: an unsigned short integer whose bits are to be reversed
// Output: an unsigned short integer with the reversed bits of the input one
unsigned short ReverseBits( unsigned short a )
{
     // declare and initialize number of bits in the unsigned short integer
	 const char num_bits = sizeof(a) * CHAR_BIT;

	 // declare and initialize bitset representation of integer a
	 bitset<num_bits> bitset_a(a);			

	 // declare and initialize bitset representation of integer b (0000000000000000)
	 bitset<num_bits> bitset_b(0);					

	 // declare and initialize bitset representation of mask (0000000000000001)
	 bitset<num_bits> mask(1);			

	 for ( char i = 0; i < num_bits; ++i )
	 {
		  bitset_b = (bitset_b << 1) | bitset_a & mask;
		  bitset_a >>= 1;
	 }

	 return (unsigned short) bitset_b.to_ulong();
}

void PrintBits( unsigned short a )
{
	 // declare and initialize bitset representation of a
	 bitset<sizeof(a) * CHAR_BIT> bitset(a);

	 // print out bits
     cout << bitset << endl;
}


// Testing the functionality of the code

int main ()
{
	 unsigned short a = 17, b;

	 cout << "Original: "; 
	 PrintBits(a);

	 b = ReverseBits( a );

	 cout << "Reversed: ";
	 PrintBits(b);
}

// Output:
Original: 0000000000010001
Reversed: 1000100000000000

Solution 20 - C

Another loop-based solution that exits quickly when the number is low (in C++ for multiple types)

template<class T>
T reverse_bits(T in) {
    T bit = static_cast<T>(1) << (sizeof(T) * 8 - 1);
    T out;

    for (out = 0; bit && in; bit >>= 1, in >>= 1) {
        if (in & 1) {
            out |= bit;
        }
    }
    return out;
}

or in C for an unsigned int

unsigned int reverse_bits(unsigned int in) {
    unsigned int bit = 1u << (sizeof(T) * 8 - 1);
    unsigned int out;

    for (out = 0; bit && in; bit >>= 1, in >>= 1) {
        if (in & 1)
            out |= bit;
    }
    return out;
}

Solution 21 - C

It seems that many other posts are concerned about speed (i.e best = fastest). What about simplicity? Consider:

char ReverseBits(char character) {
    char reversed_character = 0;
    for (int i = 0; i < 8; i++) {
        char ith_bit = (c >> i) & 1;
        reversed_character |= (ith_bit << (sizeof(char) - 1 - i));
    }
    return reversed_character;
}

and hope that clever compiler will optimise for you.

If you want to reverse a longer list of bits (containing sizeof(char) * n bits), you can use this function to get:

void ReverseNumber(char* number, int bit_count_in_number) {
    int bytes_occupied = bit_count_in_number / sizeof(char);      

    // first reverse bytes
    for (int i = 0; i <= (bytes_occupied / 2); i++) {
        swap(long_number[i], long_number[n - i]);
    }
    
    // then reverse bits of each individual byte
    for (int i = 0; i < bytes_occupied; i++) {
         long_number[i] = ReverseBits(long_number[i]);
    }
}

This would reverse [10000000, 10101010] into [01010101, 00000001].

Solution 22 - C

Efficient can mean throughput or latency.

For throughout, see the answer by Anders Cedronius, it’s a good one.

For lower latency, I would recommend this code:

uint32_t reverseBits( uint32_t x )
{
#if defined(__arm__) || defined(__aarch64__)
	__asm__( "rbit %0, %1" : "=r" ( x ) : "r" ( x ) );
	return x;
#endif
	// Flip pairwise
	x = ( ( x & 0x55555555 ) << 1 ) | ( ( x & 0xAAAAAAAA ) >> 1 );
	// Flip pairs
	x = ( ( x & 0x33333333 ) << 2 ) | ( ( x & 0xCCCCCCCC ) >> 2 );
	// Flip nibbles
	x = ( ( x & 0x0F0F0F0F ) << 4 ) | ( ( x & 0xF0F0F0F0 ) >> 4 );

	// Flip bytes. CPUs have an instruction for that, pretty fast one.
#ifdef _MSC_VER
	return _byteswap_ulong( x );
#elif defined(__INTEL_COMPILER)
	return (uint32_t)_bswap( (int)x );
#else
	// Assuming gcc or clang
	return __builtin_bswap32( x );
#endif
}

Compilers output: https://godbolt.org/z/5ehd89

Solution 23 - C

For other web-searchers who might encounter this question, here is a summary (for C and JavaScript).

For a complete solution in JavaScript, we can first generate the table:

const BIT_REVERSAL_TABLE = new Array(256)

for (var i = 0; i < 256; ++i) {
  var v = i, r = i, s = 7;
  for (v >>>= 1; v; v >>>= 1) {
    r <<= 1;
    r |= v & 1;
    --s;
  }
  BIT_REVERSAL_TABLE[i] = (r << s) & 0xff;
}

This gives us BIT_REVERSAL_TABLE, which is what @MattJ posted:

const BIT_REVERSAL_TABLE = new Uint8Array([      
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, 
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, 
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, 
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
])

Then the algorithms for 8-bit, 16-bit, and 32-bit unsigned integers can be found here:

function reverseBits8(n) {
  return BIT_REVERSAL_TABLE[n]
}

function reverseBits16(n) {
  return (BIT_REVERSAL_TABLE[(n >> 8) & 0xff] |
    BIT_REVERSAL_TABLE[n & 0xff] << 8)
}

function reverseBits32(n) {
  return (BIT_REVERSAL_TABLE[n & 0xff] << 24) |
    (BIT_REVERSAL_TABLE[(n >>> 8) & 0xff] << 16) |
    (BIT_REVERSAL_TABLE[(n >>> 16) & 0xff] << 8) |
    BIT_REVERSAL_TABLE[(n >>> 24) & 0xff];
}

Note, the 32-bit version doesn't work in JavaScript (must convert to using BigInts which is straightforward), but should work in a 64-bit language:

log8(0b11000100)
log16(0b1110001001001100)
log32(0b11110010111110111100110010101011)

// 0b11000100 => 0b00100011
// 0b1110001001001100 => 0b0011001001000111
// doesn't work in JS it seems:
// 0b11110010111110111100110010101011 => 0b0-101010110011000010000010110001

function log8(n) {
  console.log(`${bits(n, 8)} => ${bits(reverseBits8(n), 8)}`)
}

function log16(n) {
  console.log(`${bits(n, 16)} => ${bits(reverseBits16(n), 16)}`)
}

function log32(n) {
  console.log(`${bits(n, 32)} => ${bits(reverseBits32(n), 32)}`)
}

function bits(n, size) {
  return `0b${n.toString(2).padStart(size, '0')}`
}

Note: This solution works in JavaScript for 32-bits:

function reverseBits32(n) {
  let res = 0;
  for (let i = 0; i < 32; i++) {
    res = (res << 1) + (n & 1);
    n = n >>> 1;
  }

  return res >>> 0;
}

All 3 table based solutions will work fine in C. Here is a rough C version:

#include <stdlib.h>

static uint8_t* BIT_REVERSAL_TABLE;

uint8_t* 
make_bit_reversal_table() {
  uint8_t *table = malloc(256 * sizeof(uint8_t));
  uint8_t i;
  for (i = 0; i < 256 ; ++i) {
    uint8_t v = i;
    uint8_t r = i;
    uint8_t s = 7;
    for (v = v >> 1; v; v = v >> 1) {
      r <<= 1;
      r |= v & 1;
      --s;
    }
    table[i] = (r << s) & 0xff;
  }
  return table;
}

uint8_t 
reverse_bits_8(uint8_t n) {
  return BIT_REVERSAL_TABLE[n];
}

uint16_t
reverse_bits_16(uint16_t n)
{
  return (BIT_REVERSAL_TABLE[(n >> 8) & 0xff]
    | BIT_REVERSAL_TABLE[n & 0xff] << 8);
}

uint32_t
reverse_bits_32(uint32_t n) {
  return (BIT_REVERSAL_TABLE[n & 0xff] << 24) 
    | (BIT_REVERSAL_TABLE[(n >> 8) & 0xff] << 16) 
    | (BIT_REVERSAL_TABLE[(n >> 16) & 0xff] << 8) 
    | BIT_REVERSAL_TABLE[(n >> 24) & 0xff];
}

int 
main(void) {
  BIT_REVERSAL_TABLE = make_bit_reversal_table();
  return 0;
}

Solution 24 - C

Bit reversal in pseudo code

source -> byte to be reversed b00101100 destination -> reversed, also needs to be of unsigned type so sign bit is not propogated down

copy into temp so original is unaffected, also needs to be of unsigned type so that sign bit is not shifted in automaticaly

bytecopy = b0010110

LOOP8: //do this 8 times test if bytecopy is < 0 (negative)

	set bit8 (msb) of reversed = reversed | b10000000 

else do not set bit8

shift bytecopy left 1 place
bytecopy = bytecopy << 1 = b0101100 result

shift result right 1 place
reversed = reversed >> 1 = b00000000
8 times no then up^ LOOP8
8 times yes then done.

Solution 25 - C

This is for 32 bit, we need to change the size if we consider 8 bits.

    void bitReverse(int num)
    {
        int num_reverse = 0;
        int size = (sizeof(int)*8) -1;
        int i=0,j=0;
        for(i=0,j=size;i<=size,j>=0;i++,j--)
        {
            if((num >> i)&1)
            {
                num_reverse = (num_reverse | (1<<j));
            }
        }
        printf("\n rev num = %d\n",num_reverse);
    }

Reading the input integer "num" in LSB->MSB order and storing in num_reverse in MSB->LSB order.

Solution 26 - C

My simple solution

BitReverse(IN)
	OUT = 0x00;
	R = 1;		// Right mask   ...0000.0001
	L = 0;		// Left mask    1000.0000...
    L = ~0; 
    L = ~(i >> 1);
	int size = sizeof(IN) * 4;  // bit size

	while(size--){
		if(IN & L) OUT = OUT | R; // start from MSB  1000.xxxx
		if(IN & R) OUT = OUT | L; // start from LSB  xxxx.0001
		L = L >> 1;
		R = R << 1; 
    }
    return OUT;

Content Type	Original Author	Original Content on Stackoverflow
Question	green_t	View Question on Stackoverflow
Solution 1 - C	Matt J	View Answer on Stackoverflow
Solution 2 - C	Anders Cedronius	View Answer on Stackoverflow
Solution 3 - C	Dennis Mathews	View Answer on Stackoverflow
Solution 4 - C	SirGuy	View Answer on Stackoverflow
Solution 5 - C	njuffa	View Answer on Stackoverflow
Solution 6 - C	Frederick The Fool	View Answer on Stackoverflow
Solution 7 - C	user2875414	View Answer on Stackoverflow
Solution 8 - C	metalogic	View Answer on Stackoverflow
Solution 9 - C	Anders Hansson	View Answer on Stackoverflow
Solution 10 - C	Aung	View Answer on Stackoverflow
Solution 11 - C	wildplasser	View Answer on Stackoverflow
Solution 12 - C	marian adam	View Answer on Stackoverflow
Solution 13 - C	Cem	View Answer on Stackoverflow
Solution 14 - C	vjangus	View Answer on Stackoverflow
Solution 15 - C	BlueAutumn	View Answer on Stackoverflow
Solution 16 - C	Arun Nagendran	View Answer on Stackoverflow
Solution 17 - C	user3615967	View Answer on Stackoverflow
Solution 18 - C	user7726695	View Answer on Stackoverflow
Solution 19 - C	MikhailJacques	View Answer on Stackoverflow
Solution 20 - C	Daniel Santos	View Answer on Stackoverflow
Solution 21 - C	mercury0114	View Answer on Stackoverflow
Solution 22 - C	Soonts	View Answer on Stackoverflow
Solution 23 - C	Lance	View Answer on Stackoverflow
Solution 24 - C	Peter Sikora	View Answer on Stackoverflow
Solution 25 - C	karthik kalakodimi	View Answer on Stackoverflow
Solution 26 - C	Ivan Hionidi	View Answer on Stackoverflow