Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/util/bit_packing.hh
00001 #ifndef UTIL_BIT_PACKING__
00002 #define UTIL_BIT_PACKING__
00003 
00004 /* Bit-level packing routines */
00005 
00006 #include <assert.h>
00007 #ifdef __APPLE__
00008 #include <architecture/byte_order.h>
00009 #elif __linux__
00010 #include <endian.h>
00011 #else
00012 #include <arpa/nameser_compat.h>
00013 #endif 
00014 
00015 #include <inttypes.h>
00016 
00017 namespace util {
00018 
00019 /* WARNING WARNING WARNING:
00020  * The write functions assume that memory is zero initially.  This makes them
00021  * faster and is the appropriate case for mmapped language model construction.
00022  * These routines assume that unaligned access to uint64_t is fast and that
00023  * storage is little endian.  This is the case on x86_64.  I'm not sure how 
00024  * fast unaligned 64-bit access is on x86 but my target audience is large
00025  * language models for which 64-bit is necessary.  
00026  *
00027  * Call the BitPackingSanity function to sanity check.  Calling once suffices,
00028  * but it may be called multiple times when that's inconvenient.  
00029  */
00030 
00031 
00032 // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.  
00033 #if BYTE_ORDER == LITTLE_ENDIAN
00034 inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
00035   return bit;
00036 }
00037 #elif BYTE_ORDER == BIG_ENDIAN
00038 inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
00039   return 64 - length - bit;
00040 }
00041 #else
00042 #error "Bit packing code isn't written for your byte order."
00043 #endif
00044 
00045 inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
00046   return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
00047 }
00048 
00049 /* Pack integers up to 57 bits using their least significant digits. 
00050  * The length is specified using mask:
00051  * Assumes mask == (1 << length) - 1 where length <= 57.   
00052  */
00053 inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) {
00054   return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask;
00055 }
00056 /* Assumes value < (1 << length) and length <= 57.
00057  * Assumes the memory is zero initially. 
00058  */
00059 inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
00060   *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= 
00061     (value << BitPackShift(bit_off & 7, length));
00062 }
00063 
00064 /* Same caveats as above, but for a 25 bit limit. */
00065 inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
00066   return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
00067 }
00068 
00069 inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
00070   *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= 
00071     (value << BitPackShift(bit_off & 7, length));
00072 }
00073 
00074 typedef union { float f; uint32_t i; } FloatEnc;
00075 
00076 inline float ReadFloat32(const void *base, uint64_t bit_off) {
00077   FloatEnc encoded;
00078   encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32);
00079   return encoded.f;
00080 }
00081 inline void WriteFloat32(void *base, uint64_t bit_off, float value) {
00082   FloatEnc encoded;
00083   encoded.f = value;
00084   WriteInt57(base, bit_off, 32, encoded.i);
00085 }
00086 
00087 const uint32_t kSignBit = 0x80000000;
00088 
00089 inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
00090   FloatEnc encoded;
00091   encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31);
00092   // Sign bit set means negative.  
00093   encoded.i |= kSignBit;
00094   return encoded.f;
00095 }
00096 inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) {
00097   FloatEnc encoded;
00098   encoded.f = value;
00099   encoded.i &= ~kSignBit;
00100   WriteInt57(base, bit_off, 31, encoded.i);
00101 }
00102 
00103 void BitPackingSanity();
00104 
00105 // Return bits required to store integers upto max_value.  Not the most
00106 // efficient implementation, but this is only called a few times to size tries. 
00107 uint8_t RequiredBits(uint64_t max_value);
00108 
00109 struct BitsMask {
00110   static BitsMask ByMax(uint64_t max_value) {
00111     BitsMask ret;
00112     ret.FromMax(max_value);
00113     return ret;
00114   }
00115   static BitsMask ByBits(uint8_t bits) {
00116     BitsMask ret;
00117     ret.bits = bits;
00118     ret.mask = (1ULL << bits) - 1;
00119     return ret;
00120   }
00121   void FromMax(uint64_t max_value) {
00122     bits = RequiredBits(max_value);
00123     mask = (1ULL << bits) - 1;
00124   }
00125   uint8_t bits;
00126   uint64_t mask;
00127 };
00128 
00129 } // namespace util
00130 
00131 #endif // UTIL_BIT_PACKING__