|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 /* Simple implementation of 00002 * @inproceedings{bhikshacompression, 00003 * author={Bhiksha Raj and Ed Whittaker}, 00004 * year={2003}, 00005 * title={Lossless Compression of Language Model Structure and Word Identifiers}, 00006 * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing}, 00007 * pages={388--391}, 00008 * } 00009 * 00010 * Currently only used for next pointers. 00011 */ 00012 00013 #include <inttypes.h> 00014 00015 #include "lm/binary_format.hh" 00016 #include "lm/trie.hh" 00017 #include "util/bit_packing.hh" 00018 #include "util/sorted_uniform.hh" 00019 00020 namespace lm { 00021 namespace ngram { 00022 class Config; 00023 00024 namespace trie { 00025 00026 class DontBhiksha { 00027 public: 00028 static const ModelType kModelTypeAdd = static_cast<ModelType>(0); 00029 00030 static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {} 00031 00032 static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } 00033 00034 static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { 00035 return util::RequiredBits(max_next); 00036 } 00037 00038 DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config); 00039 00040 void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const { 00041 out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask); 00042 out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask); 00043 //assert(out.end >= out.begin); 00044 } 00045 00046 void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) { 00047 util::WriteInt57(base, bit_offset, next_.bits, value); 00048 } 00049 00050 void FinishedLoading(const Config &/*config*/) {} 00051 00052 void LoadedBinary() {} 00053 00054 uint8_t InlineBits() const { return next_.bits; } 00055 00056 private: 00057 util::BitsMask next_; 00058 }; 00059 00060 class ArrayBhiksha { 00061 public: 00062 static const ModelType kModelTypeAdd = kArrayAdd; 00063 00064 static void UpdateConfigFromBinary(int fd, Config &config); 00065 00066 static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); 00067 00068 static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); 00069 00070 ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config); 00071 00072 void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const { 00073 const uint64_t *begin_it = util::BinaryBelow(util::IdentityAccessor<uint64_t>(), offset_begin_, offset_end_, index); 00074 const uint64_t *end_it; 00075 for (end_it = begin_it; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {} 00076 --end_it; 00077 out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | 00078 util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); 00079 out.end = ((end_it - offset_begin_) << next_inline_.bits) | 00080 util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); 00081 } 00082 00083 void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) { 00084 uint64_t encode = value >> next_inline_.bits; 00085 for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index; 00086 util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask); 00087 } 00088 00089 void FinishedLoading(const Config &config); 00090 00091 void LoadedBinary(); 00092 00093 uint8_t InlineBits() const { return next_inline_.bits; } 00094 00095 private: 00096 const util::BitsMask next_inline_; 00097 00098 const uint64_t *const offset_begin_; 00099 const uint64_t *const offset_end_; 00100 00101 uint64_t *write_to_; 00102 00103 void *original_base_; 00104 }; 00105 00106 } // namespace trie 00107 } // namespace ngram 00108 } // namespace lm