Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/lm/bhiksha.hh
00001 /* Simple implementation of
00002  * @inproceedings{bhikshacompression,
00003  *  author={Bhiksha Raj and Ed Whittaker},
00004  *  year={2003},
00005  *  title={Lossless Compression of Language Model Structure and Word Identifiers},
00006  *  booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing},
00007  *  pages={388--391},
00008  *  }
00009  *
00010  *  Currently only used for next pointers.  
00011  */
00012 
00013 #include <inttypes.h>
00014 
00015 #include "lm/binary_format.hh"
00016 #include "lm/trie.hh"
00017 #include "util/bit_packing.hh"
00018 #include "util/sorted_uniform.hh"
00019 
00020 namespace lm {
00021 namespace ngram {
00022 class Config;
00023 
00024 namespace trie {
00025 
// Next-pointer storage policy that keeps every pointer inline at full width.
// No side table is used (Size() returns 0); ReadNext and WriteNext move whole
// values through util::ReadInt57 and util::WriteInt57.
// NOTE(review): this class exposes the same member names as ArrayBhiksha, so
// the two are presumably interchangeable policy arguments -- confirm in lm/trie.hh.
00026 class DontBhiksha {
00027   public:
          // Zero, cast to ModelType.
          // NOTE(review): presumably an increment added to a base model type -- confirm in lm/binary_format.hh.
00028     static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
00029 
          // No-op: both arguments are ignored.
00030     static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {}
00031 
          // Always returns 0, whatever the arguments are.
00032     static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; }
00033 
          // Returns util::RequiredBits(max_next); max_offset and the config are ignored.
          // NOTE(review): presumably the bit width needed to represent max_next -- confirm in util/bit_packing.hh.
00034     static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) {
00035       return util::RequiredBits(max_next);
00036     }
00037 
          // Declared here only; the definition is not in this header.
00038     DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config);
00039 
          // Fills out.begin with the value read at bit_offset and out.end with the
          // value read at bit_offset + total_bits, both using next_.bits and next_.mask.
          // The index argument is unused.
          // NOTE(review): total_bits presumably is the width of one record, which would
          // make out.end the pointer stored in the following record -- confirm with the caller.
00040     void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const {
00041       out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask);
00042       out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask);
            // The ordering check below is disabled in this version.
00043       //assert(out.end >= out.begin);
00044     }
00045 
          // Writes value at bit_offset using a field width of next_.bits.
          // The index argument is unused.
00046     void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) {
00047       util::WriteInt57(base, bit_offset, next_.bits, value);
00048     }
00049 
          // No-op.
00050     void FinishedLoading(const Config &/*config*/) {}
00051 
          // No-op.
00052     void LoadedBinary() {}
00053 
          // Returns the field width used by ReadNext and WriteNext (next_.bits).
00054     uint8_t InlineBits() const { return next_.bits; }
00055 
00056   private:
          // Field width (bits) and mask passed to the util read and write helpers.
00057     util::BitsMask next_;
00058 };
00059 
// Next-pointer storage policy that splits each value in two parts.
// The low next_inline_.bits bits are written inline at the given bit offset.
// The remaining high bits are not stored per record: they are recovered from
// the position of an entry inside the array [offset_begin_, offset_end_),
// whose entries are record indices filled in by WriteNext.
00060 class ArrayBhiksha {
00061   public:
          // NOTE(review): kArrayAdd is declared elsewhere; presumably an increment added
          // to a base model type -- confirm in lm/binary_format.hh.
00062     static const ModelType kModelTypeAdd = kArrayAdd;
00063 
          // Declared here only; the definition is not in this header.
00064     static void UpdateConfigFromBinary(int fd, Config &config);
00065 
          // Declared here only; the definition is not in this header.
00066     static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config);
00067 
          // Declared here only; the definition is not in this header.
00068     static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config);
00069 
          // Declared here only. Unlike DontBhiksha, base is a non-const pointer and the
          // third parameter is named max_value rather than max_next.
00070     ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config);
00071 
          // Reconstructs out.begin and out.end for the record numbered index.
          // Each result is (array position << next_inline_.bits) OR-ed with the inline
          // field: out.begin uses the field at bit_offset, out.end uses the field at
          // bit_offset + total_bits.
00072     void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const {
            // NOTE(review): util::BinaryBelow is presumably a binary search returning the
            // last entry that is <= index -- confirm in util/sorted_uniform.hh.
00073       const uint64_t *begin_it = util::BinaryBelow(util::IdentityAccessor<uint64_t>(), offset_begin_, offset_end_, index);
00074       const uint64_t *end_it;
            // Walk forward from begin_it over consecutive entries that are <= index + 1,
            // stopping at offset_end_; the loop body is intentionally empty.
00075       for (end_it = begin_it; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {}
            // Step back onto the last entry that passed the test. If *begin_it were
            // greater than index + 1 the loop would not run and this would land one
            // entry before begin_it.
00076       --end_it;
00077       out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | 
00078         util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
00079       out.end = ((end_it - offset_begin_) << next_inline_.bits) | 
00080         util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
00081     }
00082 
          // Stores value for the record numbered index.
          // The high part (value >> next_inline_.bits) selects an array slot; every slot
          // from write_to_ up to and including that slot is set to index, and write_to_
          // is left one past it. The low part (value & next_inline_.mask) is written
          // inline at bit_offset.
          // This function does not compare write_to_ against offset_end_.
          // NOTE(review): presumably called with increasing index and non-decreasing
          // value so that the array ends up sorted -- confirm with the caller.
00083     void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) {
00084       uint64_t encode = value >> next_inline_.bits;
00085       for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index;
00086       util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask);
00087     }
00088 
          // Declared here only; the definition is not in this header.
00089     void FinishedLoading(const Config &config);
00090 
          // Declared here only; the definition is not in this header.
00091     void LoadedBinary();
00092 
          // Returns the width in bits of the inline field (next_inline_.bits).
00093     uint8_t InlineBits() const { return next_inline_.bits; }
00094 
00095   private:
          // Width (bits) and mask of the part of each value that is stored inline.
00096     const util::BitsMask next_inline_;
00097 
          // Bounds of the array of record indices searched by ReadNext.
00098     const uint64_t *const offset_begin_;
00099     const uint64_t *const offset_end_;
00100 
          // Next array slot that WriteNext will fill.
00101     uint64_t *write_to_;
00102 
          // Not used by the inline code in this header.
          // NOTE(review): presumably the base pointer passed to the constructor -- confirm in the .cc file.
00103     void *original_base_;
00104 };
00105 
00106 } // namespace trie
00107 } // namespace ngram
00108 } // namespace lm