Joshua
An open-source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/lm/binary_format.hh
00001 #ifndef LM_BINARY_FORMAT__
00002 #define LM_BINARY_FORMAT__
00003 
00004 #include "lm/config.hh"
00005 #include "lm/read_arpa.hh"
00006 
00007 #include "util/file_piece.hh"
00008 #include "util/mmap.hh"
00009 #include "util/scoped.hh"
00010 
00011 #include <cstddef>
00012 #include <vector>
00013 
00014 #include <inttypes.h>
00015 
00016 namespace lm {
00017 namespace ngram {
00018 
/* Not the best numbering system, but it grew this way for historical reasons
 * and I want to preserve existing binary files. */
// Identifies which data structure a binary LM file contains.  The numeric
// values are written into existing binary files, so they must never be
// renumbered or reused.
typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2, QUANT_TRIE_SORTED=3, ARRAY_TRIE_SORTED=4, QUANT_ARRAY_TRIE_SORTED=5} ModelType;

// Offsets that turn TRIE_SORTED into its quantized and/or array variants:
// TRIE_SORTED + kQuantAdd == QUANT_TRIE_SORTED (+1),
// TRIE_SORTED + kArrayAdd == ARRAY_TRIE_SORTED (+2).
const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE_SORTED - TRIE_SORTED);
const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE_SORTED - TRIE_SORTED);
00025 
/* Inspect a file to determine if it is a binary lm.  If not, return false.
 * If so, return true and set recognized to the type.  This is the only API in
 * this header designed for use by decoder authors.
 */
bool RecognizeBinary(const char *file, ModelType &recognized);
00031 
// Fixed-size portion of the binary file header.
// NOTE(review): this struct appears to be stored directly in the file header
// (it is filled by detail::ReadHeader), so member order, sizes, and padding
// would be part of the on-disk format — do not reorder or resize members.
// TODO confirm against the .cc implementation.
struct FixedWidthParameters {
  // Maximum n-gram order of the model.
  unsigned char order;
  // Probing hash multiplier recorded in the file; on load it replaces the
  // run-time configured Config::probing_multiplier (see LoadLM).
  float probing_multiplier;
  // What type of model is this?  
  ModelType model_type;
  // Does the end of the file have the actual strings in the vocabulary?   
  bool has_vocabulary;
};
00040 
// Parameters stored in the header of a binary file.  
struct Parameters {
  // Fixed-size header fields: order, model type, probing multiplier, etc.
  FixedWidthParameters fixed;
  // Counts read from the header and passed to To::Size / FinishFile;
  // presumably one n-gram count per order — TODO confirm in the .cc file.
  std::vector<uint64_t> counts;
};
00046 
// Memory (and the file behind it, if any) backing a loaded model.  The
// util::scoped_* members own their resources, so destroying a Backing
// releases the fd and mappings (see util/scoped.hh and util/mmap.hh).
struct Backing {
  // File behind memory, if any.  
  util::scoped_fd file;
  // Vocabulary lookup table.  Not to be confused with the vocab words themselves.  
  util::scoped_memory vocab;
  // Raw block of memory backing the language model data structures
  util::scoped_memory search;
};
00055 
// Seek fd to absolute offset off; throws (per the OrThrow convention) on failure.
void SeekOrThrow(int fd, off_t off);
// Seek forward by off bytes from the current position; throws on failure.
void AdvanceOrThrow(int fd, off_t off);

// Create just enough of a binary file to write vocabulary to it.  Returns the
// memory address where the vocabulary table should be built.
uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing);
// Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin.  
uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing);

// Write header to binary file.  This is done last to prevent incomplete files
// from loading.   
void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing);
00068 
// Internals shared by LoadLM and the .cc implementation; not for decoder
// authors (use RecognizeBinary above instead).
namespace detail {

// True when fd holds a binary LM; false when the file should be treated as
// ARPA text (see the branch in LoadLM).
bool IsBinaryFormat(int fd);

// Read the header of fd into params.
void ReadHeader(int fd, Parameters &params);

// Check that params (read from a file) are compatible with model_type;
// presumably throws on mismatch — TODO confirm in the .cc file.
void MatchCheck(ModelType model_type, const Parameters &params);

// Position fd just past the header described by params.
void SeekPastHeader(int fd, const Parameters &params);

// Attach memory_size bytes of an existing binary file to backing, returning
// the address handed to InitializeFromBinary (see LoadLM).
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing);

// Report (per config) that an ARPA text file is being loaded instead of a
// binary one.
void ComplainAboutARPA(const Config &config, ModelType model_type);

} // namespace detail
00084 
00085 template <class To> void LoadLM(const char *file, const Config &config, To &to) {
00086   Backing &backing = to.MutableBacking();
00087   backing.file.reset(util::OpenReadOrThrow(file));
00088 
00089   try {
00090     if (detail::IsBinaryFormat(backing.file.get())) {
00091       Parameters params;
00092       detail::ReadHeader(backing.file.get(), params);
00093       detail::MatchCheck(To::kModelType, params);
00094       // Replace the run-time configured probing_multiplier with the one in the file.  
00095       Config new_config(config);
00096       new_config.probing_multiplier = params.fixed.probing_multiplier;
00097       detail::SeekPastHeader(backing.file.get(), params);
00098       To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config);
00099       std::size_t memory_size = To::Size(params.counts, new_config);
00100       uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing);
00101       to.InitializeFromBinary(start, params, new_config, backing.file.get());
00102     } else {
00103       detail::ComplainAboutARPA(config, To::kModelType);
00104       to.InitializeFromARPA(file, config);
00105     }
00106   } catch (util::Exception &e) {
00107     e << " File: " << file;
00108     throw;
00109   }
00110 }
00111 
00112 } // namespace ngram
00113 } // namespace lm
00114 #endif // LM_BINARY_FORMAT__