/* Joshua: an open-source statistical hierarchical phrase-based
 * machine translation system. */
00001 #ifndef LM_BINARY_FORMAT__ 00002 #define LM_BINARY_FORMAT__ 00003 00004 #include "lm/config.hh" 00005 #include "lm/read_arpa.hh" 00006 00007 #include "util/file_piece.hh" 00008 #include "util/mmap.hh" 00009 #include "util/scoped.hh" 00010 00011 #include <cstddef> 00012 #include <vector> 00013 00014 #include <inttypes.h> 00015 00016 namespace lm { 00017 namespace ngram { 00018 00019 /* Not the best numbering system, but it grew this way for historical reasons 00020 * and I want to preserve existing binary files. */ 00021 typedef enum {HASH_PROBING=0, HASH_SORTED=1, TRIE_SORTED=2, QUANT_TRIE_SORTED=3, ARRAY_TRIE_SORTED=4, QUANT_ARRAY_TRIE_SORTED=5} ModelType; 00022 00023 const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE_SORTED - TRIE_SORTED); 00024 const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE_SORTED - TRIE_SORTED); 00025 00026 /*Inspect a file to determine if it is a binary lm. If not, return false. 00027 * If so, return true and set recognized to the type. This is the only API in 00028 * this header designed for use by decoder authors. 00029 */ 00030 bool RecognizeBinary(const char *file, ModelType &recognized); 00031 00032 struct FixedWidthParameters { 00033 unsigned char order; 00034 float probing_multiplier; 00035 // What type of model is this? 00036 ModelType model_type; 00037 // Does the end of the file have the actual strings in the vocabulary? 00038 bool has_vocabulary; 00039 }; 00040 00041 // Parameters stored in the header of a binary file. 00042 struct Parameters { 00043 FixedWidthParameters fixed; 00044 std::vector<uint64_t> counts; 00045 }; 00046 00047 struct Backing { 00048 // File behind memory, if any. 00049 util::scoped_fd file; 00050 // Vocabulary lookup table. Not to be confused with the vocab words themselves. 
00051 util::scoped_memory vocab; 00052 // Raw block of memory backing the language model data structures 00053 util::scoped_memory search; 00054 }; 00055 00056 void SeekOrThrow(int fd, off_t off); 00057 // Seek forward 00058 void AdvanceOrThrow(int fd, off_t off); 00059 00060 // Create just enough of a binary file to write vocabulary to it. 00061 uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing); 00062 // Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin. 00063 uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing); 00064 00065 // Write header to binary file. This is done last to prevent incomplete files 00066 // from loading. 00067 void FinishFile(const Config &config, ModelType model_type, const std::vector<uint64_t> &counts, Backing &backing); 00068 00069 namespace detail { 00070 00071 bool IsBinaryFormat(int fd); 00072 00073 void ReadHeader(int fd, Parameters ¶ms); 00074 00075 void MatchCheck(ModelType model_type, const Parameters ¶ms); 00076 00077 void SeekPastHeader(int fd, const Parameters ¶ms); 00078 00079 uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing); 00080 00081 void ComplainAboutARPA(const Config &config, ModelType model_type); 00082 00083 } // namespace detail 00084 00085 template <class To> void LoadLM(const char *file, const Config &config, To &to) { 00086 Backing &backing = to.MutableBacking(); 00087 backing.file.reset(util::OpenReadOrThrow(file)); 00088 00089 try { 00090 if (detail::IsBinaryFormat(backing.file.get())) { 00091 Parameters params; 00092 detail::ReadHeader(backing.file.get(), params); 00093 detail::MatchCheck(To::kModelType, params); 00094 // Replace the run-time configured probing_multiplier with the one in the file. 
00095 Config new_config(config); 00096 new_config.probing_multiplier = params.fixed.probing_multiplier; 00097 detail::SeekPastHeader(backing.file.get(), params); 00098 To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config); 00099 std::size_t memory_size = To::Size(params.counts, new_config); 00100 uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing); 00101 to.InitializeFromBinary(start, params, new_config, backing.file.get()); 00102 } else { 00103 detail::ComplainAboutARPA(config, To::kModelType); 00104 to.InitializeFromARPA(file, config); 00105 } 00106 } catch (util::Exception &e) { 00107 e << " File: " << file; 00108 throw; 00109 } 00110 } 00111 00112 } // namespace ngram 00113 } // namespace lm 00114 #endif // LM_BINARY_FORMAT__