Joshua — an open source statistical hierarchical phrase-based machine translation system
00001 #ifndef LM_READ_ARPA__ 00002 #define LM_READ_ARPA__ 00003 00004 #include "lm/lm_exception.hh" 00005 #include "lm/word_index.hh" 00006 #include "lm/weights.hh" 00007 #include "util/file_piece.hh" 00008 00009 #include <cstddef> 00010 #include <iosfwd> 00011 #include <vector> 00012 00013 namespace lm { 00014 00015 void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number); 00016 void ReadNGramHeader(util::FilePiece &in, unsigned int length); 00017 00018 void ReadBackoff(util::FilePiece &in, Prob &weights); 00019 void ReadBackoff(util::FilePiece &in, ProbBackoff &weights); 00020 00021 void ReadEnd(util::FilePiece &in); 00022 00023 extern const bool kARPASpaces[256]; 00024 00025 // Positive log probability warning. 00026 class PositiveProbWarn { 00027 public: 00028 PositiveProbWarn() : action_(THROW_UP) {} 00029 00030 explicit PositiveProbWarn(WarningAction action) : action_(action) {} 00031 00032 void Warn(float prob); 00033 00034 private: 00035 WarningAction action_; 00036 }; 00037 00038 template <class Voc> void Read1Gram(util::FilePiece &f, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) { 00039 try { 00040 float prob = f.ReadFloat(); 00041 if (prob > 0.0) { 00042 warn.Warn(prob); 00043 prob = 0.0; 00044 } 00045 if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability"); 00046 ProbBackoff &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))]; 00047 value.prob = prob; 00048 ReadBackoff(f, value); 00049 } catch(util::Exception &e) { 00050 e << " in the 1-gram at byte " << f.Offset(); 00051 throw; 00052 } 00053 } 00054 00055 // Return true if a positive log probability came out. 
00056 template <class Voc> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, ProbBackoff *unigrams, PositiveProbWarn &warn) { 00057 ReadNGramHeader(f, 1); 00058 for (std::size_t i = 0; i < count; ++i) { 00059 Read1Gram(f, vocab, unigrams, warn); 00060 } 00061 vocab.FinishedLoading(unigrams); 00062 } 00063 00064 // Return true if a positive log probability came out. 00065 template <class Voc, class Weights> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) { 00066 try { 00067 weights.prob = f.ReadFloat(); 00068 if (weights.prob > 0.0) { 00069 warn.Warn(weights.prob); 00070 weights.prob = 0.0; 00071 } 00072 for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) { 00073 *vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces)); 00074 } 00075 ReadBackoff(f, weights); 00076 } catch(util::Exception &e) { 00077 e << " in the " << static_cast<unsigned int>(n) << "-gram at byte " << f.Offset(); 00078 throw; 00079 } 00080 } 00081 00082 } // namespace lm 00083 00084 #endif // LM_READ_ARPA__