|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_VIRTUAL_INTERFACE__ 00002 #define LM_VIRTUAL_INTERFACE__ 00003 00004 #include "lm/word_index.hh" 00005 #include "util/string_piece.hh" 00006 00007 #include <string> 00008 00009 namespace lm { 00010 00011 /* Structure returned by scoring routines. */ 00012 struct FullScoreReturn { 00013 // log10 probability 00014 float prob; 00015 00016 /* The length of n-gram matched. Do not use this for recombination. 00017 * Consider a model containing only the following n-grams: 00018 * -1 foo 00019 * -3.14 bar 00020 * -2.718 baz -5 00021 * -6 foo bar 00022 * 00023 * If you score ``bar'' then ngram_length is 1 and recombination state is the 00024 * empty string because bar has zero backoff and does not extend to the 00025 * right. 00026 * If you score ``foo'' then ngram_length is 1 and recombination state is 00027 * ``foo''. 00028 * 00029 * Ideally, keep output states around and compare them. Failing that, 00030 * get out_state.ValidLength() and use that length for recombination. 00031 */ 00032 unsigned char ngram_length; 00033 }; 00034 00035 namespace base { 00036 00037 template <class T, class U, class V> class ModelFacade; 00038 00039 /* Vocabulary interface. Call Index(string) and get a word index for use in 00040 * calling Model. It provides faster convenience functions for <s>, </s>, and 00041 * <unk> although you can also find these using Index. 00042 * 00043 * Some models do not load the mapping from index to string. If you need this, 00044 * check if the model Vocabulary class implements such a function and access it 00045 * directly. 00046 * 00047 * The Vocabulary object is always owned by the Model and can be retrieved from 00048 * the Model using BaseVocabulary() for this abstract interface or 00049 * GetVocabulary() for the actual implementation (in which case you'll need the 00050 * actual implementation of the Model too). 00051 */ 00052 class Vocabulary { 00053 public: 00054 virtual ~Vocabulary(); 00055 00056 WordIndex BeginSentence() const { return begin_sentence_; } 00057 WordIndex EndSentence() const { return end_sentence_; } 00058 WordIndex NotFound() const { return not_found_; } 00059 00060 /* Most implementations allow StringPiece lookups and need only override 00061 * Index(StringPiece). SRI requires null termination and overrides all 00062 * three methods. 00063 */ 00064 virtual WordIndex Index(const StringPiece &str) const = 0; 00065 virtual WordIndex Index(const std::string &str) const { 00066 return Index(StringPiece(str)); 00067 } 00068 virtual WordIndex Index(const char *str) const { 00069 return Index(StringPiece(str)); 00070 } 00071 00072 protected: 00073 // Call SetSpecial afterward. 00074 Vocabulary() {} 00075 00076 Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { 00077 SetSpecial(begin_sentence, end_sentence, not_found); 00078 } 00079 00080 void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); 00081 00082 WordIndex begin_sentence_, end_sentence_, not_found_; 00083 00084 private: 00085 // Disable copy constructors. They're private and undefined. 00086 // Ersatz boost::noncopyable. 00087 Vocabulary(const Vocabulary &); 00088 Vocabulary &operator=(const Vocabulary &); 00089 }; 00090 00091 /* There are two ways to access a Model. 00092 * 00093 * 00094 * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). 00095 * 00096 * Every Model implements the scoring function: 00097 * float Score( 00098 * const Model::State &in_state, 00099 * const WordIndex new_word, 00100 * Model::State &out_state) const; 00101 * 00102 * It can also return the length of n-gram matched by the model: 00103 * FullScoreReturn FullScore( 00104 * const Model::State &in_state, 00105 * const WordIndex new_word, 00106 * Model::State &out_state) const; 00107 * 00108 * 00109 * There are also accessor functions: 00110 * const State &BeginSentenceState() const; 00111 * const State &NullContextState() const; 00112 * const Vocabulary &GetVocabulary() const; 00113 * unsigned int Order() const; 00114 * 00115 * NB: In case you're wondering why the model implementation looks like it's 00116 * missing these methods, see facade.hh. 00117 * 00118 * This is the fastest way to use a model and presents a normal State class to 00119 * be included in a hypothesis state structure. 00120 * 00121 * 00122 * OPTION 2: Use the virtual interface below. 00123 * 00124 * The virtual interface allow you to decide which Model to use at runtime 00125 * without templatizing everything on the Model type. However, each Model has 00126 * its own State class, so a single State cannot be efficiently provided (it 00127 * would require using the maximum memory of any Model's State or memory 00128 * allocation with each lookup). This means you become responsible for 00129 * allocating memory with size StateSize() and passing it to the Score or 00130 * FullScore functions provided here. 00131 * 00132 * For example, cdec has a std::string containing the entire state of a 00133 * hypothesis. It can reserve StateSize bytes in this string for the model 00134 * state. 00135 * 00136 * All the State objects are POD, so it's ok to use raw memory for storing 00137 * State. 00138 * in_state and out_state must not have the same address. 00139 */ 00140 class Model { 00141 public: 00142 virtual ~Model(); 00143 00144 size_t StateSize() const { return state_size_; } 00145 const void *BeginSentenceMemory() const { return begin_sentence_memory_; } 00146 const void *NullContextMemory() const { return null_context_memory_; } 00147 00148 // Requires in_state != out_state 00149 virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0; 00150 00151 // Requires in_state != out_state 00152 virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; 00153 00154 unsigned char Order() const { return order_; } 00155 00156 const Vocabulary &BaseVocabulary() const { return *base_vocab_; } 00157 00158 private: 00159 template <class T, class U, class V> friend class ModelFacade; 00160 explicit Model(size_t state_size) : state_size_(state_size) {} 00161 00162 const size_t state_size_; 00163 const void *begin_sentence_memory_, *null_context_memory_; 00164 00165 const Vocabulary *base_vocab_; 00166 00167 unsigned char order_; 00168 00169 // Disable copy constructors. They're private and undefined. 00170 // Ersatz boost::noncopyable. 00171 Model(const Model &); 00172 Model &operator=(const Model &); 00173 }; 00174 00175 } // mamespace base 00176 } // namespace lm 00177 00178 #endif // LM_VIRTUAL_INTERFACE__