Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/lm/virtual_interface.hh
00001 #ifndef LM_VIRTUAL_INTERFACE__
00002 #define LM_VIRTUAL_INTERFACE__
00003 
00004 #include "lm/word_index.hh"
00005 #include "util/string_piece.hh"
00006 
00007 #include <string>
00008 
00009 namespace lm {
00010 
/* What the scoring routines hand back for a single query. */
struct FullScoreReturn {
  // Probability of the scored word, expressed as log base 10.
  float prob;

  /* Length of the n-gram that was matched.
   *
   * Do NOT treat this as the recombination length.  As an illustration, take
   * a model whose only n-grams are:
   *   -1      foo
   *   -3.14   bar
   *   -2.718  baz -5
   *   -6      foo bar
   *
   * Scoring ``bar'' reports ngram_length == 1, but the recombination state is
   * the empty string, because bar has zero backoff and does not extend to
   * the right.
   * Scoring ``foo'' also reports ngram_length == 1, yet the recombination
   * state is ``foo''.
   *
   * The ideal approach is to keep the output states and compare them
   * directly.  If that is not possible, call out_state.ValidLength() and use
   * that length for recombination.
   */
  unsigned char ngram_length;
};
00034 
00035 namespace base {
00036 
00037 template <class T, class U, class V> class ModelFacade;
00038 
00039 /* Vocabulary interface.  Call Index(string) and get a word index for use in
00040  * calling Model.  It provides faster convenience functions for <s>, </s>, and
00041  * <unk> although you can also find these using Index.  
00042  *
00043  * Some models do not load the mapping from index to string.  If you need this,
00044  * check if the model Vocabulary class implements such a function and access it
00045  * directly.  
00046  *
00047  * The Vocabulary object is always owned by the Model and can be retrieved from
00048  * the Model using BaseVocabulary() for this abstract interface or
00049  * GetVocabulary() for the actual implementation (in which case you'll need the
00050  * actual implementation of the Model too).  
00051  */
00052 class Vocabulary {
00053   public:
00054     virtual ~Vocabulary();
00055 
00056     WordIndex BeginSentence() const { return begin_sentence_; }
00057     WordIndex EndSentence() const { return end_sentence_; }
00058     WordIndex NotFound() const { return not_found_; }
00059 
00060     /* Most implementations allow StringPiece lookups and need only override
00061      * Index(StringPiece).  SRI requires null termination and overrides all
00062      * three methods.  
00063      */
00064     virtual WordIndex Index(const StringPiece &str) const = 0;
00065     virtual WordIndex Index(const std::string &str) const {
00066       return Index(StringPiece(str));
00067     }
00068     virtual WordIndex Index(const char *str) const {
00069       return Index(StringPiece(str));
00070     }
00071 
00072   protected:
00073     // Call SetSpecial afterward.  
00074     Vocabulary() {}
00075 
00076     Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
00077       SetSpecial(begin_sentence, end_sentence, not_found);
00078     }
00079 
00080     void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found);
00081 
00082     WordIndex begin_sentence_, end_sentence_, not_found_;
00083 
00084   private:
00085     // Disable copy constructors.  They're private and undefined. 
00086     // Ersatz boost::noncopyable.
00087     Vocabulary(const Vocabulary &);
00088     Vocabulary &operator=(const Vocabulary &);
00089 };
00090 
00091 /* There are two ways to access a Model.  
00092  *
00093  *
00094  * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
00095  *
00096  * Every Model implements the scoring function:
00097  * float Score(
00098  *   const Model::State &in_state,
00099  *   const WordIndex new_word,
00100  *   Model::State &out_state) const;
00101  *
00102  * It can also return the length of n-gram matched by the model:
00103  * FullScoreReturn FullScore(
00104  *   const Model::State &in_state,
00105  *   const WordIndex new_word,
00106  *   Model::State &out_state) const;
00107  *
00108  *
00109  * There are also accessor functions:
00110  * const State &BeginSentenceState() const;
00111  * const State &NullContextState() const;
00112  * const Vocabulary &GetVocabulary() const;
00113  * unsigned int Order() const;
00114  *
00115  * NB: In case you're wondering why the model implementation looks like it's
00116  * missing these methods, see facade.hh.  
00117  *
00118  * This is the fastest way to use a model and presents a normal State class to
00119  * be included in a hypothesis state structure.  
00120  *
00121  *
00122  * OPTION 2: Use the virtual interface below.  
00123  *
00124  * The virtual interface allow you to decide which Model to use at runtime 
00125  * without templatizing everything on the Model type.  However, each Model has
00126  * its own State class, so a single State cannot be efficiently provided (it
00127  * would require using the maximum memory of any Model's State or memory
00128  * allocation with each lookup).  This means you become responsible for
00129  * allocating memory with size StateSize() and passing it to the Score or 
00130  * FullScore functions provided here.  
00131  *
00132  * For example, cdec has a std::string containing the entire state of a
00133  * hypothesis.  It can reserve StateSize bytes in this string for the model
00134  * state.  
00135  *
00136  * All the State objects are POD, so it's ok to use raw memory for storing
00137  * State.
00138  * in_state and out_state must not have the same address. 
00139  */
00140 class Model {
00141   public:
00142     virtual ~Model();
00143 
00144     size_t StateSize() const { return state_size_; }
00145     const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
00146     const void *NullContextMemory() const { return null_context_memory_; }
00147 
00148     // Requires in_state != out_state
00149     virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
00150 
00151     // Requires in_state != out_state
00152     virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
00153 
00154     unsigned char Order() const { return order_; }
00155 
00156     const Vocabulary &BaseVocabulary() const { return *base_vocab_; }
00157 
00158   private:
00159     template <class T, class U, class V> friend class ModelFacade;
00160     explicit Model(size_t state_size) : state_size_(state_size) {}
00161 
00162     const size_t state_size_;
00163     const void *begin_sentence_memory_, *null_context_memory_;
00164 
00165     const Vocabulary *base_vocab_;
00166 
00167     unsigned char order_;
00168 
00169     // Disable copy constructors.  They're private and undefined. 
00170     // Ersatz boost::noncopyable.
00171     Model(const Model &);
00172     Model &operator=(const Model &);
00173 };
00174 
00175 } // namespace base
00176 } // namespace lm
00177 
00178 #endif // LM_VIRTUAL_INTERFACE__