Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/lm/blank.hh
00001 #ifndef LM_BLANK__
00002 #define LM_BLANK__
00003 
#include <limits>

#include <inttypes.h>
#include <math.h>
#include <string.h>
00008 
namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
 * beginning with these words.  Then, when scoring "foo bar", the model could
 * return out_state containing "bar" or even null context if "bar" also has no
 * backoff and is never followed by another word.  Then the backoff is set to
 * kNoExtensionBackoff.  If the n-gram might be extended, then out_state must
 * contain the full n-gram, in which case kExtensionBackoff is set.  In any
 * case, if an n-gram has non-zero backoff, the full state is returned so
 * backoff can be properly charged.
 *
 * The two markers differ only in the sign bit because the backoff is in fact
 * zero in either case: negative zero means "no extension", positive zero
 * means "may be extended".
 */
const float kNoExtensionBackoff = -0.0f;
const float kExtensionBackoff = 0.0f;
/* Counterparts of the two markers above for quantized backoff values. */
const uint64_t kNoExtensionQuant = 0;
const uint64_t kExtensionQuant = 1;

/* Mark a zero backoff as extendable.
 *
 * Floating-point == treats +0.0 and -0.0 as equal, so the test below matches
 * either zero and the assignment leaves positive zero (kExtensionBackoff)
 * behind.  Any non-zero backoff is left untouched.
 */
inline void SetExtension(float &backoff) {
  if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

/* Report whether backoff is anything other than the kNoExtensionBackoff
 * marker.
 *
 * The comparison is done on the raw bit patterns, because operator== cannot
 * tell negative zero from positive zero.  Only a value whose bits are exactly
 * those of kNoExtensionBackoff yields false; every other value, including
 * positive zero and all non-zero backoffs, yields true.
 *
 * The bits are extracted with memcpy rather than by reading the inactive
 * member of a union, which is undefined behaviour in C++.  Optimizing
 * compilers typically lower these fixed-size copies to plain register moves.
 * As before, this relies on float and uint32_t having the same size.
 */
inline bool HasExtension(const float &backoff) {
  const float no_extension = kNoExtensionBackoff;
  uint32_t no_extension_bits;
  uint32_t backoff_bits;
  memcpy(&no_extension_bits, &no_extension, sizeof(no_extension_bits));
  memcpy(&backoff_bits, &backoff, sizeof(backoff_bits));
  return no_extension_bits != backoff_bits;
}

/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
 * "baz quux" (because they were pruned).  1.2% of n-grams generated by SRI
 * with default settings on the benchmark data set are like this.  Since search
 * proceeds by finding "quux", "baz quux", "bar baz quux", and finally
 * "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
 * inserted.  The blanks have probability kBlankProb and backoff kBlankBackoff.
 * A blank is recognized by kBlankProb in the probability field; kBlankBackoff
 * must be 0 so that inference assesses zero backoff from these blanks.
 */
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;
/* Quantized counterparts of the blank probability and blank backoff. */
const uint32_t kBlankProbQuant = 0;
const uint32_t kBlankBackoffQuant = 0;

} // namespace ngram
} // namespace lm
00057 #endif // LM_BLANK__