|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
#ifndef LM_BLANK__
#define LM_BLANK__

#include <limits>

#include <inttypes.h>
#include <math.h>
#include <string.h>

namespace lm {
namespace ngram {

/* Suppose "foo bar" appears with zero backoff but there is no trigram
 * beginning with these words.  Then, when scoring "foo bar", the model could
 * return out_state containing "bar" or even null context if "bar" also has no
 * backoff and is never followed by another word.  Then the backoff is set to
 * kNoExtensionBackoff.  If the n-gram might be extended, then out_state must
 * contain the full n-gram, in which case kExtensionBackoff is set.  In any
 * case, if an n-gram has non-zero backoff, the full state is returned so
 * backoff can be properly charged.
 * These differ only in sign bit because the backoff is in fact zero in either
 * case.
 */
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
const uint64_t kNoExtensionQuant = 0;
const uint64_t kExtensionQuant = 1;

/* Mark a zero backoff as "may be extended".
 *
 * Floating-point == treats -0.0 and +0.0 as equal, so any zero backoff
 * (either sign) is overwritten with kExtensionBackoff (+0.0).  Non-zero
 * backoffs are left untouched.
 */
inline void SetExtension(float &backoff) {
  if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}

/* Return true unless backoff is bit-for-bit identical to kNoExtensionBackoff.
 *
 * The comparison has to be done on the raw bit patterns: a floating-point ==
 * cannot tell -0.0 (no extension) from +0.0 (extension).  The bits are copied
 * out with memcpy rather than read through a union, because reading a union
 * member other than the one last written is undefined behaviour in C++.
 * Compilers typically reduce these fixed-size memcpy calls to plain register
 * moves, so this still compiles down nicely.
 */
inline bool HasExtension(const float &backoff) {
  uint32_t no_extension_bits;
  uint32_t backoff_bits;
  memcpy(&no_extension_bits, &kNoExtensionBackoff, sizeof(no_extension_bits));
  memcpy(&backoff_bits, &backoff, sizeof(backoff_bits));
  return no_extension_bits != backoff_bits;
}

/* Suppose "foo bar baz quux" appears in the ARPA but not "bar baz quux" or
 * "baz quux" (because they were pruned).  1.2% of n-grams generated by SRI
 * with default settings on the benchmark data set are like this.  Since search
 * proceeds by finding "quux", "baz quux", "bar baz quux", and finally
 * "foo bar baz quux" and the trie needs pointer nodes anyway, blanks are
 * inserted.  The blanks have probability kBlankProb and backoff kBlankBackoff.
 * A blank is recognized by kBlankProb in the probability field; kBlankBackoff
 * must be 0 so that inference assesses zero backoff from these blanks.
 */
const float kBlankProb = -std::numeric_limits<float>::infinity();
const float kBlankBackoff = kNoExtensionBackoff;
const uint32_t kBlankProbQuant = 0;
const uint32_t kBlankBackoffQuant = 0;

} // namespace ngram
} // namespace lm
#endif // LM_BLANK__