Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/lm/config.hh
00001 #ifndef LM_CONFIG__
00002 #define LM_CONFIG__
00003 
00004 #include <iosfwd>
00005 
00006 #include "lm/lm_exception.hh"
00007 #include "util/mmap.hh"
00008 
00009 /* Configuration for ngram model.  Separate header to reduce pollution. */
00010 
00011 namespace lm { namespace ngram {
00012 
00013 class EnumerateVocab;  // Forward declaration; Config only stores a pointer to it.
00014 // Options controlling how an ngram model is read (ARPA or binary).  All fields are public; Config() sets defaults.
00015 struct Config {
00016   // EFFECTIVE FOR BOTH ARPA AND BINARY READS
00017 
00018   // Where to log messages including the progress bar.  Set to NULL for
00019   // silence.  NOTE(review): stream ownership is not stated; presumably the caller keeps it alive -- confirm.
00020   std::ostream *messages;
00021 
00022   // This will be called with every string in the vocabulary.  See
00023   // enumerate_vocab.hh for more detail.  Config does not take ownership; you
00024   // are still responsible for deleting it (or stack allocating).
00025   EnumerateVocab *enumerate_vocab;
00026 
00027 
00028 
00029   // ONLY EFFECTIVE WHEN READING ARPA
00030   // NOTE(review): WarningAction is not declared in this header; presumably it comes from lm/lm_exception.hh -- confirm.
00031   // What to do when <unk> isn't in the provided model.
00032   WarningAction unknown_missing;
00033   // What to do when <s> or </s> is missing from the model.
00034   // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
00035   WarningAction sentence_marker_missing;
00036 
00037   // What to do with a positive log probability.  For COMPLAIN and SILENT, map
00038   // to 0.
00039   WarningAction positive_log_probability;
00040 
00041   // The log probability to substitute for <unk> if it's missing from the model.
00042   // No effect if the model has <unk> or unknown_missing == THROW_UP.
00043   float unknown_missing_logprob;
00044 
00045   // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
00046   // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
00047   // for sorted variant.
00048   // If you find yourself setting this to a low number, consider using the
00049   // TrieModel which has lower memory consumption.
00050   float probing_multiplier;
00051 
00052   // Amount of memory to use for building.  The actual memory usage will be
00053   // higher since this just sets sort buffer size.  Only applies to trie
00054   // models.  NOTE(review): unit is not stated here; presumably bytes -- confirm.
00055   std::size_t building_memory;
00056 
00057   // Template for temporary directory appropriate for passing to mkdtemp.
00058   // The characters XXXXXX are appended before passing to mkdtemp.  Only
00059   // applies to trie.  If NULL, defaults to write_mmap.  If that's NULL,
00060   // defaults to input file name.
00061   const char *temporary_directory_prefix;  // NOTE(review): raw pointer; presumably the string must outlive its use -- confirm.
00062 
00063   // Level of complaining to do when loading from ARPA instead of binary format.
00064   typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;  // NOTE(review): exact meaning of each level is not documented here.
00065   ARPALoadComplain arpa_complain;
00066 
00067   // While loading an ARPA file, also write out this binary format file.  Set
00068   // to NULL to disable.
00069   const char *write_mmap;
00070 
00071   // Include the vocab in the binary file?  Only effective if write_mmap != NULL.
00072   bool include_vocab;
00073 
00074   // Quantization options.  Only effective for QuantTrieModel.  One value is
00075   // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
00076   // to quantize (and one of the remaining backoffs will be 0).
00077   uint8_t prob_bits, backoff_bits;  // NOTE(review): no direct <stdint.h> include in this header; presumably arrives transitively -- confirm.
00078 
00079   // Bhiksha compression (simple form).  Only works with trie.
00080   uint8_t pointer_bhiksha_bits;
00081 
00082   
00083   
00084   // ONLY EFFECTIVE WHEN READING BINARY
00085   
00086   // How to get the giant array into memory: lazy mmap, populate, read etc.
00087   // See util/mmap.hh for details of LoadMethod.
00088   util::LoadMethod load_method;
00089 
00090 
00091 
00092   // Set defaults.  Only declared here; the default values live in the definition, which is not part of this header.
00093   Config();
00094 };
00095 
00096 } /* namespace ngram */ } /* namespace lm */
00097 
00098 #endif // LM_CONFIG__