~post/joshua/config_8hh_source.html

00001 #ifndef LM_CONFIG__
00002 #define LM_CONFIG__
00003
00004 #include <iosfwd>
00005
00006 #include "lm/lm_exception.hh"
00007 #include "util/mmap.hh"
00008
00009 /* Configuration for ngram model.  Separate header to reduce pollution. */
00010
00011 namespace lm { namespace ngram {
00012
00013 class EnumerateVocab;
00014
00015 struct Config {  // Configuration options for an ngram model, grouped below by when they take effect.
00016   // EFFECTIVE FOR BOTH ARPA AND BINARY READS
00017
00018   // Where to log messages including the progress bar.  Set to NULL for
00019   // silence.
00020   std::ostream *messages;
00021
00022   // This will be called with every string in the vocabulary.  See
00023   // enumerate_vocab.hh for more detail.  Config does not take ownership; you
00024   // are still responsible for deleting it (or stack allocating).
00025   EnumerateVocab *enumerate_vocab;
00026
00027
00028
00029   // ONLY EFFECTIVE WHEN READING ARPA
00030
00031   // What to do when <unk> isn't in the provided model.
00032   WarningAction unknown_missing;
00033   // What to do when <s> or </s> is missing from the model.
00034   // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
00035   WarningAction sentence_marker_missing;
00036
00037   // What to do with a positive log probability.  For COMPLAIN and SILENT, map
00038   // to 0.
00039   WarningAction positive_log_probability;
00040
00041   // The probability to substitute for <unk> if it's missing from the model.
00042   // No effect if the model has <unk> or unknown_missing == THROW_UP.
00043   float unknown_missing_logprob;
00044
00045   // Size multiplier for probing hash table.  Must be > 1.  Space is linear in
00046   // this.  Time is probing_multiplier / (probing_multiplier - 1).  No effect
00047   // for sorted variant.
00048   // If you find yourself setting this to a low number, consider using the
00049   // TrieModel which has lower memory consumption.
00050   float probing_multiplier;
00051
00052   // Amount of memory to use for building.  The actual memory usage will be
00053   // higher since this just sets sort buffer size.  Only applies to trie
00054   // models.
00055   std::size_t building_memory;
00056
00057   // Template for temporary directory appropriate for passing to mkdtemp.
00058   // The characters XXXXXX are appended before passing to mkdtemp.  Only
00059   // applies to trie.  If NULL, defaults to write_mmap.  If that's NULL,
00060   // defaults to input file name.
00061   const char *temporary_directory_prefix;
00062
00063   // Level of complaining to do when loading from ARPA instead of binary format.
00064   typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
00065   ARPALoadComplain arpa_complain;
00066
00067   // While loading an ARPA file, also write out this binary format file.  Set
00068   // to NULL to disable.
00069   const char *write_mmap;
00070
00071   // Include the vocab in the binary file?  Only effective if write_mmap != NULL.
00072   bool include_vocab;
00073
00074   // Quantization options.  Only effective for QuantTrieModel.  One value is
00075   // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
00076   // to quantize (and one of the remaining backoffs will be 0).
00077   uint8_t prob_bits, backoff_bits;
00078
00079   // Bhiksha compression (simple form).  Only works with trie.
00080   uint8_t pointer_bhiksha_bits;
00081
00082
00083
00084   // ONLY EFFECTIVE WHEN READING BINARY
00085
00086   // How to get the giant array into memory: lazy mmap, populate, read etc.
00087   // See util/mmap.hh for details of LoadMethod.
00088   util::LoadMethod load_method;
00089
00090
00091
00092   // Sets defaults.  Only declared here; the definition is not in this header.
00093   Config();
00094 };
00095
00096 } /* namespace ngram */ } /* namespace lm */
00097
00098 #endif // LM_CONFIG__