|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef LM_CONFIG__ 00002 #define LM_CONFIG__ 00003 00004 #include <iosfwd> 00005 00006 #include "lm/lm_exception.hh" 00007 #include "util/mmap.hh" 00008 00009 /* Configuration for ngram model. Separate header to reduce pollution. */ 00010 00011 namespace lm { namespace ngram { 00012 00013 class EnumerateVocab; 00014 00015 struct Config { 00016 // EFFECTIVE FOR BOTH ARPA AND BINARY READS 00017 00018 // Where to log messages including the progress bar. Set to NULL for 00019 // silence. 00020 std::ostream *messages; 00021 00022 // This will be called with every string in the vocabulary. See 00023 // enumerate_vocab.hh for more detail. Config does not take ownership; you 00024 // are still responsible for deleting it (or stack allocating). 00025 EnumerateVocab *enumerate_vocab; 00026 00027 00028 00029 // ONLY EFFECTIVE WHEN READING ARPA 00030 00031 // What to do when <unk> isn't in the provided model. 00032 WarningAction unknown_missing; 00033 // What to do when <s> or </s> is missing from the model. 00034 // If THROW_UP, the exception will be of type util::SpecialWordMissingException. 00035 WarningAction sentence_marker_missing; 00036 00037 // What to do with a positive log probability. For COMPLAIN and SILENT, map 00038 // to 0. 00039 WarningAction positive_log_probability; 00040 00041 // The probability to substitute for <unk> if it's missing from the model. 00042 // No effect if the model has <unk> or unknown_missing == THROW_UP. 00043 float unknown_missing_logprob; 00044 00045 // Size multiplier for probing hash table. Must be > 1. Space is linear in 00046 // this. Time is probing_multiplier / (probing_multiplier - 1). No effect 00047 // for sorted variant. 00048 // If you find yourself setting this to a low number, consider using the 00049 // TrieModel which has lower memory consumption. 00050 float probing_multiplier; 00051 00052 // Amount of memory to use for building. The actual memory usage will be 00053 // higher since this just sets sort buffer size. Only applies to trie 00054 // models. 00055 std::size_t building_memory; 00056 00057 // Template for temporary directory appropriate for passing to mkdtemp. 00058 // The characters XXXXXX are appended before passing to mkdtemp. Only 00059 // applies to trie. If NULL, defaults to write_mmap. If that's NULL, 00060 // defaults to input file name. 00061 const char *temporary_directory_prefix; 00062 00063 // Level of complaining to do when loading from ARPA instead of binary format. 00064 typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain; 00065 ARPALoadComplain arpa_complain; 00066 00067 // While loading an ARPA file, also write out this binary format file. Set 00068 // to NULL to disable. 00069 const char *write_mmap; 00070 00071 // Include the vocab in the binary file? Only effective if write_mmap != NULL. 00072 bool include_vocab; 00073 00074 // Quantization options. Only effective for QuantTrieModel. One value is 00075 // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used 00076 // to quantize (and one of the remaining backoffs will be 0). 00077 uint8_t prob_bits, backoff_bits; 00078 00079 // Bhiksha compression (simple form). Only works with trie. 00080 uint8_t pointer_bhiksha_bits; 00081 00082 00083 00084 // ONLY EFFECTIVE WHEN READING BINARY 00085 00086 // How to get the giant array into memory: lazy mmap, populate, read etc. 00087 // See util/mmap.hh for details of MapMethod. 00088 util::LoadMethod load_method; 00089 00090 00091 00092 // Set defaults. 00093 Config(); 00094 }; 00095 00096 } /* namespace ngram */ } /* namespace lm */ 00097 00098 #endif // LM_CONFIG__