Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/util/file_piece.hh
00001 #ifndef UTIL_FILE_PIECE__
00002 #define UTIL_FILE_PIECE__
00003 
00004 #include "util/ersatz_progress.hh"
00005 #include "util/exception.hh"
00006 #include "util/have.hh"
00007 #include "util/mmap.hh"
00008 #include "util/scoped.hh"
00009 #include "util/string_piece.hh"
00010 
00011 #include <string>
00012 
00013 #include <cstddef>
00014 
00015 namespace util {
00016 
00017 class EndOfFileException : public Exception {
00018   public:
00019     EndOfFileException() throw();
00020     ~EndOfFileException() throw();
00021 };
00022 
00023 class ParseNumberException : public Exception {
00024   public:
00025     explicit ParseNumberException(StringPiece value) throw();
00026     ~ParseNumberException() throw() {}
00027 };
00028 
00029 class GZException : public Exception {
00030   public:
00031     explicit GZException(void *file);
00032     GZException() throw() {}
00033     ~GZException() throw() {}
00034 };
00035 
// Opens the named file and returns an int (a file descriptor, judging by how
// FilePiece takes ownership of an fd).  The name suggests it throws on
// failure; the definition is not in this header.
int OpenReadOrThrow(const char *name);

// Lookup table with one entry per unsigned char value.  It is the default
// delimiter set for FilePiece::ReadDelimited and FilePiece::SkipSpaces.
extern const bool kSpaces[256];

// Sentinel that SizeFile returns when it cannot determine the size.
const off_t kBadSize = static_cast<off_t>(-1);

// Returns the size of the file behind fd, or kBadSize when it can't be sized.
off_t SizeFile(int fd);
00043 
00044 // Memory backing the returned StringPiece may vanish on the next call.  
00045 class FilePiece {
00046   public:
00047     // 32 MB default.
00048     explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
00049     // Takes ownership of fd.  name is used for messages.  
00050     explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432);
00051 
00052     ~FilePiece();
00053      
00054     char get() { 
00055       if (position_ == position_end_) {
00056         Shift();
00057         if (at_end_) throw EndOfFileException();
00058       }
00059       return *(position_++);
00060     }
00061 
00062     // Leaves the delimiter, if any, to be returned by get().  Delimiters defined by isspace().  
00063     StringPiece ReadDelimited(const bool *delim = kSpaces) {
00064       SkipSpaces(delim);
00065       return Consume(FindDelimiterOrEOF(delim));
00066     }
00067 
00068     // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter.
00069     // It is similar to getline in that way.  
00070     StringPiece ReadLine(char delim = '\n');
00071 
00072     float ReadFloat();
00073     double ReadDouble();
00074     long int ReadLong();
00075     unsigned long int ReadULong();
00076 
00077     // Skip spaces defined by isspace.  
00078     void SkipSpaces(const bool *delim = kSpaces) {
00079       for (; ; ++position_) {
00080         if (position_ == position_end_) Shift();
00081         if (!delim[static_cast<unsigned char>(*position_)]) return;
00082       }
00083     }
00084 
00085     off_t Offset() const {
00086       return position_ - data_.begin() + mapped_offset_;
00087     }
00088 
00089     const std::string &FileName() const { return file_name_; }
00090     
00091   private:
00092     void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer);
00093 
00094     template <class T> T ReadNumber();
00095 
00096     StringPiece Consume(const char *to) {
00097       StringPiece ret(position_, to - position_);
00098       position_ = to;
00099       return ret;
00100     }
00101 
00102     const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
00103 
00104     void Shift();
00105     // Backends to Shift().
00106     void MMapShift(off_t desired_begin);
00107 
00108     void TransitionToRead();
00109     void ReadShift();
00110 
00111     const char *position_, *last_space_, *position_end_;
00112 
00113     scoped_fd file_;
00114     const off_t total_size_;
00115     const off_t page_;
00116 
00117     size_t default_map_size_;
00118     off_t mapped_offset_;
00119 
00120     // Order matters: file_ should always be destroyed after this.
00121     scoped_memory data_;
00122 
00123     bool at_end_;
00124     bool fallback_to_read_;
00125 
00126     ErsatzProgress progress_;
00127 
00128     std::string file_name_;
00129 
00130 #ifdef HAVE_ZLIB
00131     void *gz_file_;
00132 #endif // HAVE_ZLIB
00133 };
00134 
00135 } // namespace util
00136 
00137 #endif // UTIL_FILE_PIECE__