|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 #ifndef UTIL_FILE_PIECE__ 00002 #define UTIL_FILE_PIECE__ 00003 00004 #include "util/ersatz_progress.hh" 00005 #include "util/exception.hh" 00006 #include "util/have.hh" 00007 #include "util/mmap.hh" 00008 #include "util/scoped.hh" 00009 #include "util/string_piece.hh" 00010 00011 #include <string> 00012 00013 #include <cstddef> 00014 00015 namespace util { 00016 00017 class EndOfFileException : public Exception { 00018 public: 00019 EndOfFileException() throw(); 00020 ~EndOfFileException() throw(); 00021 }; 00022 00023 class ParseNumberException : public Exception { 00024 public: 00025 explicit ParseNumberException(StringPiece value) throw(); 00026 ~ParseNumberException() throw() {} 00027 }; 00028 00029 class GZException : public Exception { 00030 public: 00031 explicit GZException(void *file); 00032 GZException() throw() {} 00033 ~GZException() throw() {} 00034 }; 00035 00036 int OpenReadOrThrow(const char *name); 00037 00038 extern const bool kSpaces[256]; 00039 00040 // Return value for SizeFile when it can't size properly. 00041 const off_t kBadSize = -1; 00042 off_t SizeFile(int fd); 00043 00044 // Memory backing the returned StringPiece may vanish on the next call. 00045 class FilePiece { 00046 public: 00047 // 32 MB default. 00048 explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); 00049 // Takes ownership of fd. name is used for messages. 00050 explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); 00051 00052 ~FilePiece(); 00053 00054 char get() { 00055 if (position_ == position_end_) { 00056 Shift(); 00057 if (at_end_) throw EndOfFileException(); 00058 } 00059 return *(position_++); 00060 } 00061 00062 // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). 00063 StringPiece ReadDelimited(const bool *delim = kSpaces) { 00064 SkipSpaces(delim); 00065 return Consume(FindDelimiterOrEOF(delim)); 00066 } 00067 00068 // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. 00069 // It is similar to getline in that way. 00070 StringPiece ReadLine(char delim = '\n'); 00071 00072 float ReadFloat(); 00073 double ReadDouble(); 00074 long int ReadLong(); 00075 unsigned long int ReadULong(); 00076 00077 // Skip spaces defined by isspace. 00078 void SkipSpaces(const bool *delim = kSpaces) { 00079 for (; ; ++position_) { 00080 if (position_ == position_end_) Shift(); 00081 if (!delim[static_cast<unsigned char>(*position_)]) return; 00082 } 00083 } 00084 00085 off_t Offset() const { 00086 return position_ - data_.begin() + mapped_offset_; 00087 } 00088 00089 const std::string &FileName() const { return file_name_; } 00090 00091 private: 00092 void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); 00093 00094 template <class T> T ReadNumber(); 00095 00096 StringPiece Consume(const char *to) { 00097 StringPiece ret(position_, to - position_); 00098 position_ = to; 00099 return ret; 00100 } 00101 00102 const char *FindDelimiterOrEOF(const bool *delim = kSpaces); 00103 00104 void Shift(); 00105 // Backends to Shift(). 00106 void MMapShift(off_t desired_begin); 00107 00108 void TransitionToRead(); 00109 void ReadShift(); 00110 00111 const char *position_, *last_space_, *position_end_; 00112 00113 scoped_fd file_; 00114 const off_t total_size_; 00115 const off_t page_; 00116 00117 size_t default_map_size_; 00118 off_t mapped_offset_; 00119 00120 // Order matters: file_ should always be destroyed after this. 00121 scoped_memory data_; 00122 00123 bool at_end_; 00124 bool fallback_to_read_; 00125 00126 ErsatzProgress progress_; 00127 00128 std::string file_name_; 00129 00130 #ifdef HAVE_ZLIB 00131 void *gz_file_; 00132 #endif // HAVE_ZLIB 00133 }; 00134 00135 } // namespace util 00136 00137 #endif // UTIL_FILE_PIECE__