|
Joshua
open source statistical hierarchical phrase-based machine translation system
|
00001 /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If 00002 * you don't use ICU, then this will use the Google implementation from Chrome. 00003 * This has been modified from the original version to let you choose. 00004 */ 00005 00006 // Copyright 2008, Google Inc. 00007 // All rights reserved. 00008 // 00009 // Redistribution and use in source and binary forms, with or without 00010 // modification, are permitted provided that the following conditions are 00011 // met: 00012 // 00013 // * Redistributions of source code must retain the above copyright 00014 // notice, this list of conditions and the following disclaimer. 00015 // * Redistributions in binary form must reproduce the above 00016 // copyright notice, this list of conditions and the following disclaimer 00017 // in the documentation and/or other materials provided with the 00018 // distribution. 00019 // * Neither the name of Google Inc. nor the names of its 00020 // contributors may be used to endorse or promote products derived from 00021 // this software without specific prior written permission. 00022 // 00023 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00024 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00025 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00026 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 00027 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00028 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00029 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00030 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00031 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00032 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00033 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00034 // Copied from strings/stringpiece.h with modifications 00035 // 00036 // A string-like object that points to a sized piece of memory. 00037 // 00038 // Functions or methods may use const StringPiece& parameters to accept either 00039 // a "const char*" or a "string" value that will be implicitly converted to 00040 // a StringPiece. The implicit conversion means that it is often appropriate 00041 // to include this .h file in other files rather than forward-declaring 00042 // StringPiece as would be appropriate for most other Google classes. 00043 // 00044 // Systematic usage of StringPiece is encouraged as it will reduce unnecessary 00045 // conversions from "const char*" to "string" and back again. 00046 // 00047 00048 #ifndef BASE_STRING_PIECE_H__ 00049 #define BASE_STRING_PIECE_H__ 00050 00051 #include "util/have.hh" 00052 00053 #ifdef HAVE_BOOST 00054 #include <boost/functional/hash/hash.hpp> 00055 #endif // HAVE_BOOST 00056 00057 #include <cstring> 00058 #include <iosfwd> 00059 #include <ostream> 00060 00061 #ifdef HAVE_ICU 00062 #include <unicode/stringpiece.h> 00063 #include <unicode/uversion.h> 00064 00065 // Old versions of ICU don't define operator== and operator!=. 00066 #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) 00067 #warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. 00068 inline bool operator==(const StringPiece& x, const StringPiece& y) { 00069 if (x.size() != y.size()) 00070 return false; 00071 00072 return std::memcmp(x.data(), y.data(), x.size()) == 0; 00073 } 00074 00075 inline bool operator!=(const StringPiece& x, const StringPiece& y) { 00076 return !(x == y); 00077 } 00078 #endif // old version of ICU 00079 00080 U_NAMESPACE_BEGIN 00081 #else 00082 00083 #include <algorithm> 00084 #include <cstddef> 00085 #include <string> 00086 #include <string.h> 00087 00088 class StringPiece { 00089 public: 00090 typedef size_t size_type; 00091 00092 private: 00093 const char* ptr_; 00094 size_type length_; 00095 00096 public: 00097 // We provide non-explicit singleton constructors so users can pass 00098 // in a "const char*" or a "string" wherever a "StringPiece" is 00099 // expected. 00100 StringPiece() : ptr_(NULL), length_(0) { } 00101 StringPiece(const char* str) 00102 : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } 00103 StringPiece(const std::string& str) 00104 : ptr_(str.data()), length_(str.size()) { } 00105 StringPiece(const char* offset, size_type len) 00106 : ptr_(offset), length_(len) { } 00107 00108 // data() may return a pointer to a buffer with embedded NULs, and the 00109 // returned buffer may or may not be null terminated. Therefore it is 00110 // typically a mistake to pass data() to a routine that expects a NUL 00111 // terminated string. 00112 const char* data() const { return ptr_; } 00113 size_type size() const { return length_; } 00114 size_type length() const { return length_; } 00115 bool empty() const { return length_ == 0; } 00116 00117 void clear() { ptr_ = NULL; length_ = 0; } 00118 void set(const char* data, size_type len) { ptr_ = data; length_ = len; } 00119 void set(const char* str) { 00120 ptr_ = str; 00121 length_ = str ? strlen(str) : 0; 00122 } 00123 void set(const void* data, size_type len) { 00124 ptr_ = reinterpret_cast<const char*>(data); 00125 length_ = len; 00126 } 00127 00128 char operator[](size_type i) const { return ptr_[i]; } 00129 00130 void remove_prefix(size_type n) { 00131 ptr_ += n; 00132 length_ -= n; 00133 } 00134 00135 void remove_suffix(size_type n) { 00136 length_ -= n; 00137 } 00138 00139 int compare(const StringPiece& x) const { 00140 int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); 00141 if (r == 0) { 00142 if (length_ < x.length_) r = -1; 00143 else if (length_ > x.length_) r = +1; 00144 } 00145 return r; 00146 } 00147 00148 std::string as_string() const { 00149 // std::string doesn't like to take a NULL pointer even with a 0 size. 00150 return std::string(!empty() ? data() : "", size()); 00151 } 00152 00153 void CopyToString(std::string* target) const; 00154 void AppendToString(std::string* target) const; 00155 00156 // Does "this" start with "x" 00157 bool starts_with(const StringPiece& x) const { 00158 return ((length_ >= x.length_) && 00159 (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); 00160 } 00161 00162 // Does "this" end with "x" 00163 bool ends_with(const StringPiece& x) const { 00164 return ((length_ >= x.length_) && 00165 (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); 00166 } 00167 00168 // standard STL container boilerplate 00169 typedef char value_type; 00170 typedef const char* pointer; 00171 typedef const char& reference; 00172 typedef const char& const_reference; 00173 typedef ptrdiff_t difference_type; 00174 static const size_type npos; 00175 typedef const char* const_iterator; 00176 typedef const char* iterator; 00177 typedef std::reverse_iterator<const_iterator> const_reverse_iterator; 00178 typedef std::reverse_iterator<iterator> reverse_iterator; 00179 iterator begin() const { return ptr_; } 00180 iterator end() const { return ptr_ + length_; } 00181 const_reverse_iterator rbegin() const { 00182 return const_reverse_iterator(ptr_ + length_); 00183 } 00184 const_reverse_iterator rend() const { 00185 return const_reverse_iterator(ptr_); 00186 } 00187 00188 size_type max_size() const { return length_; } 00189 size_type capacity() const { return length_; } 00190 00191 size_type copy(char* buf, size_type n, size_type pos = 0) const; 00192 00193 size_type find(const StringPiece& s, size_type pos = 0) const; 00194 size_type find(char c, size_type pos = 0) const; 00195 size_type rfind(const StringPiece& s, size_type pos = npos) const; 00196 size_type rfind(char c, size_type pos = npos) const; 00197 00198 size_type find_first_of(const StringPiece& s, size_type pos = 0) const; 00199 size_type find_first_of(char c, size_type pos = 0) const { 00200 return find(c, pos); 00201 } 00202 size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; 00203 size_type find_first_not_of(char c, size_type pos = 0) const; 00204 size_type find_last_of(const StringPiece& s, size_type pos = npos) const; 00205 size_type find_last_of(char c, size_type pos = npos) const { 00206 return rfind(c, pos); 00207 } 00208 size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; 00209 size_type find_last_not_of(char c, size_type pos = npos) const; 00210 00211 StringPiece substr(size_type pos, size_type n = npos) const; 00212 00213 static int wordmemcmp(const char* p, const char* p2, size_type N) { 00214 return memcmp(p, p2, N); 00215 } 00216 }; 00217 00218 inline bool operator==(const StringPiece& x, const StringPiece& y) { 00219 if (x.size() != y.size()) 00220 return false; 00221 00222 return std::memcmp(x.data(), y.data(), x.size()) == 0; 00223 } 00224 00225 inline bool operator!=(const StringPiece& x, const StringPiece& y) { 00226 return !(x == y); 00227 } 00228 00229 #endif // HAVE_ICU undefined 00230 00231 inline bool operator<(const StringPiece& x, const StringPiece& y) { 00232 const int r = std::memcmp(x.data(), y.data(), 00233 std::min(x.size(), y.size())); 00234 return ((r < 0) || ((r == 0) && (x.size() < y.size()))); 00235 } 00236 00237 inline bool operator>(const StringPiece& x, const StringPiece& y) { 00238 return y < x; 00239 } 00240 00241 inline bool operator<=(const StringPiece& x, const StringPiece& y) { 00242 return !(x > y); 00243 } 00244 00245 inline bool operator>=(const StringPiece& x, const StringPiece& y) { 00246 return !(x < y); 00247 } 00248 00249 // allow StringPiece to be logged (needed for unit testing). 00250 inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { 00251 return o.write(piece.data(), static_cast<std::streamsize>(piece.size())); 00252 } 00253 00254 #ifdef HAVE_BOOST 00255 inline size_t hash_value(const StringPiece &str) { 00256 return boost::hash_range(str.data(), str.data() + str.length()); 00257 } 00258 00259 /* Support for lookup of StringPiece in boost::unordered_map<std::string> */ 00260 struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> { 00261 size_t operator()(const StringPiece &str) const { 00262 return hash_value(str); 00263 } 00264 }; 00265 00266 struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> { 00267 bool operator()(const StringPiece &first, const StringPiece &second) const { 00268 return first == second; 00269 } 00270 }; 00271 template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { 00272 return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 00273 } 00274 template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) { 00275 return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 00276 } 00277 #endif 00278 00279 #ifdef HAVE_ICU 00280 U_NAMESPACE_END 00281 #endif 00282 00283 #endif // BASE_STRING_PIECE_H__