Joshua
open source statistical hierarchical phrase-based machine translation system
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
src/joshua/decoder/ff/lm/kenlm/util/string_piece.hh
00001 /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n.  If
00002  * you don't use ICU, then this will use the Google implementation from Chrome.
00003  * This has been modified from the original version to let you choose.  
00004  */
00005 
00006 // Copyright 2008, Google Inc.
00007 // All rights reserved.
00008 //
00009 // Redistribution and use in source and binary forms, with or without
00010 // modification, are permitted provided that the following conditions are
00011 // met:
00012 //
00013 //    * Redistributions of source code must retain the above copyright
00014 // notice, this list of conditions and the following disclaimer.
00015 //    * Redistributions in binary form must reproduce the above
00016 // copyright notice, this list of conditions and the following disclaimer
00017 // in the documentation and/or other materials provided with the
00018 // distribution.
00019 //    * Neither the name of Google Inc. nor the names of its
00020 // contributors may be used to endorse or promote products derived from
00021 // this software without specific prior written permission.
00022 //
00023 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00024 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00025 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00026 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00027 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00028 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00029 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00030 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00031 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00032 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00033 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00034 // Copied from strings/stringpiece.h with modifications
00035 //
00036 // A string-like object that points to a sized piece of memory.
00037 //
00038 // Functions or methods may use const StringPiece& parameters to accept either
00039 // a "const char*" or a "string" value that will be implicitly converted to
00040 // a StringPiece.  The implicit conversion means that it is often appropriate
00041 // to include this .h file in other files rather than forward-declaring
00042 // StringPiece as would be appropriate for most other Google classes.
00043 //
00044 // Systematic usage of StringPiece is encouraged as it will reduce unnecessary
00045 // conversions from "const char*" to "string" and back again.
00046 //
00047 
00048 #ifndef BASE_STRING_PIECE_H__
00049 #define BASE_STRING_PIECE_H__
00050 
00051 #include "util/have.hh"
00052 
00053 #ifdef HAVE_BOOST
00054 #include <boost/functional/hash/hash.hpp>
00055 #endif // HAVE_BOOST
00056 
00057 #include <cstring>
00058 #include <iosfwd>
00059 #include <ostream>
00060 
00061 #ifdef HAVE_ICU
00062 #include <unicode/stringpiece.h>
00063 #include <unicode/uversion.h>
00064 
00065 // Old versions of ICU don't define operator== and operator!=.  
00066 #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4))
00067 #warning You are using an old version of ICU.  Consider upgrading to ICU >= 4.6.  
00068 inline bool operator==(const StringPiece& x, const StringPiece& y) {
00069   if (x.size() != y.size())
00070     return false;
00071 
00072   return std::memcmp(x.data(), y.data(), x.size()) == 0;
00073 }
00074 
00075 inline bool operator!=(const StringPiece& x, const StringPiece& y) {
00076   return !(x == y);
00077 }
00078 #endif // old version of ICU
00079 
00080 U_NAMESPACE_BEGIN
00081 #else
00082 
00083 #include <algorithm>
00084 #include <cstddef>
00085 #include <string>
00086 #include <string.h>
00087 
// A non-owning view of a contiguous run of chars, stored as a pointer plus a
// length.  The piece never copies or frees the bytes it refers to, so the
// underlying buffer must outlive every StringPiece that points into it.
class StringPiece {
 public:
  typedef size_t size_type;

 private:
  const char*   ptr_;     // First viewed byte; NULL after StringPiece()/clear().
  size_type     length_;  // Number of viewed bytes.

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.

  // Empty piece with a NULL data pointer.
  StringPiece() : ptr_(NULL), length_(0) { }
  // Views a NUL-terminated C string; a NULL pointer yields an empty piece.
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { }
  // Views the bytes currently held by str.
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(str.size()) { }
  // Views len bytes starting at offset.
  StringPiece(const char* offset, size_type len)
    : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  size_type size() const { return length_; }
  size_type length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Resets to the same state as a default-constructed piece.
  void clear() { ptr_ = NULL; length_ = 0; }
  // Re-points the piece at len bytes starting at data.
  void set(const char* data, size_type len) { ptr_ = data; length_ = len; }
  // Re-points the piece at a NUL-terminated C string (NULL means empty).
  void set(const char* str) {
    ptr_ = str;
    length_ = str ? strlen(str) : 0;
  }
  // Re-points the piece at len raw bytes starting at data.
  void set(const void* data, size_type len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // Unchecked element access; the caller must ensure i < size().
  char operator[](size_type i) const { return ptr_[i]; }

  // Drops the first n bytes.  No bounds check is performed: the caller must
  // ensure n <= size(), otherwise the unsigned length wraps around.
  void remove_prefix(size_type n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes.  No bounds check is performed: the caller must
  // ensure n <= size(), otherwise the unsigned length wraps around.
  void remove_suffix(size_type n) {
    length_ -= n;
  }

  // Three-way byte-wise comparison: negative, zero or positive when *this
  // orders before, equal to or after x.  When one piece is a prefix of the
  // other, the shorter one orders first.
  int compare(const StringPiece& x) const {
    int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_));
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  // Returns an owning copy of the viewed bytes.
  std::string as_string() const {
    // std::string doesn't like to take a NULL pointer even with a 0 size.
    return std::string(!empty() ? data() : "", size());
  }

  // Declared here, defined out of line.
  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }

  // Both report the current length: a view has no spare capacity.
  size_type max_size() const { return length_; }
  size_type capacity() const { return length_; }

  // Search and copy helpers; declared here, defined out of line.
  size_type copy(char* buf, size_type n, size_type pos = 0) const;

  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

  size_type find_first_of(const StringPiece& s, size_type pos = 0) const;
  // Searching for a single char is the same as find().
  size_type find_first_of(char c, size_type pos = 0) const {
    return find(c, pos);
  }
  size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const;
  size_type find_first_not_of(char c, size_type pos = 0) const;
  size_type find_last_of(const StringPiece& s, size_type pos = npos) const;
  // Searching backwards for a single char is the same as rfind().
  size_type find_last_of(char c, size_type pos = npos) const {
    return rfind(c, pos);
  }
  size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const;
  size_type find_last_not_of(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  // Compares the first N bytes of p and p2 with memcmp semantics.
  static int wordmemcmp(const char* p, const char* p2, size_type N) {
    // A default-constructed or cleared piece holds a NULL pointer, and
    // memcmp has undefined behavior on null pointers even when N is zero.
    // Zero bytes always compare equal, so answer without calling it.
    if (N == 0) return 0;
    return memcmp(p, p2, N);
  }
};
00217 
00218 inline bool operator==(const StringPiece& x, const StringPiece& y) {
00219   if (x.size() != y.size())
00220     return false;
00221 
00222   return std::memcmp(x.data(), y.data(), x.size()) == 0;
00223 }
00224 
00225 inline bool operator!=(const StringPiece& x, const StringPiece& y) {
00226   return !(x == y);
00227 }
00228 
00229 #endif // HAVE_ICU undefined
00230 
00231 inline bool operator<(const StringPiece& x, const StringPiece& y) {
00232   const int r = std::memcmp(x.data(), y.data(),
00233                                        std::min(x.size(), y.size()));
00234   return ((r < 0) || ((r == 0) && (x.size() < y.size())));
00235 }
00236 
00237 inline bool operator>(const StringPiece& x, const StringPiece& y) {
00238   return y < x;
00239 }
00240 
00241 inline bool operator<=(const StringPiece& x, const StringPiece& y) {
00242   return !(x > y);
00243 }
00244 
00245 inline bool operator>=(const StringPiece& x, const StringPiece& y) {
00246   return !(x < y);
00247 }
00248 
00249 // allow StringPiece to be logged (needed for unit testing).
00250 inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
00251   return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
00252 }
00253 
00254 #ifdef HAVE_BOOST
00255 inline size_t hash_value(const StringPiece &str) {
00256   return boost::hash_range(str.data(), str.data() + str.length());
00257 }
00258 
00259 /* Support for lookup of StringPiece in boost::unordered_map<std::string> */
00260 struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
00261   size_t operator()(const StringPiece &str) const {
00262     return hash_value(str);
00263   }
00264 };
00265 
00266 struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const std::string &, bool> {
00267   bool operator()(const StringPiece &first, const StringPiece &second) const {
00268     return first == second;
00269   }
00270 };
00271 template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
00272   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
00273 }
00274 template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
00275   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
00276 }
00277 #endif
00278 
00279 #ifdef HAVE_ICU
00280 U_NAMESPACE_END
00281 #endif
00282 
00283 #endif  // BASE_STRING_PIECE_H__