diff --git a/include/rapidjson/document.h b/include/rapidjson/document.h index 45991872..cffc701a 100644 --- a/include/rapidjson/document.h +++ b/include/rapidjson/document.h @@ -20,6 +20,8 @@ #include "reader.h" #include "internal/meta.h" #include "internal/strfunc.h" +#include "memorystream.h" +#include "encodedstream.h" #include // placement new #ifdef _MSC_VER @@ -2224,6 +2226,42 @@ public: GenericDocument& Parse(const Ch* str) { return Parse(str); } + + template + GenericDocument& Parse(const typename SourceEncoding::Ch* str, size_t length) { + RAPIDJSON_ASSERT(!(parseFlags & kParseInsituFlag)); + MemoryStream ms(static_cast(str), length * sizeof(typename SourceEncoding::Ch)); + EncodedInputStream is(ms); + ParseStream(is); + return *this; + } + + template + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse(str, length); + } + + GenericDocument& Parse(const Ch* str, size_t length) { + return Parse(str, length); + } + +#if RAPIDJSON_HAS_STDSTRING + template + GenericDocument& Parse(const std::basic_string& str) { + // c_str() is constant complexity according to standard. Should be faster than Parse(const char*, size_t) + return Parse(str.c_str()); + } + + template + GenericDocument& Parse(const std::basic_string& str) { + return Parse(str); + } + + GenericDocument& Parse(const std::basic_string& str) { + return Parse(str); + } +#endif // RAPIDJSON_HAS_STDSTRING + //!@} //!@name Handling parse errors diff --git a/include/rapidjson/encodedstream.h b/include/rapidjson/encodedstream.h index 87c90671..877c3acf 100644 --- a/include/rapidjson/encodedstream.h +++ b/include/rapidjson/encodedstream.h @@ -16,6 +16,7 @@ #define RAPIDJSON_ENCODEDSTREAM_H_ #include "stream.h" +#include "memorystream.h" #ifdef __GNUC__ RAPIDJSON_DIAG_PUSH @@ -62,6 +63,30 @@ private: Ch current_; }; +//! Specialized for UTF8 MemoryStream. +template <> +class EncodedInputStream, MemoryStream> { +public: + typedef UTF8<>::Ch Ch; + + EncodedInputStream(MemoryStream& is) : is_(is) { + if (static_cast(is_.Peek()) == 0xEFu) is_.Take(); + if (static_cast(is_.Peek()) == 0xBBu) is_.Take(); + if (static_cast(is_.Peek()) == 0xBFu) is_.Take(); + } + Ch Peek() const { return is_.Peek(); } + Ch Take() { return is_.Take(); } + size_t Tell() const { return is_.Tell(); } + + // Not implemented + void Put(Ch) {} + void Flush() {} + Ch* PutBegin() { return 0; } + size_t PutEnd(Ch*) { return 0; } + + MemoryStream& is_; +}; + //! Output byte stream wrapper with statically bound encoding. /*! \tparam Encoding The interpretation of encoding of the stream. Either UTF8, UTF16LE, UTF16BE, UTF32LE, UTF32BE. diff --git a/include/rapidjson/memorystream.h b/include/rapidjson/memorystream.h index 7381e0b7..1d71d8a4 100644 --- a/include/rapidjson/memorystream.h +++ b/include/rapidjson/memorystream.h @@ -42,8 +42,8 @@ struct MemoryStream { MemoryStream(const Ch *src, size_t size) : src_(src), begin_(src), end_(src + size), size_(size) {} - Ch Peek() const { return (src_ == end_) ? '\0' : *src_; } - Ch Take() { return (src_ == end_) ? '\0' : *src_++; } + Ch Peek() const { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_; } + Ch Take() { return RAPIDJSON_UNLIKELY(src_ == end_) ? '\0' : *src_++; } size_t Tell() const { return static_cast(src_ - begin_); } Ch* PutBegin() { RAPIDJSON_ASSERT(false); return 0; } diff --git a/include/rapidjson/reader.h b/include/rapidjson/reader.h index 51007037..226c9d1c 100644 --- a/include/rapidjson/reader.h +++ b/include/rapidjson/reader.h @@ -19,6 +19,7 @@ #include "allocators.h" #include "stream.h" +#include "encodedstream.h" #include "internal/meta.h" #include "internal/stack.h" #include "internal/strtod.h" @@ -259,6 +260,12 @@ void SkipWhitespace(InputStream& is) { s.Take(); } +inline const char* SkipWhitespace(const char* p, const char* end) { + while (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + return p; +} + #ifdef RAPIDJSON_SSE42 //! Skip whitespace with SSE 4.2 pcmpistrm instruction, testing 16 8-byte characters at once. inline const char *SkipWhitespace_SIMD(const char* p) { @@ -295,6 +302,34 @@ inline const char *SkipWhitespace_SIMD(const char* p) { } } +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + // The middle of string using SIMD + static const char whitespace[16] = " \n\r\t"; + const __m128i w = _mm_loadu_si128(reinterpret_cast(&whitespace[0])); + + for (; p <= end - 16; p += 16) { + const __m128i s = _mm_loadu_si128(reinterpret_cast(p)); + const int r = _mm_cvtsi128_si32(_mm_cmpistrm(w, s, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)); + if (r != 0) { // some of characters is non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return p + offset; +#else + return p + __builtin_ffs(r) - 1; +#endif + } + } + + return SkipWhitespace(p, end); +} + #elif defined(RAPIDJSON_SSE2) //! Skip whitespace with SSE2 instructions, testing 16 8-byte characters at once. @@ -342,6 +377,44 @@ inline const char *SkipWhitespace_SIMD(const char* p) { } } +inline const char *SkipWhitespace_SIMD(const char* p, const char* end) { + // Fast return for single non-whitespace + if (p != end && (*p == ' ' || *p == '\n' || *p == '\r' || *p == '\t')) + ++p; + else + return p; + + // The rest of string + #define C16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c } + static const char whitespaces[4][16] = { C16(' '), C16('\n'), C16('\r'), C16('\t') }; + #undef C16 + + const __m128i w0 = _mm_loadu_si128(reinterpret_cast(&whitespaces[0][0])); + const __m128i w1 = _mm_loadu_si128(reinterpret_cast(&whitespaces[1][0])); + const __m128i w2 = _mm_loadu_si128(reinterpret_cast(&whitespaces[2][0])); + const __m128i w3 = _mm_loadu_si128(reinterpret_cast(&whitespaces[3][0])); + + for (; p <= end - 16; p += 16) { + const __m128i s = _mm_loadu_si128(reinterpret_cast(p)); + __m128i x = _mm_cmpeq_epi8(s, w0); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w1)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w2)); + x = _mm_or_si128(x, _mm_cmpeq_epi8(s, w3)); + unsigned short r = static_cast(~_mm_movemask_epi8(x)); + if (r != 0) { // some of characters may be non-whitespace +#ifdef _MSC_VER // Find the index of first non-whitespace + unsigned long offset; + _BitScanForward(&offset, r); + return p + offset; +#else + return p + __builtin_ffs(r) - 1; +#endif + } + } + + return SkipWhitespace(p, end); +} + #endif // RAPIDJSON_SSE2 #ifdef RAPIDJSON_SIMD @@ -354,6 +427,10 @@ template<> inline void SkipWhitespace(InsituStringStream& is) { template<> inline void SkipWhitespace(StringStream& is) { is.src_ = SkipWhitespace_SIMD(is.src_); } + +template<> inline void SkipWhitespace(EncodedInputStream, MemoryStream>& is) { + is.is_.src_ = SkipWhitespace_SIMD(is.is_.src_, is.is_.end_); +} #endif // RAPIDJSON_SIMD /////////////////////////////////////////////////////////////////////////////// diff --git a/test/perftest/perftest.h b/test/perftest/perftest.h index 0d316027..9e3d4bee 100644 --- a/test/perftest/perftest.h +++ b/test/perftest/perftest.h @@ -30,6 +30,8 @@ # define RAPIDJSON_SSE2 #endif +#define RAPIDJSON_HAS_STDSTRING 1 + //////////////////////////////////////////////////////////////////////////////// // Google Test diff --git a/test/perftest/rapidjsontest.cpp b/test/perftest/rapidjsontest.cpp index 5584178d..2869eb2f 100644 --- a/test/perftest/rapidjsontest.cpp +++ b/test/perftest/rapidjsontest.cpp @@ -187,6 +187,25 @@ TEST_F(RapidJson, SIMD_SUFFIX(DocumentParse_MemoryPoolAllocator)) { } } +TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseLength_MemoryPoolAllocator)) { + for (size_t i = 0; i < kTrialCount; i++) { + Document doc; + doc.Parse(json_, length_); + ASSERT_TRUE(doc.IsObject()); + } +} + +#if RAPIDJSON_HAS_STDSTRING +TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseStdString_MemoryPoolAllocator)) { + const std::string s(json_, length_); + for (size_t i = 0; i < kTrialCount; i++) { + Document doc; + doc.Parse(s); + ASSERT_TRUE(doc.IsObject()); + } +} +#endif + TEST_F(RapidJson, SIMD_SUFFIX(DocumentParseIterative_MemoryPoolAllocator)) { for (size_t i = 0; i < kTrialCount; i++) { Document doc; diff --git a/test/unittest/documenttest.cpp b/test/unittest/documenttest.cpp index 1b1c469f..e6e312f8 100644 --- a/test/unittest/documenttest.cpp +++ b/test/unittest/documenttest.cpp @@ -34,6 +34,8 @@ void ParseCheck(DocumentType& doc) { typedef typename DocumentType::ValueType ValueType; EXPECT_FALSE(doc.HasParseError()); + if (doc.HasParseError()) + printf("Error: %d at %zu\n", static_cast(doc.GetParseError()), doc.GetErrorOffset()); EXPECT_TRUE(static_cast(doc)); EXPECT_TRUE(doc.IsObject()); @@ -93,6 +95,26 @@ void ParseTest() { doc.ParseInsitu(buffer); ParseCheck(doc); free(buffer); + + // Parse(const Ch*, size_t) + size_t length = strlen(json); + buffer = reinterpret_cast(malloc(length * 2)); + memcpy(buffer, json, length); + memset(buffer + length, 'X', length); +#if RAPIDJSON_HAS_STDSTRING + std::string s2(buffer, length); // backup buffer +#endif + doc.SetNull(); + doc.Parse(buffer, length); + free(buffer); + ParseCheck(doc); + +#if RAPIDJSON_HAS_STDSTRING + // Parse(std::string) + doc.SetNull(); + doc.Parse(s2); + ParseCheck(doc); +#endif } TEST(Document, Parse) { @@ -140,6 +162,42 @@ static FILE* OpenEncodedFile(const char* filename) { return 0; } +TEST(Document, Parse_Encoding) { + const char* json = " { \"hello\" : \"world\", \"t\" : true , \"f\" : false, \"n\": null, \"i\":123, \"pi\": 3.1416, \"a\":[1, 2, 3, 4] } "; + + typedef GenericDocument > DocumentType; + DocumentType doc; + + // Parse(const SourceEncoding::Ch*) + // doc.Parse >(json); + // EXPECT_FALSE(doc.HasParseError()); + // EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world")); + + // Parse(const SourceEncoding::Ch*, size_t) + size_t length = strlen(json); + char* buffer = reinterpret_cast(malloc(length * 2)); + memcpy(buffer, json, length); + memset(buffer + length, 'X', length); +#if RAPIDJSON_HAS_STDSTRING + std::string s2(buffer, length); // backup buffer +#endif + doc.SetNull(); + doc.Parse >(buffer, length); + free(buffer); + EXPECT_FALSE(doc.HasParseError()); + if (doc.HasParseError()) + printf("Error: %d at %zu\n", static_cast(doc.GetParseError()), doc.GetErrorOffset()); + EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world")); + +#if RAPIDJSON_HAS_STDSTRING + // Parse(std::string) + doc.SetNull(); + doc.Parse >(s2); + EXPECT_FALSE(doc.HasParseError()); + EXPECT_EQ(0, StrCmp(doc[L"hello"].GetString(), L"world")); +#endif +} + TEST(Document, ParseStream_EncodedInputStream) { // UTF8 -> UTF16 FILE* fp = OpenEncodedFile("utf8.json"); diff --git a/test/unittest/simdtest.cpp b/test/unittest/simdtest.cpp index 6ded7402..1b6fcef3 100644 --- a/test/unittest/simdtest.cpp +++ b/test/unittest/simdtest.cpp @@ -73,6 +73,28 @@ TEST(SIMD, SIMD_SUFFIX(SkipWhitespace)) { TestSkipWhitespace(); } +TEST(SIMD, SIMD_SUFFIX(SkipWhitespace_EncodedMemoryStream)) { + for (size_t step = 1; step < 32; step++) { + char buffer[1024]; + for (size_t i = 0; i < 1024; i++) + buffer[i] = " \t\r\n"[i % 4]; + for (size_t i = 0; i < 1024; i += step) + buffer[i] = 'X'; + + MemoryStream ms(buffer, 1024); + EncodedInputStream, MemoryStream> s(ms); + size_t i = 0; + for (;;) { + SkipWhitespace(s); + if (s.Peek() == '\0') + break; + //EXPECT_EQ(i, s.Tell()); + EXPECT_EQ('X', s.Take()); + i += step; + } + } +} + struct ScanCopyUnescapedStringHandler : BaseReaderHandler, ScanCopyUnescapedStringHandler> { bool String(const char* str, size_t length, bool) { memcpy(buffer, str, length + 1);