From a214d7afba0df71d652573cb48381c0dea9d6cd8 Mon Sep 17 00:00:00 2001 From: "dcposch@dcpos.ch" Date: Thu, 28 Jan 2016 22:12:09 +0100 Subject: [PATCH] buffer: add Buffer.prototype.lastIndexOf() * Remove unnecessary templating from SearchString SearchString used to have separate PatternChar and SubjectChar template type arguments, apparently to support things like searching for an 8-bit string inside a 16-bit string or vice versa. However, SearchString is only used from node_buffer.cc, where PatternChar and SubjectChar are always the same. Since this is extra complexity that's unused and untested (simplifying to a single Char template argument still compiles and didn't break any unit tests), I removed it. * Use Boyer-Moore[-Horspool] for both indexOf and lastIndexOf Add test cases for lastIndexOf. Test the fallback from BMH to Boyer-Moore, which looks like it was totally untested before. * Extra bounds checks in node_buffer.cc * Extra asserts in string_search.h * Buffer.lastIndexOf: clean up, enforce consistency w/ String.lastIndexOf * Polyfill memrchr(3) for non-GNU systems PR-URL: https://github.com/nodejs/node/pull/4846 Reviewed-By: James M Snell Reviewed-By: Trevor Norris --- doc/api/buffer.md | 40 +++ lib/buffer.js | 72 ++++-- src/node_buffer.cc | 128 ++++++---- src/string_search.h | 366 ++++++++++++++------------- test/parallel/test-buffer-indexof.js | 117 +++++++++ 5 files changed, 471 insertions(+), 252 deletions(-) diff --git a/doc/api/buffer.md b/doc/api/buffer.md index b367b8bed52a18..1d3b65ebfeb0f4 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -988,6 +988,46 @@ for (var key of buf.keys()) { // 5 ``` +### buf.lastIndexOf(value[, byteOffset][, encoding]) + +* `value` {String|Buffer|Number} +* `byteOffset` {Number} Default: `buf.length` +* `encoding` {String} Default: `'utf8'` +* Return: {Number} + +Identical to [`Buffer#indexOf()`][], but searches the Buffer from back to front +instead of front to back. 
Returns the starting index position of `value` in +Buffer or `-1` if the Buffer does not contain `value`. The `value` can be a +String, Buffer or Number. Strings are by default interpreted as UTF8. If +`byteOffset` is provided, will return the last match that begins at or before +`byteOffset`. + +```js +const buf = new Buffer('this buffer is a buffer'); + +buf.lastIndexOf('this'); + // returns 0 +buf.lastIndexOf('buffer'); + // returns 17 +buf.lastIndexOf(new Buffer('buffer')); + // returns 17 +buf.lastIndexOf(97); // ascii for 'a' + // returns 15 +buf.lastIndexOf(new Buffer('yolo')); + // returns -1 +buf.lastIndexOf('buffer', 5) + // returns 5 +buf.lastIndexOf('buffer', 4) + // returns -1 + +const utf16Buffer = new Buffer('\u039a\u0391\u03a3\u03a3\u0395', 'ucs2'); + +utf16Buffer.lastIndexOf('\u03a3', null, 'ucs2'); + // returns 6 +utf16Buffer.lastIndexOf('\u03a3', -5, 'ucs2'); + // returns 4 +``` + ### buf.length * {Number} diff --git a/lib/buffer.js b/lib/buffer.js index 1d0963bb509544..9076ad24c0ce6e 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -598,7 +598,48 @@ Buffer.prototype.compare = function compare(target, return binding.compareOffset(this, target, start, thisStart, end, thisEnd); }; -function slowIndexOf(buffer, val, byteOffset, encoding) { + +// Finds either the first index of `val` in `buffer` at offset >= `byteOffset`, +// OR the last index of `val` in `buffer` at offset <= `byteOffset`. 
+// +// Arguments: +// - buffer - a Buffer to search +// - val - a string, Buffer, or number +// - byteOffset - an index into `buffer`; will be clamped to an int32 +// - encoding - an optional encoding, relevant if val is a string +// - dir - true for indexOf, false for lastIndexOf +function bidirectionalIndexOf(buffer, val, byteOffset, encoding, dir) { + if (typeof byteOffset === 'string') { + encoding = byteOffset; + byteOffset = undefined; + } else if (byteOffset > 0x7fffffff) { + byteOffset = 0x7fffffff; + } else if (byteOffset < -0x80000000) { + byteOffset = -0x80000000; + } + byteOffset = +byteOffset; // Coerce to Number. + if (isNaN(byteOffset)) { + // If the offset is undefined, null, NaN, "foo", etc, search whole buffer. + byteOffset = dir ? 0 : (buffer.length - 1); + } + dir = !!dir; // Cast to bool. + + if (typeof val === 'string') { + if (encoding === undefined) { + return binding.indexOfString(buffer, val, byteOffset, encoding, dir); + } + return slowIndexOf(buffer, val, byteOffset, encoding, dir); + } else if (val instanceof Buffer) { + return binding.indexOfBuffer(buffer, val, byteOffset, encoding, dir); + } else if (typeof val === 'number') { + return binding.indexOfNumber(buffer, val, byteOffset, dir); + } + + throw new TypeError('"val" argument must be string, number or Buffer'); +} + + +function slowIndexOf(buffer, val, byteOffset, encoding, dir) { var loweredCase = false; for (;;) { switch (encoding) { @@ -609,13 +650,13 @@ function slowIndexOf(buffer, val, byteOffset, encoding) { case 'utf16le': case 'utf-16le': case 'binary': - return binding.indexOfString(buffer, val, byteOffset, encoding); + return binding.indexOfString(buffer, val, byteOffset, encoding, dir); case 'base64': case 'ascii': case 'hex': return binding.indexOfBuffer( - buffer, Buffer.from(val, encoding), byteOffset, encoding); + buffer, Buffer.from(val, encoding), byteOffset, encoding, dir); default: if (loweredCase) { @@ -628,29 +669,14 @@ function slowIndexOf(buffer, val, 
byteOffset, encoding) { } } + Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) { - if (typeof byteOffset === 'string') { - encoding = byteOffset; - byteOffset = 0; - } else if (byteOffset > 0x7fffffff) { - byteOffset = 0x7fffffff; - } else if (byteOffset < -0x80000000) { - byteOffset = -0x80000000; - } - byteOffset >>= 0; + return bidirectionalIndexOf(this, val, byteOffset, encoding, true); +}; - if (typeof val === 'string') { - if (encoding === undefined) { - return binding.indexOfString(this, val, byteOffset, encoding); - } - return slowIndexOf(this, val, byteOffset, encoding); - } else if (val instanceof Buffer) { - return binding.indexOfBuffer(this, val, byteOffset, encoding); - } else if (typeof val === 'number') { - return binding.indexOfNumber(this, val, byteOffset); - } - throw new TypeError('"val" argument must be string, number or Buffer'); +Buffer.prototype.lastIndexOf = function lastIndexOf(val, byteOffset, encoding) { + return bidirectionalIndexOf(this, val, byteOffset, encoding, false); }; diff --git a/src/node_buffer.cc b/src/node_buffer.cc index ea62dc929435aa..67b469511b0f2d 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -943,9 +943,44 @@ void Compare(const FunctionCallbackInfo &args) { } +// Computes the offset for starting an indexOf or lastIndexOf search. +// Returns either a valid offset in [0...], ie inside the Buffer, +// or -1 to signal that there is no possible match. +int64_t IndexOfOffset(size_t length, int64_t offset_i64, bool is_forward) { + int64_t length_i64 = static_cast(length); + if (length_i64 == 0) { + // Empty buffer, no match. + return -1; + } + if (offset_i64 < 0) { + if (offset_i64 + length_i64 >= 0) { + // Negative offsets count backwards from the end of the buffer. + return length_i64 + offset_i64; + } else if (is_forward) { + // indexOf from before the start of the buffer: search the whole buffer. + return 0; + } else { + // lastIndexOf from before the start of the buffer: no match. 
+ return -1; + } + } else { + if (offset_i64 < length_i64) { + // Valid positive offset. + return offset_i64; + } else if (is_forward) { + // indexOf from past the end of the buffer: no match. + return -1; + } else { + // lastIndexOf from past the end of the buffer: search the whole buffer. + return length_i64 - 1; + } + } +} + void IndexOfString(const FunctionCallbackInfo& args) { ASSERT(args[1]->IsString()); ASSERT(args[2]->IsNumber()); + ASSERT(args[4]->IsBoolean()); enum encoding enc = ParseEncoding(args.GetIsolate(), args[3], @@ -955,31 +990,26 @@ void IndexOfString(const FunctionCallbackInfo& args) { SPREAD_ARG(args[0], ts_obj); Local needle = args[1].As(); + int64_t offset_i64 = args[2]->IntegerValue(); + bool is_forward = args[4]->IsTrue(); + const char* haystack = ts_obj_data; const size_t haystack_length = ts_obj_length; // Extended latin-1 characters are 2 bytes in Utf8. const size_t needle_length = enc == BINARY ? needle->Length() : needle->Utf8Length(); - if (needle_length == 0 || haystack_length == 0) { return args.GetReturnValue().Set(-1); } - int64_t offset_i64 = args[2]->IntegerValue(); - size_t offset = 0; - - if (offset_i64 < 0) { - if (offset_i64 + static_cast(haystack_length) < 0) { - offset = 0; - } else { - offset = static_cast(haystack_length + offset_i64); - } - } else { - offset = static_cast(offset_i64); + int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward); + if (opt_offset <= -1) { + return args.GetReturnValue().Set(-1); } - - if (haystack_length < offset || needle_length + offset > haystack_length) { + size_t offset = static_cast(opt_offset); + CHECK_LT(offset, haystack_length); + if (is_forward && needle_length + offset > haystack_length) { return args.GetReturnValue().Set(-1); } @@ -1007,13 +1037,15 @@ void IndexOfString(const FunctionCallbackInfo& args) { haystack_length / 2, decoded_string, decoder.size() / 2, - offset / 2); + offset / 2, + is_forward); } else { result = 
SearchString(reinterpret_cast(haystack), haystack_length / 2, reinterpret_cast(*needle_value), needle_value.length(), - offset / 2); + offset / 2, + is_forward); } result *= 2; } else if (enc == UTF8) { @@ -1025,7 +1057,8 @@ void IndexOfString(const FunctionCallbackInfo& args) { haystack_length, reinterpret_cast(*needle_value), needle_length, - offset); + offset, + is_forward); } else if (enc == BINARY) { uint8_t* needle_data = static_cast(malloc(needle_length)); if (needle_data == nullptr) { @@ -1038,7 +1071,8 @@ void IndexOfString(const FunctionCallbackInfo& args) { haystack_length, needle_data, needle_length, - offset); + offset, + is_forward); free(needle_data); } @@ -1049,17 +1083,18 @@ void IndexOfString(const FunctionCallbackInfo& args) { void IndexOfBuffer(const FunctionCallbackInfo& args) { ASSERT(args[1]->IsObject()); ASSERT(args[2]->IsNumber()); + ASSERT(args[4]->IsBoolean()); enum encoding enc = ParseEncoding(args.GetIsolate(), args[3], UTF8); THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]); + THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[1]); SPREAD_ARG(args[0], ts_obj); SPREAD_ARG(args[1], buf); - - if (buf_length > 0) - CHECK_NE(buf_data, nullptr); + int64_t offset_i64 = args[2]->IntegerValue(); + bool is_forward = args[4]->IsTrue(); const char* haystack = ts_obj_data; const size_t haystack_length = ts_obj_length; @@ -1070,19 +1105,13 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { return args.GetReturnValue().Set(-1); } - int64_t offset_i64 = args[2]->IntegerValue(); - size_t offset = 0; - - if (offset_i64 < 0) { - if (offset_i64 + static_cast(haystack_length) < 0) - offset = 0; - else - offset = static_cast(haystack_length + offset_i64); - } else { - offset = static_cast(offset_i64); + int64_t opt_offset = IndexOfOffset(haystack_length, offset_i64, is_forward); + if (opt_offset <= -1) { + return args.GetReturnValue().Set(-1); } - - if (haystack_length < offset || needle_length + offset > 
haystack_length) { + size_t offset = static_cast(opt_offset); + CHECK_LT(offset, haystack_length); + if (is_forward && needle_length + offset > haystack_length) { return args.GetReturnValue().Set(-1); } @@ -1097,7 +1126,8 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { haystack_length / 2, reinterpret_cast(needle), needle_length / 2, - offset / 2); + offset / 2, + is_forward); result *= 2; } else { result = SearchString( @@ -1105,7 +1135,8 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { haystack_length, reinterpret_cast(needle), needle_length, - offset); + offset, + is_forward); } args.GetReturnValue().Set( @@ -1115,28 +1146,29 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { void IndexOfNumber(const FunctionCallbackInfo& args) { ASSERT(args[1]->IsNumber()); ASSERT(args[2]->IsNumber()); + ASSERT(args[3]->IsBoolean()); THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]); SPREAD_ARG(args[0], ts_obj); uint32_t needle = args[1]->Uint32Value(); int64_t offset_i64 = args[2]->IntegerValue(); - size_t offset; - - if (offset_i64 < 0) { - if (offset_i64 + static_cast(ts_obj_length) < 0) - offset = 0; - else - offset = static_cast(ts_obj_length + offset_i64); - } else { - offset = static_cast(offset_i64); - } + bool is_forward = args[3]->IsTrue(); - if (ts_obj_length == 0 || offset + 1 > ts_obj_length) + int64_t opt_offset = IndexOfOffset(ts_obj_length, offset_i64, is_forward); + if (opt_offset <= -1) { return args.GetReturnValue().Set(-1); + } + size_t offset = static_cast(opt_offset); + CHECK_LT(offset, ts_obj_length); - void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset); - char* ptr_char = static_cast(ptr); + const void* ptr; + if (is_forward) { + ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset); + } else { + ptr = node::stringsearch::MemrchrFill(ts_obj_data, needle, offset + 1); + } + const char* ptr_char = static_cast(ptr); args.GetReturnValue().Set(ptr ? 
static_cast(ptr_char - ts_obj_data) : -1); } diff --git a/src/string_search.h b/src/string_search.h index 2a2790b2cc6f82..bf246702d7e75e 100644 --- a/src/string_search.h +++ b/src/string_search.h @@ -21,60 +21,35 @@ T Max(T a, T b) { static const uint32_t kMaxOneByteCharCodeU = 0xff; - -static inline size_t NonOneByteStart(const uint16_t* chars, size_t length) { - const uint16_t* limit = chars + length; - const uint16_t* start = chars; - while (chars < limit) { - if (*chars > kMaxOneByteCharCodeU) - return static_cast(chars - start); - ++chars; - } - return static_cast(chars - start); -} - - -static inline bool IsOneByte(const uint16_t* chars, size_t length) { - return NonOneByteStart(chars, length) >= length; -} - - template class Vector { public: - Vector(T* data, size_t length) : start_(data), length_(length) { + Vector(T* data, size_t length, bool isForward) + : start_(data), length_(length), is_forward_(isForward) { ASSERT(length > 0 && data != nullptr); } - // Returns the length of the vector. + // Returns the start of the memory range. + // For vector v this is NOT necessarily &v[0], see forward(). + const T* start() const { return start_; } + + // Returns the length of the vector, in characters. size_t length() const { return length_; } - T* start() const { return start_; } + // Returns true if the Vector is front-to-back, false if back-to-front. + // In the latter case, v[0] corresponds to the *end* of the memory range. + size_t forward() const { return is_forward_; } // Access individual vector elements - checks bounds in debug mode. 
T& operator[](size_t index) const { ASSERT(0 <= index && index < length_); - return start_[index]; - } - - const T& at(size_t index) const { return operator[](index); } - - bool operator==(const Vector& other) const { - if (length_ != other.length_) - return false; - if (start_ == other.start_) - return true; - for (size_t i = 0; i < length_; ++i) { - if (start_[i] != other.start_[i]) { - return false; - } - } - return true; + return start_[is_forward_ ? index : (length_ - index - 1)]; } private: T* start_; size_t length_; + bool is_forward_; }; @@ -114,31 +89,17 @@ class StringSearchBase { // Table used temporarily while building the BoyerMoore good suffix // shift table. static int kSuffixTable[kBMMaxShift + 1]; - - static inline bool IsOneByteString(Vector string) { - return true; - } - - static inline bool IsOneByteString(Vector string) { - return IsOneByte(string.start(), string.length()); - } }; -template +template class StringSearch : private StringSearchBase { public: - explicit StringSearch(Vector pattern) + explicit StringSearch(Vector pattern) : pattern_(pattern), start_(0) { if (pattern.length() >= kBMMaxShift) { start_ = pattern.length() - kBMMaxShift; } - if (sizeof(PatternChar) > sizeof(SubjectChar)) { - if (!IsOneByteString(pattern_)) { - strategy_ = &FailSearch; - return; - } - } size_t pattern_length = pattern_.length(); CHECK_GT(pattern_length, 0); if (pattern_length < kBMMinPatternLength) { @@ -152,12 +113,12 @@ class StringSearch : private StringSearchBase { strategy_ = &InitialSearch; } - size_t Search(Vector subject, size_t index) { + size_t Search(Vector subject, size_t index) { return strategy_(this, subject, index); } static inline int AlphabetSize() { - if (sizeof(PatternChar) == 1) { + if (sizeof(Char) == 1) { // Latin1 needle. 
return kLatin1AlphabetSize; } else { @@ -165,42 +126,42 @@ class StringSearch : private StringSearchBase { return kUC16AlphabetSize; } - static_assert(sizeof(PatternChar) == sizeof(uint8_t) || - sizeof(PatternChar) == sizeof(uint16_t), - "sizeof(PatternChar) == sizeof(uint16_t) || sizeof(uint8_t)"); + static_assert(sizeof(Char) == sizeof(uint8_t) || + sizeof(Char) == sizeof(uint16_t), + "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); } private: typedef size_t (*SearchFunction)( // NOLINT - it's not a cast! - StringSearch*, - Vector, + StringSearch*, + Vector, size_t); - static size_t FailSearch(StringSearch*, - Vector subject, + static size_t FailSearch(StringSearch*, + Vector subject, size_t) { return subject.length(); } - static size_t SingleCharSearch(StringSearch* search, - Vector subject, + static size_t SingleCharSearch(StringSearch* search, + Vector subject, size_t start_index); - static size_t LinearSearch(StringSearch* search, - Vector subject, + static size_t LinearSearch(StringSearch* search, + Vector subject, size_t start_index); - static size_t InitialSearch(StringSearch* search, - Vector subject, + static size_t InitialSearch(StringSearch* search, + Vector subject, size_t start_index); static size_t BoyerMooreHorspoolSearch( - StringSearch* search, - Vector subject, + StringSearch* search, + Vector subject, size_t start_index); - static size_t BoyerMooreSearch(StringSearch* search, - Vector subject, + static size_t BoyerMooreSearch(StringSearch* search, + Vector subject, size_t start_index); void PopulateBoyerMooreHorspoolTable(); @@ -214,16 +175,10 @@ class StringSearch : private StringSearchBase { } static inline int CharOccurrence(int* bad_char_occurrence, - SubjectChar char_code) { - if (sizeof(SubjectChar) == 1) { + Char char_code) { + if (sizeof(Char) == 1) { return bad_char_occurrence[static_cast(char_code)]; } - if (sizeof(PatternChar) == 1) { - if (exceedsOneByte(char_code)) { - return -1; - } - return 
bad_char_occurrence[static_cast(char_code)]; - } // Both pattern and subject are UC16. Reduce character to equivalence class. int equiv_class = char_code % kUC16AlphabetSize; return bad_char_occurrence[equiv_class]; @@ -250,7 +205,7 @@ class StringSearch : private StringSearchBase { } // The pattern to search for. - Vector pattern_; + Vector pattern_; // Pointer to implementation of the search. SearchFunction strategy_; // Cache value of Max(0, pattern_length() - kBMMaxShift) @@ -274,111 +229,138 @@ inline uint8_t GetHighestValueByte(uint16_t character) { inline uint8_t GetHighestValueByte(uint8_t character) { return character; } -template -inline size_t FindFirstCharacter(Vector pattern, - Vector subject, size_t index) { - const PatternChar pattern_first_char = pattern[0]; +// Searches for a byte value in a memory buffer, back to front. +// Uses memrchr(3) on systems which support it, for speed. +// Falls back to a vanilla for loop on non-GNU systems such as Windows. +inline const void* MemrchrFill(const void* haystack, uint8_t needle, + size_t haystack_len) { +#ifdef _GNU_SOURCE + return memrchr(haystack, needle, haystack_len); +#else + const uint8_t* haystack8 = static_cast(haystack); + for (size_t i = haystack_len - 1; i != static_cast(-1); i--) { + if (haystack8[i] == needle) { + return haystack8 + i; + } + } + return nullptr; +#endif +} + + +// Finds the first occurence of *two-byte* character pattern[0] in the string +// `subject`. Does not check that the whole pattern matches. +template +inline size_t FindFirstCharacter(Vector pattern, + Vector subject, size_t index) { + const Char pattern_first_char = pattern[0]; const size_t max_n = (subject.length() - pattern.length() + 1); + // For speed, search for the more `rare` of the two bytes in pattern[0] + // using memchr / memrchr (which are much faster than a simple for loop). 
const uint8_t search_byte = GetHighestValueByte(pattern_first_char); - const SubjectChar search_char = static_cast(pattern_first_char); size_t pos = index; do { - const SubjectChar* char_pos = reinterpret_cast( - memchr(subject.start() + pos, search_byte, - (max_n - pos) * sizeof(SubjectChar))); + size_t bytes_to_search; + const void* void_pos; + if (subject.forward()) { + // Assert that bytes_to_search won't overflow + CHECK_LE(pos, max_n); + CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); + bytes_to_search = (max_n - pos) * sizeof(Char); + void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); + } else { + CHECK_LE(pos, subject.length()); + CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); + bytes_to_search = (subject.length() - pos) * sizeof(Char); + void_pos = MemrchrFill(subject.start(), search_byte, bytes_to_search); + } + const Char* char_pos = static_cast(void_pos); if (char_pos == nullptr) return subject.length(); - char_pos = AlignDown(char_pos, sizeof(SubjectChar)); - pos = static_cast(char_pos - subject.start()); - if (subject[pos] == search_char) + + // Then, for each match, verify that the full two bytes match pattern[0]. + char_pos = AlignDown(char_pos, sizeof(Char)); + size_t raw_pos = static_cast(char_pos - subject.start()); + pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); + if (subject[pos] == pattern_first_char) { + // Match found, hooray. return pos; + } + // Search byte matched, but the other byte of pattern[0] didn't. Keep going. } while (++pos < max_n); return subject.length(); } +// Finds the first occurance of the byte pattern[0] in string `subject`. +// Does not verify that the whole pattern matches. 
template <> inline size_t FindFirstCharacter(Vector pattern, Vector subject, size_t index) { const uint8_t pattern_first_char = pattern[0]; + const size_t subj_len = subject.length(); const size_t max_n = (subject.length() - pattern.length() + 1); - const uint8_t* char_pos = reinterpret_cast( - memchr(subject.start() + index, pattern_first_char, max_n - index)); - if (char_pos == nullptr) - return subject.length(); - return static_cast(char_pos - subject.start()); + const void* pos; + if (subject.forward()) { + pos = memchr(subject.start() + index, pattern_first_char, max_n - index); + } else { + pos = MemrchrFill(subject.start(), pattern_first_char, subj_len - index); + } + const uint8_t* char_pos = static_cast(pos); + if (char_pos == nullptr) { + return subj_len; + } + + size_t raw_pos = static_cast(char_pos - subject.start()); + return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); } //--------------------------------------------------------------------- // Single Character Pattern Search Strategy //--------------------------------------------------------------------- -template -size_t StringSearch::SingleCharSearch( - StringSearch* search, - Vector subject, +template +size_t StringSearch::SingleCharSearch( + StringSearch* search, + Vector subject, size_t index) { CHECK_EQ(1, search->pattern_.length()); - PatternChar pattern_first_char = search->pattern_[0]; - - if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { - return FindFirstCharacter(search->pattern_, subject, index); - } else { - if (sizeof(PatternChar) > sizeof(SubjectChar)) { - if (exceedsOneByte(pattern_first_char)) { - return -1; - } - } - return FindFirstCharacter(search->pattern_, subject, index); - } + return FindFirstCharacter(search->pattern_, subject, index); } //--------------------------------------------------------------------- // Linear Search Strategy //--------------------------------------------------------------------- -template -inline bool CharCompare(const 
PatternChar* pattern, - const SubjectChar* subject, - size_t length) { - ASSERT_GT(length, 0); - size_t pos = 0; - do { - if (pattern[pos] != subject[pos]) { - return false; - } - pos++; - } while (pos < length); - return true; -} - // Simple linear search for short patterns. Never bails out. -template -size_t StringSearch::LinearSearch( - StringSearch* search, - Vector subject, +template +size_t StringSearch::LinearSearch( + StringSearch* search, + Vector subject, size_t index) { - Vector pattern = search->pattern_; + Vector pattern = search->pattern_; CHECK_GT(pattern.length(), 1); const size_t pattern_length = pattern.length(); - size_t i = index; const size_t n = subject.length() - pattern_length; - while (i <= n) { + for (size_t i = index; i <= n; i++) { i = FindFirstCharacter(pattern, subject, i); if (i == subject.length()) return subject.length(); ASSERT_LE(i, n); - i++; - // Loop extracted to separate function to allow using return to do - // a deeper break. - if (CharCompare(pattern.start() + 1, subject.start() + i, - pattern_length - 1)) { - return i - 1; + bool matches = true; + for (size_t j = 1; j < pattern_length; j++) { + if (pattern[j] != subject[i + j]) { + matches = false; + break; + } + } + if (matches) { + return i; } } return subject.length(); @@ -388,12 +370,12 @@ size_t StringSearch::LinearSearch( // Boyer-Moore string search //--------------------------------------------------------------------- -template -size_t StringSearch::BoyerMooreSearch( - StringSearch* search, - Vector subject, +template +size_t StringSearch::BoyerMooreSearch( + StringSearch* search, + Vector subject, size_t start_index) { - Vector pattern = search->pattern_; + Vector pattern = search->pattern_; const size_t subject_length = subject.length(); const size_t pattern_length = pattern.length(); // Only preprocess at most kBMMaxShift last characters of pattern. 
@@ -402,7 +384,7 @@ size_t StringSearch::BoyerMooreSearch( int* bad_char_occurence = search->bad_char_table(); int* good_suffix_shift = search->good_suffix_shift_table(); - PatternChar last_char = pattern[pattern_length - 1]; + Char last_char = pattern[pattern_length - 1]; size_t index = start_index; // Continue search from i. while (index <= subject_length - pattern_length) { @@ -426,7 +408,7 @@ size_t StringSearch::BoyerMooreSearch( // Fall back on BMH shift. index += pattern_length - 1 - CharOccurrence(bad_char_occurence, - static_cast(last_char)); + static_cast(last_char)); } else { int gs_shift = good_suffix_shift[j + 1]; int bc_occ = CharOccurrence(bad_char_occurence, c); @@ -441,10 +423,10 @@ size_t StringSearch::BoyerMooreSearch( return subject.length(); } -template -void StringSearch::PopulateBoyerMooreTable() { +template +void StringSearch::PopulateBoyerMooreTable() { const size_t pattern_length = pattern_.length(); - const PatternChar* pattern = pattern_.start(); + Vector pattern = pattern_; // Only look at the last kBMMaxShift characters of pattern (from start_ // to pattern_length). const size_t start = start_; @@ -467,12 +449,12 @@ void StringSearch::PopulateBoyerMooreTable() { } // Find suffixes. - PatternChar last_char = pattern[pattern_length - 1]; + Char last_char = pattern_[pattern_length - 1]; size_t suffix = pattern_length + 1; { size_t i = pattern_length; while (i > start) { - PatternChar c = pattern[i - 1]; + Char c = pattern[i - 1]; while (suffix <= pattern_length && c != pattern[suffix - 1]) { if (static_cast(shift_table[suffix]) == length) { shift_table[suffix] = suffix - i; @@ -511,22 +493,22 @@ void StringSearch::PopulateBoyerMooreTable() { // Boyer-Moore-Horspool string search. 
//--------------------------------------------------------------------- -template -size_t StringSearch::BoyerMooreHorspoolSearch( - StringSearch* search, - Vector subject, +template +size_t StringSearch::BoyerMooreHorspoolSearch( + StringSearch* search, + Vector subject, size_t start_index) { - Vector pattern = search->pattern_; + Vector pattern = search->pattern_; const size_t subject_length = subject.length(); const size_t pattern_length = pattern.length(); int* char_occurrences = search->bad_char_table(); int64_t badness = -pattern_length; // How bad we are doing without a good-suffix table. - PatternChar last_char = pattern[pattern_length - 1]; + Char last_char = pattern[pattern_length - 1]; int last_char_shift = pattern_length - 1 - - CharOccurrence(char_occurrences, static_cast(last_char)); + CharOccurrence(char_occurrences, static_cast(last_char)); // Perform search size_t index = start_index; // No matches found prior to this index. @@ -564,8 +546,8 @@ size_t StringSearch::BoyerMooreHorspoolSearch( return subject.length(); } -template -void StringSearch::PopulateBoyerMooreHorspoolTable() { +template +void StringSearch::PopulateBoyerMooreHorspoolTable() { const size_t pattern_length = pattern_.length(); int* bad_char_occurrence = bad_char_table(); @@ -585,8 +567,8 @@ void StringSearch::PopulateBoyerMooreHorspoolTable() { } } for (size_t i = start; i < pattern_length - 1; i++) { - PatternChar c = pattern_[i]; - int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); + Char c = pattern_[i]; + int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); bad_char_occurrence[bucket] = i; } } @@ -597,12 +579,12 @@ void StringSearch::PopulateBoyerMooreHorspoolTable() { // Simple linear search for short patterns, which bails out if the string // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 
-template -size_t StringSearch::InitialSearch( - StringSearch* search, - Vector subject, +template +size_t StringSearch::InitialSearch( + StringSearch* search, + Vector subject, size_t index) { - Vector pattern = search->pattern_; + Vector pattern = search->pattern_; const size_t pattern_length = pattern.length(); // Badness is a count of how much work we have done. When we have // done enough work we decide it's probably worth switching to a better @@ -642,11 +624,11 @@ size_t StringSearch::InitialSearch( // If searching multiple times for the same pattern, a search // object should be constructed once and the Search function then called // for each search. -template -size_t SearchString(Vector subject, - Vector pattern, +template +size_t SearchString(Vector subject, + Vector pattern, size_t start_index) { - StringSearch search(pattern); + StringSearch search(pattern); return search.Search(subject, start_index); } } @@ -655,16 +637,38 @@ size_t SearchString(Vector subject, namespace node { using node::stringsearch::Vector; -template -size_t SearchString(const SubjectChar* haystack, +template +size_t SearchString(const Char* haystack, size_t haystack_length, - const PatternChar* needle, + const Char* needle, size_t needle_length, - size_t start_index) { - return node::stringsearch::SearchString( - Vector(haystack, haystack_length), - Vector(needle, needle_length), - start_index); + size_t start_index, + bool is_forward) { + // To do a reverse search (lastIndexOf instead of indexOf) without redundant + // code, create two vectors that are reversed views into the input strings. + // For example, v_needle[0] would return the *last* character of the needle. 
+ // So we're searching for the first instance of rev(needle) in rev(haystack) + Vector v_needle = Vector( + needle, needle_length, is_forward); + Vector v_haystack = Vector( + haystack, haystack_length, is_forward); + ASSERT(haystack_length >= needle_length); + size_t diff = haystack_length - needle_length; + size_t relative_start_index; + if (is_forward) { + relative_start_index = start_index; + } else if (diff < start_index) { + relative_start_index = 0; + } else { + relative_start_index = diff - start_index; + } + size_t pos = node::stringsearch::SearchString( + v_haystack, v_needle, relative_start_index); + if (pos == haystack_length) { + // not found + return pos; + } + return is_forward ? pos : (haystack_length - needle_length - pos); } } // namespace node diff --git a/test/parallel/test-buffer-indexof.js b/test/parallel/test-buffer-indexof.js index 7fb862d9190d89..0a20d2ce9af021 100644 --- a/test/parallel/test-buffer-indexof.js +++ b/test/parallel/test-buffer-indexof.js @@ -282,3 +282,120 @@ assert.throws(function() { assert.throws(function() { b.indexOf([]); }); + +// All code for handling encodings is shared between Buffer.indexOf and +// Buffer.lastIndexOf, so only testing the separate lastIndexOf semantics. + +// Test lastIndexOf basic functionality; Buffer b contains 'abcdef'. 
+// lastIndexOf string: +assert.equal(b.lastIndexOf('a'), 0); +assert.equal(b.lastIndexOf('a', 1), 0); +assert.equal(b.lastIndexOf('b', 1), 1); +assert.equal(b.lastIndexOf('c', 1), -1); +assert.equal(b.lastIndexOf('a', -1), 0); +assert.equal(b.lastIndexOf('a', -4), 0); +assert.equal(b.lastIndexOf('a', -b.length), 0); +assert.equal(b.lastIndexOf('a', -b.length - 1), -1); +assert.equal(b.lastIndexOf('a', NaN), 0); +assert.equal(b.lastIndexOf('a', -Infinity), -1); +assert.equal(b.lastIndexOf('a', Infinity), 0); +// lastIndexOf Buffer: +assert.equal(b.lastIndexOf(buf_a), 0); +assert.equal(b.lastIndexOf(buf_a, 1), 0); +assert.equal(b.lastIndexOf(buf_a, -1), 0); +assert.equal(b.lastIndexOf(buf_a, -4), 0); +assert.equal(b.lastIndexOf(buf_a, -b.length), 0); +assert.equal(b.lastIndexOf(buf_a, -b.length - 1), -1); +assert.equal(b.lastIndexOf(buf_a, NaN), 0); +assert.equal(b.lastIndexOf(buf_a, -Infinity), -1); +assert.equal(b.lastIndexOf(buf_a, Infinity), 0); +assert.equal(b.lastIndexOf(buf_bc), 1); +assert.equal(b.lastIndexOf(buf_bc, 2), 1); +assert.equal(b.lastIndexOf(buf_bc, -1), 1); +assert.equal(b.lastIndexOf(buf_bc, -3), 1); +assert.equal(b.lastIndexOf(buf_bc, -5), 1); +assert.equal(b.lastIndexOf(buf_bc, -6), -1); +assert.equal(b.lastIndexOf(buf_bc, NaN), 1); +assert.equal(b.lastIndexOf(buf_bc, -Infinity), -1); +assert.equal(b.lastIndexOf(buf_bc, Infinity), 1); +assert.equal(b.lastIndexOf(buf_f), b.length - 1); +assert.equal(b.lastIndexOf(buf_z), -1); +assert.equal(b.lastIndexOf(buf_empty), -1); +assert.equal(b.lastIndexOf(buf_empty, 1), -1); +assert.equal(b.lastIndexOf(buf_empty, b.length + 1), -1); +assert.equal(b.lastIndexOf(buf_empty, Infinity), -1); +// lastIndexOf number: +assert.equal(b.lastIndexOf(0x61), 0); +assert.equal(b.lastIndexOf(0x61, 1), 0); +assert.equal(b.lastIndexOf(0x61, -1), 0); +assert.equal(b.lastIndexOf(0x61, -4), 0); +assert.equal(b.lastIndexOf(0x61, -b.length), 0); +assert.equal(b.lastIndexOf(0x61, -b.length - 1), -1); 
+assert.equal(b.lastIndexOf(0x61, NaN), 0); +assert.equal(b.lastIndexOf(0x61, -Infinity), -1); +assert.equal(b.lastIndexOf(0x61, Infinity), 0); +assert.equal(b.lastIndexOf(0x0), -1); + +// Test weird offset arguments. +// Behaviour should match String.lastIndexOf: +assert.equal(b.lastIndexOf('b', 0), -1); +assert.equal(b.lastIndexOf('b', undefined), 1); +assert.equal(b.lastIndexOf('b', null), -1); +assert.equal(b.lastIndexOf('b', {}), 1); +assert.equal(b.lastIndexOf('b', []), -1); +assert.equal(b.lastIndexOf('b', [2]), 1); + +// Test lastIndexOf on a longer buffer: +var bufferString = new Buffer('a man a plan a canal panama'); +assert.equal(15, bufferString.lastIndexOf('canal')); +assert.equal(21, bufferString.lastIndexOf('panama')); +assert.equal(0, bufferString.lastIndexOf('a man a plan a canal panama')); +assert.equal(-1, bufferString.lastIndexOf('a man a plan a canal mexico')); +assert.equal(13, bufferString.lastIndexOf('a ')); +assert.equal(13, bufferString.lastIndexOf('a ', 13)); +assert.equal(6, bufferString.lastIndexOf('a ', 12)); +assert.equal(0, bufferString.lastIndexOf('a ', 5)); +assert.equal(13, bufferString.lastIndexOf('a ', -1)); +assert.equal(0, bufferString.lastIndexOf('a ', -27)); +assert.equal(-1, bufferString.lastIndexOf('a ', -28)); + +// The above tests test the LINEAR and SINGLE-CHAR strategies. +// Now, we test the BOYER-MOORE-HORSPOOL strategy. +// Test lastIndexOf on a long buffer w multiple matches: +pattern = 'JABACABADABACABA'; +assert.equal(1535, longBufferString.lastIndexOf(pattern)); +assert.equal(1535, longBufferString.lastIndexOf(pattern, 1535)); +assert.equal(511, longBufferString.lastIndexOf(pattern, 1534)); + +// Finally, give it a really long input to trigger fallback from BMH to +// regular BOYER-MOORE (which has better worst-case complexity). + +// Generate a really long Thue-Morse sequence of 'yolo' and 'swag', +// "yolo swag swag yolo swag yolo yolo swag" ..., goes on for about 5MB. 
+// This is hard to search because it all looks similar, but never repeats. + +// countBits returns the number of set bits in the binary representation of n. +function countBits(n) { + for (var count = 0; n > 0; count++) { + n = n & (n - 1); // clear lowest set bit + } + return count; +} +var parts = []; +for (var i = 0; i < 1000000; i++) { + parts.push((countBits(i) % 2 === 0) ? 'yolo' : 'swag'); +} +var reallyLong = new Buffer(parts.join(' ')); +assert.equal('yolo swag swag yolo', reallyLong.slice(0, 19).toString()); + +// Expensive reverse searches. Stress test lastIndexOf: +pattern = reallyLong.slice(0, 100000); // First 1/50th of the pattern. +assert.equal(4751360, reallyLong.lastIndexOf(pattern)); +assert.equal(3932160, reallyLong.lastIndexOf(pattern, 4000000)); +assert.equal(2949120, reallyLong.lastIndexOf(pattern, 3000000)); +pattern = reallyLong.slice(100000, 200000); // Second 1/50th. +assert.equal(4728480, reallyLong.lastIndexOf(pattern)); +pattern = reallyLong.slice(0, 1000000); // First 1/5th. +assert.equal(3932160, reallyLong.lastIndexOf(pattern)); +pattern = reallyLong.slice(0, 2000000); // first 2/5ths. +assert.equal(0, reallyLong.lastIndexOf(pattern));