The C and C++ Include Header Files
/usr/include/nodejs/src/string_search.h
$ cat -n /usr/include/nodejs/src/string_search.h 1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef SRC_STRING_SEARCH_H_ 6 #define SRC_STRING_SEARCH_H_ 7 8 #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 9 10 #include "util.h" 11 12 #include
13 #include
14 15 namespace node { 16 namespace stringsearch { 17 18 template
19 class Vector { 20 public: 21 Vector(T* data, size_t length, bool isForward) 22 : start_(data), length_(length), is_forward_(isForward) { 23 CHECK(length > 0 && data != nullptr); 24 } 25 26 // Returns the start of the memory range. 27 // For vector v this is NOT necessarily &v[0], see forward(). 28 const T* start() const { return start_; } 29 30 // Returns the length of the vector, in characters. 31 size_t length() const { return length_; } 32 33 // Returns true if the Vector is front-to-back, false if back-to-front. 34 // In the latter case, v[0] corresponds to the *end* of the memory range. 35 bool forward() const { return is_forward_; } 36 37 // Access individual vector elements - checks bounds in debug mode. 38 T& operator[](size_t index) const { 39 DCHECK_LT(index, length_); 40 return start_[is_forward_ ? index : (length_ - index - 1)]; 41 } 42 43 private: 44 T* start_; 45 size_t length_; 46 bool is_forward_; 47 }; 48 49 50 //--------------------------------------------------------------------- 51 // String Search object. 52 //--------------------------------------------------------------------- 53 54 // Class holding constants and methods that apply to all string search variants, 55 // independently of subject and pattern char size. 56 class StringSearchBase { 57 protected: 58 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 59 // limit, we can fix the size of tables. For a needle longer than this limit, 60 // search will not be optimal, since we only build tables for a suffix 61 // of the string, but it is a safe approximation. 62 static const int kBMMaxShift = 250; 63 64 // Reduce alphabet to this size. 65 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 66 // proportional to the input alphabet. We reduce the alphabet size by 67 // equating input characters modulo a smaller alphabet size. This gives 68 // a potentially less efficient searching, but is a safe approximation. 69 // For needles using only characters in the same Unicode 256-code point page, 70 // there is no search speed degradation. 71 static const int kLatin1AlphabetSize = 256; 72 static const int kUC16AlphabetSize = 256; 73 74 // Bad-char shift table stored in the state. It's length is the alphabet size. 75 // For patterns below this length, the skip length of Boyer-Moore is too short 76 // to compensate for the algorithmic overhead compared to simple brute force. 77 static const int kBMMinPatternLength = 8; 78 79 // Store for the BoyerMoore(Horspool) bad char shift table. 80 int bad_char_shift_table_[kUC16AlphabetSize]; 81 // Store for the BoyerMoore good suffix shift table. 82 int good_suffix_shift_table_[kBMMaxShift + 1]; 83 // Table used temporarily while building the BoyerMoore good suffix 84 // shift table. 85 int suffix_table_[kBMMaxShift + 1]; 86 }; 87 88 template
89 class StringSearch : private StringSearchBase { 90 public: 91 typedef stringsearch::Vector
Vector; 92 93 explicit StringSearch(Vector pattern) 94 : pattern_(pattern), start_(0) { 95 if (pattern.length() >= kBMMaxShift) { 96 start_ = pattern.length() - kBMMaxShift; 97 } 98 99 size_t pattern_length = pattern_.length(); 100 CHECK_GT(pattern_length, 0); 101 if (pattern_length < kBMMinPatternLength) { 102 if (pattern_length == 1) { 103 strategy_ = SearchStrategy::kSingleChar; 104 return; 105 } 106 strategy_ = SearchStrategy::kLinear; 107 return; 108 } 109 strategy_ = SearchStrategy::kInitial; 110 } 111 112 size_t Search(Vector subject, size_t index) { 113 switch (strategy_) { 114 case kBoyerMooreHorspool: 115 return BoyerMooreHorspoolSearch(subject, index); 116 case kBoyerMoore: 117 return BoyerMooreSearch(subject, index); 118 case kInitial: 119 return InitialSearch(subject, index); 120 case kLinear: 121 return LinearSearch(subject, index); 122 case kSingleChar: 123 return SingleCharSearch(subject, index); 124 } 125 UNREACHABLE(); 126 } 127 128 static inline int AlphabetSize() { 129 if (sizeof(Char) == 1) { 130 // Latin1 needle. 131 return kLatin1AlphabetSize; 132 } else { 133 // UC16 needle. 134 return kUC16AlphabetSize; 135 } 136 137 static_assert(sizeof(Char) == sizeof(uint8_t) || 138 sizeof(Char) == sizeof(uint16_t), 139 "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); 140 } 141 142 private: 143 typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); 144 size_t SingleCharSearch(Vector subject, size_t start_index); 145 size_t LinearSearch(Vector subject, size_t start_index); 146 size_t InitialSearch(Vector subject, size_t start_index); 147 size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); 148 size_t BoyerMooreSearch(Vector subject, size_t start_index); 149 150 void PopulateBoyerMooreHorspoolTable(); 151 152 void PopulateBoyerMooreTable(); 153 154 static inline int CharOccurrence(int* bad_char_occurrence, 155 Char char_code) { 156 if (sizeof(Char) == 1) { 157 return bad_char_occurrence[static_cast
(char_code)]; 158 } 159 // Both pattern and subject are UC16. Reduce character to equivalence class. 160 int equiv_class = char_code % kUC16AlphabetSize; 161 return bad_char_occurrence[equiv_class]; 162 } 163 164 enum SearchStrategy { 165 kBoyerMooreHorspool, 166 kBoyerMoore, 167 kInitial, 168 kLinear, 169 kSingleChar, 170 }; 171 172 // The pattern to search for. 173 Vector pattern_; 174 SearchStrategy strategy_; 175 // Cache value of Max(0, pattern_length() - kBMMaxShift) 176 size_t start_; 177 }; 178 179 180 template
181 inline T AlignDown(T value, U alignment) { 182 return reinterpret_cast
( 183 (reinterpret_cast
(value) & ~(alignment - 1))); 184 } 185 186 187 inline uint8_t GetHighestValueByte(uint16_t character) { 188 return std::max(static_cast
(character & 0xFF), 189 static_cast
(character >> 8)); 190 } 191 192 193 inline uint8_t GetHighestValueByte(uint8_t character) { return character; } 194 195 196 // Searches for a byte value in a memory buffer, back to front. 197 // Uses memrchr(3) on systems which support it, for speed. 198 // Falls back to a vanilla for loop on non-GNU systems such as Windows. 199 inline const void* MemrchrFill(const void* haystack, uint8_t needle, 200 size_t haystack_len) { 201 #ifdef _GNU_SOURCE 202 return memrchr(haystack, needle, haystack_len); 203 #else 204 const uint8_t* haystack8 = static_cast
(haystack); 205 for (size_t i = haystack_len - 1; i != static_cast
(-1); i--) { 206 if (haystack8[i] == needle) { 207 return haystack8 + i; 208 } 209 } 210 return nullptr; 211 #endif 212 } 213 214 215 // Finds the first occurrence of *two-byte* character pattern[0] in the string 216 // `subject`. Does not check that the whole pattern matches. 217 template
218 inline size_t FindFirstCharacter(Vector
pattern, 219 Vector
subject, size_t index) { 220 const Char pattern_first_char = pattern[0]; 221 const size_t max_n = (subject.length() - pattern.length() + 1); 222 223 // For speed, search for the more `rare` of the two bytes in pattern[0] 224 // using memchr / memrchr (which are much faster than a simple for loop). 225 const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 226 size_t pos = index; 227 do { 228 const size_t bytes_to_search = (max_n - pos) * sizeof(Char); 229 const void* void_pos; 230 if (subject.forward()) { 231 // Assert that bytes_to_search won't overflow 232 CHECK_LE(pos, max_n); 233 CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); 234 void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); 235 } else { 236 CHECK_LE(pos, subject.length()); 237 CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); 238 void_pos = MemrchrFill(subject.start() + pattern.length() - 1, 239 search_byte, 240 bytes_to_search); 241 } 242 const Char* char_pos = static_cast
(void_pos); 243 if (char_pos == nullptr) 244 return subject.length(); 245 246 // Then, for each match, verify that the full two bytes match pattern[0]. 247 char_pos = AlignDown(char_pos, sizeof(Char)); 248 size_t raw_pos = static_cast
(char_pos - subject.start()); 249 pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); 250 if (subject[pos] == pattern_first_char) { 251 // Match found, hooray. 252 return pos; 253 } 254 // Search byte matched, but the other byte of pattern[0] didn't. Keep going. 255 } while (++pos < max_n); 256 257 return subject.length(); 258 } 259 260 261 // Finds the first occurrence of the byte pattern[0] in string `subject`. 262 // Does not verify that the whole pattern matches. 263 template <> 264 inline size_t FindFirstCharacter(Vector
pattern, 265 Vector
subject, 266 size_t index) { 267 const uint8_t pattern_first_char = pattern[0]; 268 const size_t subj_len = subject.length(); 269 const size_t max_n = (subject.length() - pattern.length() + 1); 270 271 const void* pos; 272 if (subject.forward()) { 273 pos = memchr(subject.start() + index, pattern_first_char, max_n - index); 274 } else { 275 pos = MemrchrFill(subject.start() + pattern.length() - 1, 276 pattern_first_char, 277 max_n - index); 278 } 279 const uint8_t* char_pos = static_cast
(pos); 280 if (char_pos == nullptr) { 281 return subj_len; 282 } 283 284 size_t raw_pos = static_cast
(char_pos - subject.start()); 285 return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); 286 } 287 288 //--------------------------------------------------------------------- 289 // Single Character Pattern Search Strategy 290 //--------------------------------------------------------------------- 291 292 template
293 size_t StringSearch
::SingleCharSearch( 294 Vector subject, 295 size_t index) { 296 CHECK_EQ(1, pattern_.length()); 297 return FindFirstCharacter(pattern_, subject, index); 298 } 299 300 //--------------------------------------------------------------------- 301 // Linear Search Strategy 302 //--------------------------------------------------------------------- 303 304 // Simple linear search for short patterns. Never bails out. 305 template
306 size_t StringSearch
::LinearSearch( 307 Vector subject, 308 size_t index) { 309 CHECK_GT(pattern_.length(), 1); 310 const size_t n = subject.length() - pattern_.length(); 311 for (size_t i = index; i <= n; i++) { 312 i = FindFirstCharacter(pattern_, subject, i); 313 if (i == subject.length()) 314 return subject.length(); 315 CHECK_LE(i, n); 316 317 bool matches = true; 318 for (size_t j = 1; j < pattern_.length(); j++) { 319 if (pattern_[j] != subject[i + j]) { 320 matches = false; 321 break; 322 } 323 } 324 if (matches) { 325 return i; 326 } 327 } 328 return subject.length(); 329 } 330 331 //--------------------------------------------------------------------- 332 // Boyer-Moore string search 333 //--------------------------------------------------------------------- 334 335 template
336 size_t StringSearch
::BoyerMooreSearch( 337 Vector subject, 338 size_t start_index) { 339 const size_t subject_length = subject.length(); 340 const size_t pattern_length = pattern_.length(); 341 // Only preprocess at most kBMMaxShift last characters of pattern. 342 size_t start = start_; 343 344 int* bad_char_occurrence = bad_char_shift_table_; 345 int* good_suffix_shift = good_suffix_shift_table_ - start_; 346 347 Char last_char = pattern_[pattern_length - 1]; 348 size_t index = start_index; 349 // Continue search from i. 350 while (index <= subject_length - pattern_length) { 351 size_t j = pattern_length - 1; 352 int c; 353 while (last_char != (c = subject[index + j])) { 354 int shift = j - CharOccurrence(bad_char_occurrence, c); 355 index += shift; 356 if (index > subject_length - pattern_length) { 357 return subject.length(); 358 } 359 } 360 while (pattern_[j] == (c = subject[index + j])) { 361 if (j == 0) { 362 return index; 363 } 364 j--; 365 } 366 if (j < start) { 367 // we have matched more than our tables allow us to be smart about. 368 // Fall back on BMH shift. 369 index += pattern_length - 1 - 370 CharOccurrence(bad_char_occurrence, last_char); 371 } else { 372 int gs_shift = good_suffix_shift[j + 1]; 373 int bc_occ = CharOccurrence(bad_char_occurrence, c); 374 int shift = j - bc_occ; 375 if (gs_shift > shift) { 376 shift = gs_shift; 377 } 378 index += shift; 379 } 380 } 381 382 return subject.length(); 383 } 384 385 template
386 void StringSearch
::PopulateBoyerMooreTable() { 387 const size_t pattern_length = pattern_.length(); 388 // Only look at the last kBMMaxShift characters of pattern (from start_ 389 // to pattern_length). 390 const size_t start = start_; 391 const size_t length = pattern_length - start; 392 393 // Biased tables so that we can use pattern indices as table indices, 394 // even if we only cover the part of the pattern from offset start. 395 int* shift_table = good_suffix_shift_table_ - start_; 396 int* suffix_table = suffix_table_ - start_; 397 398 // Initialize table. 399 for (size_t i = start; i < pattern_length; i++) { 400 shift_table[i] = length; 401 } 402 shift_table[pattern_length] = 1; 403 suffix_table[pattern_length] = pattern_length + 1; 404 405 if (pattern_length <= start) { 406 return; 407 } 408 409 // Find suffixes. 410 Char last_char = pattern_[pattern_length - 1]; 411 size_t suffix = pattern_length + 1; 412 { 413 size_t i = pattern_length; 414 while (i > start) { 415 Char c = pattern_[i - 1]; 416 while (suffix <= pattern_length && c != pattern_[suffix - 1]) { 417 if (static_cast
(shift_table[suffix]) == length) { 418 shift_table[suffix] = suffix - i; 419 } 420 suffix = suffix_table[suffix]; 421 } 422 suffix_table[--i] = --suffix; 423 if (suffix == pattern_length) { 424 // No suffix to extend, so we check against last_char only. 425 while ((i > start) && (pattern_[i - 1] != last_char)) { 426 if (static_cast
(shift_table[pattern_length]) == length) { 427 shift_table[pattern_length] = pattern_length - i; 428 } 429 suffix_table[--i] = pattern_length; 430 } 431 if (i > start) { 432 suffix_table[--i] = --suffix; 433 } 434 } 435 } 436 } 437 // Build shift table using suffixes. 438 if (suffix < pattern_length) { 439 for (size_t i = start; i <= pattern_length; i++) { 440 if (static_cast
(shift_table[i]) == length) { 441 shift_table[i] = suffix - start; 442 } 443 if (i == suffix) { 444 suffix = suffix_table[suffix]; 445 } 446 } 447 } 448 } 449 450 //--------------------------------------------------------------------- 451 // Boyer-Moore-Horspool string search. 452 //--------------------------------------------------------------------- 453 454 template
455 size_t StringSearch
::BoyerMooreHorspoolSearch( 456 Vector subject, 457 size_t start_index) { 458 const size_t subject_length = subject.length(); 459 const size_t pattern_length = pattern_.length(); 460 int* char_occurrences = bad_char_shift_table_; 461 int64_t badness = -static_cast
(pattern_length); 462 463 // How bad we are doing without a good-suffix table. 464 Char last_char = pattern_[pattern_length - 1]; 465 int last_char_shift = 466 pattern_length - 1 - 467 CharOccurrence(char_occurrences, last_char); 468 469 // Perform search 470 size_t index = start_index; // No matches found prior to this index. 471 while (index <= subject_length - pattern_length) { 472 size_t j = pattern_length - 1; 473 int subject_char; 474 while (last_char != (subject_char = subject[index + j])) { 475 int bc_occ = CharOccurrence(char_occurrences, subject_char); 476 int shift = j - bc_occ; 477 index += shift; 478 badness += 1 - shift; // at most zero, so badness cannot increase. 479 if (index > subject_length - pattern_length) { 480 return subject_length; 481 } 482 } 483 j--; 484 while (pattern_[j] == (subject[index + j])) { 485 if (j == 0) { 486 return index; 487 } 488 j--; 489 } 490 index += last_char_shift; 491 // Badness increases by the number of characters we have 492 // checked, and decreases by the number of characters we 493 // can skip by shifting. It's a measure of how we are doing 494 // compared to reading each character exactly once. 495 badness += (pattern_length - j) - last_char_shift; 496 if (badness > 0) { 497 PopulateBoyerMooreTable(); 498 strategy_ = SearchStrategy::kBoyerMoore; 499 return BoyerMooreSearch(subject, index); 500 } 501 } 502 return subject.length(); 503 } 504 505 template
506 void StringSearch
::PopulateBoyerMooreHorspoolTable() { 507 const size_t pattern_length = pattern_.length(); 508 509 int* bad_char_occurrence = bad_char_shift_table_; 510 511 // Only preprocess at most kBMMaxShift last characters of pattern. 512 const size_t start = start_; 513 // Run forwards to populate bad_char_table, so that *last* instance 514 // of character equivalence class is the one registered. 515 // Notice: Doesn't include the last character. 516 const size_t table_size = AlphabetSize(); 517 if (start == 0) { 518 // All patterns less than kBMMaxShift in length. 519 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); 520 } else { 521 for (size_t i = 0; i < table_size; i++) { 522 bad_char_occurrence[i] = start - 1; 523 } 524 } 525 for (size_t i = start; i < pattern_length - 1; i++) { 526 Char c = pattern_[i]; 527 int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); 528 bad_char_occurrence[bucket] = i; 529 } 530 } 531 532 //--------------------------------------------------------------------- 533 // Linear string search with bailout to BMH. 534 //--------------------------------------------------------------------- 535 536 // Simple linear search for short patterns, which bails out if the string 537 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 538 template
539 size_t StringSearch
::InitialSearch( 540 Vector subject, 541 size_t index) { 542 const size_t pattern_length = pattern_.length(); 543 // Badness is a count of how much work we have done. When we have 544 // done enough work we decide it's probably worth switching to a better 545 // algorithm. 546 int64_t badness = -10 - (pattern_length << 2); 547 548 // We know our pattern is at least 2 characters, we cache the first so 549 // the common case of the first character not matching is faster. 550 for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { 551 badness++; 552 if (badness <= 0) { 553 i = FindFirstCharacter(pattern_, subject, i); 554 if (i == subject.length()) 555 return subject.length(); 556 CHECK_LE(i, n); 557 size_t j = 1; 558 do { 559 if (pattern_[j] != subject[i + j]) { 560 break; 561 } 562 j++; 563 } while (j < pattern_length); 564 if (j == pattern_length) { 565 return i; 566 } 567 badness += j; 568 } else { 569 PopulateBoyerMooreHorspoolTable(); 570 strategy_ = SearchStrategy::kBoyerMooreHorspool; 571 return BoyerMooreHorspoolSearch(subject, i); 572 } 573 } 574 return subject.length(); 575 } 576 577 // Perform a single stand-alone search. 578 // If searching multiple times for the same pattern, a search 579 // object should be constructed once and the Search function then called 580 // for each search. 581 template
582 size_t SearchString(Vector
subject, 583 Vector
pattern, 584 size_t start_index) { 585 StringSearch
search(pattern); 586 return search.Search(subject, start_index); 587 } 588 } // namespace stringsearch 589 } // namespace node 590 591 namespace node { 592 593 template
594 size_t SearchString(const Char* haystack, 595 size_t haystack_length, 596 const Char* needle, 597 size_t needle_length, 598 size_t start_index, 599 bool is_forward) { 600 if (haystack_length < needle_length) return haystack_length; 601 // To do a reverse search (lastIndexOf instead of indexOf) without redundant 602 // code, create two vectors that are reversed views into the input strings. 603 // For example, v_needle[0] would return the *last* character of the needle. 604 // So we're searching for the first instance of rev(needle) in rev(haystack) 605 stringsearch::Vector
v_needle(needle, needle_length, is_forward); 606 stringsearch::Vector
v_haystack( 607 haystack, haystack_length, is_forward); 608 size_t diff = haystack_length - needle_length; 609 size_t relative_start_index; 610 if (is_forward) { 611 relative_start_index = start_index; 612 } else if (diff < start_index) { 613 relative_start_index = 0; 614 } else { 615 relative_start_index = diff - start_index; 616 } 617 size_t pos = node::stringsearch::SearchString( 618 v_haystack, v_needle, relative_start_index); 619 if (pos == haystack_length) { 620 // not found 621 return pos; 622 } 623 return is_forward ? pos : (haystack_length - needle_length - pos); 624 } 625 626 template
627 size_t SearchString(const char* haystack, size_t haystack_length, 628 const char (&needle)[N]) { 629 return SearchString( 630 reinterpret_cast
(haystack), haystack_length, 631 reinterpret_cast
(needle), N - 1, 0, true); 632 } 633 634 } // namespace node 635 636 #endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS 637 638 #endif // SRC_STRING_SEARCH_H_
Contact us
|
About us
|
Term of use
|
Copyright © 2000-2024 MyWebUniversity.com ™