string_util_internal.h 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. // Copyright 2020 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_STRINGS_STRING_UTIL_INTERNAL_H_
  5. #define BASE_STRINGS_STRING_UTIL_INTERNAL_H_
  6. #include "base/strings/string_piece.h"
  7. #include "base/third_party/icu/icu_utf.h"
  8. namespace base {
  9. namespace internal {
  // Records where one substituted parameter landed in the output string.
  // ReplaceStringPlaceholders uses these records to report replacement
  // positions back to its caller.
  struct ReplacementOffset {
    // Index of the parameter that was substituted.
    uintptr_t parameter;

    // Position in the string at which the substituted text starts.
    size_t offset;

    // Builds a record for |parameter| starting at |offset|.
    ReplacementOffset(uintptr_t parameter, size_t offset)
        : parameter{parameter}, offset{offset} {}
  };
  20. static bool CompareParameter(const ReplacementOffset& elem1,
  21. const ReplacementOffset& elem2) {
  22. return elem1.parameter < elem2.parameter;
  23. }
  // A pointer is assumed to be one "machine word" wide, so uintptr_t serves as
  // the integer type of machine-word size.
  using MachineWord = uintptr_t;

  // Returns true when |pointer|'s address is a multiple of sizeof(MachineWord).
  inline bool IsMachineWordAligned(const void* pointer) {
    constexpr MachineWord kAlignmentMask = sizeof(MachineWord) - 1;
    const MachineWord address = reinterpret_cast<MachineWord>(pointer);
    return (address & kAlignmentMask) == 0;
  }
  30. template <typename StringType>
  31. StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) {
  32. StringType ret;
  33. ret.reserve(str.size());
  34. for (size_t i = 0; i < str.size(); i++)
  35. ret.push_back(ToLowerASCII(str[i]));
  36. return ret;
  37. }
  38. template <typename StringType>
  39. StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) {
  40. StringType ret;
  41. ret.reserve(str.size());
  42. for (size_t i = 0; i < str.size(); i++)
  43. ret.push_back(ToUpperASCII(str[i]));
  44. return ret;
  45. }
  46. template <class StringType>
  47. int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a,
  48. BasicStringPiece<StringType> b) {
  49. // Find the first characters that aren't equal and compare them. If the end
  50. // of one of the strings is found before a nonequal character, the lengths
  51. // of the strings are compared.
  52. size_t i = 0;
  53. while (i < a.length() && i < b.length()) {
  54. typename StringType::value_type lower_a = ToLowerASCII(a[i]);
  55. typename StringType::value_type lower_b = ToLowerASCII(b[i]);
  56. if (lower_a < lower_b)
  57. return -1;
  58. if (lower_a > lower_b)
  59. return 1;
  60. i++;
  61. }
  62. // End of one string hit before finding a different character. Expect the
  63. // common case to be "strings equal" at this point so check that first.
  64. if (a.length() == b.length())
  65. return 0;
  66. if (a.length() < b.length())
  67. return -1;
  68. return 1;
  69. }
  70. template <typename Str>
  71. TrimPositions TrimStringT(BasicStringPiece<Str> input,
  72. BasicStringPiece<Str> trim_chars,
  73. TrimPositions positions,
  74. Str* output) {
  75. // Find the edges of leading/trailing whitespace as desired. Need to use
  76. // a StringPiece version of input to be able to call find* on it with the
  77. // StringPiece version of trim_chars (normally the trim_chars will be a
  78. // constant so avoid making a copy).
  79. const size_t last_char = input.length() - 1;
  80. const size_t first_good_char =
  81. (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
  82. const size_t last_good_char = (positions & TRIM_TRAILING)
  83. ? input.find_last_not_of(trim_chars)
  84. : last_char;
  85. // When the string was all trimmed, report that we stripped off characters
  86. // from whichever position the caller was interested in. For empty input, we
  87. // stripped no characters, but we still need to clear |output|.
  88. if (input.empty() || first_good_char == Str::npos ||
  89. last_good_char == Str::npos) {
  90. bool input_was_empty = input.empty(); // in case output == &input
  91. output->clear();
  92. return input_was_empty ? TRIM_NONE : positions;
  93. }
  94. // Trim.
  95. output->assign(input.data() + first_good_char,
  96. last_good_char - first_good_char + 1);
  97. // Return where we trimmed from.
  98. return static_cast<TrimPositions>(
  99. (first_good_char == 0 ? TRIM_NONE : TRIM_LEADING) |
  100. (last_good_char == last_char ? TRIM_NONE : TRIM_TRAILING));
  101. }
  102. template <typename Str>
  103. BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input,
  104. BasicStringPiece<Str> trim_chars,
  105. TrimPositions positions) {
  106. size_t begin =
  107. (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
  108. size_t end = (positions & TRIM_TRAILING)
  109. ? input.find_last_not_of(trim_chars) + 1
  110. : input.size();
  111. return input.substr(begin, end - begin);
  112. }
  113. template <typename STR>
  114. STR CollapseWhitespaceT(BasicStringPiece<STR> text,
  115. bool trim_sequences_with_line_breaks) {
  116. STR result;
  117. result.resize(text.size());
  118. // Set flags to pretend we're already in a trimmed whitespace sequence, so we
  119. // will trim any leading whitespace.
  120. bool in_whitespace = true;
  121. bool already_trimmed = true;
  122. int chars_written = 0;
  123. for (auto c : text) {
  124. if (IsUnicodeWhitespace(c)) {
  125. if (!in_whitespace) {
  126. // Reduce all whitespace sequences to a single space.
  127. in_whitespace = true;
  128. result[chars_written++] = L' ';
  129. }
  130. if (trim_sequences_with_line_breaks && !already_trimmed &&
  131. ((c == '\n') || (c == '\r'))) {
  132. // Whitespace sequences containing CR or LF are eliminated entirely.
  133. already_trimmed = true;
  134. --chars_written;
  135. }
  136. } else {
  137. // Non-whitespace characters are copied straight across.
  138. in_whitespace = false;
  139. already_trimmed = false;
  140. result[chars_written++] = c;
  141. }
  142. }
  143. if (in_whitespace && !already_trimmed) {
  144. // Any trailing whitespace is eliminated.
  145. --chars_written;
  146. }
  147. result.resize(chars_written);
  148. return result;
  149. }
  150. template <class Char>
  151. bool DoIsStringASCII(const Char* characters, size_t length) {
  152. // Bitmasks to detect non ASCII characters for character sizes of 8, 16 and 32
  153. // bits.
  154. constexpr MachineWord NonASCIIMasks[] = {
  155. 0, MachineWord(0x8080808080808080ULL), MachineWord(0xFF80FF80FF80FF80ULL),
  156. 0, MachineWord(0xFFFFFF80FFFFFF80ULL),
  157. };
  158. if (!length)
  159. return true;
  160. constexpr MachineWord non_ascii_bit_mask = NonASCIIMasks[sizeof(Char)];
  161. static_assert(non_ascii_bit_mask, "Error: Invalid Mask");
  162. MachineWord all_char_bits = 0;
  163. const Char* end = characters + length;
  164. // Prologue: align the input.
  165. while (!IsMachineWordAligned(characters) && characters < end)
  166. all_char_bits |= *characters++;
  167. if (all_char_bits & non_ascii_bit_mask)
  168. return false;
  169. // Compare the values of CPU word size.
  170. constexpr size_t chars_per_word = sizeof(MachineWord) / sizeof(Char);
  171. constexpr int batch_count = 16;
  172. while (characters <= end - batch_count * chars_per_word) {
  173. all_char_bits = 0;
  174. for (int i = 0; i < batch_count; ++i) {
  175. all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
  176. characters += chars_per_word;
  177. }
  178. if (all_char_bits & non_ascii_bit_mask)
  179. return false;
  180. }
  181. // Process the remaining words.
  182. all_char_bits = 0;
  183. while (characters <= end - chars_per_word) {
  184. all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
  185. characters += chars_per_word;
  186. }
  187. // Process the remaining bytes.
  188. while (characters < end)
  189. all_char_bits |= *characters++;
  190. return !(all_char_bits & non_ascii_bit_mask);
  191. }
  192. template <bool (*Validator)(uint32_t)>
  193. inline static bool DoIsStringUTF8(StringPiece str) {
  194. const char* src = str.data();
  195. int32_t src_len = static_cast<int32_t>(str.length());
  196. int32_t char_index = 0;
  197. while (char_index < src_len) {
  198. int32_t code_point;
  199. CBU8_NEXT(src, char_index, src_len, code_point);
  200. if (!Validator(code_point))
  201. return false;
  202. }
  203. return true;
  204. }
  205. // Implementation note: Normally this function will be called with a hardcoded
  206. // constant for the lowercase_ascii parameter. Constructing a StringPiece from
  207. // a C constant requires running strlen, so the result will be two passes
  208. // through the buffers, one to file the length of lowercase_ascii, and one to
  209. // compare each letter.
  210. //
  211. // This function could have taken a const char* to avoid this and only do one
  212. // pass through the string. But the strlen is faster than the case-insensitive
  213. // compares and lets us early-exit in the case that the strings are different
  214. // lengths (will often be the case for non-matches). So whether one approach or
  215. // the other will be faster depends on the case.
  216. //
  217. // The hardcoded strings are typically very short so it doesn't matter, and the
  218. // string piece gives additional flexibility for the caller (doesn't have to be
  219. // null terminated) so we choose the StringPiece route.
  220. template <typename Str>
  221. static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str,
  222. StringPiece lowercase_ascii) {
  223. return std::equal(
  224. str.begin(), str.end(), lowercase_ascii.begin(), lowercase_ascii.end(),
  225. [](auto lhs, auto rhs) { return ToLowerASCII(lhs) == rhs; });
  226. }
  227. template <typename Str>
  228. bool StartsWithT(BasicStringPiece<Str> str,
  229. BasicStringPiece<Str> search_for,
  230. CompareCase case_sensitivity) {
  231. if (search_for.size() > str.size())
  232. return false;
  233. BasicStringPiece<Str> source = str.substr(0, search_for.size());
  234. switch (case_sensitivity) {
  235. case CompareCase::SENSITIVE:
  236. return source == search_for;
  237. case CompareCase::INSENSITIVE_ASCII:
  238. return std::equal(
  239. search_for.begin(), search_for.end(), source.begin(),
  240. CaseInsensitiveCompareASCII<typename Str::value_type>());
  241. default:
  242. NOTREACHED();
  243. return false;
  244. }
  245. }
  246. template <typename Str>
  247. bool EndsWithT(BasicStringPiece<Str> str,
  248. BasicStringPiece<Str> search_for,
  249. CompareCase case_sensitivity) {
  250. if (search_for.size() > str.size())
  251. return false;
  252. BasicStringPiece<Str> source =
  253. str.substr(str.size() - search_for.size(), search_for.size());
  254. switch (case_sensitivity) {
  255. case CompareCase::SENSITIVE:
  256. return source == search_for;
  257. case CompareCase::INSENSITIVE_ASCII:
  258. return std::equal(
  259. source.begin(), source.end(), search_for.begin(),
  260. CaseInsensitiveCompareASCII<typename Str::value_type>());
  261. default:
  262. NOTREACHED();
  263. return false;
  264. }
  265. }
  266. // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
  267. template <class StringType>
  268. struct SubstringMatcher {
  269. BasicStringPiece<StringType> find_this;
  270. size_t Find(const StringType& input, size_t pos) {
  271. return input.find(find_this.data(), pos, find_this.length());
  272. }
  273. size_t MatchSize() { return find_this.length(); }
  274. };
  275. // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
  276. template <class StringType>
  277. struct CharacterMatcher {
  278. BasicStringPiece<StringType> find_any_of_these;
  279. size_t Find(const StringType& input, size_t pos) {
  280. return input.find_first_of(find_any_of_these.data(), pos,
  281. find_any_of_these.length());
  282. }
  283. constexpr size_t MatchSize() { return 1; }
  284. };
  // Selects how many matches a replace operation rewrites.
  enum class ReplaceType {
    // Rewrite every match.
    REPLACE_ALL,
    // Rewrite only the first match.
    REPLACE_FIRST,
  };
  286. // Runs in O(n) time in the length of |str|, and transforms the string without
  287. // reallocating when possible. Returns |true| if any matches were found.
  288. //
  289. // This is parameterized on a |Matcher| traits type, so that it can be the
  290. // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset().
  291. template <class StringType, class Matcher>
  292. bool DoReplaceMatchesAfterOffset(StringType* str,
  293. size_t initial_offset,
  294. Matcher matcher,
  295. BasicStringPiece<StringType> replace_with,
  296. ReplaceType replace_type) {
  297. using CharTraits = typename StringType::traits_type;
  298. const size_t find_length = matcher.MatchSize();
  299. if (!find_length)
  300. return false;
  301. // If the find string doesn't appear, there's nothing to do.
  302. size_t first_match = matcher.Find(*str, initial_offset);
  303. if (first_match == StringType::npos)
  304. return false;
  305. // If we're only replacing one instance, there's no need to do anything
  306. // complicated.
  307. const size_t replace_length = replace_with.length();
  308. if (replace_type == ReplaceType::REPLACE_FIRST) {
  309. str->replace(first_match, find_length, replace_with.data(), replace_length);
  310. return true;
  311. }
  312. // If the find and replace strings are the same length, we can simply use
  313. // replace() on each instance, and finish the entire operation in O(n) time.
  314. if (find_length == replace_length) {
  315. auto* buffer = &((*str)[0]);
  316. for (size_t offset = first_match; offset != StringType::npos;
  317. offset = matcher.Find(*str, offset + replace_length)) {
  318. CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
  319. }
  320. return true;
  321. }
  322. // Since the find and replace strings aren't the same length, a loop like the
  323. // one above would be O(n^2) in the worst case, as replace() will shift the
  324. // entire remaining string each time. We need to be more clever to keep things
  325. // O(n).
  326. //
  327. // When the string is being shortened, it's possible to just shift the matches
  328. // down in one pass while finding, and truncate the length at the end of the
  329. // search.
  330. //
  331. // If the string is being lengthened, more work is required. The strategy used
  332. // here is to make two find() passes through the string. The first pass counts
  333. // the number of matches to determine the new size. The second pass will
  334. // either construct the new string into a new buffer (if the existing buffer
  335. // lacked capacity), or else -- if there is room -- create a region of scratch
  336. // space after |first_match| by shifting the tail of the string to a higher
  337. // index, and doing in-place moves from the tail to lower indices thereafter.
  338. size_t str_length = str->length();
  339. size_t expansion = 0;
  340. if (replace_length > find_length) {
  341. // This operation lengthens the string; determine the new length by counting
  342. // matches.
  343. const size_t expansion_per_match = (replace_length - find_length);
  344. size_t num_matches = 0;
  345. for (size_t match = first_match; match != StringType::npos;
  346. match = matcher.Find(*str, match + find_length)) {
  347. expansion += expansion_per_match;
  348. ++num_matches;
  349. }
  350. const size_t final_length = str_length + expansion;
  351. if (str->capacity() < final_length) {
  352. // If we'd have to allocate a new buffer to grow the string, build the
  353. // result directly into the new allocation via append().
  354. StringType src(str->get_allocator());
  355. str->swap(src);
  356. str->reserve(final_length);
  357. size_t pos = 0;
  358. for (size_t match = first_match;; match = matcher.Find(src, pos)) {
  359. str->append(src, pos, match - pos);
  360. str->append(replace_with.data(), replace_length);
  361. pos = match + find_length;
  362. // A mid-loop test/break enables skipping the final Find() call; the
  363. // number of matches is known, so don't search past the last one.
  364. if (!--num_matches)
  365. break;
  366. }
  367. // Handle substring after the final match.
  368. str->append(src, pos, str_length - pos);
  369. return true;
  370. }
  371. // Prepare for the copy/move loop below -- expand the string to its final
  372. // size by shifting the data after the first match to the end of the resized
  373. // string.
  374. size_t shift_src = first_match + find_length;
  375. size_t shift_dst = shift_src + expansion;
  376. // Big |expansion| factors (relative to |str_length|) require padding up to
  377. // |shift_dst|.
  378. if (shift_dst > str_length)
  379. str->resize(shift_dst);
  380. str->replace(shift_dst, str_length - shift_src, *str, shift_src,
  381. str_length - shift_src);
  382. str_length = final_length;
  383. }
  384. // We can alternate replacement and move operations. This won't overwrite the
  385. // unsearched region of the string so long as |write_offset| <= |read_offset|;
  386. // that condition is always satisfied because:
  387. //
  388. // (a) If the string is being shortened, |expansion| is zero and
  389. // |write_offset| grows slower than |read_offset|.
  390. //
  391. // (b) If the string is being lengthened, |write_offset| grows faster than
  392. // |read_offset|, but |expansion| is big enough so that |write_offset|
  393. // will only catch up to |read_offset| at the point of the last match.
  394. auto* buffer = &((*str)[0]);
  395. size_t write_offset = first_match;
  396. size_t read_offset = first_match + expansion;
  397. do {
  398. if (replace_length) {
  399. CharTraits::copy(buffer + write_offset, replace_with.data(),
  400. replace_length);
  401. write_offset += replace_length;
  402. }
  403. read_offset += find_length;
  404. // min() clamps StringType::npos (the largest unsigned value) to str_length.
  405. size_t match = std::min(matcher.Find(*str, read_offset), str_length);
  406. size_t length = match - read_offset;
  407. if (length) {
  408. CharTraits::move(buffer + write_offset, buffer + read_offset, length);
  409. write_offset += length;
  410. read_offset += length;
  411. }
  412. } while (read_offset < str_length);
  413. // If we're shortening the string, truncate it now.
  414. str->resize(write_offset);
  415. return true;
  416. }
  417. template <class StringType>
  418. bool ReplaceCharsT(BasicStringPiece<StringType> input,
  419. BasicStringPiece<StringType> find_any_of_these,
  420. BasicStringPiece<StringType> replace_with,
  421. StringType* output) {
  422. // Commonly, this is called with output and input being the same string; in
  423. // that case, skip the copy.
  424. if (input.data() != output->data() || input.size() != output->size())
  425. output->assign(input.data(), input.size());
  426. return DoReplaceMatchesAfterOffset(
  427. output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with,
  428. ReplaceType::REPLACE_ALL);
  429. }
  430. template <class string_type>
  431. inline typename string_type::value_type* WriteIntoT(string_type* str,
  432. size_t length_with_null) {
  433. DCHECK_GE(length_with_null, 1u);
  434. str->reserve(length_with_null);
  435. str->resize(length_with_null - 1);
  436. return &((*str)[0]);
  437. }
  438. // Generic version for all JoinString overloads. |list_type| must be a sequence
  439. // (base::span or std::initializer_list) of strings/StringPieces (std::string,
  440. // string16, StringPiece or StringPiece16). |string_type| is either std::string
  441. // or string16.
  442. template <typename list_type, typename string_type>
  443. static string_type JoinStringT(list_type parts,
  444. BasicStringPiece<string_type> sep) {
  445. if (base::empty(parts))
  446. return string_type();
  447. // Pre-allocate the eventual size of the string. Start with the size of all of
  448. // the separators (note that this *assumes* parts.size() > 0).
  449. size_t total_size = (parts.size() - 1) * sep.size();
  450. for (const auto& part : parts)
  451. total_size += part.size();
  452. string_type result;
  453. result.reserve(total_size);
  454. auto iter = parts.begin();
  455. DCHECK(iter != parts.end());
  456. result.append(iter->data(), iter->size());
  457. ++iter;
  458. for (; iter != parts.end(); ++iter) {
  459. result.append(sep.data(), sep.size());
  460. result.append(iter->data(), iter->size());
  461. }
  462. // Sanity-check that we pre-allocated correctly.
  463. DCHECK_EQ(total_size, result.size());
  464. return result;
  465. }
  466. template <class StringType>
  467. StringType DoReplaceStringPlaceholders(
  468. BasicStringPiece<StringType> format_string,
  469. const std::vector<StringType>& subst,
  470. std::vector<size_t>* offsets) {
  471. size_t substitutions = subst.size();
  472. DCHECK_LT(substitutions, 10U);
  473. size_t sub_length = 0;
  474. for (const auto& cur : subst)
  475. sub_length += cur.length();
  476. StringType formatted;
  477. formatted.reserve(format_string.length() + sub_length);
  478. std::vector<ReplacementOffset> r_offsets;
  479. for (auto i = format_string.begin(); i != format_string.end(); ++i) {
  480. if ('$' == *i) {
  481. if (i + 1 != format_string.end()) {
  482. ++i;
  483. if ('$' == *i) {
  484. while (i != format_string.end() && '$' == *i) {
  485. formatted.push_back('$');
  486. ++i;
  487. }
  488. --i;
  489. } else {
  490. if (*i < '1' || *i > '9') {
  491. DLOG(ERROR) << "Invalid placeholder: $" << *i;
  492. continue;
  493. }
  494. uintptr_t index = *i - '1';
  495. if (offsets) {
  496. ReplacementOffset r_offset(index,
  497. static_cast<int>(formatted.size()));
  498. r_offsets.insert(
  499. std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
  500. &CompareParameter),
  501. r_offset);
  502. }
  503. if (index < substitutions)
  504. formatted.append(subst.at(index));
  505. }
  506. }
  507. } else {
  508. formatted.push_back(*i);
  509. }
  510. }
  511. if (offsets) {
  512. for (const auto& cur : r_offsets)
  513. offsets->push_back(cur.offset);
  514. }
  515. return formatted;
  516. }
  // Bounded string copy compatible with the OpenBSD lcpy interface. See:
  // http://www.gratisoft.us/todd/papers/strlcpy.html
  // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
  //
  // Copies |src| into |dst|, writing at most |dst_size| characters including
  // the terminator, and returns the length of |src| in characters. When
  // |dst_size| is zero nothing is written to |dst|.
  template <typename CHAR>
  size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
    size_t copied = 0;
    while (copied < dst_size) {
      const CHAR ch = src[copied];
      dst[copied] = ch;
      if (ch == 0)  // The terminator was reached and copied.
        return copied;
      ++copied;
    }

    // |dst| filled up before |src| ended, so the last character written was
    // not a terminator. Overwrite it with one.
    if (dst_size != 0)
      dst[dst_size - 1] = 0;

    // Measure the rest of |src| so that its full length can be returned.
    size_t src_length = dst_size;
    while (src[src_length])
      ++src_length;
    return src_length;
  }
  534. } // namespace internal
  535. } // namespace base
  536. #endif // BASE_STRINGS_STRING_UTIL_INTERNAL_H_