escape.h 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. // Copyright 2020 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_STRINGS_ESCAPE_H_
  5. #define BASE_STRINGS_ESCAPE_H_
  6. #include <stdint.h>
  7. #include <set>
  8. #include <string>
  9. #include "base/base_export.h"
  10. #include "base/strings/string16.h"
  11. #include "base/strings/string_piece.h"
  12. #include "base/strings/utf_offset_string_conversions.h"
  13. namespace base {
  // Holder for the bit flags that tell the unescaping functions which escape
  // sequences they may decode.
  class UnescapeRule {
   public:
    // Bitmask type: a combination of the flags below, as accepted by the
    // unescaping functions.
    using Type = uint32_t;

    enum {
      // Leave the input fully escaped; nothing is decoded.
      NONE = 0,

      // Perform only the ordinary unescaping, with no special cases enabled.
      // Escaped letters, digits, and most symbols are decoded in this mode.
      // It is a placeholder that stands for the absence of the special flags
      // and therefore is not meant to be combined with them; every other
      // unescape rule already implies "normal" on top of its own meaning.
      NORMAL = 1 << 0,

      // Turn %20 into a space. Useful in some places where a URL is shown to
      // the user, but not where the URL may be copied and pasted elsewhere,
      // because other applications might then not treat it as a single piece.
      // Other UTF-8 space characters are left escaped.
      SPACES = 1 << 1,

      // Decode '/' and '\\'. Doing so yields a URL that no longer matches the
      // source, and is dangerous for strings later used as file paths or file
      // names. Use only where slashes carry no special meaning, such as data
      // URLs.
      PATH_SEPARATORS = 1 << 2,

      // Decode characters that alter the meaning of a URL, including '%', '+',
      // '&', and '#'. Path separators are not covered by this flag. The result
      // no longer matches the source URL, so this is intended for producing
      // final output (for example, file names derived from URLs) that will not
      // be interpreted as a URL and should be unescaped as much as possible.
      URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS = 1 << 3,

      // URL queries encode a space as "+"; this flag controls whether "+" is
      // replaced with a space.
      REPLACE_PLUS_WITH_SPACE = 1 << 4,
    };
  };
  50. // Unescapes |escaped_text| and returns the result.
  51. // Unescaping consists of looking for the exact pattern "%XX", where each X is
  52. // a hex digit, and converting to the character with the numerical value of
  53. // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;", if the
  54. // "UnescapeRule::SPACES" used.
  55. //
  56. // This method does not ensure that the output is a valid string using any
  57. // character encoding. However, it does leave escaped certain byte sequences
  58. // that would be dangerous to display to the user, because if interpreted as
  59. // UTF-8, they could be used to mislead the user. Callers that want to
  60. // unconditionally unescape everything for uses other than displaying data to
  61. // the user should use UnescapeBinaryURLComponent().
  62. BASE_EXPORT std::string UnescapeURLComponent(StringPiece escaped_text,
  63. UnescapeRule::Type rules);
  64. // Unescapes the given substring as a URL, and then tries to interpret the
  65. // result as being encoded as UTF-8. If the result is convertible into UTF-8, it
  66. // will be returned as converted. If it is not, the original escaped string will
  67. // be converted into a string16 and returned. |adjustments| provides
  68. // information on how the original string was adjusted to get the string
  69. // returned.
  70. BASE_EXPORT string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments(
  71. StringPiece text,
  72. UnescapeRule::Type rules,
  73. OffsetAdjuster::Adjustments* adjustments);
  74. // Unescapes a component of a URL for use as binary data. Unlike
  75. // UnescapeURLComponent, leaves nothing unescaped, including nulls, invalid
  76. // characters, characters that are unsafe to display, etc. This should *not*
  77. // be used when displaying the decoded data to the user.
  78. //
  79. // Only the NORMAL and REPLACE_PLUS_WITH_SPACE rules are allowed.
  80. BASE_EXPORT std::string UnescapeBinaryURLComponent(
  81. StringPiece escaped_text,
  82. UnescapeRule::Type rules = UnescapeRule::NORMAL);
  83. // Variant of UnescapeBinaryURLComponent(). Writes output to |unescaped_text|.
  84. // Returns true on success, returns false and clears |unescaped_text| on
  85. // failure. Fails on characters escaped that are unsafe to unescape in some
  86. // contexts, which are defined as characters "\0" through "\x1F" (Which includes
  87. // CRLF but not space), and optionally path separators. Path separators include
  88. // both forward and backward slashes on all platforms. Does not fail if any of
  89. // those characters appear unescaped in the input string.
  90. BASE_EXPORT bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
  91. bool fail_on_path_separators,
  92. std::string* unescaped_text);
  93. // Returns true if |escaped_text| contains any element of |bytes| in
  94. // percent-encoded form.
  95. //
  96. // For example, if |bytes| is {'%', '/'}, returns true if |escaped_text|
  97. // contains "%25" or "%2F", but not if it just contains bare '%' or '/'
  98. // characters.
  99. BASE_EXPORT bool ContainsEncodedBytes(StringPiece escaped_text,
  100. const std::set<unsigned char>& bytes);
  101. } // namespace base
  102. #endif // BASE_STRINGS_ESCAPE_H_