json_parser.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. // Copyright (c) 2012 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_JSON_JSON_PARSER_H_
  5. #define BASE_JSON_JSON_PARSER_H_
  6. #include <stddef.h>
  7. #include <stdint.h>
  8. #include <memory>
  9. #include <string>
  10. #include "base/base_export.h"
  11. #include "base/compiler_specific.h"
  12. #include "base/gtest_prod_util.h"
  13. #include "base/json/json_common.h"
  14. #include "base/macros.h"
  15. #include "base/optional.h"
  16. #include "base/strings/string_piece.h"
  17. #include "base/values.h"
  18. namespace base {
  19. class Value;  // Forward declaration. NOTE(review): base/values.h is already included above and presumably defines this type - confirm whether this line is still needed.
  20. namespace internal {
  21. class JSONParserTest;  // Forward declaration so JSONParser can name this class in its friend declaration.
  22. // The implementation behind the JSONReader interface. This class is not meant
  23. // to be used directly; it encapsulates logic that need not be exposed publicly.
  24. //
  25. // This parser guarantees O(n) time through the input string. Iteration happens
  26. // on the byte level, with the functions ConsumeChars() and ConsumeChar(). The
  27. // conversion from byte to JSON token happens without advancing the parser in
  28. // GetNextToken/ParseToken; that is, tokenization operates on the current parser
  29. // position without advancing.
  30. //
  31. // Built on top of these are a family of Consume functions that iterate
  32. // internally. Invariant: on entry of a Consume function, the parser is wound
  33. // to the first byte of a valid JSON token. On exit, it is on the first byte
  34. // after the token that was just consumed, which would likely be the first byte
  35. // of the next token.
  36. class BASE_EXPORT JSONParser {
  37. public:
  38. // Error codes during parsing.
      // The first two enumerators alias the generic base::ValueDeserializer codes;
      // every later enumerator takes the next consecutive value after
      // JSON_SYNTAX_ERROR.
  39. enum JsonParseError {
  40. JSON_NO_ERROR = base::ValueDeserializer::kErrorCodeNoError,
  41. JSON_SYNTAX_ERROR = base::ValueDeserializer::kErrorCodeInvalidFormat,
  42. JSON_INVALID_ESCAPE,
  43. JSON_UNEXPECTED_TOKEN,
  44. JSON_TRAILING_COMMA,
  45. JSON_TOO_MUCH_NESTING,
  46. JSON_UNEXPECTED_DATA_AFTER_ROOT,
  47. JSON_UNSUPPORTED_ENCODING,
  48. JSON_UNQUOTED_DICTIONARY_KEY,
  49. JSON_TOO_LARGE,
  50. JSON_UNREPRESENTABLE_NUMBER,
      // NOTE(review): appears to be an end-of-enum sentinel; new error codes
      // should presumably be added above it - confirm against users of this value.
  51. JSON_PARSE_ERROR_COUNT
  52. };
  53. // String versions of parse error codes.
      // One constant per error code above other than JSON_NO_ERROR
      // (kInputTooLarge pairs with JSON_TOO_LARGE). These are declarations only;
      // the definitions live outside this header.
  54. static const char kSyntaxError[];
  55. static const char kInvalidEscape[];
  56. static const char kUnexpectedToken[];
  57. static const char kTrailingComma[];
  58. static const char kTooMuchNesting[];
  59. static const char kUnexpectedDataAfterRoot[];
  60. static const char kUnsupportedEncoding[];
  61. static const char kUnquotedDictionaryKey[];
  62. static const char kInputTooLarge[];
  63. static const char kUnrepresentableNumber[];
      // Creates a parser. |options| presumably carries the base::JSONParserOptions
      // flags described at |options_|, and |max_depth| presumably bounds nesting
      // as described at |max_depth_| (TODO confirm in the .cc file). |max_depth|
      // defaults to kAbsoluteMaxDepth. The constructor is explicit, so an int is
      // never implicitly converted into a JSONParser.
  64. explicit JSONParser(int options, size_t max_depth = kAbsoluteMaxDepth);
  65. ~JSONParser();
  66. // Parses the input string according to the set options and returns the
  67. // result as a Value.
  68. // Wrap this in base::FooValue::From() to check the Value is of type Foo and
  69. // convert to a FooValue at the same time.
  70. Optional<Value> Parse(StringPiece input);
  71. // Returns the error code.
  72. JsonParseError error_code() const;
  73. // Returns the human-friendly error message.
  74. std::string GetErrorMessage() const;
  75. // Returns the error line number if parse error happened. Otherwise always
  76. // returns 0.
  77. int error_line() const;
  78. // Returns the error column number if parse error happened. Otherwise always
  79. // returns 0.
  80. int error_column() const;
  81. private:
      // Token kinds: the return type of GetNextToken() and the argument type of
      // ParseToken().
  82. enum Token {
  83. T_OBJECT_BEGIN, // {
  84. T_OBJECT_END, // }
  85. T_ARRAY_BEGIN, // [
  86. T_ARRAY_END, // ]
  87. T_STRING,
  88. T_NUMBER,
  89. T_BOOL_TRUE, // true
  90. T_BOOL_FALSE, // false
  91. T_NULL, // null
  92. T_LIST_SEPARATOR, // ,
  93. T_OBJECT_PAIR_SEPARATOR, // :
  94. T_END_OF_INPUT,
  95. T_INVALID_TOKEN,
  96. };
  97. // A helper class used for parsing strings. One optimization performed is to
  98. // create base::Value with a StringPiece to avoid unnecessary std::string
  99. // copies. This is not possible if the input string needs to be decoded from
  100. // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
  101. // This class centralizes that logic.
  102. class StringBuilder {
  103. public:
  104. // Empty constructor. Used for creating a builder with which to assign to.
  105. StringBuilder();
  106. // |pos| is the beginning of an input string, excluding the |"|.
  107. explicit StringBuilder(const char* pos);
  108. ~StringBuilder();
      // Move assignment; this is the only assignment operator the class declares.
  109. StringBuilder& operator=(StringBuilder&& other);
  110. // Appends the Unicode code point |point| to the string, either by
  111. // increasing the |length_| of the string if the string has not been
  112. // converted, or by appending the UTF8 bytes for the code point.
  113. void Append(uint32_t point);
  114. // Converts the builder from its default StringPiece to a full std::string,
  115. // performing a copy. Once a builder is converted, it cannot be made a
  116. // StringPiece again.
  117. void Convert();
  118. // Returns the builder as a string, invalidating all state. This allows
  119. // the internal string buffer representation to be destructively moved
  120. // in cases where the builder will not be needed any more.
  121. std::string DestructiveAsString();
  122. private:
  123. // The beginning of the input string.
  124. const char* pos_;
  125. // Number of bytes in |pos_| that make up the string being built.
  126. size_t length_;
  127. // The copied string representation. Will be unset until Convert() is
  128. // called.
  129. base::Optional<std::string> string_;
  130. };
  131. // Returns the next |count| bytes of the input stream, or nullopt if fewer
  132. // than |count| bytes remain.
  133. Optional<StringPiece> PeekChars(size_t count);
  134. // Calls PeekChars() with a |count| of 1.
  135. Optional<char> PeekChar();
  136. // Returns the next |count| bytes of the input stream, or nullopt if fewer
  137. // than |count| bytes remain, and advances the parser position by |count|.
  138. Optional<StringPiece> ConsumeChars(size_t count);
  139. // Calls ConsumeChars() with a |count| of 1.
  140. Optional<char> ConsumeChar();
  141. // Returns a pointer to the current character position.
  142. const char* pos();
  143. // Skips over whitespace and comments to find the next token in the stream.
  144. // This does not advance the parser for non-whitespace or comment chars.
  145. Token GetNextToken();
  146. // Consumes whitespace characters and comments until the next character that
  147. // is neither is encountered.
  148. void EatWhitespaceAndComments();
  149. // Helper function that consumes a comment, assuming that the parser is
  150. // currently wound to a '/'.
  151. bool EatComment();
  152. // Calls GetNextToken() and then ParseToken().
  153. Optional<Value> ParseNextToken();
  154. // Takes a token that represents the start of a Value ("a structural token"
  155. // in RFC terms) and consumes it, returning the result as a Value.
  156. Optional<Value> ParseToken(Token token);
  157. // Assuming that the parser is currently wound to '{', this parses a JSON
  158. // object into a Value.
  159. Optional<Value> ConsumeDictionary();
  160. // Assuming that the parser is wound to '[', this parses a JSON list into a
  161. // Value.
  162. Optional<Value> ConsumeList();
  163. // Calls through ConsumeStringRaw and wraps it in a value.
  164. Optional<Value> ConsumeString();
  165. // Assuming that the parser is wound to a double quote, this parses a string,
  166. // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
  167. // success and places result into |out|. Returns false on failure with
  168. // error information set.
  169. bool ConsumeStringRaw(StringBuilder* out);
  170. // Helper function for ConsumeStringRaw() that consumes the next four or 10
  171. // bytes (parser is wound to the first character of a HEX sequence, with the
  172. // potential for consuming another \uXXXX for a surrogate). Returns true on
  173. // success and places the code point in |out_code_point|, and false on failure.
  174. bool DecodeUTF16(uint32_t* out_code_point);
  175. // Assuming that the parser is wound to the start of a valid JSON number,
  176. // this parses and converts it to either an int or double value.
  177. Optional<Value> ConsumeNumber();
  178. // Helper that reads characters that are ints. Returns true if a number was
  179. // read and false on error.
  180. bool ReadInt(bool allow_leading_zeros);
  181. // Consumes the literal values of |true|, |false|, and |null|, assuming the
  182. // parser is wound to the first character of any of those.
  183. Optional<Value> ConsumeLiteral();
  184. // Helper function that returns true if the byte sequence |match| can be
  185. // consumed at the current parser position. Returns false if there are fewer
  186. // than |match|-length bytes or if the sequence does not match, and the
  187. // parser state is unchanged.
  188. bool ConsumeIfMatch(StringPiece match);
  189. // Sets the error information to |code| at the current column, based on
  190. // |index_| and |index_last_line_|, with an optional positive/negative
  191. // adjustment by |column_adjust|.
  192. void ReportError(JsonParseError code, int column_adjust);
  193. // Given the line and column number of an error, formats one of the error
  194. // message constants from json_reader.h for human display.
  195. static std::string FormatErrorMessage(int line, int column,
  196. const std::string& description);
  197. // base::JSONParserOptions that control parsing.
  198. const int options_;
  199. // Maximum depth to parse.
  200. const size_t max_depth_;
  201. // The input stream being parsed. Note: Not guaranteed to be NUL-terminated.
  202. StringPiece input_;
  203. // The index in the input stream to which the parser is wound.
  204. int index_;
  205. // The number of times the parser has recursed (current stack depth).
  206. size_t stack_depth_;
  207. // The line number that the parser is at currently.
  208. int line_number_;
  209. // The last value of |index_| on the previous line.
  210. int index_last_line_;
  211. // Error information.
      // NOTE(review): presumably written by ReportError() and read back through
      // error_code(), error_line() and error_column() - confirm in the .cc file.
  212. JsonParseError error_code_;
  213. int error_line_;
  214. int error_column_;
      // Test access: the fixture class and the individual test cases listed below
      // are befriended so they can reach the private members declared above.
  215. friend class JSONParserTest;
  216. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
  217. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
  218. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
  219. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
  220. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
  221. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
  222. FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
      // NOTE(review): macro presumably supplied by base/macros.h; going by its
      // name it makes JSONParser non-copyable and non-assignable.
  223. DISALLOW_COPY_AND_ASSIGN(JSONParser);
  224. };
  225. // Used when decoding and an invalid UTF-8 sequence is encountered.
      // Declaration only (extern); the array is defined outside this header.
  226. BASE_EXPORT extern const char kUnicodeReplacementString[];
  227. } // namespace internal
  228. } // namespace base
  229. #endif // BASE_JSON_JSON_PARSER_H_