streaming_utf8_validator.h 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. // Copyright 2014 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. // A streaming validator for UTF-8. Validation is based on the definition in
  5. // RFC-3629. In particular, it does not reject the invalid characters rejected
  6. // by base::IsStringUTF8().
  7. //
  8. // The implementation detects errors on the first possible byte.
  9. #ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
  10. #define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_
  11. #include <stddef.h>
  12. #include <stdint.h>
  13. #include <string>
  14. #include "base/i18n/base_i18n_export.h"
  15. #include "base/macros.h"
  16. namespace base {
  17. class BASE_I18N_EXPORT StreamingUtf8Validator {
  18. public:
  19. // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it
  20. // processes characters it alternates between VALID_ENDPOINT and
  21. // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the
  22. // state changes permanently to INVALID.
  23. enum State {
  24. VALID_ENDPOINT,
  25. VALID_MIDPOINT,
  26. INVALID
  27. };
  28. StreamingUtf8Validator() : state_(0u) {}
  29. // Trivial destructor intentionally omitted.
  30. // Validate |size| bytes starting at |data|. If the concatenation of all calls
  31. // to AddBytes() since this object was constructed or reset is a valid UTF-8
  32. // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8
  33. // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was
  34. // present, returns INVALID.
  35. State AddBytes(const char* data, size_t size);
  36. // Return the object to a freshly-constructed state so that it can be re-used.
  37. void Reset();
  38. // Validate a complete string using the same criteria. Returns true if the
  39. // string only contains complete, valid UTF-8 codepoints.
  40. static bool Validate(const std::string& string);
  41. private:
  42. // The current state of the validator. Value 0 is the initial/valid state.
  43. // The state is stored as an offset into |kUtf8ValidatorTables|. The special
  44. // state |kUtf8InvalidState| is invalid.
  45. uint8_t state_;
  46. // This type could be made copyable but there is currently no use-case for
  47. // it.
  48. DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator);
  49. };
  50. } // namespace base
  51. #endif // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_