csrmbcs.h 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2005-2012, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. */
  9. #ifndef __CSRMBCS_H
  10. #define __CSRMBCS_H
  11. #include "unicode/utypes.h"
  12. #if !UCONFIG_NO_CONVERSION
  13. #include "csrecog.h"
  14. U_NAMESPACE_BEGIN
  15. // "Character" iterated character class.
  16. // Recognizers for specific mbcs encodings make their "characters" available
  17. // by providing a nextChar() function that fills in an instance of IteratedChar
  18. // with the next char from the input.
  19. // The returned characters are not converted to Unicode, but remain as the raw
  20. // bytes (concatenated into an int) from the codepage data.
  21. //
  22. // For Asian charsets, use the raw input rather than the input that has been
  23. // stripped of markup. Detection only considers multi-byte chars, effectively
  24. // stripping markup anyway, and double byte chars do occur in markup too.
  25. //
  26. class IteratedChar : public UMemory
  27. {
  28. public:
  29. uint32_t charValue; // 1-4 bytes from the raw input data
  30. int32_t index;
  31. int32_t nextIndex;
  32. UBool error;
  33. UBool done;
  34. public:
  35. IteratedChar();
  36. //void reset();
  37. int32_t nextByte(InputText* det);
  38. };
  39. class CharsetRecog_mbcs : public CharsetRecognizer {
  40. protected:
  41. /**
  42. * Test the match of this charset with the input text data
  43. * which is obtained via the CharsetDetector object.
  44. *
  45. * @param det The CharsetDetector, which contains the input text
  46. * to be checked for being in this charset.
  47. * @return Two values packed into one int (Damn java, anyhow)
  48. * <br/>
  49. * bits 0-7: the match confidence, ranging from 0-100
  50. * <br/>
  51. * bits 8-15: The match reason, an enum-like value.
  52. */
  53. int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
  54. public:
  55. virtual ~CharsetRecog_mbcs();
  56. /**
  57. * Get the IANA name of this charset.
  58. * @return the charset name.
  59. */
  60. const char *getName() const = 0;
  61. const char *getLanguage() const = 0;
  62. UBool match(InputText* input, CharsetMatch *results) const = 0;
  63. /**
  64. * Get the next character (however many bytes it is) from the input data
  65. * Subclasses for specific charset encodings must implement this function
  66. * to get characters according to the rules of their encoding scheme.
  67. *
  68. * This function is not a method of class IteratedChar only because
  69. * that would require a lot of extra derived classes, which is awkward.
  70. * @param it The IteratedChar "struct" into which the returned char is placed.
  71. * @param det The charset detector, which is needed to get at the input byte data
  72. * being iterated over.
  73. * @return True if a character was returned, false at end of input.
  74. */
  75. virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
  76. };
  77. /**
  78. * Shift-JIS charset recognizer.
  79. *
  80. */
  81. class CharsetRecog_sjis : public CharsetRecog_mbcs {
  82. public:
  83. virtual ~CharsetRecog_sjis();
  84. UBool nextChar(IteratedChar *it, InputText *det) const;
  85. UBool match(InputText* input, CharsetMatch *results) const;
  86. const char *getName() const;
  87. const char *getLanguage() const;
  88. };
  89. /**
  90. * EUC charset recognizers. One abstract class that provides the common function
  91. * for getting the next character according to the EUC encoding scheme,
  92. * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
  93. *
  94. */
  95. class CharsetRecog_euc : public CharsetRecog_mbcs
  96. {
  97. public:
  98. virtual ~CharsetRecog_euc();
  99. const char *getName() const = 0;
  100. const char *getLanguage() const = 0;
  101. UBool match(InputText* input, CharsetMatch *results) const = 0;
  102. /*
  103. * (non-Javadoc)
  104. * Get the next character value for EUC based encodings.
  105. * Character "value" is simply the raw bytes that make up the character
  106. * packed into an int.
  107. */
  108. UBool nextChar(IteratedChar *it, InputText *det) const;
  109. };
  110. /**
  111. * The charset recognize for EUC-JP. A singleton instance of this class
  112. * is created and kept by the public CharsetDetector class
  113. */
  114. class CharsetRecog_euc_jp : public CharsetRecog_euc
  115. {
  116. public:
  117. virtual ~CharsetRecog_euc_jp();
  118. const char *getName() const;
  119. const char *getLanguage() const;
  120. UBool match(InputText* input, CharsetMatch *results) const;
  121. };
  122. /**
  123. * The charset recognize for EUC-KR. A singleton instance of this class
  124. * is created and kept by the public CharsetDetector class
  125. */
  126. class CharsetRecog_euc_kr : public CharsetRecog_euc
  127. {
  128. public:
  129. virtual ~CharsetRecog_euc_kr();
  130. const char *getName() const;
  131. const char *getLanguage() const;
  132. UBool match(InputText* input, CharsetMatch *results) const;
  133. };
  134. /**
  135. *
  136. * Big5 charset recognizer.
  137. *
  138. */
  139. class CharsetRecog_big5 : public CharsetRecog_mbcs
  140. {
  141. public:
  142. virtual ~CharsetRecog_big5();
  143. UBool nextChar(IteratedChar* it, InputText* det) const;
  144. const char *getName() const;
  145. const char *getLanguage() const;
  146. UBool match(InputText* input, CharsetMatch *results) const;
  147. };
  148. /**
  149. *
  150. * GB-18030 recognizer. Uses simplified Chinese statistics.
  151. *
  152. */
  153. class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
  154. {
  155. public:
  156. virtual ~CharsetRecog_gb_18030();
  157. UBool nextChar(IteratedChar* it, InputText* det) const;
  158. const char *getName() const;
  159. const char *getLanguage() const;
  160. UBool match(InputText* input, CharsetMatch *results) const;
  161. };
  162. U_NAMESPACE_END
  163. #endif
  164. #endif /* __CSRMBCS_H */