csrsbcs.h 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2005-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. */
  9. #ifndef __CSRSBCS_H
  10. #define __CSRSBCS_H
  11. #include "unicode/uobject.h"
  12. #if !UCONFIG_NO_CONVERSION
  13. #include "csrecog.h"
  14. U_NAMESPACE_BEGIN
  15. class NGramParser : public UMemory
  16. {
  17. private:
  18. int32_t ngram;
  19. const int32_t *ngramList;
  20. int32_t ngramCount;
  21. int32_t hitCount;
  22. protected:
  23. int32_t byteIndex;
  24. const uint8_t *charMap;
  25. void addByte(int32_t b);
  26. public:
  27. NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
  28. virtual ~NGramParser();
  29. private:
  30. /*
  31. * Binary search for value in table, which must have exactly 64 entries.
  32. */
  33. int32_t search(const int32_t *table, int32_t value);
  34. void lookup(int32_t thisNgram);
  35. virtual int32_t nextByte(InputText *det);
  36. virtual void parseCharacters(InputText *det);
  37. public:
  38. int32_t parse(InputText *det);
  39. };
  40. #if !UCONFIG_ONLY_HTML_CONVERSION
  41. class NGramParser_IBM420 : public NGramParser
  42. {
  43. public:
  44. NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
  45. ~NGramParser_IBM420();
  46. private:
  47. int32_t alef;
  48. int32_t isLamAlef(int32_t b);
  49. int32_t nextByte(InputText *det);
  50. void parseCharacters(InputText *det);
  51. };
  52. #endif
  53. class CharsetRecog_sbcs : public CharsetRecognizer
  54. {
  55. public:
  56. CharsetRecog_sbcs();
  57. virtual ~CharsetRecog_sbcs();
  58. virtual const char *getName() const = 0;
  59. virtual UBool match(InputText *det, CharsetMatch *results) const = 0;
  60. virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
  61. };
  62. class CharsetRecog_8859_1 : public CharsetRecog_sbcs
  63. {
  64. public:
  65. virtual ~CharsetRecog_8859_1();
  66. const char *getName() const;
  67. virtual UBool match(InputText *det, CharsetMatch *results) const;
  68. };
  69. class CharsetRecog_8859_2 : public CharsetRecog_sbcs
  70. {
  71. public:
  72. virtual ~CharsetRecog_8859_2();
  73. const char *getName() const;
  74. virtual UBool match(InputText *det, CharsetMatch *results) const;
  75. };
  76. class CharsetRecog_8859_5 : public CharsetRecog_sbcs
  77. {
  78. public:
  79. virtual ~CharsetRecog_8859_5();
  80. const char *getName() const;
  81. };
  82. class CharsetRecog_8859_6 : public CharsetRecog_sbcs
  83. {
  84. public:
  85. virtual ~CharsetRecog_8859_6();
  86. const char *getName() const;
  87. };
  88. class CharsetRecog_8859_7 : public CharsetRecog_sbcs
  89. {
  90. public:
  91. virtual ~CharsetRecog_8859_7();
  92. const char *getName() const;
  93. };
  94. class CharsetRecog_8859_8 : public CharsetRecog_sbcs
  95. {
  96. public:
  97. virtual ~CharsetRecog_8859_8();
  98. virtual const char *getName() const;
  99. };
  100. class CharsetRecog_8859_9 : public CharsetRecog_sbcs
  101. {
  102. public:
  103. virtual ~CharsetRecog_8859_9();
  104. const char *getName() const;
  105. };
  106. class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
  107. {
  108. public:
  109. virtual ~CharsetRecog_8859_5_ru();
  110. const char *getLanguage() const;
  111. virtual UBool match(InputText *det, CharsetMatch *results) const;
  112. };
  113. class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
  114. {
  115. public:
  116. virtual ~CharsetRecog_8859_6_ar();
  117. const char *getLanguage() const;
  118. virtual UBool match(InputText *det, CharsetMatch *results) const;
  119. };
  120. class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
  121. {
  122. public:
  123. virtual ~CharsetRecog_8859_7_el();
  124. const char *getLanguage() const;
  125. virtual UBool match(InputText *det, CharsetMatch *results) const;
  126. };
  127. class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
  128. {
  129. public:
  130. virtual ~CharsetRecog_8859_8_I_he();
  131. const char *getName() const;
  132. const char *getLanguage() const;
  133. virtual UBool match(InputText *det, CharsetMatch *results) const;
  134. };
  135. class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
  136. {
  137. public:
  138. virtual ~CharsetRecog_8859_8_he ();
  139. const char *getLanguage() const;
  140. virtual UBool match(InputText *det, CharsetMatch *results) const;
  141. };
  142. class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
  143. {
  144. public:
  145. virtual ~CharsetRecog_8859_9_tr ();
  146. const char *getLanguage() const;
  147. virtual UBool match(InputText *det, CharsetMatch *results) const;
  148. };
  149. class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
  150. {
  151. public:
  152. virtual ~CharsetRecog_windows_1256();
  153. const char *getName() const;
  154. const char *getLanguage() const;
  155. virtual UBool match(InputText *det, CharsetMatch *results) const;
  156. };
  157. class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
  158. {
  159. public:
  160. virtual ~CharsetRecog_windows_1251();
  161. const char *getName() const;
  162. const char *getLanguage() const;
  163. virtual UBool match(InputText *det, CharsetMatch *results) const;
  164. };
  165. class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
  166. {
  167. public:
  168. virtual ~CharsetRecog_KOI8_R();
  169. const char *getName() const;
  170. const char *getLanguage() const;
  171. virtual UBool match(InputText *det, CharsetMatch *results) const;
  172. };
  173. #if !UCONFIG_ONLY_HTML_CONVERSION
  174. class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
  175. {
  176. public:
  177. virtual ~CharsetRecog_IBM424_he();
  178. const char *getLanguage() const;
  179. };
  180. class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
  181. public:
  182. virtual ~CharsetRecog_IBM424_he_rtl();
  183. const char *getName() const;
  184. virtual UBool match(InputText *det, CharsetMatch *results) const;
  185. };
  186. class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
  187. virtual ~CharsetRecog_IBM424_he_ltr();
  188. const char *getName() const;
  189. virtual UBool match(InputText *det, CharsetMatch *results) const;
  190. };
  191. class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
  192. {
  193. public:
  194. virtual ~CharsetRecog_IBM420_ar();
  195. const char *getLanguage() const;
  196. int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
  197. };
  198. class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
  199. public:
  200. virtual ~CharsetRecog_IBM420_ar_rtl();
  201. const char *getName() const;
  202. virtual UBool match(InputText *det, CharsetMatch *results) const;
  203. };
  204. class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
  205. virtual ~CharsetRecog_IBM420_ar_ltr();
  206. const char *getName() const;
  207. virtual UBool match(InputText *det, CharsetMatch *results) const;
  208. };
  209. #endif
  210. U_NAMESPACE_END
  211. #endif /* !UCONFIG_NO_CONVERSION */
  212. #endif /* __CSRSBCS_H */