// usrchimp.h
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2001-2015 IBM and others. All rights reserved.
  6. **********************************************************************
  7. * Date Name Description
  8. * 08/13/2001 synwee Creation.
  9. **********************************************************************
  10. */
  11. #ifndef USRCHIMP_H
  12. #define USRCHIMP_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_COLLATION
  15. #include "unicode/normalizer2.h"
  16. #include "unicode/ucol.h"
  17. #include "unicode/ucoleitr.h"
  18. #include "unicode/ubrk.h"
  /*
   * Field layout of a 32-bit collation element (CE), as defined by the masks
   * and shifts below: bits 31..16 hold the primary weight, bits 15..8 the
   * secondary weight and bits 7..0 the tertiary weight.
   */

  /* mask off anything but primary order */
  #define UCOL_PRIMARYORDERMASK 0xffff0000
  /* mask off anything but secondary order */
  #define UCOL_SECONDARYORDERMASK 0x0000ff00
  /* mask off anything but tertiary order */
  #define UCOL_TERTIARYORDERMASK 0x000000ff
  /* primary order shift */
  #define UCOL_PRIMARYORDERSHIFT 16
  /* secondary order shift */
  #define UCOL_SECONDARYORDERSHIFT 8

  /* CE value in which every weight field is zero */
  #define UCOL_IGNORABLE 0

  /*
   * Get weights from a CE.
   * The primary weight is masked AFTER shifting so that a signed (negative)
   * order whose shift sign-extends still yields a plain 16-bit value.
   */
  #define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
  #define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
  #define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

  /* bits that, when BOTH are set in a CE, mark it as a continuation CE */
  #define UCOL_CONTINUATION_MARKER 0xC0
  /* nonzero when every bit of UCOL_CONTINUATION_MARKER is set in CE */
  #define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

  /**
   * This indicates an error has occurred during processing or there are no more CEs
   * to be returned.
   */
  #define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
  41. U_NAMESPACE_BEGIN
  42. class CollationElementIterator;
  43. class Collator;
  /**
   * One buffered entry: a 64-bit collation element value plus a pair of
   * indices. Entries are stored through PCEBuffer::put(ce, ixLow, ixHigh).
   */
  struct PCEI
  {
      uint64_t ce;   /* 64-bit collation element value */
      int32_t low;   /* presumably the ixLow index passed to PCEBuffer::put - confirm in usearch.cpp */
      int32_t high;  /* presumably the ixHigh index passed to PCEBuffer::put - confirm in usearch.cpp */
  };
  50. struct PCEBuffer
  51. {
  52. PCEI defaultBuffer[16];
  53. PCEI *buffer;
  54. int32_t bufferIndex;
  55. int32_t bufferSize;
  56. PCEBuffer();
  57. ~PCEBuffer();
  58. void reset();
  59. UBool isEmpty() const;
  60. void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
  61. const PCEI *get();
  62. };
  63. class UCollationPCE : public UMemory {
  64. private:
  65. PCEBuffer pceBuffer;
  66. CollationElementIterator *cei;
  67. UCollationStrength strength;
  68. UBool toShift;
  69. UBool isShifted;
  70. uint32_t variableTop;
  71. public:
  72. UCollationPCE(UCollationElements *elems);
  73. UCollationPCE(CollationElementIterator *iter);
  74. ~UCollationPCE();
  75. void init(UCollationElements *elems);
  76. void init(CollationElementIterator *iter);
  77. /**
  78. * Get the processed ordering priority of the next collation element in the text.
  79. * A single character may contain more than one collation element.
  80. *
  81. * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
  82. * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
  83. * @param status A pointer to an UErrorCode to receive any errors.
  84. * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
  85. * if an error has occured or if the end of string has been reached
  86. */
  87. int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
  88. /**
  89. * Get the processed ordering priority of the previous collation element in the text.
  90. * A single character may contain more than one collation element.
  91. *
  92. * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
  93. * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
  94. * @param status A pointer to an UErrorCode to receive any errors. Noteably
  95. * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
  96. * buffer has been exhausted.
  97. * @return The previous collation elements ordering, otherwise returns
  98. * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
  99. * string has been reached.
  100. */
  101. int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
  102. private:
  103. void init(const Collator &coll);
  104. uint64_t processCE(uint32_t ce);
  105. };
  106. U_NAMESPACE_END
  /* capacity, in elements, of the fixed-size buffers embedded in UPattern and UStringSearch */
  #define INITIAL_ARRAY_SIZE_ 256
  /* number of entries in the UPattern shift and backShift tables */
  #define MAX_TABLE_SIZE_ 257
  109. struct USearch {
  110. // required since collation element iterator does not have a getText API
  111. const UChar *text;
  112. int32_t textLength; // exact length
  113. UBool isOverlap;
  114. UBool isCanonicalMatch;
  115. int16_t elementComparisonType;
  116. UBreakIterator *internalBreakIter; //internal character breakiterator
  117. UBreakIterator *breakIter;
  118. // value USEARCH_DONE is the default value
  119. // if we are not at the start of the text or the end of the text,
  120. // depending on the iteration direction and matchedIndex is USEARCH_DONE
  121. // it means that we can't find any more matches in that particular direction
  122. int32_t matchedIndex;
  123. int32_t matchedLength;
  124. UBool isForwardSearching;
  125. UBool reset;
  126. };
  127. struct UPattern {
  128. const UChar *text;
  129. int32_t textLength; // exact length
  130. // length required for backwards ce comparison
  131. int32_t cesLength;
  132. int32_t *ces;
  133. int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
  134. int32_t pcesLength;
  135. int64_t *pces;
  136. int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
  137. UBool hasPrefixAccents;
  138. UBool hasSuffixAccents;
  139. int16_t defaultShiftSize;
  140. int16_t shift[MAX_TABLE_SIZE_];
  141. int16_t backShift[MAX_TABLE_SIZE_];
  142. };
  143. struct UStringSearch {
  144. struct USearch *search;
  145. struct UPattern pattern;
  146. const UCollator *collator;
  147. const icu::Normalizer2 *nfd;
  148. // positions within the collation element iterator is used to determine
  149. // if we are at the start of the text.
  150. UCollationElements *textIter;
  151. icu::UCollationPCE *textProcessedIter;
  152. // utility collation element, used throughout program for temporary
  153. // iteration.
  154. UCollationElements *utilIter;
  155. UBool ownCollator;
  156. UCollationStrength strength;
  157. uint32_t ceMask;
  158. uint32_t variableTop;
  159. UBool toShift;
  160. UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
  161. UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
  162. };
  163. /**
  164. * Exact matches without checking for the ends for extra accents.
  165. * The match after the position within the collation element iterator is to be
  166. * found.
  167. * After a match is found the offset in the collation element iterator will be
  168. * shifted to the start of the match.
  169. * Implementation note:
  170. * For tertiary we can't use the collator->tertiaryMask, that is a
  171. * preprocessed mask that takes into account case options. since we are only
  172. * concerned with exact matches, we don't need that.
  173. * Alternate handling - since only the 16 most significant digits is only used,
  174. * we can safely do a compare without masking if the ce is a variable, we mask
  175. * and get only the primary values no shifting to quartenary is required since
  176. * all primary values less than variabletop will need to be masked off anyway.
  177. * If the end character is composite and the pattern ce does not match the text
  178. * ce, we skip it until we find a match in the end composite character or when
  179. * it has passed the character. This is so that we can match pattern "a" with
  180. * the text "\u00e6"
  181. * @param strsrch string search data
  182. * @param status error status if any
  183. * @return TRUE if an exact match is found, FALSE otherwise
  184. */
  185. U_CFUNC
  186. UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
  187. /**
  188. * Canonical matches.
  189. * According to the definition, matches found here will include the whole span
  190. * of beginning and ending accents if it overlaps that region.
  191. * @param strsrch string search data
  192. * @param status error status if any
  193. * @return TRUE if a canonical match is found, FALSE otherwise
  194. */
  195. U_CFUNC
  196. UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
  197. /**
  198. * Gets the previous match.
  199. * Comments follows from handleNextExact
  200. * @param strsrch string search data
  201. * @param status error status if any
  202. * @return True if a exact math is found, FALSE otherwise.
  203. */
  204. U_CFUNC
  205. UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
  206. /**
  207. * Canonical matches.
  208. * According to the definition, matches found here will include the whole span
  209. * of beginning and ending accents if it overlaps that region.
  210. * @param strsrch string search data
  211. * @param status error status if any
  212. * @return TRUE if a canonical match is found, FALSE otherwise
  213. */
  214. U_CFUNC
  215. UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
  216. UErrorCode *status);
  217. #endif /* #if !UCONFIG_NO_COLLATION */
  218. #endif