collationsettings.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationsettings.h
  9. *
  10. * created on: 2013feb07
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __COLLATIONSETTINGS_H__
  14. #define __COLLATIONSETTINGS_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/ucol.h"
  18. #include "collation.h"
  19. #include "sharedobject.h"
  20. #include "umutex.h"
  21. U_NAMESPACE_BEGIN
  22. struct CollationData;
  23. /**
  24. * Collation settings/options/attributes.
  25. * These are the values that can be changed via API.
  26. */
  27. struct U_I18N_API CollationSettings : public SharedObject {
  28. /**
  29. * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
  30. */
  31. static const int32_t CHECK_FCD = 1;
  32. /**
  33. * Options bit 1: Numeric collation.
  34. * Also known as CODAN = COllate Digits As Numbers.
  35. *
  36. * Treat digit sequences as numbers with CE sequences in numeric order,
  37. * rather than returning a normal CE for each digit.
  38. */
  39. static const int32_t NUMERIC = 2;
  40. /**
  41. * "Shifted" alternate handling, see ALTERNATE_MASK.
  42. */
  43. static const int32_t SHIFTED = 4;
  44. /**
  45. * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
  46. * Reserve values 8 and 0xc for shift-trimmed and blanked.
  47. */
  48. static const int32_t ALTERNATE_MASK = 0xc;
  49. /**
  50. * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
  51. */
  52. static const int32_t MAX_VARIABLE_SHIFT = 4;
  53. /** maxVariable options bit mask before shifting. */
  54. static const int32_t MAX_VARIABLE_MASK = 0x70;
  55. /** Options bit 7: Reserved/unused/0. */
  56. /**
  57. * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
  58. */
  59. static const int32_t UPPER_FIRST = 0x100;
  60. /**
  61. * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
  62. * unless case level is on (when they are *moved* into the separate case level).
  63. * By default, the case bits are removed from the tertiary weight (ignored).
  64. *
  65. * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
  66. * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
  67. */
  68. static const int32_t CASE_FIRST = 0x200;
  69. /**
  70. * Options bit mask for caseFirst and upperFirst, before shifting.
  71. * Same value as caseFirst==upperFirst.
  72. */
  73. static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
  74. /**
  75. * Options bit 10: Insert the case level between the secondary and tertiary levels.
  76. */
  77. static const int32_t CASE_LEVEL = 0x400;
  78. /**
  79. * Options bit 11: Compare secondary weights backwards. ("French secondary")
  80. */
  81. static const int32_t BACKWARD_SECONDARY = 0x800;
  82. /**
  83. * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
  84. * It is the top used bit field in the options. (No need to mask after shifting.)
  85. */
  86. static const int32_t STRENGTH_SHIFT = 12;
  87. /** Strength options bit mask before shifting. */
  88. static const int32_t STRENGTH_MASK = 0xf000;
  89. /** maxVariable values */
  90. enum MaxVariable {
  91. MAX_VAR_SPACE,
  92. MAX_VAR_PUNCT,
  93. MAX_VAR_SYMBOL,
  94. MAX_VAR_CURRENCY
  95. };
  96. CollationSettings()
  97. : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
  98. (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
  99. variableTop(0),
  100. reorderTable(NULL),
  101. minHighNoReorder(0),
  102. reorderRanges(NULL), reorderRangesLength(0),
  103. reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
  104. fastLatinOptions(-1) {}
  105. CollationSettings(const CollationSettings &other);
  106. virtual ~CollationSettings();
  107. UBool operator==(const CollationSettings &other) const;
  108. inline UBool operator!=(const CollationSettings &other) const {
  109. return !operator==(other);
  110. }
  111. int32_t hashCode() const;
  112. void resetReordering();
  113. void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
  114. const uint32_t *ranges, int32_t rangesLength,
  115. const uint8_t *table, UErrorCode &errorCode);
  116. void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
  117. UErrorCode &errorCode);
  118. void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
  119. inline UBool hasReordering() const { return reorderTable != NULL; }
  120. static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
  121. inline uint32_t reorder(uint32_t p) const {
  122. uint8_t b = reorderTable[p >> 24];
  123. if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
  124. return ((uint32_t)b << 24) | (p & 0xffffff);
  125. } else {
  126. return reorderEx(p);
  127. }
  128. }
  129. void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
  130. static int32_t getStrength(int32_t options) {
  131. return options >> STRENGTH_SHIFT;
  132. }
  133. int32_t getStrength() const {
  134. return getStrength(options);
  135. }
  136. /** Sets the options bit for an on/off attribute. */
  137. void setFlag(int32_t bit, UColAttributeValue value,
  138. int32_t defaultOptions, UErrorCode &errorCode);
  139. UColAttributeValue getFlag(int32_t bit) const {
  140. return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
  141. }
  142. void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
  143. UColAttributeValue getCaseFirst() const {
  144. int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
  145. return (option == 0) ? UCOL_OFF :
  146. (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
  147. }
  148. void setAlternateHandling(UColAttributeValue value,
  149. int32_t defaultOptions, UErrorCode &errorCode);
  150. UColAttributeValue getAlternateHandling() const {
  151. return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
  152. }
  153. void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
  154. MaxVariable getMaxVariable() const {
  155. return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
  156. }
  157. /**
  158. * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
  159. */
  160. static inline UBool isTertiaryWithCaseBits(int32_t options) {
  161. return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
  162. }
  163. static uint32_t getTertiaryMask(int32_t options) {
  164. // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
  165. return isTertiaryWithCaseBits(options) ?
  166. Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
  167. }
  168. static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
  169. // On tertiary level, consider case bits and sort uppercase first
  170. // if caseLevel is off and caseFirst==upperFirst.
  171. return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
  172. }
  173. inline UBool dontCheckFCD() const {
  174. return (options & CHECK_FCD) == 0;
  175. }
  176. inline UBool hasBackwardSecondary() const {
  177. return (options & BACKWARD_SECONDARY) != 0;
  178. }
  179. inline UBool isNumeric() const {
  180. return (options & NUMERIC) != 0;
  181. }
  182. /** CHECK_FCD etc. */
  183. int32_t options;
  184. /** Variable-top primary weight. */
  185. uint32_t variableTop;
  186. /**
  187. * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
  188. * A 0 entry at a non-zero index means that the primary lead byte is "split"
  189. * (there are different offsets for primaries that share that lead byte)
  190. * and the reordering offset must be determined via the reorderRanges.
  191. */
  192. const uint8_t *reorderTable;
  193. /** Limit of last reordered range. 0 if no reordering or no split bytes. */
  194. uint32_t minHighNoReorder;
  195. /**
  196. * Primary-weight ranges for script reordering,
  197. * to be used by reorder(p) for split-reordered primary lead bytes.
  198. *
  199. * Each entry is a (limit, offset) pair.
  200. * The upper 16 bits of the entry are the upper 16 bits of the
  201. * exclusive primary limit of a range.
  202. * Primaries between the previous limit and this one have their lead bytes
  203. * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
  204. *
  205. * CollationData::makeReorderRanges() writes a full list where the first range
  206. * (at least for terminators and separators) has a 0 offset.
  207. * The last range has a non-zero offset.
  208. * minHighNoReorder is set to the limit of that last range.
  209. *
  210. * In the settings object, the initial ranges before the first split lead byte
  211. * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
  212. * If there are no split-reordered lead bytes, then no ranges are needed.
  213. */
  214. const uint32_t *reorderRanges;
  215. int32_t reorderRangesLength;
  216. /** Array of reorder codes; ignored if reorderCodesLength == 0. */
  217. const int32_t *reorderCodes;
  218. /** Number of reorder codes; 0 if no reordering. */
  219. int32_t reorderCodesLength;
  220. /**
  221. * Capacity of reorderCodes.
  222. * If 0, then the codes, the ranges, and the table are aliases.
  223. * Otherwise, this object owns the memory via the reorderCodes pointer;
  224. * the codes, the ranges, and the table are in the same memory block, in that order.
  225. */
  226. int32_t reorderCodesCapacity;
  227. /** Options for CollationFastLatin. Negative if disabled. */
  228. int32_t fastLatinOptions;
  229. uint16_t fastLatinPrimaries[0x180];
  230. private:
  231. void setReorderArrays(const int32_t *codes, int32_t codesLength,
  232. const uint32_t *ranges, int32_t rangesLength,
  233. const uint8_t *table, UErrorCode &errorCode);
  234. uint32_t reorderEx(uint32_t p) const;
  235. };
  236. U_NAMESPACE_END
  237. #endif // !UCONFIG_NO_COLLATION
  238. #endif // __COLLATIONSETTINGS_H__