collationrootelements.h 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationrootelements.h
  9. *
  10. * created on: 2013mar01
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __COLLATIONROOTELEMENTS_H__
  14. #define __COLLATIONROOTELEMENTS_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/uobject.h"
  18. #include "collation.h"
  19. U_NAMESPACE_BEGIN
  20. /**
  21. * Container and access methods for collation elements and weights
  22. * that occur in the root collator.
  23. * Needed for finding boundaries for building a tailoring.
  24. *
  25. * This class takes and returns 16-bit secondary and tertiary weights.
  26. */
  27. class U_I18N_API CollationRootElements : public UMemory {
  28. public:
  29. CollationRootElements(const uint32_t *rootElements, int32_t rootElementsLength)
  30. : elements(rootElements), length(rootElementsLength) {}
  31. /**
  32. * Higher than any root primary.
  33. */
  34. static const uint32_t PRIMARY_SENTINEL = 0xffffff00;
  35. /**
  36. * Flag in a root element, set if the element contains secondary & tertiary weights,
  37. * rather than a primary.
  38. */
  39. static const uint32_t SEC_TER_DELTA_FLAG = 0x80;
  40. /**
  41. * Mask for getting the primary range step value from a primary-range-end element.
  42. */
  43. static const uint8_t PRIMARY_STEP_MASK = 0x7f;
  44. enum {
  45. /**
  46. * Index of the first CE with a non-zero tertiary weight.
  47. * Same as the start of the compact root elements table.
  48. */
  49. IX_FIRST_TERTIARY_INDEX,
  50. /**
  51. * Index of the first CE with a non-zero secondary weight.
  52. */
  53. IX_FIRST_SECONDARY_INDEX,
  54. /**
  55. * Index of the first CE with a non-zero primary weight.
  56. */
  57. IX_FIRST_PRIMARY_INDEX,
  58. /**
  59. * Must match Collation::COMMON_SEC_AND_TER_CE.
  60. */
  61. IX_COMMON_SEC_AND_TER_CE,
  62. /**
  63. * Secondary & tertiary boundaries.
  64. * Bits 31..24: [fixed last secondary common byte 45]
  65. * Bits 23..16: [fixed first ignorable secondary byte 80]
  66. * Bits 15.. 8: reserved, 0
  67. * Bits 7.. 0: [fixed first ignorable tertiary byte 3C]
  68. */
  69. IX_SEC_TER_BOUNDARIES,
  70. /**
  71. * The current number of indexes.
  72. * Currently the same as elements[IX_FIRST_TERTIARY_INDEX].
  73. */
  74. IX_COUNT
  75. };
  76. /**
  77. * Returns the boundary between tertiary weights of primary/secondary CEs
  78. * and those of tertiary CEs.
  79. * This is the upper limit for tertiaries of primary/secondary CEs.
  80. * This minus one is the lower limit for tertiaries of tertiary CEs.
  81. */
  82. uint32_t getTertiaryBoundary() const {
  83. return (elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
  84. }
  85. /**
  86. * Returns the first assigned tertiary CE.
  87. */
  88. uint32_t getFirstTertiaryCE() const {
  89. return elements[elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
  90. }
  91. /**
  92. * Returns the last assigned tertiary CE.
  93. */
  94. uint32_t getLastTertiaryCE() const {
  95. return elements[elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
  96. }
  97. /**
  98. * Returns the last common secondary weight.
  99. * This is the lower limit for secondaries of primary CEs.
  100. */
  101. uint32_t getLastCommonSecondary() const {
  102. return (elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
  103. }
  104. /**
  105. * Returns the boundary between secondary weights of primary CEs
  106. * and those of secondary CEs.
  107. * This is the upper limit for secondaries of primary CEs.
  108. * This minus one is the lower limit for secondaries of secondary CEs.
  109. */
  110. uint32_t getSecondaryBoundary() const {
  111. return (elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
  112. }
  113. /**
  114. * Returns the first assigned secondary CE.
  115. */
  116. uint32_t getFirstSecondaryCE() const {
  117. return elements[elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
  118. }
  119. /**
  120. * Returns the last assigned secondary CE.
  121. */
  122. uint32_t getLastSecondaryCE() const {
  123. return elements[elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
  124. }
  125. /**
  126. * Returns the first assigned primary weight.
  127. */
  128. uint32_t getFirstPrimary() const {
  129. return elements[elements[IX_FIRST_PRIMARY_INDEX]]; // step=0: cannot be a range end
  130. }
  131. /**
  132. * Returns the first assigned primary CE.
  133. */
  134. int64_t getFirstPrimaryCE() const {
  135. return Collation::makeCE(getFirstPrimary());
  136. }
  137. /**
  138. * Returns the last root CE with a primary weight before p.
  139. * Intended only for reordering group boundaries.
  140. */
  141. int64_t lastCEWithPrimaryBefore(uint32_t p) const;
  142. /**
  143. * Returns the first root CE with a primary weight of at least p.
  144. * Intended only for reordering group boundaries.
  145. */
  146. int64_t firstCEWithPrimaryAtLeast(uint32_t p) const;
  147. /**
  148. * Returns the primary weight before p.
  149. * p must be greater than the first root primary.
  150. */
  151. uint32_t getPrimaryBefore(uint32_t p, UBool isCompressible) const;
  152. /** Returns the secondary weight before [p, s]. */
  153. uint32_t getSecondaryBefore(uint32_t p, uint32_t s) const;
  154. /** Returns the tertiary weight before [p, s, t]. */
  155. uint32_t getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const;
  156. /**
  157. * Finds the index of the input primary.
  158. * p must occur as a root primary, and must not be 0.
  159. */
  160. int32_t findPrimary(uint32_t p) const;
  161. /**
  162. * Returns the primary weight after p where index=findPrimary(p).
  163. * p must be at least the first root primary.
  164. */
  165. uint32_t getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const;
  166. /**
  167. * Returns the secondary weight after [p, s] where index=findPrimary(p)
  168. * except use index=0 for p=0.
  169. *
  170. * Must return a weight for every root [p, s] as well as for every weight
  171. * returned by getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
  172. *
  173. * Exception: [0, 0] is handled by the CollationBuilder:
  174. * Both its lower and upper boundaries are special.
  175. */
  176. uint32_t getSecondaryAfter(int32_t index, uint32_t s) const;
  177. /**
  178. * Returns the tertiary weight after [p, s, t] where index=findPrimary(p)
  179. * except use index=0 for p=0.
  180. *
  181. * Must return a weight for every root [p, s, t] as well as for every weight
  182. * returned by getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
  183. *
  184. * Exception: [0, 0, 0] is handled by the CollationBuilder:
  185. * Both its lower and upper boundaries are special.
  186. */
  187. uint32_t getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const;
  188. private:
  189. /**
  190. * Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1.
  191. */
  192. uint32_t getFirstSecTerForPrimary(int32_t index) const;
  193. /**
  194. * Finds the largest index i where elements[i]<=p.
  195. * Requires first primary<=p<0xffffff00 (PRIMARY_SENTINEL).
  196. * Does not require that p is a root collator primary.
  197. */
  198. int32_t findP(uint32_t p) const;
  199. static inline UBool isEndOfPrimaryRange(uint32_t q) {
  200. return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
  201. }
  202. /**
  203. * Data structure:
  204. *
  205. * The first few entries are indexes, up to elements[IX_FIRST_TERTIARY_INDEX].
  206. * See the comments on the IX_ constants.
  207. *
  208. * All other elements are a compact form of the root collator CEs
  209. * in mostly collation order.
  210. *
  211. * A sequence of one or more root CEs with the same primary weight is stored as
  212. * one element with the primary weight, with the SEC_TER_DELTA_FLAG flag not set,
  213. * followed by elements with only the secondary/tertiary weights,
  214. * each with that flag set.
  215. * If the lowest secondary/tertiary combination is Collation::COMMON_SEC_AND_TER_CE,
  216. * then the element for that combination is omitted.
  217. *
  218. * Note: If the first actual secondary/tertiary combination is higher than
  219. * Collation::COMMON_SEC_AND_TER_CE (which is unusual),
  220. * the runtime code will assume anyway that Collation::COMMON_SEC_AND_TER_CE is present.
  221. *
  222. * A range of only-primary CEs with a consistent "step" increment
  223. * from each primary to the next may be stored as a range.
  224. * Only the first and last primary are stored, and the last has the step
  225. * value in the low bits (PRIMARY_STEP_MASK).
  226. *
  227. * An range-end element may also either start a new range or be followed by
  228. * elements with secondary/tertiary deltas.
  229. *
  230. * A primary element that is not a range end has zero step bits.
  231. *
  232. * There is no element for the completely ignorable CE (all weights 0).
  233. *
  234. * Before elements[IX_FIRST_PRIMARY_INDEX], all elements are secondary/tertiary deltas,
  235. * for all of the ignorable root CEs.
  236. *
  237. * There are no elements for unassigned-implicit primary CEs.
  238. * All primaries stored here are at most 3 bytes long.
  239. */
  240. const uint32_t *elements;
  241. int32_t length;
  242. };
  243. U_NAMESPACE_END
  244. #endif // !UCONFIG_NO_COLLATION
  245. #endif // __COLLATIONROOTELEMENTS_H__