collationdatareader.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationdatareader.h
  9. *
  10. * created on: 2013feb07
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __COLLATIONDATAREADER_H__
  14. #define __COLLATIONDATAREADER_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/udata.h"
  18. struct UDataMemory;
  19. U_NAMESPACE_BEGIN
  20. struct CollationTailoring;
  21. /**
  22. * Collation binary data reader.
  23. */
  24. struct U_I18N_API CollationDataReader /* all static */ {
  25. // The following constants are also copied into source/common/ucol_swp.cpp.
  26. // Keep them in sync!
  27. enum {
  28. /**
  29. * Number of int32_t indexes.
  30. *
  31. * Can be 2 if there are only options.
  32. * Can be 7 or 8 if there are only options and a script reordering.
  33. * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0.
  34. */
  35. IX_INDEXES_LENGTH, // 0
  36. /**
  37. * Bits 31..24: numericPrimary, for numeric collation
  38. * 23..16: fast Latin format version (0 = no fast Latin table)
  39. * 15.. 0: options bit set
  40. */
  41. IX_OPTIONS,
  42. IX_RESERVED2,
  43. IX_RESERVED3,
  44. /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */
  45. IX_JAMO_CE32S_START, // 4
  46. // Byte offsets from the start of the data, after the generic header.
  47. // The indexes[] are at byte offset 0, other data follows.
  48. // Each data item is aligned properly.
  49. // The data items should be in descending order of unit size,
  50. // to minimize the need for padding.
  51. // Each item's byte length is given by the difference between its offset and
  52. // the next index/offset value.
  53. /** Byte offset to int32_t reorderCodes[]. */
  54. IX_REORDER_CODES_OFFSET,
  55. /**
  56. * Byte offset to uint8_t reorderTable[].
  57. * Empty table if <256 bytes (padding only).
  58. * Otherwise 256 bytes or more (with padding).
  59. */
  60. IX_REORDER_TABLE_OFFSET,
  61. /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */
  62. IX_TRIE_OFFSET,
  63. IX_RESERVED8_OFFSET, // 8
  64. /** Byte offset to int64_t ces[]. */
  65. IX_CES_OFFSET,
  66. IX_RESERVED10_OFFSET,
  67. /** Byte offset to uint32_t ce32s[]. */
  68. IX_CE32S_OFFSET,
  69. /** Byte offset to uint32_t rootElements[]. */
  70. IX_ROOT_ELEMENTS_OFFSET, // 12
  71. /** Byte offset to UChar *contexts[]. */
  72. IX_CONTEXTS_OFFSET,
  73. /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */
  74. IX_UNSAFE_BWD_OFFSET,
  75. /** Byte offset to uint16_t fastLatinTable[]. */
  76. IX_FAST_LATIN_TABLE_OFFSET,
  77. /** Byte offset to uint16_t scripts[]. */
  78. IX_SCRIPTS_OFFSET, // 16
  79. /**
  80. * Byte offset to UBool compressibleBytes[].
  81. * Empty table if <256 bytes (padding only).
  82. * Otherwise 256 bytes or more (with padding).
  83. */
  84. IX_COMPRESSIBLE_BYTES_OFFSET,
  85. IX_RESERVED18_OFFSET,
  86. IX_TOTAL_SIZE
  87. };
  88. static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
  89. CollationTailoring &tailoring, UErrorCode &errorCode);
  90. static UBool U_CALLCONV
  91. isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
  92. private:
  93. CollationDataReader(); // no constructor
  94. };
  95. /*
  96. * Format of collation data (ucadata.icu, binary data in coll/ *.res files).
  97. * Format version 5.
  98. *
  99. * The root collation data is stored in the ucadata.icu file.
  100. * Tailorings are stored inside .res resource bundle files, with a complete file header.
  101. *
  102. * Collation data begins with a standard ICU data file header
  103. * (DataHeader, see ucmndata.h and unicode/udata.h).
  104. * The UDataInfo.dataVersion field contains the UCA and other version numbers,
  105. * see the comments for CollationTailoring.version.
  106. *
  107. * After the header, the file contains the following parts.
  108. * Constants are defined as enum values of the CollationDataReader class.
  109. * See also the Collation class.
  110. *
  111. * int32_t indexes[indexesLength];
  112. * The indexes array has variable length.
  113. * Some tailorings only need the length and the options,
  114. * others only add reorderCodes and the reorderTable,
  115. * some need to store mappings.
  116. * Only as many indexes are stored as needed to read all of the data.
  117. *
  118. * Index 0: indexesLength
  119. * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS
  120. * Index 2..3: Unused/reserved/0.
  121. * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo
  122. * are stored in a short, contiguous part of the ce32s array.
  123. *
  124. * Indexes 5..19 are byte offsets in ascending order.
  125. * Each byte offset marks the start of the next part in the data file,
  126. * and the end of the previous one.
  127. * When two consecutive byte offsets are the same (or too short),
  128. * then the corresponding part is empty.
  129. * Byte offsets are offsets from after the header,
  130. * that is, from the beginning of the indexes[].
  131. * Each part starts at an offset with proper alignment for its data.
  132. * If necessary, the previous part may include padding bytes to achieve this alignment.
  133. * The last byte offset that is stored in the indexes indicates the total size of the data
  134. * (starting with the indexes).
  135. *
  136. * int32_t reorderCodes[]; -- empty in root
  137. * The list of script and reordering codes.
  138. *
  139. * Beginning with format version 5, this array may optionally
  140. * have trailing entries with a full list of reorder ranges
  141. * as described for CollationSettings::reorderRanges.
  142. *
  143. * Script or reorder codes are first and do not exceed 16-bit values.
  144. * Range limits are stored in the upper 16 bits, and are never 0.
  145. * Split this array into reorder codes and ranges at the first entry
  146. * with non-zero upper 16 bits.
  147. *
  148. * If the ranges are missing but needed for split-reordered primary lead bytes,
  149. * then they are regenerated at load time.
  150. *
  151. * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes
  152. * Primary-weight lead byte permutation table.
  153. * Normally present when the reorderCodes are, but can be built at load time.
  154. *
  155. * Beginning with format version 5, a 0 entry at a non-zero index
  156. * (which is otherwise an illegal value)
  157. * means that the primary lead byte is "split"
  158. * (there are different offsets for primaries that share that lead byte)
  159. * and the reordering offset must be determined via the reorder ranges
  160. * that are either stored as part of the reorderCodes array
  161. * or regenerated at load time.
  162. *
  163. * UTrie2 trie; -- see utrie2_impl.h and utrie2.h
  164. * The trie holds the main collation data. Each code point is mapped to a 32-bit value.
  165. * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set,
  166. * in which case it is a special CE32 and contains a 4-bit tag and further data.
  167. * See the Collation class for details.
  168. *
  169. * The trie has a value for each lead surrogate code unit with some bits encoding
  170. * collective properties of the 1024 supplementary characters whose UTF-16 form starts with
  171. * the lead surrogate. See Collation::LEAD_SURROGATE_TAG.
  172. *
  173. * int64_t ces[];
  174. * 64-bit CEs and expansions that cannot be stored in a more compact form.
  175. *
  176. * uint32_t ce32s[];
  177. * CE32s for expansions in compact form, and for characters whose trie values
  178. * contain special data.
  179. *
  180. * uint32_t rootElements[]; -- empty in all tailorings
  181. * Compact storage for all of the CEs that occur in the root collation.
  182. * See the CollationRootElements class.
  183. *
  184. * UChar *contexts[];
  185. * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings.
  186. *
  187. * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize()
  188. * Serialized form of characters that are unsafe when iterating backwards,
  189. * and at the end of an identical string prefix.
  190. * Back up to a safe character.
  191. * Lead surrogates are "unsafe" when any of their corresponding supplementary
  192. * code points are unsafe.
  193. * Does not include [:^lccc=0:][:^tccc=0:].
  194. * For each tailoring, the root unsafeBackwardSet is subtracted.
  195. * (As a result, in many tailorings no set needs to be stored.)
  196. *
  197. * uint16_t fastLatinTable[];
  198. * Optional optimization for Latin text.
  199. * See the CollationFastLatin class.
  200. *
  201. * uint16_t scripts[]; -- empty in all tailorings
  202. * Format version 5:
  203. * uint16_t numScripts;
  204. * uint16_t scriptsIndex[numScripts+16];
  205. * uint16_t scriptStarts[];
  206. * See CollationData::numScripts etc.
  207. *
  208. * Format version 4:
  209. * Table of the reordering groups with their first and last lead bytes,
  210. * and their script and reordering codes.
  211. * See CollationData::scripts.
  212. *
  213. * UBool compressibleBytes[]; -- empty in all tailorings
  214. * Flag for getSortKey(), indicating primary weight lead bytes that are compressible.
  215. *
  216. * -----------------
  217. * Changes for formatVersion 5 (ICU 55)
  218. *
  219. * Reordering moves single scripts, not groups of scripts.
  220. * Reorder ranges are optionally appended to the reorderCodes,
  221. * and a 0 entry in the reorderTable indicates a split lead byte.
  222. * The scripts data has a new format.
  223. *
  224. * The rootElements may contain secondary and tertiary weights below common=05.
  225. * (Used for small Hiragana letters.)
  226. * Where it occurs, there is also an explicit unit with common secondary & tertiary weights.
  227. * There are no other data structure changes, but builder code needs to be able to handle such data.
  228. *
  229. * The collation element for the merge separator code point U+FFFE
  230. * does not necessarily have special, unique secondary/tertiary weights any more.
  231. */
  232. U_NAMESPACE_END
  233. #endif // !UCONFIG_NO_COLLATION
  234. #endif // __COLLATIONDATAREADER_H__