dictionarydata.h 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * dictionarydata.h
  9. *
  10. * created on: 2012may31
  11. * created by: Markus W. Scherer & Maxime Serrano
  12. */
  13. #ifndef __DICTIONARYDATA_H__
  14. #define __DICTIONARYDATA_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_BREAK_ITERATION
  17. #include "unicode/utext.h"
  18. #include "unicode/udata.h"
  19. #include "udataswp.h"
  20. #include "unicode/uobject.h"
  21. #include "unicode/ustringtrie.h"
  22. U_NAMESPACE_BEGIN
  23. class UCharsTrie;
  24. class BytesTrie;
  25. class U_COMMON_API DictionaryData : public UMemory {
  26. public:
  27. static const int32_t TRIE_TYPE_BYTES; // = 0;
  28. static const int32_t TRIE_TYPE_UCHARS; // = 1;
  29. static const int32_t TRIE_TYPE_MASK; // = 7;
  30. static const int32_t TRIE_HAS_VALUES; // = 8;
  31. static const int32_t TRANSFORM_NONE; // = 0;
  32. static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000;
  33. static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000;
  34. static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff;
  35. enum {
  36. // Byte offsets from the start of the data, after the generic header.
  37. IX_STRING_TRIE_OFFSET,
  38. IX_RESERVED1_OFFSET,
  39. IX_RESERVED2_OFFSET,
  40. IX_TOTAL_SIZE,
  41. // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc.
  42. IX_TRIE_TYPE,
  43. // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc.
  44. IX_TRANSFORM,
  45. IX_RESERVED6,
  46. IX_RESERVED7,
  47. IX_COUNT
  48. };
  49. };
  50. /**
  51. * Wrapper class around generic dictionaries, implementing matches().
  52. * getType() should return a TRIE_TYPE_??? constant from DictionaryData.
  53. *
  54. * All implementations of this interface must be thread-safe if they are to be used inside of the
  55. * dictionary-based break iteration code.
  56. */
  57. class U_COMMON_API DictionaryMatcher : public UMemory {
  58. public:
  59. DictionaryMatcher() {}
  60. virtual ~DictionaryMatcher();
  61. // this should emulate CompactTrieDictionary::matches()
  62. /* @param text The text in which to look for matching words. Matching begins
  63. * at the current position of the UText.
  64. * @param maxLength The max length of match to consider. Units are the native indexing
  65. * units of the UText.
  66. * @param limit Capacity of output arrays, which is also the maximum number of
  67. * matching words to be found.
  68. * @param lengths output array, filled with the lengths of the matches, in order,
  69. * from shortest to longest. Lengths are in native indexing units
  70. * of the UText. May be NULL.
  71. * @param cpLengths output array, filled with the lengths of the matches, in order,
  72. * from shortest to longest. Lengths are the number of Unicode code points.
  73. * May be NULL.
  74. * @param values Output array, filled with the values associated with the words found.
  75. * May be NULL.
  76. * @param prefix Output parameter, the code point length of the prefix match, even if that
  77. * prefix didn't lead to a complete word. Will always be >= the cpLength
  78. * of the longest complete word matched. May be NULL.
  79. * @return Number of matching words found.
  80. */
  81. virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
  82. int32_t *lengths, int32_t *cpLengths, int32_t *values,
  83. int32_t *prefix) const = 0;
  84. /** @return DictionaryData::TRIE_TYPE_XYZ */
  85. virtual int32_t getType() const = 0;
  86. };
  87. // Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary
  88. class U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher {
  89. public:
  90. // constructs a new UCharsDictionaryMatcher.
  91. // The UDataMemory * will be closed on this object's destruction.
  92. UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
  93. virtual ~UCharsDictionaryMatcher();
  94. virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
  95. int32_t *lengths, int32_t *cpLengths, int32_t *values,
  96. int32_t *prefix) const;
  97. virtual int32_t getType() const;
  98. private:
  99. const UChar *characters;
  100. UDataMemory *file;
  101. };
  102. // Implementation of the DictionaryMatcher interface for a BytesTrie dictionary
  103. class U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher {
  104. public:
  105. // constructs a new BytesTrieDictionaryMatcher
  106. // the transform constant should be the constant read from the file, not a masked version!
  107. // the UDataMemory * fed in here will be closed on this object's destruction
  108. BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
  109. : characters(c), transformConstant(t), file(f) { }
  110. virtual ~BytesDictionaryMatcher();
  111. virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
  112. int32_t *lengths, int32_t *cpLengths, int32_t *values,
  113. int32_t *prefix) const;
  114. virtual int32_t getType() const;
  115. private:
  116. UChar32 transform(UChar32 c) const;
  117. const char *characters;
  118. int32_t transformConstant;
  119. UDataMemory *file;
  120. };
  121. U_NAMESPACE_END
  122. U_CAPI int32_t U_EXPORT2
  123. udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode);
  124. /**
  125. * Format of dictionary .dict data files.
  126. * Format version 1.0.
  127. *
  128. * A dictionary .dict data file contains a byte-serialized BytesTrie or
  129. * a UChars-serialized UCharsTrie.
  130. * Such files are used in dictionary-based break iteration (DBBI).
  131. *
  132. * For a BytesTrie, a transformation type is specified for
  133. * transforming Unicode strings into byte sequences.
  134. *
  135. * A .dict file begins with a standard ICU data file header
  136. * (DataHeader, see ucmndata.h and unicode/udata.h).
  137. * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0).
  138. *
  139. * After the header, the file contains the following parts.
  140. * Constants are defined in the DictionaryData class.
  141. *
  142. * For the data structure of BytesTrie & UCharsTrie see
  143. * http://site.icu-project.org/design/struct/tries
  144. * and the bytestrie.h and ucharstrie.h header files.
  145. *
  146. * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4;
  147. *
  148. * The first four indexes are byte offsets in ascending order.
  149. * Each byte offset marks the start of the next part in the data file,
  150. * and the end of the previous one.
  151. * When two consecutive byte offsets are the same, then the corresponding part is empty.
  152. * Byte offsets are offsets from after the header,
  153. * that is, from the beginning of the indexes[].
  154. * Each part starts at an offset with proper alignment for its data.
  155. * If necessary, the previous part may include padding bytes to achieve this alignment.
  156. *
  157. * trieType=indexes[IX_TRIE_TYPE] defines the trie type.
  158. * transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation.
  159. * If the transformation type is TRANSFORM_TYPE_OFFSET,
  160. * then the lower 21 bits contain the offset code point.
  161. * Each code point c is mapped to byte b = (c - offset).
  162. * Code points outside the range offset..(offset+0xff) cannot be mapped
  163. * and do not occur in the dictionary.
  164. *
  165. * stringTrie; -- a serialized BytesTrie or UCharsTrie
  166. *
  167. * The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType),
  168. * or it maps all strings to 0 (TRIE_HAS_VALUES bit not set).
  169. */
  170. #endif /* !UCONFIG_NO_BREAK_ITERATION */
  171. #endif /* __DICTIONARYDATA_H__ */