bdict.h 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. // Copyright (c) 2011 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_
  5. #define THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_
  6. #include <stddef.h>
  7. #include <stdint.h>
  8. #include "base/hash/md5.h"
  9. // BDict (binary dictionary) format. All offsets are little endian.
  10. //
  11. // Header (28 bytes).
  12. // "BDic" Signature (4 bytes)
  13. // Version (little endian 4 bytes)
  14. // Absolute offset in file of the aff info. (4 bytes)
  15. // Absolute offset in file of the dic table. (4 bytes)
  16. // (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes)
  17. //
  18. // Aff information:
  19. // Absolute offset in file of the affix group table (4 bytes)
  20. // Absolute offset in file of the affix rules table (4 bytes)
  21. // Absolute offset in file of the replacements table (4 bytes)
  22. // Absolute offset in file of the "other rules" table (4 bytes)
  23. //
  24. // The data between the aff header and the affix rules table is the comment
  25. // from the beginning of the .aff file which often contains copyrights, etc.
  26. //
  27. // Affix group table:
  28. // Array of NULL terminated strings. It will end in a double-NULL.
  29. //
  30. // Affix rules table:
  31. // List of LF termianted lines. NULL terminated.
  32. //
  33. // Replacements table:
  34. // List of pairs of NULL teminated words. The end is indicated by a
  35. // double-NULL. The first word in the pair is the replacement source, the
  36. // second is what to replace it with. Example:
  37. // foo\0bar\0a\0b\0\0
  38. // for replacing ("foo" with "bar") and ("a" with "b").
  39. //
  40. // Other rules table:
  41. // List of LF termianted lines. NULL terminated.
  42. //
  43. //
  44. // Dic table. This stores the .dic file which contains the words in the
  45. // dictionary, and indices for each one that indicate a set of suffixes or
  46. // prefixes that can be applied. We store it in a trie to save space. It
  47. // replaces Hunspell's hash manager.
  48. //
  49. // 0abxxxxx xxxxxxxx (in binary) Leaf node:
  50. // The number stored in the bits represented by x is the affix index.
  51. //
  52. // If bit <a> is set, the leaf node has an additional string. Following the
  53. // 2 byte header is a NULL-terminated (possibly 0-length) string that should
  54. // be appended to the node. This allows long unique endings to be handled
  55. // efficiently.
  56. //
  57. // If bit <b> is set, the leaf node has a supplimental list of affix IDs
  58. // following the ordinary data for the leaf node. These affix group IDs are
  59. // additional rules for the same word. For example, two prefixes may go
  60. // with distinct sets of suffixes.
  61. //
  62. // If the affix index is all 1's, then that means that there is only the
  63. // supplimental list, and the 13-bit of affix built-in to the node don't
  64. // count. This is used to represent numbers greater than 13 bits, since
  65. // the supplimentary list has 16 bits per entry. The node must have a
  66. // supplimenal list if this is set.
  67. //
  68. // This additional array is an array of 16-bit little-endian values,
  69. // terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID".
  70. //
  71. // 0x110000ab: Lookup node.
  72. // When <a> is set, addresses are 32-bits relative to the beginning of the
  73. // dictionary data. When unset, addresses are 16-bits relative to the
  74. // beginning of this node. All values are little endian.
  75. //
  76. // When <b> is set, there is one additional entry before the table begins.
  77. // This is the 0th character. 0 is a common addition (meaning no more data)
  78. // and this prevents us from having to store entries for all the control
  79. // characters. This magic element is not counted in the table size.
  80. //
  81. // The ID byte is followeed by two bytes:
  82. // XX: First character value in the lookup table.
  83. // XX: Number of characters in the lookup table.
  84. //
  85. // This is followed optionally by the entry for 0, and then by a table of
  86. // size indicated by the second charatcer after the ID.
  87. //
  88. // 1110xxxx: List node with 8-bit addresses.
  89. // The number of items (max 16) in the list is stored in the bits xxxx.
  90. // Followed by N (character byte, 8-bit offset) pairs. These offsets are
  91. // relative to the end of the list of pairs.
  92. // 1111xxxx: List node with 16-bit addresses. Same as above but offsets are
  93. // 2-bytes each. LITTLE ENDIAN!
  94. namespace hunspell {
  95. #pragma pack(push, 1)
  96. class BDict {
  97. public:
  98. // File header.
  99. enum { SIGNATURE = 0x63694442 };
  100. enum {
  101. MAJOR_VERSION = 2,
  102. MINOR_VERSION = 0
  103. };
  104. struct Header {
  105. uint32_t signature;
  106. // Major versions are incompatible with other major versions. Minor versions
  107. // should be readable by older programs expecting the same major version.
  108. uint16_t major_version;
  109. uint16_t minor_version;
  110. uint32_t aff_offset; // Offset of the aff data.
  111. uint32_t dic_offset; // Offset of the dic data.
  112. // Added by version 2.0.
  113. base::MD5Digest digest; // MD5 digest of the aff data and the dic data.
  114. };
  115. // AFF section ===============================================================
  116. struct AffHeader {
  117. uint32_t affix_group_offset;
  118. uint32_t affix_rule_offset;
  119. uint32_t rep_offset; // Replacements table.
  120. uint32_t other_offset;
  121. };
  122. // DIC section ===============================================================
  123. // Leaf ----------------------------------------------------------------------
  124. // Leaf nodes have the high bit set to 0.
  125. enum { LEAF_NODE_TYPE_MASK = 0x80 }; // 10000000
  126. enum { LEAF_NODE_TYPE_VALUE = 0 }; // 00000000
  127. // Leaf nodes with additional strings have the next-to-high bit set to 1.
  128. // This mask/value pair also includes the high bit set to 0 which is the leaf
  129. // indicator.
  130. enum { LEAF_NODE_ADDITIONAL_MASK = 0xC0 }; // 11000000
  131. enum { LEAF_NODE_ADDITIONAL_VALUE = 0x40 }; // 01000000
  132. // Leaf nodes with an additional array of affix rules following it.
  133. enum { LEAF_NODE_FOLLOWING_MASK = 0xA0 }; // 10100000
  134. enum { LEAF_NODE_FOLLOWING_VALUE = 0x20 }; // 00100000
  135. // The low 5 bits of the leaf node ID byte are the first 5 bits of the affix
  136. // ID. The following byte is used for the low bits of the affix ID (we don't
  137. // specify as mask for that).
  138. enum { LEAF_NODE_FIRST_BYTE_AFFIX_MASK = 0x1F }; // 00011111
  139. // The maximum affix value that can be stored in the first entry (not in the
  140. // following list). We reserve all 1's to be a magic value (see next entry)
  141. // so we can store large numbers somewhere else.
  142. enum { LEAF_NODE_MAX_FIRST_AFFIX_ID = 0x1FFE }; // 00011111 11111110
  143. // When the affix built-in to the leaf node (the first one) has too many bits
  144. // for the space reserved for it (13 bits), then we fill it with this value.
  145. // This means that the affix doesn't count. The affix will instead be stored
  146. // in the "following list" which allows up to 16 bits per entry.
  147. enum { FIRST_AFFIX_IS_UNUSED = 0x1FFF }; // 00011111 11111111
  148. // The maximum number of leaf nodes we'll read that have the same word and
  149. // follow each other (the FOLLOWING bit is set).
  150. enum { MAX_AFFIXES_PER_WORD = 32 };
  151. // The terminator for the list of following affix group IDs.
  152. enum { LEAF_NODE_FOLLOWING_LIST_TERMINATOR = 0xFFFF };
  153. // Lookup --------------------------------------------------------------------
  154. // Lookup nodes have the first 6 bits set to 110000.
  155. enum { LOOKUP_NODE_TYPE_MASK = 0xFC }; // 11111100
  156. enum { LOOKUP_NODE_TYPE_VALUE = 0xC0 }; // 11000000
  157. // Lookup nodes have the low bit meaning it has a 0th entry, and the
  158. // next-to-lowest bit indicating whether the offsets are 32-bits. Included
  159. // in these masks are the lookup ID above.
  160. enum { LOOKUP_NODE_0TH_MASK = 0xFD }; // 11111110
  161. enum { LOOKUP_NODE_0TH_VALUE = 0xC1 }; // 11000010
  162. enum { LOOKUP_NODE_32BIT_MASK = 0xFE}; // 11111110
  163. enum { LOOKUP_NODE_32BIT_VALUE = 0xC2}; // 11000001
  164. // List ----------------------------------------------------------------------
  165. // List nodes have the first 3 bits set to 1.
  166. enum { LIST_NODE_TYPE_MASK = 0xE0 }; // 11100000
  167. enum { LIST_NODE_TYPE_VALUE = 0xE0 }; // 11100000
  168. // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list.
  169. // This mask/value also includes the list ID in the high 3 bits.
  170. enum { LIST_NODE_16BIT_MASK = 0xF0 }; // 11110000
  171. enum { LIST_NODE_16BIT_VALUE = 0xF0 }; // 11110000
  172. // The low 4 bits of the list ID byte are the count.
  173. enum { LIST_NODE_COUNT_MASK = 0xF }; // 00001111
  174. // Verifies the specified BDICT is sane. This function checks the BDICT header
  175. // and compares the MD5 digest of the data with the one in the header.
  176. static bool Verify(const char* bdict_data, size_t bdict_length);
  177. };
  178. #pragma pack(pop)
  179. } // namespace hunspell
  180. #endif // THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_