bdict_reader.h 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. // Copyright (c) 2011 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_READER_H_
  5. #define THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_READER_H_
  6. #include <stddef.h>
  7. #include <string>
  8. #include <vector>
  9. #include "base/macros.h"
  10. #include "third_party/hunspell/google/bdict.h"
  11. namespace hunspell {
  12. class BDictReader;
  13. class NodeReader;
  14. // Iterators -------------------------------------------------------------------
  15. // Iterates through all words in the dictionary. It will fill the word into
  16. // a caller-specified buffer.
  17. class WordIterator {
  18. public:
  19. WordIterator(const WordIterator& other);
  20. ~WordIterator();
  21. // This must be explicitly declared and implemneted in the .cc file so it will
  22. // compile without knowing the size of NodeInfo.
  23. WordIterator& operator=(const WordIterator&);
  24. // Fills the buffer with the next word and the affixes for it into the given
  25. // array. Returns the number of affixes. A return value of 0 means there are
  26. // no more words.
  27. int Advance(char* output_buffer, size_t output_len,
  28. int affix_ids[BDict::MAX_AFFIXES_PER_WORD]);
  29. private:
  30. friend class BDictReader;
  31. struct NodeInfo;
  32. WordIterator(const NodeReader& reader);
  33. // Called by Advance when a leaf is found to generate the word, affix list,
  34. // and return value.
  35. int FoundLeaf(const NodeReader& node, char cur_char,
  36. char* output_buffer, size_t output_len,
  37. int affix_ids[BDict::MAX_AFFIXES_PER_WORD]);
  38. std::vector<NodeInfo> stack_;
  39. };
  // Will iterate over a list of lines separated by NULLs.
  class LineIterator {
   public:
    // Returns the next word in the sequence or NULL if there are no more.
    const char* Advance();

    // Advances to the next word in the sequence and copies it into the given
    // buffer, of the given length. If it doesn't fit, it will be truncated.
    // Returns true on success.
    bool AdvanceAndCopy(char* buf, size_t buf_len);

    // Returns true when all data has been read. We're done when we reach a
    // double-NULL or the end of the input (shouldn't happen).
    bool IsDone() const;

   protected:
    friend class BDictReader;

    // Not public: iterators are created by BDictReader (a friend) or by
    // derived classes.
    LineIterator(const unsigned char* bdict_data, size_t bdict_length,
                 size_t first_offset);

    const unsigned char* bdict_data_;
    size_t bdict_length_;

    // Current offset within bdict_data of the next string to read.
    size_t cur_offset_;
  };
  61. // Created by GetReplacementIterator to iterate over all replacement pairs.
  62. class ReplacementIterator : public LineIterator {
  63. public:
  64. // Fills pointers to NULL terminated strings into the given output params.
  65. // Returns false if there are no more pairs and nothing was filled in.
  66. bool GetNext(const char** first, const char** second);
  67. private:
  68. friend class BDictReader;
  69. ReplacementIterator(const unsigned char* bdict_data, size_t bdict_length,
  70. size_t first_offset)
  71. : LineIterator(bdict_data, bdict_length, first_offset) {
  72. }
  73. };
  74. // Reads a BDict file mapped into memory.
  75. class BDictReader {
  76. public:
  77. // You must call Init and it must succeed before calling any other functions.
  78. BDictReader();
  79. // Initializes the reader with the given data. The data does not transfer
  80. // ownership, and the caller must keep it valid until the reader is destroyed.
  81. // Returns true on success.
  82. bool Init(const unsigned char* bdic_data, size_t bdic_length);
  83. // Returns true if Init() succeeded and other functions can be called.
  84. bool IsValid() const { return !!bdict_data_; }
  85. // Locates the given word in the dictionary. There may be multiple matches if
  86. // the word is listed multiple times in the dictionary with different affix
  87. // rules.
  88. //
  89. // The number of matches is returned, and that number of corresponding affix
  90. // group IDs are filled into |*affix_indices|. These IDs may be 0 to indicate
  91. // there is no affix for that particular match. A return valuf of 0 means that
  92. // there are no matches.
  93. int FindWord(const char* word,
  94. int affix_indices[BDict::MAX_AFFIXES_PER_WORD]) const;
  95. // Returns an iterator that will go over all AF lines ("affix groups").
  96. LineIterator GetAfLineIterator() const;
  97. // Returns an iterator that will go over all SFX/PFX lines ("affix rules").
  98. LineIterator GetAffixLineIterator() const;
  99. // Returns an iterator that will go over all "other" lines.
  100. LineIterator GetOtherLineIterator() const;
  101. // Returns an iterator that can be used to iterate all replacements.
  102. ReplacementIterator GetReplacementIterator() const;
  103. // Used for testing, returns an iterator for all words in the dictionary.
  104. WordIterator GetAllWordIterator() const;
  105. private:
  106. // Non-NULL indicates Init succeeded.
  107. const unsigned char* bdict_data_;
  108. size_t bdict_length_;
  109. // Pointer not owned by this class. It will point into the data. It will be
  110. // NULL if the data is invalid.
  111. const BDict::Header* header_;
  112. const BDict::AffHeader* aff_header_;
  113. DISALLOW_COPY_AND_ASSIGN(BDictReader);
  114. };
  115. } // namespace hunspell
  116. #endif // THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_READER_H_