strmatch.h 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. * Copyright (C) 2001-2011, International Business Machines Corporation
  5. * and others. All Rights Reserved.
  6. **********************************************************************
  7. * Date Name Description
  8. * 07/23/01 aliu Creation.
  9. **********************************************************************
  10. */
  11. #ifndef STRMATCH_H
  12. #define STRMATCH_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_TRANSLITERATION
  15. #include "unicode/unistr.h"
  16. #include "unicode/unifunct.h"
  17. #include "unicode/unimatch.h"
  18. #include "unicode/unirepl.h"
  19. U_NAMESPACE_BEGIN
  20. class TransliterationRuleData;
  21. /**
  22. * An object that matches a fixed input string, implementing the
  23. * UnicodeMatcher API. This object also implements the
  24. * UnicodeReplacer API, allowing it to emit the matched text as
  25. * output. Since the match text may contain flexible match elements,
  26. * such as UnicodeSets, the emitted text is not the match pattern, but
  27. * instead a substring of the actual matched text. Following
  28. * convention, the output text is the leftmost match seen up to this
  29. * point.
  30. *
  31. * A StringMatcher may represent a segment, in which case it has a
  32. * positive segment number. This affects how the matcher converts
  33. * itself to a pattern but does not otherwise affect its function.
  34. *
  35. * A StringMatcher that is not a segment should not be used as a
  36. * UnicodeReplacer.
  37. */
  38. class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
  39. public:
  40. /**
  41. * Construct a matcher that matches the given pattern string.
  42. * @param string the pattern to be matched, possibly containing
  43. * stand-ins that represent nested UnicodeMatcher objects.
  44. * @param start inclusive start index of text to be replaced
  45. * @param limit exclusive end index of text to be replaced;
  46. * must be greater than or equal to start
  47. * @param segmentNum the segment number from 1..n, or 0 if this is
  48. * not a segment.
  49. * @param data context object mapping stand-ins to
  50. * UnicodeMatcher objects.
  51. */
  52. StringMatcher(const UnicodeString& string,
  53. int32_t start,
  54. int32_t limit,
  55. int32_t segmentNum,
  56. const TransliterationRuleData& data);
  57. /**
  58. * Copy constructor
  59. * @param o the object to be copied.
  60. */
  61. StringMatcher(const StringMatcher& o);
  62. /**
  63. * Destructor
  64. */
  65. virtual ~StringMatcher();
  66. /**
  67. * Implement UnicodeFunctor
  68. * @return a copy of the object.
  69. */
  70. virtual StringMatcher* clone() const;
  71. /**
  72. * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
  73. * and return the pointer.
  74. * @return the UnicodeMatcher point.
  75. */
  76. virtual UnicodeMatcher* toMatcher() const;
  77. /**
  78. * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
  79. * and return the pointer.
  80. * @return the UnicodeReplacer pointer.
  81. */
  82. virtual UnicodeReplacer* toReplacer() const;
  83. /**
  84. * Implement UnicodeMatcher
  85. * @param text the text to be matched
  86. * @param offset on input, the index into text at which to begin
  87. * matching. On output, the limit of the matched text. The
  88. * number of matched characters is the output value of offset
  89. * minus the input value. Offset should always point to the
  90. * HIGH SURROGATE (leading code unit) of a pair of surrogates,
  91. * both on entry and upon return.
  92. * @param limit the limit index of text to be matched. Greater
  93. * than offset for a forward direction match, less than offset for
  94. * a backward direction match. The last character to be
  95. * considered for matching will be text.charAt(limit-1) in the
  96. * forward direction or text.charAt(limit+1) in the backward
  97. * direction.
  98. * @param incremental if TRUE, then assume further characters may
  99. * be inserted at limit and check for partial matching. Otherwise
  100. * assume the text as given is complete.
  101. * @return a match degree value indicating a full match, a partial
  102. * match, or a mismatch. If incremental is FALSE then
  103. * U_PARTIAL_MATCH should never be returned.
  104. */
  105. virtual UMatchDegree matches(const Replaceable& text,
  106. int32_t& offset,
  107. int32_t limit,
  108. UBool incremental);
  109. /**
  110. * Implement UnicodeMatcher
  111. * @param result Output param to receive the pattern.
  112. * @param escapeUnprintable if True then escape the unprintable characters.
  113. * @return A reference to 'result'.
  114. */
  115. virtual UnicodeString& toPattern(UnicodeString& result,
  116. UBool escapeUnprintable = FALSE) const;
  117. /**
  118. * Implement UnicodeMatcher
  119. * Returns TRUE if this matcher will match a character c, where c
  120. * & 0xFF == v, at offset, in the forward direction (with limit >
  121. * offset). This is used by <tt>RuleBasedTransliterator</tt> for
  122. * indexing.
  123. * @param v the given value
  124. * @return TRUE if this matcher will match a character c,
  125. * where c & 0xFF == v
  126. */
  127. virtual UBool matchesIndexValue(uint8_t v) const;
  128. /**
  129. * Implement UnicodeMatcher
  130. */
  131. virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
  132. /**
  133. * Implement UnicodeFunctor
  134. */
  135. virtual void setData(const TransliterationRuleData*);
  136. /**
  137. * Replace characters in 'text' from 'start' to 'limit' with the
  138. * output text of this object. Update the 'cursor' parameter to
  139. * give the cursor position and return the length of the
  140. * replacement text.
  141. *
  142. * @param text the text to be matched
  143. * @param start inclusive start index of text to be replaced
  144. * @param limit exclusive end index of text to be replaced;
  145. * must be greater than or equal to start
  146. * @param cursor output parameter for the cursor position.
  147. * Not all replacer objects will update this, but in a complete
  148. * tree of replacer objects, representing the entire output side
  149. * of a transliteration rule, at least one must update it.
  150. * @return the number of 16-bit code units in the text replacing
  151. * the characters at offsets start..(limit-1) in text
  152. */
  153. virtual int32_t replace(Replaceable& text,
  154. int32_t start,
  155. int32_t limit,
  156. int32_t& cursor);
  157. /**
  158. * Returns a string representation of this replacer. If the
  159. * result of calling this function is passed to the appropriate
  160. * parser, typically TransliteratorParser, it will produce another
  161. * replacer that is equal to this one.
  162. * @param result the string to receive the pattern. Previous
  163. * contents will be deleted.
  164. * @param escapeUnprintable if TRUE then convert unprintable
  165. * character to their hex escape representations, \\uxxxx or
  166. * \\Uxxxxxxxx. Unprintable characters are defined by
  167. * Utility.isUnprintable().
  168. * @return a reference to 'result'.
  169. */
  170. virtual UnicodeString& toReplacerPattern(UnicodeString& result,
  171. UBool escapeUnprintable) const;
  172. /**
  173. * Remove any match data. This must be called before performing a
  174. * set of matches with this segment.
  175. */
  176. void resetMatch();
  177. /**
  178. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  179. */
  180. virtual UClassID getDynamicClassID() const;
  181. /**
  182. * ICU "poor man's RTTI", returns a UClassID for this class.
  183. */
  184. static UClassID U_EXPORT2 getStaticClassID();
  185. /**
  186. * Union the set of all characters that may output by this object
  187. * into the given set.
  188. * @param toUnionTo the set into which to union the output characters
  189. */
  190. virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
  191. private:
  192. /**
  193. * The text to be matched.
  194. */
  195. UnicodeString pattern;
  196. /**
  197. * Context object that maps stand-ins to matcher and replacer
  198. * objects.
  199. */
  200. const TransliterationRuleData* data;
  201. /**
  202. * The segment number, 1-based, or 0 if not a segment.
  203. */
  204. int32_t segmentNumber;
  205. /**
  206. * Start offset, in the match text, of the <em>rightmost</em>
  207. * match.
  208. */
  209. int32_t matchStart;
  210. /**
  211. * Limit offset, in the match text, of the <em>rightmost</em>
  212. * match.
  213. */
  214. int32_t matchLimit;
  215. };
  216. U_NAMESPACE_END
  217. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  218. #endif