tridpars.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **************************************************************************
  5. * Copyright (c) 2002-2010, International Business Machines Corporation *
  6. * and others. All Rights Reserved. *
  7. **************************************************************************
  8. * Date Name Description *
  9. * 01/28/2002 aliu Creation. *
  10. **************************************************************************
  11. */
  12. #ifndef TRIDPARS_H
  13. #define TRIDPARS_H
  14. #include "unicode/utypes.h"
  15. #if !UCONFIG_NO_TRANSLITERATION
  16. #include "unicode/uobject.h"
  17. #include "unicode/unistr.h"
  18. U_NAMESPACE_BEGIN
  19. class Transliterator;
  20. class UnicodeSet;
  21. class UVector;
  22. /**
  23. * Parsing component for transliterator IDs. This class contains only
  24. * static members; it cannot be instantiated. Methods in this class
  25. * parse various ID formats, including the following:
  26. *
  27. * A basic ID, which contains source, target, and variant, but no
  28. * filter and no explicit inverse. Examples include
  29. * "Latin-Greek/UNGEGN" and "Null".
  30. *
  31. * A single ID, which is a basic ID plus optional filter and optional
  32. * explicit inverse. Examples include "[a-zA-Z] Latin-Greek" and
  33. * "Lower (Upper)".
  34. *
  35. * A compound ID, which is a sequence of one or more single IDs,
  36. * separated by semicolons, with optional forward and reverse global
  37. * filters. The global filters are UnicodeSet patterns prepended or
  38. * appended to the IDs, separated by semicolons. An appended filter
  39. * must be enclosed in parentheses and applies in the reverse
  40. * direction.
  41. *
  42. * @author Alan Liu
  43. */
  44. class TransliteratorIDParser /* not : public UObject because all methods are static */ {
  45. public:
  46. /**
  47. * A structure containing the parsed data of a filtered ID, that
  48. * is, a basic ID optionally with a filter.
  49. *
  50. * 'source' and 'target' will always be non-null. The 'variant'
  51. * will be non-null only if a non-empty variant was parsed.
  52. *
  53. * 'sawSource' is true if there was an explicit source in the
  54. * parsed id. If there was no explicit source, then an implied
  55. * source of ANY is returned and 'sawSource' is set to false.
  56. *
  57. * 'filter' is the parsed filter pattern, or null if there was no
  58. * filter.
  59. */
  60. class Specs : public UMemory {
  61. public:
  62. UnicodeString source; // not null
  63. UnicodeString target; // not null
  64. UnicodeString variant; // may be null
  65. UnicodeString filter; // may be null
  66. UBool sawSource;
  67. Specs(const UnicodeString& s, const UnicodeString& t,
  68. const UnicodeString& v, UBool sawS,
  69. const UnicodeString& f);
  70. private:
  71. Specs(const Specs &other); // forbid copying of this class
  72. Specs &operator=(const Specs &other); // forbid copying of this class
  73. };
  74. /**
  75. * A structure containing the canonicalized data of a filtered ID,
  76. * that is, a basic ID optionally with a filter.
  77. *
  78. * 'canonID' is always non-null. It may be the empty string "".
  79. * It is the id that should be assigned to the created
  80. * transliterator. It _cannot_ be instantiated directly.
  81. *
  82. * 'basicID' is always non-null and non-empty. It is always of
  83. * the form S-T or S-T/V. It is designed to be fed to low-level
  84. * instantiation code that only understands these two formats.
  85. *
  86. * 'filter' may be null, if there is none, or non-null and
  87. * non-empty.
  88. */
  89. class SingleID : public UMemory {
  90. public:
  91. UnicodeString canonID;
  92. UnicodeString basicID;
  93. UnicodeString filter;
  94. SingleID(const UnicodeString& c, const UnicodeString& b,
  95. const UnicodeString& f);
  96. SingleID(const UnicodeString& c, const UnicodeString& b);
  97. Transliterator* createInstance();
  98. private:
  99. SingleID(const SingleID &other); // forbid copying of this class
  100. SingleID &operator=(const SingleID &other); // forbid copying of this class
  101. };
  102. /**
  103. * Parse a filter ID, that is, an ID of the general form
  104. * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
  105. * @param id the id to be parsed
  106. * @param pos INPUT-OUTPUT parameter. On input, the position of
  107. * the first character to parse. On output, the position after
  108. * the last character parsed.
  109. * @return a SingleID object or null if the parse fails
  110. */
  111. static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
  112. /**
  113. * Parse a single ID, that is, an ID of the general form
  114. * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
  115. * optional, the filters optional, and the variants optional.
  116. * @param id the id to be parsed
  117. * @param pos INPUT-OUTPUT parameter. On input, the position of
  118. * the first character to parse. On output, the position after
  119. * the last character parsed.
  120. * @param dir the direction. If the direction is REVERSE then the
  121. * SingleID is constructed for the reverse direction.
  122. * @return a SingleID object or null
  123. */
  124. static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
  125. int32_t dir, UErrorCode& status);
  126. /**
  127. * Parse a global filter of the form "[f]" or "([f])", depending
  128. * on 'withParens'.
  129. * @param id the pattern the parse
  130. * @param pos INPUT-OUTPUT parameter. On input, the position of
  131. * the first character to parse. On output, the position after
  132. * the last character parsed.
  133. * @param dir the direction.
  134. * @param withParens INPUT-OUTPUT parameter. On entry, if
  135. * withParens[0] is 0, then parens are disallowed. If it is 1,
  136. * then parens are required. If it is -1, then parens are
  137. * optional, and the return result will be set to 0 or 1.
  138. * @param canonID OUTPUT parameter. The pattern for the filter
  139. * added to the canonID, either at the end, if dir is FORWARD, or
  140. * at the start, if dir is REVERSE. The pattern will be enclosed
  141. * in parentheses if appropriate, and will be suffixed with an
  142. * ID_DELIM character. May be null.
  143. * @return a UnicodeSet object or null. A non-null results
  144. * indicates a successful parse, regardless of whether the filter
  145. * applies to the given direction. The caller should discard it
  146. * if withParens != (dir == REVERSE).
  147. */
  148. static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
  149. int32_t dir,
  150. int32_t& withParens,
  151. UnicodeString* canonID);
  152. /**
  153. * Parse a compound ID, consisting of an optional forward global
  154. * filter, a separator, one or more single IDs delimited by
  155. * separators, an an optional reverse global filter. The
  156. * separator is a semicolon. The global filters are UnicodeSet
  157. * patterns. The reverse global filter must be enclosed in
  158. * parentheses.
  159. * @param id the pattern the parse
  160. * @param dir the direction.
  161. * @param canonID OUTPUT parameter that receives the canonical ID,
  162. * consisting of canonical IDs for all elements, as returned by
  163. * parseSingleID(), separated by semicolons. Previous contents
  164. * are discarded.
  165. * @param list OUTPUT parameter that receives a list of SingleID
  166. * objects representing the parsed IDs. Previous contents are
  167. * discarded.
  168. * @param globalFilter OUTPUT parameter that receives a pointer to
  169. * a newly created global filter for this ID in this direction, or
  170. * null if there is none.
  171. * @return true if the parse succeeds, that is, if the entire
  172. * id is consumed without syntax error.
  173. */
  174. static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
  175. UnicodeString& canonID,
  176. UVector& list,
  177. UnicodeSet*& globalFilter);
  178. /**
  179. * Convert the elements of the 'list' vector, which are SingleID
  180. * objects, into actual Transliterator objects. In the course of
  181. * this, some (or all) entries may be removed. If all entries
  182. * are removed, the Null transliterator will be added.
  183. *
  184. * Delete entries with empty basicIDs; these are generated by
  185. * elements like "(A)" in the forward direction, or "A()" in
  186. * the reverse. THIS MAY RESULT IN AN EMPTY VECTOR. Convert
  187. * SingleID entries to actual transliterators.
  188. *
  189. * @param list vector of SingleID objects. On exit, vector
  190. * of one or more Transliterators.
  191. * @param ec Output param to receive a success or an error code.
  192. * @return new value of insertIndex. The index will shift if
  193. * there are empty items, like "(Lower)", with indices less than
  194. * insertIndex.
  195. */
  196. static void instantiateList(UVector& list,
  197. UErrorCode& ec);
  198. /**
  199. * Parse an ID into pieces. Take IDs of the form T, T/V, S-T,
  200. * S-T/V, or S/V-T. If the source is missing, return a source of
  201. * ANY.
  202. * @param id the id string, in any of several forms
  203. * @param source the given source.
  204. * @param target the given target.
  205. * @param variant the given variant
  206. * @param isSourcePresent If TRUE then the source is present.
  207. * If the source is not present, ANY will be
  208. * given as the source, and isSourcePresent will be null
  209. * @return an array of 4 strings: source, target, variant, and
  210. * isSourcePresent. If the source is not present, ANY will be
  211. * given as the source, and isSourcePresent will be null. Otherwise
  212. * isSourcePresent will be non-null. The target may be empty if the
  213. * id is not well-formed. The variant may be empty.
  214. */
  215. static void IDtoSTV(const UnicodeString& id,
  216. UnicodeString& source,
  217. UnicodeString& target,
  218. UnicodeString& variant,
  219. UBool& isSourcePresent);
  220. /**
  221. * Given source, target, and variant strings, concatenate them into a
  222. * full ID. If the source is empty, then "Any" will be used for the
  223. * source, so the ID will always be of the form s-t/v or s-t.
  224. */
  225. static void STVtoID(const UnicodeString& source,
  226. const UnicodeString& target,
  227. const UnicodeString& variant,
  228. UnicodeString& id);
  229. /**
  230. * Register two targets as being inverses of one another. For
  231. * example, calling registerSpecialInverse("NFC", "NFD", true) causes
  232. * Transliterator to form the following inverse relationships:
  233. *
  234. * <pre>NFC => NFD
  235. * Any-NFC => Any-NFD
  236. * NFD => NFC
  237. * Any-NFD => Any-NFC</pre>
  238. *
  239. * (Without the special inverse registration, the inverse of NFC
  240. * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
  241. * that the presence or absence of "Any-" is preserved.
  242. *
  243. * <p>The relationship is symmetrical; registering (a, b) is
  244. * equivalent to registering (b, a).
  245. *
  246. * <p>The relevant IDs must still be registered separately as
  247. * factories or classes.
  248. *
  249. * <p>Only the targets are specified. Special inverses always
  250. * have the form Any-Target1 <=> Any-Target2. The target should
  251. * have canonical casing (the casing desired to be produced when
  252. * an inverse is formed) and should contain no whitespace or other
  253. * extraneous characters.
  254. *
  255. * @param target the target against which to register the inverse
  256. * @param inverseTarget the inverse of target, that is
  257. * Any-target.getInverse() => Any-inverseTarget
  258. * @param bidirectional if true, register the reverse relation
  259. * as well, that is, Any-inverseTarget.getInverse() => Any-target
  260. */
  261. static void registerSpecialInverse(const UnicodeString& target,
  262. const UnicodeString& inverseTarget,
  263. UBool bidirectional,
  264. UErrorCode &status);
  265. /**
  266. * Free static memory.
  267. */
  268. static void cleanup();
  269. private:
  270. //----------------------------------------------------------------
  271. // Private implementation
  272. //----------------------------------------------------------------
  273. // forbid instantiation
  274. TransliteratorIDParser();
  275. /**
  276. * Parse an ID into component pieces. Take IDs of the form T,
  277. * T/V, S-T, S-T/V, or S/V-T. If the source is missing, return a
  278. * source of ANY.
  279. * @param id the id string, in any of several forms
  280. * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
  281. * offset of the first character to parse in id. On output,
  282. * pos[0] is the offset after the last parsed character. If the
  283. * parse failed, pos[0] will be unchanged.
  284. * @param allowFilter if true, a UnicodeSet pattern is allowed
  285. * at any location between specs or delimiters, and is returned
  286. * as the fifth string in the array.
  287. * @return a Specs object, or null if the parse failed. If
  288. * neither source nor target was seen in the parsed id, then the
  289. * parse fails. If allowFilter is true, then the parsed filter
  290. * pattern is returned in the Specs object, otherwise the returned
  291. * filter reference is null. If the parse fails for any reason
  292. * null is returned.
  293. */
  294. static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
  295. UBool allowFilter);
  296. /**
  297. * Givens a Specs object, convert it to a SingleID object. The
  298. * Spec object is a more unprocessed parse result. The SingleID
  299. * object contains information about canonical and basic IDs.
  300. * @param specs the given Specs object.
  301. * @param dir either FORWARD or REVERSE.
  302. * @return a SingleID; never returns null. Returned object always
  303. * has 'filter' field of null.
  304. */
  305. static SingleID* specsToID(const Specs* specs, int32_t dir);
  306. /**
  307. * Given a Specs object, return a SingleID representing the
  308. * special inverse of that ID. If there is no special inverse
  309. * then return null.
  310. * @param specs the given Specs.
  311. * @return a SingleID or null. Returned object always has
  312. * 'filter' field of null.
  313. */
  314. static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
  315. /**
  316. * Glue method to get around access problems in C++.
  317. * @param id the id string for the transliterator, in any of several forms
  318. * @param canonID the given canonical ID
  319. */
  320. static Transliterator* createBasicInstance(const UnicodeString& id,
  321. const UnicodeString* canonID);
  322. /**
  323. * Initialize static memory.
  324. */
  325. static void U_CALLCONV init(UErrorCode &status);
  326. friend class SingleID;
  327. };
  328. U_NAMESPACE_END
  329. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  330. #endif