translit.h 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/17/99 aliu Creation.
  10. **********************************************************************
  11. */
  12. #ifndef TRANSLIT_H
  13. #define TRANSLIT_H
  14. #include "unicode/utypes.h"
  15. #if U_SHOW_CPLUSPLUS_API
  16. /**
  17. * \file
  18. * \brief C++ API: Transforms text from one format to another.
  19. */
  20. #if !UCONFIG_NO_TRANSLITERATION
  21. #include "unicode/uobject.h"
  22. #include "unicode/unistr.h"
  23. #include "unicode/parseerr.h"
  24. #include "unicode/utrans.h" // UTransPosition, UTransDirection
  25. #include "unicode/strenum.h"
  26. U_NAMESPACE_BEGIN
  27. class UnicodeFilter;
  28. class UnicodeSet;
  29. class TransliteratorParser;
  30. class NormalizationTransliterator;
  31. class TransliteratorIDParser;
  32. /**
  33. *
  34. * <code>Transliterator</code> is an abstract class that
  35. * transliterates text from one format to another. The most common
  36. * kind of transliterator is a script, or alphabet, transliterator.
  37. * For example, a Russian to Latin transliterator changes Russian text
  38. * written in Cyrillic characters to phonetically equivalent Latin
  39. * characters. It does not <em>translate</em> Russian to English!
  40. * Transliteration, unlike translation, operates on characters, without
  41. * reference to the meanings of words and sentences.
  42. *
  43. * <p>Although script conversion is its most common use, a
  44. * transliterator can actually perform a more general class of tasks.
  45. * In fact, <code>Transliterator</code> defines a very general API
  46. * which specifies only that a segment of the input text is replaced
  47. * by new text. The particulars of this conversion are determined
  48. * entirely by subclasses of <code>Transliterator</code>.
  49. *
  50. * <p><b>Transliterators are stateless</b>
  51. *
  52. * <p><code>Transliterator</code> objects are <em>stateless</em>; they
  53. * retain no information between calls to
  54. * <code>transliterate()</code>. (However, this does <em>not</em>
  55. * mean that threads may share transliterators without synchronizing
  56. * them. Transliterators are not immutable, so they must be
  57. * synchronized when shared between threads.) This might seem to
  58. * limit the complexity of the transliteration operation. In
  59. * practice, subclasses perform complex transliterations by delaying
  60. * the replacement of text until it is known that no other
  61. * replacements are possible. In other words, although the
  62. * <code>Transliterator</code> objects are stateless, the source text
  63. * itself embodies all the needed information, and delayed operation
  64. * allows arbitrary complexity.
  65. *
  66. * <p><b>Batch transliteration</b>
  67. *
  68. * <p>The simplest way to perform transliteration is all at once, on a
  69. * string of existing text. This is referred to as <em>batch</em>
  70. * transliteration. For example, given a string <code>input</code>
  71. * and a transliterator <code>t</code>, the call
  72. *
  73. * String result = t.transliterate(input);
  74. *
  75. * will transliterate it and return the result. Other methods allow
  76. * the client to specify a substring to be transliterated and to use
  77. * {@link Replaceable } objects instead of strings, in order to
  78. * preserve out-of-band information (such as text styles).
  79. *
  80. * <p><b>Keyboard transliteration</b>
  81. *
  82. * <p>Somewhat more involved is <em>keyboard</em>, or incremental
  83. * transliteration. This is the transliteration of text that is
  84. * arriving from some source (typically the user's keyboard) one
  85. * character at a time, or in some other piecemeal fashion.
  86. *
  87. * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
  88. * stores the text. As text is inserted, as much as possible is
  89. * transliterated on the fly. This means a GUI that displays the
  90. * contents of the buffer may show text being modified as each new
  91. * character arrives.
  92. *
  93. * <p>Consider the simple rule-based Transliterator:
  94. * <pre>
  95. * th>{theta}
  96. * t>{tau}
  97. * </pre>
  98. *
  99. * When the user types 't', nothing will happen, since the
  100. * transliterator is waiting to see if the next character is 'h'. To
  101. * remedy this, we introduce the notion of a cursor, marked by a '|'
  102. * in the output string:
  103. * <pre>
  104. * t>|{tau}
  105. * {tau}h>{theta}
  106. * </pre>
  107. *
  108. * Now when the user types 't', tau appears, and if the next character
  109. * is 'h', the tau changes to a theta. This is accomplished by
  110. * maintaining a cursor position (independent of the insertion point,
  111. * and invisible in the GUI) across calls to
  112. * <code>transliterate()</code>. Typically, the cursor will
  113. * be coincident with the insertion point, but in a case like the one
  114. * above, it will precede the insertion point.
  115. *
  116. * <p>Keyboard transliteration methods maintain a set of three indices
  117. * that are updated with each call to
  118. * <code>transliterate()</code>, including the cursor, start,
  119. * and limit. Since these indices are changed by the method, they are
  120. * passed in an <code>int[]</code> array. The <code>START</code> index
  121. * marks the beginning of the substring that the transliterator will
  122. * look at. It is advanced as text becomes committed (but it is not
  123. * the committed index; that's the <code>CURSOR</code>). The
  124. * <code>CURSOR</code> index, described above, marks the point at
  125. * which the transliterator last stopped, either because it reached
  126. * the end, or because it required more characters to disambiguate
  127. * between possible inputs. The <code>CURSOR</code> can also be
  128. * explicitly set by rules in a rule-based Transliterator.
  129. * Any characters before the <code>CURSOR</code> index are frozen;
  130. * future keyboard transliteration calls within this input sequence
  131. * will not change them. New text is inserted at the
  132. * <code>LIMIT</code> index, which marks the end of the substring that
  133. * the transliterator looks at.
  134. *
  135. * <p>Because keyboard transliteration assumes that more characters
  136. * are to arrive, it is conservative in its operation. It only
  137. * transliterates when it can do so unambiguously. Otherwise it waits
  138. * for more characters to arrive. When the client code knows that no
  139. * more characters are forthcoming, perhaps because the user has
  140. * performed some input termination operation, then it should call
  141. * <code>finishTransliteration()</code> to complete any
  142. * pending transliterations.
  143. *
  144. * <p><b>Inverses</b>
  145. *
  146. * <p>Pairs of transliterators may be inverses of one another. For
  147. * example, if transliterator <b>A</b> transliterates characters by
  148. * incrementing their Unicode value (so "abc" -> "def"), and
  149. * transliterator <b>B</b> decrements character values, then <b>A</b>
  150. * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
  151. * with <b>B</b> in a compound transliterator, the result is the
  152. * identity transliterator, that is, a transliterator that does not
  153. * change its input text.
  154. *
  155. * The <code>Transliterator</code> method <code>getInverse()</code>
  156. * returns a transliterator's inverse, if one exists, or
  157. * <code>null</code> otherwise. However, the result of
  158. * <code>getInverse()</code> usually will <em>not</em> be a true
  159. * mathematical inverse. This is because true inverse transliterators
  160. * are difficult to formulate. For example, consider two
  161. * transliterators: <b>AB</b>, which transliterates the character 'A'
  162. * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
  163. * seem that these are exact inverses, since
  164. *
  165. * \htmlonly<blockquote>\endhtmlonly"A" x <b>AB</b> -> "B"<br>
  166. * "B" x <b>BA</b> -> "A"\htmlonly</blockquote>\endhtmlonly
  167. *
  168. * where 'x' represents transliteration. However,
  169. *
  170. * \htmlonly<blockquote>\endhtmlonly"ABCD" x <b>AB</b> -> "BBCD"<br>
  171. * "BBCD" x <b>BA</b> -> "AACD"\htmlonly</blockquote>\endhtmlonly
  172. *
  173. * so <b>AB</b> composed with <b>BA</b> is not the
  174. * identity. Nonetheless, <b>BA</b> may be usefully considered to be
  175. * <b>AB</b>'s inverse, and it is on this basis that
  176. * <b>AB</b><code>.getInverse()</code> could legitimately return
  177. * <b>BA</b>.
  178. *
  179. * <p><b>IDs and display names</b>
  180. *
  181. * <p>A transliterator is designated by a short identifier string or
  182. * <em>ID</em>. IDs follow the format <em>source-destination</em>,
  183. * where <em>source</em> describes the entity being replaced, and
  184. * <em>destination</em> describes the entity replacing
  185. * <em>source</em>. The entities may be the names of scripts,
  186. * particular sequences of characters, or whatever else it is that the
  187. * transliterator converts to or from. For example, a transliterator
  188. * from Russian to Latin might be named "Russian-Latin". A
  189. * transliterator from keyboard escape sequences to Latin-1 characters
  190. * might be named "KeyboardEscape-Latin1". By convention, system
  191. * entity names are in English, with the initial letters of words
  192. * capitalized; user entity names may follow any format so long as
  193. * they do not contain dashes.
  194. *
  195. * <p>In addition to programmatic IDs, transliterator objects have
  196. * display names for presentation in user interfaces, returned by
  197. * {@link #getDisplayName }.
  198. *
  199. * <p><b>Factory methods and registration</b>
  200. *
  201. * <p>In general, client code should use the factory method
  202. * {@link #createInstance } to obtain an instance of a
  203. * transliterator given its ID. Valid IDs may be enumerated using
  204. * <code>getAvailableIDs()</code>. Since transliterators are mutable,
  205. * multiple calls to {@link #createInstance } with the same ID will
  206. * return distinct objects.
  207. *
  208. * <p>In addition to the system transliterators registered at startup,
  209. * user transliterators may be registered by calling
  210. * <code>registerInstance()</code> at run time. A registered instance
  211. * acts as a template; future calls to {@link #createInstance } with the ID
  212. * of the registered object return clones of that object. Thus any
  213. * object passed to <tt>registerInstance()</tt> must implement
  214. * <tt>clone()</tt> properly. To register a transliterator subclass
  215. * without instantiating it (until it is needed), users may call
  216. * {@link #registerFactory }. In this case, the objects are
  217. * instantiated by invoking the zero-argument public constructor of
  218. * the class.
  219. *
  220. * <p><b>Subclassing</b>
  221. *
  222. * Subclasses must implement the abstract method
  223. * <code>handleTransliterate()</code>. <p>Subclasses should override
  224. * the <code>transliterate()</code> method taking a
  225. * <code>Replaceable</code> and the <code>transliterate()</code>
  226. * method taking a <code>String</code> and <code>StringBuffer</code>
  227. * if the performance of these methods can be improved over the
  228. * performance obtained by the default implementations in this class.
  229. *
  230. * <p><b>Rule syntax</b>
  231. *
  232. * <p>A set of rules determines how to perform translations.
  233. * Rules within a rule set are separated by semicolons (';').
  234. * To include a literal semicolon, prefix it with a backslash ('\').
  235. * Unicode Pattern_White_Space is ignored.
  236. * If the first non-blank character on a line is '#',
  237. * the entire line is ignored as a comment.
  238. *
  239. * <p>Each set of rules consists of two groups, one forward, and one
  240. * reverse. This is a convention that is not enforced; rules for one
  241. * direction may be omitted, with the result that translations in
  242. * that direction will not modify the source text. In addition,
  243. * bidirectional forward-reverse rules may be specified for
  244. * symmetrical transformations.
  245. *
  246. * <p>Note: Another description of the Transliterator rule syntax is available in
  247. * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section
  248. * Transform Rules Syntax of UTS #35: Unicode LDML</a>.
  249. * The rules are shown there using arrow symbols ← and → and ↔.
  250. * ICU supports both those and the equivalent ASCII symbols &lt; and &gt; and &lt;&gt;.
  251. *
  252. * <p>Rule statements take one of the following forms:
  253. *
  254. * <dl>
  255. * <dt><code>$alefmadda=\\u0622;</code></dt>
  256. * <dd><strong>Variable definition.</strong> The name on the
  257. * left is assigned the text on the right. In this example,
  258. * after this statement, instances of the left hand name,
  259. * &quot;<code>$alefmadda</code>&quot;, will be replaced by
  260. * the Unicode character U+0622. Variable names must begin
  261. * with a letter and consist only of letters, digits, and
  262. * underscores. Case is significant. Duplicate names cause
  263. * an exception to be thrown, that is, variables cannot be
  264. * redefined. The right hand side may contain well-formed
  265. * text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
  266. * The right hand side may contain embedded <code>UnicodeSet</code>
  267. * patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
  268. * <dt><code>ai&gt;$alefmadda;</code></dt>
  269. * <dd><strong>Forward translation rule.</strong> This rule
  270. * states that the string on the left will be changed to the
  271. * string on the right when performing forward
  272. * transliteration.</dd>
  273. * <dt><code>ai&lt;$alefmadda;</code></dt>
  274. * <dd><strong>Reverse translation rule.</strong> This rule
  275. * states that the string on the right will be changed to
  276. * the string on the left when performing reverse
  277. * transliteration.</dd>
  278. * </dl>
  279. *
  280. * <dl>
  281. * <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
  282. * <dd><strong>Bidirectional translation rule.</strong> This
  283. * rule states that the string on the left will be changed
  284. * to the string on the right when performing forward
  285. * transliteration, and vice versa when performing reverse
  286. * transliteration.</dd>
  287. * </dl>
  288. *
  289. * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
  290. * string</em>. The match pattern consists of literal characters,
  291. * optionally preceded by context, and optionally followed by
  292. * context. Context characters, like literal pattern characters,
  293. * must be matched in the text being transliterated. However, unlike
  294. * literal pattern characters, they are not replaced by the output
  295. * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
  296. * indicates the characters &quot;<code>def</code>&quot; must be
  297. * preceded by &quot;<code>abc</code>&quot; for a successful match.
  298. * If there is a successful match, &quot;<code>def</code>&quot; will
  299. * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
  300. * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
  301. * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
  302. * (or &quot;<code>123}456</code>&quot;) in which the literal
  303. * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
  304. *
  305. * <p>The output string of a forward or reverse rule consists of
  306. * characters to replace the literal pattern characters. If the
  307. * output string contains the character '<code>|</code>', this is
  308. * taken to indicate the location of the <em>cursor</em> after
  309. * replacement. The cursor is the point in the text at which the
  310. * next replacement, if any, will be applied. The cursor is usually
  311. * placed within the replacement text; however, it can actually be
  312. * placed into the preceding or following context by using the
  313. * special character '@'. Examples:
  314. *
  315. * <pre>
  316. * a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor before a
  317. * {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between y and z
  318. * </pre>
  319. *
  320. * <p><b>UnicodeSet</b>
  321. *
  322. * <p><code>UnicodeSet</code> patterns may appear anywhere that
  323. * makes sense. They may appear in variable definitions.
  324. * Contrariwise, <code>UnicodeSet</code> patterns may themselves
  325. * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
  326. * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
  327. *
  328. * <p><code>UnicodeSet</code> patterns may also be embedded directly
  329. * into rule strings. Thus, the following two rules are equivalent:
  330. *
  331. * <pre>
  332. * $vowel=[aeiou]; $vowel&gt;'*'; # One way to do this
  333. * [aeiou]&gt;'*'; # Another way
  334. * </pre>
  335. *
  336. * <p>See {@link UnicodeSet} for more documentation and examples.
  337. *
  338. * <p><b>Segments</b>
  339. *
  340. * <p>Segments of the input string can be matched and copied to the
  341. * output string. This makes certain sets of rules simpler and more
  342. * general, and makes reordering possible. For example:
  343. *
  344. * <pre>
  345. * ([a-z]) &gt; $1 $1; # double lowercase letters
  346. * ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs
  347. * </pre>
  348. *
  349. * <p>The segment of the input string to be copied is delimited by
  350. * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
  351. * nine segments may be defined. Segments may not overlap. In the
  352. * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
  353. * represent the input string segments, in left-to-right order of
  354. * definition.
  355. *
  356. * <p><b>Anchors</b>
  357. *
  358. * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
  359. * special characters '<code>^</code>' and '<code>$</code>'. For example:
  360. *
  361. * <pre>
  362. * ^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text
  363. * &nbsp; a&nbsp;&nbsp; &gt; 'A'; # match other instances of 'a'
  364. * &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text
  365. * &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances of 'z'
  366. * </pre>
  367. *
  368. * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
  369. * This is done by including a virtual anchor character '<code>$</code>' at the end of the
  370. * set pattern. Although this is usually the match character for the end anchor, the set will
  371. * match either the beginning or the end of the text, depending on its placement. For
  372. * example:
  373. *
  374. * <pre>
  375. * $x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor
  376. * $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start
  377. * &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end
  378. * </pre>
  379. *
  380. * <p><b>Example</b>
  381. *
  382. * <p>The following example rules illustrate many of the features of
  383. * the rule language.
  384. *
  385. * <table border="0" cellpadding="4">
  386. * <tr>
  387. * <td style="vertical-align: top;">Rule 1.</td>
  388. * <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
  389. * </tr>
  390. * <tr>
  391. * <td style="vertical-align: top;">Rule 2.</td>
  392. * <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
  393. * </tr>
  394. * <tr>
  395. * <td style="vertical-align: top;">Rule 3.</td>
  396. * <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
  397. * </tr>
  398. * </table>
  399. *
  400. * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
  401. * yields the following results:
  402. *
  403. * <table border="0" cellpadding="4">
  404. * <tr>
  405. * <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
  406. * <td style="vertical-align: top;">Initial state, no rules match. Advance
  407. * cursor.</td>
  408. * </tr>
  409. * <tr>
  410. * <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
  411. * <td style="vertical-align: top;">Still no match. Rule 1 does not match
  412. * because the preceding context is not present.</td>
  413. * </tr>
  414. * <tr>
  415. * <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
  416. * <td style="vertical-align: top;">Still no match. Keep advancing until
  417. * there is a match...</td>
  418. * </tr>
  419. * <tr>
  420. * <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
  421. * <td style="vertical-align: top;">...</td>
  422. * </tr>
  423. * <tr>
  424. * <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
  425. * <td style="vertical-align: top;">...</td>
  426. * </tr>
  427. * <tr>
  428. * <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
  429. * <td style="vertical-align: top;">...</td>
  430. * </tr>
  431. * <tr>
  432. * <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
  433. * <td style="vertical-align: top;">...</td>
  434. * </tr>
  435. * <tr>
  436. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
  437. * <td style="vertical-align: top;">Rule 1 matches; replace &quot;<code>def</code>&quot;
  438. * with &quot;<code>xy</code>&quot; and back up the cursor
  439. * to before the '<code>y</code>'.</td>
  440. * </tr>
  441. * <tr>
  442. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
  443. * <td style="vertical-align: top;">Although &quot;<code>xyz</code>&quot; is
  444. * present, rule 2 does not match because the cursor is
  445. * before the '<code>y</code>', not before the '<code>x</code>'.
  446. * Rule 3 does match. Replace &quot;<code>yz</code>&quot;
  447. * with &quot;<code>q</code>&quot;.</td>
  448. * </tr>
  449. * <tr>
  450. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
  451. * <td style="vertical-align: top;">The cursor is at the end;
  452. * transliteration is complete.</td>
  453. * </tr>
  454. * </table>
  455. *
  456. * <p>The order of rules is significant. If multiple rules may match
  457. * at some point, the first matching rule is applied.
  458. *
  459. * <p>Forward and reverse rules may have an empty output string.
  460. * Otherwise, an empty left or right hand side of any statement is a
  461. * syntax error.
  462. *
  463. * <p>Single quotes are used to quote any character other than a
  464. * digit or letter. To specify a single quote itself, inside or
  465. * outside of quotes, use two single quotes in a row. For example,
  466. * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
  467. * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
  468. *
  469. * <p><b>Notes</b>
  470. *
  471. * <p>While a Transliterator is being built from rules, it checks that
  472. * the rules are added in proper order. For example, if the rule
  473. * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
  474. * then the second rule will throw an exception. The reason is that
  475. * the second rule can never be triggered, since the first rule
  476. * always matches anything it matches. In other words, the first
  477. * rule <em>masks</em> the second rule.
  478. *
  479. * @author Alan Liu
  480. * @stable ICU 2.0
  481. */
  482. class U_I18N_API Transliterator : public UObject {
  483. private:
  484. /**
  485. * Programmatic name, e.g., "Latin-Arabic".
  486. */
  487. UnicodeString ID;
  488. /**
  489. * This transliterator's filter. Any character for which
  490. * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
  491. * altered by this transliterator. If <tt>filter</tt> is
  492. * <tt>null</tt> then no filtering is applied.
  493. */
  494. UnicodeFilter* filter;
  495. int32_t maximumContextLength;
  496. public:
  497. /**
  498. * A context integer or pointer for a factory function, passed by
  499. * value. As a union, the two members share the same storage.
  500. * @stable ICU 2.4
  501. */
  502. union Token {
  503. /**
  504. * This token, interpreted as a 32-bit integer.
  505. * @stable ICU 2.4
  506. */
  507. int32_t integer;
  508. /**
  509. * This token, interpreted as a native pointer.
  510. * @stable ICU 2.4
  511. */
  512. void* pointer;
  513. };
  514. #ifndef U_HIDE_INTERNAL_API
  515. /**
  516. * Return a token containing an integer.
  517. * @return a token containing an integer.
  518. * @internal
  519. */
  520. inline static Token integerToken(int32_t);
  521. /**
  522. * Return a token containing a pointer.
  523. * @return a token containing a pointer.
  524. * @internal
  525. */
  526. inline static Token pointerToken(void*);
  527. #endif /* U_HIDE_INTERNAL_API */
  528. /**
  529. * A function that creates and returns a Transliterator. When
  530. * invoked, it will be passed the ID string that is being
  531. * instantiated, together with the context pointer that was passed
  532. * in when the factory function was first registered. Many
  533. * factory functions will ignore both parameters, however,
  534. * functions that are registered to more than one ID may use the
  535. * ID or the context parameter to parameterize the transliterator
  536. * they create.
  537. * @param ID the string identifier for this transliterator
  538. * @param context a context pointer that will be stored and
  539. * later passed to the factory function when an ID matching
  540. * the registration ID is being instantiated with this factory.
  541. * @stable ICU 2.4
  542. */
  543. typedef Transliterator* (U_EXPORT2 *Factory)(const UnicodeString& ID, Token context);
  544. protected:
  545. /**
  546. * Constructs a transliterator with the given ID and filter.
  547. * @param ID the string identifier for this transliterator
  548. * @param adoptedFilter the filter. Any character for which
  549. * <tt>adoptedFilter.contains()</tt> returns <tt>false</tt> will not be
  550. * altered by this transliterator. If <tt>adoptedFilter</tt> is
  551. * <tt>null</tt> then no filtering is applied.
  552. * @stable ICU 2.4
  553. */
  554. Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
  555. /**
  556. * Copy constructor.
  557. * @stable ICU 2.4
  558. */
  559. Transliterator(const Transliterator&);
  560. /**
  561. * Assignment operator.
  562. * @stable ICU 2.4
  563. */
  564. Transliterator& operator=(const Transliterator&);
  565. /**
  566. * Create a transliterator from a basic ID. This is an ID
  567. * containing only the forward direction source, target, and
  568. * variant.
  569. * @param id a basic ID of the form S-T or S-T/V.
  570. * @param canon canonical ID to assign to the object, or
  571. * NULL to leave the ID unchanged
  572. * @return a newly created Transliterator or null if the ID is
  573. * invalid.
  574. * @stable ICU 2.4
  575. */
  576. static Transliterator* createBasicInstance(const UnicodeString& id,
  577. const UnicodeString* canon);
  578. friend class TransliteratorParser; // for parseID()
  579. friend class TransliteratorIDParser; // for createBasicInstance()
  580. friend class TransliteratorAlias; // for setID()
  581. public:
  582. /**
  583. * Destructor.
  584. * @stable ICU 2.0
  585. */
  586. virtual ~Transliterator();
  587. /**
  588. * Implements Cloneable.
  589. * All subclasses are encouraged to implement this method if it is
  590. * possible and reasonable to do so. Subclasses that are to be
  591. * registered with the system using <tt>registerInstance()</tt>
  592. * are required to implement this method. If a subclass does not
  593. * implement clone() properly and is registered with the system
  594. * using registerInstance(), then the default clone() implementation
  595. * will return null, and calls to createInstance() will fail.
  596. *
  597. * @return a copy of the object.
  598. * @see #registerInstance
  599. * @stable ICU 2.0
  600. */
  601. virtual Transliterator* clone() const;
  602. /**
  603. * Transliterates a segment of a string, with optional filtering.
  604. *
  605. * @param text the string to be transliterated
  606. * @param start the beginning index, inclusive; <code>0 <= start
  607. * <= limit</code>.
  608. * @param limit the ending index, exclusive; <code>start <= limit
  609. * <= text.length()</code>.
  610. * @return The new limit index. The text previously occupying <code>[start,
  611. * limit)</code> has been transliterated, possibly to a string of a different
  612. * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
  613. * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
  614. * the returned value is -1 and the input string remains unchanged.
  615. * @stable ICU 2.0
  616. */
  617. virtual int32_t transliterate(Replaceable& text,
  618. int32_t start, int32_t limit) const;
  619. /**
  620. * Transliterates an entire string in place. Convenience method.
  621. * @param text the string to be transliterated
  622. * @stable ICU 2.0
  623. */
  624. virtual void transliterate(Replaceable& text) const;
  625. /**
  626. * Transliterates the portion of the text buffer that can be
  627. * transliterated unambiguously after new text has been inserted,
  628. * typically as a result of a keyboard event. The new text in
  629. * <code>insertion</code> will be inserted into <code>text</code>
  630. * at <code>index.limit</code>, advancing
  631. * <code>index.limit</code> by <code>insertion.length()</code>.
  632. * Then the transliterator will try to transliterate characters of
  633. * <code>text</code> between <code>index.cursor</code> and
  634. * <code>index.limit</code>. Characters before
  635. * <code>index.cursor</code> will not be changed.
  636. *
  637. * <p>Upon return, values in <code>index</code> will be updated.
  638. * <code>index.start</code> will be advanced to the first
  639. * character that future calls to this method will read.
  640. * <code>index.cursor</code> and <code>index.limit</code> will
  641. * be adjusted to delimit the range of text that future calls to
  642. * this method may change.
  643. *
  644. * <p>Typical usage of this method begins with an initial call
  645. * with <code>index.start</code> and <code>index.limit</code>
  646. * set to indicate the portion of <code>text</code> to be
  647. * transliterated, and <code>index.cursor == index.start</code>.
  648. * Thereafter, <code>index</code> can be used without
  649. * modification in future calls, provided that all changes to
  650. * <code>text</code> are made via this method.
  651. *
  652. * <p>This method assumes that future calls may be made that will
  653. * insert new text into the buffer. As a result, it only performs
  654. * unambiguous transliterations. After the last call to this
  655. * method, there may be untransliterated text that is waiting for
  656. * more input to resolve an ambiguity. In order to perform these
  657. * pending transliterations, clients should call {@link
  658. * #finishTransliteration } after the last call to this
  659. * method has been made.
  660. *
  661. * @param text the buffer holding transliterated and untransliterated text
  662. * @param index an array of three integers.
  663. *
  664. * <ul><li><code>index.start</code>: the beginning index,
  665. * inclusive; <code>0 <= index.start <= index.limit</code>.
  666. *
  667. * <li><code>index.limit</code>: the ending index, exclusive;
  668. * <code>index.start <= index.limit <= text.length()</code>.
  669. * <code>insertion</code> is inserted at
  670. * <code>index.limit</code>.
  671. *
  672. * <li><code>index.cursor</code>: the next character to be
  673. * considered for transliteration; <code>index.start <=
  674. * index.cursor <= index.limit</code>. Characters before
  675. * <code>index.cursor</code> will not be changed by future calls
  676. * to this method.</ul>
  677. *
  678. * @param insertion text to be inserted and possibly
  679. * transliterated into the translation buffer at
  680. * <code>index.limit</code>. If <code>null</code> then no text
  681. * is inserted.
  682. * @param status Output param to be filled in with a success or an error.
  683. * @see #handleTransliterate
  684. * @exception IllegalArgumentException if <code>index</code>
  685. * is invalid
  686. * @see UTransPosition
  687. * @stable ICU 2.0
  688. */
  689. virtual void transliterate(Replaceable& text, UTransPosition& index,
  690. const UnicodeString& insertion,
  691. UErrorCode& status) const;
  692. /**
  693. * Transliterates the portion of the text buffer that can be
  694. * transliterated unambiguously after a new character has been
  695. * inserted, typically as a result of a keyboard event. This is a
  696. * convenience method.
  697. * @param text the buffer holding transliterated and
  698. * untransliterated text
  699. * @param index the position indices
  700. * @param insertion character to be inserted and possibly
  701. * transliterated into the translation buffer at
  702. * <code>index.limit</code>.
  703. * @param status Output param to be filled in with a success or an error.
  704. * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const
  705. * @stable ICU 2.0
  706. */
  707. virtual void transliterate(Replaceable& text, UTransPosition& index,
  708. UChar32 insertion,
  709. UErrorCode& status) const;
  710. /**
  711. * Transliterates the portion of the text buffer that can be
  712. * transliterated unambiguously. This is a convenience method; see
  713. * {@link
  714. * #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }
  715. * for details.
  716. * @param text the buffer holding transliterated and
  717. * untransliterated text
  718. * @param index the position indices
  719. * @param status Output param to be filled in with a success or an error.
  720. * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const
  721. * @stable ICU 2.0
  722. */
  723. virtual void transliterate(Replaceable& text, UTransPosition& index,
  724. UErrorCode& status) const;
  725. /**
  726. * Finishes any pending transliterations that were waiting for
  727. * more characters. Clients should call this method as the last
  728. * call after a sequence of one or more calls to
  729. * <code>transliterate()</code>.
  730. * @param text the buffer holding transliterated and
  731. * untransliterated text.
  732. * @param index the array of indices previously passed to {@link
  733. * #transliterate }
  734. * @stable ICU 2.0
  735. */
  736. virtual void finishTransliteration(Replaceable& text,
  737. UTransPosition& index) const;
  738. private:
  739. /**
  740. * This internal method does incremental transliteration. If the
  741. * 'insertion' is non-null then we append it to 'text' before
  742. * proceeding. This method calls through to the pure virtual
  743. * framework method handleTransliterate() to do the actual
  744. * work.
  745. * @param text the buffer holding transliterated and
  746. * untransliterated text
  747. * @param index the position indices. See {@link
  748. * #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }.
  749. * @param insertion text to be inserted and possibly
  750. * transliterated into the translation buffer at
  751. * <code>index.limit</code>. May be null, in which case nothing is appended.
  752. * @param status Output param to be filled in with a success or an error.
  753. */
  754. void _transliterate(Replaceable& text,
  755. UTransPosition& index,
  756. const UnicodeString* insertion,
  757. UErrorCode &status) const;
  758. protected:
  759. /**
  760. * Abstract method that concrete subclasses define to implement
  761. * their transliteration algorithm. This method handles both
  762. * incremental and non-incremental transliteration. Let
  763. * <code>originalStart</code> refer to the value of
  764. * <code>pos.start</code> upon entry.
  765. *
  766. * <ul>
  767. * <li>If <code>incremental</code> is false, then this method
  768. * should transliterate all characters between
  769. * <code>pos.start</code> and <code>pos.limit</code>. Upon return
  770. * <code>pos.start</code> must == <code> pos.limit</code>.</li>
  771. *
  772. * <li>If <code>incremental</code> is true, then this method
  773. * should transliterate all characters between
  774. * <code>pos.start</code> and <code>pos.limit</code> that can be
  775. * unambiguously transliterated, regardless of future insertions
  776. * of text at <code>pos.limit</code>. Upon return,
  777. * <code>pos.start</code> should be in the range
  778. * [<code>originalStart</code>, <code>pos.limit</code>).
  779. * <code>pos.start</code> should be positioned such that
  780. * characters [<code>originalStart</code>, <code>
  781. * pos.start</code>) will not be changed in the future by this
  782. * transliterator and characters [<code>pos.start</code>,
  783. * <code>pos.limit</code>) are unchanged.</li>
  784. * </ul>
  785. *
  786. * <p>Implementations of this method should also obey the
  787. * following invariants:</p>
  788. *
  789. * <ul>
  790. * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
  791. * should be updated to reflect changes in length of the text
  792. * between <code>pos.start</code> and <code>pos.limit</code>. The
  793. * difference <code> pos.contextLimit - pos.limit</code> should
  794. * not change.</li>
  795. *
  796. * <li><code>pos.contextStart</code> should not change.</li>
  797. *
  798. * <li>Upon return, neither <code>pos.start</code> nor
  799. * <code>pos.limit</code> should be less than
  800. * <code>originalStart</code>.</li>
  801. *
  802. * <li>Text before <code>originalStart</code> and text after
  803. * <code>pos.limit</code> should not change.</li>
  804. *
  805. * <li>Text before <code>pos.contextStart</code> and text after
  806. * <code> pos.contextLimit</code> should be ignored.</li>
  807. * </ul>
  808. *
  809. * <p>Subclasses may safely assume that all characters in
  810. * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
  811. * In other words, the filter has already been applied by the time
  812. * this method is called. See
  813. * <code>filteredTransliterate()</code>.
  814. *
  815. * <p>This method is <b>not</b> for public consumption. Calling
  816. * this method directly will transliterate
  817. * [<code>pos.start</code>, <code>pos.limit</code>) without
  818. * applying the filter. End user code should call <code>
  819. * transliterate()</code> instead of this method. Subclass code
  820. * and wrapping transliterators should call
  821. * <code>filteredTransliterate()</code> instead of this method.</p>
  822. *
  823. * @param text the buffer holding transliterated and
  824. * untransliterated text
  825. *
  826. * @param pos the indices indicating the start, limit, context
  827. * start, and context limit of the text.
  828. *
  829. * @param incremental if true, assume more text may be inserted at
  830. * <code>pos.limit</code> and act accordingly. Otherwise,
  831. * transliterate all text between <code>pos.start</code> and
  832. * <code>pos.limit</code> and move <code>pos.start</code> up to
  833. * <code>pos.limit</code>.
  834. *
  835. * @see #transliterate
  836. * @stable ICU 2.4
  837. */
  838. virtual void handleTransliterate(Replaceable& text,
  839. UTransPosition& pos,
  840. UBool incremental) const = 0;
  841. public:
  842. /**
  843. * Transliterate a substring of text, as specified by index, taking filters
  844. * into account. This method is for subclasses that need to delegate to
  845. * another transliterator.
  846. * @param text the text to be transliterated
  847. * @param index the position indices
  848. * @param incremental if TRUE, then assume more characters may be inserted
  849. * at index.limit, and postpone processing to accommodate future incoming
  850. * characters
  851. * @stable ICU 2.4
  852. */
  853. virtual void filteredTransliterate(Replaceable& text,
  854. UTransPosition& index,
  855. UBool incremental) const;
  856. private:
  857. /**
  858. * Top-level transliteration method, handling filtering, incremental and
  859. * non-incremental transliteration, and rollback. All transliteration
  860. * public API methods eventually call this method with a rollback argument
  861. * of TRUE. Other entities may call this method but rollback should be
  862. * FALSE.
  863. *
  864. * <p>If this transliterator has a filter, break up the input text into runs
  865. * of unfiltered characters. Pass each run to
  866. * subclass.handleTransliterate().
  867. *
  868. * <p>In incremental mode, if rollback is TRUE, perform a special
  869. * incremental procedure in which several passes are made over the input
  870. * text, adding one character at a time, and committing successful
  871. * transliterations as they occur. Unsuccessful transliterations are rolled
  872. * back and retried with additional characters to give correct results.
  873. *
  874. * @param text the text to be transliterated
  875. * @param index the position indices
  876. * @param incremental if TRUE, then assume more characters may be inserted
  877. * at index.limit, and postpone processing to accommodate future incoming
  878. * characters
  879. * @param rollback if TRUE and if incremental is TRUE, then perform special
  880. * incremental processing, as described above, and undo partial
  881. * transliterations where necessary. If incremental is FALSE then this
  882. * parameter is ignored.
  883. */
  884. virtual void filteredTransliterate(Replaceable& text,
  885. UTransPosition& index,
  886. UBool incremental,
  887. UBool rollback) const;
  888. public:
  889. /**
  890. * Returns the length of the longest context required by this transliterator.
  891. * This is <em>preceding</em> context. The default implementation supplied
  892. * by <code>Transliterator</code> returns zero; subclasses
  893. * that use preceding context should override this method to return the
  894. * correct value. For example, if a transliterator translates "ddd" (where
  895. * d is any digit) to "555" when preceded by "(ddd)", then the preceding
  896. * context length is 5, the length of "(ddd)".
  897. *
  898. * @return The maximum number of preceding context characters this
  899. * transliterator needs to examine
  900. * @stable ICU 2.0
  901. */
  902. int32_t getMaximumContextLength(void) const;
  903. protected:
  904. /**
  905. * Method for subclasses to use to set the maximum context length.
  906. * @param maxContextLength the new value to be set.
  907. * @see #getMaximumContextLength
  908. * @stable ICU 2.4
  909. */
  910. void setMaximumContextLength(int32_t maxContextLength);
  911. public:
  912. /**
  913. * Returns a programmatic identifier for this transliterator.
  914. * If this identifier is passed to <code>createInstance()</code>, it
  915. * will return this object, if it has been registered.
  916. * @return a programmatic identifier for this transliterator.
  917. * @see #registerInstance
  918. * @see #registerFactory
  919. * @see #getAvailableIDs
  920. * @stable ICU 2.0
  921. */
  922. virtual const UnicodeString& getID(void) const;
  923. /**
  924. * Returns a name for this transliterator that is appropriate for
  925. * display to the user in the default locale. See {@link
  926. * #getDisplayName } for details.
  927. * @param ID the string identifier for this transliterator
  928. * @param result Output param to receive the display name
  929. * @return A reference to 'result'.
  930. * @stable ICU 2.0
  931. */
  932. static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
  933. UnicodeString& result);
  934. /**
  935. * Returns a name for this transliterator that is appropriate for
  936. * display to the user in the given locale. This name is taken
  937. * from the locale resource data in the standard manner of the
  938. * <code>java.text</code> package.
  939. *
  940. * <p>If no localized names exist in the system resource bundles,
  941. * a name is synthesized using a localized
  942. * <code>MessageFormat</code> pattern from the resource data. The
  943. * arguments to this pattern are an integer followed by one or two
  944. * strings. The integer is the number of strings, either 1 or 2.
  945. * The strings are formed by splitting the ID for this
  946. * transliterator at the first '-'. If there is no '-', then the
  947. * entire ID forms the only string.
  948. * @param ID the string identifier for this transliterator
  949. * @param inLocale the Locale in which the display name should be
  950. * localized.
  951. * @param result Output param to receive the display name
  952. * @return A reference to 'result'.
  953. * @stable ICU 2.0
  954. */
  955. static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
  956. const Locale& inLocale,
  957. UnicodeString& result);
  958. /**
  959. * Returns the filter used by this transliterator, or <tt>NULL</tt>
  960. * if this transliterator uses no filter.
  961. * @return the filter used by this transliterator, or <tt>NULL</tt>
  962. * if this transliterator uses no filter.
  963. * @stable ICU 2.0
  964. */
  965. const UnicodeFilter* getFilter(void) const;
  966. /**
  967. * Returns the filter used by this transliterator, or <tt>NULL</tt> if this
  968. * transliterator uses no filter. The caller must eventually delete the
  969. * result. After this call, this transliterator's filter is set to
  970. * <tt>NULL</tt>.
  971. * @return the filter used by this transliterator, or <tt>NULL</tt> if this
  972. * transliterator uses no filter.
  973. * @stable ICU 2.4
  974. */
  975. UnicodeFilter* orphanFilter(void);
  976. /**
  977. * Changes the filter used by this transliterator. If the filter
  978. * is set to <tt>null</tt> then no filtering will occur.
  979. *
  980. * <p>Callers must take care if a transliterator is in use by
  981. * multiple threads. The filter should not be changed by one
  982. * thread while another thread may be transliterating.
  983. * @param adoptedFilter the new filter to be adopted.
  984. * @stable ICU 2.0
  985. */
  986. void adoptFilter(UnicodeFilter* adoptedFilter);
  987. /**
  988. * Returns this transliterator's inverse. See the class
  989. * documentation for details. This implementation simply inverts
  990. * the two entities in the ID and attempts to retrieve the
  991. * resulting transliterator. That is, if <code>getID()</code>
  992. * returns "A-B", then this method will return the result of
  993. * <code>createInstance("B-A")</code>, or <code>null</code> if that
  994. * call fails.
  995. *
  996. * <p>Subclasses with knowledge of their inverse may wish to
  997. * override this method.
  998. *
  999. * @param status Output param to be filled in with a success or an error.
  1000. * @return a transliterator that is an inverse, not necessarily
  1001. * exact, of this transliterator, or <code>null</code> if no such
  1002. * transliterator is registered.
  1003. * @see #registerInstance
  1004. * @stable ICU 2.0
  1005. */
  1006. Transliterator* createInverse(UErrorCode& status) const;
  1007. /**
  1008. * Returns a <code>Transliterator</code> object given its ID.
  1009. * The ID must be either a system transliterator ID or an ID registered
  1010. * using <code>registerInstance()</code>.
  1011. *
  1012. * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
  1013. * @param dir either FORWARD or REVERSE.
  1014. * @param parseError Struct to receive information on position
  1015. * of error if an error is encountered
  1016. * @param status Output param to be filled in with a success or an error.
  1017. * @return A <code>Transliterator</code> object with the given ID
  1018. * @see #registerInstance
  1019. * @see #getAvailableIDs
  1020. * @see #getID
  1021. * @stable ICU 2.0
  1022. */
  1023. static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
  1024. UTransDirection dir,
  1025. UParseError& parseError,
  1026. UErrorCode& status);
  1027. /**
  1028. * Returns a <code>Transliterator</code> object given its ID.
  1029. * The ID must be either a system transliterator ID or an ID registered
  1030. * using <code>registerInstance()</code>.
  1031. * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
  1032. * @param dir either FORWARD or REVERSE.
  1033. * @param status Output param to be filled in with a success or an error.
  1034. * @return A <code>Transliterator</code> object with the given ID
  1035. * @stable ICU 2.0
  1036. */
  1037. static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
  1038. UTransDirection dir,
  1039. UErrorCode& status);
  1040. /**
  1041. * Returns a <code>Transliterator</code> object constructed from
  1042. * the given rule string. This will be a rule-based Transliterator,
  1043. * if the rule string contains only rules, or a
  1044. * compound Transliterator, if it contains ID blocks, or a
  1045. * null Transliterator, if it contains ID blocks which parse as
  1046. * empty for the given direction.
  1047. *
  1048. * @param ID the id for the transliterator.
  1049. * @param rules rules, separated by ';'
  1050. * @param dir either FORWARD or REVERSE.
  1051. * @param parseError Struct to receive information on position
  1052. * of error if an error is encountered
  1053. * @param status Output param set to success/failure code.
  1054. * @return a newly created Transliterator
  1055. * @stable ICU 2.0
  1056. */
  1057. static Transliterator* U_EXPORT2 createFromRules(const UnicodeString& ID,
  1058. const UnicodeString& rules,
  1059. UTransDirection dir,
  1060. UParseError& parseError,
  1061. UErrorCode& status);
  1062. /**
  1063. * Create a rule string that can be passed to createFromRules()
  1064. * to recreate this transliterator.
  1065. * @param result the string to receive the rules. Previous
  1066. * contents will be deleted.
  1067. * @param escapeUnprintable if TRUE then convert unprintable
  1068. * characters to their hex escape representations, \\uxxxx or
  1069. * \\Uxxxxxxxx. Unprintable characters are those other than
  1070. * U+000A, U+0020..U+007E.
  1071. * @stable ICU 2.0
  1072. */
  1073. virtual UnicodeString& toRules(UnicodeString& result,
  1074. UBool escapeUnprintable) const;
  1075. /**
  1076. * Return the number of elements that make up this transliterator.
  1077. * For example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
  1078. * were created, the return value of this method would be 3.
  1079. *
  1080. * <p>If this transliterator is not composed of other
  1081. * transliterators, then this method returns 1.
  1082. * @return the number of transliterators that compose this
  1083. * transliterator, or 1 if this transliterator is not composed of
  1084. * multiple transliterators
  1085. * @stable ICU 3.0
  1086. */
  1087. int32_t countElements() const;
  1088. /**
  1089. * Return an element that makes up this transliterator. For
  1090. * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
  1091. * were created, the return value of this method would be one
  1092. * of the three transliterator objects that make up that
  1093. * transliterator: [NFD, Jamo-Latin, Latin-Greek].
  1094. *
  1095. * <p>If this transliterator is not composed of other
  1096. * transliterators, then this method will return a reference to
  1097. * this transliterator when given the index 0.
  1098. * @param index a value from 0..countElements()-1 indicating the
  1099. * transliterator to return
  1100. * @param ec input-output error code
  1101. * @return one of the transliterators that makes up this
  1102. * transliterator, if this transliterator is made up of multiple
  1103. * transliterators, otherwise a reference to this object if given
  1104. * an index of 0
  1105. * @stable ICU 3.0
  1106. */
  1107. const Transliterator& getElement(int32_t index, UErrorCode& ec) const;
  1108. /**
  1109. * Returns the set of all characters that may be modified in the
  1110. * input text by this Transliterator. This incorporates this
  1111. * object's current filter; if the filter is changed, the return
  1112. * value of this function will change. The default implementation
  1113. * returns an empty set. Some subclasses may override {@link
  1114. * #handleGetSourceSet } to return a more precise result. The
  1115. * return result is approximate in any case and is intended for
  1116. * use by tests, tools, or utilities.
  1117. * @param result receives result set; previous contents lost
  1118. * @return a reference to result
  1119. * @see #getTargetSet
  1120. * @see #handleGetSourceSet
  1121. * @stable ICU 2.4
  1122. */
  1123. UnicodeSet& getSourceSet(UnicodeSet& result) const;
  1124. /**
  1125. * Framework method that returns the set of all characters that
  1126. * may be modified in the input text by this Transliterator,
  1127. * ignoring the effect of this object's filter. The base class
  1128. * implementation returns the empty set. Subclasses that wish to
  1129. * implement this should override this method.
  1130. * @return the set of characters that this transliterator may
  1131. * modify. The set may be modified, so subclasses should return a
  1132. * newly-created object.
  1133. * @param result receives result set; previous contents lost
  1134. * @see #getSourceSet
  1135. * @see #getTargetSet
  1136. * @stable ICU 2.4
  1137. */
  1138. virtual void handleGetSourceSet(UnicodeSet& result) const;
  1139. /**
  1140. * Returns the set of all characters that may be generated as
  1141. * replacement text by this transliterator. The default
  1142. * implementation returns the empty set. Some subclasses may
  1143. * override this method to return a more precise result. The
  1144. * return result is approximate in any case and is intended for
  1145. * use by tests, tools, or utilities requiring such
  1146. * meta-information.
  1147. * @param result receives result set; previous contents lost
  1148. * @return a reference to result
  1149. * @see #getSourceSet
  1150. * @stable ICU 2.4
  1151. */
  1152. virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
  1153. public:
  1154. /**
  1155. * Registers a factory function that creates transliterators of
  1156. * a given ID.
  1157. *
  1158. * Because ICU may choose to cache Transliterators internally, this must
  1159. * be called at application startup, prior to any calls to
  1160. * Transliterator::createXXX to avoid undefined behavior.
  1161. *
  1162. * @param id the ID being registered
  1163. * @param factory a function pointer that will be copied and
  1164. * called later when the given ID is passed to createInstance()
  1165. * @param context a context pointer that will be stored and
  1166. * later passed to the factory function when an ID matching
  1167. * the registration ID is being instantiated with this factory.
  1168. * @stable ICU 2.0
  1169. */
  1170. static void U_EXPORT2 registerFactory(const UnicodeString& id,
  1171. Factory factory,
  1172. Token context);
  1173. /**
  1174. * Registers an instance <tt>obj</tt> of a subclass of
  1175. * <code>Transliterator</code> with the system. When
  1176. * <tt>createInstance()</tt> is called with an ID string that is
  1177. * equal to <tt>obj->getID()</tt>, then <tt>obj->clone()</tt> is
  1178. * returned.
  1179. *
  1180. * After this call the Transliterator class owns the adoptedObj
  1181. * and will delete it.
  1182. *
  1183. * Because ICU may choose to cache Transliterators internally, this must
  1184. * be called at application startup, prior to any calls to
  1185. * Transliterator::createXXX to avoid undefined behavior.
  1186. *
  1187. * @param adoptedObj an instance of subclass of
  1188. * <code>Transliterator</code> that defines <tt>clone()</tt>
  1189. * @see #createInstance
  1190. * @see #registerFactory
  1191. * @see #unregister
  1192. * @stable ICU 2.0
  1193. */
  1194. static void U_EXPORT2 registerInstance(Transliterator* adoptedObj);
  1195. /**
  1196. * Registers an ID string as an alias of another ID string.
  1197. * That is, after calling this function, <tt>createInstance(aliasID)</tt>
  1198. * will return the same thing as <tt>createInstance(realID)</tt>.
  1199. * This is generally used to create shorter, more mnemonic aliases
  1200. * for long compound IDs.
  1201. *
  1202. * @param aliasID The new ID being registered.
  1203. * @param realID The ID that the new ID is to be an alias for.
  1204. * This can be a compound ID and can include filters and should
  1205. * refer to transliterators that have already been registered with
  1206. * the framework, although this isn't checked.
  1207. * @stable ICU 3.6
  1208. */
  1209. static void U_EXPORT2 registerAlias(const UnicodeString& aliasID,
  1210. const UnicodeString& realID);
  1211. protected:
  1212. #ifndef U_HIDE_INTERNAL_API
  1213. /**
  1214. * @param id the ID being registered
  1215. * @param factory a function pointer that will be copied and
  1216. * called later when the given ID is passed to createInstance()
  1217. * @param context a context pointer that will be stored and
  1218. * later passed to the factory function when an ID matching
  1219. * the registration ID is being instantiated with this factory.
  1220. * @internal
  1221. */
  1222. static void _registerFactory(const UnicodeString& id,
  1223. Factory factory,
  1224. Token context);
  1225. /**
  1226. * @internal
  1227. */
  1228. static void _registerInstance(Transliterator* adoptedObj);
  1229. /**
  1230. * @internal
  1231. */
  1232. static void _registerAlias(const UnicodeString& aliasID, const UnicodeString& realID);
  1233. /**
  1234. * Register two targets as being inverses of one another. For
  1235. * example, calling _registerSpecialInverse("NFC", "NFD", true) causes
  1236. * Transliterator to form the following inverse relationships:
  1237. *
  1238. * <pre>NFC => NFD
  1239. * Any-NFC => Any-NFD
  1240. * NFD => NFC
  1241. * Any-NFD => Any-NFC</pre>
  1242. *
  1243. * (Without the special inverse registration, the inverse of NFC
  1244. * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
  1245. * that the presence or absence of "Any-" is preserved.
  1246. *
  1247. * <p>The relationship is symmetrical; registering (a, b) is
  1248. * equivalent to registering (b, a).
  1249. *
  1250. * <p>The relevant IDs must still be registered separately as
  1251. * factories or classes.
  1252. *
  1253. * <p>Only the targets are specified. Special inverses always
  1254. * have the form Any-Target1 <=> Any-Target2. The target should
  1255. * have canonical casing (the casing desired to be produced when
  1256. * an inverse is formed) and should contain no whitespace or other
  1257. * extraneous characters.
  1258. *
  1259. * @param target the target against which to register the inverse
  1260. * @param inverseTarget the inverse of target, that is
  1261. * Any-target.getInverse() => Any-inverseTarget
  1262. * @param bidirectional if true, register the reverse relation
  1263. * as well, that is, Any-inverseTarget.getInverse() => Any-target
  1264. * @internal
  1265. */
  1266. static void _registerSpecialInverse(const UnicodeString& target,
  1267. const UnicodeString& inverseTarget,
  1268. UBool bidirectional);
  1269. #endif /* U_HIDE_INTERNAL_API */
  1270. public:
  1271. /**
  1272. * Unregisters a transliterator or class. This may be either
  1273. * a system transliterator or a user transliterator or class.
  1274. * Any attempt to construct an unregistered transliterator based
  1275. * on its ID will fail.
  1276. *
  1277. * Because ICU may choose to cache Transliterators internally, this should
  1278. * be called during application shutdown, after all calls to
  1279. * Transliterator::createXXX to avoid undefined behavior.
  1280. *
  1281. * @param ID the ID of the transliterator or class
  1282. * @return the <code>Object</code> that was registered with
  1283. * <code>ID</code>, or <code>null</code> if none was
  1284. * @see #registerInstance
  1285. * @see #registerFactory
  1286. * @stable ICU 2.0
  1287. */
  1288. static void U_EXPORT2 unregister(const UnicodeString& ID);
  1289. public:
  1290. /**
  1291. * Return a StringEnumeration over the IDs available at the time of the
  1292. * call, including user-registered IDs.
  1293. * @param ec input-output error code
  1294. * @return a newly-created StringEnumeration over the transliterators
  1295. * available at the time of the call. The caller should delete this object
  1296. * when done using it.
  1297. * @stable ICU 3.0
  1298. */
  1299. static StringEnumeration* U_EXPORT2 getAvailableIDs(UErrorCode& ec);
  1300. /**
  1301. * Return the number of registered source specifiers.
  1302. * @return the number of registered source specifiers.
  1303. * @stable ICU 2.0
  1304. */
  1305. static int32_t U_EXPORT2 countAvailableSources(void);
  1306. /**
  1307. * Return a registered source specifier.
  1308. * @param index which specifier to return, from 0 to n-1, where
  1309. * n = countAvailableSources()
  1310. * @param result fill-in parameter to receive the source specifier.
  1311. * If index is out of range, result will be empty.
  1312. * @return reference to result
  1313. * @stable ICU 2.0
  1314. */
  1315. static UnicodeString& U_EXPORT2 getAvailableSource(int32_t index,
  1316. UnicodeString& result);
  1317. /**
  1318. * Return the number of registered target specifiers for a given
  1319. * source specifier.
  1320. * @param source the given source specifier.
  1321. * @return the number of registered target specifiers for a given
  1322. * source specifier.
  1323. * @stable ICU 2.0
  1324. */
  1325. static int32_t U_EXPORT2 countAvailableTargets(const UnicodeString& source);
  1326. /**
  1327. * Return a registered target specifier for a given source.
  1328. * @param index which specifier to return, from 0 to n-1, where
  1329. * n = countAvailableTargets(source)
  1330. * @param source the source specifier
  1331. * @param result fill-in parameter to receive the target specifier.
  1332. * If source is invalid or if index is out of range, result will
  1333. * be empty.
  1334. * @return reference to result
  1335. * @stable ICU 2.0
  1336. */
  1337. static UnicodeString& U_EXPORT2 getAvailableTarget(int32_t index,
  1338. const UnicodeString& source,
  1339. UnicodeString& result);
  1340. /**
  1341. * Return the number of registered variant specifiers for a given
  1342. * source-target pair.
  1343. * @param source the source specifier.
  1344. * @param target the target specifier.
  1345. * @stable ICU 2.0
  1346. */
  1347. static int32_t U_EXPORT2 countAvailableVariants(const UnicodeString& source,
  1348. const UnicodeString& target);
  1349. /**
  1350. * Return a registered variant specifier for a given source-target
  1351. * pair.
  1352. * @param index which specifier to return, from 0 to n-1, where
  1353. * n = countAvailableVariants(source, target)
  1354. * @param source the source specifier
  1355. * @param target the target specifier
  1356. * @param result fill-in parameter to receive the variant
  1357. * specifier. If source is invalid or if target is invalid or if
  1358. * index is out of range, result will be empty.
  1359. * @return reference to result
  1360. * @stable ICU 2.0
  1361. */
  1362. static UnicodeString& U_EXPORT2 getAvailableVariant(int32_t index,
  1363. const UnicodeString& source,
  1364. const UnicodeString& target,
  1365. UnicodeString& result);
  1366. protected:
  1367. #ifndef U_HIDE_INTERNAL_API
  1368. /**
  1369. * Non-mutexed internal method
  1370. * @internal
  1371. */
  1372. static int32_t _countAvailableSources(void);
  1373. /**
  1374. * Non-mutexed internal method
  1375. * @internal
  1376. */
  1377. static UnicodeString& _getAvailableSource(int32_t index,
  1378. UnicodeString& result);
  1379. /**
  1380. * Non-mutexed internal method
  1381. * @internal
  1382. */
  1383. static int32_t _countAvailableTargets(const UnicodeString& source);
  1384. /**
  1385. * Non-mutexed internal method
  1386. * @internal
  1387. */
  1388. static UnicodeString& _getAvailableTarget(int32_t index,
  1389. const UnicodeString& source,
  1390. UnicodeString& result);
  1391. /**
  1392. * Non-mutexed internal method
  1393. * @internal
  1394. */
  1395. static int32_t _countAvailableVariants(const UnicodeString& source,
  1396. const UnicodeString& target);
  1397. /**
  1398. * Non-mutexed internal method
  1399. * @internal
  1400. */
  1401. static UnicodeString& _getAvailableVariant(int32_t index,
  1402. const UnicodeString& source,
  1403. const UnicodeString& target,
  1404. UnicodeString& result);
  1405. #endif /* U_HIDE_INTERNAL_API */
  1406. protected:
  1407. /**
  1408. * Set the ID of this transliterator. Subclasses shouldn't do
  1409. * this, unless the underlying script behavior has changed.
  1410. * @param id the new id to be set.
  1411. * @stable ICU 2.4
  1412. */
  1413. void setID(const UnicodeString& id);
  1414. public:
  1415. /**
  1416. * Return the class ID for this class. This is useful only for
  1417. * comparing to a return value from getDynamicClassID().
  1418. * Note that Transliterator is an abstract base class, and therefore
  1419. * no fully constructed object will have a dynamic
  1420. * UClassID that equals the UClassID returned from
  1421. * Transliterator::getStaticClassID().
  1422. * @return The class ID for class Transliterator.
  1423. * @stable ICU 2.0
  1424. */
  1425. static UClassID U_EXPORT2 getStaticClassID(void);
  1426. /**
  1427. * Returns a unique class ID <b>polymorphically</b>. This method
  1428. * is to implement a simple version of RTTI, since not all C++
  1429. * compilers support genuine RTTI. Polymorphic operator==() and
  1430. * clone() methods call this method.
  1431. *
  1432. * <p>Concrete subclasses of Transliterator must use the
  1433. * UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro from
  1434. * uobject.h to provide the RTTI functions.
  1435. *
  1436. * @return The class ID for this object. All objects of a given
  1437. * class have the same class ID. Objects of other classes have
  1438. * different class IDs.
  1439. * @stable ICU 2.0
  1440. */
  1441. virtual UClassID getDynamicClassID(void) const = 0;
  1442. private:
  1443. static UBool initializeRegistry(UErrorCode &status);
  1444. public:
  1445. #ifndef U_HIDE_OBSOLETE_API
  1446. /**
  1447. * Return the number of IDs currently registered with the system.
  1448. * To retrieve the actual IDs, call getAvailableID(i) with
  1449. * i from 0 to countAvailableIDs() - 1.
  1450. * @return the number of IDs currently registered with the system.
  1451. * @obsolete ICU 3.4 use getAvailableIDs() instead
  1452. */
  1453. static int32_t U_EXPORT2 countAvailableIDs(void);
  1454. /**
  1455. * Return the index-th available ID. index must be between 0
  1456. * and countAvailableIDs() - 1, inclusive. If index is out of
  1457. * range, the result of getAvailableID(0) is returned.
  1458. * @param index the given ID index.
  1459. * @return the index-th available ID. index must be between 0
  1460. * and countAvailableIDs() - 1, inclusive. If index is out of
  1461. * range, the result of getAvailableID(0) is returned.
  1462. * @obsolete ICU 3.4 use getAvailableIDs() instead; this function
  1463. * is not thread safe, since it returns a reference to storage that
  1464. * may become invalid if another thread calls unregister
  1465. */
  1466. static const UnicodeString& U_EXPORT2 getAvailableID(int32_t index);
  1467. #endif /* U_HIDE_OBSOLETE_API */
  1468. };
  1469. inline int32_t Transliterator::getMaximumContextLength(void) const {
  1470. return maximumContextLength;
  1471. }
  1472. inline void Transliterator::setID(const UnicodeString& id) {
  1473. ID = id;
  1474. // NUL-terminate the ID string, which is a non-aliased copy.
  1475. ID.append((char16_t)0);
  1476. ID.truncate(ID.length()-1);
  1477. }
  1478. #ifndef U_HIDE_INTERNAL_API
  1479. inline Transliterator::Token Transliterator::integerToken(int32_t i) {
  1480. Token t;
  1481. t.integer = i;
  1482. return t;
  1483. }
  1484. inline Transliterator::Token Transliterator::pointerToken(void* p) {
  1485. Token t;
  1486. t.pointer = p;
  1487. return t;
  1488. }
  1489. #endif /* U_HIDE_INTERNAL_API */
  1490. U_NAMESPACE_END
  1491. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  1492. #endif /* U_SHOW_CPLUSPLUS_API */
  1493. #endif