ustring.h 72 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1998-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. *
  9. * File ustring.h
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 12/07/98 bertrand Creation.
  15. ******************************************************************************
  16. */
  17. #ifndef USTRING_H
  18. #define USTRING_H
  19. #include "unicode/utypes.h"
  20. #include "unicode/putil.h"
  21. #include "unicode/uiter.h"
  22. /**
  23. * \def UBRK_TYPEDEF_UBREAK_ITERATOR
  24. * @internal
  25. */
  26. #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  27. # define UBRK_TYPEDEF_UBREAK_ITERATOR
  28. /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
  29. typedef struct UBreakIterator UBreakIterator;
  30. #endif
  31. /**
  32. * \file
  33. * \brief C API: Unicode string handling functions
  34. *
  35. * These C API functions provide general Unicode string handling.
  36. *
  37. * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
  38. * functions. (For example, they do not check for bad arguments like NULL string pointers.)
  39. * In some cases, only the thread-safe variant of such a function is implemented here
  40. * (see u_strtok_r()).
  41. *
  42. * Other functions provide more Unicode-specific functionality like locale-specific
  43. * upper/lower-casing and string comparison in code point order.
  44. *
  45. * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
  46. * UTF-16 encodes each Unicode code point with either one or two UChar code units.
  47. * (This is the default form of Unicode, and a forward-compatible extension of the original,
  48. * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
  49. * in 1996.)
  50. *
  51. * Some APIs accept a 32-bit UChar32 value for a single code point.
  52. *
  53. * ICU also handles 16-bit Unicode text with unpaired surrogates.
  54. * Such text is not well-formed UTF-16.
  55. * Code-point-related functions treat unpaired surrogates as surrogate code points,
  56. * i.e., as separate units.
  57. *
  58. * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
  59. * it is much more efficient even for random access because the code unit values
  60. * for single-unit characters vs. lead units vs. trail units are completely disjoint.
  61. * This means that it is easy to determine character (code point) boundaries from
  62. * random offsets in the string.
  63. *
  64. * Unicode (UTF-16) string processing is optimized for the single-unit case.
  65. * Although it is important to support supplementary characters
  66. * (which use pairs of lead/trail code units called "surrogates"),
  67. * their occurrence is rare. Almost all characters in modern use require only
  68. * a single UChar code unit (i.e., their code point values are <=0xffff).
  69. *
  70. * For more details see the User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings/).
  71. * For a discussion of the handling of unpaired surrogates see also
  72. * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
  73. */
  74. /**
  75. * \defgroup ustring_ustrlen String Length
  76. * \ingroup ustring_strlen
  77. */
  78. /*@{*/
  79. /**
  80. * Determine the length of an array of UChar.
  81. *
  82. * @param s The array of UChars, NUL (U+0000) terminated.
  83. * @return The number of UChars in <code>s</code>, not counting the terminator.
  84. * @stable ICU 2.0
  85. */
  86. U_STABLE int32_t U_EXPORT2
  87. u_strlen(const UChar *s);
  88. /*@}*/
  89. /**
  90. * Count Unicode code points in the length UChar code units of the string.
  91. * A code point may occupy either one or two UChar code units.
  92. * Counting code points involves reading all code units.
  93. *
  94. * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
  95. *
  96. * @param s The input string.
  97. * @param length The number of UChar code units to be checked, or -1 to count all
  98. * code points before the first NUL (U+0000).
  99. * @return The number of code points in the specified code units.
  100. * @stable ICU 2.0
  101. */
  102. U_STABLE int32_t U_EXPORT2
  103. u_countChar32(const UChar *s, int32_t length);
  104. /**
  105. * Check if the string contains more Unicode code points than a certain number.
  106. * This is more efficient than counting all code points in the entire string
  107. * and comparing that number with a threshold.
  108. * This function may not need to scan the string at all if the length is known
  109. * (not -1 for NUL-termination) and falls within a certain range, and
  110. * never needs to count more than 'number+1' code points.
  111. * Logically equivalent to (u_countChar32(s, length)>number).
  112. * A Unicode code point may occupy either one or two UChar code units.
  113. *
  114. * @param s The input string.
  115. * @param length The length of the string, or -1 if it is NUL-terminated.
  116. * @param number The number of code points in the string is compared against
  117. * the 'number' parameter.
  118. * @return Boolean value for whether the string contains more Unicode code points
  119. * than 'number'. Same as (u_countChar32(s, length)>number).
  120. * @stable ICU 2.4
  121. */
  122. U_STABLE UBool U_EXPORT2
  123. u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
  124. /**
  125. * Concatenate two ustrings. Appends a copy of <code>src</code>,
  126. * including the null terminator, to <code>dst</code>. The initial copied
  127. * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
  128. *
  129. * @param dst The destination string.
  130. * @param src The source string.
  131. * @return A pointer to <code>dst</code>.
  132. * @stable ICU 2.0
  133. */
  134. U_STABLE UChar* U_EXPORT2
  135. u_strcat(UChar *dst,
  136. const UChar *src);
  137. /**
  138. * Concatenate two ustrings.
  139. * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
  140. * Adds a terminating NUL.
  141. * If src is too long, then only <code>n-1</code> characters will be copied
  142. * before the terminating NUL.
  143. * If <code>n&lt;=0</code> then dst is not modified.
  144. *
  145. * @param dst The destination string.
  146. * @param src The source string (can be NULL/invalid if n<=0).
  147. * @param n The maximum number of characters to append; no-op if <=0.
  148. * @return A pointer to <code>dst</code>.
  149. * @stable ICU 2.0
  150. */
  151. U_STABLE UChar* U_EXPORT2
  152. u_strncat(UChar *dst,
  153. const UChar *src,
  154. int32_t n);
  155. /**
  156. * Find the first occurrence of a substring in a string.
  157. * The substring is found at code point boundaries.
  158. * That means that if the substring begins with
  159. * a trail surrogate or ends with a lead surrogate,
  160. * then it is found only if these surrogates stand alone in the text.
  161. * Otherwise, the substring edge units would be matched against
  162. * halves of surrogate pairs.
  163. *
  164. * @param s The string to search (NUL-terminated).
  165. * @param substring The substring to find (NUL-terminated).
  166. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  167. * or <code>s</code> itself if the <code>substring</code> is empty,
  168. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  169. * @stable ICU 2.0
  170. *
  171. * @see u_strrstr
  172. * @see u_strFindFirst
  173. * @see u_strFindLast
  174. */
  175. U_STABLE UChar * U_EXPORT2
  176. u_strstr(const UChar *s, const UChar *substring);
  177. /**
  178. * Find the first occurrence of a substring in a string.
  179. * The substring is found at code point boundaries.
  180. * That means that if the substring begins with
  181. * a trail surrogate or ends with a lead surrogate,
  182. * then it is found only if these surrogates stand alone in the text.
  183. * Otherwise, the substring edge units would be matched against
  184. * halves of surrogate pairs.
  185. *
  186. * @param s The string to search.
  187. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  188. * @param substring The substring to find (NUL-terminated if subLength is -1).
  189. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  190. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  191. * or <code>s</code> itself if the <code>substring</code> is empty,
  192. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  193. * @stable ICU 2.4
  194. *
  195. * @see u_strstr
  196. * @see u_strFindLast
  197. */
  198. U_STABLE UChar * U_EXPORT2
  199. u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  200. /**
  201. * Find the first occurrence of a BMP code point in a string.
  202. * A surrogate code point is found only if its match in the text is not
  203. * part of a surrogate pair.
  204. * A NUL character is found at the string terminator.
  205. *
  206. * @param s The string to search (NUL-terminated).
  207. * @param c The BMP code point to find.
  208. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  209. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  210. * @stable ICU 2.0
  211. *
  212. * @see u_strchr32
  213. * @see u_memchr
  214. * @see u_strstr
  215. * @see u_strFindFirst
  216. */
  217. U_STABLE UChar * U_EXPORT2
  218. u_strchr(const UChar *s, UChar c);
  219. /**
  220. * Find the first occurrence of a code point in a string.
  221. * A surrogate code point is found only if its match in the text is not
  222. * part of a surrogate pair.
  223. * A NUL character is found at the string terminator.
  224. *
  225. * @param s The string to search (NUL-terminated).
  226. * @param c The code point to find.
  227. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  228. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  229. * @stable ICU 2.0
  230. *
  231. * @see u_strchr
  232. * @see u_memchr32
  233. * @see u_strstr
  234. * @see u_strFindFirst
  235. */
  236. U_STABLE UChar * U_EXPORT2
  237. u_strchr32(const UChar *s, UChar32 c);
  238. /**
  239. * Find the last occurrence of a substring in a string.
  240. * The substring is found at code point boundaries.
  241. * That means that if the substring begins with
  242. * a trail surrogate or ends with a lead surrogate,
  243. * then it is found only if these surrogates stand alone in the text.
  244. * Otherwise, the substring edge units would be matched against
  245. * halves of surrogate pairs.
  246. *
  247. * @param s The string to search (NUL-terminated).
  248. * @param substring The substring to find (NUL-terminated).
  249. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  250. * or <code>s</code> itself if the <code>substring</code> is empty,
  251. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  252. * @stable ICU 2.4
  253. *
  254. * @see u_strstr
  255. * @see u_strFindFirst
  256. * @see u_strFindLast
  257. */
  258. U_STABLE UChar * U_EXPORT2
  259. u_strrstr(const UChar *s, const UChar *substring);
  260. /**
  261. * Find the last occurrence of a substring in a string.
  262. * The substring is found at code point boundaries.
  263. * That means that if the substring begins with
  264. * a trail surrogate or ends with a lead surrogate,
  265. * then it is found only if these surrogates stand alone in the text.
  266. * Otherwise, the substring edge units would be matched against
  267. * halves of surrogate pairs.
  268. *
  269. * @param s The string to search.
  270. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  271. * @param substring The substring to find (NUL-terminated if subLength is -1).
  272. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  273. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  274. * or <code>s</code> itself if the <code>substring</code> is empty,
  275. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  276. * @stable ICU 2.4
  277. *
  278. * @see u_strstr
  279. * @see u_strFindFirst
  280. */
  281. U_STABLE UChar * U_EXPORT2
  282. u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  283. /**
  284. * Find the last occurrence of a BMP code point in a string.
  285. * A surrogate code point is found only if its match in the text is not
  286. * part of a surrogate pair.
  287. * A NUL character is found at the string terminator.
  288. *
  289. * @param s The string to search (NUL-terminated).
  290. * @param c The BMP code point to find.
  291. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  292. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  293. * @stable ICU 2.4
  294. *
  295. * @see u_strrchr32
  296. * @see u_memrchr
  297. * @see u_strrstr
  298. * @see u_strFindLast
  299. */
  300. U_STABLE UChar * U_EXPORT2
  301. u_strrchr(const UChar *s, UChar c);
  302. /**
  303. * Find the last occurrence of a code point in a string.
  304. * A surrogate code point is found only if its match in the text is not
  305. * part of a surrogate pair.
  306. * A NUL character is found at the string terminator.
  307. *
  308. * @param s The string to search (NUL-terminated).
  309. * @param c The code point to find.
  310. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  311. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  312. * @stable ICU 2.4
  313. *
  314. * @see u_strrchr
  315. * @see u_memrchr32
  316. * @see u_strrstr
  317. * @see u_strFindLast
  318. */
  319. U_STABLE UChar * U_EXPORT2
  320. u_strrchr32(const UChar *s, UChar32 c);
  321. /**
  322. * Locates the first occurrence in the string <code>string</code> of any of the characters
  323. * in the string <code>matchSet</code>.
  324. * Works just like C's strpbrk but with Unicode.
  325. *
  326. * @param string The string in which to search, NUL-terminated.
  327. * @param matchSet A NUL-terminated string defining a set of code points
  328. * for which to search in the text string.
  329. * @return A pointer to the character in <code>string</code> that matches one of the
  330. * characters in <code>matchSet</code>, or NULL if no such character is found.
  331. * @stable ICU 2.0
  332. */
  333. U_STABLE UChar * U_EXPORT2
  334. u_strpbrk(const UChar *string, const UChar *matchSet);
  335. /**
  336. * Returns the number of consecutive characters in <code>string</code>,
  337. * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
  338. * Works just like C's strcspn but with Unicode.
  339. *
  340. * @param string The string in which to search, NUL-terminated.
  341. * @param matchSet A NUL-terminated string defining a set of code points
  342. * for which to search in the text string.
  343. * @return The number of initial characters in <code>string</code> that do not
  344. * occur in <code>matchSet</code>.
  345. * @see u_strspn
  346. * @stable ICU 2.0
  347. */
  348. U_STABLE int32_t U_EXPORT2
  349. u_strcspn(const UChar *string, const UChar *matchSet);
  350. /**
  351. * Returns the number of consecutive characters in <code>string</code>,
  352. * beginning with the first, that occur somewhere in <code>matchSet</code>.
  353. * Works just like C's strspn but with Unicode.
  354. *
  355. * @param string The string in which to search, NUL-terminated.
  356. * @param matchSet A NUL-terminated string defining a set of code points
  357. * for which to search in the text string.
  358. * @return The number of initial characters in <code>string</code> that do
  359. * occur in <code>matchSet</code>.
  360. * @see u_strcspn
  361. * @stable ICU 2.0
  362. */
  363. U_STABLE int32_t U_EXPORT2
  364. u_strspn(const UChar *string, const UChar *matchSet);
  365. /**
  366. * The string tokenizer API allows an application to break a string into
  367. * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
  368. * original string) is maintained in saveState. In the first call, the
  369. * argument src is a pointer to the string. In subsequent calls to
  370. * return successive tokens of that string, src must be specified as
  371. * NULL. The value saveState is set by this function to maintain the
  372. * function's position within the string, and on each subsequent call
  373. * you must give this argument the same variable. This function does
  374. * handle surrogate pairs. This function is similar to strtok_r() from
  375. * the POSIX Threads Extension (1003.1c-1995).
  376. *
  377. * @param src String containing token(s). This string will be modified.
  378. * After the first call to u_strtok_r(), this argument must
  379. * be NULL to get to the next token.
  380. * @param delim Set of delimiter characters (Unicode code points).
  381. * @param saveState The current pointer within the original string,
  382. * which is set by this function. The saveState
  383. * parameter should be the address of a local variable of type
  384. * UChar *. (i.e. define "UChar *myLocalSaveState" and use
  385. * &myLocalSaveState for this parameter).
  386. * @return A pointer to the next token found in src, or NULL
  387. * when there are no more tokens.
  388. * @stable ICU 2.0
  389. */
  390. U_STABLE UChar * U_EXPORT2
  391. u_strtok_r(UChar *src,
  392. const UChar *delim,
  393. UChar **saveState);
  394. /**
  395. * Compare two Unicode strings for bitwise equality (code unit order).
  396. *
  397. * @param s1 A string to compare.
  398. * @param s2 A string to compare.
  399. * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
  400. * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
  401. * value if <code>s1</code> is bitwise greater than <code>s2</code>.
  402. * @stable ICU 2.0
  403. */
  404. U_STABLE int32_t U_EXPORT2
  405. u_strcmp(const UChar *s1,
  406. const UChar *s2);
  407. /**
  408. * Compare two Unicode strings in code point order.
  409. * See u_strCompare for details.
  410. *
  411. * @param s1 A string to compare.
  412. * @param s2 A string to compare.
  413. * @return a negative/zero/positive integer corresponding to whether
  414. * the first string is less than/equal to/greater than the second one
  415. * in code point order
  416. * @stable ICU 2.0
  417. */
  418. U_STABLE int32_t U_EXPORT2
  419. u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
  420. /**
  421. * Compare two Unicode strings (binary order).
  422. *
  423. * The comparison can be done in code unit order or in code point order.
  424. * They differ only in UTF-16 when
  425. * comparing supplementary code points (U+10000..U+10ffff)
  426. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  427. * In code unit order, high BMP code points sort after supplementary code points
  428. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  429. *
  430. * This function works with strings of different explicitly specified lengths
  431. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  432. * NUL-terminated strings are possible with length arguments of -1.
  433. *
  434. * @param s1 First source string.
  435. * @param length1 Length of first source string, or -1 if NUL-terminated.
  436. *
  437. * @param s2 Second source string.
  438. * @param length2 Length of second source string, or -1 if NUL-terminated.
  439. *
  440. * @param codePointOrder Choose between code unit order (FALSE)
  441. * and code point order (TRUE).
  442. *
  443. * @return <0 or 0 or >0 as usual for string comparisons
  444. *
  445. * @stable ICU 2.2
  446. */
  447. U_STABLE int32_t U_EXPORT2
  448. u_strCompare(const UChar *s1, int32_t length1,
  449. const UChar *s2, int32_t length2,
  450. UBool codePointOrder);
  451. /**
  452. * Compare two Unicode strings (binary order)
  453. * as presented by UCharIterator objects.
  454. * Works otherwise just like u_strCompare().
  455. *
  456. * Both iterators are reset to their start positions.
  457. * When the function returns, it is undefined where the iterators
  458. * have stopped.
  459. *
  460. * @param iter1 First source string iterator.
  461. * @param iter2 Second source string iterator.
  462. * @param codePointOrder Choose between code unit order (FALSE)
  463. * and code point order (TRUE).
  464. *
  465. * @return <0 or 0 or >0 as usual for string comparisons
  466. *
  467. * @see u_strCompare
  468. *
  469. * @stable ICU 2.6
  470. */
  471. U_STABLE int32_t U_EXPORT2
  472. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
  473. /**
  474. * Compare two strings case-insensitively using full case folding.
  475. * This is equivalent to
  476. * u_strCompare(u_strFoldCase(s1, options),
  477. * u_strFoldCase(s2, options),
  478. * (options&U_COMPARE_CODE_POINT_ORDER)!=0).
  479. *
  480. * The comparison can be done in UTF-16 code unit order or in code point order.
  481. * They differ only when comparing supplementary code points (U+10000..U+10ffff)
  482. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  483. * In code unit order, high BMP code points sort after supplementary code points
  484. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  485. *
  486. * This function works with strings of different explicitly specified lengths
  487. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  488. * NUL-terminated strings are possible with length arguments of -1.
  489. *
  490. * @param s1 First source string.
  491. * @param length1 Length of first source string, or -1 if NUL-terminated.
  492. *
  493. * @param s2 Second source string.
  494. * @param length2 Length of second source string, or -1 if NUL-terminated.
  495. *
  496. * @param options A bit set of options:
  497. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  498. * Comparison in code unit order with default case folding.
  499. *
  500. * - U_COMPARE_CODE_POINT_ORDER
  501. * Set to choose code point order instead of code unit order
  502. * (see u_strCompare for details).
  503. *
  504. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  505. *
  506. * @param pErrorCode Must be a valid pointer to an error code value,
  507. * which must not indicate a failure before the function call.
  508. *
  509. * @return <0 or 0 or >0 as usual for string comparisons
  510. *
  511. * @stable ICU 2.2
  512. */
  513. U_STABLE int32_t U_EXPORT2
  514. u_strCaseCompare(const UChar *s1, int32_t length1,
  515. const UChar *s2, int32_t length2,
  516. uint32_t options,
  517. UErrorCode *pErrorCode);
  518. /**
  519. * Compare two ustrings for bitwise equality.
  520. * Compares at most <code>n</code> characters.
  521. *
  522. * @param ucs1 A string to compare (can be NULL/invalid if n<=0).
  523. * @param ucs2 A string to compare (can be NULL/invalid if n<=0).
  524. * @param n The maximum number of characters to compare; always returns 0 if n<=0.
  525. * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
  526. * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
  527. * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
  528. * @stable ICU 2.0
  529. */
  530. U_STABLE int32_t U_EXPORT2
  531. u_strncmp(const UChar *ucs1,
  532. const UChar *ucs2,
  533. int32_t n);
  534. /**
  535. * Compare two Unicode strings in code point order.
  536. * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
  537. * For details, see u_strCompare().
  538. *
  539. * @param s1 A string to compare.
  540. * @param s2 A string to compare.
  541. * @param n The maximum number of characters to compare.
  542. * @return a negative/zero/positive integer corresponding to whether
  543. * the first string is less than/equal to/greater than the second one
  544. * in code point order
  545. * @stable ICU 2.0
  546. */
  547. U_STABLE int32_t U_EXPORT2
  548. u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
  549. /**
  550. * Compare two strings case-insensitively using full case folding.
  551. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
  552. *
  553. * @param s1 A string to compare.
  554. * @param s2 A string to compare.
  555. * @param options A bit set of options:
  556. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  557. * Comparison in code unit order with default case folding.
  558. *
  559. * - U_COMPARE_CODE_POINT_ORDER
  560. * Set to choose code point order instead of code unit order
  561. * (see u_strCompare for details).
  562. *
  563. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  564. *
  565. * @return A negative, zero, or positive integer indicating the comparison result.
  566. * @stable ICU 2.0
  567. */
  568. U_STABLE int32_t U_EXPORT2
  569. u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
  570. /**
  571. * Compare two strings case-insensitively using full case folding.
  572. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
  573. * u_strFoldCase(s2, at most n, options)).
  574. *
  575. * @param s1 A string to compare.
  576. * @param s2 A string to compare.
  577. * @param n The maximum number of characters in each string to case-fold and then compare.
  578. * @param options A bit set of options:
  579. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  580. * Comparison in code unit order with default case folding.
  581. *
  582. * - U_COMPARE_CODE_POINT_ORDER
  583. * Set to choose code point order instead of code unit order
  584. * (see u_strCompare for details).
  585. *
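  586. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  587. * Set to exclude the mappings for dotted I and dotless i that are marked with 'T' in CaseFolding.txt (see u_strFoldCase).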
  588. * @return A negative, zero, or positive integer indicating the comparison result.
  589. * @stable ICU 2.0
  590. */
  591. U_STABLE int32_t U_EXPORT2
  592. u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
  593. /**
  594. * Compare two strings case-insensitively using full case folding.
  595. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options),
  596. * u_strFoldCase(s2, length, options)).
  597. *
  598. * @param s1 A string to compare.
  599. * @param s2 A string to compare.
  600. * @param length The number of characters in each string to case-fold and then compare.
  601. * @param options A bit set of options:
  602. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  603. * Comparison in code unit order with default case folding.
  604. *
  605. * - U_COMPARE_CODE_POINT_ORDER
  606. * Set to choose code point order instead of code unit order
  607. * (see u_strCompare for details).
  608. *
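  609. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  610. * Set to exclude the mappings for dotted I and dotless i that are marked with 'T' in CaseFolding.txt (see u_strFoldCase).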
  611. * @return A negative, zero, or positive integer indicating the comparison result.
  612. * @stable ICU 2.0
  613. */
  614. U_STABLE int32_t U_EXPORT2
  615. u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
  616. /**
  617. * Copy a ustring. Adds a null terminator.
  618. *
  619. * @param dst The destination string.
  620. * @param src The source string.
  621. * @return A pointer to <code>dst</code>.
  622. * @stable ICU 2.0
  623. */
  624. U_STABLE UChar* U_EXPORT2
  625. u_strcpy(UChar *dst,
  626. const UChar *src);
  627. /**
  628. * Copy a ustring.
  629. * Copies at most <code>n</code> characters. The result will be null terminated
  630. * if the length of <code>src</code> is less than <code>n</code>.
  631. *
  632. * @param dst The destination string.
  633. * @param src The source string (can be NULL/invalid if n<=0).
  634. * @param n The maximum number of characters to copy; no-op if <=0.
  635. * @return A pointer to <code>dst</code>.
  636. * @stable ICU 2.0
  637. */
  638. U_STABLE UChar* U_EXPORT2
  639. u_strncpy(UChar *dst,
  640. const UChar *src,
  641. int32_t n);
  642. #if !UCONFIG_NO_CONVERSION
  643. /**
  644. * Copy a byte string encoded in the default codepage to a ustring.
  645. * Adds a null terminator.
  646. * Performs a host byte to UChar conversion
  647. *
  648. * @param dst The destination string.
  649. * @param src The source string.
  650. * @return A pointer to <code>dst</code>.
  651. * @stable ICU 2.0
  652. */
  653. U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
  654. const char *src );
  655. /**
  656. * Copy a byte string encoded in the default codepage to a ustring.
  657. * Copies at most <code>n</code> characters. The result will be null terminated
  658. * if the length of <code>src</code> is less than <code>n</code>.
  659. * Performs a host byte to UChar conversion
  660. *
  661. * @param dst The destination string.
  662. * @param src The source string.
  663. * @param n The maximum number of characters to copy.
  664. * @return A pointer to <code>dst</code>.
  665. * @stable ICU 2.0
  666. */
  667. U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
  668. const char *src,
  669. int32_t n);
  670. /**
  671. * Copy ustring to a byte string encoded in the default codepage.
  672. * Adds a null terminator.
  673. * Performs a UChar to host byte conversion
  674. *
  675. * @param dst The destination string.
  676. * @param src The source string.
  677. * @return A pointer to <code>dst</code>.
  678. * @stable ICU 2.0
  679. */
  680. U_STABLE char* U_EXPORT2 u_austrcpy(char *dst,
  681. const UChar *src );
  682. /**
  683. * Copy ustring to a byte string encoded in the default codepage.
  684. * Copies at most <code>n</code> characters. The result will be null terminated
  685. * if the length of <code>src</code> is less than <code>n</code>.
  686. * Performs a UChar to host byte conversion
  687. *
  688. * @param dst The destination string.
  689. * @param src The source string.
  690. * @param n The maximum number of characters to copy.
  691. * @return A pointer to <code>dst</code>.
  692. * @stable ICU 2.0
  693. */
  694. U_STABLE char* U_EXPORT2 u_austrncpy(char *dst,
  695. const UChar *src,
  696. int32_t n );
  697. #endif
  698. /**
  699. * Synonym for memcpy(), but with UChars only.
  700. * @param dest The destination string
  701. * @param src The source string (can be NULL/invalid if count<=0)
  702. * @param count The number of characters to copy; no-op if <=0
  703. * @return A pointer to <code>dest</code>
  704. * @stable ICU 2.0
  705. */
  706. U_STABLE UChar* U_EXPORT2
  707. u_memcpy(UChar *dest, const UChar *src, int32_t count);
  708. /**
  709. * Synonym for memmove(), but with UChars only.
  710. * @param dest The destination string
  711. * @param src The source string (can be NULL/invalid if count<=0)
  712. * @param count The number of characters to move; no-op if <=0
  713. * @return A pointer to <code>dest</code>
  714. * @stable ICU 2.0
  715. */
  716. U_STABLE UChar* U_EXPORT2
  717. u_memmove(UChar *dest, const UChar *src, int32_t count);
  718. /**
  719. * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
  720. *
  721. * @param dest The destination string.
  722. * @param c The character to initialize the string.
  723. * @param count The number of characters to set.
  724. * @return A pointer to <code>dest</code>.
  725. * @stable ICU 2.0
  726. */
  727. U_STABLE UChar* U_EXPORT2
  728. u_memset(UChar *dest, UChar c, int32_t count);
  729. /**
  730. * Compare the first <code>count</code> UChars of each buffer.
  731. *
  732. * @param buf1 The first string to compare.
  733. * @param buf2 The second string to compare.
  734. * @param count The maximum number of UChars to compare.
  735. * @return When buf1 < buf2, a negative number is returned.
  736. * When buf1 == buf2, 0 is returned.
  737. * When buf1 > buf2, a positive number is returned.
  738. * @stable ICU 2.0
  739. */
  740. U_STABLE int32_t U_EXPORT2
  741. u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
  742. /**
  743. * Compare two Unicode strings in code point order.
  744. * This is different in UTF-16 from u_memcmp() if supplementary characters are present.
  745. * For details, see u_strCompare().
  746. *
  747. * @param s1 A string to compare.
  748. * @param s2 A string to compare.
  749. * @param count The maximum number of characters to compare.
  750. * @return a negative/zero/positive integer corresponding to whether
  751. * the first string is less than/equal to/greater than the second one
  752. * in code point order
  753. * @stable ICU 2.0
  754. */
  755. U_STABLE int32_t U_EXPORT2
  756. u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
  757. /**
  758. * Find the first occurrence of a BMP code point in a string.
  759. * A surrogate code point is found only if its match in the text is not
  760. * part of a surrogate pair.
  761. * A NUL character is found at the string terminator.
  762. *
  763. * @param s The string to search (contains <code>count</code> UChars).
  764. * @param c The BMP code point to find.
  765. * @param count The length of the string.
  766. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  767. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  768. * @stable ICU 2.0
  769. *
  770. * @see u_strchr
  771. * @see u_memchr32
  772. * @see u_strFindFirst
  773. */
  774. U_STABLE UChar* U_EXPORT2
  775. u_memchr(const UChar *s, UChar c, int32_t count);
  776. /**
  777. * Find the first occurrence of a code point in a string.
  778. * A surrogate code point is found only if its match in the text is not
  779. * part of a surrogate pair.
  780. * A NUL character is found at the string terminator.
  781. *
  782. * @param s The string to search (contains <code>count</code> UChars).
  783. * @param c The code point to find.
  784. * @param count The length of the string.
  785. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  786. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  787. * @stable ICU 2.0
  788. *
  789. * @see u_strchr32
  790. * @see u_memchr
  791. * @see u_strFindFirst
  792. */
  793. U_STABLE UChar* U_EXPORT2
  794. u_memchr32(const UChar *s, UChar32 c, int32_t count);
  795. /**
  796. * Find the last occurrence of a BMP code point in a string.
  797. * A surrogate code point is found only if its match in the text is not
  798. * part of a surrogate pair.
  799. * A NUL character is found at the string terminator.
  800. *
  801. * @param s The string to search (contains <code>count</code> UChars).
  802. * @param c The BMP code point to find.
  803. * @param count The length of the string.
  804. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  805. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  806. * @stable ICU 2.4
  807. *
  808. * @see u_strrchr
  809. * @see u_memrchr32
  810. * @see u_strFindLast
  811. */
  812. U_STABLE UChar* U_EXPORT2
  813. u_memrchr(const UChar *s, UChar c, int32_t count);
  814. /**
  815. * Find the last occurrence of a code point in a string.
  816. * A surrogate code point is found only if its match in the text is not
  817. * part of a surrogate pair.
  818. * A NUL character is found at the string terminator.
  819. *
  820. * @param s The string to search (contains <code>count</code> UChars).
  821. * @param c The code point to find.
  822. * @param count The length of the string.
  823. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  824. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  825. * @stable ICU 2.4
  826. *
  827. * @see u_strrchr32
  828. * @see u_memrchr
  829. * @see u_strFindLast
  830. */
  831. U_STABLE UChar* U_EXPORT2
  832. u_memrchr32(const UChar *s, UChar32 c, int32_t count);
  833. /**
  834. * Unicode String literals in C.
  835. * We need one macro to declare a variable for the string
  836. * and to statically preinitialize it if possible,
  837. * and a second macro to dynamically initialize such a string variable if necessary.
  838. *
  839. * The macros are defined for maximum performance.
  840. * They work only for strings that contain "invariant characters", i.e.,
  841. * only latin letters, digits, and some punctuation.
  842. * See utypes.h for details.
  843. *
  844. * A pair of macros for a single string must be used with the same
  845. * parameters.
  846. * The string parameter must be a C string literal.
  847. * The length of the string, not including the terminating
  848. * `NUL`, must be specified as a constant.
  849. * The U_STRING_DECL macro should be invoked exactly once for one
  850. * such string variable before it is used.
  851. *
  852. * Usage:
  853. *
  854. * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
  855. * U_STRING_DECL(ustringVar2, "jumps 5%", 8);
  856. * static UBool didInit=FALSE;
  857. *
  858. * int32_t function() {
  859. * if(!didInit) {
  860. * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
  861. * U_STRING_INIT(ustringVar2, "jumps 5%", 8);
  862. * didInit=TRUE;
  863. * }
  864. * return u_strcmp(ustringVar1, ustringVar2);
  865. * }
  866. *
  867. * Note that the macros will NOT consistently work if their argument is another #`define`.
  868. * The following will not work on all platforms, don't use it.
  869. *
  870. * #define GLUCK "Mr. Gluck"
  871. * U_STRING_DECL(var, GLUCK, 9)
  872. * U_STRING_INIT(var, GLUCK, 9)
  873. *
  874. * Instead, use the string literal "Mr. Gluck" as the argument to both macro
  875. * calls.
  876. *
  877. *
  878. * @stable ICU 2.0
  879. */
  880. #if defined(U_DECLARE_UTF16) /* compiler can emit a UTF-16 literal directly */
  881. # define U_STRING_DECL(var, cs, length) static const UChar *var=(const UChar *)U_DECLARE_UTF16(cs)
  882. /**@stable ICU 2.0 */
  883. # define U_STRING_INIT(var, cs, length)
  884. #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
  885. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
  886. /**@stable ICU 2.0 */
  887. # define U_STRING_INIT(var, cs, length)
  888. #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
  889. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=cs
  890. /**@stable ICU 2.0 */
  891. # define U_STRING_INIT(var, cs, length)
  892. #else /* no static initializer: declare a writable array and fill it at run time */
  893. # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
  894. /**@stable ICU 2.0 */
  895. # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1) /* (length) parenthesized to match U_STRING_DECL */
  896. #endif
  897. /**
  898. * Unescape a string of characters and write the resulting
  899. * Unicode characters to the destination buffer. The following escape
  900. * sequences are recognized:
  901. *
  902. * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
  903. * \\Uhhhhhhhh 8 hex digits
  904. * \\xhh 1-2 hex digits
  905. * \\x{h...} 1-8 hex digits
  906. * \\ooo 1-3 octal digits; o in [0-7]
  907. * \\cX control-X; X is masked with 0x1F
  908. *
  909. * as well as the standard ANSI C escapes:
  910. *
  911. * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
  912. * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
  913. * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
  914. *
  915. * Anything else following a backslash is generically escaped. For
  916. * example, "[a\\-z]" returns "[a-z]".
  917. *
  918. * If an escape sequence is ill-formed, an empty string is stored
  919. * in dest (if possible). An example of an ill-formed sequence is
  920. * "\\u" followed by fewer than 4 hex digits.
  921. *
  922. * The above characters are recognized in the compiler's codepage,
  923. * that is, they are coded as 'u', '\\', etc. Characters that are
  924. * not parts of escape sequences are converted using u_charsToUChars().
  925. *
  926. * This function is similar to UnicodeString::unescape() but not
  927. * identical to it. The latter takes a source UnicodeString, so it
  928. * does escape recognition but no conversion.
  929. *
  930. * @param src a zero-terminated string of invariant characters
  931. * @param dest pointer to buffer to receive converted and unescaped
  932. * text and, if there is room, a zero terminator. May be NULL for
  933. * preflighting, in which case no UChars will be written, but the
  934. * return value will still be valid. On error, an empty string is
  935. * stored here (if possible).
  936. * @param destCapacity the number of UChars that may be written at
  937. * dest. Ignored if dest == NULL.
  938. * @return the length of unescaped string.
  939. * @see u_unescapeAt
  940. * @see UnicodeString#unescape()
  941. * @see UnicodeString#unescapeAt()
  942. * @stable ICU 2.0
  943. */
  944. U_STABLE int32_t U_EXPORT2
  945. u_unescape(const char *src,
  946. UChar *dest, int32_t destCapacity);
  947. U_CDECL_BEGIN
  948. /**
  949. * Callback function for u_unescapeAt() that returns a character of
  950. * the source text given an offset and a context pointer. The context
  951. * pointer will be whatever is passed into u_unescapeAt().
  952. *
  953. * @param offset the offset into the source text of the character to return.
  954. * @param context an opaque pointer passed directly into u_unescapeAt()
  955. * @return the character of the source text at
  956. * offset
  957. * @see u_unescapeAt
  958. * @stable ICU 2.0
  959. */
  960. typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
  961. U_CDECL_END
  962. /**
  963. * Unescape a single sequence. The character at offset-1 is assumed
  964. * (without checking) to be a backslash. This method takes a callback
  965. * pointer to a function that returns the UChar at a given offset. By
  966. * varying this callback, ICU functions are able to unescape char*
  967. * strings, UnicodeString objects, and UFILE pointers.
  968. *
  969. * If offset is out of range, or if the escape sequence is ill-formed,
  970. * (UChar32)0xFFFFFFFF is returned. See documentation of u_unescape()
  971. * for a list of recognized sequences.
  972. *
  973. * @param charAt callback function that returns a UChar of the source
  974. * text given an offset and a context pointer.
  975. * @param offset pointer to the offset that will be passed to charAt.
  976. * The offset value will be updated upon return to point after the
  977. * last parsed character of the escape sequence. On error the offset
  978. * is unchanged.
  979. * @param length the number of characters in the source text. The
  980. * last character of the source text is considered to be at offset
  981. * length-1.
  982. * @param context an opaque pointer passed directly into charAt.
  983. * @return the character represented by the escape sequence at
  984. * offset, or (UChar32)0xFFFFFFFF on error.
  985. * @see u_unescape()
  986. * @see UnicodeString#unescape()
  987. * @see UnicodeString#unescapeAt()
  988. * @stable ICU 2.0
  989. */
  990. U_STABLE UChar32 U_EXPORT2
  991. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  992. int32_t *offset,
  993. int32_t length,
  994. void *context);
  995. /**
  996. * Uppercase the characters in a string.
  997. * Casing is locale-dependent and context-sensitive.
  998. * The result may be longer or shorter than the original.
  999. * The source string and the destination buffer are allowed to overlap.
  1000. *
  1001. * @param dest A buffer for the result string. The result will be zero-terminated if
  1002. * the buffer is large enough.
  1003. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1004. * dest may be NULL and the function will only return the length of the result
  1005. * without writing any of the result string.
  1006. * @param src The original string
  1007. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1008. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1009. * @param pErrorCode Must be a valid pointer to an error code value,
  1010. * which must not indicate a failure before the function call.
  1011. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1012. * only some of the result was written to the destination buffer.
  1013. * @stable ICU 2.0
  1014. */
  1015. U_STABLE int32_t U_EXPORT2
  1016. u_strToUpper(UChar *dest, int32_t destCapacity,
  1017. const UChar *src, int32_t srcLength,
  1018. const char *locale,
  1019. UErrorCode *pErrorCode);
  1020. /**
  1021. * Lowercase the characters in a string.
  1022. * Casing is locale-dependent and context-sensitive.
  1023. * The result may be longer or shorter than the original.
  1024. * The source string and the destination buffer are allowed to overlap.
  1025. *
  1026. * @param dest A buffer for the result string. The result will be zero-terminated if
  1027. * the buffer is large enough.
  1028. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1029. * dest may be NULL and the function will only return the length of the result
  1030. * without writing any of the result string.
  1031. * @param src The original string
  1032. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1033. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1034. * @param pErrorCode Must be a valid pointer to an error code value,
  1035. * which must not indicate a failure before the function call.
  1036. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1037. * only some of the result was written to the destination buffer.
  1038. * @stable ICU 2.0
  1039. */
  1040. U_STABLE int32_t U_EXPORT2
  1041. u_strToLower(UChar *dest, int32_t destCapacity,
  1042. const UChar *src, int32_t srcLength,
  1043. const char *locale,
  1044. UErrorCode *pErrorCode);
  1045. #if !UCONFIG_NO_BREAK_ITERATION
  1046. /**
  1047. * Titlecase a string.
  1048. * Casing is locale-dependent and context-sensitive.
  1049. * Titlecasing uses a break iterator to find the first characters of words
  1050. * that are to be titlecased. It titlecases those characters and lowercases
  1051. * all others.
  1052. *
  1053. * The titlecase break iterator can be provided to customize for arbitrary
  1054. * styles, using rules and dictionaries beyond the standard iterators.
  1055. * It may be more efficient to always provide an iterator to avoid
  1056. * opening and closing one for each string.
  1057. * The standard titlecase iterator for the root locale implements the
  1058. * algorithm of Unicode TR 21.
  1059. *
  1060. * This function uses only the setText(), first() and next() methods of the
  1061. * provided break iterator.
  1062. *
  1063. * The result may be longer or shorter than the original.
  1064. * The source string and the destination buffer are allowed to overlap.
  1065. *
  1066. * @param dest A buffer for the result string. The result will be zero-terminated if
  1067. * the buffer is large enough.
  1068. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1069. * dest may be NULL and the function will only return the length of the result
  1070. * without writing any of the result string.
  1071. * @param src The original string
  1072. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1073. * @param titleIter A break iterator to find the first characters of words
  1074. * that are to be titlecased.
  1075. * If none is provided (NULL), then a standard titlecase
  1076. * break iterator is opened.
  1077. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1078. * @param pErrorCode Must be a valid pointer to an error code value,
  1079. * which must not indicate a failure before the function call.
  1080. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1081. * only some of the result was written to the destination buffer.
  1082. * @stable ICU 2.1
  1083. */
  1084. U_STABLE int32_t U_EXPORT2
  1085. u_strToTitle(UChar *dest, int32_t destCapacity,
  1086. const UChar *src, int32_t srcLength,
  1087. UBreakIterator *titleIter,
  1088. const char *locale,
  1089. UErrorCode *pErrorCode);
  1090. #endif
  1091. /**
  1092. * Case-folds the characters in a string.
  1093. *
  1094. * Case-folding is locale-independent and not context-sensitive,
  1095. * but there is an option for whether to include or exclude mappings for dotted I
  1096. * and dotless i that are marked with 'T' in CaseFolding.txt.
  1097. *
  1098. * The result may be longer or shorter than the original.
  1099. * The source string and the destination buffer are allowed to overlap.
  1100. *
  1101. * @param dest A buffer for the result string. The result will be zero-terminated if
  1102. * the buffer is large enough.
  1103. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1104. * dest may be NULL and the function will only return the length of the result
  1105. * without writing any of the result string.
  1106. * @param src The original string
  1107. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1108. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
  1109. * @param pErrorCode Must be a valid pointer to an error code value,
  1110. * which must not indicate a failure before the function call.
  1111. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1112. * only some of the result was written to the destination buffer.
  1113. * @stable ICU 2.0
  1114. */
  1115. U_STABLE int32_t U_EXPORT2
  1116. u_strFoldCase(UChar *dest, int32_t destCapacity,
  1117. const UChar *src, int32_t srcLength,
  1118. uint32_t options,
  1119. UErrorCode *pErrorCode);
  1120. #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  1121. /**
  1122. * Convert a UTF-16 string to a wchar_t string.
  1123. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1124. * this function simply calls the fast, dedicated function for that.
  1125. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
  1126. *
  1127. * @param dest A buffer for the result string. The result will be zero-terminated if
  1128. * the buffer is large enough.
  1129. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
  1130. * dest may be NULL and the function will only return the length of the
  1131. * result without writing any of the result string (pre-flighting).
  1132. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1133. * pDestLength!=NULL then *pDestLength is always set to the
  1134. * number of output units corresponding to the transformation of
  1135. * all the input units, even in case of a buffer overflow.
  1136. * @param src The original source string
  1137. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1138. * @param pErrorCode Must be a valid pointer to an error code value,
  1139. * which must not indicate a failure before the function call.
  1140. * @return The pointer to destination buffer.
  1141. * @stable ICU 2.0
  1142. */
  1143. U_STABLE wchar_t* U_EXPORT2
  1144. u_strToWCS(wchar_t *dest,
  1145. int32_t destCapacity,
  1146. int32_t *pDestLength,
  1147. const UChar *src,
  1148. int32_t srcLength,
  1149. UErrorCode *pErrorCode);
  1150. /**
  1151. * Convert a wchar_t string to UTF-16.
  1152. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1153. * this function simply calls the fast, dedicated function for that.
  1154. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
  1155. *
  1156. * @param dest A buffer for the result string. The result will be zero-terminated if
  1157. * the buffer is large enough.
  1158. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1159. * dest may be NULL and the function will only return the length of the
  1160. * result without writing any of the result string (pre-flighting).
  1161. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1162. * pDestLength!=NULL then *pDestLength is always set to the
  1163. * number of output units corresponding to the transformation of
  1164. * all the input units, even in case of a buffer overflow.
  1165. * @param src The original source string
  1166. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1167. * @param pErrorCode Must be a valid pointer to an error code value,
  1168. * which must not indicate a failure before the function call.
  1169. * @return The pointer to destination buffer.
  1170. * @stable ICU 2.0
  1171. */
  1172. U_STABLE UChar* U_EXPORT2
  1173. u_strFromWCS(UChar *dest,
  1174. int32_t destCapacity,
  1175. int32_t *pDestLength,
  1176. const wchar_t *src,
  1177. int32_t srcLength,
  1178. UErrorCode *pErrorCode);
  1179. #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  1180. /**
  1181. * Convert a UTF-16 string to UTF-8.
  1182. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1183. *
  1184. * @param dest A buffer for the result string. The result will be zero-terminated if
  1185. * the buffer is large enough.
  1186. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1187. * dest may be NULL and the function will only return the length of the
  1188. * result without writing any of the result string (pre-flighting).
  1189. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1190. * pDestLength!=NULL then *pDestLength is always set to the
  1191. * number of output units corresponding to the transformation of
  1192. * all the input units, even in case of a buffer overflow.
  1193. * @param src The original source string
  1194. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1195. * @param pErrorCode Must be a valid pointer to an error code value,
  1196. * which must not indicate a failure before the function call.
  1197. * @return The pointer to destination buffer.
  1198. * @stable ICU 2.0
  1199. * @see u_strToUTF8WithSub
  1200. * @see u_strFromUTF8
  1201. */
  1202. U_STABLE char* U_EXPORT2
  1203. u_strToUTF8(char *dest,
  1204. int32_t destCapacity,
  1205. int32_t *pDestLength,
  1206. const UChar *src,
  1207. int32_t srcLength,
  1208. UErrorCode *pErrorCode);
  1209. /**
  1210. * Convert a UTF-8 string to UTF-16.
  1211. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1212. *
  1213. * @param dest A buffer for the result string. The result will be zero-terminated if
  1214. * the buffer is large enough.
  1215. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1216. * dest may be NULL and the function will only return the length of the
  1217. * result without writing any of the result string (pre-flighting).
  1218. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1219. * pDestLength!=NULL then *pDestLength is always set to the
  1220. * number of output units corresponding to the transformation of
  1221. * all the input units, even in case of a buffer overflow.
  1222. * @param src The original source string
  1223. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1224. * @param pErrorCode Must be a valid pointer to an error code value,
  1225. * which must not indicate a failure before the function call.
  1226. * @return The pointer to destination buffer.
  1227. * @stable ICU 2.0
  1228. * @see u_strFromUTF8WithSub
  1229. * @see u_strFromUTF8Lenient
  1230. */
  1231. U_STABLE UChar* U_EXPORT2
  1232. u_strFromUTF8(UChar *dest,
  1233. int32_t destCapacity,
  1234. int32_t *pDestLength,
  1235. const char *src,
  1236. int32_t srcLength,
  1237. UErrorCode *pErrorCode);
  1238. /**
  1239. * Convert a UTF-16 string to UTF-8.
  1240. *
  1241. * Same as u_strToUTF8() except for the additional subchar, which is written to the output
  1242. * in place of each illegal input sequence instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1243. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
  1244. *
  1245. * @param dest A buffer for the result string. The result will be zero-terminated if
  1246. * the buffer is large enough.
  1247. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1248. * dest may be NULL and the function will only return the length of the
  1249. * result without writing any of the result string (pre-flighting).
  1250. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1251. * pDestLength!=NULL then *pDestLength is always set to the
  1252. * number of output units corresponding to the transformation of
  1253. * all the input units, even in case of a buffer overflow.
  1254. * @param src The original source string (UTF-16).
  1255. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1256. * @param subchar The substitution character to use in place of an illegal input sequence,
  1257. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1258. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1259. * except for surrogate code points (U+D800..U+DFFF).
  1260. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1261. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1262. * Set to 0 if no substitutions occur or subchar<0.
  1263. * pNumSubstitutions can be NULL.
  1264. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1265. * pass the U_SUCCESS() test, or else the function returns
  1266. * immediately. Check for U_FAILURE() on output or use with
  1267. * function chaining. (See User Guide for details.)
  1268. * @return The pointer to the destination buffer (dest).
  1269. * @see u_strToUTF8
  1270. * @see u_strFromUTF8WithSub
  1271. * @stable ICU 3.6
  1272. */
  1273. U_STABLE char* U_EXPORT2
  1274. u_strToUTF8WithSub(char *dest,
  1275. int32_t destCapacity,
  1276. int32_t *pDestLength,
  1277. const UChar *src,
  1278. int32_t srcLength,
  1279. UChar32 subchar, int32_t *pNumSubstitutions,
  1280. UErrorCode *pErrorCode);
  1281. /**
  1282. * Convert a UTF-8 string to UTF-16.
  1283. *
  1284. * Same as u_strFromUTF8() except for the additional subchar, which is written to the output
  1285. * in place of each illegal input sequence instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1286. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
  1287. *
  1288. * @param dest A buffer for the result string. The result will be zero-terminated if
  1289. * the buffer is large enough.
  1290. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1291. * dest may be NULL and the function will only return the length of the
  1292. * result without writing any of the result string (pre-flighting).
  1293. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1294. * pDestLength!=NULL then *pDestLength is always set to the
  1295. * number of output units corresponding to the transformation of
  1296. * all the input units, even in case of a buffer overflow.
  1297. * @param src The original source string (UTF-8).
  1298. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1299. * @param subchar The substitution character to use in place of an illegal input sequence,
  1300. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1301. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1302. * except for surrogate code points (U+D800..U+DFFF).
  1303. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1304. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1305. * Set to 0 if no substitutions occur or subchar<0.
  1306. * pNumSubstitutions can be NULL.
  1307. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1308. * pass the U_SUCCESS() test, or else the function returns
  1309. * immediately. Check for U_FAILURE() on output or use with
  1310. * function chaining. (See User Guide for details.)
  1311. * @return The pointer to the destination buffer (dest).
  1312. * @see u_strFromUTF8
  1313. * @see u_strFromUTF8Lenient
  1314. * @see u_strToUTF8WithSub
  1315. * @stable ICU 3.6
  1316. */
  1317. U_STABLE UChar* U_EXPORT2
  1318. u_strFromUTF8WithSub(UChar *dest,
  1319. int32_t destCapacity,
  1320. int32_t *pDestLength,
  1321. const char *src,
  1322. int32_t srcLength,
  1323. UChar32 subchar, int32_t *pNumSubstitutions,
  1324. UErrorCode *pErrorCode);
  1325. /**
  1326. * Convert a UTF-8 string to UTF-16.
  1327. *
  1328. * Same as u_strFromUTF8() except that this function is designed to be very fast,
  1329. * which it achieves by being lenient about malformed UTF-8 sequences.
  1330. * This function is intended for use in environments where UTF-8 text is
  1331. * expected to be well-formed.
  1332. *
  1333. * Its semantics are:
  1334. * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  1335. * - The function will not read beyond the input string, nor write beyond
  1336. * the destCapacity.
  1337. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which might not
  1338. * be well-formed UTF-16.
  1339. * The function will resynchronize to valid code point boundaries
  1340. * within a small number of code points after an illegal sequence.
  1341. * - Non-shortest forms are not detected and will result in "spoofing" output.
  1342. *
  1343. * For further performance improvement, if srcLength is given (>=0),
  1344. * then destCapacity must be >= srcLength.
  1345. *
  1346. * There is no inverse u_strToUTF8Lenient() function because there is practically
  1347. * no performance gain from not checking that a UTF-16 string is well-formed.
  1348. *
  1349. * @param dest A buffer for the result string. The result will be zero-terminated if
  1350. * the buffer is large enough.
  1351. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1352. * dest may be NULL and the function will only return the length of the
  1353. * result without writing any of the result string (pre-flighting).
  1354. * Unlike for other ICU functions, if srcLength>=0 then
  1355. * destCapacity must be >= srcLength.
  1356. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1357. * pDestLength!=NULL then *pDestLength is always set to the
  1358. * number of output units corresponding to the transformation of
  1359. * all the input units, even in case of a buffer overflow.
  1360. * Unlike for other ICU functions, if srcLength>=0 but
  1361. * destCapacity<srcLength, then *pDestLength will be set to srcLength
  1362. * (and U_BUFFER_OVERFLOW_ERROR will be set)
  1363. * regardless of the actual result length.
  1364. * @param src The original source string (UTF-8, expected to be well-formed).
  1365. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1366. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1367. * pass the U_SUCCESS() test, or else the function returns
  1368. * immediately. Check for U_FAILURE() on output or use with
  1369. * function chaining. (See User Guide for details.)
  1370. * @return The pointer to the destination buffer (dest).
  1371. * @see u_strFromUTF8
  1372. * @see u_strFromUTF8WithSub
  1373. * @see u_strToUTF8WithSub
  1374. * @stable ICU 3.6
  1375. */
  1376. U_STABLE UChar * U_EXPORT2
  1377. u_strFromUTF8Lenient(UChar *dest,
  1378. int32_t destCapacity,
  1379. int32_t *pDestLength,
  1380. const char *src,
  1381. int32_t srcLength,
  1382. UErrorCode *pErrorCode);
  1383. /**
  1384. * Convert a UTF-16 string to UTF-32.
  1385. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1386. *
  1387. * @param dest A buffer for the result string. The result will be zero-terminated if
  1388. * the buffer is large enough.
  1389. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1390. * dest may be NULL and the function will only return the length of the
  1391. * result without writing any of the result string (pre-flighting).
  1392. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1393. * pDestLength!=NULL then *pDestLength is always set to the
  1394. * number of output units corresponding to the transformation of
  1395. * all the input units, even in case of a buffer overflow.
  1396. * @param src The original source string (UTF-16).
  1397. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1398. * @param pErrorCode Must be a valid pointer to an error code value,
  1399. * which must not indicate a failure before the function call.
  1400. * @return The pointer to the destination buffer (dest).
  1401. * @see u_strToUTF32WithSub
  1402. * @see u_strFromUTF32
  1403. * @stable ICU 2.0
  1404. */
  1405. U_STABLE UChar32* U_EXPORT2
  1406. u_strToUTF32(UChar32 *dest,
  1407. int32_t destCapacity,
  1408. int32_t *pDestLength,
  1409. const UChar *src,
  1410. int32_t srcLength,
  1411. UErrorCode *pErrorCode);
  1412. /**
  1413. * Convert a UTF-32 string to UTF-16.
  1414. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1415. *
  1416. * @param dest A buffer for the result string. The result will be zero-terminated if
  1417. * the buffer is large enough.
  1418. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1419. * dest may be NULL and the function will only return the length of the
  1420. * result without writing any of the result string (pre-flighting).
  1421. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1422. * pDestLength!=NULL then *pDestLength is always set to the
  1423. * number of output units corresponding to the transformation of
  1424. * all the input units, even in case of a buffer overflow.
  1425. * @param src The original source string (UTF-32).
  1426. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1427. * @param pErrorCode Must be a valid pointer to an error code value,
  1428. * which must not indicate a failure before the function call.
  1429. * @return The pointer to the destination buffer (dest).
  1430. * @see u_strFromUTF32WithSub
  1431. * @see u_strToUTF32
  1432. * @stable ICU 2.0
  1433. */
  1434. U_STABLE UChar* U_EXPORT2
  1435. u_strFromUTF32(UChar *dest,
  1436. int32_t destCapacity,
  1437. int32_t *pDestLength,
  1438. const UChar32 *src,
  1439. int32_t srcLength,
  1440. UErrorCode *pErrorCode);
  1441. /**
  1442. * Convert a UTF-16 string to UTF-32.
  1443. *
  1444. * Same as u_strToUTF32() except for the additional subchar, which is written to the output
  1445. * in place of each illegal input sequence instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1446. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
  1447. *
  1448. * @param dest A buffer for the result string. The result will be zero-terminated if
  1449. * the buffer is large enough.
  1450. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1451. * dest may be NULL and the function will only return the length of the
  1452. * result without writing any of the result string (pre-flighting).
  1453. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1454. * pDestLength!=NULL then *pDestLength is always set to the
  1455. * number of output units corresponding to the transformation of
  1456. * all the input units, even in case of a buffer overflow.
  1457. * @param src The original source string (UTF-16).
  1458. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1459. * @param subchar The substitution character to use in place of an illegal input sequence,
  1460. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1461. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1462. * except for surrogate code points (U+D800..U+DFFF).
  1463. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1464. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1465. * Set to 0 if no substitutions occur or subchar<0.
  1466. * pNumSubstitutions can be NULL.
  1467. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1468. * pass the U_SUCCESS() test, or else the function returns
  1469. * immediately. Check for U_FAILURE() on output or use with
  1470. * function chaining. (See User Guide for details.)
  1471. * @return The pointer to the destination buffer (dest).
  1472. * @see u_strToUTF32
  1473. * @see u_strFromUTF32WithSub
  1474. * @stable ICU 4.2
  1475. */
  1476. U_STABLE UChar32* U_EXPORT2
  1477. u_strToUTF32WithSub(UChar32 *dest,
  1478. int32_t destCapacity,
  1479. int32_t *pDestLength,
  1480. const UChar *src,
  1481. int32_t srcLength,
  1482. UChar32 subchar, int32_t *pNumSubstitutions,
  1483. UErrorCode *pErrorCode);
  1484. /**
  1485. * Convert a UTF-32 string to UTF-16.
  1486. *
  1487. * Same as u_strFromUTF32() except for the additional subchar, which is written to the output
  1488. * in place of each illegal input sequence instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1489. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
  1490. *
  1491. * @param dest A buffer for the result string. The result will be zero-terminated if
  1492. * the buffer is large enough.
  1493. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1494. * dest may be NULL and the function will only return the length of the
  1495. * result without writing any of the result string (pre-flighting).
  1496. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1497. * pDestLength!=NULL then *pDestLength is always set to the
  1498. * number of output units corresponding to the transformation of
  1499. * all the input units, even in case of a buffer overflow.
  1500. * @param src The original source string (UTF-32).
  1501. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1502. * @param subchar The substitution character to use in place of an illegal input sequence,
  1503. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1504. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1505. * except for surrogate code points (U+D800..U+DFFF).
  1506. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1507. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1508. * Set to 0 if no substitutions occur or subchar<0.
  1509. * pNumSubstitutions can be NULL.
  1510. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1511. * pass the U_SUCCESS() test, or else the function returns
  1512. * immediately. Check for U_FAILURE() on output or use with
  1513. * function chaining. (See User Guide for details.)
  1514. * @return The pointer to the destination buffer (dest).
  1515. * @see u_strFromUTF32
  1516. * @see u_strToUTF32WithSub
  1517. * @stable ICU 4.2
  1518. */
  1519. U_STABLE UChar* U_EXPORT2
  1520. u_strFromUTF32WithSub(UChar *dest,
  1521. int32_t destCapacity,
  1522. int32_t *pDestLength,
  1523. const UChar32 *src,
  1524. int32_t srcLength,
  1525. UChar32 subchar, int32_t *pNumSubstitutions,
  1526. UErrorCode *pErrorCode);
  1527. /**
  1528. * Convert a 16-bit Unicode string to Java Modified UTF-8.
  1529. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
  1530. *
  1531. * This function behaves according to the documentation for Java DataOutput.writeUTF()
  1532. * except that it does not encode the output length in the destination buffer
  1533. * and does not have an output length restriction.
  1534. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
  1535. *
  1536. * The input string need not be well-formed UTF-16.
  1537. * (Therefore there is no subchar parameter.)
  1538. *
  1539. * @param dest A buffer for the result string. The result will be zero-terminated if
  1540. * the buffer is large enough.
  1541. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1542. * dest may be NULL and the function will only return the length of the
  1543. * result without writing any of the result string (pre-flighting).
  1544. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1545. * pDestLength!=NULL then *pDestLength is always set to the
  1546. * number of output units corresponding to the transformation of
  1547. * all the input units, even in case of a buffer overflow.
  1548. * @param src The original source string (16-bit Unicode; need not be well-formed UTF-16).
  1549. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1550. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1551. * pass the U_SUCCESS() test, or else the function returns
  1552. * immediately. Check for U_FAILURE() on output or use with
  1553. * function chaining. (See User Guide for details.)
  1554. * @return The pointer to the destination buffer (dest).
  1555. * @stable ICU 4.4
  1556. * @see u_strToUTF8WithSub
  1557. * @see u_strFromJavaModifiedUTF8WithSub
  1558. */
  1559. U_STABLE char* U_EXPORT2
  1560. u_strToJavaModifiedUTF8(
  1561. char *dest,
  1562. int32_t destCapacity,
  1563. int32_t *pDestLength,
  1564. const UChar *src,
  1565. int32_t srcLength,
  1566. UErrorCode *pErrorCode);
  1567. /**
  1568. * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
  1569. * If the input string is not well-formed and no substitution char is specified,
  1570. * then the U_INVALID_CHAR_FOUND error code is set.
  1571. *
  1572. * This function behaves according to the documentation for Java DataInput.readUTF()
  1573. * except that it takes a length parameter rather than
  1574. * interpreting the first two input bytes as the length.
  1575. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
  1576. *
  1577. * The output string is not necessarily well-formed UTF-16.
  1578. *
  1579. * @param dest A buffer for the result string. The result will be zero-terminated if
  1580. * the buffer is large enough.
  1581. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1582. * dest may be NULL and the function will only return the length of the
  1583. * result without writing any of the result string (pre-flighting).
  1584. * @param pDestLength A pointer to receive the result length in output units; may be NULL. If
  1585. * pDestLength!=NULL then *pDestLength is always set to the
  1586. * number of output units corresponding to the transformation of
  1587. * all the input units, even in case of a buffer overflow.
  1588. * @param src The original source string (Java Modified UTF-8).
  1589. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1590. * @param subchar The substitution character to use in place of an illegal input sequence,
  1591. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1592. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1593. * except for surrogate code points (U+D800..U+DFFF).
  1594. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1595. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1596. * Set to 0 if no substitutions occur or subchar<0.
  1597. * pNumSubstitutions can be NULL.
  1598. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1599. * pass the U_SUCCESS() test, or else the function returns
  1600. * immediately. Check for U_FAILURE() on output or use with
  1601. * function chaining. (See User Guide for details.)
  1602. * @return The pointer to the destination buffer (dest).
  1603. * @see u_strFromUTF8WithSub
  1604. * @see u_strFromUTF8Lenient
  1605. * @see u_strToJavaModifiedUTF8
  1606. * @stable ICU 4.4
  1607. */
  1608. U_STABLE UChar* U_EXPORT2
  1609. u_strFromJavaModifiedUTF8WithSub(
  1610. UChar *dest,
  1611. int32_t destCapacity,
  1612. int32_t *pDestLength,
  1613. const char *src,
  1614. int32_t srcLength,
  1615. UChar32 subchar, int32_t *pNumSubstitutions,
  1616. UErrorCode *pErrorCode);
  1617. #endif