uset.h 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2014, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uset.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar07
  16. * created by: Markus W. Scherer
  17. *
  18. * C version of UnicodeSet.
  19. */
  20. /**
  21. * \file
  22. * \brief C API: Unicode Set
  23. *
  24. * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
  25. */
  26. #ifndef __USET_H__
  27. #define __USET_H__
  28. #include "unicode/utypes.h"
  29. #include "unicode/uchar.h"
  30. #include "unicode/localpointer.h"
  31. #ifndef USET_DEFINED
  32. #ifndef U_IN_DOXYGEN
  33. #define USET_DEFINED
  34. #endif
  35. /**
  36. * USet is the C API type corresponding to C++ class UnicodeSet.
  37. * Use the uset_* API to manipulate. Create with
  38. * uset_open*, and destroy with uset_close.
  39. * @stable ICU 2.4
  40. */
  41. typedef struct USet USet;
  42. #endif
  43. /**
  44. * Bitmask values to be passed to uset_openPatternOptions() or
  45. * uset_applyPattern() taking an option parameter.
  46. * @stable ICU 2.4
  47. */
  48. enum {
  49. /**
  50. * Ignore white space within patterns unless quoted or escaped.
  51. * @stable ICU 2.4
  52. */
  53. USET_IGNORE_SPACE = 1,
  54. /**
  55. * Enable case insensitive matching. E.g., "[ab]" with this flag
  56. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  57. * match all except 'a', 'A', 'b', and 'B'. This performs a full
  58. * closure over case mappings, e.g. U+017F for s.
  59. *
  60. * The resulting set is a superset of the input for the code points but
  61. * not for the strings.
  62. * It performs a case mapping closure of the code points and adds
  63. * full case folding strings for the code points, and reduces strings of
  64. * the original set to their full case folding equivalents.
  65. *
  66. * This is designed for case-insensitive matches, for example
  67. * in regular expressions. The full code point case closure allows checking of
  68. * an input character directly against the closure set.
  69. * Strings are matched by comparing the case-folded form from the closure
  70. * set with an incremental case folding of the string in question.
  71. *
  72. * The closure set will also contain single code points if the original
  73. * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
  74. * This is not necessary (that is, redundant) for the above matching method
  75. * but results in the same closure sets regardless of whether the original
  76. * set contained the code point or a string.
  77. *
  78. * @stable ICU 2.4
  79. */
  80. USET_CASE_INSENSITIVE = 2,
  81. /**
  82. * Enable case insensitive matching. E.g., "[ab]" with this flag
  83. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  84. * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
  85. * title-, and uppercase mappings as well as the case folding
  86. * of each existing element in the set.
  87. * @stable ICU 3.2
  88. */
  89. USET_ADD_CASE_MAPPINGS = 4
  90. };
  91. /**
  92. * Argument values for whether span() and similar functions continue while
  93. * the current character is contained vs. not contained in the set.
  94. *
  95. * The functionality is straightforward for sets with only single code points,
  96. * without strings (which is the common case):
  97. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
  98. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
  99. * - span() and spanBack() partition any string the same way when
  100. * alternating between span(USET_SPAN_NOT_CONTAINED) and
  101. * span(either "contained" condition).
  102. * - Using a complemented (inverted) set and the opposite span conditions
  103. * yields the same results.
  104. *
  105. * When a set contains multi-code point strings, then these statements may not
  106. * be true, depending on the strings in the set (for example, whether they
  107. * overlap with each other) and the string that is processed.
  108. * For a set with strings:
  109. * - The complement of the set contains the opposite set of code points,
  110. * but the same set of strings.
  111. * Therefore, complementing both the set and the span conditions
  112. * may yield different results.
  113. * - When starting spans at different positions in a string
  114. * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
  115. * because a set string may start before the later position.
  116. * - span(USET_SPAN_SIMPLE) may be shorter than
  117. * span(USET_SPAN_CONTAINED) because it will not recursively try
  118. * all possible paths.
  119. * For example, with a set which contains the three strings "xy", "xya" and "ax",
  120. * span("xyax", USET_SPAN_CONTAINED) will return 4 but
  121. * span("xyax", USET_SPAN_SIMPLE) will return 3.
  122. * span(USET_SPAN_SIMPLE) will never be longer than
  123. * span(USET_SPAN_CONTAINED).
  124. * - With either "contained" condition, span() and spanBack() may partition
  125. * a string in different ways.
  126. * For example, with a set which contains the two strings "ab" and "ba",
  127. * and when processing the string "aba",
  128. * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
  129. * while spanBack() will yield boundaries of { 0, 1, 3 }.
  130. *
  131. * Note: If it is important to get the same boundaries whether iterating forward
  132. * or backward through a string, then either only span() should be used and
  133. * the boundaries cached for backward operation, or an ICU BreakIterator
  134. * could be used.
  135. *
  136. * Note: Unpaired surrogates are treated like surrogate code points.
  137. * Similarly, set strings match only on code point boundaries,
  138. * never in the middle of a surrogate pair.
  139. * Illegal UTF-8 sequences are treated like U+FFFD.
  140. * When processing UTF-8 strings, malformed set strings
  141. * (strings with unpaired surrogates which cannot be converted to UTF-8)
  142. * are ignored.
  143. *
  144. * @stable ICU 3.8
  145. */
  146. typedef enum USetSpanCondition {
  147. /**
  148. * Continues a span() while there is no set element at the current position.
  149. * Increments by one code point at a time.
  150. * Stops before the first set element (character or string).
  151. * (For code points only, this is like while contains(current)==FALSE).
  152. *
  153. * When span() returns, the substring between where it started and the position
  154. * it returned consists only of characters that are not in the set,
  155. * and none of its strings overlap with the span.
  156. *
  157. * @stable ICU 3.8
  158. */
  159. USET_SPAN_NOT_CONTAINED = 0,
  160. /**
  161. * Spans the longest substring that is a concatenation of set elements (characters or strings).
  162. * (For characters only, this is like while contains(current)==TRUE).
  163. *
  164. * When span() returns, the substring between where it started and the position
  165. * it returned consists only of set elements (characters or strings) that are in the set.
  166. *
  167. * If a set contains strings, then the span will be the longest substring for which there
  168. * exists at least one non-overlapping concatenation of set elements (characters or strings).
  169. * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
  170. * (Java/ICU/Perl regex stops at the first match of an OR.)
  171. *
  172. * @stable ICU 3.8
  173. */
  174. USET_SPAN_CONTAINED = 1,
  175. /**
  176. * Continues a span() while there is a set element at the current position.
  177. * Increments by the longest matching element at each position.
  178. * (For characters only, this is like while contains(current)==TRUE).
  179. *
  180. * When span() returns, the substring between where it started and the position
  181. * it returned consists only of set elements (characters or strings) that are in the set.
  182. *
  183. * If a set only contains single characters, then this is the same
  184. * as USET_SPAN_CONTAINED.
  185. *
  186. * If a set contains strings, then the span will be the longest substring
  187. * with a match at each position with the longest single set element (character or string).
  188. *
  189. * Use this span condition together with other longest-match algorithms,
  190. * such as ICU converters (ucnv_getUnicodeSet()).
  191. *
  192. * @stable ICU 3.8
  193. */
  194. USET_SPAN_SIMPLE = 2,
  195. #ifndef U_HIDE_DEPRECATED_API
  196. /**
  197. * One more than the last span condition.
  198. * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
  199. */
  200. USET_SPAN_CONDITION_COUNT
  201. #endif // U_HIDE_DEPRECATED_API
  202. } USetSpanCondition;
  203. enum {
  204. /**
  205. * Capacity of USerializedSet::staticArray.
  206. * Enough for any single-code point set.
  207. * Also provides padding for nice sizeof(USerializedSet).
  208. * @stable ICU 2.4
  209. */
  210. USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
  211. };
  212. /**
  213. * A serialized form of a Unicode set. Limited manipulations are
  214. * possible directly on a serialized set. See below.
  215. * @stable ICU 2.4
  216. */
  217. typedef struct USerializedSet {
  218. /**
  219. * The serialized Unicode Set.
  220. * @stable ICU 2.4
  221. */
  222. const uint16_t *array;
  223. /**
  224. * The length of the array that contains BMP characters.
  225. * @stable ICU 2.4
  226. */
  227. int32_t bmpLength;
  228. /**
  229. * The total length of the array.
  230. * @stable ICU 2.4
  231. */
  232. int32_t length;
  233. /**
  234. * A small buffer for the array to reduce memory allocations.
  235. * @stable ICU 2.4
  236. */
  237. uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
  238. } USerializedSet;
  239. /*********************************************************************
  240. * USet API
  241. *********************************************************************/
  242. /**
  243. * Create an empty USet object.
  244. * Equivalent to uset_open(1, 0).
  245. * @return a newly created USet. The caller must call uset_close() on
  246. * it when done.
  247. * @stable ICU 4.2
  248. */
  249. U_STABLE USet* U_EXPORT2
  250. uset_openEmpty(void);
  251. /**
  252. * Creates a USet object that contains the range of characters
  253. * start..end, inclusive. If <code>start > end</code>
  254. * then an empty set is created (same as using uset_openEmpty()).
  255. * @param start first character of the range, inclusive
  256. * @param end last character of the range, inclusive
  257. * @return a newly created USet. The caller must call uset_close() on
  258. * it when done.
  259. * @stable ICU 2.4
  260. */
  261. U_STABLE USet* U_EXPORT2
  262. uset_open(UChar32 start, UChar32 end);
  263. /**
  264. * Creates a set from the given pattern. See the UnicodeSet class
  265. * description for the syntax of the pattern language.
  266. * @param pattern a string specifying what characters are in the set
  267. * @param patternLength the length of the pattern, or -1 if null
  268. * terminated
  269. * @param ec the error code
  270. * @stable ICU 2.4
  271. */
  272. U_STABLE USet* U_EXPORT2
  273. uset_openPattern(const UChar* pattern, int32_t patternLength,
  274. UErrorCode* ec);
  275. /**
  276. * Creates a set from the given pattern. See the UnicodeSet class
  277. * description for the syntax of the pattern language.
  278. * @param pattern a string specifying what characters are in the set
  279. * @param patternLength the length of the pattern, or -1 if null
  280. * terminated
  281. * @param options bitmask for options to apply to the pattern.
  282. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  283. * @param ec the error code
  284. * @stable ICU 2.4
  285. */
  286. U_STABLE USet* U_EXPORT2
  287. uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
  288. uint32_t options,
  289. UErrorCode* ec);
  290. /**
  291. * Disposes of the storage used by a USet object. This function should
  292. * be called exactly once for each object created with a uset_open*() function.
  293. * @param set the object to dispose of
  294. * @stable ICU 2.4
  295. */
  296. U_STABLE void U_EXPORT2
  297. uset_close(USet* set);
  298. #if U_SHOW_CPLUSPLUS_API
  299. U_NAMESPACE_BEGIN
  300. /**
  301. * \class LocalUSetPointer
  302. * "Smart pointer" class, closes a USet via uset_close().
  303. * For most methods see the LocalPointerBase base class.
  304. *
  305. * @see LocalPointerBase
  306. * @see LocalPointer
  307. * @stable ICU 4.4
  308. */
  309. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
  310. U_NAMESPACE_END
  311. #endif
  312. /**
  313. * Returns a copy of this object.
  314. * If this set is frozen, then the clone will be frozen as well.
  315. * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
  316. * @param set the original set
  317. * @return the newly allocated copy of the set
  318. * @see uset_cloneAsThawed
  319. * @stable ICU 3.8
  320. */
  321. U_STABLE USet * U_EXPORT2
  322. uset_clone(const USet *set);
  323. /**
  324. * Determines whether the set has been frozen (made immutable) or not.
  325. * See the ICU4J Freezable interface for details.
  326. * @param set the set
  327. * @return TRUE/FALSE for whether the set has been frozen
  328. * @see uset_freeze
  329. * @see uset_cloneAsThawed
  330. * @stable ICU 3.8
  331. */
  332. U_STABLE UBool U_EXPORT2
  333. uset_isFrozen(const USet *set);
  334. /**
  335. * Freeze the set (make it immutable).
  336. * Once frozen, it cannot be unfrozen and is therefore thread-safe
  337. * until it is deleted.
  338. * See the ICU4J Freezable interface for details.
  339. * Freezing the set may also make some operations faster, for example
  340. * uset_contains() and uset_span().
  341. * A frozen set will not be modified. (It remains frozen.)
  342. * @param set the set to freeze; it is frozen in place
  343. * (This function returns nothing: the set passed in is the one that becomes frozen.)
  344. * @see uset_isFrozen
  345. * @see uset_cloneAsThawed
  346. * @stable ICU 3.8
  347. */
  348. U_STABLE void U_EXPORT2
  349. uset_freeze(USet *set);
  350. /**
  351. * Clone the set and make the clone mutable.
  352. * See the ICU4J Freezable interface for details.
  353. * @param set the set
  354. * @return the mutable clone
  355. * @see uset_freeze
  356. * @see uset_isFrozen
  357. * @see uset_clone
  358. * @stable ICU 3.8
  359. */
  360. U_STABLE USet * U_EXPORT2
  361. uset_cloneAsThawed(const USet *set);
  362. /**
  363. * Causes the USet object to represent the range <code>start - end</code>.
  364. * If <code>start > end</code> then this USet is set to an empty range.
  365. * A frozen set will not be modified.
  366. * @param set the object to set to the given range
  367. * @param start first character in the set, inclusive
  368. * @param end last character in the set, inclusive
  369. * @stable ICU 3.2
  370. */
  371. U_STABLE void U_EXPORT2
  372. uset_set(USet* set,
  373. UChar32 start, UChar32 end);
  374. /**
  375. * Modifies the set to represent the set specified by the given
  376. * pattern. See the UnicodeSet class description for the syntax of
  377. * the pattern language. See also the User Guide chapter about UnicodeSet.
  378. * <em>Empties the set passed before applying the pattern.</em>
  379. * A frozen set will not be modified.
  380. * @param set The set to which the pattern is to be applied.
  381. * @param pattern A pointer to UChar string specifying what characters are in the set.
  382. * The character at pattern[0] must be a '['.
  383. * @param patternLength The length of the UChar string. -1 if NUL terminated.
  384. * @param options A bitmask for options to apply to the pattern.
  385. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  386. * @param status Returns an error if the pattern cannot be parsed.
  387. * @return Upon successful parse, the value is
  388. * the index of the character after the closing ']'
  389. * of the parsed pattern.
  390. * If the status code indicates failure, then the return value
  391. * is the index of the error in the source.
  392. *
  393. * @stable ICU 2.8
  394. */
  395. U_STABLE int32_t U_EXPORT2
  396. uset_applyPattern(USet *set,
  397. const UChar *pattern, int32_t patternLength,
  398. uint32_t options,
  399. UErrorCode *status);
  400. /**
  401. * Modifies the set to contain those code points which have the given value
  402. * for the given binary or enumerated property, as returned by
  403. * u_getIntPropertyValue. Prior contents of this set are lost.
  404. * A frozen set will not be modified.
  405. *
  406. * @param set the object to contain the code points defined by the property
  407. *
  408. * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
  409. * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
  410. * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
  411. *
  412. * @param value a value in the range u_getIntPropertyMinValue(prop)..
  413. * u_getIntPropertyMaxValue(prop), with one exception. If prop is
  414. * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
  415. * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
  416. * categories such as [:L:] to be represented.
  417. *
  418. * @param ec error code input/output parameter
  419. *
  420. * @stable ICU 3.2
  421. */
  422. U_STABLE void U_EXPORT2
  423. uset_applyIntPropertyValue(USet* set,
  424. UProperty prop, int32_t value, UErrorCode* ec);
  425. /**
  426. * Modifies the set to contain those code points which have the
  427. * given value for the given property. Prior contents of this
  428. * set are lost.
  429. * A frozen set will not be modified.
  430. *
  431. * @param set the object to contain the code points defined by the given
  432. * property and value alias
  433. *
  434. * @param prop a string specifying a property alias, either short or long.
  435. * The name is matched loosely. See PropertyAliases.txt for names and a
  436. * description of loose matching. If the value string is empty, then this
  437. * string is interpreted as either a General_Category value alias, a Script
  438. * value alias, a binary property alias, or a special ID. Special IDs are
  439. * matched loosely and correspond to the following sets:
  440. *
  441. * "ANY" = [\\u0000-\\U0010FFFF],
  442. * "ASCII" = [\\u0000-\\u007F],
  443. * "Assigned" = [:^Cn:].
  444. *
  445. * @param propLength the length of the prop, or -1 if NUL-terminated
  446. *
  447. * @param value a string specifying a value alias, either short or long.
  448. * The name is matched loosely. See PropertyValueAliases.txt for names
  449. * and a description of loose matching. In addition to aliases listed,
  450. * numeric values and canonical combining classes may be expressed
  451. * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
  452. * may also be empty.
  453. *
  454. * @param valueLength the length of the value, or -1 if NUL-terminated
  455. *
  456. * @param ec error code input/output parameter
  457. *
  458. * @stable ICU 3.2
  459. */
  460. U_STABLE void U_EXPORT2
  461. uset_applyPropertyAlias(USet* set,
  462. const UChar *prop, int32_t propLength,
  463. const UChar *value, int32_t valueLength,
  464. UErrorCode* ec);
  465. /**
  466. * Return true if the given position, in the given pattern, appears
  467. * to be the start of a UnicodeSet pattern.
  468. *
  469. * @param pattern a string specifying the pattern
  470. * @param patternLength the length of the pattern, or -1 if NUL-terminated
  471. * @param pos the given position
  472. * @stable ICU 3.2
  473. */
  474. U_STABLE UBool U_EXPORT2
  475. uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
  476. int32_t pos);
  477. /**
  478. * Returns a string representation of this set. If the result of
  479. * calling this function is passed to a uset_openPattern(), it
  480. * will produce another set that is equal to this one.
  481. * @param set the set
  482. * @param result the string to receive the rules, may be NULL
  483. * @param resultCapacity the capacity of result, may be 0 if result is NULL
  484. * @param escapeUnprintable if TRUE then convert unprintable
  485. * character to their hex escape representations, \\uxxxx or
  486. * \\Uxxxxxxxx. Unprintable characters are those other than
  487. * U+000A, U+0020..U+007E.
  488. * @param ec error code.
  489. * @return length of string, possibly larger than resultCapacity
  490. * @stable ICU 2.4
  491. */
  492. U_STABLE int32_t U_EXPORT2
  493. uset_toPattern(const USet* set,
  494. UChar* result, int32_t resultCapacity,
  495. UBool escapeUnprintable,
  496. UErrorCode* ec);
  497. /**
  498. * Adds the given character to the given USet. After this call,
  499. * uset_contains(set, c) will return TRUE.
  500. * A frozen set will not be modified.
  501. * @param set the object to which to add the character
  502. * @param c the character to add
  503. * @stable ICU 2.4
  504. */
  505. U_STABLE void U_EXPORT2
  506. uset_add(USet* set, UChar32 c);
  507. /**
  508. * Adds all of the elements in the specified set to this set if
  509. * they're not already present. This operation effectively
  510. * modifies this set so that its value is the <i>union</i> of the two
  511. * sets. The behavior of this operation is unspecified if the specified
  512. * collection is modified while the operation is in progress.
  513. * A frozen set will not be modified.
  514. *
  515. * @param set the object to which to add the set
  516. * @param additionalSet the source set whose elements are to be added to this set.
  517. * @stable ICU 2.6
  518. */
  519. U_STABLE void U_EXPORT2
  520. uset_addAll(USet* set, const USet *additionalSet);
  521. /**
  522. * Adds the given range of characters to the given USet. After this call,
  523. * uset_contains(set, c) will return TRUE for every c in start..end.
  524. * A frozen set will not be modified.
  525. * @param set the object to which to add the range
  526. * @param start the first character of the range to add, inclusive
  527. * @param end the last character of the range to add, inclusive
  528. * @stable ICU 2.2
  529. */
  530. U_STABLE void U_EXPORT2
  531. uset_addRange(USet* set, UChar32 start, UChar32 end);
  532. /**
  533. * Adds the given string to the given USet. After this call,
  534. * uset_containsString(set, str, strLen) will return TRUE.
  535. * A frozen set will not be modified.
  536. * @param set the object to which to add the string
  537. * @param str the string to add
  538. * @param strLen the length of the string or -1 if null terminated.
  539. * @stable ICU 2.4
  540. */
  541. U_STABLE void U_EXPORT2
  542. uset_addString(USet* set, const UChar* str, int32_t strLen);
  543. /**
  544. * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
  545. * If this set already contains any particular character, it has no effect on that character.
  546. * A frozen set will not be modified.
  547. * @param set the object to which to add the characters
  548. * @param str the source string
  549. * @param strLen the length of the string or -1 if null terminated.
  550. * @stable ICU 3.4
  551. */
  552. U_STABLE void U_EXPORT2
  553. uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
  554. /**
  555. * Removes the given character from the given USet. After this call,
  556. * uset_contains(set, c) will return FALSE.
  557. * A frozen set will not be modified.
  558. * @param set the object from which to remove the character
  559. * @param c the character to remove
  560. * @stable ICU 2.4
  561. */
  562. U_STABLE void U_EXPORT2
  563. uset_remove(USet* set, UChar32 c);
  564. /**
  565. * Removes the given range of characters from the given USet. After this call,
  566. * uset_contains(set, c) will return FALSE for every c in start..end.
  567. * A frozen set will not be modified.
  568. * @param set the object from which to remove the range
  569. * @param start the first character of the range to remove, inclusive
  570. * @param end the last character of the range to remove, inclusive
  571. * @stable ICU 2.2
  572. */
  573. U_STABLE void U_EXPORT2
  574. uset_removeRange(USet* set, UChar32 start, UChar32 end);
  575. /**
  576. * Removes the given string from the given USet. After this call,
  577. * uset_containsString(set, str, strLen) will return FALSE.
  578. * A frozen set will not be modified.
  579. * @param set the object from which to remove the string
  580. * @param str the string to remove
  581. * @param strLen the length of the string or -1 if null terminated.
  582. * @stable ICU 2.4
  583. */
  584. U_STABLE void U_EXPORT2
  585. uset_removeString(USet* set, const UChar* str, int32_t strLen);
  586. /**
  587. * Removes from this set all of its elements that are contained in the
  588. * specified set. This operation effectively modifies this
  589. * set so that its value is the <i>asymmetric set difference</i> of
  590. * the two sets.
  591. * A frozen set will not be modified.
  592. * @param set the object from which the elements are to be removed
  593. * @param removeSet the object that defines which elements will be
  594. * removed from this set
  595. * @stable ICU 3.2
  596. */
  597. U_STABLE void U_EXPORT2
  598. uset_removeAll(USet* set, const USet* removeSet);
  599. /**
  600. * Retain only the elements in this set that are contained in the
  601. * specified range. If <code>start > end</code> then an empty range is
  602. * retained, leaving the set empty. This is equivalent to
  603. * a boolean logic AND, or a set INTERSECTION.
  604. * A frozen set will not be modified.
  605. *
  606. * @param set the object for which to retain only the specified range
  607. * @param start first character, inclusive, of range to be retained
  608. * in this set.
  609. * @param end last character, inclusive, of range to be retained
  610. * in this set.
  611. * @stable ICU 3.2
  612. */
  613. U_STABLE void U_EXPORT2
  614. uset_retain(USet* set, UChar32 start, UChar32 end);
  615. /**
  616. * Retains only the elements in this set that are contained in the
  617. * specified set. In other words, removes from this set all of
  618. * its elements that are not contained in the specified set. This
  619. * operation effectively modifies this set so that its value is
  620. * the <i>intersection</i> of the two sets.
  621. * A frozen set will not be modified.
  622. *
  623. * @param set the object on which to perform the retain
  624. * @param retain set that defines which elements this set will retain
  625. * @stable ICU 3.2
  626. */
  627. U_STABLE void U_EXPORT2
  628. uset_retainAll(USet* set, const USet* retain);
  629. /**
  630. * Reallocate this object's internal structures to take up the least
  631. * possible space, without changing this object's value.
  632. * A frozen set will not be modified.
  633. *
  634. * @param set the object on which to perform the compact
  635. * @stable ICU 3.2
  636. */
  637. U_STABLE void U_EXPORT2
  638. uset_compact(USet* set);
  639. /**
  640. * Inverts this set. This operation modifies this set so that
  641. * its value is its complement. This operation does not affect
  642. * the multicharacter strings, if any.
  643. * A frozen set will not be modified.
  644. * @param set the set
  645. * @stable ICU 2.4
  646. */
  647. U_STABLE void U_EXPORT2
  648. uset_complement(USet* set);
  649. /**
  650. * Complements in this set all elements contained in the specified
  651. * set. Any character in the other set will be removed if it is
  652. * in this set, or will be added if it is not in this set.
  653. * A frozen set will not be modified.
  654. *
  655. * @param set the set to modify
  656. * @param complement set that defines which elements will be xor'ed
  657. * with this set.
  658. * @stable ICU 3.2
  659. */
  660. U_STABLE void U_EXPORT2
  661. uset_complementAll(USet* set, const USet* complement);
  662. /**
  663. * Removes all of the elements from this set. Unless the set is
  664. * frozen, it will be empty after this call returns.
  665. * A frozen set will not be modified.
  666. * @param set the set to clear
  667. * @stable ICU 2.4
  668. */
  669. U_STABLE void U_EXPORT2
  670. uset_clear(USet* set);
  671. /**
  672. * Closes this set over the given attribute. For the attribute
  673. * USET_CASE, the result is to modify this set so that:
  674. *
  675. * 1. For each character or string 'a' in this set, all strings or
  676. * characters 'b' such that foldCase(a) == foldCase(b) are added
  677. * to this set.
  678. *
  679. * 2. For each string 'e' in the resulting set, if e !=
  680. * foldCase(e), 'e' will be removed.
  681. *
  682. * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  683. *
  684. * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  685. * == b denotes that the contents are the same, not pointer
  686. * comparison.)
  687. *
  688. * A frozen set will not be modified.
  689. *
  690. * @param set the set to modify
  691. *
  692. * @param attributes bitmask for attributes to close over.
  693. * Currently only the USET_CASE bit is supported. Any undefined bits
  694. * are ignored.
  695. * @stable ICU 4.2
  696. */
  697. U_STABLE void U_EXPORT2
  698. uset_closeOver(USet* set, int32_t attributes);
  699. /**
  700. * Removes all strings from this set.
  701. *
  702. * @param set the set to modify
  703. * @stable ICU 4.2
  704. */
  705. U_STABLE void U_EXPORT2
  706. uset_removeAllStrings(USet* set);
  707. /**
  708. * Returns TRUE if the given USet contains no characters and no
  709. * strings.
  710. * @param set the set to test
  711. * @return TRUE if set is empty
  712. * @stable ICU 2.4
  713. */
  714. U_STABLE UBool U_EXPORT2
  715. uset_isEmpty(const USet* set);
  716. /**
  717. * Returns TRUE if the given USet contains the given character.
  718. * This function works faster with a frozen set.
  719. * @param set the set to test
  720. * @param c the code point to check for within the set
  721. * @return TRUE if set contains c
  722. * @stable ICU 2.4
  723. */
  724. U_STABLE UBool U_EXPORT2
  725. uset_contains(const USet* set, UChar32 c);
  726. /**
  727. * Returns TRUE if the given USet contains all characters c
  728. * where start <= c && c <= end.
  729. * @param set the set to test
  730. * @param start the first character of the range to test, inclusive
  731. * @param end the last character of the range to test, inclusive
  732. * @return TRUE if set contains every character in the range
  733. * @stable ICU 2.2
  734. */
  735. U_STABLE UBool U_EXPORT2
  736. uset_containsRange(const USet* set, UChar32 start, UChar32 end);
  737. /**
  738. * Returns TRUE if the given USet contains the given string.
  739. * @param set the set to test
  740. * @param str the string
  741. * @param strLen the length of the string, or -1 if it is NUL-terminated.
  742. * @return TRUE if set contains str
  743. * @stable ICU 2.4
  744. */
  745. U_STABLE UBool U_EXPORT2
  746. uset_containsString(const USet* set, const UChar* str, int32_t strLen);
  747. /**
  748. * Returns the index of the given character within this set, where
  749. * the set is ordered by ascending code point. If the character
  750. * is not in this set, returns -1. The inverse of this function is
  751. * <code>uset_charAt()</code>.
  752. * @param set the set
  753. * @param c the character to obtain the index for
  754. * @return an index from 0..size()-1, or -1
  755. * @stable ICU 3.2
  756. */
  757. U_STABLE int32_t U_EXPORT2
  758. uset_indexOf(const USet* set, UChar32 c);
  759. /**
  760. * Returns the character at the given index within this set, where
  761. * the set is ordered by ascending code point. If the index is
  762. * out of range, returns (UChar32)-1. The inverse of this function is
  763. * <code>uset_indexOf()</code>.
  764. * @param set the set
  765. * @param charIndex an index from 0..size()-1 to obtain the char for
  766. * @return the character at the given index, or (UChar32)-1.
  767. * @stable ICU 3.2
  768. */
  769. U_STABLE UChar32 U_EXPORT2
  770. uset_charAt(const USet* set, int32_t charIndex);
  771. /**
  772. * Returns the number of characters and strings contained in the given
  773. * USet.
  774. * @param set the set to measure
  775. * @return a non-negative integer counting the characters and strings
  776. * contained in set
  777. * @stable ICU 2.4
  778. */
  779. U_STABLE int32_t U_EXPORT2
  780. uset_size(const USet* set);
  781. /**
  782. * Returns the number of items in this set. An item is either a range
  783. * of characters or a single multicharacter string.
  784. * @param set the set to measure
  785. * @return a non-negative integer counting the character ranges
  786. * and/or strings contained in set
  787. * @stable ICU 2.4
  788. */
  789. U_STABLE int32_t U_EXPORT2
  790. uset_getItemCount(const USet* set);
  791. /**
  792. * Returns an item of this set. An item is either a range of
  793. * characters or a single multicharacter string.
  794. * @param set the set to read from
  795. * @param itemIndex a non-negative integer in the range 0..
  796. * uset_getItemCount(set)-1
  797. * @param start pointer to variable to receive the first character
  798. * in the range, inclusive
  799. * @param end pointer to variable to receive the last character in the range,
  800. * inclusive
  801. * @param str buffer to receive the string, may be NULL
  802. * @param strCapacity capacity of str, or 0 if str is NULL
  803. * @param ec pointer to the error code
  804. * @return the length of the string (>= 2), or 0 if the item is a
  805. * range, in which case it is the range *start..*end, or -1 if
  806. * itemIndex is out of range
  807. * @stable ICU 2.4
  808. */
  809. U_STABLE int32_t U_EXPORT2
  810. uset_getItem(const USet* set, int32_t itemIndex,
  811. UChar32* start, UChar32* end,
  812. UChar* str, int32_t strCapacity,
  813. UErrorCode* ec);
  814. /**
  815. * Returns true if set1 contains all the characters and strings
  816. * of set2. It answers the question, 'Is set1 a superset of set2?'
  817. * @param set1 the candidate superset
  818. * @param set2 the set whose characters and strings are looked for in set1
  819. * @return true if set1 contains all the characters and strings of set2
  820. * @stable ICU 3.2
  821. */
  822. U_STABLE UBool U_EXPORT2
  823. uset_containsAll(const USet* set1, const USet* set2);
  824. /**
  825. * Returns true if this set contains all the characters
  826. * of the given string. This does not check containment of grapheme
  827. * clusters, as uset_containsString does.
  828. * @param set set of characters to be checked for containment
  829. * @param str string containing code points to be checked for containment
  830. * @param strLen the length of the string, or -1 if it is NUL-terminated.
  831. * @return true if set contains all the characters of str
  832. * @stable ICU 3.4
  833. */
  834. U_STABLE UBool U_EXPORT2
  835. uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
  836. /**
  837. * Returns true if set1 contains none of the characters and strings
  838. * of set2. It answers the question, 'Are set1 and set2 disjoint?'
  839. * @param set1 the set to search in
  840. * @param set2 the set whose characters and strings are looked for in set1
  841. * @return true if set1 contains none of the characters and strings of set2
  842. * @stable ICU 3.2
  843. */
  844. U_STABLE UBool U_EXPORT2
  845. uset_containsNone(const USet* set1, const USet* set2);
  846. /**
  847. * Returns true if set1 contains some of the characters and strings
  848. * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
  849. * @param set1 the set to search in
  850. * @param set2 the set whose characters and strings are looked for in set1
  851. * @return true if set1 contains some of the characters and strings of set2
  852. * @stable ICU 3.2
  853. */
  854. U_STABLE UBool U_EXPORT2
  855. uset_containsSome(const USet* set1, const USet* set2);
  856. /**
  857. * Returns the length of the initial substring of the input string which
  858. * consists only of characters and strings that are contained in this set
  859. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  860. * or only of characters and strings that are not contained
  861. * in this set (USET_SPAN_NOT_CONTAINED).
  862. * See USetSpanCondition for details.
  863. * Similar to the strspn() C library function.
  864. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  865. * This function works faster with a frozen set and with a non-negative string length argument.
  866. * @param set the set
  867. * @param s start of the string
  868. * @param length the length of the string; can be -1 for NUL-terminated
  869. * @param spanCondition specifies the containment condition
  870. * @return the length of the initial substring according to the spanCondition;
  871. * 0 if the start of the string does not fit the spanCondition
  872. * @stable ICU 3.8
  873. * @see USetSpanCondition
  874. */
  875. U_STABLE int32_t U_EXPORT2
  876. uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  877. /**
  878. * Returns the start of the trailing substring of the input string which
  879. * consists only of characters and strings that are contained in this set
  880. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  881. * or only of characters and strings that are not contained
  882. * in this set (USET_SPAN_NOT_CONTAINED).
  883. * See USetSpanCondition for details.
  884. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  885. * This function works faster with a frozen set and with a non-negative string length argument.
  886. * @param set the set
  887. * @param s start of the string
  888. * @param length the length of the string; can be -1 for NUL-terminated
  889. * @param spanCondition specifies the containment condition
  890. * @return the start of the trailing substring according to the spanCondition;
  891. * the string length if the end of the string does not fit the spanCondition
  892. * @stable ICU 3.8
  893. * @see USetSpanCondition
  894. */
  895. U_STABLE int32_t U_EXPORT2
  896. uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  897. /**
  898. * Returns the length of the initial substring of the input string which
  899. * consists only of characters and strings that are contained in this set
  900. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  901. * or only of characters and strings that are not contained
  902. * in this set (USET_SPAN_NOT_CONTAINED).
  903. * See USetSpanCondition for details.
  904. * Similar to the strspn() C library function.
  905. * Malformed byte sequences are treated according to contains(0xfffd).
  906. * This function works faster with a frozen set and with a non-negative string length argument.
  907. * @param set the set
  908. * @param s start of the string (UTF-8)
  909. * @param length the length of the string; can be -1 for NUL-terminated
  910. * @param spanCondition specifies the containment condition
  911. * @return the length of the initial substring according to the spanCondition;
  912. * 0 if the start of the string does not fit the spanCondition
  913. * @stable ICU 3.8
  914. * @see USetSpanCondition
  915. */
  916. U_STABLE int32_t U_EXPORT2
  917. uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  918. /**
  919. * Returns the start of the trailing substring of the input string which
  920. * consists only of characters and strings that are contained in this set
  921. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  922. * or only of characters and strings that are not contained
  923. * in this set (USET_SPAN_NOT_CONTAINED).
  924. * See USetSpanCondition for details.
  925. * Malformed byte sequences are treated according to contains(0xfffd).
  926. * This function works faster with a frozen set and with a non-negative string length argument.
  927. * @param set the set
  928. * @param s start of the string (UTF-8)
  929. * @param length the length of the string; can be -1 for NUL-terminated
  930. * @param spanCondition specifies the containment condition
  931. * @return the start of the trailing substring according to the spanCondition;
  932. * the string length if the end of the string does not fit the spanCondition
  933. * @stable ICU 3.8
  934. * @see USetSpanCondition
  935. */
  936. U_STABLE int32_t U_EXPORT2
  937. uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  938. /**
  939. * Returns true if set1 contains all of the characters and strings
  940. * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
  941. * @param set1 the first set to compare
  942. * @param set2 the second set to compare
  943. * @return true if each set contains all the characters and strings of the other
  944. * @stable ICU 3.2
  945. */
  946. U_STABLE UBool U_EXPORT2
  947. uset_equals(const USet* set1, const USet* set2);
  948. /*********************************************************************
  949. * Serialized set API
  950. *********************************************************************/
  951. /**
  952. * Serializes this set into an array of 16-bit integers. Serialization
  953. * (currently) only records the characters in the set; multicharacter
  954. * strings are ignored.
  955. *
  956. * The array
  957. * has the following format (each line is one 16-bit integer):
  958. *
  959. * length = (n+2*m) | (m!=0?0x8000:0)
  960. * bmpLength = n; present if m!=0
  961. * bmp[0]
  962. * bmp[1]
  963. * ...
  964. * bmp[n-1]
  965. * supp-high[0]
  966. * supp-low[0]
  967. * supp-high[1]
  968. * supp-low[1]
  969. * ...
  970. * supp-high[m-1]
  971. * supp-low[m-1]
  972. *
  973. * The array starts with a header. After the header are n bmp
  974. * code points, then m supplementary code points. Either n or m
  975. * or both may be zero. n+2*m is always <= 0x7FFF.
  976. *
  977. * If there are no supplementary characters (if m==0) then the
  978. * header is one 16-bit integer, 'length', with value n.
  979. *
  980. * If there are supplementary characters (if m!=0) then the header
  981. * is two 16-bit integers. The first, 'length', has value
  982. * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
  983. *
  984. * After the header the code points are stored in ascending order.
  985. * Supplementary code points are stored as most significant 16
  986. * bits followed by least significant 16 bits.
  987. *
  988. * @param set the set to serialize
  989. * @param dest pointer to buffer of destCapacity 16-bit integers.
  990. * May be NULL only if destCapacity is zero.
  991. * @param destCapacity size of dest, or zero. Must not be negative.
  992. * @param pErrorCode pointer to the error code. Will be set to
  993. * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
  994. * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
  995. * @return the total length of the serialized format, including
  996. * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  997. * than U_BUFFER_OVERFLOW_ERROR.
  998. * @stable ICU 2.4
  999. */
  1000. U_STABLE int32_t U_EXPORT2
  1001. uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
  1002. /**
  1003. * Given a serialized array, fills in the given serialized set object.
  1004. * @param fillSet pointer to the USerializedSet to fill in
  1005. * @param src pointer to the start of the serialized array
  1006. * @param srcLength length of the array
  1007. * @return true if the given array is valid, otherwise false
  1008. * @stable ICU 2.4
  1009. */
  1010. U_STABLE UBool U_EXPORT2
  1011. uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
  1012. /**
  1013. * Sets the USerializedSet to contain the given character (and nothing
  1014. * else).
  1015. * @param fillSet pointer to the USerializedSet to fill in
  1016. * @param c the code point to set
  1017. * @stable ICU 2.4
  1018. */
  1019. U_STABLE void U_EXPORT2
  1020. uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
  1021. /**
  1022. * Returns TRUE if the given USerializedSet contains the given
  1023. * character.
  1024. * @param set the serialized set
  1025. * @param c the code point to check for within the set
  1026. * @return TRUE if set contains c
  1027. * @stable ICU 2.4
  1028. */
  1029. U_STABLE UBool U_EXPORT2
  1030. uset_serializedContains(const USerializedSet* set, UChar32 c);
  1031. /**
  1032. * Returns the number of disjoint ranges of characters contained in
  1033. * the given serialized set. Ignores any strings contained in the
  1034. * set.
  1035. * @param set the serialized set to measure
  1036. * @return a non-negative integer counting the character ranges
  1037. * contained in set
  1038. * @stable ICU 2.4
  1039. */
  1040. U_STABLE int32_t U_EXPORT2
  1041. uset_getSerializedRangeCount(const USerializedSet* set);
  1042. /**
  1043. * Returns a range of characters contained in the given serialized
  1044. * set.
  1045. * @param set the serialized set to read from
  1046. * @param rangeIndex a non-negative integer in the range 0..
  1047. * uset_getSerializedRangeCount(set)-1
  1048. * @param pStart pointer to variable to receive the first character
  1049. * in the range, inclusive
  1050. * @param pEnd pointer to variable to receive the last character in the range,
  1051. * inclusive
  1052. * @return true if rangeIndex is valid, otherwise false
  1053. * @stable ICU 2.4
  1054. */
  1055. U_STABLE UBool U_EXPORT2
  1056. uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
  1057. UChar32* pStart, UChar32* pEnd);
  1058. #endif