uspoof.h 66 KB


  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ***************************************************************************
  5. * Copyright (C) 2008-2016, International Business Machines Corporation
  6. * and others. All Rights Reserved.
  7. ***************************************************************************
  8. * file name: uspoof.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2008Feb13
  14. * created by: Andy Heninger
  15. *
  16. * Unicode Spoof Detection
  17. */
  18. #ifndef USPOOF_H
  19. #define USPOOF_H
  20. #include "unicode/utypes.h"
  21. #include "unicode/uset.h"
  22. #include "unicode/parseerr.h"
  23. #include "unicode/localpointer.h"
  24. #if !UCONFIG_NO_NORMALIZATION
  25. #if U_SHOW_CPLUSPLUS_API
  26. #include "unicode/unistr.h"
  27. #include "unicode/uniset.h"
  28. #endif
  29. /**
  30. * \file
  31. * \brief Unicode Security and Spoofing Detection, C API.
  32. *
  33. * <p>
  34. * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
  35. * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
  36. *
  37. * <ol>
  38. * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
  39. * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
  40. * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
  41. * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
  42. * </ol>
  43. *
  44. * <p>
  45. * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
  46. * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
  47. * content filters.
  48. *
  49. * <p>
  50. * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
  51. *
  52. * <h2>Confusables</h2>
  53. *
  54. * <p>
  55. * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
  56. *
  57. * \code{.c}
  58. * UErrorCode status = U_ZERO_ERROR;
  59. * UChar* str1 = (UChar*) u"Harvest";
  60. * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
  61. *
  62. * USpoofChecker* sc = uspoof_open(&status);
  63. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  64. *
  65. * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
  66. * UBool result = bitmask != 0;
  67. * // areConfusable: 1 (status: U_ZERO_ERROR)
  68. * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
  69. * uspoof_close(sc);
  70. * \endcode
  71. *
  72. * <p>
  73. * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
  74. * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
  75. * confusability test; and the following line extracts the result out of the return value. For best performance,
  76. * the instance should be created once (e.g., upon application startup), and the efficient
  77. * {@link uspoof_areConfusable} method can be used at runtime.
  78. *
  79. * <p>
  80. * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call
  81. * {@link uspoof_close} when the object goes out of scope:
  82. *
  83. * \code{.cpp}
  84. * UErrorCode status = U_ZERO_ERROR;
  85. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  86. * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
  87. * // ...
  88. * \endcode
  89. *
  90. * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
  91. * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
  92. * the following snippet is equivalent to the example above:
  93. *
  94. * \code{.c}
  95. * UErrorCode status = U_ZERO_ERROR;
  96. * UChar* str1 = (UChar*) u"Harvest";
  97. * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA
  98. *
  99. * USpoofChecker* sc = uspoof_open(&status);
  100. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  101. *
  102. * // Get skeleton 1
  103. * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
  104. * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
  105. * status = U_ZERO_ERROR;
  106. * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
  107. *
  108. * // Get skeleton 2
  109. * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
  110. * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
  111. * status = U_ZERO_ERROR;
  112. * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
  113. *
  114. * // Are the skeletons the same?
  115. * UBool result = u_strcmp(skel1, skel2) == 0;
  116. * // areConfusable: 1 (status: U_ZERO_ERROR)
  117. * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
  118. * uspoof_close(sc);
  119. * free(skel1);
  120. * free(skel2);
  121. * \endcode
  122. *
  123. * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
  124. * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
  125. *
  126. * \code{.c}
  127. * UErrorCode status = U_ZERO_ERROR;
  128. * #define DICTIONARY_LENGTH 2
  129. * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
  130. * UChar* skeletons[DICTIONARY_LENGTH];
  131. * UChar* str = (UChar*) u"1orern";
  132. *
  133. * // Setup:
  134. * USpoofChecker* sc = uspoof_open(&status);
  135. * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
  136. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  137. * UChar* word = dictionary[i];
  138. * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
  139. * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
  140. * status = U_ZERO_ERROR;
  141. * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
  142. * }
  143. *
  144. * // Live Check:
  145. * {
  146. * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
  147. * UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
  148. * status = U_ZERO_ERROR;
  149. * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
  150. * UBool result = FALSE;
  151. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  152. * result = u_strcmp(skel, skeletons[i]) == 0;
  153. * if (result == TRUE) { break; }
  154. * }
  155. * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
  156. * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
  157. * free(skel);
  158. * }
  159. *
  160. * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
  161. * free(skeletons[i]);
  162. * }
  163. * uspoof_close(sc);
  164. * \endcode
  165. *
  166. * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
  167. * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
  168. * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
  169. *
  170. * <h2>Spoof Detection</h2>
  171. *
  172. * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
  173. * string:
  174. *
  175. * \code{.c}
  176. * UErrorCode status = U_ZERO_ERROR;
  177. * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
  178. *
  179. * // Get the default set of allowable characters:
  180. * USet* allowed = uset_openEmpty();
  181. * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
  182. * uset_addAll(allowed, uspoof_getInclusionSet(&status));
  183. *
  184. * USpoofChecker* sc = uspoof_open(&status);
  185. * uspoof_setAllowedChars(sc, allowed, &status);
  186. * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
  187. *
  188. * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
  189. * UBool result = bitmask != 0;
  190. * // fails checks: 1 (status: U_ZERO_ERROR)
  191. * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
  192. * uspoof_close(sc);
  193. * uset_close(allowed);
  194. * \endcode
  195. *
  196. * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
  197. * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
  198. * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
  199. *
  200. * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
  201. * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
  202. *
  203. * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
  204. * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions
  205. * with a {@link USpoofCheckResult} parameter:
  206. *
  207. * \code{.c}
  208. * UErrorCode status = U_ZERO_ERROR;
  209. * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A
  210. *
  211. * // Get the default set of allowable characters:
  212. * USet* allowed = uset_openEmpty();
  213. * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
  214. * uset_addAll(allowed, uspoof_getInclusionSet(&status));
  215. *
  216. * USpoofChecker* sc = uspoof_open(&status);
  217. * uspoof_setAllowedChars(sc, allowed, &status);
  218. * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
  219. *
  220. * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
  221. * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
  222. *
  223. * int32_t failures1 = bitmask;
  224. * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
  225. * assert(failures1 == failures2);
  226. * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
  227. * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
  228. *
  229. * // Cleanup:
  230. * uspoof_close(sc);
  231. * uset_close(allowed);
  232. * uspoof_closeCheckResult(checkResult);
  233. * \endcode
  234. *
  235. * C++ users can take advantage of a few syntactical conveniences. The following snippet is functionally
  236. * equivalent to the one above:
  237. *
  238. * \code{.cpp}
  239. * UErrorCode status = U_ZERO_ERROR;
  240. * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
  241. *
  242. * // Get the default set of allowable characters:
  243. * UnicodeSet allowed;
  244. * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
  245. * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
  246. *
  247. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  248. * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
  249. * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
  250. *
  251. * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
  252. * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
  253. *
  254. * int32_t failures1 = bitmask;
  255. * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
  256. * assert(failures1 == failures2);
  257. * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
  258. * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
  259. *
  260. * // Explicit cleanup not necessary.
  261. * \endcode
  262. *
  263. * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
  264. * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
  265. *
  266. * <ul>
  267. * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
  268. * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
  269. * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
  270. * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
  271. * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
  272. * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
  273. * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
  274. * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
  275. * </ul>
  276. *
  277. * <p>
  278. * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
  279. * INVISIBLE and MIXED_NUMBERS conditions, you could do:
  280. *
  281. * \code{.c}
  282. * UErrorCode status = U_ZERO_ERROR;
  283. * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR
  284. *
  285. * USpoofChecker* sc = uspoof_open(&status);
  286. * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
  287. *
  288. * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
  289. * UBool result = bitmask != 0;
  290. * // fails checks: 1 (status: U_ZERO_ERROR)
  291. * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
  292. * uspoof_close(sc);
  293. * \endcode
  294. *
  295. * Here is an example in C++ showing how to compute the restriction level of a string:
  296. *
  297. * \code{.cpp}
  298. * UErrorCode status = U_ZERO_ERROR;
  299. * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A
  300. *
  301. * // Get the default set of allowable characters:
  302. * UnicodeSet allowed;
  303. * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
  304. * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
  305. *
  306. * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
  307. * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
  308. * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
  309. * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
  310. *
  311. * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
  312. * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
  313. *
  314. * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
  315. * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
  316. * assert((restrictionLevel & bitmask) == restrictionLevel);
  317. * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
  318. * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
  319. * \endcode
  320. *
  321. * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since
  322. * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
  323. *
  324. * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
  325. * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
  326. * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
  327. * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
  328. * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
  329. * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
  330. * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
  331. * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
  332. * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
  333. * scripts.
  334. *
  335. * <h2>Additional Information</h2>
  336. *
  337. * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
  338. *
  339. * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
  340. * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
  341. * using the same USpoofChecker instance.
  342. *
  343. * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
  344. * thread safe. Those that take a non-const USpoofChecker are not thread safe.
  345. *
  346. * @stable ICU 4.6
  347. */
  348. U_CDECL_BEGIN
  349. struct USpoofChecker;
  350. /** Spoof checker object type; instances are created by uspoof_open() and released by uspoof_close().
  351. * @stable ICU 4.2
  352. */
  353. typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
  354. struct USpoofCheckResult;
  355. /** Result object type passed to the uspoof_check2 family of functions to receive complete check information.
  356. * @see uspoof_openCheckResult
  357. * @stable ICU 58
  358. */
  359. typedef struct USpoofCheckResult USpoofCheckResult;
  360. /**
  361. * Enum for the kinds of checks that USpoofChecker can perform.
  362. * These enum values are used both to select the set of checks that
  363. * will be performed, and to report results from the check function.
  364. *
  365. * @stable ICU 4.2
  366. */
  367. typedef enum USpoofChecks {
  368. /**
  369. * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
  370. * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
  371. * 4.
  372. *
  373. * @see uspoof_areConfusable
  374. * @stable ICU 4.2
  375. */
  376. USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1,
  377. /**
  378. * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
  379. * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
  380. * 39 section 4.
  381. *
  382. * @see uspoof_areConfusable
  383. * @stable ICU 4.2
  384. */
  385. USPOOF_MIXED_SCRIPT_CONFUSABLE = 2,
  386. /**
  387. * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
  388. * that the two strings are visually confusable and that they are not from the same script but both of them are
  389. * single-script strings, according to UTS 39 section 4.
  390. *
  391. * @see uspoof_areConfusable
  392. * @stable ICU 4.2
  393. */
  394. USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,
  395. /**
  396. * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set
  397. * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
  398. * make {@link uspoof_areConfusable} return only those types of confusables.
  399. *
  400. * @see uspoof_areConfusable
  401. * @see uspoof_getSkeleton
  402. * @stable ICU 58
  403. */
  404. USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
  405. #ifndef U_HIDE_DEPRECATED_API
  406. /**
  407. * This flag is deprecated and no longer affects the behavior of SpoofChecker.
  408. *
  409. * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
  410. */
  411. USPOOF_ANY_CASE = 8,
  412. #endif /* U_HIDE_DEPRECATED_API */
  413. /**
  414. * Check that an identifier is no looser than the specified RestrictionLevel.
  415. * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
  416. *
  417. * If USPOOF_AUX_INFO is enabled the actual restriction level of the
  418. * identifier being tested will also be returned by uspoof_check().
  419. *
  420. * @see URestrictionLevel
  421. * @see uspoof_setRestrictionLevel
  422. * @see USPOOF_AUX_INFO
  423. *
  424. * @stable ICU 51
  425. */
  426. USPOOF_RESTRICTION_LEVEL = 16,
  427. #ifndef U_HIDE_DEPRECATED_API
  428. /** Check that an identifier contains only characters from a
  429. * single script (plus chars from the common and inherited scripts.)
  430. * Applies to checks of a single identifier check only.
  431. * @deprecated ICU 51 Use RESTRICTION_LEVEL instead.
  432. */
  433. USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL,
  434. #endif /* U_HIDE_DEPRECATED_API */
  435. /** Check an identifier for the presence of invisible characters,
  436. * such as zero-width spaces, or character sequences that are
  437. * likely not to display, such as multiple occurrences of the same
  438. * non-spacing mark. This check does not test the input string as a whole
  439. * for conformance to any particular syntax for identifiers.
  440. */
  441. USPOOF_INVISIBLE = 32,
  442. /** Check that an identifier contains only characters from a specified set
  443. * of acceptable characters. See {@link uspoof_setAllowedChars} and
  444. * {@link uspoof_setAllowedLocales}. Note that a string that fails this check
  445. * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
  446. */
  447. USPOOF_CHAR_LIMIT = 64,
  448. /**
  449. * Check that an identifier does not mix numbers from different numbering systems.
  450. * For more information, see UTS 39 section 5.3.
  451. *
  452. * @stable ICU 51
  453. */
  454. USPOOF_MIXED_NUMBERS = 128,
  455. /**
  456. * Check that an identifier does not have a combining character following a character in which that
  457. * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
  458. *
  459. * More specifically, the following characters are forbidden from preceding a U+0307:
  460. * <ul>
  461. * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
  462. * <li>Latin lowercase letter 'l'</li>
  463. * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
  464. * <li>Any character whose confusable prototype ends with such a character
  465. * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
  466. * </ul>
  467. * In addition, combining characters are allowed between the above characters and U+0307 except those
  468. * with combining class 0 or combining class "Above" (230, same class as U+0307).
  469. *
  470. * This list and the number of combining characters considered by this check may grow over time.
  471. *
  472. * @stable ICU 62
  473. */
  474. USPOOF_HIDDEN_OVERLAY = 256,
  475. /**
  476. * Enable all spoof checks.
  477. *
  478. * @stable ICU 4.6
  479. */
  480. USPOOF_ALL_CHECKS = 0xFFFF,
  481. /**
  482. * Enable the return of auxiliary (non-error) information in the
  483. * upper bits of the check results value.
  484. *
  485. * If this "check" is not enabled, the results of {@link uspoof_check} will be
  486. * zero when an identifier passes all of the enabled checks.
  487. *
  488. * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
  489. * be zero when an identifier passes all checks.
  490. *
  491. * @stable ICU 51
  492. */
  493. USPOOF_AUX_INFO = 0x40000000
  494. } USpoofChecks;
  495. /**
  496. * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
  497. * for returned identifier restriction levels in check results.
  498. *
  499. * @stable ICU 51
  500. *
  501. * @see uspoof_setRestrictionLevel
  502. * @see uspoof_check
  503. */
  504. typedef enum URestrictionLevel {
  505. /**
  506. * All characters in the string are in the identifier profile and all characters in the string are in the
  507. * ASCII range.
  508. *
  509. * @stable ICU 51
  510. */
  511. USPOOF_ASCII = 0x10000000,
  512. /**
  513. * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
  514. * the string is single-script, according to the definition in UTS 39 section 5.1.
  515. *
  516. * @stable ICU 53
  517. */
  518. USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
  519. /**
  520. * The string classifies as Single Script, or all characters in the string are in the identifier profile and
  521. * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
  522. * section 5.1:
  523. * <ul>
  524. * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
  525. * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
  526. * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
  527. * </ul>
  528. * This is the default restriction in ICU.
  529. *
  530. * @stable ICU 51
  531. */
  532. USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
  533. /**
  534. * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
  535. * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
  536. * Greek, and Cherokee.
  537. *
  538. * @stable ICU 51
  539. */
  540. USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
  541. /**
  542. * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts.
  543. *
  544. * @stable ICU 51
  545. */
  546. USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
  547. /**
  548. * Any valid identifiers, including characters outside of the Identifier Profile.
  549. *
  550. * @stable ICU 51
  551. */
  552. USPOOF_UNRESTRICTIVE = 0x60000000,
  553. /**
  554. * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
  555. *
  556. * @stable ICU 53
  557. */
  558. USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
  559. #ifndef U_HIDE_INTERNAL_API
  560. /**
  561. * An undefined restriction level.
  562. * @internal
  563. */
  564. USPOOF_UNDEFINED_RESTRICTIVE = -1
  565. #endif /* U_HIDE_INTERNAL_API */
  566. } URestrictionLevel;
  567. /**
  568. * Create a Unicode Spoof Checker, configured to perform all
  569. * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
  570. * Note that additional checks may be added in the future,
  571. * resulting in changes to the default checking behavior.
  572. * NOTE(review): USPOOF_LOCALE_LIMIT is not among the USpoofChecks values declared in this header; confirm the name.
  573. * @param status The error code, set if this function encounters a problem.
  574. * @return the newly created Spoof Checker
  575. * @stable ICU 4.2
  576. */
  577. U_STABLE USpoofChecker * U_EXPORT2
  578. uspoof_open(UErrorCode *status);
  579. /**
  580. * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory.
  581. * Inverse of uspoof_serialize().
  582. * The memory containing the serialized data must remain valid and unchanged
  583. * as long as the spoof checker, or any cloned copies of the spoof checker,
  584. * are in use. Ownership of the memory remains with the caller.
  585. * The spoof checker (and any clones) must be closed prior to deleting the
  586. * serialized data.
  587. *
  588. * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
  589. * @param length the number of bytes available at data;
  590. * can be more than necessary
  591. * @param pActualLength receives the actual number of bytes at data taken up by the data;
  592. * can be NULL
  593. * @param pErrorCode ICU error code, used to report problems encountered by this function
  594. * @return the spoof checker opened from the serialized data.
  595. *
  596. * @see uspoof_open
  597. * @see uspoof_serialize
  598. * @stable ICU 4.2
  599. */
  600. U_STABLE USpoofChecker * U_EXPORT2
  601. uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
  602. UErrorCode *pErrorCode);
  603. /**
  604. * Open a Spoof Checker from the source form of the spoof data.
  605. * The input corresponds to the Unicode data file confusables.txt
  606. * as described in Unicode UTS #39. The syntax of the source data
  607. * is as described in UTS #39 for this file, and the content of
  608. * this file is acceptable input.
  609. *
  610. * The character encoding of the (char *) input text is UTF-8.
  611. *
  612. * @param confusables a pointer to the confusable characters definitions,
  613. * as found in file confusables.txt from unicode.org.
  614. * @param confusablesLen The length of the confusables text, or -1 if the
  615. * input string is zero terminated.
  616. * @param confusablesWholeScript
  617. * Deprecated in ICU 58. No longer used.
  618. * @param confusablesWholeScriptLen
  619. * Deprecated in ICU 58. No longer used.
  620. * @param errType In the event of an error in the input, indicates
  621. * which of the input files contains the error.
  622. * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
  623. * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
  624. * zero if no errors are found.
  625. * @param pe In the event of an error in the input, receives the position
  626. * in the input text (line, offset) of the error.
  627. * @param status an in/out ICU UErrorCode. Among the possible errors is
  628. * U_PARSE_ERROR, which is used to report syntax errors
  629. * in the input.
  630. * @return A spoof checker that uses the rules from the input files.
  631. * @stable ICU 4.2
  632. */
  633. U_STABLE USpoofChecker * U_EXPORT2
  634. uspoof_openFromSource(const char *confusables, int32_t confusablesLen,
  635. const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
  636. int32_t *errType, UParseError *pe, UErrorCode *status);
  637. /**
  638. * Close a Spoof Checker, freeing any memory that was being held by its implementation.
  639. * @param sc The Spoof Checker to close.
  640. * @stable ICU 4.2
  641. */
  642. U_STABLE void U_EXPORT2
  643. uspoof_close(USpoofChecker *sc);
  644. /**
  645. * Clone a Spoof Checker. The clone will be set to perform the same checks
  646. * as the original source.
  647. *
  648. * @param sc The source USpoofChecker
  649. * @param status The error code, set if this function encounters a problem.
  650. * @return The cloned Spoof Checker.
  651. * @stable ICU 4.2
  652. */
  653. U_STABLE USpoofChecker * U_EXPORT2
  654. uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
  655. /**
  656. * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
  657. * overwrites any checks that may have already been enabled. By default, all checks are enabled.
  658. *
  659. * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
  660. * example, to fail strings containing characters outside of the set specified by {@link uspoof_setAllowedChars} and
  661. * also strings that contain digits from mixed numbering systems:
  662. *
  663. * <pre>
  664. * {@code
  665. * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status);
  666. * }
  667. * </pre>
  668. *
  669. * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
  670. * ALL_CHECKS. For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
  671. * it is good practice to disable the CONFUSABLE check:
  672. *
  673. * <pre>
  674. * {@code
  675. * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status);
  676. * }
  677. * </pre>
  678. *
  679. * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
  680. * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
  681. * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
  682. * methods.
  683. *
  684. * @param sc The USpoofChecker
  685. * @param checks The set of checks that this spoof checker will perform.
  686. * The value is a bit set, obtained by OR-ing together
  687. * values from enum USpoofChecks.
  688. * @param status The error code, set if this function encounters a problem.
  689. * @stable ICU 4.2
  690. *
  691. */
  692. U_STABLE void U_EXPORT2
  693. uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
  694. /**
  695. * Get the set of checks that this Spoof Checker has been configured to perform,
  696. * including any checks OR-ed in by setters such as {@link uspoof_setRestrictionLevel}.
  697. * @param sc The USpoofChecker
  698. * @param status The error code, set if this function encounters a problem.
  699. * @return The set of checks that this spoof checker will perform.
  700. * The value is a bit set, obtained by OR-ing together
  701. * values from enum USpoofChecks.
  702. * @stable ICU 4.2
  703. *
  704. */
  705. U_STABLE int32_t U_EXPORT2
  706. uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
  707. /**
  708. * Set the loosest restriction level allowed for strings. The default if this is not called is
  709. * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
  710. * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.2 and 5.3 of UTS 39. To customize which checks are
  711. * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
  712. *
  713. * @param sc The USpoofChecker
  714. * @param restrictionLevel The loosest restriction level allowed.
  715. * @see URestrictionLevel
  716. * @stable ICU 51
  717. */
  718. U_STABLE void U_EXPORT2
  719. uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
  720. /**
  721. * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
  722. * @param sc The USpoofChecker
  723. * @return The restriction level
  724. * @see URestrictionLevel
  725. * @stable ICU 51
  726. */
  727. U_STABLE URestrictionLevel U_EXPORT2
  728. uspoof_getRestrictionLevel(const USpoofChecker *sc);
  729. /**
  730. * Limit characters that are acceptable in identifiers being checked to those
  731. * normally used with the languages associated with the specified locales.
  732. * Any previously specified list of locales is replaced by the new settings.
  733. *
  734. * A set of languages is determined from the locale(s), and
  735. * from those a set of acceptable Unicode scripts is determined.
  736. * Characters from this set of scripts, along with characters from
  737. * the "common" and "inherited" Unicode Script categories
  738. * will be permitted.
  739. *
  740. * Supplying an empty string removes all restrictions;
  741. * characters from any script will be allowed.
  742. *
  743. * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
  744. * USpoofChecker when calling this function with a non-empty list
  745. * of locales.
  746. *
  747. * The Unicode Set of characters that will be allowed is accessible
  748. * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales()
  749. * will <i>replace</i> any previously applied set of allowed characters.
  750. *
  751. * Adjustments, such as additions or deletions of certain classes of characters,
  752. * can be made to the result of uspoof_setAllowedLocales() by
  753. * fetching the resulting set with uspoof_getAllowedChars(),
  754. * manipulating it with the Unicode Set API, then resetting the
  755. * spoof detector's limits with uspoof_setAllowedChars().
  756. *
  757. * @param sc The USpoofChecker
  758. * @param localesList A list of locales, from which the language
  759. * and associated script are extracted. The locales
  760. * are comma-separated if there is more than one.
  761. * White space may not appear within an individual locale,
  762. * but is ignored otherwise.
  763. * The locales are syntactically like those from the
  764. * HTTP Accept-Language header.
  765. * If the localesList is empty, no restrictions will be placed on
  766. * the allowed characters.
  767. *
  768. * @param status The error code, set if this function encounters a problem.
  769. * @stable ICU 4.2
  770. */
  771. U_STABLE void U_EXPORT2
  772. uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
  773. /**
  774. * Get a list of locales for the scripts that are acceptable in strings
  775. * to be checked. If no limitations on scripts have been specified,
  776. * an empty string will be returned.
  777. *
  778. * uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
  779. *
  780. * The format of the returned list is the same as that supplied to
  781. * uspoof_setAllowedLocales(), but the returned list may not be identical
  782. * to the originally specified string; the string may be reformatted,
  783. * and information other than languages from
  784. * the originally specified locales may be omitted.
  785. *
  786. * @param sc The USpoofChecker
  787. * @param status The error code, set if this function encounters a problem.
  788. * @return A string containing a list of locales corresponding
  789. * to the acceptable scripts, formatted like an
  790. * HTTP Accept-Language value.
  791. *
  792. * @stable ICU 4.2
  793. */
  794. U_STABLE const char * U_EXPORT2
  795. uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
  796. /**
  797. * Limit the acceptable characters to those specified by a Unicode Set.
  798. * Any previously specified character limit
  799. * is replaced by the new settings. This includes limits on
  800. * characters that were set with the uspoof_setAllowedLocales() function.
  801. *
  802. * The USPOOF_CHAR_LIMIT test is automatically enabled for this
  803. * USpoofChecker by this function.
  804. *
  805. * @param sc The USpoofChecker
  806. * @param chars A Unicode Set containing the list of
  807. * characters that are permitted. Ownership of the set
  808. * remains with the caller. The incoming set is cloned by
  809. * this function, so there are no restrictions on modifying
  810. * or deleting the USet after calling this function.
  811. * @param status The error code, set if this function encounters a problem.
  812. * @stable ICU 4.2
  813. */
  814. U_STABLE void U_EXPORT2
  815. uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
  816. /**
  817. * Get a USet for the characters permitted in an identifier.
  818. * This corresponds to the limits imposed by the uspoof_setAllowedChars()
  819. * and uspoof_setAllowedLocales() functions. Limitations imposed by other checks will not be
  820. * reflected in the set returned by this function.
  821. *
  822. * The returned set will be frozen, meaning that it cannot be modified
  823. * by the caller.
  824. *
  825. * Ownership of the returned set remains with the Spoof Detector. The
  826. * returned set will become invalid if the spoof detector is closed,
  827. * or if a new set of allowed characters is specified.
  828. *
  829. *
  830. * @param sc The USpoofChecker
  831. * @param status The error code, set if this function encounters a problem.
  832. * @return A USet containing the characters that are permitted by
  833. * the USPOOF_CHAR_LIMIT test.
  834. * @stable ICU 4.2
  835. */
  836. U_STABLE const USet * U_EXPORT2
  837. uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
  838. /**
  839. * Check the specified string for possible security issues.
  840. * The text to be checked will typically be an identifier of some sort.
  841. * The set of checks to be performed is specified with uspoof_setChecks().
  842. *
  843. * \note
  844. * Consider using the newer API, {@link uspoof_check2}, instead.
  845. * The newer API exposes additional information from the check procedure
  846. * and is otherwise identical to this method.
  847. *
  848. * @param sc The USpoofChecker
  849. * @param id The identifier to be checked for possible security issues,
  850. * in UTF-16 format.
  851. * @param length The length of the string to be checked, expressed in
  852. * 16-bit UTF-16 code units, or -1 if the string is
  853. * zero terminated.
  854. * @param position Deprecated in ICU 51. Always returns zero.
  855. * Originally, an out parameter for the index of the first
  856. * string position that failed a check.
  857. * This parameter may be NULL.
  858. * @param status The error code, set if an error occurred while attempting to
  859. * perform the check.
  860. * Spoofing or security issues detected with the input string are
  861. * not reported here, but through the function's return value.
  862. * @return An integer value with bits set for any potential security
  863. * or spoofing issues detected. The bits are defined by
  864. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  865. * will be zero if the input string passes all of the
  866. * enabled checks.
  867. * @see uspoof_check2
  868. * @stable ICU 4.2
  869. */
  870. U_STABLE int32_t U_EXPORT2
  871. uspoof_check(const USpoofChecker *sc,
  872. const UChar *id, int32_t length,
  873. int32_t *position,
  874. UErrorCode *status);
  875. /**
  876. * Check the specified string for possible security issues.
  877. * The text to be checked will typically be an identifier of some sort.
  878. * The set of checks to be performed is specified with uspoof_setChecks().
  879. *
  880. * \note
  881. * Consider using the newer API, {@link uspoof_check2UTF8}, instead.
  882. * The newer API exposes additional information from the check procedure
  883. * and is otherwise identical to this method.
  884. *
  885. * @param sc The USpoofChecker
  886. * @param id An identifier to be checked for possible security issues, in UTF-8 format.
  887. * @param length The length of the string to be checked, in bytes, or -1 if the string is
  888. * zero terminated.
  889. * @param position Deprecated in ICU 51. Always returns zero.
  890. * Originally, an out parameter for the index of the first
  891. * string position that failed a check.
  892. * This parameter may be NULL.
  893. * @param status The error code, set if an error occurred while attempting to
  894. * perform the check.
  895. * Spoofing or security issues detected with the input string are
  896. * not reported here, but through the function's return value.
  897. * If the input contains invalid UTF-8 sequences,
  898. * a status of U_INVALID_CHAR_FOUND will be returned.
  899. * @return An integer value with bits set for any potential security
  900. * or spoofing issues detected. The bits are defined by
  901. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  902. * will be zero if the input string passes all of the
  903. * enabled checks.
  904. * @see uspoof_check2UTF8
  905. * @stable ICU 4.2
  906. */
  907. U_STABLE int32_t U_EXPORT2
  908. uspoof_checkUTF8(const USpoofChecker *sc,
  909. const char *id, int32_t length,
  910. int32_t *position,
  911. UErrorCode *status);
  912. /**
  913. * Check the specified string for possible security issues.
  914. * The text to be checked will typically be an identifier of some sort.
  915. * The set of checks to be performed is specified with uspoof_setChecks().
  916. *
  917. * @param sc The USpoofChecker
  918. * @param id The identifier to be checked for possible security issues,
  919. * in UTF-16 format.
  920. * @param length The length of the string to be checked, expressed in 16-bit UTF-16
  921. * code units, or -1 if the string is zero terminated.
  922. * @param checkResult An instance of USpoofCheckResult to be filled with
  923. * details about the identifier. Can be NULL.
  924. * @param status The error code, set if an error occurred while attempting to
  925. * perform the check.
  926. * Spoofing or security issues detected with the input string are
  927. * not reported here, but through the function's return value.
  928. * @return An integer value with bits set for any potential security
  929. * or spoofing issues detected. The bits are defined by
  930. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  931. * will be zero if the input string passes all of the
  932. * enabled checks. Any information in this bitmask will be
  933. * consistent with the information saved in the optional
  934. * checkResult parameter.
  935. * @see uspoof_openCheckResult
  936. * @see uspoof_check2UTF8
  937. * @see uspoof_check2UnicodeString
  938. * @stable ICU 58
  939. */
  940. U_STABLE int32_t U_EXPORT2
  941. uspoof_check2(const USpoofChecker *sc,
  942. const UChar* id, int32_t length,
  943. USpoofCheckResult* checkResult,
  944. UErrorCode *status);
  945. /**
  946. * Check the specified string for possible security issues.
  947. * The text to be checked will typically be an identifier of some sort.
  948. * The set of checks to be performed is specified with uspoof_setChecks().
  949. *
  950. * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
  951. * returns additional information about the identifier. For more
  952. * information, see {@link uspoof_openCheckResult}.
  953. *
  954. * @param sc The USpoofChecker
  955. * @param id An identifier to be checked for possible security issues, in UTF-8 format.
  956. * @param length The length of the string to be checked, in bytes, or -1 if the string is
  957. * zero terminated.
  958. * @param checkResult An instance of USpoofCheckResult to be filled with
  959. * details about the identifier. Can be NULL.
  960. * @param status The error code, set if an error occurred while attempting to
  961. * perform the check.
  962. * Spoofing or security issues detected with the input string are
  963. * not reported here, but through the function's return value.
  964. * @return An integer value with bits set for any potential security
  965. * or spoofing issues detected. The bits are defined by
  966. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  967. * will be zero if the input string passes all of the
  968. * enabled checks. Any information in this bitmask will be
  969. * consistent with the information saved in the optional
  970. * checkResult parameter.
  971. * @see uspoof_openCheckResult
  972. * @see uspoof_check2
  973. * @see uspoof_check2UnicodeString
  974. * @stable ICU 58
  975. */
  976. U_STABLE int32_t U_EXPORT2
  977. uspoof_check2UTF8(const USpoofChecker *sc,
  978. const char *id, int32_t length,
  979. USpoofCheckResult* checkResult,
  980. UErrorCode *status);
  981. /**
  982. * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
  983. * information about the identifier. Information includes:
  984. * <ul>
  985. * <li>A bitmask of the checks that failed</li>
  986. * <li>The identifier's restriction level (UTS 39 section 5.2)</li>
  987. * <li>The set of numerics in the string (UTS 39 section 5.3)</li>
  988. * </ul>
  989. * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
  990. * of {@link uspoof_check2}.
  991. *
  992. * @param status The error code, set if this function encounters a problem.
  993. * @return The newly created USpoofCheckResult; release it with {@link uspoof_closeCheckResult}.
  994. * @see uspoof_check2
  995. * @see uspoof_check2UTF8
  996. * @see uspoof_check2UnicodeString
  997. * @stable ICU 58
  998. */
  999. U_STABLE USpoofCheckResult* U_EXPORT2
  1000. uspoof_openCheckResult(UErrorCode *status);
  1001. /**
  1002. * Close a USpoofCheckResult, freeing any memory that was being held by
  1003. * its implementation, including the USet returned by {@link uspoof_getCheckResultNumerics}.
  1004. *
  1005. * @param checkResult The instance of USpoofCheckResult to close
  1006. * @stable ICU 58
  1007. */
  1008. U_STABLE void U_EXPORT2
  1009. uspoof_closeCheckResult(USpoofCheckResult *checkResult);
  1010. /**
  1011. * Indicates which of the spoof checks have failed. The value is a bitwise OR of the constants for the tests
  1012. * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
  1013. *
  1014. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1015. * @param status The error code, set if an error occurred.
  1016. * @return An integer value with bits set for any potential security
  1017. * or spoofing issues detected. The bits are defined by
  1018. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1019. * will be zero if the input string passes all of the
  1020. * enabled checks.
  1021. * @see uspoof_setChecks
  1022. * @stable ICU 58
  1023. */
  1024. U_STABLE int32_t U_EXPORT2
  1025. uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
  1026. /**
  1027. * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
  1028. * was enabled; otherwise, the result is undefined.
  1029. *
  1030. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1031. * @param status The error code, set if an error occurred.
  1032. * @return The restriction level contained in the USpoofCheckResult
  1033. * @see uspoof_setRestrictionLevel
  1034. * @stable ICU 58
  1035. */
  1036. U_STABLE URestrictionLevel U_EXPORT2
  1037. uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
  1038. /**
  1039. * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
  1040. * otherwise, undefined. The set will contain the zero digit from each decimal number system found
  1041. * in the input string. Ownership of the returned USet remains with the USpoofCheckResult.
  1042. * The USet will be freed when {@link uspoof_closeCheckResult} is called.
  1043. *
  1044. * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
  1045. * @param status The error code, set if an error occurred.
  1046. * @return The set of numerics contained in the USpoofCheckResult
  1047. * @stable ICU 58
  1048. */
  1049. U_STABLE const USet* U_EXPORT2
  1050. uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
  1051. /**
  1052. * Check whether two specified strings are visually confusable.
  1053. *
  1054. * If the strings are confusable, the return value will be nonzero, as long as
  1055. * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
  1056. *
  1057. * The bits in the return value correspond to flags for each of the classes of
  1058. * confusables applicable to the two input strings. According to UTS 39
  1059. * section 4, the possible flags are:
  1060. *
  1061. * <ul>
  1062. * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
  1063. * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
  1064. * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
  1065. * </ul>
  1066. *
  1067. * If one or more of the above flags were not listed in uspoof_setChecks(), this
  1068. * function will never report that class of confusable. The check
  1069. * {@link USPOOF_CONFUSABLE} enables all three flags.
  1070. *
  1071. *
  1072. * @param sc The USpoofChecker
  1073. * @param id1 The first of the two identifiers to be compared for
  1074. * confusability. The identifiers are in UTF-16 format.
  1075. * @param length1 The length of the first identifier, expressed in
  1076. * 16-bit UTF-16 code units, or -1 if the string is
  1077. * nul terminated.
  1078. * @param id2 The second of the two identifiers to be compared for
  1079. * confusability. The identifiers are in UTF-16 format.
  1080. * @param length2 The length of the second identifier, expressed in
  1081. * 16-bit UTF-16 code units, or -1 if the string is
  1082. * nul terminated.
  1083. * @param status The error code, set if an error occurred while attempting to
  1084. * perform the check.
  1085. * Confusability of the identifiers is not reported here,
  1086. * but through this function's return value.
  1087. * @return An integer value with bit(s) set corresponding to
  1088. * the type of confusability found, as defined by
  1089. * enum USpoofChecks. Zero is returned if the identifiers
  1090. * are not confusable.
  1091. *
  1092. * @stable ICU 4.2
  1093. */
  1094. U_STABLE int32_t U_EXPORT2
  1095. uspoof_areConfusable(const USpoofChecker *sc,
  1096. const UChar *id1, int32_t length1,
  1097. const UChar *id2, int32_t length2,
  1098. UErrorCode *status);
  1099. /**
  1100. * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
  1101. *
  1102. * @param sc The USpoofChecker
  1103. * @param id1 The first of the two identifiers to be compared for
  1104. * confusability. The strings are in UTF-8 format.
  1105. * @param length1 The length of the first identifier, in bytes, or -1
  1106. * if the string is nul terminated.
  1107. * @param id2 The second of the two identifiers to be compared for
  1108. * confusability. The strings are in UTF-8 format.
  1109. * @param length2 The length of the second identifier, in bytes, or -1
  1110. * if the string is nul terminated.
  1111. * @param status The error code, set if an error occurred while attempting to
  1112. * perform the check.
  1113. * Confusability of the strings is not reported here,
  1114. * but through this function's return value.
  1115. * @return An integer value with bit(s) set corresponding to
  1116. * the type of confusability found, as defined by
  1117. * enum USpoofChecks. Zero is returned if the strings
  1118. * are not confusable.
  1119. *
  1120. * @stable ICU 4.2
  1121. *
  1122. * @see uspoof_areConfusable
  1123. */
  1124. U_STABLE int32_t U_EXPORT2
  1125. uspoof_areConfusableUTF8(const USpoofChecker *sc,
  1126. const char *id1, int32_t length1,
  1127. const char *id2, int32_t length2,
  1128. UErrorCode *status);
  1129. /**
  1130. * Get the "skeleton" for an identifier.
  1131. * Skeletons are a transformation of the input identifier;
  1132. * two identifiers are confusable if their skeletons are identical.
  1133. * See Unicode UTS #39 for additional information.
  1134. *
  1135. * Using skeletons directly makes it possible to quickly check
  1136. * whether an identifier is confusable with any of some large
  1137. * set of existing identifiers, by creating an efficiently
  1138. * searchable collection of the skeletons.
  1139. *
  1140. * @param sc The USpoofChecker
  1141. * @param type Deprecated in ICU 58. You may pass any number.
  1142. * Originally, controlled which of the Unicode confusable data
  1143. * tables to use.
  1144. * @param id The input identifier whose skeleton will be computed.
  1145. * @param length The length of the input identifier, expressed in 16-bit
  1146. * UTF-16 code units, or -1 if the string is zero terminated.
  1147. * @param dest The output buffer, to receive the skeleton string.
  1148. * @param destCapacity The length of the output buffer, in 16-bit units.
  1149. * The destCapacity may be zero, in which case the function will
  1150. * return the actual length of the skeleton.
  1151. * @param status The error code, set if an error occurred while attempting to
  1152. * perform the check.
  1153. * @return The length of the skeleton string. The returned length
  1154. * is always that of the complete skeleton, even when the
  1155. * supplied buffer is too small (or of zero length).
  1156. *
  1157. * @stable ICU 4.2
  1158. * @see uspoof_areConfusable
  1159. */
  1160. U_STABLE int32_t U_EXPORT2
  1161. uspoof_getSkeleton(const USpoofChecker *sc,
  1162. uint32_t type,
  1163. const UChar *id, int32_t length,
  1164. UChar *dest, int32_t destCapacity,
  1165. UErrorCode *status);
  1166. /**
  1167. * Get the "skeleton" for an identifier.
  1168. * Skeletons are a transformation of the input identifier;
  1169. * two identifiers are confusable if their skeletons are identical.
  1170. * See Unicode UTS #39 for additional information.
  1171. *
  1172. * Using skeletons directly makes it possible to quickly check
  1173. * whether an identifier is confusable with any of some large
  1174. * set of existing identifiers, by creating an efficiently
  1175. * searchable collection of the skeletons.
  1176. *
  1177. * @param sc The USpoofChecker
  1178. * @param type Deprecated in ICU 58. You may pass any number.
  1179. * Originally, controlled which of the Unicode confusable data
  1180. * tables to use.
  1181. * @param id The UTF-8 format identifier whose skeleton will be computed.
  1182. * @param length The length of the input string, in bytes,
  1183. * or -1 if the string is zero terminated.
  1184. * @param dest The output buffer, to receive the skeleton string.
  1185. * @param destCapacity The length of the output buffer, in bytes.
  1186. * The destCapacity may be zero, in which case the function will
  1187. * return the actual length of the skeleton.
  1188. * @param status The error code, set if an error occurred while attempting to
  1189. * perform the check. Possible errors include U_INVALID_CHAR_FOUND
  1190. * for invalid UTF-8 sequences, and
  1191. * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
  1192. * to hold the complete skeleton.
  1193. * @return The length of the skeleton string, in bytes. The returned length
  1194. * is always that of the complete skeleton, even when the
  1195. * supplied buffer is too small (or of zero length).
  1196. *
  1197. * @stable ICU 4.2
  1198. */
  1199. U_STABLE int32_t U_EXPORT2
  1200. uspoof_getSkeletonUTF8(const USpoofChecker *sc,
  1201. uint32_t type,
  1202. const char *id, int32_t length,
  1203. char *dest, int32_t destCapacity,
  1204. UErrorCode *status);
  1205. /**
  1206. * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
  1207. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1208. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1209. *
  1210. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1211. * be deleted by the caller.
  1212. *
  1213. * @param status The error code, set if a problem occurs while creating the set.
  1214. * @return The frozen set of candidate characters for inclusion, owned by the ICU library.
  1215. * @stable ICU 51
  1216. */
  1217. U_STABLE const USet * U_EXPORT2
  1218. uspoof_getInclusionSet(UErrorCode *status);
  1219. /**
  1220. * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
  1221. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1222. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1223. *
  1224. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1225. * be deleted by the caller.
  1226. *
  1227. * @param status The error code, set if a problem occurs while creating the set.
  1228. * @return The frozen set of characters from recommended scripts, owned by the ICU library.
  1229. * @stable ICU 51
  1230. */
  1231. U_STABLE const USet * U_EXPORT2
  1232. uspoof_getRecommendedSet(UErrorCode *status);
  1233. /**
  1234. * Serialize the data for a spoof detector into a chunk of memory.
  1235. * The flattened spoof detection tables can later be used to efficiently
  1236. * instantiate a new Spoof Detector.
  1237. *
  1238. * The serialized spoof checker includes only the data compiled from the
  1239. * Unicode data tables by uspoof_openFromSource(); it does not
  1240. * include any other state or configuration that may have been set.
  1241. *
  1242. * @param sc the Spoof Detector whose data is to be serialized.
  1243. * @param data a pointer to 32-bit-aligned memory to be filled with the data,
  1244. * can be NULL if capacity==0
  1245. * @param capacity the number of bytes available at data,
  1246. * or 0 for preflighting
  1247. * @param status an in/out ICU UErrorCode; possible errors include:
  1248. * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
  1249. * - U_ILLEGAL_ARGUMENT_ERROR if the data or capacity parameters are bad
  1250. * @return the number of bytes written or needed for the spoof data
  1251. *
  1252. * @see uspoof_openFromSerialized()
  1253. * @stable ICU 4.2
  1254. */
  1255. U_STABLE int32_t U_EXPORT2
  1256. uspoof_serialize(USpoofChecker *sc,
  1257. void *data, int32_t capacity,
  1258. UErrorCode *status);
  1259. U_CDECL_END
  1260. #if U_SHOW_CPLUSPLUS_API
  1261. U_NAMESPACE_BEGIN
  1262. /**
  1263. * \class LocalUSpoofCheckerPointer
  1264. * "Smart pointer" class, closes a USpoofChecker via uspoof_close().
  1265. * For most methods see the LocalPointerBase base class.
  1266. *
  1267. * @see LocalPointerBase
  1268. * @see LocalPointer
  1269. * @stable ICU 4.4
  1270. */
  1271. /**
  1272. * \cond
  1273. * Note: Doxygen emits a spurious warning on this U_DEFINE_LOCAL_OPEN_POINTER invocation.
  1274. * For now, it is suppressed by wrapping the invocation in a Doxygen \\cond block.
  1275. */
  1276. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
  1277. /** \endcond */
  1278. /**
  1279. * \class LocalUSpoofCheckResultPointer
  1280. * "Smart pointer" class, closes a USpoofCheckResult via uspoof_closeCheckResult().
  1281. * For most methods see the LocalPointerBase base class.
  1282. *
  1283. * @see LocalPointerBase
  1284. * @see LocalPointer
  1285. * @stable ICU 58
  1286. */
  1287. /**
  1288. * \cond
  1289. * Note: Doxygen emits a spurious warning on this U_DEFINE_LOCAL_OPEN_POINTER invocation.
  1290. * For now, it is suppressed by wrapping the invocation in a Doxygen \\cond block.
  1291. */
  1292. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
  1293. /** \endcond */
  1294. U_NAMESPACE_END
  1295. /**
  1296. * Limit the acceptable characters to those specified by a Unicode Set.
  1297. * Any previously specified character limit
  1298. * is replaced by the new settings. This includes limits on
  1299. * characters that were set with the uspoof_setAllowedLocales() function.
  1300. *
  1301. * The USPOOF_CHAR_LIMIT test is automatically enabled for this
  1302. * USpoofChecker by this function.
  1303. *
  1304. * @param sc The USpoofChecker
  1305. * @param chars A Unicode Set containing the list of
  1306. * characters that are permitted. Ownership of the set
  1307. * remains with the caller. The incoming set is cloned by
  1308. * this function, so there are no restrictions on modifying
  1309. * or deleting the UnicodeSet after calling this function.
  1310. * @param status The error code, set if this function encounters a problem.
  1311. * @stable ICU 4.2
  1312. */
  1313. U_STABLE void U_EXPORT2
  1314. uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
  1315. /**
  1316. * Get a UnicodeSet for the characters permitted in an identifier.
  1317. * This corresponds to the limits imposed by the uspoof_setAllowedChars() /
  1318. * uspoof_setAllowedUnicodeSet() functions. Limitations imposed by other checks will not be
  1319. * reflected in the set returned by this function.
  1320. *
  1321. * The returned set will be frozen, meaning that it cannot be modified
  1322. * by the caller.
  1323. *
  1324. * Ownership of the returned set remains with the Spoof Detector. The
  1325. * returned set will become invalid if the spoof detector is closed,
  1326. * or if a new set of allowed characters is specified.
  1327. *
  1328. *
  1329. * @param sc The USpoofChecker
  1330. * @param status The error code, set if this function encounters a problem.
  1331. * @return A UnicodeSet containing the characters that are permitted by
  1332. * the USPOOF_CHAR_LIMIT test.
  1333. * @stable ICU 4.2
  1334. */
  1335. U_STABLE const icu::UnicodeSet * U_EXPORT2
  1336. uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
  1337. /**
  1338. * Check the specified string for possible security issues.
  1339. * The text to be checked will typically be an identifier of some sort.
  1340. * The set of checks to be performed is specified with uspoof_setChecks().
  1341. *
  1342. * \note
  1343. * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
  1344. * The newer API exposes additional information from the check procedure
  1345. * and is otherwise identical to this method.
  1346. *
  1347. * @param sc The USpoofChecker
  1348. * @param id An identifier to be checked for possible security issues.
  1349. * @param position Deprecated in ICU 51. Always returns zero.
  1350. * Originally, an out parameter for the index of the first
  1351. * string position that failed a check.
  1352. * This parameter may be NULL.
  1353. * @param status The error code, set if an error occurred while attempting to
  1354. * perform the check.
  1355. * Spoofing or security issues detected with the input string are
  1356. * not reported here, but through the function's return value.
  1357. * @return An integer value with bits set for any potential security
  1358. * or spoofing issues detected. The bits are defined by
  1359. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1360. * will be zero if the input string passes all of the
  1361. * enabled checks.
  1362. * @see uspoof_check2UnicodeString
  1363. * @stable ICU 4.2
  1364. */
  1365. U_STABLE int32_t U_EXPORT2
  1366. uspoof_checkUnicodeString(const USpoofChecker *sc,
  1367. const icu::UnicodeString &id,
  1368. int32_t *position,
  1369. UErrorCode *status);
  1370. /**
  1371. * Check the specified string for possible security issues.
  1372. * The text to be checked will typically be an identifier of some sort.
  1373. * The set of checks to be performed is specified with uspoof_setChecks().
  1374. *
  1375. * @param sc The USpoofChecker
  1376. * @param id An identifier to be checked for possible security issues.
  1377. * @param checkResult An instance of USpoofCheckResult to be filled with
  1378. * details about the identifier. Can be NULL.
  1379. * @param status The error code, set if an error occurred while attempting to
  1380. * perform the check.
  1381. * Spoofing or security issues detected with the input string are
  1382. * not reported here, but through the function's return value.
  1383. * @return An integer value with bits set for any potential security
  1384. * or spoofing issues detected. The bits are defined by
  1385. * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS)
  1386. * will be zero if the input string passes all of the
  1387. * enabled checks. Any information in this bitmask will be
  1388. * consistent with the information saved in the optional
  1389. * checkResult parameter.
  1390. * @see uspoof_openCheckResult
  1391. * @see uspoof_check2
  1392. * @see uspoof_check2UTF8
  1393. * @stable ICU 58
  1394. */
  1395. U_STABLE int32_t U_EXPORT2
  1396. uspoof_check2UnicodeString(const USpoofChecker *sc,
  1397. const icu::UnicodeString &id,
  1398. USpoofCheckResult* checkResult,
  1399. UErrorCode *status);
  1400. /**
  1401. * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
  1402. *
  1403. * @param sc The USpoofChecker
  1404. * @param s1 The first of the two identifiers to be compared for
  1405. * confusability, as a UnicodeString.
  1406. * @param s2 The second of the two identifiers to be compared for
  1407. * confusability, as a UnicodeString.
  1408. * @param status The error code, set if an error occurred while attempting to
  1409. * perform the check.
  1410. * Confusability of the identifiers is not reported here,
  1411. * but through this function's return value.
  1412. * @return An integer value with bit(s) set corresponding to
  1413. * the type of confusability found, as defined by
  1414. * enum USpoofChecks. Zero is returned if the identifiers
  1415. * are not confusable.
  1416. *
  1417. * @stable ICU 4.2
  1418. *
  1419. * @see uspoof_areConfusable
  1420. */
  1421. U_STABLE int32_t U_EXPORT2
  1422. uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
  1423. const icu::UnicodeString &s1,
  1424. const icu::UnicodeString &s2,
  1425. UErrorCode *status);
  1426. /**
  1427. * Get the "skeleton" for an identifier.
  1428. * Skeletons are a transformation of the input identifier;
  1429. * two identifiers are confusable if their skeletons are identical.
  1430. * See Unicode UTS #39 for additional information.
  1431. *
  1432. * Using skeletons directly makes it possible to quickly check
  1433. * whether an identifier is confusable with any of some large
  1434. * set of existing identifiers, by creating an efficiently
  1435. * searchable collection of the skeletons.
  1436. *
  1437. * @param sc The USpoofChecker.
  1438. * @param type Deprecated in ICU 58. You may pass any number.
  1439. * Originally, controlled which of the Unicode confusable data
  1440. * tables to use.
  1441. * @param id The input identifier whose skeleton will be computed.
  1442. * @param dest The output identifier, to receive the skeleton string.
  1443. * @param status The error code, set if an error occurred while attempting to
  1444. * perform the check.
  1445. * @return A reference to the destination (skeleton) string.
  1446. *
  1447. * @stable ICU 4.2
  1448. */
  1449. U_I18N_API icu::UnicodeString & U_EXPORT2
  1450. uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
  1451. uint32_t type,
  1452. const icu::UnicodeString &id,
  1453. icu::UnicodeString &dest,
  1454. UErrorCode *status);
  1455. /**
  1456. * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
  1457. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1458. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1459. *
  1460. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1461. * be deleted by the caller.
  1462. *
  1463. * @param status The error code, set if a problem occurs while creating the set.
  1464. * @return The frozen set of Candidate Characters for Inclusion in Identifiers.
  1465. * @stable ICU 51
  1466. */
  1467. U_STABLE const icu::UnicodeSet * U_EXPORT2
  1468. uspoof_getInclusionUnicodeSet(UErrorCode *status);
  1469. /**
  1470. * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
  1471. * in http://unicode.org/Public/security/latest/xidmodifications.txt
  1472. * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
  1473. *
  1474. * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
  1475. * be deleted by the caller.
  1476. *
  1477. * @param status The error code, set if a problem occurs while creating the set.
  1478. * @return The frozen set of characters from Recommended Scripts for Inclusion in Identifiers.
  1479. * @stable ICU 51
  1480. */
  1481. U_STABLE const icu::UnicodeSet * U_EXPORT2
  1482. uspoof_getRecommendedUnicodeSet(UErrorCode *status);
  1483. #endif /* U_SHOW_CPLUSPLUS_API */
  1484. #endif /* UCONFIG_NO_NORMALIZATION */
  1485. #endif /* USPOOF_H */