// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2010, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  uparse.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2000apr18
*   created by: Markus W. Scherer
*
*   This file provides a parser for files that are delimited by one single
*   character like ';' or TAB. Example: the Unicode Character Properties files
*   like UnicodeData.txt are semicolon-delimited.
*/
  22. #ifndef __UPARSE_H__
  23. #define __UPARSE_H__
  24. #include "unicode/utypes.h"
  25. /**
  26. * Is c an invariant-character whitespace?
  27. * @param c invariant character
  28. */
  29. #define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
  30. U_CDECL_BEGIN
  31. /**
  32. * Skip space ' ' and TAB '\t' characters.
  33. *
  34. * @param s Pointer to characters.
  35. * @return Pointer to first character at or after s that is not a space or TAB.
  36. */
  37. U_CAPI const char * U_EXPORT2
  38. u_skipWhitespace(const char *s);
  39. /**
  40. * Trim whitespace (including line endings) from the end of the string.
  41. *
  42. * @param s Pointer to the string.
  43. * @return Pointer to the new end of the string.
  44. */
  45. U_CAPI char * U_EXPORT2
  46. u_rtrim(char *s);
  47. /** Function type for u_parseDelimitedFile(). */
  48. typedef void U_CALLCONV
  49. UParseLineFn(void *context,
  50. char *fields[][2],
  51. int32_t fieldCount,
  52. UErrorCode *pErrorCode);
  53. /**
  54. * Parser for files that are similar to UnicodeData.txt:
  55. * This function opens the file and reads it line by line. It skips empty lines
  56. * and comment lines that start with a '#'.
  57. * All other lines are separated into fields with one delimiter character
  58. * (semicolon for Unicode Properties files) between two fields. The last field in
  59. * a line does not need to be terminated with a delimiter.
  60. *
  61. * For each line, after segmenting it, a line function is called.
  62. * It gets passed the array of field start and limit pointers that is
  63. * passed into this parser and filled by it for each line.
  64. * For each field i of the line, the start pointer in fields[i][0]
  65. * points to the beginning of the field, while the limit pointer in fields[i][1]
  66. * points behind the field, i.e., to the delimiter or the line end.
  67. *
  68. * The context parameter of the line function is
  69. * the same as the one for the parse function.
  70. *
  71. * The line function may modify the contents of the fields including the
  72. * limit characters.
  73. *
  74. * If the file cannot be opened, or there is a parsing error or a field function
  75. * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
  76. */
  77. U_CAPI void U_EXPORT2
  78. u_parseDelimitedFile(const char *filename, char delimiter,
  79. char *fields[][2], int32_t fieldCount,
  80. UParseLineFn *lineFn, void *context,
  81. UErrorCode *pErrorCode);
  82. /**
  83. * Parse a string of code points like 0061 0308 0300.
  84. * s must end with either ';' or NUL.
  85. *
  86. * @return Number of code points.
  87. */
  88. U_CAPI int32_t U_EXPORT2
  89. u_parseCodePoints(const char *s,
  90. uint32_t *dest, int32_t destCapacity,
  91. UErrorCode *pErrorCode);
  92. /**
  93. * Parse a list of code points like 0061 0308 0300
  94. * into a UChar * string.
  95. * s must end with either ';' or NUL.
  96. *
  97. * Set the first code point in *pFirst.
  98. *
  99. * @param s Input char * string.
  100. * @param dest Output string buffer.
  101. * @param destCapacity Capacity of dest in numbers of UChars.
  102. * @param pFirst If pFirst!=NULL the *pFirst will be set to the first
  103. * code point in the string.
  104. * @param pErrorCode ICU error code.
  105. * @return The length of the string in numbers of UChars.
  106. */
  107. U_CAPI int32_t U_EXPORT2
  108. u_parseString(const char *s,
  109. UChar *dest, int32_t destCapacity,
  110. uint32_t *pFirst,
  111. UErrorCode *pErrorCode);
  112. /**
  113. * Parse a code point range like
  114. * 0085 or
  115. * 4E00..9FA5.
  116. *
  117. * s must contain such a range and end with either ';' or NUL.
  118. *
  119. * @return Length of code point range, end-start+1
  120. */
  121. U_CAPI int32_t U_EXPORT2
  122. u_parseCodePointRange(const char *s,
  123. uint32_t *pStart, uint32_t *pEnd,
  124. UErrorCode *pErrorCode);
  125. /**
  126. * Same as u_parseCodePointRange() but the range may be terminated by
  127. * any character. The position of the terminating character is returned via
  128. * the *terminator output parameter.
  129. */
  130. U_CAPI int32_t U_EXPORT2
  131. u_parseCodePointRangeAnyTerminator(const char *s,
  132. uint32_t *pStart, uint32_t *pEnd,
  133. const char **terminator,
  134. UErrorCode *pErrorCode);
  135. U_CAPI int32_t U_EXPORT2
  136. u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
  137. U_CDECL_END
  138. #endif