ppucd.h 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2011-2013, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: ppucd.h
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2011dec11
  14. * created by: Markus W. Scherer
  15. */
  16. #ifndef __PPUCD_H__
  17. #define __PPUCD_H__
  18. #include "unicode/utypes.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/unistr.h"
  21. #include <stdio.h>
  22. /** Additions to the uchar.h enum UProperty. */
  23. enum {
  24. /** Name_Alias */
  25. PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
  26. PPUCD_CONDITIONAL_CASE_MAPPINGS,
  27. PPUCD_TURKIC_CASE_FOLDING
  28. };
  29. U_NAMESPACE_BEGIN
  30. class U_TOOLUTIL_API PropertyNames {
  31. public:
  32. virtual ~PropertyNames();
  33. virtual int32_t getPropertyEnum(const char *name) const;
  34. virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
  35. };
  36. struct U_TOOLUTIL_API UniProps {
  37. UniProps();
  38. ~UniProps();
  39. int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
  40. UChar32 start, end;
  41. UBool binProps[UCHAR_BINARY_LIMIT];
  42. int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
  43. UVersionInfo age;
  44. UChar32 bmg, bpb;
  45. UChar32 scf, slc, stc, suc;
  46. int32_t digitValue;
  47. const char *numericValue;
  48. const char *name;
  49. const char *nameAlias;
  50. UnicodeString cf, lc, tc, uc;
  51. UnicodeSet scx;
  52. };
  53. class U_TOOLUTIL_API PreparsedUCD {
  54. public:
  55. enum LineType {
  56. /** No line, end of file. */
  57. NO_LINE,
  58. /** Empty line. (Might contain a comment.) */
  59. EMPTY_LINE,
  60. /** ucd;6.1.0 */
  61. UNICODE_VERSION_LINE,
  62. /** property;Binary;Alpha;Alphabetic */
  63. PROPERTY_LINE,
  64. /** binary;N;No;F;False */
  65. BINARY_LINE,
  66. /** value;gc;Zs;Space_Separator */
  67. VALUE_LINE,
  68. /** defaults;0000..10FFFF;age=NA;bc=L;... */
  69. DEFAULTS_LINE,
  70. /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
  71. BLOCK_LINE,
  72. /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
  73. CP_LINE,
  74. /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
  75. UNASSIGNED_LINE,
  76. /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
  77. ALG_NAMES_RANGE_LINE,
  78. LINE_TYPE_COUNT
  79. };
  80. /**
  81. * Constructor.
  82. * Prepare this object for a new, empty package.
  83. */
  84. PreparsedUCD(const char *filename, UErrorCode &errorCode);
  85. /** Destructor. */
  86. ~PreparsedUCD();
  87. /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
  88. void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
  89. /**
  90. * Reads a line from the preparsed UCD file.
  91. * Splits the line by replacing each ';' with a NUL.
  92. */
  93. LineType readLine(UErrorCode &errorCode);
  94. /** Returns the number of the line read by readLine(). */
  95. int32_t getLineNumber() const { return lineNumber; }
  96. /** Returns the line's next field, or NULL. */
  97. const char *nextField();
  98. /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
  99. const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
  100. /** Returns TRUE if the current line has property values. */
  101. UBool lineHasPropertyValues() const {
  102. return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
  103. }
  104. /**
  105. * Parses properties from the current line.
  106. * Clears newValues and sets UProperty codes for property values mentioned
  107. * on the current line (as opposed to being inherited).
  108. * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
  109. * The returned UniProps are usable until the next line of the same type is read.
  110. */
  111. const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
  112. /**
  113. * Returns the code point range for the current algnamesrange line.
  114. * Calls & parses nextField().
  115. * Further nextField() calls will yield the range's type & prefix string.
  116. * Returns U_SUCCESS(errorCode).
  117. */
  118. UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
  119. private:
  120. UBool isLineBufferAvailable(int32_t i) {
  121. return defaultLineIndex!=i && blockLineIndex!=i;
  122. }
  123. /** Resets the field iterator and returns the line's first field (the line type field). */
  124. const char *firstField();
  125. UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
  126. UErrorCode &errorCode);
  127. UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
  128. UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
  129. void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
  130. void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
  131. static const int32_t kNumLineBuffers=3;
  132. PropertyNames *icuPnames; // owned
  133. const PropertyNames *pnames; // aliased
  134. FILE *file;
  135. int32_t defaultLineIndex, blockLineIndex, lineIndex;
  136. int32_t lineNumber;
  137. LineType lineType;
  138. char *fieldLimit;
  139. char *lineLimit;
  140. UVersionInfo ucdVersion;
  141. UniProps defaultProps, blockProps, cpProps;
  142. UnicodeSet blockValues;
  143. // Multiple lines so that default and block properties can maintain pointers
  144. // into their line buffers.
  145. char lines[kNumLineBuffers][4096];
  146. };
  147. U_NAMESPACE_END
  148. #endif // __PPUCD_H__