123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 2011-2013, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * file name: ppucd.h
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2011dec11
- * created by: Markus W. Scherer
- */
- #ifndef __PPUCD_H__
- #define __PPUCD_H__
- #include "unicode/utypes.h"
- #include "unicode/uniset.h"
- #include "unicode/unistr.h"
- #include <stdio.h>
- /** Additions to the uchar.h enum UProperty. */
- enum {
- /** Name_Alias */
- PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
- PPUCD_CONDITIONAL_CASE_MAPPINGS,
- PPUCD_TURKIC_CASE_FOLDING
- };
- U_NAMESPACE_BEGIN
- class U_TOOLUTIL_API PropertyNames {
- public:
- virtual ~PropertyNames();
- virtual int32_t getPropertyEnum(const char *name) const;
- virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const;
- };
- struct U_TOOLUTIL_API UniProps {
- UniProps();
- ~UniProps();
- int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; }
- UChar32 start, end;
- UBool binProps[UCHAR_BINARY_LIMIT];
- int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START];
- UVersionInfo age;
- UChar32 bmg, bpb;
- UChar32 scf, slc, stc, suc;
- int32_t digitValue;
- const char *numericValue;
- const char *name;
- const char *nameAlias;
- UnicodeString cf, lc, tc, uc;
- UnicodeSet scx;
- };
- class U_TOOLUTIL_API PreparsedUCD {
- public:
- enum LineType {
- /** No line, end of file. */
- NO_LINE,
- /** Empty line. (Might contain a comment.) */
- EMPTY_LINE,
- /** ucd;6.1.0 */
- UNICODE_VERSION_LINE,
- /** property;Binary;Alpha;Alphabetic */
- PROPERTY_LINE,
- /** binary;N;No;F;False */
- BINARY_LINE,
- /** value;gc;Zs;Space_Separator */
- VALUE_LINE,
- /** defaults;0000..10FFFF;age=NA;bc=L;... */
- DEFAULTS_LINE,
- /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
- BLOCK_LINE,
- /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
- CP_LINE,
- /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
- UNASSIGNED_LINE,
- /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
- ALG_NAMES_RANGE_LINE,
- LINE_TYPE_COUNT
- };
- /**
- * Constructor.
- * Prepare this object for a new, empty package.
- */
- PreparsedUCD(const char *filename, UErrorCode &errorCode);
- /** Destructor. */
- ~PreparsedUCD();
- /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */
- void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
- /**
- * Reads a line from the preparsed UCD file.
- * Splits the line by replacing each ';' with a NUL.
- */
- LineType readLine(UErrorCode &errorCode);
- /** Returns the number of the line read by readLine(). */
- int32_t getLineNumber() const { return lineNumber; }
- /** Returns the line's next field, or NULL. */
- const char *nextField();
- /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
- const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
- /** Returns TRUE if the current line has property values. */
- UBool lineHasPropertyValues() const {
- return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
- }
- /**
- * Parses properties from the current line.
- * Clears newValues and sets UProperty codes for property values mentioned
- * on the current line (as opposed to being inherited).
- * Returns a pointer to the filled-in UniProps, or NULL if something went wrong.
- * The returned UniProps are usable until the next line of the same type is read.
- */
- const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
- /**
- * Returns the code point range for the current algnamesrange line.
- * Calls & parses nextField().
- * Further nextField() calls will yield the range's type & prefix string.
- * Returns U_SUCCESS(errorCode).
- */
- UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
- private:
- UBool isLineBufferAvailable(int32_t i) {
- return defaultLineIndex!=i && blockLineIndex!=i;
- }
- /** Resets the field iterator and returns the line's first field (the line type field). */
- const char *firstField();
- UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
- UErrorCode &errorCode);
- UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
- UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
- void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
- void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
- static const int32_t kNumLineBuffers=3;
- PropertyNames *icuPnames; // owned
- const PropertyNames *pnames; // aliased
- FILE *file;
- int32_t defaultLineIndex, blockLineIndex, lineIndex;
- int32_t lineNumber;
- LineType lineType;
- char *fieldLimit;
- char *lineLimit;
- UVersionInfo ucdVersion;
- UniProps defaultProps, blockProps, cpProps;
- UnicodeSet blockValues;
- // Multiple lines so that default and block properties can maintain pointers
- // into their line buffers.
- char lines[kNumLineBuffers][4096];
- };
- U_NAMESPACE_END
- #endif // __PPUCD_H__
|