// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001  synwee      Creation.
**********************************************************************
*/
#ifndef USRCHIMP_H
#define USRCHIMP_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"
/*
 * Masks and shifts that pick one weight field out of a 32-bit collation
 * order: primary in the top 16 bits, secondary in the next 8, tertiary in
 * the low 8.
 */
/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

/* NOTE(review): presumably the order value of an ignorable element; its use is outside this header */
#define UCOL_IGNORABLE 0

/*
 * get weights from a CE
 * Each macro evaluates its argument exactly once. The primary weight is
 * shifted down first and then masked, so the result is the same whether
 * `order` is signed or unsigned.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> UCOL_PRIMARYORDERSHIFT) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK) >> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* A CE is treated as a continuation when both bits of the marker (0xC0) are set. */
#define UCOL_CONTINUATION_MARKER 0xC0
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)

/**
 * This indicates an error has occurred during processing or there are no more CEs
 * to be returned.
 */
#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX)
- U_NAMESPACE_BEGIN
- class CollationElementIterator;
- class Collator;
/**
 * One entry of a PCEBuffer: a 64-bit collation element value plus two
 * 32-bit indices.
 * NOTE(review): the fields line up with the (ce, ixLow, ixHigh) arguments of
 * PCEBuffer::put(), so low/high are presumably the iterator index range the
 * element came from - confirm against the implementation.
 */
struct PCEI
{
    uint64_t ce;    /* collation element value */
    int32_t  low;   /* lower index of the pair */
    int32_t  high;  /* upper index of the pair */
};
- struct PCEBuffer
- {
- PCEI defaultBuffer[16];
- PCEI *buffer;
- int32_t bufferIndex;
- int32_t bufferSize;
- PCEBuffer();
- ~PCEBuffer();
- void reset();
- UBool isEmpty() const;
- void put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
- const PCEI *get();
- };
- class UCollationPCE : public UMemory {
- private:
- PCEBuffer pceBuffer;
- CollationElementIterator *cei;
- UCollationStrength strength;
- UBool toShift;
- UBool isShifted;
- uint32_t variableTop;
- public:
- UCollationPCE(UCollationElements *elems);
- UCollationPCE(CollationElementIterator *iter);
- ~UCollationPCE();
- void init(UCollationElements *elems);
- void init(CollationElementIterator *iter);
- /**
- * Get the processed ordering priority of the next collation element in the text.
- * A single character may contain more than one collation element.
- *
- * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
- * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
- * @param status A pointer to an UErrorCode to receive any errors.
- * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
- * if an error has occured or if the end of string has been reached
- */
- int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
- /**
- * Get the processed ordering priority of the previous collation element in the text.
- * A single character may contain more than one collation element.
- *
- * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
- * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
- * @param status A pointer to an UErrorCode to receive any errors. Noteably
- * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
- * buffer has been exhausted.
- * @return The previous collation elements ordering, otherwise returns
- * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
- * string has been reached.
- */
- int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
- private:
- void init(const Collator &coll);
- uint64_t processCE(uint32_t ce);
- };
- U_NAMESPACE_END
/* Element count of the fixed in-struct arrays: UPattern::cesBuffer and
 * UPattern::pcesBuffer, and the canonical prefix/suffix accent buffers of
 * UStringSearch. */
#define INITIAL_ARRAY_SIZE_ 256
/* Element count of the UPattern::shift and UPattern::backShift tables. */
#define MAX_TABLE_SIZE_ 257
- struct USearch {
- // required since collation element iterator does not have a getText API
- const UChar *text;
- int32_t textLength; // exact length
- UBool isOverlap;
- UBool isCanonicalMatch;
- int16_t elementComparisonType;
- UBreakIterator *internalBreakIter; //internal character breakiterator
- UBreakIterator *breakIter;
- // value USEARCH_DONE is the default value
- // if we are not at the start of the text or the end of the text,
- // depending on the iteration direction and matchedIndex is USEARCH_DONE
- // it means that we can't find any more matches in that particular direction
- int32_t matchedIndex;
- int32_t matchedLength;
- UBool isForwardSearching;
- UBool reset;
- };
- struct UPattern {
- const UChar *text;
- int32_t textLength; // exact length
- // length required for backwards ce comparison
- int32_t cesLength;
- int32_t *ces;
- int32_t cesBuffer[INITIAL_ARRAY_SIZE_];
- int32_t pcesLength;
- int64_t *pces;
- int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
- UBool hasPrefixAccents;
- UBool hasSuffixAccents;
- int16_t defaultShiftSize;
- int16_t shift[MAX_TABLE_SIZE_];
- int16_t backShift[MAX_TABLE_SIZE_];
- };
- struct UStringSearch {
- struct USearch *search;
- struct UPattern pattern;
- const UCollator *collator;
- const icu::Normalizer2 *nfd;
- // positions within the collation element iterator is used to determine
- // if we are at the start of the text.
- UCollationElements *textIter;
- icu::UCollationPCE *textProcessedIter;
- // utility collation element, used throughout program for temporary
- // iteration.
- UCollationElements *utilIter;
- UBool ownCollator;
- UCollationStrength strength;
- uint32_t ceMask;
- uint32_t variableTop;
- UBool toShift;
- UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
- UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
- };
- /**
- * Exact matches without checking for the ends for extra accents.
- * The match after the position within the collation element iterator is to be
- * found.
- * After a match is found the offset in the collation element iterator will be
- * shifted to the start of the match.
- * Implementation note:
- * For tertiary we can't use the collator->tertiaryMask, that is a
- * preprocessed mask that takes into account case options. since we are only
- * concerned with exact matches, we don't need that.
- * Alternate handling - since only the 16 most significant digits is only used,
- * we can safely do a compare without masking if the ce is a variable, we mask
- * and get only the primary values no shifting to quartenary is required since
- * all primary values less than variabletop will need to be masked off anyway.
- * If the end character is composite and the pattern ce does not match the text
- * ce, we skip it until we find a match in the end composite character or when
- * it has passed the character. This is so that we can match pattern "a" with
- * the text "\u00e6"
- * @param strsrch string search data
- * @param status error status if any
- * @return TRUE if an exact match is found, FALSE otherwise
- */
- U_CFUNC
- UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
- /**
- * Canonical matches.
- * According to the definition, matches found here will include the whole span
- * of beginning and ending accents if it overlaps that region.
- * @param strsrch string search data
- * @param status error status if any
- * @return TRUE if a canonical match is found, FALSE otherwise
- */
- U_CFUNC
- UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
- /**
- * Gets the previous match.
- * Comments follows from handleNextExact
- * @param strsrch string search data
- * @param status error status if any
- * @return True if a exact math is found, FALSE otherwise.
- */
- U_CFUNC
- UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
- /**
- * Canonical matches.
- * According to the definition, matches found here will include the whole span
- * of beginning and ending accents if it overlaps that region.
- * @param strsrch string search data
- * @param status error status if any
- * @return TRUE if a canonical match is found, FALSE otherwise
- */
- U_CFUNC
- UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
- UErrorCode *status);
- #endif /* #if !UCONFIG_NO_COLLATION */
- #endif
|