unisetspan.h 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
*   Copyright (C) 2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  unisetspan.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2007mar01
*   created by: Markus W. Scherer
*/
  18. #ifndef __UNISETSPAN_H__
  19. #define __UNISETSPAN_H__
  20. #include "unicode/utypes.h"
  21. #include "unicode/uniset.h"
  22. U_NAMESPACE_BEGIN
  23. /*
  24. * Implement span() etc. for a set with strings.
  25. * Avoid recursion because of its exponential complexity.
  26. * Instead, try multiple paths at once and track them with an IndexList.
  27. */
  28. class UnicodeSetStringSpan : public UMemory {
  29. public:
  30. /*
  31. * Which span() variant will be used?
  32. * The object is either built for one variant and used once,
  33. * or built for all and may be used many times.
  34. */
  35. enum {
  36. FWD = 0x20,
  37. BACK = 0x10,
  38. UTF16 = 8,
  39. UTF8 = 4,
  40. CONTAINED = 2,
  41. NOT_CONTAINED = 1,
  42. ALL = 0x3f,
  43. FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED,
  44. FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED,
  45. FWD_UTF8_CONTAINED = FWD | UTF8 | CONTAINED,
  46. FWD_UTF8_NOT_CONTAINED = FWD | UTF8 | NOT_CONTAINED,
  47. BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED,
  48. BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
  49. BACK_UTF8_CONTAINED = BACK | UTF8 | CONTAINED,
  50. BACK_UTF8_NOT_CONTAINED = BACK | UTF8 | NOT_CONTAINED
  51. };
  52. UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
  53. // Copy constructor. Assumes which==ALL for a frozen set.
  54. UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
  55. ~UnicodeSetStringSpan();
  56. /*
  57. * Do the strings need to be checked in span() etc.?
  58. * @return TRUE if strings need to be checked (call span() here),
  59. * FALSE if not (use a BMPSet for best performance).
  60. */
  61. inline UBool needsStringSpanUTF16();
  62. inline UBool needsStringSpanUTF8();
  63. // For fast UnicodeSet::contains(c).
  64. inline UBool contains(UChar32 c) const;
  65. int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
  66. int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
  67. int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
  68. int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
  69. private:
  70. // Special spanLength byte values.
  71. enum {
  72. // The spanLength is >=0xfe.
  73. LONG_SPAN=0xfe,
  74. // All code points in the string are contained in the parent set.
  75. ALL_CP_CONTAINED=0xff
  76. };
  77. // Add a starting or ending string character to the spanNotSet
  78. // so that a character span ends before any string.
  79. void addToSpanNotSet(UChar32 c);
  80. int32_t spanNot(const UChar *s, int32_t length) const;
  81. int32_t spanNotBack(const UChar *s, int32_t length) const;
  82. int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
  83. int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
  84. // Set for span(). Same as parent but without strings.
  85. UnicodeSet spanSet;
  86. // Set for span(not contained).
  87. // Same as spanSet, plus characters that start or end strings.
  88. UnicodeSet *pSpanNotSet;
  89. // The strings of the parent set.
  90. const UVector &strings;
  91. // Pointer to the UTF-8 string lengths.
  92. // Also pointer to further allocated storage for meta data and
  93. // UTF-8 string contents as necessary.
  94. int32_t *utf8Lengths;
  95. // Pointer to the part of the (utf8Lengths) memory block that stores
  96. // the lengths of span(), spanBack() etc. for each string.
  97. uint8_t *spanLengths;
  98. // Pointer to the part of the (utf8Lengths) memory block that stores
  99. // the UTF-8 versions of the parent set's strings.
  100. uint8_t *utf8;
  101. // Number of bytes for all UTF-8 versions of strings together.
  102. int32_t utf8Length;
  103. // Maximum lengths of relevant strings.
  104. int32_t maxLength16;
  105. int32_t maxLength8;
  106. // Set up for all variants of span()?
  107. UBool all;
  108. // Memory for small numbers and lengths of strings.
  109. // For example, for 8 strings:
  110. // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
  111. // = 112 bytes = int32_t[28].
  112. int32_t staticLengths[32];
  113. };
  114. UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
  115. return (UBool)(maxLength16!=0);
  116. }
  117. UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
  118. return (UBool)(maxLength8!=0);
  119. }
  120. UBool UnicodeSetStringSpan::contains(UChar32 c) const {
  121. return spanSet.contains(c);
  122. }
  123. U_NAMESPACE_END
  124. #endif