locdistance.h 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. // © 2019 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html#License
  3. // locdistance.h
  4. // created: 2019may08 Markus W. Scherer
  5. #ifndef __LOCDISTANCE_H__
  6. #define __LOCDISTANCE_H__
  7. #include "unicode/utypes.h"
  8. #include "unicode/bytestrie.h"
  9. #include "unicode/localematcher.h"
  10. #include "unicode/locid.h"
  11. #include "unicode/uobject.h"
  12. #include "lsr.h"
  13. U_NAMESPACE_BEGIN
  14. struct LocaleDistanceData;
  15. /**
  16. * Offline-built data for LocaleMatcher.
  17. * Mostly but not only the data for mapping locales to their maximized forms.
  18. */
  19. class LocaleDistance final : public UMemory {
  20. public:
  21. static const LocaleDistance *getSingleton(UErrorCode &errorCode);
  22. static int32_t shiftDistance(int32_t distance) {
  23. return distance << DISTANCE_SHIFT;
  24. }
  25. static int32_t getShiftedDistance(int32_t indexAndDistance) {
  26. return indexAndDistance & DISTANCE_MASK;
  27. }
  28. static double getDistanceDouble(int32_t indexAndDistance) {
  29. double shiftedDistance = getShiftedDistance(indexAndDistance);
  30. return shiftedDistance / (1 << DISTANCE_SHIFT);
  31. }
  32. static int32_t getIndex(int32_t indexAndDistance) {
  33. // assert indexAndDistance >= 0;
  34. return indexAndDistance >> INDEX_SHIFT;
  35. }
  36. /**
  37. * Finds the supported LSR with the smallest distance from the desired one.
  38. * Equivalent LSR subtags must be normalized into a canonical form.
  39. *
  40. * <p>Returns the index of the lowest-distance supported LSR in the high bits
  41. * (negative if none has a distance below the threshold),
  42. * and its distance (0..ABOVE_THRESHOLD) in the low bits.
  43. */
  44. int32_t getBestIndexAndDistance(const LSR &desired,
  45. const LSR **supportedLSRs, int32_t supportedLSRsLength,
  46. int32_t shiftedThreshold,
  47. ULocMatchFavorSubtag favorSubtag,
  48. ULocMatchDirection direction) const;
  49. UBool isParadigmLSR(const LSR &lsr) const;
  50. int32_t getDefaultScriptDistance() const {
  51. return defaultScriptDistance;
  52. }
  53. int32_t getDefaultDemotionPerDesiredLocale() const {
  54. return defaultDemotionPerDesiredLocale;
  55. }
  56. private:
  57. // The distance is shifted left to gain some fraction bits.
  58. static constexpr int32_t DISTANCE_SHIFT = 3;
  59. static constexpr int32_t DISTANCE_FRACTION_MASK = 7;
  60. // 7 bits for 0..100
  61. static constexpr int32_t DISTANCE_INT_SHIFT = 7;
  62. static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT;
  63. static constexpr int32_t DISTANCE_MASK = 0x3ff;
  64. // tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit
  65. static constexpr int32_t INDEX_NEG_1 = 0xfffffc00;
  66. static int32_t getDistanceFloor(int32_t indexAndDistance) {
  67. return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
  68. }
  69. LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
  70. LocaleDistance(const LocaleDistance &other) = delete;
  71. LocaleDistance &operator=(const LocaleDistance &other) = delete;
  72. static void initLocaleDistance(UErrorCode &errorCode);
  73. UBool isMatch(const LSR &desired, const LSR &supported,
  74. int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
  75. const LSR *pSupp = &supported;
  76. return getBestIndexAndDistance(
  77. desired, &pSupp, 1,
  78. shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0;
  79. }
  80. static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
  81. const char *desired, const char *supported);
  82. static int32_t getRegionPartitionsDistance(
  83. BytesTrie &iter, uint64_t startState,
  84. const char *desiredPartitions, const char *supportedPartitions,
  85. int32_t threshold);
  86. static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState);
  87. static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue);
  88. const char *partitionsForRegion(const LSR &lsr) const {
  89. // ill-formed region -> one non-matching string
  90. int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex];
  91. return partitionArrays[pIndex];
  92. }
  93. int32_t getDefaultRegionDistance() const {
  94. return defaultRegionDistance;
  95. }
  96. const XLikelySubtags &likelySubtags;
  97. // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
  98. // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
  99. // There is also a trie value for each subsequence of whole subtags.
  100. // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"".
  101. BytesTrie trie;
  102. /**
  103. * Maps each region to zero or more single-character partitions.
  104. */
  105. const uint8_t *regionToPartitionsIndex;
  106. const char **partitionArrays;
  107. /**
  108. * Used to get the paradigm region for a cluster, if there is one.
  109. */
  110. const LSR *paradigmLSRs;
  111. int32_t paradigmLSRsLength;
  112. int32_t defaultLanguageDistance;
  113. int32_t defaultScriptDistance;
  114. int32_t defaultRegionDistance;
  115. int32_t minRegionDistance;
  116. int32_t defaultDemotionPerDesiredLocale;
  117. };
  118. U_NAMESPACE_END
  119. #endif // __LOCDISTANCE_H__