genmbcs.h 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2000-2008, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: genmbcs.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2000jul10
  16. * created by: Markus W. Scherer
  17. */
  18. #ifndef __GENMBCS_H__
  19. #define __GENMBCS_H__
  20. #include "makeconv.h"
  enum {
      /*
       * TODO: Consider reusing the constants from ucnvmbcs.h.
       * Not every value has to match the runtime, though; for example the
       * xxx_UTF8_MAX values may differ. (In particular, SBCS_UTF8_MAX may be
       * higher in makeconv than in the runtime code, because it affects only
       * a small number of .cnv files [if any] whereas the runtime value
       * affects all runtime UConverterSharedData objects.)
       */

      /* ---- stage 1/2 geometry ---- */
      MBCS_STAGE_2_SHIFT = 4,
      MBCS_STAGE_2_BLOCK_SIZE_SHIFT = 6,      /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
      MBCS_STAGE_2_BLOCK_SIZE = 1 << MBCS_STAGE_2_BLOCK_SIZE_SHIFT,   /* =0x40=64: 6 bits in stage 2 */
      MBCS_STAGE_2_BLOCK_MASK = MBCS_STAGE_2_BLOCK_SIZE - 1,          /* =0x3f, applied after shifting by MBCS_STAGE_2_SHIFT */

      MBCS_STAGE_1_SHIFT = 10,
      MBCS_STAGE_1_BMP_SIZE = 0x10000 >> MBCS_STAGE_1_SHIFT,  /* =0x40=64: one entry per 1k code points on the BMP */
      MBCS_STAGE_1_SIZE = 0x110000 >> MBCS_STAGE_1_SHIFT,     /* =0x440=17*64: one entry per 1k code points */

      /* stages 1 & 2 share a 16-bit-indexed array, so stage 2 gets what stage 1 leaves over */
      MBCS_STAGE_2_SIZE = 0x10000 - MBCS_STAGE_1_SIZE,        /* =0xfbc0 */
      MBCS_MAX_STAGE_2_TOP = MBCS_STAGE_2_SIZE,
      MBCS_STAGE_2_MAX_BLOCKS = MBCS_STAGE_2_SIZE >> MBCS_STAGE_2_BLOCK_SIZE_SHIFT,

      MBCS_STAGE_2_ALL_UNASSIGNED_INDEX = 0,  /* stage 1 entry for the all-unassigned stage 2 block */
      MBCS_STAGE_2_FIRST_ASSIGNED = MBCS_STAGE_2_BLOCK_SIZE,  /* first stage 2 block after the all-unassigned one */

      /* ---- stage 3 geometry ---- */
      MBCS_STAGE_3_BLOCK_SIZE = 1 << 4,       /* =16: 4 bits in stage 3 */
      MBCS_STAGE_3_BLOCK_MASK = MBCS_STAGE_3_BLOCK_SIZE - 1,  /* =0xf */
      MBCS_STAGE_3_FIRST_ASSIGNED = MBCS_STAGE_3_BLOCK_SIZE,  /* first stage 3 block after the all-unassigned one */
      MBCS_STAGE_3_GRANULARITY = 1 << 4,      /* =16: MBCS stage 2 indexes are shifted left 4 */

      MBCS_STAGE_3_SBCS_SIZE = 0x10000,       /* max 64k mappings for SBCS */
      MBCS_STAGE_3_MBCS_SIZE = MBCS_STAGE_3_GRANULARITY * 0x10000,    /* max mappings for MBCS */

      /*
       * SBCS_UTF8_MAX: highest code point that uses the UTF-8-friendly SBCS
       * data structures. Allowed values are 0x01ff..0xffff in steps of 0x100.
       *
       * In contrast to MBCS, this only influences the stage 3 block
       * allocation size; no additional stage 1/2 table is stored in the
       * .cnv file.
       * The value should be no lower than 0x7ff so that 2-byte UTF-8 is
       * covered. 0xfff additionally covers a number of other small scripts
       * that have legacy charsets (such as Thai).
       * Values up to 0x1fff are harmless and potentially useful: that range
       * holds small-script blocks which usually have either dense mappings
       * or none at all.
       * From U+2000 on there are mostly symbols and format characters with a
       * low density of SBCS mappings, which would waste more stage 3 entries
       * with the larger block size.
       */
      SBCS_UTF8_MAX = 0x1fff,

      /*
       * MBCS_UTF8_MAX: highest code point that uses the UTF-8-friendly MBCS
       * data structures. Allowed values are 0x01ff..0xffff in steps of 0x100.
       *
       * With 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8
       * stage table on extreme input data; the function checks for this
       * overflow.
       *
       * 0xd7ff is chosen to take in the majority of common characters,
       * including Unihan and Hangul. From U+d800 on there are mostly
       * surrogates, private use codes, compatibility characters, etc.
       * Larger values make MBCS .cnv files slightly larger.
       */
      MBCS_UTF8_MAX = 0xd7ff,
      MBCS_UTF8_LIMIT = MBCS_UTF8_MAX + 1,    /* =0xd800 */

      MBCS_UTF8_STAGE_SHIFT = 6,
      MBCS_UTF8_STAGE_3_BLOCK_SIZE = 1 << 6,  /* =0x40=64: 6 bits from the last trail byte */
      MBCS_UTF8_STAGE_3_BLOCK_MASK = MBCS_UTF8_STAGE_3_BLOCK_SIZE - 1,    /* =0x3f */

      /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */
      MBCS_UTF8_STAGE_SIZE = MBCS_UTF8_LIMIT >> MBCS_UTF8_STAGE_SHIFT,    /* =0x360 */

      /*
       * MBCS_FROM_U_EXT_FLAG: UCMapping.f bit for base table mappings that
       * fit into the base toU table but need to go into the extension
       * fromU table.
       */
      MBCS_FROM_U_EXT_FLAG = 0x10,
      MBCS_FROM_U_EXT_MASK = 0x0f,

      /* =4: number of regular stage 3 blocks for the final UTF-8 trail byte */
      MBCS_UTF8_STAGE_3_BLOCKS = MBCS_UTF8_STAGE_3_BLOCK_SIZE / MBCS_STAGE_3_BLOCK_SIZE,

      MBCS_MAX_FALLBACK_COUNT = 8192
  };
  89. U_CFUNC NewConverter *
  90. MBCSOpen(UCMFile *ucm);
  /*
   * Opaque handle: the struct tag is introduced and aliased here in a single
   * declaration; its members are not defined in this header.
   */
  typedef struct MBCSData MBCSData;
  93. /*
  94. * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode()
  95. * for creating an extension-only file.
  96. * Assume maxCharLength>1.
  97. */
  98. U_CFUNC const MBCSData *
  99. MBCSGetDummy(void);
  100. /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
  101. U_CFUNC UBool
  102. MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
  103. const uint8_t *bytes, int32_t length,
  104. UChar32 c, int8_t flag);
  105. U_CFUNC NewConverter *
  106. CnvExtOpen(UCMFile *ucm);
  107. #endif /* __GENMBCS_H__ */