_c_m_a_p.py 60 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576
  1. from fontTools.misc.textTools import bytesjoin, safeEval, readHex
  2. from fontTools.misc.encodingTools import getEncoding
  3. from fontTools.ttLib import getSearchRange
  4. from fontTools.unicode import Unicode
  5. from . import DefaultTable
  6. import sys
  7. import struct
  8. import array
  9. import logging
  10. log = logging.getLogger(__name__)
  11. def _make_map(font, chars, gids):
  12. assert len(chars) == len(gids)
  13. glyphNames = font.getGlyphNameMany(gids)
  14. cmap = {}
  15. for char, gid, name in zip(chars, gids, glyphNames):
  16. if gid == 0:
  17. continue
  18. cmap[char] = name
  19. return cmap
  20. class table__c_m_a_p(DefaultTable.DefaultTable):
  21. """Character to Glyph Index Mapping Table
  22. This class represents the `cmap <https://docs.microsoft.com/en-us/typography/opentype/spec/cmap>`_
  23. table, which maps between input characters (in Unicode or other system encodings)
  24. and glyphs within the font. The ``cmap`` table contains one or more subtables
  25. which determine the mapping of of characters to glyphs across different platforms
  26. and encoding systems.
  27. ``table__c_m_a_p`` objects expose an accessor ``.tables`` which provides access
  28. to the subtables, although it is normally easier to retrieve individual subtables
  29. through the utility methods described below. To add new subtables to a font,
  30. first determine the subtable format (if in doubt use format 4 for glyphs within
  31. the BMP, format 12 for glyphs outside the BMP, and format 14 for Unicode Variation
  32. Sequences) construct subtable objects with ``CmapSubtable.newSubtable(format)``,
  33. and append them to the ``.tables`` list.
  34. Within a subtable, the mapping of characters to glyphs is provided by the ``.cmap``
  35. attribute.
  36. Example::
  37. cmap4_0_3 = CmapSubtable.newSubtable(4)
  38. cmap4_0_3.platformID = 0
  39. cmap4_0_3.platEncID = 3
  40. cmap4_0_3.language = 0
  41. cmap4_0_3.cmap = { 0xC1: "Aacute" }
  42. cmap = newTable("cmap")
  43. cmap.tableVersion = 0
  44. cmap.tables = [cmap4_0_3]
  45. """
  46. def getcmap(self, platformID, platEncID):
  47. """Returns the first subtable which matches the given platform and encoding.
  48. Args:
  49. platformID (int): The platform ID. Use 0 for Unicode, 1 for Macintosh
  50. (deprecated for new fonts), 2 for ISO (deprecated) and 3 for Windows.
  51. encodingID (int): Encoding ID. Interpretation depends on the platform ID.
  52. See the OpenType specification for details.
  53. Returns:
  54. An object which is a subclass of :py:class:`CmapSubtable` if a matching
  55. subtable is found within the font, or ``None`` otherwise.
  56. """
  57. for subtable in self.tables:
  58. if subtable.platformID == platformID and subtable.platEncID == platEncID:
  59. return subtable
  60. return None # not found
  61. def getBestCmap(
  62. self,
  63. cmapPreferences=(
  64. (3, 10),
  65. (0, 6),
  66. (0, 4),
  67. (3, 1),
  68. (0, 3),
  69. (0, 2),
  70. (0, 1),
  71. (0, 0),
  72. ),
  73. ):
  74. """Returns the 'best' Unicode cmap dictionary available in the font
  75. or ``None``, if no Unicode cmap subtable is available.
  76. By default it will search for the following (platformID, platEncID)
  77. pairs in order::
  78. (3, 10), # Windows Unicode full repertoire
  79. (0, 6), # Unicode full repertoire (format 13 subtable)
  80. (0, 4), # Unicode 2.0 full repertoire
  81. (3, 1), # Windows Unicode BMP
  82. (0, 3), # Unicode 2.0 BMP
  83. (0, 2), # Unicode ISO/IEC 10646
  84. (0, 1), # Unicode 1.1
  85. (0, 0) # Unicode 1.0
  86. This particular order matches what HarfBuzz uses to choose what
  87. subtable to use by default. This order prefers the largest-repertoire
  88. subtable, and among those, prefers the Windows-platform over the
  89. Unicode-platform as the former has wider support.
  90. This order can be customized via the ``cmapPreferences`` argument.
  91. """
  92. for platformID, platEncID in cmapPreferences:
  93. cmapSubtable = self.getcmap(platformID, platEncID)
  94. if cmapSubtable is not None:
  95. return cmapSubtable.cmap
  96. return None # None of the requested cmap subtables were found
  97. def buildReversed(self):
  98. """Builds a reverse mapping dictionary
  99. Iterates over all Unicode cmap tables and returns a dictionary mapping
  100. glyphs to sets of codepoints, such as::
  101. {
  102. 'one': {0x31}
  103. 'A': {0x41,0x391}
  104. }
  105. The values are sets of Unicode codepoints because
  106. some fonts map different codepoints to the same glyph.
  107. For example, ``U+0041 LATIN CAPITAL LETTER A`` and ``U+0391
  108. GREEK CAPITAL LETTER ALPHA`` are sometimes the same glyph.
  109. """
  110. result = {}
  111. for subtable in self.tables:
  112. if subtable.isUnicode():
  113. for codepoint, name in subtable.cmap.items():
  114. result.setdefault(name, set()).add(codepoint)
  115. return result
  116. def decompile(self, data, ttFont):
  117. tableVersion, numSubTables = struct.unpack(">HH", data[:4])
  118. self.tableVersion = int(tableVersion)
  119. self.tables = tables = []
  120. seenOffsets = {}
  121. for i in range(numSubTables):
  122. platformID, platEncID, offset = struct.unpack(
  123. ">HHl", data[4 + i * 8 : 4 + (i + 1) * 8]
  124. )
  125. platformID, platEncID = int(platformID), int(platEncID)
  126. format, length = struct.unpack(">HH", data[offset : offset + 4])
  127. if format in [8, 10, 12, 13]:
  128. format, reserved, length = struct.unpack(
  129. ">HHL", data[offset : offset + 8]
  130. )
  131. elif format in [14]:
  132. format, length = struct.unpack(">HL", data[offset : offset + 6])
  133. if not length:
  134. log.error(
  135. "cmap subtable is reported as having zero length: platformID %s, "
  136. "platEncID %s, format %s offset %s. Skipping table.",
  137. platformID,
  138. platEncID,
  139. format,
  140. offset,
  141. )
  142. continue
  143. table = CmapSubtable.newSubtable(format)
  144. table.platformID = platformID
  145. table.platEncID = platEncID
  146. # Note that by default we decompile only the subtable header info;
  147. # any other data gets decompiled only when an attribute of the
  148. # subtable is referenced.
  149. table.decompileHeader(data[offset : offset + int(length)], ttFont)
  150. if offset in seenOffsets:
  151. table.data = None # Mark as decompiled
  152. table.cmap = tables[seenOffsets[offset]].cmap
  153. else:
  154. seenOffsets[offset] = i
  155. tables.append(table)
  156. if ttFont.lazy is False: # Be lazy for None and True
  157. self.ensureDecompiled()
  158. def ensureDecompiled(self, recurse=False):
  159. # The recurse argument is unused, but part of the signature of
  160. # ensureDecompiled across the library.
  161. for st in self.tables:
  162. st.ensureDecompiled()
  163. def compile(self, ttFont):
  164. self.tables.sort() # sort according to the spec; see CmapSubtable.__lt__()
  165. numSubTables = len(self.tables)
  166. totalOffset = 4 + 8 * numSubTables
  167. data = struct.pack(">HH", self.tableVersion, numSubTables)
  168. tableData = b""
  169. seen = (
  170. {}
  171. ) # Some tables are the same object reference. Don't compile them twice.
  172. done = (
  173. {}
  174. ) # Some tables are different objects, but compile to the same data chunk
  175. for table in self.tables:
  176. offset = seen.get(id(table.cmap))
  177. if offset is None:
  178. chunk = table.compile(ttFont)
  179. offset = done.get(chunk)
  180. if offset is None:
  181. offset = seen[id(table.cmap)] = done[chunk] = totalOffset + len(
  182. tableData
  183. )
  184. tableData = tableData + chunk
  185. data = data + struct.pack(">HHl", table.platformID, table.platEncID, offset)
  186. return data + tableData
  187. def toXML(self, writer, ttFont):
  188. writer.simpletag("tableVersion", version=self.tableVersion)
  189. writer.newline()
  190. for table in self.tables:
  191. table.toXML(writer, ttFont)
  192. def fromXML(self, name, attrs, content, ttFont):
  193. if name == "tableVersion":
  194. self.tableVersion = safeEval(attrs["version"])
  195. return
  196. if name[:12] != "cmap_format_":
  197. return
  198. if not hasattr(self, "tables"):
  199. self.tables = []
  200. format = safeEval(name[12:])
  201. table = CmapSubtable.newSubtable(format)
  202. table.platformID = safeEval(attrs["platformID"])
  203. table.platEncID = safeEval(attrs["platEncID"])
  204. table.fromXML(name, attrs, content, ttFont)
  205. self.tables.append(table)
  206. class CmapSubtable(object):
  207. """Base class for all cmap subtable formats.
  208. Subclasses which handle the individual subtable formats are named
  209. ``cmap_format_0``, ``cmap_format_2`` etc. Use :py:meth:`getSubtableClass`
  210. to retrieve the concrete subclass, or :py:meth:`newSubtable` to get a
  211. new subtable object for a given format.
  212. The object exposes a ``.cmap`` attribute, which contains a dictionary mapping
  213. character codepoints to glyph names.
  214. """
  215. @staticmethod
  216. def getSubtableClass(format):
  217. """Return the subtable class for a format."""
  218. return cmap_classes.get(format, cmap_format_unknown)
  219. @staticmethod
  220. def newSubtable(format):
  221. """Return a new instance of a subtable for the given format
  222. ."""
  223. subtableClass = CmapSubtable.getSubtableClass(format)
  224. return subtableClass(format)
  225. def __init__(self, format):
  226. self.format = format
  227. self.data = None
  228. self.ttFont = None
  229. self.platformID = None #: The platform ID of this subtable
  230. self.platEncID = None #: The encoding ID of this subtable (interpretation depends on ``platformID``)
  231. self.language = (
  232. None #: The language ID of this subtable (Macintosh platform only)
  233. )
  234. def ensureDecompiled(self, recurse=False):
  235. # The recurse argument is unused, but part of the signature of
  236. # ensureDecompiled across the library.
  237. if self.data is None:
  238. return
  239. self.decompile(None, None) # use saved data.
  240. self.data = None # Once this table has been decompiled, make sure we don't
  241. # just return the original data. Also avoids recursion when
  242. # called with an attribute that the cmap subtable doesn't have.
  243. def __getattr__(self, attr):
  244. # allow lazy decompilation of subtables.
  245. if attr[:2] == "__": # don't handle requests for member functions like '__lt__'
  246. raise AttributeError(attr)
  247. if self.data is None:
  248. raise AttributeError(attr)
  249. self.ensureDecompiled()
  250. return getattr(self, attr)
  251. def decompileHeader(self, data, ttFont):
  252. format, length, language = struct.unpack(">HHH", data[:6])
  253. assert (
  254. len(data) == length
  255. ), "corrupt cmap table format %d (data length: %d, header length: %d)" % (
  256. format,
  257. len(data),
  258. length,
  259. )
  260. self.format = int(format)
  261. self.length = int(length)
  262. self.language = int(language)
  263. self.data = data[6:]
  264. self.ttFont = ttFont
  265. def toXML(self, writer, ttFont):
  266. writer.begintag(
  267. self.__class__.__name__,
  268. [
  269. ("platformID", self.platformID),
  270. ("platEncID", self.platEncID),
  271. ("language", self.language),
  272. ],
  273. )
  274. writer.newline()
  275. codes = sorted(self.cmap.items())
  276. self._writeCodes(codes, writer)
  277. writer.endtag(self.__class__.__name__)
  278. writer.newline()
  279. def getEncoding(self, default=None):
  280. """Returns the Python encoding name for this cmap subtable based on its platformID,
  281. platEncID, and language. If encoding for these values is not known, by default
  282. ``None`` is returned. That can be overridden by passing a value to the ``default``
  283. argument.
  284. Note that if you want to choose a "preferred" cmap subtable, most of the time
  285. ``self.isUnicode()`` is what you want as that one only returns true for the modern,
  286. commonly used, Unicode-compatible triplets, not the legacy ones.
  287. """
  288. return getEncoding(self.platformID, self.platEncID, self.language, default)
  289. def isUnicode(self):
  290. """Returns true if the characters are interpreted as Unicode codepoints."""
  291. return self.platformID == 0 or (
  292. self.platformID == 3 and self.platEncID in [0, 1, 10]
  293. )
  294. def isSymbol(self):
  295. """Returns true if the subtable is for the Symbol encoding (3,0)"""
  296. return self.platformID == 3 and self.platEncID == 0
  297. def _writeCodes(self, codes, writer):
  298. isUnicode = self.isUnicode()
  299. for code, name in codes:
  300. writer.simpletag("map", code=hex(code), name=name)
  301. if isUnicode:
  302. writer.comment(Unicode[code])
  303. writer.newline()
  304. def __lt__(self, other):
  305. if not isinstance(other, CmapSubtable):
  306. return NotImplemented
  307. # implemented so that list.sort() sorts according to the spec.
  308. selfTuple = (
  309. getattr(self, "platformID", None),
  310. getattr(self, "platEncID", None),
  311. getattr(self, "language", None),
  312. self.__dict__,
  313. )
  314. otherTuple = (
  315. getattr(other, "platformID", None),
  316. getattr(other, "platEncID", None),
  317. getattr(other, "language", None),
  318. other.__dict__,
  319. )
  320. return selfTuple < otherTuple
  321. class cmap_format_0(CmapSubtable):
  322. def decompile(self, data, ttFont):
  323. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  324. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  325. if data is not None and ttFont is not None:
  326. self.decompileHeader(data, ttFont)
  327. else:
  328. assert (
  329. data is None and ttFont is None
  330. ), "Need both data and ttFont arguments"
  331. data = (
  332. self.data
  333. ) # decompileHeader assigns the data after the header to self.data
  334. assert 262 == self.length, "Format 0 cmap subtable not 262 bytes"
  335. gids = array.array("B")
  336. gids.frombytes(self.data)
  337. charCodes = list(range(len(gids)))
  338. self.cmap = _make_map(self.ttFont, charCodes, gids)
  339. def compile(self, ttFont):
  340. if self.data:
  341. return struct.pack(">HHH", 0, 262, self.language) + self.data
  342. cmap = self.cmap
  343. assert set(cmap.keys()).issubset(range(256))
  344. getGlyphID = ttFont.getGlyphID
  345. valueList = [getGlyphID(cmap[i]) if i in cmap else 0 for i in range(256)]
  346. gids = array.array("B", valueList)
  347. data = struct.pack(">HHH", 0, 262, self.language) + gids.tobytes()
  348. assert len(data) == 262
  349. return data
  350. def fromXML(self, name, attrs, content, ttFont):
  351. self.language = safeEval(attrs["language"])
  352. if not hasattr(self, "cmap"):
  353. self.cmap = {}
  354. cmap = self.cmap
  355. for element in content:
  356. if not isinstance(element, tuple):
  357. continue
  358. name, attrs, content = element
  359. if name != "map":
  360. continue
  361. cmap[safeEval(attrs["code"])] = attrs["name"]
# struct format of one 8-byte format 2 subheader record, big-endian:
# firstCode (H), entryCount (H), idDelta (h, signed), idRangeOffset (H).
subHeaderFormat = ">HHhH"
  363. class SubHeader(object):
  364. def __init__(self):
  365. self.firstCode = None
  366. self.entryCount = None
  367. self.idDelta = None
  368. self.idRangeOffset = None
  369. self.glyphIndexArray = []
  370. class cmap_format_2(CmapSubtable):
  371. def setIDDelta(self, subHeader):
  372. subHeader.idDelta = 0
  373. # find the minGI which is not zero.
  374. minGI = subHeader.glyphIndexArray[0]
  375. for gid in subHeader.glyphIndexArray:
  376. if (gid != 0) and (gid < minGI):
  377. minGI = gid
  378. # The lowest gid in glyphIndexArray, after subtracting idDelta, must be 1.
  379. # idDelta is a short, and must be between -32K and 32K. minGI can be between 1 and 64K.
  380. # We would like to pick an idDelta such that the first glyphArray GID is 1,
  381. # so that we are more likely to be able to combine glypharray GID subranges.
  382. # This means that we have a problem when minGI is > 32K
  383. # Since the final gi is reconstructed from the glyphArray GID by:
  384. # (short)finalGID = (gid + idDelta) % 0x10000),
  385. # we can get from a glypharray GID of 1 to a final GID of 65K by subtracting 2, and casting the
  386. # negative number to an unsigned short.
  387. if minGI > 1:
  388. if minGI > 0x7FFF:
  389. subHeader.idDelta = -(0x10000 - minGI) - 1
  390. else:
  391. subHeader.idDelta = minGI - 1
  392. idDelta = subHeader.idDelta
  393. for i in range(subHeader.entryCount):
  394. gid = subHeader.glyphIndexArray[i]
  395. if gid > 0:
  396. subHeader.glyphIndexArray[i] = gid - idDelta
  397. def decompile(self, data, ttFont):
  398. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  399. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  400. if data is not None and ttFont is not None:
  401. self.decompileHeader(data, ttFont)
  402. else:
  403. assert (
  404. data is None and ttFont is None
  405. ), "Need both data and ttFont arguments"
  406. data = (
  407. self.data
  408. ) # decompileHeader assigns the data after the header to self.data
  409. subHeaderKeys = []
  410. maxSubHeaderindex = 0
  411. # get the key array, and determine the number of subHeaders.
  412. allKeys = array.array("H")
  413. allKeys.frombytes(data[:512])
  414. data = data[512:]
  415. if sys.byteorder != "big":
  416. allKeys.byteswap()
  417. subHeaderKeys = [key // 8 for key in allKeys]
  418. maxSubHeaderindex = max(subHeaderKeys)
  419. # Load subHeaders
  420. subHeaderList = []
  421. pos = 0
  422. for i in range(maxSubHeaderindex + 1):
  423. subHeader = SubHeader()
  424. (
  425. subHeader.firstCode,
  426. subHeader.entryCount,
  427. subHeader.idDelta,
  428. subHeader.idRangeOffset,
  429. ) = struct.unpack(subHeaderFormat, data[pos : pos + 8])
  430. pos += 8
  431. giDataPos = pos + subHeader.idRangeOffset - 2
  432. giList = array.array("H")
  433. giList.frombytes(data[giDataPos : giDataPos + subHeader.entryCount * 2])
  434. if sys.byteorder != "big":
  435. giList.byteswap()
  436. subHeader.glyphIndexArray = giList
  437. subHeaderList.append(subHeader)
  438. # How this gets processed.
  439. # Charcodes may be one or two bytes.
  440. # The first byte of a charcode is mapped through the subHeaderKeys, to select
  441. # a subHeader. For any subheader but 0, the next byte is then mapped through the
  442. # selected subheader. If subheader Index 0 is selected, then the byte itself is
  443. # mapped through the subheader, and there is no second byte.
  444. # Then assume that the subsequent byte is the first byte of the next charcode,and repeat.
  445. #
  446. # Each subheader references a range in the glyphIndexArray whose length is entryCount.
  447. # The range in glyphIndexArray referenced by a sunheader may overlap with the range in glyphIndexArray
  448. # referenced by another subheader.
  449. # The only subheader that will be referenced by more than one first-byte value is the subheader
  450. # that maps the entire range of glyphID values to glyphIndex 0, e.g notdef:
  451. # {firstChar 0, EntryCount 0,idDelta 0,idRangeOffset xx}
  452. # A byte being mapped though a subheader is treated as in index into a mapping of array index to font glyphIndex.
  453. # A subheader specifies a subrange within (0...256) by the
  454. # firstChar and EntryCount values. If the byte value is outside the subrange, then the glyphIndex is zero
  455. # (e.g. glyph not in font).
  456. # If the byte index is in the subrange, then an offset index is calculated as (byteIndex - firstChar).
  457. # The index to glyphIndex mapping is a subrange of the glyphIndexArray. You find the start of the subrange by
  458. # counting idRangeOffset bytes from the idRangeOffset word. The first value in this subrange is the
  459. # glyphIndex for the index firstChar. The offset index should then be used in this array to get the glyphIndex.
  460. # Example for Logocut-Medium
  461. # first byte of charcode = 129; selects subheader 1.
  462. # subheader 1 = {firstChar 64, EntryCount 108,idDelta 42,idRangeOffset 0252}
  463. # second byte of charCode = 66
  464. # the index offset = 66-64 = 2.
  465. # The subrange of the glyphIndexArray starting at 0x0252 bytes from the idRangeOffset word is:
  466. # [glyphIndexArray index], [subrange array index] = glyphIndex
  467. # [256], [0]=1 from charcode [129, 64]
  468. # [257], [1]=2 from charcode [129, 65]
  469. # [258], [2]=3 from charcode [129, 66]
  470. # [259], [3]=4 from charcode [129, 67]
  471. # So, the glyphIndex = 3 from the array. Then if idDelta is not zero and the glyph ID is not zero,
  472. # add it to the glyphID to get the final glyphIndex
  473. # value. In this case the final glyph index = 3+ 42 -> 45 for the final glyphIndex. Whew!
  474. self.data = b""
  475. cmap = {}
  476. notdefGI = 0
  477. for firstByte in range(256):
  478. subHeadindex = subHeaderKeys[firstByte]
  479. subHeader = subHeaderList[subHeadindex]
  480. if subHeadindex == 0:
  481. if (firstByte < subHeader.firstCode) or (
  482. firstByte >= subHeader.firstCode + subHeader.entryCount
  483. ):
  484. continue # gi is notdef.
  485. else:
  486. charCode = firstByte
  487. offsetIndex = firstByte - subHeader.firstCode
  488. gi = subHeader.glyphIndexArray[offsetIndex]
  489. if gi != 0:
  490. gi = (gi + subHeader.idDelta) % 0x10000
  491. else:
  492. continue # gi is notdef.
  493. cmap[charCode] = gi
  494. else:
  495. if subHeader.entryCount:
  496. charCodeOffset = firstByte * 256 + subHeader.firstCode
  497. for offsetIndex in range(subHeader.entryCount):
  498. charCode = charCodeOffset + offsetIndex
  499. gi = subHeader.glyphIndexArray[offsetIndex]
  500. if gi != 0:
  501. gi = (gi + subHeader.idDelta) % 0x10000
  502. else:
  503. continue
  504. cmap[charCode] = gi
  505. # If not subHeader.entryCount, then all char codes with this first byte are
  506. # mapped to .notdef. We can skip this subtable, and leave the glyphs un-encoded, which is the
  507. # same as mapping it to .notdef.
  508. gids = list(cmap.values())
  509. charCodes = list(cmap.keys())
  510. self.cmap = _make_map(self.ttFont, charCodes, gids)
  511. def compile(self, ttFont):
  512. if self.data:
  513. return (
  514. struct.pack(">HHH", self.format, self.length, self.language) + self.data
  515. )
  516. kEmptyTwoCharCodeRange = -1
  517. notdefGI = 0
  518. items = sorted(self.cmap.items())
  519. charCodes = [item[0] for item in items]
  520. names = [item[1] for item in items]
  521. nameMap = ttFont.getReverseGlyphMap()
  522. try:
  523. gids = [nameMap[name] for name in names]
  524. except KeyError:
  525. nameMap = ttFont.getReverseGlyphMap(rebuild=True)
  526. try:
  527. gids = [nameMap[name] for name in names]
  528. except KeyError:
  529. # allow virtual GIDs in format 2 tables
  530. gids = []
  531. for name in names:
  532. try:
  533. gid = nameMap[name]
  534. except KeyError:
  535. try:
  536. if name[:3] == "gid":
  537. gid = int(name[3:])
  538. else:
  539. gid = ttFont.getGlyphID(name)
  540. except:
  541. raise KeyError(name)
  542. gids.append(gid)
  543. # Process the (char code to gid) item list in char code order.
  544. # By definition, all one byte char codes map to subheader 0.
  545. # For all the two byte char codes, we assume that the first byte maps maps to the empty subhead (with an entry count of 0,
  546. # which defines all char codes in its range to map to notdef) unless proven otherwise.
  547. # Note that since the char code items are processed in char code order, all the char codes with the
  548. # same first byte are in sequential order.
  549. subHeaderKeys = [
  550. kEmptyTwoCharCodeRange for x in range(256)
  551. ] # list of indices into subHeaderList.
  552. subHeaderList = []
  553. # We force this subheader entry 0 to exist in the subHeaderList in the case where some one comes up
  554. # with a cmap where all the one byte char codes map to notdef,
  555. # with the result that the subhead 0 would not get created just by processing the item list.
  556. charCode = charCodes[0]
  557. if charCode > 255:
  558. subHeader = SubHeader()
  559. subHeader.firstCode = 0
  560. subHeader.entryCount = 0
  561. subHeader.idDelta = 0
  562. subHeader.idRangeOffset = 0
  563. subHeaderList.append(subHeader)
  564. lastFirstByte = -1
  565. items = zip(charCodes, gids)
  566. for charCode, gid in items:
  567. if gid == 0:
  568. continue
  569. firstbyte = charCode >> 8
  570. secondByte = charCode & 0x00FF
  571. if (
  572. firstbyte != lastFirstByte
  573. ): # Need to update the current subhead, and start a new one.
  574. if lastFirstByte > -1:
  575. # fix GI's and iDelta of current subheader.
  576. self.setIDDelta(subHeader)
  577. # If it was sunheader 0 for one-byte charCodes, then we need to set the subHeaderKeys value to zero
  578. # for the indices matching the char codes.
  579. if lastFirstByte == 0:
  580. for index in range(subHeader.entryCount):
  581. charCode = subHeader.firstCode + index
  582. subHeaderKeys[charCode] = 0
  583. assert subHeader.entryCount == len(
  584. subHeader.glyphIndexArray
  585. ), "Error - subhead entry count does not match len of glyphID subrange."
  586. # init new subheader
  587. subHeader = SubHeader()
  588. subHeader.firstCode = secondByte
  589. subHeader.entryCount = 1
  590. subHeader.glyphIndexArray.append(gid)
  591. subHeaderList.append(subHeader)
  592. subHeaderKeys[firstbyte] = len(subHeaderList) - 1
  593. lastFirstByte = firstbyte
  594. else:
  595. # need to fill in with notdefs all the code points between the last charCode and the current charCode.
  596. codeDiff = secondByte - (subHeader.firstCode + subHeader.entryCount)
  597. for i in range(codeDiff):
  598. subHeader.glyphIndexArray.append(notdefGI)
  599. subHeader.glyphIndexArray.append(gid)
  600. subHeader.entryCount = subHeader.entryCount + codeDiff + 1
  601. # fix GI's and iDelta of last subheader that we we added to the subheader array.
  602. self.setIDDelta(subHeader)
  603. # Now we add a final subheader for the subHeaderKeys which maps to empty two byte charcode ranges.
  604. subHeader = SubHeader()
  605. subHeader.firstCode = 0
  606. subHeader.entryCount = 0
  607. subHeader.idDelta = 0
  608. subHeader.idRangeOffset = 2
  609. subHeaderList.append(subHeader)
  610. emptySubheadIndex = len(subHeaderList) - 1
  611. for index in range(256):
  612. if subHeaderKeys[index] == kEmptyTwoCharCodeRange:
  613. subHeaderKeys[index] = emptySubheadIndex
  614. # Since this is the last subheader, the GlyphIndex Array starts two bytes after the start of the
  615. # idRangeOffset word of this subHeader. We can safely point to the first entry in the GlyphIndexArray,
  616. # since the first subrange of the GlyphIndexArray is for subHeader 0, which always starts with
  617. # charcode 0 and GID 0.
  618. idRangeOffset = (
  619. len(subHeaderList) - 1
  620. ) * 8 + 2 # offset to beginning of glyphIDArray from first subheader idRangeOffset.
  621. subheadRangeLen = (
  622. len(subHeaderList) - 1
  623. ) # skip last special empty-set subheader; we've already hardocodes its idRangeOffset to 2.
  624. for index in range(subheadRangeLen):
  625. subHeader = subHeaderList[index]
  626. subHeader.idRangeOffset = 0
  627. for j in range(index):
  628. prevSubhead = subHeaderList[j]
  629. if (
  630. prevSubhead.glyphIndexArray == subHeader.glyphIndexArray
  631. ): # use the glyphIndexArray subarray
  632. subHeader.idRangeOffset = (
  633. prevSubhead.idRangeOffset - (index - j) * 8
  634. )
  635. subHeader.glyphIndexArray = []
  636. break
  637. if subHeader.idRangeOffset == 0: # didn't find one.
  638. subHeader.idRangeOffset = idRangeOffset
  639. idRangeOffset = (
  640. idRangeOffset - 8
  641. ) + subHeader.entryCount * 2 # one less subheader, one more subArray.
  642. else:
  643. idRangeOffset = idRangeOffset - 8 # one less subheader
  644. # Now we can write out the data!
  645. length = (
  646. 6 + 512 + 8 * len(subHeaderList)
  647. ) # header, 256 subHeaderKeys, and subheader array.
  648. for subhead in subHeaderList[:-1]:
  649. length = (
  650. length + len(subhead.glyphIndexArray) * 2
  651. ) # We can't use subhead.entryCount, as some of the subhead may share subArrays.
  652. dataList = [struct.pack(">HHH", 2, length, self.language)]
  653. for index in subHeaderKeys:
  654. dataList.append(struct.pack(">H", index * 8))
  655. for subhead in subHeaderList:
  656. dataList.append(
  657. struct.pack(
  658. subHeaderFormat,
  659. subhead.firstCode,
  660. subhead.entryCount,
  661. subhead.idDelta,
  662. subhead.idRangeOffset,
  663. )
  664. )
  665. for subhead in subHeaderList[:-1]:
  666. for gi in subhead.glyphIndexArray:
  667. dataList.append(struct.pack(">H", gi))
  668. data = bytesjoin(dataList)
  669. assert len(data) == length, (
  670. "Error: cmap format 2 is not same length as calculated! actual: "
  671. + str(len(data))
  672. + " calc : "
  673. + str(length)
  674. )
  675. return data
  676. def fromXML(self, name, attrs, content, ttFont):
  677. self.language = safeEval(attrs["language"])
  678. if not hasattr(self, "cmap"):
  679. self.cmap = {}
  680. cmap = self.cmap
  681. for element in content:
  682. if not isinstance(element, tuple):
  683. continue
  684. name, attrs, content = element
  685. if name != "map":
  686. continue
  687. cmap[safeEval(attrs["code"])] = attrs["name"]
# struct layout of the fixed-size format 4 header, as packed by
# cmap_format_4.compile(): format, length, language, segCountX2, searchRange,
# entrySelector and rangeShift -- seven big-endian uint16 values.
cmap_format_4_format = ">7H"

# The header is followed by these arrays (segCount == segCountX2 // 2):
# uint16  endCode[segCount]          # Ending character code for each segment, last = 0xFFFF.
# uint16  reservedPad                # This value should be zero
# uint16  startCode[segCount]        # Starting character code for each segment
# uint16  idDelta[segCount]          # Delta for all character codes in segment
# uint16  idRangeOffset[segCount]    # Offset in bytes to glyph indexArray, or 0
# uint16  glyphIndexArray[variable]  # Glyph index array
  695. def splitRange(startCode, endCode, cmap):
  696. # Try to split a range of character codes into subranges with consecutive
  697. # glyph IDs in such a way that the cmap4 subtable can be stored "most"
  698. # efficiently. I can't prove I've got the optimal solution, but it seems
  699. # to do well with the fonts I tested: none became bigger, many became smaller.
  700. if startCode == endCode:
  701. return [], [endCode]
  702. lastID = cmap[startCode]
  703. lastCode = startCode
  704. inOrder = None
  705. orderedBegin = None
  706. subRanges = []
  707. # Gather subranges in which the glyph IDs are consecutive.
  708. for code in range(startCode + 1, endCode + 1):
  709. glyphID = cmap[code]
  710. if glyphID - 1 == lastID:
  711. if inOrder is None or not inOrder:
  712. inOrder = 1
  713. orderedBegin = lastCode
  714. else:
  715. if inOrder:
  716. inOrder = 0
  717. subRanges.append((orderedBegin, lastCode))
  718. orderedBegin = None
  719. lastID = glyphID
  720. lastCode = code
  721. if inOrder:
  722. subRanges.append((orderedBegin, lastCode))
  723. assert lastCode == endCode
  724. # Now filter out those new subranges that would only make the data bigger.
  725. # A new segment cost 8 bytes, not using a new segment costs 2 bytes per
  726. # character.
  727. newRanges = []
  728. for b, e in subRanges:
  729. if b == startCode and e == endCode:
  730. break # the whole range, we're fine
  731. if b == startCode or e == endCode:
  732. threshold = 4 # split costs one more segment
  733. else:
  734. threshold = 8 # split costs two more segments
  735. if (e - b + 1) > threshold:
  736. newRanges.append((b, e))
  737. subRanges = newRanges
  738. if not subRanges:
  739. return [], [endCode]
  740. if subRanges[0][0] != startCode:
  741. subRanges.insert(0, (startCode, subRanges[0][0] - 1))
  742. if subRanges[-1][1] != endCode:
  743. subRanges.append((subRanges[-1][1] + 1, endCode))
  744. # Fill the "holes" in the segments list -- those are the segments in which
  745. # the glyph IDs are _not_ consecutive.
  746. i = 1
  747. while i < len(subRanges):
  748. if subRanges[i - 1][1] + 1 != subRanges[i][0]:
  749. subRanges.insert(i, (subRanges[i - 1][1] + 1, subRanges[i][0] - 1))
  750. i = i + 1
  751. i = i + 1
  752. # Transform the ranges into startCode/endCode lists.
  753. start = []
  754. end = []
  755. for b, e in subRanges:
  756. start.append(b)
  757. end.append(e)
  758. start.pop(0)
  759. assert len(start) + 1 == len(end)
  760. return start, end
  761. class cmap_format_4(CmapSubtable):
  762. def decompile(self, data, ttFont):
  763. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  764. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  765. if data is not None and ttFont is not None:
  766. self.decompileHeader(data, ttFont)
  767. else:
  768. assert (
  769. data is None and ttFont is None
  770. ), "Need both data and ttFont arguments"
  771. data = (
  772. self.data
  773. ) # decompileHeader assigns the data after the header to self.data
  774. (segCountX2, searchRange, entrySelector, rangeShift) = struct.unpack(
  775. ">4H", data[:8]
  776. )
  777. data = data[8:]
  778. segCount = segCountX2 // 2
  779. allCodes = array.array("H")
  780. allCodes.frombytes(data)
  781. self.data = data = None
  782. if sys.byteorder != "big":
  783. allCodes.byteswap()
  784. # divide the data
  785. endCode = allCodes[:segCount]
  786. allCodes = allCodes[segCount + 1 :] # the +1 is skipping the reservedPad field
  787. startCode = allCodes[:segCount]
  788. allCodes = allCodes[segCount:]
  789. idDelta = allCodes[:segCount]
  790. allCodes = allCodes[segCount:]
  791. idRangeOffset = allCodes[:segCount]
  792. glyphIndexArray = allCodes[segCount:]
  793. lenGIArray = len(glyphIndexArray)
  794. # build 2-byte character mapping
  795. charCodes = []
  796. gids = []
  797. for i in range(len(startCode) - 1): # don't do 0xffff!
  798. start = startCode[i]
  799. delta = idDelta[i]
  800. rangeOffset = idRangeOffset[i]
  801. partial = rangeOffset // 2 - start + i - len(idRangeOffset)
  802. rangeCharCodes = list(range(startCode[i], endCode[i] + 1))
  803. charCodes.extend(rangeCharCodes)
  804. if rangeOffset == 0:
  805. gids.extend(
  806. [(charCode + delta) & 0xFFFF for charCode in rangeCharCodes]
  807. )
  808. else:
  809. for charCode in rangeCharCodes:
  810. index = charCode + partial
  811. assert index < lenGIArray, (
  812. "In format 4 cmap, range (%d), the calculated index (%d) into the glyph index array is not less than the length of the array (%d) !"
  813. % (i, index, lenGIArray)
  814. )
  815. if glyphIndexArray[index] != 0: # if not missing glyph
  816. glyphID = glyphIndexArray[index] + delta
  817. else:
  818. glyphID = 0 # missing glyph
  819. gids.append(glyphID & 0xFFFF)
  820. self.cmap = _make_map(self.ttFont, charCodes, gids)
  821. def compile(self, ttFont):
  822. if self.data:
  823. return (
  824. struct.pack(">HHH", self.format, self.length, self.language) + self.data
  825. )
  826. charCodes = list(self.cmap.keys())
  827. if not charCodes:
  828. startCode = [0xFFFF]
  829. endCode = [0xFFFF]
  830. else:
  831. charCodes.sort()
  832. names = [self.cmap[code] for code in charCodes]
  833. nameMap = ttFont.getReverseGlyphMap()
  834. try:
  835. gids = [nameMap[name] for name in names]
  836. except KeyError:
  837. nameMap = ttFont.getReverseGlyphMap(rebuild=True)
  838. try:
  839. gids = [nameMap[name] for name in names]
  840. except KeyError:
  841. # allow virtual GIDs in format 4 tables
  842. gids = []
  843. for name in names:
  844. try:
  845. gid = nameMap[name]
  846. except KeyError:
  847. try:
  848. if name[:3] == "gid":
  849. gid = int(name[3:])
  850. else:
  851. gid = ttFont.getGlyphID(name)
  852. except:
  853. raise KeyError(name)
  854. gids.append(gid)
  855. cmap = {} # code:glyphID mapping
  856. for code, gid in zip(charCodes, gids):
  857. cmap[code] = gid
  858. # Build startCode and endCode lists.
  859. # Split the char codes in ranges of consecutive char codes, then split
  860. # each range in more ranges of consecutive/not consecutive glyph IDs.
  861. # See splitRange().
  862. lastCode = charCodes[0]
  863. endCode = []
  864. startCode = [lastCode]
  865. for charCode in charCodes[
  866. 1:
  867. ]: # skip the first code, it's the first start code
  868. if charCode == lastCode + 1:
  869. lastCode = charCode
  870. continue
  871. start, end = splitRange(startCode[-1], lastCode, cmap)
  872. startCode.extend(start)
  873. endCode.extend(end)
  874. startCode.append(charCode)
  875. lastCode = charCode
  876. start, end = splitRange(startCode[-1], lastCode, cmap)
  877. startCode.extend(start)
  878. endCode.extend(end)
  879. startCode.append(0xFFFF)
  880. endCode.append(0xFFFF)
  881. # build up rest of cruft
  882. idDelta = []
  883. idRangeOffset = []
  884. glyphIndexArray = []
  885. for i in range(len(endCode) - 1): # skip the closing codes (0xffff)
  886. indices = []
  887. for charCode in range(startCode[i], endCode[i] + 1):
  888. indices.append(cmap[charCode])
  889. if indices == list(range(indices[0], indices[0] + len(indices))):
  890. idDelta.append((indices[0] - startCode[i]) % 0x10000)
  891. idRangeOffset.append(0)
  892. else:
  893. idDelta.append(0)
  894. idRangeOffset.append(2 * (len(endCode) + len(glyphIndexArray) - i))
  895. glyphIndexArray.extend(indices)
  896. idDelta.append(1) # 0xffff + 1 == (tadaa!) 0. So this end code maps to .notdef
  897. idRangeOffset.append(0)
  898. # Insane.
  899. segCount = len(endCode)
  900. segCountX2 = segCount * 2
  901. searchRange, entrySelector, rangeShift = getSearchRange(segCount, 2)
  902. charCodeArray = array.array("H", endCode + [0] + startCode)
  903. idDeltaArray = array.array("H", idDelta)
  904. restArray = array.array("H", idRangeOffset + glyphIndexArray)
  905. if sys.byteorder != "big":
  906. charCodeArray.byteswap()
  907. if sys.byteorder != "big":
  908. idDeltaArray.byteswap()
  909. if sys.byteorder != "big":
  910. restArray.byteswap()
  911. data = charCodeArray.tobytes() + idDeltaArray.tobytes() + restArray.tobytes()
  912. length = struct.calcsize(cmap_format_4_format) + len(data)
  913. header = struct.pack(
  914. cmap_format_4_format,
  915. self.format,
  916. length,
  917. self.language,
  918. segCountX2,
  919. searchRange,
  920. entrySelector,
  921. rangeShift,
  922. )
  923. return header + data
  924. def fromXML(self, name, attrs, content, ttFont):
  925. self.language = safeEval(attrs["language"])
  926. if not hasattr(self, "cmap"):
  927. self.cmap = {}
  928. cmap = self.cmap
  929. for element in content:
  930. if not isinstance(element, tuple):
  931. continue
  932. nameMap, attrsMap, dummyContent = element
  933. if nameMap != "map":
  934. assert 0, "Unrecognized keyword in cmap subtable"
  935. cmap[safeEval(attrsMap["code"])] = attrsMap["name"]
  936. class cmap_format_6(CmapSubtable):
  937. def decompile(self, data, ttFont):
  938. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  939. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  940. if data is not None and ttFont is not None:
  941. self.decompileHeader(data, ttFont)
  942. else:
  943. assert (
  944. data is None and ttFont is None
  945. ), "Need both data and ttFont arguments"
  946. data = (
  947. self.data
  948. ) # decompileHeader assigns the data after the header to self.data
  949. firstCode, entryCount = struct.unpack(">HH", data[:4])
  950. firstCode = int(firstCode)
  951. data = data[4:]
  952. # assert len(data) == 2 * entryCount # XXX not true in Apple's Helvetica!!!
  953. gids = array.array("H")
  954. gids.frombytes(data[: 2 * int(entryCount)])
  955. if sys.byteorder != "big":
  956. gids.byteswap()
  957. self.data = data = None
  958. charCodes = list(range(firstCode, firstCode + len(gids)))
  959. self.cmap = _make_map(self.ttFont, charCodes, gids)
  960. def compile(self, ttFont):
  961. if self.data:
  962. return (
  963. struct.pack(">HHH", self.format, self.length, self.language) + self.data
  964. )
  965. cmap = self.cmap
  966. codes = sorted(cmap.keys())
  967. if codes: # yes, there are empty cmap tables.
  968. codes = list(range(codes[0], codes[-1] + 1))
  969. firstCode = codes[0]
  970. valueList = [
  971. ttFont.getGlyphID(cmap[code]) if code in cmap else 0 for code in codes
  972. ]
  973. gids = array.array("H", valueList)
  974. if sys.byteorder != "big":
  975. gids.byteswap()
  976. data = gids.tobytes()
  977. else:
  978. data = b""
  979. firstCode = 0
  980. header = struct.pack(
  981. ">HHHHH", 6, len(data) + 10, self.language, firstCode, len(codes)
  982. )
  983. return header + data
  984. def fromXML(self, name, attrs, content, ttFont):
  985. self.language = safeEval(attrs["language"])
  986. if not hasattr(self, "cmap"):
  987. self.cmap = {}
  988. cmap = self.cmap
  989. for element in content:
  990. if not isinstance(element, tuple):
  991. continue
  992. name, attrs, content = element
  993. if name != "map":
  994. continue
  995. cmap[safeEval(attrs["code"])] = attrs["name"]
  996. class cmap_format_12_or_13(CmapSubtable):
  997. def __init__(self, format):
  998. self.format = format
  999. self.reserved = 0
  1000. self.data = None
  1001. self.ttFont = None
  1002. def decompileHeader(self, data, ttFont):
  1003. format, reserved, length, language, nGroups = struct.unpack(">HHLLL", data[:16])
  1004. assert (
  1005. len(data) == (16 + nGroups * 12) == (length)
  1006. ), "corrupt cmap table format %d (data length: %d, header length: %d)" % (
  1007. self.format,
  1008. len(data),
  1009. length,
  1010. )
  1011. self.format = format
  1012. self.reserved = reserved
  1013. self.length = length
  1014. self.language = language
  1015. self.nGroups = nGroups
  1016. self.data = data[16:]
  1017. self.ttFont = ttFont
  1018. def decompile(self, data, ttFont):
  1019. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  1020. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  1021. if data is not None and ttFont is not None:
  1022. self.decompileHeader(data, ttFont)
  1023. else:
  1024. assert (
  1025. data is None and ttFont is None
  1026. ), "Need both data and ttFont arguments"
  1027. data = (
  1028. self.data
  1029. ) # decompileHeader assigns the data after the header to self.data
  1030. charCodes = []
  1031. gids = []
  1032. pos = 0
  1033. for i in range(self.nGroups):
  1034. startCharCode, endCharCode, glyphID = struct.unpack(
  1035. ">LLL", data[pos : pos + 12]
  1036. )
  1037. pos += 12
  1038. lenGroup = 1 + endCharCode - startCharCode
  1039. charCodes.extend(list(range(startCharCode, endCharCode + 1)))
  1040. gids.extend(self._computeGIDs(glyphID, lenGroup))
  1041. self.data = data = None
  1042. self.cmap = _make_map(self.ttFont, charCodes, gids)
  1043. def compile(self, ttFont):
  1044. if self.data:
  1045. return (
  1046. struct.pack(
  1047. ">HHLLL",
  1048. self.format,
  1049. self.reserved,
  1050. self.length,
  1051. self.language,
  1052. self.nGroups,
  1053. )
  1054. + self.data
  1055. )
  1056. charCodes = list(self.cmap.keys())
  1057. names = list(self.cmap.values())
  1058. nameMap = ttFont.getReverseGlyphMap()
  1059. try:
  1060. gids = [nameMap[name] for name in names]
  1061. except KeyError:
  1062. nameMap = ttFont.getReverseGlyphMap(rebuild=True)
  1063. try:
  1064. gids = [nameMap[name] for name in names]
  1065. except KeyError:
  1066. # allow virtual GIDs in format 12 tables
  1067. gids = []
  1068. for name in names:
  1069. try:
  1070. gid = nameMap[name]
  1071. except KeyError:
  1072. try:
  1073. if name[:3] == "gid":
  1074. gid = int(name[3:])
  1075. else:
  1076. gid = ttFont.getGlyphID(name)
  1077. except:
  1078. raise KeyError(name)
  1079. gids.append(gid)
  1080. cmap = {} # code:glyphID mapping
  1081. for code, gid in zip(charCodes, gids):
  1082. cmap[code] = gid
  1083. charCodes.sort()
  1084. index = 0
  1085. startCharCode = charCodes[0]
  1086. startGlyphID = cmap[startCharCode]
  1087. lastGlyphID = startGlyphID - self._format_step
  1088. lastCharCode = startCharCode - 1
  1089. nGroups = 0
  1090. dataList = []
  1091. maxIndex = len(charCodes)
  1092. for index in range(maxIndex):
  1093. charCode = charCodes[index]
  1094. glyphID = cmap[charCode]
  1095. if not self._IsInSameRun(glyphID, lastGlyphID, charCode, lastCharCode):
  1096. dataList.append(
  1097. struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID)
  1098. )
  1099. startCharCode = charCode
  1100. startGlyphID = glyphID
  1101. nGroups = nGroups + 1
  1102. lastGlyphID = glyphID
  1103. lastCharCode = charCode
  1104. dataList.append(struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID))
  1105. nGroups = nGroups + 1
  1106. data = bytesjoin(dataList)
  1107. lengthSubtable = len(data) + 16
  1108. assert len(data) == (nGroups * 12) == (lengthSubtable - 16)
  1109. return (
  1110. struct.pack(
  1111. ">HHLLL",
  1112. self.format,
  1113. self.reserved,
  1114. lengthSubtable,
  1115. self.language,
  1116. nGroups,
  1117. )
  1118. + data
  1119. )
  1120. def toXML(self, writer, ttFont):
  1121. writer.begintag(
  1122. self.__class__.__name__,
  1123. [
  1124. ("platformID", self.platformID),
  1125. ("platEncID", self.platEncID),
  1126. ("format", self.format),
  1127. ("reserved", self.reserved),
  1128. ("length", self.length),
  1129. ("language", self.language),
  1130. ("nGroups", self.nGroups),
  1131. ],
  1132. )
  1133. writer.newline()
  1134. codes = sorted(self.cmap.items())
  1135. self._writeCodes(codes, writer)
  1136. writer.endtag(self.__class__.__name__)
  1137. writer.newline()
  1138. def fromXML(self, name, attrs, content, ttFont):
  1139. self.format = safeEval(attrs["format"])
  1140. self.reserved = safeEval(attrs["reserved"])
  1141. self.length = safeEval(attrs["length"])
  1142. self.language = safeEval(attrs["language"])
  1143. self.nGroups = safeEval(attrs["nGroups"])
  1144. if not hasattr(self, "cmap"):
  1145. self.cmap = {}
  1146. cmap = self.cmap
  1147. for element in content:
  1148. if not isinstance(element, tuple):
  1149. continue
  1150. name, attrs, content = element
  1151. if name != "map":
  1152. continue
  1153. cmap[safeEval(attrs["code"])] = attrs["name"]
  1154. class cmap_format_12(cmap_format_12_or_13):
  1155. _format_step = 1
  1156. def __init__(self, format=12):
  1157. cmap_format_12_or_13.__init__(self, format)
  1158. def _computeGIDs(self, startingGlyph, numberOfGlyphs):
  1159. return list(range(startingGlyph, startingGlyph + numberOfGlyphs))
  1160. def _IsInSameRun(self, glyphID, lastGlyphID, charCode, lastCharCode):
  1161. return (glyphID == 1 + lastGlyphID) and (charCode == 1 + lastCharCode)
  1162. class cmap_format_13(cmap_format_12_or_13):
  1163. _format_step = 0
  1164. def __init__(self, format=13):
  1165. cmap_format_12_or_13.__init__(self, format)
  1166. def _computeGIDs(self, startingGlyph, numberOfGlyphs):
  1167. return [startingGlyph] * numberOfGlyphs
  1168. def _IsInSameRun(self, glyphID, lastGlyphID, charCode, lastCharCode):
  1169. return (glyphID == lastGlyphID) and (charCode == 1 + lastCharCode)
  1170. def cvtToUVS(threeByteString):
  1171. data = b"\0" + threeByteString
  1172. (val,) = struct.unpack(">L", data)
  1173. return val
  1174. def cvtFromUVS(val):
  1175. assert 0 <= val < 0x1000000
  1176. fourByteString = struct.pack(">L", val)
  1177. return fourByteString[1:]
  1178. class cmap_format_14(CmapSubtable):
  1179. def decompileHeader(self, data, ttFont):
  1180. format, length, numVarSelectorRecords = struct.unpack(">HLL", data[:10])
  1181. self.data = data[10:]
  1182. self.length = length
  1183. self.numVarSelectorRecords = numVarSelectorRecords
  1184. self.ttFont = ttFont
  1185. self.language = 0xFF # has no language.
  1186. def decompile(self, data, ttFont):
  1187. if data is not None and ttFont is not None:
  1188. self.decompileHeader(data, ttFont)
  1189. else:
  1190. assert (
  1191. data is None and ttFont is None
  1192. ), "Need both data and ttFont arguments"
  1193. data = self.data
  1194. self.cmap = (
  1195. {}
  1196. ) # so that clients that expect this to exist in a cmap table won't fail.
  1197. uvsDict = {}
  1198. recOffset = 0
  1199. for n in range(self.numVarSelectorRecords):
  1200. uvs, defOVSOffset, nonDefUVSOffset = struct.unpack(
  1201. ">3sLL", data[recOffset : recOffset + 11]
  1202. )
  1203. recOffset += 11
  1204. varUVS = cvtToUVS(uvs)
  1205. if defOVSOffset:
  1206. startOffset = defOVSOffset - 10
  1207. (numValues,) = struct.unpack(">L", data[startOffset : startOffset + 4])
  1208. startOffset += 4
  1209. for r in range(numValues):
  1210. uv, addtlCnt = struct.unpack(
  1211. ">3sB", data[startOffset : startOffset + 4]
  1212. )
  1213. startOffset += 4
  1214. firstBaseUV = cvtToUVS(uv)
  1215. cnt = addtlCnt + 1
  1216. baseUVList = list(range(firstBaseUV, firstBaseUV + cnt))
  1217. glyphList = [None] * cnt
  1218. localUVList = zip(baseUVList, glyphList)
  1219. try:
  1220. uvsDict[varUVS].extend(localUVList)
  1221. except KeyError:
  1222. uvsDict[varUVS] = list(localUVList)
  1223. if nonDefUVSOffset:
  1224. startOffset = nonDefUVSOffset - 10
  1225. (numRecs,) = struct.unpack(">L", data[startOffset : startOffset + 4])
  1226. startOffset += 4
  1227. localUVList = []
  1228. for r in range(numRecs):
  1229. uv, gid = struct.unpack(">3sH", data[startOffset : startOffset + 5])
  1230. startOffset += 5
  1231. uv = cvtToUVS(uv)
  1232. glyphName = self.ttFont.getGlyphName(gid)
  1233. localUVList.append((uv, glyphName))
  1234. try:
  1235. uvsDict[varUVS].extend(localUVList)
  1236. except KeyError:
  1237. uvsDict[varUVS] = localUVList
  1238. self.uvsDict = uvsDict
  1239. def toXML(self, writer, ttFont):
  1240. writer.begintag(
  1241. self.__class__.__name__,
  1242. [
  1243. ("platformID", self.platformID),
  1244. ("platEncID", self.platEncID),
  1245. ],
  1246. )
  1247. writer.newline()
  1248. uvsDict = self.uvsDict
  1249. uvsList = sorted(uvsDict.keys())
  1250. for uvs in uvsList:
  1251. uvList = uvsDict[uvs]
  1252. uvList.sort(key=lambda item: (item[1] is not None, item[0], item[1]))
  1253. for uv, gname in uvList:
  1254. attrs = [("uv", hex(uv)), ("uvs", hex(uvs))]
  1255. if gname is not None:
  1256. attrs.append(("name", gname))
  1257. writer.simpletag("map", attrs)
  1258. writer.newline()
  1259. writer.endtag(self.__class__.__name__)
  1260. writer.newline()
  1261. def fromXML(self, name, attrs, content, ttFont):
  1262. self.language = 0xFF # provide a value so that CmapSubtable.__lt__() won't fail
  1263. if not hasattr(self, "cmap"):
  1264. self.cmap = (
  1265. {}
  1266. ) # so that clients that expect this to exist in a cmap table won't fail.
  1267. if not hasattr(self, "uvsDict"):
  1268. self.uvsDict = {}
  1269. uvsDict = self.uvsDict
  1270. # For backwards compatibility reasons we accept "None" as an indicator
  1271. # for "default mapping", unless the font actually has a glyph named
  1272. # "None".
  1273. _hasGlyphNamedNone = None
  1274. for element in content:
  1275. if not isinstance(element, tuple):
  1276. continue
  1277. name, attrs, content = element
  1278. if name != "map":
  1279. continue
  1280. uvs = safeEval(attrs["uvs"])
  1281. uv = safeEval(attrs["uv"])
  1282. gname = attrs.get("name")
  1283. if gname == "None":
  1284. if _hasGlyphNamedNone is None:
  1285. _hasGlyphNamedNone = "None" in ttFont.getGlyphOrder()
  1286. if not _hasGlyphNamedNone:
  1287. gname = None
  1288. try:
  1289. uvsDict[uvs].append((uv, gname))
  1290. except KeyError:
  1291. uvsDict[uvs] = [(uv, gname)]
  1292. def compile(self, ttFont):
  1293. if self.data:
  1294. return (
  1295. struct.pack(
  1296. ">HLL", self.format, self.length, self.numVarSelectorRecords
  1297. )
  1298. + self.data
  1299. )
  1300. uvsDict = self.uvsDict
  1301. uvsList = sorted(uvsDict.keys())
  1302. self.numVarSelectorRecords = len(uvsList)
  1303. offset = (
  1304. 10 + self.numVarSelectorRecords * 11
  1305. ) # current value is end of VarSelectorRecords block.
  1306. data = []
  1307. varSelectorRecords = []
  1308. for uvs in uvsList:
  1309. entryList = uvsDict[uvs]
  1310. defList = [entry for entry in entryList if entry[1] is None]
  1311. if defList:
  1312. defList = [entry[0] for entry in defList]
  1313. defOVSOffset = offset
  1314. defList.sort()
  1315. lastUV = defList[0]
  1316. cnt = -1
  1317. defRecs = []
  1318. for defEntry in defList:
  1319. cnt += 1
  1320. if (lastUV + cnt) != defEntry:
  1321. rec = struct.pack(">3sB", cvtFromUVS(lastUV), cnt - 1)
  1322. lastUV = defEntry
  1323. defRecs.append(rec)
  1324. cnt = 0
  1325. rec = struct.pack(">3sB", cvtFromUVS(lastUV), cnt)
  1326. defRecs.append(rec)
  1327. numDefRecs = len(defRecs)
  1328. data.append(struct.pack(">L", numDefRecs))
  1329. data.extend(defRecs)
  1330. offset += 4 + numDefRecs * 4
  1331. else:
  1332. defOVSOffset = 0
  1333. ndefList = [entry for entry in entryList if entry[1] is not None]
  1334. if ndefList:
  1335. nonDefUVSOffset = offset
  1336. ndefList.sort()
  1337. numNonDefRecs = len(ndefList)
  1338. data.append(struct.pack(">L", numNonDefRecs))
  1339. offset += 4 + numNonDefRecs * 5
  1340. for uv, gname in ndefList:
  1341. gid = ttFont.getGlyphID(gname)
  1342. ndrec = struct.pack(">3sH", cvtFromUVS(uv), gid)
  1343. data.append(ndrec)
  1344. else:
  1345. nonDefUVSOffset = 0
  1346. vrec = struct.pack(">3sLL", cvtFromUVS(uvs), defOVSOffset, nonDefUVSOffset)
  1347. varSelectorRecords.append(vrec)
  1348. data = bytesjoin(varSelectorRecords) + bytesjoin(data)
  1349. self.length = 10 + len(data)
  1350. headerdata = struct.pack(
  1351. ">HLL", self.format, self.length, self.numVarSelectorRecords
  1352. )
  1353. return headerdata + data
  1354. class cmap_format_unknown(CmapSubtable):
  1355. def toXML(self, writer, ttFont):
  1356. cmapName = self.__class__.__name__[:12] + str(self.format)
  1357. writer.begintag(
  1358. cmapName,
  1359. [
  1360. ("platformID", self.platformID),
  1361. ("platEncID", self.platEncID),
  1362. ],
  1363. )
  1364. writer.newline()
  1365. writer.dumphex(self.data)
  1366. writer.endtag(cmapName)
  1367. writer.newline()
  1368. def fromXML(self, name, attrs, content, ttFont):
  1369. self.data = readHex(content)
  1370. self.cmap = {}
  1371. def decompileHeader(self, data, ttFont):
  1372. self.language = 0 # dummy value
  1373. self.data = data
  1374. def decompile(self, data, ttFont):
  1375. # we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
  1376. # If not, someone is calling the subtable decompile() directly, and must provide both args.
  1377. if data is not None and ttFont is not None:
  1378. self.decompileHeader(data, ttFont)
  1379. else:
  1380. assert (
  1381. data is None and ttFont is None
  1382. ), "Need both data and ttFont arguments"
  1383. def compile(self, ttFont):
  1384. if self.data:
  1385. return self.data
  1386. else:
  1387. return None
# Registry mapping a cmap subtable format number to the class implementing it.
# cmap_format_unknown is not listed here.
cmap_classes = {
    0: cmap_format_0,
    2: cmap_format_2,
    4: cmap_format_4,
    6: cmap_format_6,
    12: cmap_format_12,
    13: cmap_format_13,
    14: cmap_format_14,
}