//Apache2, 2017-present, WinterDev
//Apache2, 2014-2016, Samuel Carlsson, WinterDev

using System;
using System.Collections.Generic;
using System.IO;

namespace Typography.OpenFont.Tables
{
    //---------------------------------------------------
    //cmap - Character To Glyph Index Mapping Table
    //---------------------------------------------------
    //This table defines the mapping of character codes to the glyph index values used in the font.
    //It may contain more than one subtable, in order to support more than one character encoding scheme.
    //Character codes that do not correspond to any glyph in the font should be mapped to glyph index 0.
    //The glyph at this location must be a special glyph representing a missing character, commonly known as .notdef.

    //The table header indicates the character encodings for which subtables are present.
    //Each subtable is in one of seven possible formats and begins with a format code indicating the format used.

    //The platform ID and platform-specific encoding ID in the header entry (and, in the case of the Macintosh platform,
    //the language field in the subtable itself) are used to specify a particular 'cmap' encoding.
    //The header entries must be sorted first by platform ID, then by platform-specific encoding ID,
    //and then by the language field in the corresponding subtable. Each platform ID,
    //platform-specific encoding ID, and subtable language combination may appear only once in the 'cmap' table.

    //When building a Unicode font for Windows, the platform ID should be 3 and the encoding ID should be 1.
    //When building a symbol font for Windows, the platform ID should be 3 and the encoding ID should be 0.
    //When building a font that will be used on the Macintosh, the platform ID should be 1 and the encoding ID should be 0.

    //All Microsoft Unicode BMP encodings (Platform ID = 3, Encoding ID = 1) must provide at least a Format 4 'cmap' subtable.
    //If the font is meant to support supplementary (non-BMP) Unicode characters,
    //it will additionally need a Format 12 subtable with a platform encoding ID 10.
    //The contents of the Format 12 subtable need to be a superset of the contents of the Format 4 subtable.
    //Microsoft strongly recommends using a BMP Unicode 'cmap' for all fonts. However, some other encodings that appear in current fonts follow:

    //Windows Encodings
    //Platform ID   Encoding ID   Description
    //3             0             Symbol
    //3             1             Unicode BMP (UCS-2)
    //3             2             ShiftJIS
    //3             3             PRC
    //3             4             Big5
    //3             5             Wansung
    //3             6             Johab
    //3             7             Reserved
    //3             8             Reserved
    //3             9             Reserved
    //3             10            Unicode UCS-4
    //---------------------------------------------------

    ////////////////////////////////////////////////////////////////////////
    //from https://docs.microsoft.com/en-us/typography/opentype/processing-part2
    //CMAP Table
    //Every glyph in a TrueType font is identified by a unique Glyph ID (GID),
    //a simple sequential numbering of all the glyphs in the font.
    //These GIDs are mapped to character codepoints in the font's CMAP table.
    //In OpenType fonts, the principal mapping is to Unicode codepoints; that is,
    //the GIDs of nominal glyph representations of specific characters are mapped to appropriate Unicode values.

    //The key to OpenType glyph processing is that not every glyph in a font is directly mapped to a codepoint.
    //Variant glyph forms, ligatures, dynamically composed diacritics and other rendering forms do not require entries in the CMAP table.
    //Rather, their GIDs are mapped in layout features to the GIDs of nominal character forms,
    //i.e. to those glyphs that do have CMAP entries. This is the heart of glyph processing: the mapping of GIDs to each other,
    //rather than directly to character codepoints.

    //In order for fonts to be able to correctly render text,
    //font developers must ensure that the correct nominal glyph form GIDs are mapped to the correct Unicode codepoints.
    //Application developers, of course, must ensure that their applications correctly manage input and storage of Unicode text codepoints,
    //or map correctly to these codepoints from other codepages and character sets.
    ////////////////////////////////////////////////////////////////////////

    /// <summary>
    /// Reader for the 'cmap' table: holds every character-map subtable found in the font
    /// and resolves codepoints to glyph indices through them.
    /// </summary>
    public class Cmap : TableEntry
    {
        //https://docs.microsoft.com/en-us/typography/opentype/spec/cmap

        public const string _N = "cmap";
        public override string Name => _N;

        // all subtables, in the order they are listed in the table header
        CharacterMap[] _charMaps = null;
        // format 14 (Unicode variation sequence) subtables only; stays null when the font has none
        List<CharMapFormat14> _charMap14List;
        // lookup cache: codepoint -> glyph index resolved by GetGlyphIndex
        Dictionary<int, ushort> _codepointToGlyphs = new Dictionary<int, ushort>();

        /// <summary>
        /// find glyph index from given codepoint(s)
        /// </summary>
        /// <param name="codepoint">codepoint to look up</param>
        /// <param name="nextCodepoint">
        /// codepoint that follows <paramref name="codepoint"/>; when it is greater than 0 and the font has
        /// format 14 subtables, the pair is also tried as a variation sequence
        /// </param>
        /// <param name="skipNextCodepoint">
        /// set to true when a format 14 subtable returned a glyph for the pair, otherwise false
        /// </param>
        /// <returns>glyph index, 0 when no subtable maps the codepoint</returns>
        public ushort GetGlyphIndex(int codepoint, int nextCodepoint, out bool skipNextCodepoint)
        {
            // https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
            // "character codes that do not correspond to any glyph in the font should be mapped to glyph index 0."

            skipNextCodepoint = false; //default

            if (!_codepointToGlyphs.TryGetValue(codepoint, out ushort found))
            {
                // on a cache miss TryGetValue leaves 'found' at 0, which is also the "missing glyph" index
                for (int i = 0; i < _charMaps.Length; ++i)
                {
                    CharacterMap cmap = _charMaps[i];
                    if (found == 0)
                    {
                        found = cmap.GetGlyphIndex(codepoint);
                    }
                    else if (cmap.PlatformId == 3 && cmap.EncodingId == 1)
                    {
                        //...When building a Unicode font for Windows,
                        // the platform ID should be 3 and the encoding ID should be 1
                        // => a non-zero hit from this subtable replaces an earlier hit
                        ushort glyphIndex = cmap.GetGlyphIndex(codepoint); //glyphIndex=> gid
                        if (glyphIndex != 0)
                        {
                            found = glyphIndex;
                        }
                    }
                }
                _codepointToGlyphs[codepoint] = found;
            }

            // If there is a second codepoint, we are asked whether this is an UVS sequence
            //  -> if true, return a glyph ID
            //  -> otherwise, return 0
            if (nextCodepoint > 0 && _charMap14List != null)
            {
                foreach (CharMapFormat14 cmap14 in _charMap14List)
                {
                    ushort glyphIndex = cmap14.CharacterPairToGlyphIndex(codepoint, found, nextCodepoint);
                    if (glyphIndex > 0)
                    {
                        skipNextCodepoint = true;
                        return glyphIndex;
                    }
                }
            }
            return found;
        }

        /// <summary>
        /// Reads the table header (version, subtable count, encoding records) and then every subtable.
        /// </summary>
        /// <param name="input">reader positioned at the first byte of the 'cmap' table</param>
        protected override void ReadContentFrom(BinaryReader input)
        {
            //https://www.microsoft.com/typography/otspec/cmap.htm
            long beginAt = input.BaseStream.Position;
            //
            ushort version = input.ReadUInt16(); // 0
            ushort tableCount = input.ReadUInt16();

            ushort[] platformIds = new ushort[tableCount];
            ushort[] encodingIds = new ushort[tableCount];
            uint[] offsets = new uint[tableCount];
            for (int i = 0; i < tableCount; i++)
            {
                platformIds[i] = input.ReadUInt16();
                encodingIds[i] = input.ReadUInt16();
                offsets[i] = input.ReadUInt32();
            }

            _charMaps = new CharacterMap[tableCount];
            for (int i = 0; i < tableCount; i++)
            {
                // subtable offsets are relative to the start of the 'cmap' table
                input.BaseStream.Seek(beginAt + offsets[i], SeekOrigin.Begin);
                CharacterMap cmap = ReadCharacterMap(input);
                cmap.PlatformId = platformIds[i];
                cmap.EncodingId = encodingIds[i];
                _charMaps[i] = cmap;
                //
                if (cmap is CharMapFormat14 cmap14)
                {
                    if (_charMap14List == null)
                    {
                        _charMap14List = new List<CharMapFormat14>();
                    }
                    //
                    _charMap14List.Add(cmap14);
                }
            }
        }

        /// <summary>
        /// Reads a format 0 (byte encoding) subtable and exposes it as an equivalent format 4 map.
        /// </summary>
        /// <param name="input">reader positioned just after the subtable's 'format' field</param>
        static CharacterMap ReadFormat_0(BinaryReader input)
        {
            ushort length = input.ReadUInt16();
            //Format 0: Byte encoding table
            //This is the Apple standard character to glyph index mapping table.
            //Type    Name              Description
            //uint16  format            Format number is set to 0.
            //uint16  length            This is the length in bytes of the subtable.
            //uint16  language          Please see “Note on the language field in 'cmap' subtables“ in this document.
            //uint8   glyphIdArray[256] An array that maps character codes to glyph index values.
            //-----------
            //This is a simple 1 to 1 mapping of character codes to glyph indices.
            //The glyph set is limited to 256. Note that if this format is used to index into a larger glyph set,
            //only the first 256 glyphs will be accessible.

            ushort language = input.ReadUInt16();
            byte[] only256Glyphs = input.ReadBytes(256);
            ushort[] only256UInt16Glyphs = new ushort[256];
            for (int i = 255; i >= 0; --i)
            {
                //expand
                only256UInt16Glyphs[i] = only256Glyphs[i];
            }

            //convert to format4 cmap table:
            //one segment for codes 0..255 that goes through the glyph id array,
            //plus the 0xFFFF segment that closes the segment list
            ushort[] startArray = new ushort[] { 0, 0xFFFF };
            ushort[] endArray = new ushort[] { 255, 0xFFFF };
            ushort[] deltaArray = new ushort[] { 0, 1 };
            ushort[] offsetArray = new ushort[] { 4, 0 };
            return new CharMapFormat4(startArray, endArray, deltaArray, offsetArray, only256UInt16Glyphs);
        }

        /// <summary>
        /// Format 2 (high-byte mapping through table) is not implemented: a warning is reported
        /// and an empty map is returned. Nothing is read from <paramref name="input"/>.
        /// </summary>
        static CharacterMap ReadFormat_2(BinaryReader input)
        {
            //Format 2: High-byte mapping through table

            //This subtable is useful for the national character code standards used for Japanese, Chinese, and Korean characters.
            //These code standards use a mixed 8/16-bit encoding,
            //in which certain byte values signal the first byte of a 2-byte character (but these values are also legal as the second byte of a 2-byte character).
            //
            //In addition, even for the 2-byte characters, the mapping of character codes to glyph index values depends heavily on the first byte.
            //Consequently, the table begins with an array that maps the first byte to a SubHeader record.
            //For 2-byte character codes, the SubHeader is used to map the second byte's value through a subArray, as described below.
            //When processing mixed 8/16-bit text, SubHeader 0 is special: it is used for single-byte character codes.
            //When SubHeader 0 is used, a second byte is not needed; the single byte value is mapped through the subArray.
            //-------------
            //  'cmap' Subtable Format 2:
            //-------------
            //  Type        Name                Description
            //  uint16      format              Format number is set to 2.
            //  uint16      length              This is the length in bytes of the subtable.
            //  uint16      language            Please see “Note on the language field in 'cmap' subtables“ in this document.
            //  uint16      subHeaderKeys[256]  Array that maps high bytes to subHeaders: value is subHeader index * 8.
            //  SubHeader   subHeaders[]        Variable-length array of SubHeader records.
            //  uint16      glyphIndexArray[]   Variable-length array containing subarrays used for mapping the low byte of 2-byte characters.
            //------------------
            //  A SubHeader is structured as follows:
            //  SubHeader Record:
            //  Type    Name            Description
            //  uint16  firstCode       First valid low byte for this SubHeader.
            //  uint16  entryCount      Number of valid low bytes for this SubHeader.
            //  int16   idDelta         See text below.
            //  uint16  idRangeOffset   See text below.
            //
            //  The firstCode and entryCount values specify a subrange that begins at firstCode and has a length equal to the value of entryCount.
            //This subrange stays within the 0-255 range of the byte being mapped.
            //Bytes outside of this subrange are mapped to glyph index 0 (missing glyph).
            //The offset of the byte within this subrange is then used as index into a corresponding subarray of glyphIndexArray.
            //This subarray is also of length entryCount.
            //The value of the idRangeOffset is the number of bytes past the actual location of the idRangeOffset word
            //where the glyphIndexArray element corresponding to firstCode appears.
            //  Finally, if the value obtained from the subarray is not 0 (which indicates the missing glyph),
            //you should add idDelta to it in order to get the glyphIndex.
            //The value idDelta permits the same subarray to be used for several different subheaders.
            //The idDelta arithmetic is modulo 65536.

            Utils.WarnUnimplemented("cmap subtable format 2");
            return new NullCharMap();
        }

        /// <summary>
        /// Reads a format 4 (segment mapping to delta values) subtable.
        /// </summary>
        /// <param name="input">
        /// reader positioned just after the subtable's 2-byte 'format' field
        /// (the only caller, ReadCharacterMap, consumes that field before dispatching here)
        /// </param>
        static CharMapFormat4 ReadFormat_4(BinaryReader input)
        {
            ushort lenOfSubTable = input.ReadUInt16(); //This is the length in bytes of the subtable. ****

            //This is the Microsoft standard character to glyph index mapping table for fonts that support Unicode ranges other than the range [U+D800 - U+DFFF] (defined as Surrogates Area, in Unicode v 3.0)
            //which is used for UCS-4 characters.
            //If a font supports this character range (i.e. in turn supports the UCS-4 characters) a subtable in this format with a platform specific encoding ID 1 is yet needed,
            //in addition to a subtable in format 12 with a platform specific encoding ID 10. Please see details on format 12 below, for fonts that support UCS-4 characters on Windows.
            //
            //This format is used when the character codes for the characters represented by a font fall into several contiguous ranges,
            //possibly with holes in some or all of the ranges (that is, some of the codes in a range may not have a representation in the font).
            //The format-dependent data is divided into three parts, which must occur in the following order:
            //    A four-word header gives parameters for an optimized search of the segment list;
            //    Four parallel arrays describe the segments (one segment for each contiguous range of codes);
            //    A variable-length array of glyph IDs (unsigned words).

            // The 'length' field counts from the first byte of the subtable, so it already covers
            // the 'format' and 'length' fields (2 bytes each) that have been consumed at this point.
            // Step back over those 4 bytes to find where the subtable really ends; otherwise the
            // glyph id array is read 4 bytes (2 entries) past the end of the subtable.
            const int consumedHeaderBytes = 4;
            long tableEndAt = input.BaseStream.Position - consumedHeaderBytes + lenOfSubTable;

            ushort language = input.ReadUInt16();
            //Note on the language field in 'cmap' subtables:
            //The language field must be set to zero for all cmap subtables whose platform IDs are other than Macintosh (platform ID 1).
            //For cmap subtables whose platform IDs are Macintosh, set this field to the Macintosh language ID of the cmap subtable plus one,
            //or to zero if the cmap subtable is not language-specific.
            //For example, a Mac OS Turkish cmap subtable must set this field to 18, since the Macintosh language ID for Turkish is 17.
            //A Mac OS Roman cmap subtable must set this field to 0, since Mac OS Roman is not a language-specific encoding.

            ushort segCountX2 = input.ReadUInt16();    //2 * segCount
            ushort searchRange = input.ReadUInt16();   //2 * (2**FLOOR(log2(segCount)))
            ushort entrySelector = input.ReadUInt16(); //log2(searchRange / 2)
            ushort rangeShift = input.ReadUInt16();    //(2 * segCount) - searchRange
            int segCount = segCountX2 / 2;

            ushort[] endCode = Utils.ReadUInt16Array(input, segCount); //Ending character code for each segment, last = 0xFFFF.
            //>To ensure that the search will terminate, the final endCode value must be 0xFFFF.
            //>This segment need not contain any valid mappings. It can simply map the single character code 0xFFFF to the missing character glyph, glyph 0.

            ushort Reserved = input.ReadUInt16(); // always 0
            ushort[] startCode = Utils.ReadUInt16Array(input, segCount);     //Starting character code for each segment
            ushort[] idDelta = Utils.ReadUInt16Array(input, segCount);       //Delta for all character codes in segment
            ushort[] idRangeOffset = Utils.ReadUInt16Array(input, segCount); //Offset in bytes to glyph indexArray, or 0
            //------------------------------------------------------------------------------------
            // whatever is left of the subtable is the glyph id array;
            // a declared length that is too small yields an empty array instead of a negative count
            long remainingLen = tableEndAt - input.BaseStream.Position;
            int glyphIdCount = remainingLen > 0 ? (int)(remainingLen / 2) : 0;
            ushort[] glyphIdArray = Utils.ReadUInt16Array(input, glyphIdCount); //Glyph index array
            return new CharMapFormat4(startCode, endCode, idDelta, idRangeOffset, glyphIdArray);
        }

        /// <summary>
        /// Reads a format 6 (trimmed table mapping) subtable.
        /// </summary>
        /// <param name="input">reader positioned just after the subtable's 'format' field</param>
        static CharMapFormat6 ReadFormat_6(BinaryReader input)
        {
            //Format 6: Trimmed table mapping
            //Type      Name                        Description
            //uint16    format                      Format number is set to 6.
            //uint16    length                      This is the length in bytes of the subtable.
            //uint16    language                    Please see “Note on the language field in 'cmap' subtables“ in this document.
            //uint16    firstCode                   First character code of subrange.
            //uint16    entryCount                  Number of character codes in subrange.
            //uint16    glyphIdArray[entryCount]    Array of glyph index values for character codes in the range.

            //The firstCode and entryCount values specify a subrange (beginning at firstCode, length = entryCount) within the range of possible character codes.
            //Codes outside of this subrange are mapped to glyph index 0.
            //The offset of the code (from the first code) within this subrange is used as index to the glyphIdArray,
            //which provides the glyph index value.

            ushort length = input.ReadUInt16();
            ushort language = input.ReadUInt16();
            ushort firstCode = input.ReadUInt16();
            ushort entryCount = input.ReadUInt16();
            ushort[] glyphIdArray = Utils.ReadUInt16Array(input, entryCount);
            return new CharMapFormat6(firstCode, glyphIdArray);
        }

        /// <summary>
        /// Reads a format 12 (segmented coverage) subtable.
        /// </summary>
        /// <param name="input">reader positioned just after the subtable's 'format' field</param>
        static CharacterMap ReadFormat_12(BinaryReader input)
        {
            //TODO: test this again
            // Format 12: Segmented coverage
            //This is the Microsoft standard character to glyph index mapping table for fonts supporting the UCS-4 characters
            //in the Unicode Surrogates Area (U+D800 - U+DFFF).
            //It is a bit like format 4, in that it defines segments for sparse representation in 4-byte character space.
            //Here's the subtable format:
            //'cmap' Subtable Format 12:
            //Type                  Name                Description
            //uint16                format              Subtable format; set to 12.
            //uint16                reserved            Reserved; set to 0
            //uint32                length              Byte length of this subtable (including the header)
            //uint32                language            Please see “Note on the language field in 'cmap' subtables“ in this document.
            //uint32                numGroups           Number of groupings which follow
            //SequentialMapGroup    groups[numGroups]   Array of SequentialMapGroup records.
            //
            //The sequential map group record is the same format as is used for the format 8 subtable.
            //The qualifications regarding 16-bit character codes does not apply here,
            //however, since characters codes are uniformly 32-bit.
            //SequentialMapGroup Record:
            //Type      Name            Description
            //uint32    startCharCode   First character code in this group
            //uint32    endCharCode     Last character code in this group
            //uint32    startGlyphID    Glyph index corresponding to the starting character code
            //
            //Groups must be sorted by increasing startCharCode. A group's endCharCode must be less than the startCharCode of the following group,
            //if any. The endCharCode is used, rather than a count, because comparisons for group matching are usually done on an existing character code,
            //and having the endCharCode be there explicitly saves the necessity of an addition per group.
            //
            //Fonts providing Unicode-encoded UCS-4 character support for Windows 2000 and later,
            //need to have a subtable with platform ID 3, platform specific encoding ID 1 in format 4;
            //and in addition, need to have a subtable for platform ID 3, platform specific encoding ID 10 in format 12.
            //Please note, that the content of format 12 subtable,
            //needs to be a super set of the content in the format 4 subtable.
            //The format 4 subtable needs to be in the cmap table to enable backward compatibility needs.

            ushort reserved = input.ReadUInt16();
#if DEBUG
            if (reserved != 0) { throw new OpenFontNotSupportedException(); }
#endif

            uint length = input.ReadUInt32(); // Byte length of this subtable(including the header)
            uint language = input.ReadUInt32();
            uint numGroups = input.ReadUInt32();

#if DEBUG
            if (numGroups > int.MaxValue) { throw new OpenFontNotSupportedException(); }
#endif
            uint[] startCharCodes = new uint[(int)numGroups];
            uint[] endCharCodes = new uint[(int)numGroups];
            uint[] startGlyphIds = new uint[(int)numGroups];

            for (uint i = 0; i < numGroups; ++i)
            {
                //seq map group record
                startCharCodes[i] = input.ReadUInt32();
                endCharCodes[i] = input.ReadUInt32();
                startGlyphIds[i] = input.ReadUInt32();
            }
            return new CharMapFormat12(startCharCodes, endCharCodes, startGlyphIds);
        }

        /// <summary>
        /// Reads the 2-byte 'format' field of a subtable and dispatches to the matching reader.
        /// Formats without a reader are reported through Utils.WarnUnimplemented and yield an empty map.
        /// </summary>
        /// <param name="input">reader positioned at the first byte of a subtable</param>
        private static CharacterMap ReadCharacterMap(BinaryReader input)
        {
            ushort format = input.ReadUInt16();
            switch (format)
            {
                case 0: return ReadFormat_0(input);
                case 2: return ReadFormat_2(input);
                case 4: return ReadFormat_4(input);
                case 6: return ReadFormat_6(input);
                case 12: return ReadFormat_12(input);
                case 14: return CharMapFormat14.Create(input);
                default:
                    Utils.WarnUnimplemented("cmap subtable format {0}", format);
                    return new NullCharMap();
            }
        }

        // NOTE(review): the element types of the lists below (uint codepoints, ushort glyph indices)
        // are not derivable from this file alone - confirm against CharacterMap.CollectUnicodeChars.

        /// <summary>
        /// Asks every subtable to add the character codes it maps to <paramref name="unicodes"/>.
        /// </summary>
        public void CollectUnicode(List<uint> unicodes)
        {
            for (int i = 0; i < _charMaps.Length; ++i)
            {
                _charMaps[i].CollectUnicodeChars(unicodes);
            }
        }

        /// <summary>
        /// Asks the subtables of one platform to add their character codes and glyph indices to the given lists.
        /// </summary>
        /// <param name="platform">platform ID to collect from; a negative value selects every subtable</param>
        /// <param name="unicodes">list passed on to each selected subtable</param>
        /// <param name="glyphIndexList">list passed on to each selected subtable</param>
        public void CollectUnicode(int platform, List<uint> unicodes, List<ushort> glyphIndexList)
        {
            for (int i = 0; i < _charMaps.Length; ++i)
            {
                CharacterMap cmap = _charMaps[i];
                if (platform < 0 || cmap.PlatformId == platform)
                {
                    cmap.CollectUnicodeChars(unicodes, glyphIndexList);
                }
            }
        }
    }
}