Files
QuickLook/QuickLook.Plugin/QuickLook.Plugin.FontViewer/Typography.OpenFont/Tables/Cmap.cs
2024-12-30 04:21:24 +08:00

437 lines
24 KiB
C#

//Apache2, 2017-present, WinterDev
//Apache2, 2014-2016, Samuel Carlsson, WinterDev
using System;
using System.Collections.Generic;
using System.IO;
namespace Typography.OpenFont.Tables
{
//---------------------------------------------------
//cmap - Character To Glyph Index Mapping Table
//---------------------------------------------------
//This table defines the mapping of character codes to the glyph index values used in the font.
//It may contain more than one subtable, in order to support more than one character encoding scheme.
//Character codes that do not correspond to any glyph in the font should be mapped to glyph index 0.
//The glyph at this location must be a special glyph representing a missing character, commonly known as .notdef.
//The table header indicates the character encodings for which subtables are present.
//Each subtable is in one of seven possible formats and begins with a format code indicating the format used.
//The platform ID and platform-specific encoding ID in the header entry (and, in the case of the Macintosh platform,
//the language field in the subtable itself) are used to specify a particular 'cmap' encoding.
//The header entries must be sorted first by platform ID, then by platform-specific encoding ID,
//and then by the language field in the corresponding subtable. Each platform ID,
//platform-specific encoding ID, and subtable language combination may appear only once in the 'cmap' table.
//When building a Unicode font for Windows, the platform ID should be 3 and the encoding ID should be 1.
//When building a symbol font for Windows, the platform ID should be 3 and the encoding ID should be 0.
//When building a font that will be used on the Macintosh, the platform ID should be 1 and the encoding ID should be 0.
//All Microsoft Unicode BMP encodings (Platform ID = 3, Encoding ID = 1) must provide at least a Format 4 'cmap' subtable.
//If the font is meant to support supplementary (non-BMP) Unicode characters,
//it will additionally need a Format 12 subtable with a platform encoding ID 10.
//The contents of the Format 12 subtable need to be a superset of the contents of the Format 4 subtable.
//Microsoft strongly recommends using a BMP Unicode 'cmap' for all fonts. However, some other encodings that appear in current fonts follow:
//Windows Encodings
//Platform ID Encoding ID Description
//3 0 Symbol
//3 1 Unicode BMP(UCS - 2)
//3 2 ShiftJIS
//3 3 PRC
//3 4 Big5
//3 5 Wansung
//3 6 Johab
//3 7 Reserved
//3 8 Reserved
//3 9 Reserved
//3 10 Unicode UCS - 4
//---------------------------------------------------
////////////////////////////////////////////////////////////////////////
//from https://docs.microsoft.com/en-us/typography/opentype/processing-part2
//CMAP Table
//Every glyph in a TrueType font is identified by a unique Glyph ID (GID),
//a simple sequential numbering of all the glyphs in the font.
//These GIDs are mapped to character codepoints in the font's CMAP table.
//In OpenType fonts, the principal mapping is to Unicode codepoints; that is,
//the GIDs of nominal glyph representations of specific characters are mapped to appropriate Unicode values.
//The key to OpenType glyph processing is that not every glyph in a font is directly mapped to a codepoint.
//Variant glyph forms, ligatures, dynamically composed diacritics and other rendering forms do not require entries in the CMAP table.
//Rather, their GIDs are mapped in layout features to the GIDs of nominal character forms,
//i.e. to those glyphs that do have CMAP entries. This is the heart of glyph processing: the mapping of GIDs to each other,
//rather than directly to character codepoints.
//In order for fonts to be able to correctly render text,
//font developers must ensure that the correct nominal glyph form GIDs are mapped to the correct Unicode codepoints.
//Application developers, of course, must ensure that their applications correctly manage input and storage of Unicode text codepoints,
//or map correctly to these codepoints from other codepages and character sets.
////////////////////////////////////////////////////////////////////////
public class Cmap : TableEntry
{
//https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
/// <summary>name string of this table ("cmap")</summary>
public const string _N = "cmap";
/// <summary>name of this table entry, always <see cref="_N"/></summary>
public override string Name => _N;
//every subtable created by ReadContentFrom, stored in header-record order; null until the table has been read
CharacterMap[] _charMaps = null;
//the format-14 subtables found in _charMaps; stays null when ReadContentFrom meets none
List<CharMapFormat14> _charMap14List;
//cache used by GetGlyphIndex: codepoint => glyph index resolved from _charMaps (a result of 0 is cached too)
Dictionary<int, ushort> _codepointToGlyphs = new Dictionary<int, ushort>();
/// <summary>
/// Resolves the glyph index for a codepoint, optionally checking whether the codepoint
/// together with the one that follows it is mapped by a format-14 subtable.
/// </summary>
/// <param name="codepoint">codepoint to look up</param>
/// <param name="nextCodepoint">codepoint that follows <paramref name="codepoint"/>; a value &lt;= 0 disables the format-14 lookup</param>
/// <param name="skipNextCodepoint">set to true only when a format-14 subtable returned a non-zero glyph
/// for the (codepoint, nextCodepoint) pair; judging by the name, the caller is then expected to skip nextCodepoint</param>
/// <returns>glyph index; 0 when no subtable produced a glyph</returns>
public ushort GetGlyphIndex(int codepoint, int nextCodepoint, out bool skipNextCodepoint)
{
    // https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
    // "character codes that do not correspond to any glyph in the font should be mapped to glyph index 0."
    skipNextCodepoint = false;

    if (!_codepointToGlyphs.TryGetValue(codepoint, out ushort glyph))
    {
        // cache miss (glyph is 0 here): walk the subtables in stored order
        foreach (CharacterMap map in _charMaps)
        {
            // Once some subtable has produced a glyph, only a subtable with
            // platform ID 3 / encoding ID 1 (Windows Unicode) is still consulted.
            if (glyph != 0 && !(map.PlatformId == 3 && map.EncodingId == 1))
            {
                continue;
            }

            ushort candidate = map.GetGlyphIndex(codepoint);
            // an existing non-zero result is never replaced by 0
            if (glyph == 0 || candidate != 0)
            {
                glyph = candidate;
            }
        }
        // remember the outcome, including "not found" (0)
        _codepointToGlyphs[codepoint] = glyph;
    }

    // No following codepoint, or no format-14 subtable: the plain lookup is the answer.
    if (nextCodepoint <= 0 || _charMap14List == null)
    {
        return glyph;
    }

    // Ask each format-14 subtable whether (codepoint, nextCodepoint) forms a UVS sequence;
    // the first non-zero answer wins.
    foreach (CharMapFormat14 uvsMap in _charMap14List)
    {
        ushort variantGlyph = uvsMap.CharacterPairToGlyphIndex(codepoint, glyph, nextCodepoint);
        if (variantGlyph > 0)
        {
            skipNextCodepoint = true;
            return variantGlyph;
        }
    }
    return glyph;
}
/// <summary>
/// Reads the cmap header (version, subtable count and the encoding records) starting at the
/// reader's current position, then seeks to and reads every listed subtable.
/// </summary>
/// <param name="input">reader positioned at the first byte of the cmap table; its stream must support seeking</param>
protected override void ReadContentFrom(BinaryReader input)
{
    //https://www.microsoft.com/typography/otspec/cmap.htm
    long tableStart = input.BaseStream.Position; //subtable offsets are applied relative to this position

    input.ReadUInt16(); //version (0), value not used
    int subtableCount = input.ReadUInt16();

    //encoding records: platformID, encodingID, offset
    ushort[] recordPlatformIds = new ushort[subtableCount];
    ushort[] recordEncodingIds = new ushort[subtableCount];
    uint[] recordOffsets = new uint[subtableCount];
    for (int r = 0; r < subtableCount; ++r)
    {
        recordPlatformIds[r] = input.ReadUInt16();
        recordEncodingIds[r] = input.ReadUInt16();
        recordOffsets[r] = input.ReadUInt32();
    }

    _charMaps = new CharacterMap[subtableCount];
    for (int r = 0; r < subtableCount; ++r)
    {
        input.BaseStream.Seek(tableStart + recordOffsets[r], SeekOrigin.Begin);

        CharacterMap map = ReadCharacterMap(input);
        map.PlatformId = recordPlatformIds[r];
        map.EncodingId = recordEncodingIds[r];
        _charMaps[r] = map;

        //format-14 subtables are additionally tracked in their own list
        CharMapFormat14 uvsMap = map as CharMapFormat14;
        if (uvsMap != null)
        {
            if (_charMap14List == null)
            {
                _charMap14List = new List<CharMapFormat14>();
            }
            _charMap14List.Add(uvsMap);
        }
    }
}
/// <summary>
/// Reads a format 0 (byte encoding) subtable and returns it wrapped as a <see cref="CharMapFormat4"/>.
/// The 'format' field itself is read by ReadCharacterMap before this method is called.
/// </summary>
/// <param name="input">reader positioned at the 'length' field of the subtable</param>
/// <returns>a format-4 style map built from the 256 glyph ids</returns>
static CharacterMap ReadFormat_0(BinaryReader input)
{
    //Format 0: Byte encoding table
    //This is the Apple standard character to glyph index mapping table.
    //
    //Type    Name               Description
    //uint16  format             Format number is set to 0.
    //uint16  length             This is the length in bytes of the subtable.
    //uint16  language           Please see "Note on the language field in 'cmap' subtables" in this document.
    //uint8   glyphIdArray[256]  An array that maps character codes to glyph index values.
    //-----------
    //A simple 1 to 1 mapping of character codes to glyph indices.
    //The glyph set is limited to 256: when this format indexes into a larger glyph set,
    //only the first 256 glyphs are accessible.
    input.ReadUInt16(); //length, value not used
    input.ReadUInt16(); //language, value not used

    byte[] rawGlyphIds = input.ReadBytes(256);
    ushort[] glyphIds = new ushort[256];
    for (int code = 0; code < glyphIds.Length; ++code)
    {
        glyphIds[code] = rawGlyphIds[code]; //widen byte => ushort
    }

    //Express the table as format-4 data with two segments:
    //  segment 0: codes 0..255, delta 0, idRangeOffset 4
    //  segment 1: codes 0xFFFF..0xFFFF, delta 1, idRangeOffset 0
    //NOTE(review): idRangeOffset 4 presumably skips the two idRangeOffset words so that
    //code 0 lands on glyphIds[0] - confirm against CharMapFormat4.
    return new CharMapFormat4(
        new ushort[] { 0, 0xFFFF },   //start codes
        new ushort[] { 255, 0xFFFF }, //end codes
        new ushort[] { 0, 1 },        //id deltas
        new ushort[] { 4, 0 },        //id range offsets
        glyphIds);
}
/// <summary>
/// Placeholder for format 2 (high-byte mapping through table) subtables.
/// Nothing is read from <paramref name="input"/>; a warning is emitted and a <see cref="NullCharMap"/> is returned.
/// </summary>
/// <param name="input">reader for the subtable (not consumed)</param>
/// <returns>a new <see cref="NullCharMap"/></returns>
static CharacterMap ReadFormat_2(BinaryReader input)
{
    //Format 2: High-byte mapping through table
    //
    //Useful for the national character code standards used for Japanese, Chinese, and Korean characters.
    //These standards use a mixed 8/16-bit encoding in which certain byte values signal the first byte
    //of a 2-byte character (those values are also legal as the second byte of a 2-byte character).
    //Even for 2-byte characters the mapping to glyph index values depends heavily on the first byte,
    //so the table begins with an array that maps the first byte to a SubHeader record.
    //For 2-byte character codes, the SubHeader maps the second byte's value through a subArray.
    //SubHeader 0 is special: it is used for single-byte character codes; no second byte is needed
    //and the single byte value is mapped through the subArray.
    //
    //'cmap' Subtable Format 2:
    //  Type       Name                Description
    //  uint16     format              Format number is set to 2.
    //  uint16     length              This is the length in bytes of the subtable.
    //  uint16     language            Please see "Note on the language field in 'cmap' subtables" in this document.
    //  uint16     subHeaderKeys[256]  Array that maps high bytes to subHeaders: value is subHeader index * 8.
    //  SubHeader  subHeaders[]        Variable-length array of SubHeader records.
    //  uint16     glyphIndexArray[]   Variable-length array containing subarrays used for mapping
    //                                 the low byte of 2-byte characters.
    //
    //SubHeader Record:
    //  Type    Name           Description
    //  uint16  firstCode      First valid low byte for this SubHeader.
    //  uint16  entryCount     Number of valid low bytes for this SubHeader.
    //  int16   idDelta        See below.
    //  uint16  idRangeOffset  See below.
    //
    //firstCode and entryCount specify a subrange that begins at firstCode and has a length of entryCount.
    //The subrange stays within the 0-255 range of the byte being mapped;
    //bytes outside of it are mapped to glyph index 0 (missing glyph).
    //The offset of the byte within the subrange is the index into a corresponding subarray of
    //glyphIndexArray, which is also of length entryCount.
    //idRangeOffset is the number of bytes past the actual location of the idRangeOffset word
    //where the glyphIndexArray element corresponding to firstCode appears.
    //If the value obtained from the subarray is not 0 (the missing glyph),
    //idDelta is added to it to get the glyphIndex; this lets several subheaders share one subarray.
    //The idDelta arithmetic is modulo 65536.

    Utils.WarnUnimplemented("cmap subtable format 2");
    CharacterMap placeholder = new NullCharMap();
    return placeholder;
}
/// <summary>
/// Reads a format 4 (segment mapping to delta values) subtable.
/// The 'format' field itself is read by ReadCharacterMap before this method is called.
/// </summary>
/// <param name="input">reader positioned at the 'length' field of the subtable</param>
/// <returns>the parsed segment arrays and glyph id array wrapped in a <see cref="CharMapFormat4"/></returns>
static CharMapFormat4 ReadFormat_4(BinaryReader input)
{
    //This is the Microsoft standard character to glyph index mapping table for fonts that support Unicode ranges other than the range [U+D800 - U+DFFF] (defined as Surrogates Area, in Unicode v 3.0)
    //which is used for UCS-4 characters.
    //If a font supports this character range (i.e. in turn supports the UCS-4 characters) a subtable in this format with a platform specific encoding ID 1 is yet needed,
    //in addition to a subtable in format 12 with a platform specific encoding ID 10. Please see details on format 12 below, for fonts that support UCS-4 characters on Windows.
    //
    //This format is used when the character codes for the characters represented by a font fall into several contiguous ranges,
    //possibly with holes in some or all of the ranges (that is, some of the codes in a range may not have a representation in the font).
    //The format-dependent data is divided into three parts, which must occur in the following order:
    //    A four-word header gives parameters for an optimized search of the segment list;
    //    Four parallel arrays describe the segments (one segment for each contiguous range of codes);
    //    A variable-length array of glyph IDs (unsigned words).

    ushort lenOfSubTable = input.ReadUInt16(); //length in bytes of the whole subtable, 'format' and 'length' fields included

    //At this point 4 bytes of the subtable ('format' + 'length') are already behind us,
    //so the subtable begins at Position - 4 and ends lenOfSubTable bytes after that.
    //(Using Position + lenOfSubTable would over-read 4 bytes past the subtable.)
    long subTableEndAt = input.BaseStream.Position - 4 + lenOfSubTable;

    //Note on the language field in 'cmap' subtables:
    //The language field must be set to zero for all cmap subtables whose platform IDs are other than Macintosh (platform ID 1).
    //For cmap subtables whose platform IDs are Macintosh, set this field to the Macintosh language ID of the cmap subtable plus one,
    //or to zero if the cmap subtable is not language-specific.
    //For example, a Mac OS Turkish cmap subtable must set this field to 18, since the Macintosh language ID for Turkish is 17.
    //A Mac OS Roman cmap subtable must set this field to 0, since Mac OS Roman is not a language-specific encoding.
    input.ReadUInt16(); //language, value not used

    ushort segCountX2 = input.ReadUInt16(); //2 * segCount
    //The three binary-search helper fields are read only to advance the stream.
    input.ReadUInt16(); //searchRange   = 2 * (2**FLOOR(log2(segCount)))
    input.ReadUInt16(); //entrySelector = log2(searchRange / 2)
    input.ReadUInt16(); //rangeShift    = 2 * segCount - searchRange
    int segCount = segCountX2 / 2;

    ushort[] endCode = Utils.ReadUInt16Array(input, segCount); //Ending character code for each segment, last = 0xFFFF.
    //>To ensure that the search will terminate, the final endCode value must be 0xFFFF.
    //>This segment need not contain any valid mappings. It can simply map the single character code 0xFFFF to the missing character glyph, glyph 0.
    input.ReadUInt16(); //reservedPad, always 0
    ushort[] startCode = Utils.ReadUInt16Array(input, segCount); //Starting character code for each segment
    ushort[] idDelta = Utils.ReadUInt16Array(input, segCount); //Delta for all character codes in segment
    ushort[] idRangeOffset = Utils.ReadUInt16Array(input, segCount); //Offset in bytes to glyph indexArray, or 0
    //------------------------------------------------------------------------------------
    //Whatever remains up to the end of the subtable is the glyph id array.
    //A 'length' smaller than the fixed part (malformed font) yields an empty array
    //instead of a negative element count.
    long remainingLen = subTableEndAt - input.BaseStream.Position;
    int glyphIdCount = remainingLen > 0 ? (int)(remainingLen / 2) : 0;
    ushort[] glyphIdArray = Utils.ReadUInt16Array(input, glyphIdCount); //Glyph index array
    return new CharMapFormat4(startCode, endCode, idDelta, idRangeOffset, glyphIdArray);
}
/// <summary>
/// Reads a format 6 (trimmed table mapping) subtable.
/// The 'format' field itself is read by ReadCharacterMap before this method is called.
/// </summary>
/// <param name="input">reader positioned at the 'length' field of the subtable</param>
/// <returns>a <see cref="CharMapFormat6"/> built from firstCode and the glyph id array</returns>
static CharMapFormat6 ReadFormat_6(BinaryReader input)
{
    //Format 6: Trimmed table mapping
    //
    //Type    Name                      Description
    //uint16  format                    Format number is set to 6.
    //uint16  length                    This is the length in bytes of the subtable.
    //uint16  language                  Please see "Note on the language field in 'cmap' subtables" in this document.
    //uint16  firstCode                 First character code of subrange.
    //uint16  entryCount                Number of character codes in subrange.
    //uint16  glyphIdArray[entryCount]  Array of glyph index values for character codes in the range.
    //
    //firstCode and entryCount specify a subrange (beginning at firstCode, length = entryCount)
    //within the range of possible character codes. Codes outside of it are mapped to glyph index 0.
    //The offset of a code from firstCode is the index into glyphIdArray, which provides the glyph index value.
    input.ReadUInt16(); //length, value not used
    input.ReadUInt16(); //language, value not used
    ushort rangeStart = input.ReadUInt16();
    ushort glyphCount = input.ReadUInt16();
    return new CharMapFormat6(rangeStart, Utils.ReadUInt16Array(input, glyphCount));
}
/// <summary>
/// Reads a format 12 (segmented coverage) subtable.
/// The 'format' field itself is read by ReadCharacterMap before this method is called.
/// </summary>
/// <param name="input">reader positioned at the 'reserved' field of the subtable</param>
/// <returns>a <see cref="CharMapFormat12"/> holding the start code, end code and start glyph id of every group</returns>
static CharacterMap ReadFormat_12(BinaryReader input)
{
    //TODO: test this again
    //Format 12: Segmented coverage
    //This is the Microsoft standard character to glyph index mapping table for fonts supporting the UCS-4 characters
    //in the Unicode Surrogates Area (U+D800 - U+DFFF).
    //It is a bit like format 4, in that it defines segments for sparse representation in 4-byte character space.
    //
    //'cmap' Subtable Format 12:
    //  Type                Name               Description
    //  uint16              format             Subtable format; set to 12.
    //  uint16              reserved           Reserved; set to 0
    //  uint32              length             Byte length of this subtable (including the header)
    //  uint32              language           Please see "Note on the language field in 'cmap' subtables" in this document.
    //  uint32              numGroups          Number of groupings which follow
    //  SequentialMapGroup  groups[numGroups]  Array of SequentialMapGroup records.
    //
    //The sequential map group record is the same format as is used for the format 8 subtable.
    //The qualifications regarding 16-bit character codes do not apply here,
    //however, since character codes are uniformly 32-bit.
    //
    //SequentialMapGroup Record:
    //  Type    Name           Description
    //  uint32  startCharCode  First character code in this group
    //  uint32  endCharCode    Last character code in this group
    //  uint32  startGlyphID   Glyph index corresponding to the starting character code
    //
    //Groups must be sorted by increasing startCharCode. A group's endCharCode must be less than
    //the startCharCode of the following group, if any. The endCharCode is used, rather than a count,
    //because comparisons for group matching are usually done on an existing character code,
    //and having the endCharCode there explicitly saves an addition per group.
    //
    //Fonts providing Unicode-encoded UCS-4 character support for Windows 2000 and later
    //need a subtable with platform ID 3, platform specific encoding ID 1 in format 4,
    //and in addition a subtable for platform ID 3, platform specific encoding ID 10 in format 12.
    //The content of the format 12 subtable needs to be a superset of the content of the format 4 subtable.
    //The format 4 subtable needs to be in the cmap table for backward compatibility.
    ushort reservedField = input.ReadUInt16();
#if DEBUG
    if (reservedField != 0) { throw new OpenFontNotSupportedException(); }
#endif
    input.ReadUInt32(); //length: byte length of this subtable (including the header), value not used
    input.ReadUInt32(); //language, value not used
    uint groupCount = input.ReadUInt32();
#if DEBUG
    if (groupCount > int.MaxValue) { throw new OpenFontNotSupportedException(); }
#endif
    int arrayLength = (int)groupCount;
    uint[] firstCodes = new uint[arrayLength];
    uint[] lastCodes = new uint[arrayLength];
    uint[] firstGlyphIds = new uint[arrayLength];
    for (int g = 0; g < arrayLength; ++g)
    {
        //one SequentialMapGroup record
        firstCodes[g] = input.ReadUInt32();
        lastCodes[g] = input.ReadUInt32();
        firstGlyphIds[g] = input.ReadUInt32();
    }
    return new CharMapFormat12(firstCodes, lastCodes, firstGlyphIds);
}
/// <summary>
/// Reads the 16-bit format number at the current position and dispatches to the matching subtable reader.
/// Formats without a reader produce a warning and a <see cref="NullCharMap"/>.
/// </summary>
/// <param name="input">reader positioned at the first byte of a cmap subtable</param>
/// <returns>the subtable read for formats 0, 2, 4, 6, 12 and 14; otherwise a new <see cref="NullCharMap"/></returns>
private static CharacterMap ReadCharacterMap(BinaryReader input)
{
    ushort subtableFormat = input.ReadUInt16();
    switch (subtableFormat)
    {
        case 0:
            return ReadFormat_0(input);
        case 2:
            return ReadFormat_2(input);
        case 4:
            return ReadFormat_4(input);
        case 6:
            return ReadFormat_6(input);
        case 12:
            return ReadFormat_12(input);
        case 14:
            return CharMapFormat14.Create(input);
        default:
            //no reader for this format
            Utils.WarnUnimplemented("cmap subtable format {0}", subtableFormat);
            return new NullCharMap();
    }
}
/// <summary>
/// Asks every subtable, in stored order, to add its characters to <paramref name="unicodes"/>.
/// </summary>
/// <param name="unicodes">list handed to each subtable's CollectUnicodeChars</param>
public void CollectUnicode(List<uint> unicodes)
{
    foreach (CharacterMap map in _charMaps)
    {
        map.CollectUnicodeChars(unicodes);
    }
}
/// <summary>
/// Asks the subtables of one platform, or of all platforms, to add their characters and glyph indices
/// to the given lists.
/// </summary>
/// <param name="platform">platform id to match against each subtable's PlatformId; a negative value selects every subtable</param>
/// <param name="unicodes">list handed to each selected subtable's CollectUnicodeChars</param>
/// <param name="glyphIndexList">second list handed to each selected subtable's CollectUnicodeChars</param>
public void CollectUnicode(int platform, List<uint> unicodes, List<ushort> glyphIndexList)
{
    foreach (CharacterMap map in _charMaps)
    {
        //negative platform => no filtering
        bool selected = platform < 0 || map.PlatformId == platform;
        if (selected)
        {
            map.CollectUnicodeChars(unicodes, glyphIndexList);
        }
    }
}
}
}