import warnings
from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast

from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject, StreamObject


# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[
    str, float, Union[str, Dict[int, str]], Dict, DictionaryObject
]:  # font_type, space_width / 2, encoding, cmap, font dictionary
    """Determine information about a font.

    Args:
        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: fallback width of the space character; replaced by the
            entry of ``_default_fonts_space_width`` when ``/BaseFont`` is
            listed there.
        obj: object whose ``/Resources`` dictionary holds the font.

    Returns:
        A tuple ``(font sub-type, space_width / 2, encoding, character map,
        font dictionary)``.  ``encoding`` is either a codec name or a
        ``{byte: str}`` table; the character map comes from ``/ToUnicode``.
        The font dictionary itself is returned for the curious.
    """
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    # encoding is either a codec name (decoding on 1, 2 or a variable number
    # of bytes) or a character table (one byte only).
    # An empty string means that no /Encoding entry was present and that the
    # codec has to be selected from the cmap data.
    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            # I have not been able to find any rule for no /Encoding nor /ToUnicode
            # One example shows /Symbol,bold: 8-bit encoding is taken as default
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    # apply rule from PDF ref 1.7 §5.9.1, 1st bullet: if the cmap is not empty
    # the encoding should be discarded (here transformed into identity for
    # those characters).
    # If encoding is a str it is expected to be an identity translation.
    elif isinstance(encoding, dict):
        for code in int_entry:
            if code <= 255:
                encoding[code] = chr(code)

    try:
        # override space_width with the default width of a known base font
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        # best effort: no /BaseFont or not a listed font, keep the caller's value
        pass

    # the space code is expected to fit on one byte; otherwise rebuild the
    # two-byte code
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except Exception:
            code_bytes = space_code.encode("utf-16-be")
            # big-endian: the first byte is the most significant one
            sp = code_bytes[0] * 256 + code_bytes[1]
    else:
        sp = space_code
    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
        ft,
    )


# used when missing data, e.g. font def missing
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    dict(zip(range(256), ["�"] * 256)),
    {},
)


# predefined CMap name -> Python codec name
_predefined_cmap: Dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",  # TBC
    "/GB-EUC-V": "gbk",  # TBC
    "/GBpc-EUC-H": "gb2312",  # TBC
    "/GBpc-EUC-V": "gb2312",  # TBC
}


# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
    "/Courier": 600,
    "/Courier-Bold": 600,
    "/Courier-BoldOblique": 600,
    "/Courier-Oblique": 600,
    "/Helvetica": 278,
    "/Helvetica-Bold": 278,
    "/Helvetica-BoldOblique": 278,
    "/Helvetica-Oblique": 278,
    "/Helvetica-Narrow": 228,
    "/Helvetica-NarrowBold": 228,
    "/Helvetica-NarrowBoldOblique": 228,
    "/Helvetica-NarrowOblique": 228,
    "/Times-Roman": 250,
    "/Times-Bold": 250,
    "/Times-BoldItalic": 250,
    "/Times-Italic": 250,
    "/Symbol": 250,
    "/ZapfDingbats": 278,
}


def parse_encoding(
    ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
    """Derive the byte -> text encoding of a font from its ``/Encoding`` entry.

    Args:
        ft: the font dictionary.
        space_code: current guess for the code of the space character.

    Returns:
        ``(encoding, space_code)`` where ``encoding`` is a codec name, a
        ``{code: str}`` table, or ``""`` when nothing could be determined.
    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        try:
            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
                encoding = dict(
                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
                )
            else:
                encoding = "charmap"
            # NOTE(review): the second value is looked up in the default
            # space *width* table although callers treat it as a space
            # *code*; kept as is, to be confirmed.
            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
        except Exception:
            if cast(str, ft["/Subtype"]) == "/Type1":
                return "charmap", space_code
            else:
                return "", space_code

    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        try:
            # already done : enc = NameObject.unnumber(enc.encode()).decode()
            # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            else:
                raise Exception("not found")
        except Exception:
            warnings.warn(
                f"Advanced encoding {enc} not implemented yet",
                PdfReadWarning,
            )
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # report the base encoding that could not be resolved
            warnings.warn(
                f"Advanced encoding {base_encoding} not implemented yet",
                PdfReadWarning,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()

    if "/Differences" in enc:
        # /Differences alternates integers (next code to assign) and glyph
        # names (assigned to consecutive codes)
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o,str):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    encoding[x] = o  # type: ignore
                    if o == " ":
                        space_code = x
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding, space_code


def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """Parse the ``/ToUnicode`` CMap of a font.

    Args:
        ft: the font dictionary.
        space_code: current guess for the code of the space character.

    Returns:
        ``(map_dict, space_code, int_entry)``.  ``map_dict`` maps decoded
        character codes to text, with ``map_dict[-1]`` holding the number of
        bytes per code; ``space_code`` becomes the key mapped to ``" "`` when
        one exists; ``int_entry`` lists the mapped codes as integers.
    """
    if "/ToUnicode" not in ft:
        return {}, space_code, []

    # will store all translation codes;
    # map_dict[-1] holds the number of bytes to convert
    map_dict: Dict[Any, Any] = {}
    # will provide the list of cmap keys as int to correct the encoding
    int_entry: List[int] = []

    process_rg: bool = False
    process_char: bool = False
    # pending state of a bfrange array spread over several lines, as returned
    # by parse_bfrange ; cf #1285 for an example of file
    multiline_rg: Union[None, Tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            raw_line.strip(b" "),
            process_rg,
            process_char,
            multiline_rg,
            map_dict,
            int_entry,
        )

    for key, value in map_dict.items():
        if value == " ":
            space_code = key
    return map_dict, space_code, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
    """Return the ``/ToUnicode`` CMap of ``ft`` normalised for line parsing.

    Section keywords are put on their own lines, ``<...>`` hex strings lose
    their brackets and inner spaces (an empty ``<>`` becomes the placeholder
    ``.``), and ``[`` / ``]`` are surrounded by spaces.

    An unsupported ``/ToUnicode`` value is reported with a warning and
    treated as an empty CMap.
    """
    tu = ft["/ToUnicode"]
    cm: bytes = b""
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    elif isinstance(tu, str) and tu.startswith("/Identity"):
        # NOTE(review): only codes 0000-0001 are listed here; the range
        # mainly makes parse_bfrange record a 2-byte code width.
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    else:
        logger_warning(f"Unsupported /ToUnicode entry ignored: {tu!r}", __name__)
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    chunks = cm.split(b"<")
    for i, chunk in enumerate(chunks):
        j = chunk.find(b">")
        if j < 0:
            continue
        if j == 0:
            # string is empty: stash a placeholder here (read back by parse_bfchar)
            # see https://github.com/py-pdf/PyPDF2/issues/1111
            content = b"."
        else:
            content = chunk[:j].replace(b" ", b"")
        chunks[i] = content + b" " + chunk[j + 1 :]
    cm = (
        (b" ".join(chunks))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm


def process_cm_line(
    l: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """Process one prepared CMap line and return the updated parser state.

    Args:
        l: the line, already stripped of surrounding spaces.
        process_rg: True while inside a ``beginbfrange`` section.
        process_char: True while inside a ``beginbfchar`` section.
        multiline_rg: pending state of a bfrange array continued over lines.
        map_dict: character map, updated in place.
        int_entry: list of mapped codes, updated in place.

    Returns:
        The new ``(process_rg, process_char, multiline_rg)``.
    """
    # skip blank lines and comments
    if l in (b"", b" ") or l[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg
    if b"beginbfrange" in l:
        process_rg = True
    elif b"endbfrange" in l:
        process_rg = False
    elif b"beginbfchar" in l:
        process_char = True
    elif b"endbfchar" in l:
        process_char = False
    elif process_rg:
        multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(l, map_dict, int_entry)
    return process_rg, process_char, multiline_rg


def _bfrange_key(fmt: bytes, code: int, nb_bytes: int) -> str:
    """Format ``code`` as hex with ``fmt`` and decode it into a map key."""
    return unhexlify(fmt % code).decode(
        "charmap" if nb_bytes == 1 else "utf-16-be",
        "surrogatepass",
    )


def parse_bfrange(
    l: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """Parse one line of a ``bfrange`` section into ``map_dict``.

    Args:
        l: prepared line; either ``first last dest``, ``first last [ d1 d2 ...``
            or, when ``multiline_rg`` is set, the continuation of an array.
        map_dict: character map, updated in place (``map_dict[-1]`` receives
            the number of bytes per source code).
        int_entry: list of mapped codes, updated in place.
        multiline_rg: ``(next code, last code)`` of an array left open by a
            previous line, or None.

    Returns:
        None when the range is complete, otherwise ``(next code, last code)``
        to be passed back with the following line.
    """
    lst = [x for x in l.split(b" ") if x]
    closure_found = False
    if multiline_rg is not None:
        # Continuation line: it only holds destination values, so the code
        # width recorded by the opening line must be reused and every token
        # (including the first one) is a destination.
        nb_bytes = map_dict[-1]
        fmt = b"%%0%dX" % (nb_bytes * 2)
        a, b = multiline_rg  # a, b not in the current line
        for sq in lst:
            if sq == b"]":
                closure_found = True
                break
            map_dict[_bfrange_key(fmt, a, nb_bytes)] = unhexlify(sq).decode(
                "utf-16-be", "surrogatepass"
            )
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        nbi = max(len(lst[0]), len(lst[1]))
        map_dict[-1] = ceil(nbi / 2)
        nb_bytes = map_dict[-1]
        fmt = b"%%0%dX" % (nb_bytes * 2)
        if lst[2] == b"[":
            # explicit list of destinations, one per source code
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                map_dict[_bfrange_key(fmt, a, nb_bytes)] = unhexlify(sq).decode(
                    "utf-16-be", "surrogatepass"
                )
                int_entry.append(a)
                a += 1
        else:  # case without list: destinations are consecutive values
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            while a <= b:
                map_dict[_bfrange_key(fmt, a, nb_bytes)] = unhexlify(
                    fmt2 % c
                ).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)


def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """Parse one line of a ``bfchar`` section into ``map_dict``.

    The line holds ``source destination`` pairs.  ``map_dict[-1]`` receives
    the number of bytes of the first source code and ``int_entry`` the integer
    value of every source code.
    """
    lst = [x for x in l.split(b" ") if x]
    map_dict[-1] = len(lst[0]) // 2
    while len(lst) > 1:
        map_to = ""
        # the placeholder written by prepare_cm means empty string
        if lst[1] != b".":
            map_to = unhexlify(lst[1]).decode(
                "charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
            )  # join is here as some cases where the code was split
        map_dict[
            unhexlify(lst[0]).decode(
                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
            )
        ] = map_to
        int_entry.append(int(lst[0], 16))
        lst = lst[2:]


def compute_space_width(
    ft: DictionaryObject, space_code: int, space_width: float
) -> float:
    """Return the width to use for the space character of font ``ft``.

    Args:
        ft: the font dictionary.
        space_code: code of the space character.
        space_width: fallback; ``space_width * 2`` is returned when the font
            has neither ``/DescendantFonts`` nor ``/Widths``.

    Returns:
        The width found for ``space_code``, or an estimate: half of the
        default width for descendant fonts, ``/MissingWidth`` or half of the
        average non-zero width for simple fonts.
    """
    sp_width: float = space_width * 2  # default value
    w = []
    w1 = {}
    st: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        try:
            w1[-1] = cast(float, ft1["/DW"])
        except Exception:
            w1[-1] = 1000.0
        if "/W" in ft1:
            w = list(ft1["/W"])
        else:
            w = []
        while len(w) > 0:
            st = w[0]
            second = w[1]
            if isinstance(second, int):
                # "c_first c_last w": same width for every code of the range,
                # c_last included
                for x in range(st, second + 1):
                    w1[x] = w[2]
                w = w[3:]
            elif isinstance(second, list):
                # "c [w1 w2 ...]": individual widths for consecutive codes
                for y in second:
                    w1[st] = y
                    st += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
        try:
            sp_width = w1[space_code]
        except Exception:
            sp_width = (
                w1[-1] / 2.0
            )  # if using default we consider space will be only half size
    elif "/Widths" in ft:
        w = list(ft["/Widths"])  # type: ignore
        try:
            st = cast(int, ft["/FirstChar"])
            en: int = cast(int, ft["/LastChar"])
            if st > space_code or en < space_code:
                raise Exception("Not in range")
            if w[space_code - st] == 0:
                raise Exception("null width")
            sp_width = w[space_code - st]
        except Exception:
            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                DictionaryObject, ft["/FontDescriptor"]
            ):
                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
            else:
                # will consider width of char as avg(width)/2
                m = 0
                cpt = 0
                for x in w:
                    if x > 0:
                        m += x
                        cpt += 1
                sp_width = m / max(1, cpt) / 2

    return sp_width