Spaces:
Runtime error
Runtime error
import warnings | |
from binascii import unhexlify | |
from math import ceil | |
from typing import Any, Dict, List, Tuple, Union, cast | |
from ._codecs import adobe_glyphs, charset_encoding | |
from ._utils import logger_warning | |
from .errors import PdfReadWarning | |
from .generic import DecodedStreamObject, DictionaryObject, StreamObject | |
# code freely inspired from @twiggy ; see #711 | |
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[
    str, float, Union[str, Dict[int, str]], Dict, DictionaryObject
]:  # font_type, space_width / 2, encoding, cmap, font dictionary
    """Determine information about a font.

    Args:
        font_name: key of the font inside ``obj["/Resources"]["/Font"]``.
        space_width: fallback space width; it is overridden by the
            ``_default_fonts_space_width`` entry when the font's ``/BaseFont``
            is listed there, then handed to ``compute_space_width``.
        obj: object whose ``/Resources`` dictionary references the font.

    Returns:
        A tuple ``(font sub-type, space width / 2, encoding, character map,
        font dictionary)``.  ``encoding`` is either a codec name or a
        ``{code: character}`` table.  The font dictionary itself is returned
        for the curious.
    """
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    # encoding is either a codec name (codes on 1, 2 or a variable number of
    # bytes) or a character table (one-byte codes only).
    # An empty string means that no /Encoding entry was usable, so the codec
    # has to be chosen from the cmap data.
    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            # No rule was found for fonts having neither /Encoding nor
            # /ToUnicode; one example shows /Symbol,bold, hence 8-bit default.
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    # PDF ref 1.7 §5.9.1, 1st bullet: when the cmap is not empty the encoding
    # should be discarded (here: turned into identity for those characters).
    # When encoding is a str it is expected to be an identity translation.
    elif isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)

    try:
        # override space_width with the value known for standard fonts
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        # best effort: no /BaseFont, or not one of the listed fonts
        pass

    # The space code is expected to fit on one byte; otherwise it is rebuilt
    # from its two-byte big-endian representation.
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except Exception:
            raw = space_code.encode("utf-16-be")
            # utf-16-be is big-endian: the first byte is the high-order one
            sp = (raw[0] << 8) | raw[1]
    else:
        sp = space_code

    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
        ft,
    )
# Fallback character map returned when font data is missing (e.g. the font
# definition is absent): every one-byte code maps to the replacement character.
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    {code: "�" for code in range(256)},
    {},
)
_predefined_cmap: Dict[str, str] = { | |
"/Identity-H": "utf-16-be", | |
"/Identity-V": "utf-16-be", | |
"/GB-EUC-H": "gbk", # TBC | |
"/GB-EUC-V": "gbk", # TBC | |
"/GBpc-EUC-H": "gb2312", # TBC | |
"/GBpc-EUC-V": "gb2312", # TBC | |
} | |
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz | |
_default_fonts_space_width: Dict[str, int] = { | |
"/Courrier": 600, | |
"/Courier-Bold": 600, | |
"/Courier-BoldOblique": 600, | |
"/Courier-Oblique": 600, | |
"/Helvetica": 278, | |
"/Helvetica-Bold": 278, | |
"/Helvetica-BoldOblique": 278, | |
"/Helvetica-Oblique": 278, | |
"/Helvetica-Narrow": 228, | |
"/Helvetica-NarrowBold": 228, | |
"/Helvetica-NarrowBoldOblique": 228, | |
"/Helvetica-NarrowOblique": 228, | |
"/Times-Roman": 250, | |
"/Times-Bold": 250, | |
"/Times-BoldItalic": 250, | |
"/Times-Italic": 250, | |
"/Symbol": 250, | |
"/ZapfDingbats": 278, | |
} | |
def parse_encoding(
    ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
    """Work out the encoding of a font from its dictionary.

    Args:
        ft: the font dictionary.
        space_code: the space character code assumed so far.

    Returns:
        A tuple ``(encoding, space_code)``.  ``encoding`` is a codec name, a
        ``{code: character}`` table, or ``""`` when nothing could be
        determined (no ``/Encoding`` entry and the font is not ``/Type1``).
    """
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        try:
            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
                encoding = dict(
                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
                )
            else:
                encoding = "charmap"
            # NOTE(review): this path returns the default space *width* in the
            # slot every other path uses for the space *code* -- confirm that
            # this is intended.
            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
        except Exception:
            # best effort: /BaseFont is missing or is not a listed font
            if cast(str, ft["/Subtype"]) == "/Type1":
                return "charmap", space_code
            return "", space_code

    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        # already done: enc = NameObject.unnumber(enc.encode()).decode()
        # (for #xx decoding)
        if enc in charset_encoding:
            encoding = charset_encoding[enc].copy()
        elif enc in _predefined_cmap:
            encoding = _predefined_cmap[enc]
        else:
            warnings.warn(
                f"Advanced encoding {enc} not implemented yet",
                PdfReadWarning,
            )
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        base_encoding: Any = None
        try:
            base_encoding = enc["/BaseEncoding"]
            encoding = charset_encoding[cast(str, base_encoding)].copy()
        except Exception:
            # report the name that could not be resolved, then fall back to
            # the standard table
            warnings.warn(
                f"Advanced encoding {base_encoding} not implemented yet",
                PdfReadWarning,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()

    if "/Differences" in enc:
        # /Differences is a flat sequence: an int sets the current code, each
        # following name is assigned to the current code, which is then
        # incremented.
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o, str):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    # unknown glyph name: keep the raw name
                    encoding[x] = o  # type: ignore
                    if o == " ":
                        space_code = x
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding, space_code
def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """Build the character map described by the font's ``/ToUnicode`` entry.

    Args:
        ft: the font dictionary.
        space_code: the space character code assumed so far.

    Returns:
        A tuple ``(map_dict, space_code, int_entry)``.  ``map_dict`` holds the
        translations, plus under key ``-1`` the number of bytes per code.
        ``space_code`` is replaced by the key mapped to ``" "`` when there is
        one.  ``int_entry`` lists the cmap keys as integers.  Without a
        ``/ToUnicode`` entry the result is ``({}, space_code, [])``.
    """
    if "/ToUnicode" not in ft:
        return {}, space_code, []

    # translations; map_dict[-1] will hold the number of bytes to convert
    map_dict: Dict[Any, Any] = {}
    # cmap keys as int, used by the caller to correct the encoding
    int_entry: List[int] = []

    in_bfrange: bool = False
    in_bfchar: bool = False
    # (current_char, remaining size) of a bfrange spread over several lines;
    # cf #1285 for an example of such a file
    pending_rg: Union[None, Tuple[int, int]] = None

    for raw_line in prepare_cm(ft).split(b"\n"):
        in_bfrange, in_bfchar, pending_rg = process_cm_line(
            raw_line.strip(b" "),
            in_bfrange,
            in_bfchar,
            pending_rg,
            map_dict,
            int_entry,
        )

    # the last key translated to a space wins
    for key, translated in map_dict.items():
        if translated == " ":
            space_code = key
    return map_dict, space_code, int_entry
def prepare_cm(ft: DictionaryObject) -> bytes:
    """Return the font's ``/ToUnicode`` CMap normalised for line parsing.

    The CMap keywords are put on their own lines, ``<...>`` hex strings lose
    their delimiters and inner spaces (an empty ``<>`` becomes the placeholder
    ``.``), and brackets are surrounded by spaces.

    Args:
        ft: the font dictionary; it must contain ``/ToUnicode``.

    Returns:
        The prepared CMap.  It is empty when ``/ToUnicode`` is neither a
        stream nor a name starting with ``/Identity``.
    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    elif isinstance(tu, str) and tu.startswith("/Identity"):
        # original author's note: the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    else:
        # unsupported /ToUnicode value: behave as an empty CMap instead of
        # failing on an unbound variable
        cm = b""
    if isinstance(cm, str):
        cm = cm.encode()

    # The keywords are isolated first because line breaks are missing in PDFs
    # printed to PDF from Word.
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> is not used but
        .replace(b">>", b"\n}\n")  # stays recognisable this way
    )

    pieces = cm.split(b"<")
    for i, piece in enumerate(pieces):
        j = piece.find(b">")
        if j < 0:
            continue
        if j == 0:
            # empty string: stash a placeholder that parse_bfchar recognises
            # see https://github.com/py-pdf/PyPDF2/issues/1111
            content = b"."
        else:
            content = piece[:j].replace(b" ", b"")
        pieces[i] = content + b" " + piece[j + 1 :]

    return (
        b" ".join(pieces)
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
def process_cm_line(
    l: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """Handle one prepared CMap line and return the updated parser state.

    Args:
        l: the line to handle.
        process_rg: True while inside a ``beginbfrange``/``endbfrange`` section.
        process_char: True while inside a ``beginbfchar``/``endbfchar`` section.
        multiline_rg: state of a bfrange continued from a previous line.
        map_dict: character map, filled in place.
        int_entry: list of integer character codes, filled in place.

    Returns:
        The updated ``(process_rg, process_char, multiline_rg)``.
    """
    # blank lines and comments ('%') leave the state untouched
    if l == b"" or l == b" " or l.startswith(b"%"):
        return process_rg, process_char, multiline_rg

    # section markers only toggle the matching flag
    if b"beginbfrange" in l:
        return True, process_char, multiline_rg
    if b"endbfrange" in l:
        return False, process_char, multiline_rg
    if b"beginbfchar" in l:
        return process_rg, True, multiline_rg
    if b"endbfchar" in l:
        return process_rg, False, multiline_rg

    # data lines: a range section takes precedence over a char section
    if process_rg:
        return (
            process_rg,
            process_char,
            parse_bfrange(l, map_dict, int_entry, multiline_rg),
        )
    if process_char:
        parse_bfchar(l, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
def _bfrange_source_key(code: int, nb_bytes: int) -> str:
    """Return the map_dict key of character code *code* stored on *nb_bytes* bytes."""
    return unhexlify(b"%%0%dX" % (nb_bytes * 2) % code).decode(
        "charmap" if nb_bytes == 1 else "utf-16-be",
        "surrogatepass",
    )


def parse_bfrange(
    l: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """Parse one line of a ``bfrange`` section into *map_dict*.

    Two forms are handled: ``first last dest`` (consecutive destinations
    starting at ``dest``) and ``first last [ dest1 dest2 ...`` (explicit
    destinations, possibly continued on the following lines until ``]``).

    Args:
        l: the prepared line (hex strings without their ``<>`` delimiters).
        map_dict: character map, filled in place; ``map_dict[-1]`` receives
            the number of bytes per source code.
        int_entry: list of integer source codes, filled in place.
        multiline_rg: ``None`` for a new range, otherwise the
            ``(next code, last code)`` state returned for the previous line.

    Returns:
        ``None`` when the range is complete, otherwise the
        ``(next code, last code)`` state to pass with the next line.
    """
    lst = [x for x in l.split(b" ") if x]

    if multiline_rg is not None:
        # Continuation line: it holds destinations only, so the code width is
        # the one recorded for the first line and every token is a destination.
        a, b = multiline_rg
        nb_bytes = map_dict[-1]
        destinations = lst
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        nb_bytes = ceil(max(len(lst[0]), len(lst[1])) / 2)
        map_dict[-1] = nb_bytes
        if lst[2] != b"[":  # case without list
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            while a <= b:
                map_dict[_bfrange_source_key(a, nb_bytes)] = unhexlify(
                    fmt2 % c
                ).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
            return None
        destinations = lst[3:]

    closure_found = False
    for sq in destinations:
        if sq == b"]":
            closure_found = True
            break
        map_dict[_bfrange_source_key(a, nb_bytes)] = unhexlify(sq).decode(
            "utf-16-be", "surrogatepass"
        )
        int_entry.append(a)
        a += 1
    return None if closure_found else (a, b)
def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """Parse one line of a ``bfchar`` section into *map_dict*.

    The line is a sequence of ``source destination`` hex pairs; a destination
    of ``.`` is the placeholder for an empty string.

    Args:
        l: the prepared line (hex strings without their ``<>`` delimiters).
        map_dict: character map, filled in place; ``map_dict[-1]`` receives
            the number of bytes of the first source code.
        int_entry: list of integer source codes, filled in place.
    """
    tokens = [tok for tok in l.split(b" ") if tok]
    map_dict[-1] = len(tokens[0]) // 2
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    # walk the tokens two by two; a trailing unpaired token is ignored
    for src_hex, dst_hex in zip(tokens[0::2], tokens[1::2]):
        if dst_hex == b".":
            # placeholder: the destination is the empty string
            translated = ""
        else:
            dst_codec = "charmap" if len(dst_hex) < 4 else "utf-16-be"
            translated = unhexlify(dst_hex).decode(dst_codec, "surrogatepass")
        source = unhexlify(src_hex).decode(key_codec, "surrogatepass")
        map_dict[source] = translated
        int_entry.append(int(src_hex, 16))
def compute_space_width(
    ft: DictionaryObject, space_code: int, space_width: float
) -> float:
    """Return the width of the space character of a font.

    Args:
        ft: the font dictionary.
        space_code: character code of the space.
        space_width: fallback width; twice this value is returned when the
            font has neither ``/DescendantFonts`` nor ``/Widths``.

    Returns:
        The width found for *space_code*, or a fallback: half the default
        width (``/DW``, 1000 when absent) for fonts with descendants; for
        fonts with ``/Widths``, the descriptor's ``/MissingWidth`` or else
        half the average of the positive widths.
    """
    sp_width: float = space_width * 2  # default value

    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        # code -> width; key -1 holds the default width
        w1: Dict[int, Any] = {}
        try:
            w1[-1] = cast(float, ft1["/DW"])
        except Exception:
            w1[-1] = 1000.0

        w = list(ft1["/W"]) if "/W" in ft1 else []
        while len(w) > 0:
            st = w[0]
            second = w[1]
            if isinstance(second, int):
                # "c_first c_last w": the same width for the whole range,
                # c_last included
                for x in range(st, second + 1):
                    w1[x] = w[2]
                w = w[3:]
            elif isinstance(second, list):
                # "c [w1 w2 ...]": consecutive codes starting at c
                for y in second:
                    w1[st] = y
                    st += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break

        try:
            sp_width = w1[space_code]
        except Exception:
            # when using the default, the space is considered half size
            sp_width = w1[-1] / 2.0
    elif "/Widths" in ft:
        w = list(ft["/Widths"])  # type: ignore
        try:
            st = cast(int, ft["/FirstChar"])
            en: int = cast(int, ft["/LastChar"])
            if st > space_code or en < space_code:
                raise Exception("Not in range")
            if w[space_code - st] == 0:
                raise Exception("null width")
            sp_width = w[space_code - st]
        except Exception:
            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                DictionaryObject, ft["/FontDescriptor"]
            ):
                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
            else:
                # consider the width of the space as avg(width) / 2
                positive = [x for x in w if x > 0]
                sp_width = sum(positive) / max(1, len(positive)) / 2
    return sp_width