File size: 14,645 Bytes
8a58cf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
import warnings
from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast

from ._codecs import adobe_glyphs, charset_encoding
from ._utils import logger_warning
from .errors import PdfReadWarning
from .generic import DecodedStreamObject, DictionaryObject, StreamObject


# code freely inspired from @twiggy ; see #711
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[
    str, float, Union[str, Dict[int, str]], Dict, DictionaryObject
]:  # font_type,space_width /2, encoding, cmap
    """Determine information about a font.

    This function returns a tuple consisting of:
    font sub-type, space_width/2, encoding, map character-map, font-dictionary.
    The font-dictionary itself is suitable for the curious."""
    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    font_type: str = cast(str, ft["/Subtype"])

    space_code = 32
    encoding, space_code = parse_encoding(ft, space_code)
    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)

    # encoding can be either a string for decode (on 1,2 or a variable number of bytes) of a char table (for 1 byte only for me)
    # if empty string, it means it is than encoding field is not present and we have to select the good encoding from cmap input data
    if encoding == "":
        if -1 not in map_dict or map_dict[-1] == 1:
            # I have not been able to find any rule for no /Encoding nor /ToUnicode
            # One example shows /Symbol,bold I consider 8 bits encoding default
            encoding = "charmap"
        else:
            encoding = "utf-16-be"
    # apply rule from PDF ref 1.7 §5.9.1, 1st bullet : if cmap not empty encoding should be discarded (here transformed into identity for those characters)
    # if encoding is an str it is expected to be a identity translation
    elif isinstance(encoding, dict):
        for x in int_entry:
            if x <= 255:
                encoding[x] = chr(x)
    try:
        # override space_width with new params
        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
    except Exception:
        pass
    # I conside the space_code is available on one byte
    if isinstance(space_code, str):
        try:  # one byte
            sp = space_code.encode("charmap")[0]
        except Exception:
            sp = space_code.encode("utf-16-be")
            sp = sp[0] + 256 * sp[1]
    else:
        sp = space_code
    sp_width = compute_space_width(ft, sp, space_width)

    return (
        font_type,
        float(sp_width / 2),
        encoding,
        # https://github.com/python/mypy/issues/4374
        map_dict,
        ft,
    )


# used when missing data, e.g. font def missing
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
    "Unknown",
    9999,
    dict(zip(range(256), ["�"] * 256)),
    {},
)


_predefined_cmap: Dict[str, str] = {
    "/Identity-H": "utf-16-be",
    "/Identity-V": "utf-16-be",
    "/GB-EUC-H": "gbk",  # TBC
    "/GB-EUC-V": "gbk",  # TBC
    "/GBpc-EUC-H": "gb2312",  # TBC
    "/GBpc-EUC-V": "gb2312",  # TBC
}


# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
_default_fonts_space_width: Dict[str, int] = {
    "/Courrier": 600,
    "/Courier-Bold": 600,
    "/Courier-BoldOblique": 600,
    "/Courier-Oblique": 600,
    "/Helvetica": 278,
    "/Helvetica-Bold": 278,
    "/Helvetica-BoldOblique": 278,
    "/Helvetica-Oblique": 278,
    "/Helvetica-Narrow": 228,
    "/Helvetica-NarrowBold": 228,
    "/Helvetica-NarrowBoldOblique": 228,
    "/Helvetica-NarrowOblique": 228,
    "/Times-Roman": 250,
    "/Times-Bold": 250,
    "/Times-BoldItalic": 250,
    "/Times-Italic": 250,
    "/Symbol": 250,
    "/ZapfDingbats": 278,
}


def parse_encoding(
    ft: DictionaryObject, space_code: int
) -> Tuple[Union[str, Dict[int, str]], int]:
    encoding: Union[str, List[str], Dict[int, str]] = []
    if "/Encoding" not in ft:
        try:
            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
                encoding = dict(
                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
                )
            else:
                encoding = "charmap"
            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
        except Exception:
            if cast(str, ft["/Subtype"]) == "/Type1":
                return "charmap", space_code
            else:
                return "", space_code
    enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object()  # type: ignore
    if isinstance(enc, str):
        try:
            # allready done : enc = NameObject.unnumber(enc.encode()).decode()  # for #xx decoding
            if enc in charset_encoding:
                encoding = charset_encoding[enc].copy()
            elif enc in _predefined_cmap:
                encoding = _predefined_cmap[enc]
            else:
                raise Exception("not found")
        except Exception:
            warnings.warn(
                f"Advanced encoding {enc} not implemented yet",
                PdfReadWarning,
            )
            encoding = enc
    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
        try:
            encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
        except Exception:
            warnings.warn(
                f"Advanced encoding {encoding} not implemented yet",
                PdfReadWarning,
            )
            encoding = charset_encoding["/StandardCoding"].copy()
    else:
        encoding = charset_encoding["/StandardCoding"].copy()
    if "/Differences" in enc:
        x: int = 0
        o: Union[int, str]
        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
            if isinstance(o, int):
                x = o
            else:  # isinstance(o,str):
                try:
                    encoding[x] = adobe_glyphs[o]  # type: ignore
                except Exception:
                    encoding[x] = o  # type: ignore
                    if o == " ":
                        space_code = x
                x += 1
    if isinstance(encoding, list):
        encoding = dict(zip(range(256), encoding))
    return encoding, space_code


def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    # will store all translation code
    # and map_dict[-1] we will have the number of bytes to convert
    map_dict: Dict[Any, Any] = {}

    # will provide the list of cmap keys as int to correct encoding
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        return {}, space_code, []
    process_rg: bool = False
    process_char: bool = False
    multiline_rg: Union[
        None, Tuple[int, int]
    ] = None  # tuple = (current_char, remaining size) ; cf #1285 for example of file
    cm = prepare_cm(ft)
    for l in cm.split(b"\n"):
        process_rg, process_char, multiline_rg = process_cm_line(
            l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
        )

    for a, value in map_dict.items():
        if value == " ":
            space_code = a
    return map_dict, space_code, int_entry


def prepare_cm(ft: DictionaryObject) -> bytes:
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    elif isinstance(tu, str) and tu.startswith("/Identity"):
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"  # the full range 0000-FFFF will be processed
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    ll = cm.split(b"<")
    for i in range(len(ll)):
        j = ll[i].find(b">")
        if j >= 0:
            if j == 0:
                # string is empty: stash a placeholder here (see below)
                # see https://github.com/py-pdf/PyPDF2/issues/1111
                content = b"."
            else:
                content = ll[i][:j].replace(b" ", b"")
            ll[i] = content + b" " + ll[i][j + 1 :]
    cm = (
        (b" ".join(ll))
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
    return cm


def process_cm_line(
    l: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    if l in (b"", b" ") or l[0] == 37:  # 37 = %
        return process_rg, process_char, multiline_rg
    if b"beginbfrange" in l:
        process_rg = True
    elif b"endbfrange" in l:
        process_rg = False
    elif b"beginbfchar" in l:
        process_char = True
    elif b"endbfchar" in l:
        process_char = False
    elif process_rg:
        multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(l, map_dict, int_entry)
    return process_rg, process_char, multiline_rg


def parse_bfrange(
    l: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    lst = [x for x in l.split(b" ") if x]
    closure_found = False
    nbi = max(len(lst[0]), len(lst[1]))
    map_dict[-1] = ceil(nbi / 2)
    fmt = b"%%0%dX" % (map_dict[-1] * 2)
    if multiline_rg is not None:
        a = multiline_rg[0]  # a, b not in the current line
        b = multiline_rg[1]
        for sq in lst[1:]:
            if sq == b"]":
                closure_found = True
                break
            map_dict[
                unhexlify(fmt % a).decode(
                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
                    "surrogatepass",
                )
            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
            int_entry.append(a)
            a += 1
    else:
        a = int(lst[0], 16)
        b = int(lst[1], 16)
        if lst[2] == b"[":
            for sq in lst[3:]:
                if sq == b"]":
                    closure_found = True
                    break
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
        else:  # case without list
            c = int(lst[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
            closure_found = True
            while a <= b:
                map_dict[
                    unhexlify(fmt % a).decode(
                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
                        "surrogatepass",
                    )
                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
                int_entry.append(a)
                a += 1
                c += 1
    return None if closure_found else (a, b)


def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    lst = [x for x in l.split(b" ") if x]
    map_dict[-1] = len(lst[0]) // 2
    while len(lst) > 1:
        map_to = ""
        # placeholder (see above) means empty string
        if lst[1] != b".":
            map_to = unhexlify(lst[1]).decode(
                "charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
            )  # join is here as some cases where the code was split
        map_dict[
            unhexlify(lst[0]).decode(
                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
            )
        ] = map_to
        int_entry.append(int(lst[0], 16))
        lst = lst[2:]


def compute_space_width(
    ft: DictionaryObject, space_code: int, space_width: float
) -> float:
    sp_width: float = space_width * 2  # default value
    w = []
    w1 = {}
    st: int = 0
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        try:
            w1[-1] = cast(float, ft1["/DW"])
        except Exception:
            w1[-1] = 1000.0
        if "/W" in ft1:
            w = list(ft1["/W"])
        else:
            w = []
        while len(w) > 0:
            st = w[0]
            second = w[1]
            if isinstance(second, int):
                for x in range(st, second):
                    w1[x] = w[2]
                w = w[3:]
            elif isinstance(second, list):
                for y in second:
                    w1[st] = y
                    st += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
        try:
            sp_width = w1[space_code]
        except Exception:
            sp_width = (
                w1[-1] / 2.0
            )  # if using default we consider space will be only half size
    elif "/Widths" in ft:
        w = list(ft["/Widths"])  # type: ignore
        try:
            st = cast(int, ft["/FirstChar"])
            en: int = cast(int, ft["/LastChar"])
            if st > space_code or en < space_code:
                raise Exception("Not in range")
            if w[space_code - st] == 0:
                raise Exception("null width")
            sp_width = w[space_code - st]
        except Exception:
            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
                DictionaryObject, ft["/FontDescriptor"]
            ):
                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
            else:
                # will consider width of char as avg(width)/2
                m = 0
                cpt = 0
                for x in w:
                    if x > 0:
                        m += x
                        cpt += 1
                sp_width = m / max(1, cpt) / 2
    return sp_width