Spaces:
Running
Running
import argparse | |
import sys | |
from . import HTML2Text, __version__, config | |
def main() -> None:
    """Run the command-line interface: convert HTML to text on stdout.

    Parses the command line with :mod:`argparse`, reads raw bytes from the
    file named by the ``filename`` positional (or from standard input when it
    is omitted or ``-``), decodes them using the ``encoding`` positional
    (default ``utf-8``) together with the ``--decode-errors`` handler, copies
    the parsed options onto an ``HTML2Text`` instance and writes the result of
    its ``handle()`` call to standard output.

    Raises:
        UnicodeDecodeError: If the input cannot be decoded. A hint suggesting
            ``--decode-errors=ignore`` is written to standard error before the
            exception is re-raised.
    """
    baseurl = ""

    class bcolors:
        # ANSI escape sequences used to colour the decode-failure hint below.
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    # Negative flag: passing it stores False into ``wrap_links``.
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--wrap-tables",
        dest="wrap_tables",
        action="store_true",
        default=config.WRAP_TABLES,
        help="wrap tables",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    # Negative flag: passing it stores False into ``inline_links``.
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    # NOTE: the default is read from config.IGNORE_ANCHORS; the config name
    # differs from the option/attribute name ``ignore_links``.
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--ignore-mailto-links",
        action="store_true",
        dest="ignore_mailto_links",
        default=config.IGNORE_MAILTO_LINKS,
        help="don't include mailto: links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help=(
            "hide strike-through text. only relevant when -g is "
            "specified as well"
        ),
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters.  Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    # Negative flag: passing it stores False into ``use_automatic_links``.
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    # Negative flag: passing it stores False into ``skip_internal_links``.
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors. 'ignore', 'strict' and 'replace' "
            "are acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    # Optional positionals: input path (absent or "-" means stdin) and the
    # character encoding used to decode the input bytes.
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    p.add_argument(
        "--include-sup-sub",
        dest="include_sup_sub",
        action="store_true",
        default=config.INCLUDE_SUP_SUB,
        help="Include the sup and sub tags",
    )
    args = p.parse_args()

    # Always read raw bytes so decoding is governed solely by the
    # user-supplied encoding and error handler.
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError:
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        # Diagnostics go to stderr: stdout is reserved for the converted
        # document and may be piped or redirected by the caller.
        print(warning, file=sys.stderr)
        raise

    h = HTML2Text(baseurl=baseurl)

    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        # The strong marker is switched to a double underscore as well;
        # presumably so strong and emphasis stay distinguishable - confirm.
        h.strong_mark = "__"

    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.ignore_mailto_links = args.ignore_mailto_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.wrap_tables = args.wrap_tables
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote
    h.include_sup_sub = args.include_sup_sub

    sys.stdout.write(h.handle(html))