File size: 1,507 Bytes
b2b504b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re

import gradio as gr
import requests
from inscriptis import get_text
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig
from readability import Document

# Parser configuration handed to inscriptis' get_text() in extract_text().
# It selects the "strict" entry from inscriptis' bundled CSS profiles and is
# built once at import time so every call reuses the same object.
INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])


def extract_text(url: str):
    """Download a web page and extract its main content as plain text.

    The page is fetched with ``requests``, reduced to its main article with
    readability-lxml, and then rendered to plain text with inscriptis.

    Args:
        url: Address of the page to download.

    Returns:
        A 4-tuple of strings ``(title, text, clean_html, html)``:

        * ``title`` -- short title reported by readability.
        * ``text`` -- plain-text rendering of the article body, or ``""``
          when the body contains no word characters.
        * ``clean_html`` -- article HTML fragment produced by readability.
        * ``html`` -- the raw page as downloaded.

        All four values are ``""`` when the downloaded page is blank.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Bound the wait so an unresponsive server cannot block the caller forever.
    response = requests.get(url, timeout=30)

    try:
        html = response.content.decode("utf-8")
    except UnicodeDecodeError:
        # Not every page is UTF-8. ``Response.text`` decodes with the charset
        # requests determined for the response and replaces undecodable bytes
        # instead of raising.
        html = response.text

    if not html.strip():
        return "", "", "", ""

    parsed_doc = Document(html)

    # get the body of the article with readability-lxml
    title = parsed_doc.short_title()
    clean_html = parsed_doc.summary(html_partial=True)
    # the parsed document is no longer needed once title and summary are out
    del parsed_doc

    # get the formatted plaintext with inscriptis
    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()

    if not re.search(r"\w+", text):
        # no words found, only whitespace and punctuation
        return title, "", clean_html, html

    # collapse any run of blank / whitespace-only lines into one empty line
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return title, text, clean_html, html


# Output widgets, listed in the same order as the tuple returned by
# extract_text: title, plain text, cleaned article HTML, raw page HTML.
title = gr.Textbox(label="Title")
text = gr.Textbox(label="Text", lines=10)
clean_html = gr.Textbox(label="Clean HTML", lines=10)
html = gr.Textbox(label="Raw HTML", lines=10)

# Single-input interface: a URL goes in, the four text boxes above come out.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Textbox(placeholder="https://hf.co/", label="URL"),
    outputs=[title, text, clean_html, html],
    examples=[
        ["https://huggingface.co./blog/peft"],
        ["https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"],
    ],
)

# Start the web app as soon as the module is executed.
demo.launch()