import gradio as gr
import re

from gradio.mix import Parallel
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
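
# NOTE: this script uses the legacy Gradio API (gradio.mix.Parallel and the
# gr.inputs/gr.outputs namespaces were removed in Gradio 4.x), so it assumes
# an older Gradio release is installed, e.g. gradio<4.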

# Clean the input text: drop non-ASCII characters, URLs, ad markers,
# and excess whitespace before feeding it to the models
def clean_text(text):
    text = text.encode("ascii", errors="ignore").decode(
        "ascii"
    )  # remove non-ASCII (e.g. Chinese) characters
    text = re.sub(r"http\S+", "", text)  # strip URLs
    text = re.sub(r"ADVERTISEMENT", " ", text)  # strip ad markers
    text = re.sub(r"[\n\t]", " ", text)  # newlines and tabs to spaces
    text = re.sub(" +", " ", text).strip()  # collapse repeated spaces
    return text
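
# Hypothetical example of the cleaning steps:
#   clean_text("Breaking\n\nnews:  visit http://x.co\tnow ADVERTISEMENT")
# returns "Breaking news: visit now"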

# Headline generator 1: fine-tuned PEGASUS-large
modchoice_1 = "chinhon/pegasus-large-commentaries_hd"

# Load each tokenizer/model once at startup instead of on every request
tokenizer_1 = AutoTokenizer.from_pretrained(modchoice_1)
model_1 = AutoModelForSeq2SeqLM.from_pretrained(modchoice_1)

def commentaries_headline1(text):
    input_text = clean_text(text)

    # Tokenize the source text directly; as_target_tokenizer() is meant for
    # preparing labels during training, not for encoding inference inputs
    batch = tokenizer_1(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )

    translated = model_1.generate(**batch)

    summary_1 = tokenizer_1.batch_decode(translated, skip_special_tokens=True)

    return summary_1[0]
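
# Note: generate() falls back to the generation settings stored with the
# checkpoint (beam search parameters, length limits, etc.) unless overridden.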


headline1 = gr.Interface(
    fn=commentaries_headline1,
    inputs=gr.inputs.Textbox(),
    outputs=gr.outputs.Textbox(label=" | Model: Fine-tuned pegasus-large"),
)

# Headline generator 2: PEGASUS fine-tuned from the multi_news checkpoint
modchoice_2 = "chinhon/pegasus-multi_news-commentaries_hdwriter"

tokenizer_2 = AutoTokenizer.from_pretrained(modchoice_2)
model_2 = AutoModelForSeq2SeqLM.from_pretrained(modchoice_2)

def commentaries_headline2(text):
    input_text = clean_text(text)

    batch = tokenizer_2(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )

    translated = model_2.generate(**batch)

    summary_2 = tokenizer_2.batch_decode(translated, skip_special_tokens=True)

    return summary_2[0]

headline2 = gr.Interface(
    fn=commentaries_headline2,
    inputs=gr.inputs.Textbox(),
    outputs=gr.outputs.Textbox(label=" | Model: Fine-tuned pegasus-multi_news"),
)


# Headline generator 3: fine-tuned BART-large
modchoice_3 = "chinhon/bart-large-commentaries_hdwriter"

tokenizer_3 = AutoTokenizer.from_pretrained(modchoice_3)
model_3 = AutoModelForSeq2SeqLM.from_pretrained(modchoice_3)

def commentaries_headline3(text):
    input_text = clean_text(text)

    batch = tokenizer_3(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )

    # max_length belongs in generate(); batch_decode() ignores it
    translated = model_3.generate(**batch, max_length=100)

    summary_3 = tokenizer_3.batch_decode(translated, skip_special_tokens=True)

    return summary_3[0]


headline3 = gr.Interface(
    fn=commentaries_headline3,
    inputs=gr.inputs.Textbox(),
    outputs=gr.outputs.Textbox(label=" | Model: Fine-tuned bart-large"),
)

# Run the three generators side by side as parallel Gradio apps
Parallel(
    headline1,
    headline2,
    headline3,
    title="Commentaries Headlines Generator",
    inputs=gr.inputs.Textbox(
        lines=20,
        label="Paste parts of your commentary here, and choose from 3 suggested headlines",
    ),
    theme="huggingface",
).launch(enable_queue=True)
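
# launch() serves the app on Gradio's default local address
# (http://127.0.0.1:7860); enable_queue=True routes requests through a queue
# so that slow model generations are less likely to hit the request timeout.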