Spaces:
Sleeping
Sleeping
Predict main idea sentence with custom-distill-bert-for-sentence-label
Browse files- app.py +23 -3
- main_idea_with_pipeline.py +39 -0
- main_idea_with_torch.py +119 -0
app.py
CHANGED
@@ -1,7 +1,27 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
5 |
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import AutoModel, AutoConfig
|
3 |
+
from main_idea_with_torch import predict_mainidea_sent_old
|
4 |
+
from main_idea_with_pipeline import predict_mainidea_sent
|
5 |
|
6 |
+
# Hub repository id shared by the config and the model download below.
_MODEL_REPO = "yutingg/custom-distill-bert-for-sentence-label"

# NOTE(review): trust_remote_code=True lets transformers run Python code that
# ships with the Hub repository -- confirm the repository is trusted.
config = AutoConfig.from_pretrained(_MODEL_REPO, trust_remote_code=True)
model = AutoModel.from_pretrained(
    _MODEL_REPO,
    trust_remote_code=True,
    config=config,
)
|
8 |
|
9 |
+
def greet(essay):
    """Run both main-idea predictors on ``essay`` with the module-level model.

    Returns:
        A 2-tuple: the result of ``predict_mainidea_sent`` first, then the
        result of ``predict_mainidea_sent_old``.
    """
    pipeline_result = predict_mainidea_sent(essay, model)
    torch_result = predict_mainidea_sent_old(essay, model)
    return pipeline_result, torch_result
|
12 |
+
|
13 |
+
def _result_table(caption):
    """Build one fixed two-column string Dataframe output with the given label."""
    return gr.Dataframe(
        label=caption,
        headers=['label: is main idea', 'sentence'],
        datatype=["str", "str"],
        col_count=(2, "fixed"),
    )


# One text input; two result tables, in the same order as greet()'s return tuple.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs=[
        _result_table("pipeline output"),
        _result_table("torch output with Triage"),
    ],
)
iface.launch()
|
main_idea_with_pipeline.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
|
5 |
+
# read in an essay and return its sentences as a list
|
6 |
+
def essay_to_sent(essay):
    """Split an essay into a flat list of sentences.

    The essay is cut on newline characters, empty lines are dropped, and every
    remaining line is handed to ``sent_tokenize``. The per-line results are
    concatenated in their original order.
    """
    return [
        sentence
        for line in essay.split('\n')
        if line
        for sentence in sent_tokenize(line)
    ]
|
13 |
+
|
14 |
+
|
15 |
+
######################
|
16 |
+
# prerequisite:
|
17 |
+
# 1. Pip install transformer
|
18 |
+
# 2. Define tokenizer + MAX_LEN
|
19 |
+
# 3. Construct DistillBERTClass_SL class
|
20 |
+
# 4. Construct Triage_SL class
|
21 |
+
# 5. Define predict__SL class
|
22 |
+
# 6. Load model_SL & call eval()
|
23 |
+
# 7. Pre_define predict_params_SL
|
24 |
+
####################
|
25 |
+
|
26 |
+
from transformers import DistilBertTokenizer
|
27 |
+
from transformers import pipeline
|
28 |
+
|
29 |
+
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
30 |
+
|
31 |
+
|
32 |
+
def predict_mainidea_sent(paragraph, model):
    """Label every sentence of ``paragraph`` using a text-classification pipeline.

    Args:
        paragraph: essay text; it is split into sentences by ``essay_to_sent``.
        model: model object handed to ``transformers.pipeline``.

    Returns:
        A DataFrame with columns ``label`` and ``sentence``. ``label`` is the
        string form of ``score > 0.5`` for the score the pipeline reports.
    """
    sentences = essay_to_sent(paragraph)

    # The pipeline is built on every call, on CPU, with the module-level tokenizer.
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
    results = classifier(sentences, batch_size=8, function_to_apply="sigmoid")

    rows = []
    for result, sentence in zip(results, sentences):
        is_main_idea = result['score'] > 0.5
        rows.append((str(is_main_idea), sentence))
    return pd.DataFrame(rows, columns=['label', 'sentence'])
|
main_idea_with_torch.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.tokenize import sent_tokenize
|
2 |
+
import pandas as pd
|
3 |
+
|
4 |
+
######################
|
5 |
+
# prerequisite:
|
6 |
+
# 1. Pip install transformer
|
7 |
+
# 2. Define tokenizer + MAX_LEN
|
8 |
+
# 3. Construct DistillBERTClass_SL class
|
9 |
+
# 4. Construct Triage_SL class
|
10 |
+
# 5. Define predict__SL class
|
11 |
+
# 6. Load model_SL & call eval()
|
12 |
+
# 7. Pre_define predict_params_SL
|
13 |
+
####################
|
14 |
+
|
15 |
+
from transformers import DistilBertTokenizer
|
16 |
+
|
17 |
+
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
18 |
+
|
19 |
+
|
20 |
+
import torch
|
21 |
+
|
22 |
+
"""### DataSet Class -- Triage_SL"""
|
23 |
+
|
24 |
+
from torch.utils.data import Dataset, DataLoader
|
25 |
+
|
26 |
+
class Triage_SL(Dataset):
    """Dataset that tokenizes one sentence per item for sentence-label prediction.

    Items carry only model inputs (``ids`` and ``mask``); no target is returned.

    Args:
        dataframe: table with a ``sentence`` column, one sentence per row.
        tokenizer: object exposing ``encode_plus``; used in ``__getitem__``.
        max_len: length every encoded sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Encode and return the sentence at position ``index``.

        Returns:
            dict with ``ids`` (token ids) and ``mask`` (attention mask), both
            ``torch.long`` tensors built from the tokenizer output.

        Raises:
            IndexError: if ``index`` is at or past the end of the dataset.
        """
        # IndexError is the sequence-protocol signal for "past the end";
        # StopIteration belongs to iterators, not to __getitem__.
        if index >= len(self):
            raise IndexError(
                f"index {index} is out of range for a dataset of size {len(self)}"
            )

        # Positional lookup, so a frame whose index is not 0..n-1 still works.
        sent = str(self.data.sentence.iloc[index])
        # Collapse every run of whitespace to a single space.
        sent = " ".join(sent.split())

        # Tokenize, add the special tokens, map to ids, then pad/truncate to
        # max_len; the attention mask comes back in the same call.
        inputs = self.tokenizer.encode_plus(
            sent,                      # sentence to encode
            None,                      # text_pair
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',      # replaces deprecated pad_to_max_length=True
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows the dataframe had at construction time."""
        return self.len
|
67 |
+
|
68 |
+
|
69 |
+
# read in an essay and return a DataFrame with one sentence per row
|
70 |
+
def essay_to_sent_df(essay):
    """Split an essay into sentences and return them as a one-column DataFrame.

    The essay is cut on newline characters, empty lines are dropped, and every
    remaining line is handed to ``sent_tokenize``. The resulting sentences fill
    the ``sentence`` column in their original order.
    """
    collected = [
        sentence
        for line in essay.split('\n')
        if line
        for sentence in sent_tokenize(line)
    ]
    return pd.DataFrame(collected, columns=['sentence'])
|
77 |
+
|
78 |
+
# Key variables used later on for prediction
|
79 |
+
# Passed as max_len to Triage_SL: every sentence is padded/truncated to this length.
MAX_LEN = 512

# Predefine predict_params_SL: keyword arguments for the prediction DataLoader.
PREDICT_BATCH_SIZE = 1
predict_params_SL = dict(
    batch_size=PREDICT_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)
|
87 |
+
|
88 |
+
"""### Predict Fn -- predict_SL"""
|
89 |
+
|
90 |
+
sigmoid = torch.nn.Sigmoid()
|
91 |
+
|
92 |
+
def predict_SL(model, validation_loader):
    """Predict a binary label for every sample yielded by ``validation_loader``.

    Args:
        model: object with ``eval()`` that is callable as ``model(ids, mask)``
            and returns a mapping with a ``"logits"`` entry.
        validation_loader: iterable of batches; each batch is a dict holding
            ``ids`` and ``mask`` tensors.

    Returns:
        A list of floats, one per sample in loader order: ``1.0`` when
        ``sigmoid(logit) > 0.5`` and ``0.0`` otherwise.

    Raises:
        ValueError: if the model does not return exactly one logit per sample.
    """
    predictions = []
    cpu_device = 'cpu'
    model.eval()
    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['ids'].to(cpu_device, dtype=torch.long)
            mask = batch['mask'].to(cpu_device, dtype=torch.long)

            # Flatten to one logit per sample. Unlike squeeze() + item(), this
            # works for any batch size, not only batch_size == 1.
            logits = model(ids, mask)["logits"].reshape(-1)
            batch_size = ids.shape[0] if ids.dim() > 1 else 1
            if logits.numel() != batch_size:
                raise ValueError(
                    f"expected one logit per sample ({batch_size}), "
                    f"got {logits.numel()}"
                )

            labels = (torch.sigmoid(logits) > 0.5).float()
            predictions.extend(labels.tolist())
    return predictions
|
104 |
+
|
105 |
+
def predict_mainidea_sent_old(paragraph, model):
    """Label every sentence of ``paragraph`` using the Triage_SL / DataLoader path.

    Args:
        paragraph: essay text; it is split into sentences by ``essay_to_sent_df``.
        model: model object; it is moved to CPU and passed to ``predict_SL``.

    Returns:
        A DataFrame with columns ``label`` and ``sentence``. ``label`` is the
        string form of the value ``predict_SL`` produced for that sentence.
    """
    # Prepare the data: sentences -> dataset -> loader.
    sentence_frame = essay_to_sent_df(paragraph)
    dataset = Triage_SL(sentence_frame, tokenizer, MAX_LEN)
    loader = DataLoader(dataset, **predict_params_SL)

    # Inference runs on CPU.
    model.to('cpu')

    predicted = predict_SL(model, loader)
    print(predicted)

    rows = [
        (str(label), sentence)
        for label, sentence in zip(predicted, sentence_frame.sentence)
    ]
    return pd.DataFrame(rows, columns=['label', 'sentence'])
|
117 |
+
|
118 |
+
|
119 |
+
|