Spaces:

yutingg
/

essay-main-idea

Sleeping

App Files Files Community

yutingg committed on Dec 14, 2023

Commit

ecf6936

1 Parent(s): 134e623

Predict main idea sentence with custom-distill-bert-for-sentence-label

Browse files

Files changed (3) hide show

app.py +23 -3
main_idea_with_pipeline.py +39 -0
main_idea_with_torch.py +119 -0

app.py CHANGED Viewed

@@ -1,7 +1,27 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

 import gradio as gr
+from transformers import AutoModel, AutoConfig
+from main_idea_with_torch import predict_mainidea_sent_old
+from main_idea_with_pipeline import predict_mainidea_sent
+# Load the config and model for "yutingg/custom-distill-bert-for-sentence-label" once at import time;
+# the same `model` object is passed to both prediction functions in greet().
+# NOTE(review): trust_remote_code=True presumably lets code shipped with the model repo run locally — confirm the repo is trusted.
+config = AutoConfig.from_pretrained("yutingg/custom-distill-bert-for-sentence-label", trust_remote_code=True)
+model = AutoModel.from_pretrained("yutingg/custom-distill-bert-for-sentence-label", trust_remote_code=True, config=config)
+def greet(essay):
+    ret = predict_mainidea_sent(essay, model), predict_mainidea_sent_old(essay, model)
+    return ret
+iface = gr.Interface(fn=greet, inputs="text", outputs=[
+    gr.Dataframe(
+        label="pipeline output",
+        headers=['label: is main idea', 'sentence'],
+        datatype=["str", "str"],
+        col_count=(2, "fixed"),
+    ),
+    gr.Dataframe(
+        label="torch output with Triage",
+        headers=['label: is main idea', 'sentence'],
+        datatype=["str", "str"],
+        col_count=(2, "fixed"),
+    )
+])
 iface.launch()

main_idea_with_pipeline.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from nltk.tokenize import sent_tokenize, word_tokenize
+import pandas as pd
+# read in an essay and resturns a df in sentence level
+def essay_to_sent(essay):
+    sentences = []
+    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
+    for para in paragraphs:
+        # tokenize paragraph by "." and concatenate to sentences[]
+        sentences.extend(sent_tokenize(para))
+    return sentences
+######################
+# prerequisite:
+# 1. pip install transformers
+# 2. Define tokenizer + MAX_LEN
+# 3. Construct DistillBERTClass_SL class
+# 4. Construct Triage_SL class
+# 5. Define predict_SL function
+# 6. Load model_SL & call eval()
+# 7. Predefine predict_params_SL
+####################
+from transformers import DistilBertTokenizer
+from transformers import pipeline
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+def predict_mainidea_sent(paragraph, model):
+    # prepare data
+    sentences = essay_to_sent(paragraph)
+    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
+    probability_score = pipe(sentences, batch_size=8, function_to_apply="sigmoid")
+    labels = [score['score'] > 0.5 for score in probability_score]
+    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)], columns=['label', 'sentence'])

main_idea_with_torch.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from nltk.tokenize import sent_tokenize
+import pandas as pd
+######################
+# prerequisite:
+# 1. pip install transformers
+# 2. Define tokenizer + MAX_LEN
+# 3. Construct DistillBERTClass_SL class
+# 4. Construct Triage_SL class
+# 5. Define predict_SL function
+# 6. Load model_SL & call eval()
+# 7. Predefine predict_params_SL
+####################
+from transformers import DistilBertTokenizer
+tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
+import torch
+"""### DataSet Class -- Triage_SL"""
+from torch.utils.data import Dataset, DataLoader
+class Triage_SL(Dataset):
+    # initialize the directory containing the dataframe, the tokenizer, and the max lens of sentences
+    def __init__(self, dataframe, tokenizer, max_len):
+        self.len = len(dataframe)
+        self.data = dataframe
+        self.tokenizer = tokenizer # load in tokenizer, used in _getitem
+        self.max_len = max_len
+    # The __getitem__ function loads and returns a sample from the dataset at the given index idx.
+    def __getitem__(self, index):
+        if index >= len(self):
+          raise StopIteration
+        # preprossessing sentences to standarize format as in: word+""+word
+        sent = str(self.data.sentence[index])
+        sent = " ".join(sent.split())
+        # 1.- Split the sentence into tokens.
+        # 2.- Add the special [CLS] and [SEP] tokens.
+        # 3.- Map the tokens to their IDs.
+        # 4.- Pad or truncate all sentences to the same length.
+        # 5.- Create the attention masks which explicitly differentiate real tokens from [PAD] tokens.
+        inputs = self.tokenizer.encode_plus(
+            sent,                       # Sentence to encode
+            None,                       # text_pair
+            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
+            max_length=self.max_len,
+            pad_to_max_length=True,     # Pad & truncate all sentences.
+            return_token_type_ids=True,
+            truncation=True
+        )
+        ids = inputs['input_ids']
+        mask = inputs['attention_mask']
+        return {
+            'ids': torch.tensor(ids, dtype=torch.long),
+            'mask': torch.tensor(mask, dtype=torch.long),
+            # 'targets': torch.tensor(self.data.ENCODE_LABEL[index], dtype=torch.float), # sentence label -> y value
+#            'combined_label': self.data.combined_label[index]
+        }
+    # The __len__ function returns the number of samples in our dataset.
+    def __len__(self):
+        return self.len
+# read in an essay and resturns a df in sentence level
+def essay_to_sent_df(essay):
+    sentences = []
+    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
+    for para in paragraphs:
+      # tokenize paragraph by "." and concatenate to sentences[]
+      sentences.extend(sent_tokenize(para))
+    return pd.DataFrame(sentences, columns=['sentence'])
+# Key constants used below when tokenizing sentences and building the prediction DataLoader
+MAX_LEN = 512
+"""### Predefine predict_params_SL"""
+PREDICT_BATCH_SIZE = 1
+predict_params_SL = {'batch_size': PREDICT_BATCH_SIZE,
+                'shuffle': False,
+                'num_workers': 0
+                }
+"""### Predict Fn -- predict_SL"""
+sigmoid = torch.nn.Sigmoid()
+def predict_SL(model, validation_loader):
+    epoch_val_outputs=[]
+    cpu_device = 'cpu'
+    model.eval()
+    with torch.no_grad():
+        for _, data in enumerate(validation_loader, 0):
+            ids = data['ids'].to(cpu_device, dtype = torch.long)
+            mask = data['mask'].to(cpu_device, dtype = torch.long)
+            outputs = model(ids, mask)["logits"].squeeze() # ??squeeze??
+            outputs = (sigmoid(outputs).data>0.5).float()
+            epoch_val_outputs.append(outputs.item())
+    return epoch_val_outputs
+def predict_mainidea_sent_old(paragraph, model):
+    # prepare data
+    sent_df = essay_to_sent_df(paragraph)
+    predicting_SL_set = Triage_SL(sent_df, tokenizer, MAX_LEN)
+    predicting_SL_loader = DataLoader(predicting_SL_set, **predict_params_SL)
+    # load model to device
+    device = 'cpu'
+    model.to(device)
+    # predict + roundup
+    sent_label = predict_SL(model, predicting_SL_loader)
+    print(sent_label)
+    return pd.DataFrame([(str(l), s) for l, s in zip(sent_label, sent_df.sentence)], columns=['label', 'sentence'])