# Hosting-page residue (not part of the notebook): detectROSE / first_push / 55750e2 (verified)
# -*- coding: utf-8 -*-
"""FinalProject_TextClassificationFineTuning.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1fCS36Rnww__14QDdcsjjG5hfFL83gzpU
"""
!pip install opendatasets
!pip install gradio --quiet
!pip install transformers[sentencepeice] datasets sacrebleu rouge_score py7zr -q
!!pip install rake-nltk # used to determine the key phrases in the text
! pip install kaggle
# TODO: once finished, delete all the models that are not needed.
import opendatasets as od
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import opendatasets as od
import gradio as gr
from transformers import pipeline
import matplotlib.pyplot as plt
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")
from datasets import load_dataset
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
# Record whether a CUDA device is available. The bare expression below only
# echoes the value when this runs as a notebook cell.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

# Pretrained model: legal-pegasus (PEGASUS), retrieved from
# https://huggingface.co/nsi319/legal-pegasus
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

_CHECKPOINT = "nsi319/legal-pegasus"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)

# Placeholder document; paste the text to summarize between the quotes.
text = """ """

# Encode to a PyTorch tensor, truncating the input at 1024 tokens.
input_tokenized = tokenizer.encode(
    text,
    truncation=True,
    max_length=1024,
    return_tensors='pt',
)

# Beam-search settings for the summary (150 to 250 tokens long).
_generation_options = {
    "num_beams": 9,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "min_length": 150,
    "max_length": 250,
    "early_stopping": True,
}
summary_ids = model.generate(input_tokenized, **_generation_options)

# Keep only the first generated sequence as the summary text.
summary = tokenizer.decode(
    summary_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
### Summary Output
# The Securities and Exchange Commission today charged AT&T, Inc. and three of its Investor Relations executives with aiding and abetting the company's violations of the antifraud provisions of Section 10(b) of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder. According to the SEC's complaint, the company learned in March 2016 that a steeper-than-expected decline in its first quarter smartphone sales would cause its revenue to fall short of analysts' estimates for the quarter. The complaint alleges that to avoid falling short of the consensus revenue estimate for the third consecutive quarter, the executives made private, one-on-one phone calls to analysts at approximately 20 separate firms. On these calls, the SEC alleges that Christopher Womack, Michael Black, and Kent Evans allegedly disclosed internal smartphone sales data and the impact of that data on internal revenue metrics. The SEC further alleges that as a result of what they were told, the analysts substantially reduced their revenue forecasts, leading to the overall consensus Revenue Estimate falling to just below the level that AT&t ultimately reported to the public on April 26, 2016. The SEC is seeking permanent injunctive relief and civil monetary penalties against each defendant.
# Echo the generated summary (shown as cell output in a notebook).
summary

# Optional step: load the ToS dataset for additional fine-tuning of the model.
# Author's caveat: fine-tuning needs a GPU; without one the runtime
# disconnects and crashes.

# Kaggle URL of the dataset, downloaded with opendatasets.
dataset_url = 'https://www.kaggle.com/datasets/simple11/tos-summaries'
od.download(dataset_url)

# The download is read back as JSON Lines (one record per line).
dataset = pd.read_json('/content/tos-summaries/dataset.json', lines=True)
dataset
# print(dataset.head(6))

print(f"Summary: \n{summary}")
# Summarization callback for the Gradio interface of this program.
def summarize_text(text):
    """Summarize *text* with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw text to summarize. It is tokenized and truncated to at
            most 1024 tokens before generation.

    Returns:
        The decoded summary with special tokens removed, or an empty string
        when *text* is empty or contains only whitespace.
    """
    # With min_length=150 the model would be forced to invent a summary for
    # blank input, so return early instead.
    if not text or not text.strip():
        return ""

    inputs = tokenizer.encode(
        text,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )

    # Generate from this call's tokens. The previous code passed the global
    # `input_tokenized`, so every request summarized the same stale text.
    summary_ids = model.generate(
        inputs,
        num_beams=9,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        min_length=150,
        max_length=250,
        early_stopping=True,
    )

    # Decode and return the first (only) generated sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Load the checkpoint again so this section works even when it is run on its
# own; `summarize_text` reads these two module-level names at call time.
tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")

# Input and output widgets for the web UI.
_input_box = gr.Textbox(
    lines=10,
    placeholder='Enter Text Here...',
    label='Input text',
)
_output_box = gr.Textbox(label='Summarized Text')

# Wire the summarizer into a Gradio app and start serving it.
interface = gr.Interface(
    fn=summarize_text,
    inputs=_input_box,
    outputs=_output_box,
    title='Terms and Conditions Text Summarizer',
)
interface.launch()
########################################################################################################
import nltk
from rake_nltk import Rake

# Download the NLTK resources this section relies on.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Default Rake(): per the original note, it uses NLTK's English stopwords and
# all punctuation characters.
r = Rake()

# Extract key phrases from the generated summary.
r.extract_keywords_from_text(summary)

# Keyword phrases ranked from highest to lowest.
r.get_ranked_phrases()

# The same ranking, with each phrase's score attached.
r.get_ranked_phrases_with_scores()