Spaces:
Runtime error
Runtime error
File size: 4,359 Bytes
8cc4003 26441fa 8cc4003 574268b 8cc4003 574268b 8cc4003 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pickle
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import tensorflow as tf
from tensorflow.python.lib.io import file_io
from nltk.tokenize import sent_tokenize
import io
#contents = pickle.load(f) becomes...
#contents = CPU_Unpickler(f).load()
model_path = "finbert.sav"
# Load the pickled model object from local storage; make_summary() later calls
# model.sentence_vector(...) on it.
# NOTE(review): pickle.load executes whatever code the file encodes, so only
# deploy a finbert.sav that you produced yourself.
# NOTE(review): presumably a FinbertEmbedding instance -- confirm against the
# script that wrote the file.
with open(model_path, "rb") as f:
    model = pickle.load(f)
#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
import os
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io
import pickle
nltk.download('punkt')
def make_summary(word):
    """Build an extractive summary of ``word`` by clustering its sentences.

    The text is split into sentences with ``nltk.sent_tokenize``; each
    sentence is embedded through the module-level ``model``
    (``model.sentence_vector``); the embeddings are grouped with NLTK's
    k-means using cosine distance; and from every cluster the sentence whose
    embedding lies closest (Euclidean distance) to the cluster mean is kept.
    The kept sentences are joined with single spaces in their original order.

    Args:
        word: The input text to summarise.

    Returns:
        The summary string. ``word`` itself is returned unchanged when it
        contains no sentences, or when the clusterer raises ``ValueError``.
    """
    tokens = nltk.sent_tokenize(word)

    # Nothing to cluster. Without this guard an empty input fails further
    # down (a one-name column list cannot be applied to a zero-column frame,
    # and the clusterer cannot work on zero vectors), so fall back to the
    # input text exactly as the ValueError path below does.
    if not tokens:
        return word

    # One row per sentence, stripped of leading/trailing white space.
    data = pd.DataFrame(
        {'Sentences': [sentence.strip() for sentence in tokens]}
    )

    # Numerical embedding for every sentence.
    # NOTE(review): assumes sentence_vector() returns an object exposing
    # .numpy() (e.g. a CPU tensor) -- confirm against the loaded model.
    data['Embeddings'] = [
        model.sentence_vector(sentence).numpy() for sentence in tokens
    ]

    # Medium-length summaries: one cluster per two sentences, but never fewer
    # than one. (Use len(tokens) // 4 instead for shorter summaries.)
    num_clusters = max(1, len(tokens) // 2)
    # Value handed to the clusterer's `repeats` argument.
    repeats = 25

    # Stack the per-sentence embeddings into a single array for the clusterer.
    X = np.array(data['Embeddings'].to_list())

    clusterer = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=repeats,
        avoid_empty_clusters=True,
    )

    # The clusterer can raise ValueError on very short texts; in that case the
    # text is returned as-is instead of a summary.
    try:
        assigned_clusters = clusterer.cluster(X, assign_clusters=True)
    except ValueError:
        return word

    data['Cluster'] = pd.Series(assigned_clusters, index=data.index)

    # Fetch the cluster means once rather than once per row.
    centroids = clusterer.means()

    # Distance of every embedding from the mean of its own cluster.
    data['Distance_From_Centroid'] = [
        distance_matrix([embedding], [centroids[cluster]])[0][0]
        for embedding, cluster in zip(data['Embeddings'], data['Cluster'])
    ]

    # Keep the closest sentence of each cluster, restored to document order.
    closest = (
        data.sort_values('Distance_From_Centroid', ascending=True)
        .groupby('Cluster')
        .head(1)
        .sort_index()
    )
    return " ".join(closest['Sentences'].tolist())
import gradio as gr
# Web UI: a 15-line text box feeds make_summary and the result is shown in a
# second text box.
interface1 = gr.Interface(
    fn=make_summary,
    inputs=gr.inputs.Textbox(
        lines=15,
        placeholder="Enter your text !!",
        label='Input-10k Sections',
    ),
    outputs=gr.outputs.Textbox(label='Output- Finbert'),
)
# Launch in a separate statement so that interface1 stays bound to the
# Interface object rather than to whatever launch() returns.
interface1.launch()
|