mvasani3690
committed on
Add files via upload
app.py
ADDED
@@ -0,0 +1,120 @@
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# ----- Data Loading ------
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
# Here we can see the `train` and `val` splits, along with the
# location of the cached data files
print('Dataset contents:')
print(dataset_dict)

print('Dataset cache location:')
print(dataset_dict.cache_files)

# Data
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print(f'Train dataset shape: {train_dataset.shape}')
print(f'Validation dataset shape: {val_dataset.shape}')

# List all available fields
print('Dataset fields:')
print(train_dataset.column_names)

# Example: preprocess the abstract field of the dataset
# using HF tokenizers
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# We tokenize in batches, so tokenization is quite fast
train_dataset = train_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files"
)
val_dataset = val_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing validation files"
)

# Since we've tokenized the dataset, we have a new cache location
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)

# And we have added some fields to our dataset
print('Dataset fields after tokenization:')
print(train_dataset.column_names)
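
# Optional sketch (not used by the app below): after tokenization, the splits
# could be converted to PyTorch tensors and wrapped in a DataLoader for training.
# The label column name ('decision') is an assumption about the HUPD schema.
# train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)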

# Load the BERT tokenizer and model for sequence classification
# (note: this replaces the RoBERTa tokenizer used above for dataset preprocessing)
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)
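
# Note: the 6-way classification head of 'bert-base-cased' is randomly initialized,
# so the scores produced below are only indicative until the model has been
# fine-tuned on HUPD decisions. A minimal sketch, assuming a locally saved
# fine-tuned checkpoint (hypothetical path, not part of this repo):
# model = AutoModelForSequenceClassification.from_pretrained('./hupd-decision-bert', num_labels=6)
model.eval()  # disable dropout for inference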

# Function to retrieve abstract and claims text based on filing number
def get_text_data(filing_number):
    # Check if the filing number exists in the dataset
    if filing_number >= len(train_dataset) or filing_number < 0:
        return None, None  # Return None if the filing number is out of range or negative

    # Access the data corresponding to the filing number
    data = train_dataset[filing_number]

    # Retrieve the abstract and claims text from the data
    abstract = data.get('abstract', None)
    claims = data.get('claims', None)

    return abstract, claims
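
# Example usage sketch (not called by the app): sanity-check the helper on the
# first training example.
# first_abstract, first_claims = get_text_data(0)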


# Streamlit app
def main():
    st.title("Patentability Score App")

    # Dropdown menu to select the application filing number
    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))

    # Display abstract and claims text boxes based on selected filing number
    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')

    # Submit button to calculate and display the patentability score
    if st.button("Submit"):
        # Tokenize the abstract and claims texts
        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)

        # Perform inference with the model to get the logits
        with torch.no_grad():
            logits = model(**inputs).logits

        # Calculate the patentability score
        score = torch.softmax(logits, dim=1).tolist()[0]

        # Display the patentability score
        st.subheader("Patentability Score:")
        st.write("REJECTED:", score[0])
        st.write("ACCEPTED:", score[1])
        st.write("PENDING:", score[2])
        st.write("CONT-REJECTED:", score[3])
        st.write("CONT-ACCEPTED:", score[4])
        st.write("CONT-PENDING:", score[5])
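
        # A minimal alternative presentation (sketch): pair each score with a label
        # and surface the most likely decision. The label order mirrors the
        # st.write calls above; whether it matches the model's label encoding is
        # an assumption.
        # labels = ["REJECTED", "ACCEPTED", "PENDING",
        #           "CONT-REJECTED", "CONT-ACCEPTED", "CONT-PENDING"]
        # st.write("Most likely decision:", labels[int(torch.argmax(logits, dim=1))])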

if __name__ == "__main__":
    main()
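
# To run the app locally: streamlit run app.py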