mvasani3690 committed
Commit 4c98fef · unverified · 1 Parent(s): 97f5b4d

Add files via upload

Files changed (1)
  1. app.py +120 -0
app.py ADDED
@@ -0,0 +1,120 @@
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from datasets import load_dataset
+
+ # ----- Data Loading -----
+ dataset_dict = load_dataset(
+     'HUPD/hupd',
+     name='sample',
+     data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+     icpr_label=None,
+     train_filing_start_date='2016-01-01',
+     train_filing_end_date='2016-01-21',
+     val_filing_start_date='2016-01-22',
+     val_filing_end_date='2016-01-31',
+ )
+ # Here we can see the `train` and `val` splits, along with the
+ # location of the cached data files
+ print('Dataset contents:')
+ print(dataset_dict)
+
+ print('Dataset cache location:')
+ print(dataset_dict.cache_files)
+
+ # Data splits
+ train_dataset = dataset_dict["train"]
+ val_dataset = dataset_dict["validation"]
+ print(f'Train dataset shape: {train_dataset.shape}')
+ print(f'Validation dataset shape: {val_dataset.shape}')
+
+ # List all available fields
+ print('Dataset fields:')
+ print(train_dataset.column_names)
+
+ # Example: preprocess the abstract field of the dataset
+ # using HF tokenizers
+ tokenizer = AutoTokenizer.from_pretrained('roberta-base')
+
+ # We tokenize in batches, so tokenization is quite fast
+ train_dataset = train_dataset.map(
+     lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
+     batched=True,
+     desc="Tokenizing training files"
+ )
+ val_dataset = val_dataset.map(
+     lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
+     batched=True,
+     desc="Tokenizing validation files"
+ )
+
+ # Since we've tokenized the dataset, we have a new cache location
+ print('Dataset cache location after tokenization:')
+ print(train_dataset.cache_files)
+
+ # And we have added some fields to our dataset
+ print('Dataset fields after tokenization:')
+ print(train_dataset.column_names)
+
+
+ # Load the BERT tokenizer and model for sequence classification
+ # (note: the 6-class classification head is randomly initialized here;
+ # a fine-tuned checkpoint is needed for meaningful scores)
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+ model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)
+
+ # Function to retrieve abstract and claims text based on filing number
+ def get_text_data(filing_number):
+     # Check if the filing number exists in the dataset
+     if filing_number >= len(train_dataset) or filing_number < 0:
+         return None, None  # Return None if the filing number is out of range
+
+     # Access the data corresponding to the filing number
+     data = train_dataset[filing_number]
+
+     # Retrieve the abstract and claims text from the data
+     abstract = data.get('abstract', None)
+     claims = data.get('claims', None)
+
+     return abstract, claims
+
+
+ # Streamlit app
+ def main():
+     st.title("Patentability Score App")
+
+     # Dropdown menu to select the application filing number
+     filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))
+
+     # Display abstract and claims text boxes based on the selected filing number
+     abstract, claims = get_text_data(filing_number)
+     st.subheader("Abstract:")
+     st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
+     st.subheader("Claims:")
+     st.text_area("Claims Text", claims, height=400, key='claims_text')
+
+     # Submit button to calculate and display the patentability score
+     if st.button("Submit"):
+         # Tokenize the abstract and claims texts as a single pair
+         inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)
+
+         # Perform inference with the model to get the logits
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         # Calculate the patentability score as class probabilities
+         score = torch.softmax(logits, dim=1).tolist()[0]
+
+         # Display the patentability score
+         st.subheader("Patentability Score:")
+         st.write("REJECTED:", score[0])
+         st.write("ACCEPTED:", score[1])
+         st.write("PENDING:", score[2])
+         st.write("CONT-REJECTED:", score[3])
+         st.write("CONT-ACCEPTED:", score[4])
+         st.write("CONT-PENDING:", score[5])
+
+ if __name__ == "__main__":
+     main()