import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from pprint import pprint
from datasets import load_dataset

# ----- Data Loading ------
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co./datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

# Here we can see the `train` and `val` splits, along with the
# location of the cached data files
print('Dataset contents:')
print(dataset_dict)
print('Dataset cache location:')
print(dataset_dict.cache_files)

# Data
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print(f'Train dataset shape: {train_dataset.shape}')
print(f'Validation dataset shape: {val_dataset.shape}')

# List all available fields
print('Dataset fields:')
print(train_dataset.column_names)

# Example: preprocess the abstract field of the dataset
# using HF tokenizers
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# We tokenize in batches, so tokenization is quite fast
train_dataset = train_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files",
)
val_dataset = val_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing validation files",
)

# Since we've tokenized the dataset, we have a new cache location
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)

# And we have added some fields to our dataset
print('Dataset fields after tokenization:')
print(train_dataset.column_names)

# Load the BERT tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)


# Function to retrieve abstract and claims text based on filing number
def get_text_data(filing_number):
    # Return None if the filing number is out of range or negative
    if filing_number >= len(train_dataset) or filing_number < 0:
        return None, None
    # Access the record corresponding to the filing number
    data = train_dataset[filing_number]
    # Retrieve the abstract and claims text from the record
    abstract = data.get('abstract', None)
    claims = data.get('claims', None)
    return abstract, claims


# Streamlit app
st.markdown("Link to app - [Patentability app](https://huggingface.co./spaces/mvasani/Patentatbility_score_app)")


def main():
    st.title("Patentability Score App")

    # Dropdown menu to select the application filing number
    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))

    # Display abstract and claims text boxes based on the selected filing number
    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')

    # Submit button to calculate and display the patentability score
    if st.button("Submit"):
        # Tokenize the abstract and claims texts
        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)
        # Perform inference with the model to get the logits
        with torch.no_grad():
            logits = model(**inputs).logits
        # Calculate the patentability score
        score = torch.softmax(logits, dim=1).tolist()[0]
        # Display the patentability score
        st.subheader("Patentability Score:")
        st.write("REJECTED:", score[0])
        st.write("ACCEPTED:", score[1])
        st.write("PENDING:", score[2])
        st.write("CONT-REJECTED:", score[3])
        st.write("CONT-ACCEPTED:", score[4])
        st.write("CONT-PENDING:", score[5])


if __name__ == "__main__":
    main()
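# To try this locally, launch the script with Streamlit; the filename below is
# an assumption (the script is not named in the original source):
#   streamlit run app.py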