Spaces · Running
DataRaptor committed · Commit 152844c
1 Parent(s): 86a60b5
Upload 5 files
Files changed:
- app.py +161 -2
- fold-0-train.csv +0 -0
- infer.py +133 -0
- model_weights.pth +3 -0
- requirements.txt +6 -0
app.py CHANGED
@@ -1,8 +1,167 @@
+import datetime
+import os
+import pathlib
+import requests
+import zipfile
+import pandas as pd
+import pydeck as pdk
+import geopandas as gpd
 import streamlit as st
+import leafmap.colormaps as cm
+from leafmap.common import hex_to_rgb
+import time
+from infer import USPPPMModel, USPPPMDataset
+import torch
+import pandas as pd
 
-st.
+@st.cache_resource
+def load_model():
+    model = USPPPMModel('microsoft/deberta-v3-small')
+    model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
+    model.eval()
+    ds = USPPPMDataset(model.tokenizer, 133)
+    return model, ds
+
+def infer(anchor, target, title):
+    model, ds = load_model()
+    d = {
+        'anchor': anchor,
+        'target': target,
+        'title': title,
+        'label': 0
+    }
+    x = ds[d][0]
+    with torch.no_grad():
+        y = model(x)
+    return y.cpu().numpy()[0][0]
+
+@st.cache_data
+def get_context():
+    df = pd.read_csv('./fold-0-train.csv')
+    l = list(set(list(df['title'].values)))
+    return l
+
+st.set_page_config(
+    page_title="PatentMatch",
+    page_icon="🧊",
+    layout="centered",
+    initial_sidebar_state="expanded",
+)
+
+# fix sidebar
+st.markdown("""
+<style>
+.css-vk3wp9 {
+    background-color: rgb(255 255 255);
+}
+.css-18l0hbk {
+    padding: 0.34rem 1.2rem !important;
+    margin: 0.125rem 2rem;
+}
+.css-nziaof {
+    padding: 0.34rem 1.2rem !important;
+    margin: 0.125rem 2rem;
+    background-color: rgb(181 197 227 / 18%) !important;
+}
+</style>
+""", unsafe_allow_html=True)
+
+hide_st_style = """
+<style>
+#MainMenu {visibility: hidden;}
+footer {visibility: hidden;}
+header {visibility: hidden;}
+</style>
+"""
+st.markdown(hide_st_style, unsafe_allow_html=True)
+
+
+def app():
+    st.title("PatentMatch: Patent Semantic Similarity Matcher")
+    # st.markdown("[![View in W&B](https://img.shields.io/badge/View%20in-W%26B-blue)](https://wandb.ai/<username>/<project_name>?workspace=user-<username>)")
+
+    st.markdown(
+        """This project develops a Transformer-based NLP model that matches phrases
+        in U.S. patents by their semantic similarity within a specific
+        technical domain context. The trained model achieved a Pearson correlation coefficient of 0.745.
+        [[Source Code]](https://github.com/dataraptor/PatentMatch)
+        """
+    )
+
+    st.markdown('---')
+    # st.selectbox("Select from example", ["Example 1", "Example 2"])
+
+    row1_col1, row1_col2, row1_col3 = st.columns([0.5, 0.4, 0.4])
+    # with row1_col1:
+    #     frequency = st.selectbox("Section", [
+    #         "A: Human Necessities", "B: Operations and Transport",
+    #         "C: Chemistry and Metallurgy", "D: Textiles",
+    #         "E: Fixed Constructions", "F: Mechanical Engineering",
+    #         "G: Physics", "H: Electricity",
+    #         "Y: Emerging Cross-Sectional Technologies",
+    #     ])
+    # with row1_col2:
+    #     class_box = st.selectbox("Class", ["21", "14", "23"])
+
+    with row1_col1:
+        l = get_context()
+        context = st.selectbox("Context", l, l.index('basic electric elements'))
+    with row1_col2:
+        anchor = st.text_input("Anchor", "deflect light")
+    with row1_col3:
+        target = st.text_input("Target", "bending moment")
+
+    if st.button("Predict Scores", type="primary"):
+        with st.spinner("Predicting scores..."):
+            score = infer(anchor, target, context)
+        ss = st.success("Scores predicted successfully!")
+
+        score += 2.0
+        fmt = "{:<.3f}".format(score)
+        st.subheader(f"Similarity Score: {fmt}")
+
+
+app()
+
+
+# Display a footer with links and credits
+st.markdown("---")
+st.markdown("Built by [Shamim Ahamed](https://www.shamimahamed.com/). Data provided by [Kaggle](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching)")
+# st.markdown("Data provided by [The Feedback Prize - ELLIPSE Corpus Scoring Challenge on Kaggle](https://www.kaggle.com/c/feedbackprize-ellipse-corpus-scoring-challenge)")
 
 
 
-st.markdown('Source code: https://github.com/dataraptor/PatentMatch/tree/main')
 
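For a quick offline sanity check, the wiring above can be exercised without the Streamlit UI. A minimal sketch (not part of the commit; it assumes infer.py, model_weights.pth, and the torch/transformers stack are available in the working directory):

# Hypothetical smoke test mirroring app.py's infer() helper.
import torch
from infer import USPPPMModel, USPPPMDataset

model = USPPPMModel('microsoft/deberta-v3-small')
model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
model.eval()
ds = USPPPMDataset(model.tokenizer, 133)

sample = {'anchor': 'deflect light', 'target': 'bending moment',
          'title': 'basic electric elements', 'label': 0}
x = ds[sample][0]                       # tokenized inputs with a batch dimension
with torch.no_grad():
    raw = model(x).cpu().numpy()[0][0]  # single similarity output
print(f"Similarity Score: {raw + 2.0:.3f}")  # app.py shifts the raw output by +2.0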
fold-0-train.csv ADDED
The diff for this file is too large to render. See raw diff.
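get_context() in app.py populates the Context dropdown from this file's title column. A quick peek, sketched under the assumption that pandas is installed:

# Inspect the context titles that app.py's get_context() reads from this CSV.
import pandas as pd

df = pd.read_csv('fold-0-train.csv')
print(df['title'].nunique(), 'unique context titles')
print(sorted(set(df['title']))[:5])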
infer.py ADDED
@@ -0,0 +1,133 @@
+from torch import nn
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+import torch
+from torch.utils.data import Dataset
+
+class MeanPooling(nn.Module):
+    def __init__(self):
+        super(MeanPooling, self).__init__()
+
+    def forward(self, last_hidden_state, attention_mask):
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
+        sum_mask = input_mask_expanded.sum(1)
+        sum_mask = torch.clamp(sum_mask, min=1e-9)
+        mean_embeddings = sum_embeddings / sum_mask
+        return mean_embeddings
+
+class MeanPoolingLayer(nn.Module):
+    def __init__(self, input_size, target_size):
+        super(MeanPoolingLayer, self).__init__()
+        self.pool = MeanPooling()
+        self.fc = nn.Linear(input_size, target_size)
+
+    def forward(self, inputs, mask):
+        last_hidden_states = inputs[0]
+        feature = self.pool(last_hidden_states, mask)
+        outputs = self.fc(feature)
+        return outputs
+
+
+def weight_init_normal(module, model):
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+
+
+class USPPPMModel(nn.Module):
+    def __init__(self, backbone):
+        super(USPPPMModel, self).__init__()
+        self.config = AutoConfig.from_pretrained(backbone, output_hidden_states=True)
+        self.model = AutoModel.from_pretrained(backbone, config=self.config)
+        self.head = MeanPoolingLayer(768, 1)
+        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
+
+        # sectoks = ['[CTG]', '[CTX]', '[ANC]', '[TGT]']
+        # self.tokenizer.add_special_tokens({'additional_special_tokens': sectoks})
+        # self.model.resize_token_embeddings(len(self.tokenizer))
+
+    def _init_weights(self, layer):
+        for module in layer.modules():
+            init_fn = weight_init_normal
+            init_fn(module, self)
+            # print(type(module))
+
+    def forward(self, inputs):
+        outputs = self.model(**inputs)
+        outputs = self.head(outputs, inputs['attention_mask'])
+        return outputs
+
+
+table = """
+A: Human Necessities
+B: Operations and Transport
+C: Chemistry and Metallurgy
+D: Textiles
+E: Fixed Constructions
+F: Mechanical Engineering
+G: Physics
+H: Electricity
+Y: Emerging Cross-Sectional Technologies
+"""
+splits = [i for i in table.split('\n') if i != '']
+table = {e.split(': ')[0]: e.split(': ')[1] for e in splits}
+
+
+class USPPPMDataset(Dataset):
+    def __init__(self, tokenizer, max_length):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self): return 0
+
+    def __getitem__(self, x):
+        score = x['label']
+
+        sep = self.tokenizer.sep_token
+        s = x['anchor'] + sep + x['target'] + sep + x['title']
+
+        inputs = self.tokenizer(
+            s, add_special_tokens=True,
+            max_length=self.max_length, padding='max_length',
+            truncation=True,
+            return_offsets_mapping=False
+        )
+        for k, v in inputs.items():
+            inputs[k] = torch.tensor(v, dtype=torch.long).unsqueeze(dim=0)
+        label = torch.tensor(score, dtype=torch.float)
+        return inputs, label
+
+
+if __name__ == '__main__':
+    model = USPPPMModel('microsoft/deberta-v3-small')
+    model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
+    model.eval()
+
+    ds = USPPPMDataset(model.tokenizer, 133)
+
+    d = {
+        'anchor': 'sprayed',
+        'target': 'thermal sprayed coating',
+        'title': 'building',
+        'label': 0
+    }
+    inp = ds[d]
+    x = inp[0]
+
+    with torch.no_grad():
+        y = model(x)
+    print('y:', y)
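The MeanPooling module above averages token embeddings while excluding padded positions via the attention mask. A toy check on dummy tensors (assumed, not part of the commit):

# Toy check of MeanPooling: padded positions must not affect the mean.
import torch
from infer import MeanPooling

hidden = torch.ones(1, 4, 8)            # (batch, seq_len, hidden_size)
hidden[0, 2:] = 100.0                   # garbage values at padded positions
mask = torch.tensor([[1, 1, 0, 0]])     # attention mask: last two are padding

pooled = MeanPooling()(hidden, mask)
print(pooled[0, :3])                    # tensor([1., 1., 1.]) -> padding ignored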
model_weights.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0b49ff053c7beac972a85d464305398ab93252901348418af1692e7ca0959dd
+size 565268017
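Because the weights are stored as a Git LFS pointer, a freshly downloaded file can be verified against the sha256 recorded above. A small sketch:

# Verify model_weights.pth against the sha256 in the LFS pointer.
import hashlib

EXPECTED = "b0b49ff053c7beac972a85d464305398ab93252901348418af1692e7ca0959dd"

h = hashlib.sha256()
with open("model_weights.pth", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED, "weights file does not match the LFS pointer"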
requirements.txt ADDED
@@ -0,0 +1,6 @@
+streamlit==1.21.0
+Pillow
+protobuf
+torchvision==0.15.2
+torch==2.0.1
+numpy
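Note that several imports in app.py and infer.py (pandas, transformers, leafmap, geopandas, pydeck, requests) are not pinned here, so they are presumably provided by the Spaces base image or pulled in transitively. A quick local check against the explicit pins, sketched below:

# Confirm a local environment matches the pins in requirements.txt.
import streamlit, torch, torchvision

print(streamlit.__version__)    # expect 1.21.0
print(torch.__version__)        # expect 2.0.1 (may carry a +cpu/+cu suffix)
print(torchvision.__version__)  # expect 0.15.2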