Spaces:
"good run"
Browse files- README.md +30 -0
- app.py +116 -4
- cache/slot/tag2idx.json +11 -0
- cache/slot/vocab.pkl +3 -0
- ckpt/slot/slot_checkpoint.pth +3 -0
- dataset.py +70 -0
- model.py +68 -0
- requirements.txt +9 -0
- utils.py +42 -0
README.md
CHANGED
````diff
@@ -5,9 +5,39 @@ colorFrom: purple
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.47.1
+python_version: 3.10
 app_file: app.py
+app_port: 7860
+fullWidth: false
 pinned: false
+hf_oauth: false
+disable_embedding: false
 license: apache-2.0
+tags:
+- NLP
+- Slot Tagging
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Slot Tagging
+
+![Demo](https://your-demo-url-here.com)
+
+This is a Slot Tagging demo powered by Gradio. It allows you to enter a sentence, and it will predict and highlight the slots in the text.
+
+## Usage
+
+1. Enter a sentence in the text box.
+2. Click the "Submit" button to see the predicted slots highlighted in the text.
+
+## Getting Started
+
+To run this demo locally, follow these steps:
+
+1. Clone this repository.
+2. Install the required dependencies listed in the `requirements.txt` file.
+3. Run the `app.py` file using Python.
+
+```bash
+python app.py
````
app.py
CHANGED
@@ -1,7 +1,119 @@ — the previous seven-line file (an `import gradio as gr` line, blanks, and four removed lines whose content is not shown) is replaced by the full demo app:
```python
import gradio as gr
import json
import pickle
from pathlib import Path
from utils import Vocab
from model import SeqTagger
from dataset import SeqTaggingClsDataset
from typing import Dict
import torch

# Disable cudnn to ensure the model runs on CPU
torch.backends.cudnn.enabled = False

# Define hyperparameters
max_len = 256
hidden_size = 500
num_layers = 2
dropout = 0.2
bidirectional = True
lr = 1e-3
batch_size = 1

device = "cpu"

# Model and data paths
ckpt_dir = Path("./ckpt/slot/")
cache_dir = Path("./cache/slot/")

# Load the vocabulary
with open(cache_dir / "vocab.pkl", "rb") as f:
    vocab: Vocab = pickle.load(f)

# Load the tag mapping
tag_idx_path = cache_dir / "tag2idx.json"
tag2idx: Dict[str, int] = json.loads(tag_idx_path.read_text())
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

def _idx2tag(idx: int):
    return idx2tag[idx]

# Create the dataset
datasets = SeqTaggingClsDataset({}, vocab, tag2idx, max_len)

# Create an uninitialized tensor with the defined shape
shape = (4117, 300)
embeddings = torch.empty(shape).to(device)

# Create the model
best_model = SeqTagger(
    embeddings=embeddings,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=bidirectional,
    num_class=len(tag2idx)
).to(device)

# Define the path to the model checkpoint
ckpt_path = ckpt_dir / "slot_checkpoint.pth"

# Load the model weights
checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
best_model.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode
best_model.eval()

def classify(text: str):
    # Tokenize the text
    str_text = [str(text.split())]
    dic_text = {"tokens": str_text, "tags": [None], "id": ["text-0"]}

    encoded_data = datasets.collate_fn(dic_text)

    preds = []
    mask = encoded_data['encoded_tags']
    mask = (mask != -1)

    # Use the trained model to predict each data point
    for encoded_token in encoded_data['encoded_tokens'].to(device):
        encoded_token = encoded_token.reshape(1, encoded_token.shape[0])

        outputs = best_model(encoded_token)
        outputs = torch.argmax(outputs, dim=1)[mask[0]].tolist()

        preds.extend([[_idx2tag(output) for output in outputs]])

    text_tags = []
    for i, tag in enumerate(preds[0]):
        if tag == "O":
            text_tags.extend([(text.split()[i], None), (" ", None)])
        else:
            text_tags.extend([(text.split()[i], tag), (" ", None)])

    return text_tags

# Create a Gradio interface
demo = gr.Interface(
    classify,
    gr.Textbox(placeholder="Please enter a text..."),
    gr.HighlightedText(),
    interpretation="none",
    live=False,
    enable_queue=True,
    examples=[
        ["i have three people for august seventh"],
        ["a table for 2 adults and 4 children please"],
        ["i have a booking tomorrow for chara conelly at 9pm"],
        ["me and 4 others will be there at 8:30pm"],
        ["probably malik belliard has done the booking and it is on in 10 days"],
        ["i want to book a table for me and my wife tonight at 6 p.m"],
        ["date 18th of december"]
    ],
    title="Slot Tagging",
    description="This is a demo for slot tagging. Enter a sentence, and it will predict and highlight the slots."
)

# Launch the Gradio interface
demo.launch(share=True)
```
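Note: `classify()` returns (token, label) pairs, one of the input formats `gr.HighlightedText` accepts; untagged tokens and the spacer strings carry a `None` label so they render unhighlighted. A minimal standalone sketch of that contract (the pairs below are illustrative, not actual model output, and this file is not part of the commit):

```python
import gradio as gr

# Illustrative (token, label) pairs in the same shape classify() returns;
# None means "no highlight", strings like "B-people" become colored labels.
pairs = [
    ("a", None), (" ", None),
    ("table", None), (" ", None),
    ("for", None), (" ", None),
    ("2", "B-people"), (" ", None),
    ("adults", "I-people"),
]

demo = gr.Interface(fn=lambda text: pairs, inputs=gr.Textbox(), outputs=gr.HighlightedText())
# demo.launch()
```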
cache/slot/tag2idx.json
ADDED
```json
{
    "B-time": 0,
    "I-date": 1,
    "I-people": 2,
    "B-last_name": 3,
    "O": 4,
    "I-time": 5,
    "B-date": 6,
    "B-people": 7,
    "B-first_name": 8
}
```
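Note: these are BIO tags over five slot types (time, date, people, first_name, last_name): `B-` opens a slot, `I-` continues it, `O` is outside any slot. As an illustration of how such a tag sequence groups into slots — a minimal sketch, not part of this repo:

```python
from typing import List, Tuple

def bio_to_spans(tokens: List[str], tags: List[str]) -> List[Tuple[str, str]]:
    """Group BIO tags into (slot_type, text) spans; assumes well-formed BIO."""
    spans, current, slot = [], [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if current:                      # close any open span
                spans.append((slot, " ".join(current)))
            slot, current = tag[2:], [token]
        elif tag.startswith("I-") and current:
            current.append(token)            # continue the open span
        else:                                # "O" (or a stray I-) closes it
            if current:
                spans.append((slot, " ".join(current)))
            current, slot = [], None
    if current:
        spans.append((slot, " ".join(current)))
    return spans

# bio_to_spans("a table for 2 adults".split(),
#              ["O", "O", "O", "B-people", "I-people"])
# -> [("people", "2 adults")]
```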
cache/slot/vocab.pkl
ADDED (Git LFS pointer)
```
version https://git-lfs.github.com/spec/v1
oid sha256:23fcbc41efc0b4c4aaa74d7dbb67c74cd2aea6490d19c3465020abe77b602647
size 49861
```
ckpt/slot/slot_checkpoint.pth
ADDED (Git LFS pointer)
```
version https://git-lfs.github.com/spec/v1
oid sha256:55f20f97d1122d3277b4d69ffc3407108f3f16ab5a9094df2cd8d259ae20db5a
size 97891750
```
dataset.py
ADDED
```python
# dataset.py

from typing import List, Dict
import torch
from torch.utils.data import Dataset
from utils import Vocab
import numpy as np
import re

class SeqClsDataset(Dataset):
    def __init__(
        self,
        data: List[Dict],
        vocab: Vocab,
        label_mapping: Dict[str, int],
        max_len: int,
    ):
        self.data = data
        self.vocab = vocab
        self.label_mapping = label_mapping
        self._idx2label = {idx: intent for intent, idx in self.label_mapping.items()}
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index) -> Dict:
        instance = self.data[index]
        return instance

    @property
    def num_classes(self) -> int:
        return len(self.label_mapping)

    def label2idx(self, label: str):
        return self.label_mapping[label]

    def idx2label(self, idx: int):
        return self._idx2label[idx]


class SeqTaggingClsDataset(SeqClsDataset):
    def collate_fn(self, samples: List[Dict]) -> Dict:
        batch_size = len(samples['tokens'])

        tokens = samples["tokens"]
        tags = samples["tags"]  # list[str]

        batch_data = self.vocab.token_to_id("[PAD]") * np.ones((batch_size, self.max_len))
        batch_labels = -1 * np.ones((batch_size, self.max_len))

        # Copy the data to the numpy array
        for j in range(batch_size):
            tokens[j] = eval(tokens[j])
            cur_len = len(tokens[j])
            tags[j] = [self.label_mapping["O"]] * cur_len

            batch_data[j][:cur_len] = self.vocab.encode(tokens[j])
            batch_labels[j][:cur_len] = tags[j]

        # Convert integer index sequences to PyTorch tensors
        batch_data = torch.LongTensor(batch_data)
        batch_labels = torch.LongTensor(batch_labels)

        # Create a batch data dictionary
        batch_data = {
            "encoded_tokens": batch_data,
            "encoded_tags": batch_labels
        }
        return batch_data
```
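Note two quirks here: `collate_fn` takes a single dict of parallel lists (exactly the `dic_text` that app.py builds by hand), not the list of per-sample dicts a `DataLoader` would normally pass to a collate function; and it `eval()`s each tokens entry because app.py serialises the token list with `str()`. A sketch of that round trip, under those assumptions:

```python
# Hypothetical illustration of app.py's calling convention for collate_fn.
text = "a table for 2 adults"

# app.py packs the token list as its str() representation...
sample = {"tokens": [str(text.split())], "tags": [None], "id": ["text-0"]}

# ...and collate_fn's eval() turns it back into a list of tokens.
tokens = eval(sample["tokens"][0])
assert tokens == ["a", "table", "for", "2", "adults"]
```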
model.py
ADDED
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SeqClassifier(nn.Module):
    def __init__(
        self,
        embeddings: torch.tensor,
        hidden_size: int,
        num_layers: int,
        dropout: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SeqClassifier, self).__init__()
        self.embed = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.num_class = num_class

        # Model architecture
        self.rnn = nn.GRU(
            input_size=embeddings.size(1),
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout_layer = nn.Dropout(p=self.dropout)
        self.fc = nn.Linear(self.encoder_output_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # Calculate the output dimension of RNN
        if self.bidirectional:
            return self.hidden_size * 2
        else:
            return self.hidden_size

class SeqTagger(SeqClassifier):
    def __init__(self, embeddings, hidden_size, num_layers, dropout, bidirectional, num_class):
        super(SeqTagger, self).__init__(embeddings, hidden_size, num_layers, dropout, bidirectional, num_class)

    def forward(self, batch) -> torch.Tensor:
        # Apply the embedding layer that maps each token to its embedding
        batch = self.embed(batch)

        # Run the GRU along the sentences of length batch_max_len
        batch, _ = self.rnn(batch)  # dim: batch_size x max_len x hidden_size

        batch = self.dropout_layer(batch)

        if not self.training:
            # Remove this block after completing train_slot, if batch and predict should be combined
            batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size

            # Pass through the fully connected layer
            batch = self.fc(batch)
            return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags

        batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size

        # Pass through the fully connected layer
        batch = self.fc(batch)
        return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags
```
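Note: with `bidirectional=True` the GRU emits `hidden_size * 2` features per token, which `encoder_output_size` feeds into the linear layer, and `forward` flattens batch and time before classifying. A quick shape check with random stand-in embeddings (sizes taken from app.py; this snippet is not part of the commit):

```python
import torch
from model import SeqTagger

# Random embeddings stand in for the real pretrained matrix (4117 x 300 in app.py).
embeddings = torch.randn(4117, 300)
model = SeqTagger(embeddings, hidden_size=500, num_layers=2,
                  dropout=0.2, bidirectional=True, num_class=9).eval()

with torch.no_grad():
    out = model(torch.zeros(1, 256, dtype=torch.long))  # (batch, max_len)
print(out.shape)  # torch.Size([256, 9]): (batch_size * max_len, num_tags)
```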
requirements.txt
ADDED
```
torch==1.12.1
seqeval==1.2.2
tqdm
numpy
pandas
scikit-learn==1.1.2
transformers[torch]
datasets
huggingface-hub
```
utils.py
ADDED
```python
from typing import Iterable, List

class Vocab:
    PAD = "[PAD]"
    UNK = "[UNK]"

    def __init__(self, vocab: Iterable[str]) -> None:
        self.token2idx = {
            Vocab.PAD: 0,
            Vocab.UNK: 1,
            **{token: i for i, token in enumerate(vocab, 2)},
        }

    @property
    def pad_id(self) -> int:
        return self.token2idx[Vocab.PAD]

    @property
    def unk_id(self) -> int:
        return self.token2idx[Vocab.UNK]

    @property
    def tokens(self) -> List[str]:
        return list(self.token2idx.keys())

    def token_to_id(self, token: str) -> int:
        return self.token2idx.get(token, self.unk_id)

    def encode(self, tokens: List[str]) -> List[int]:
        return [self.token_to_id(token) for token in tokens]

    def encode_batch(
        self, batch_tokens: List[List[str]], to_len: int = None
    ) -> List[List[int]]:
        batch_ids = [self.encode(tokens) for tokens in batch_tokens]
        to_len = max(len(ids) for ids in batch_ids) if to_len is None else to_len
        padded_ids = pad_to_len(batch_ids, to_len, self.pad_id)
        return padded_ids

def pad_to_len(seqs: List[List[int]], to_len: int, padding: int) -> List[List[int]]:
    paddeds = [seq[:to_len] + [padding] * max(0, to_len - len(seq)) for seq in seqs]
    return paddeds
```
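Note: a quick sketch of `Vocab`'s behaviour (the exact ids depend on vocabulary order, so the numbers here are illustrative):

```python
from utils import Vocab

vocab = Vocab(["table", "for", "adults"])
print(vocab.pad_id, vocab.unk_id)       # 0 1
print(vocab.encode(["table", "quux"]))  # [2, 1] -- unknown tokens map to [UNK]
print(vocab.encode_batch([["table"], ["for", "adults"]]))
# [[2, 0], [3, 4]] -- padded with pad_id to the longest sequence
```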