xjlulu committed
Commit fba58f1
1 Parent(s): 702e96a

"good run"

README.md CHANGED
@@ -5,9 +5,39 @@ colorFrom: purple
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.47.1
+python_version: 3.10
 app_file: app.py
+app_port: 7860
+fullWidth: false
 pinned: false
+hf_oauth: false
+disable_embedding: false
 license: apache-2.0
+tags:
+- NLP
+- Slot Tagging
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Slot Tagging
+
+![Demo](https://your-demo-url-here.com)
+
+This is a slot tagging demo powered by Gradio. Enter a sentence and it will predict and highlight the slots in the text.
+
+## Usage
+
+1. Enter a sentence in the text box.
+2. Click the "Submit" button to see the predicted slots highlighted in the text.
+
+## Getting Started
+
+To run this demo locally, follow these steps:
+
+1. Clone this repository.
+2. Install the required dependencies listed in the `requirements.txt` file.
+3. Run the `app.py` file with Python:
+
+```bash
+python app.py
+```
app.py CHANGED
@@ -1,7 +1,119 @@
 import gradio as gr
+import json
+import pickle
+from pathlib import Path
+from utils import Vocab
+from model import SeqTagger
+from dataset import SeqTaggingClsDataset
+from typing import Dict
+import torch

-def greet(name):
-    return "Hello " + name + "!!"
+# Disable cuDNN; inference runs on the CPU
+torch.backends.cudnn.enabled = False

-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+# Define hyperparameters
+max_len = 256
+hidden_size = 500
+num_layers = 2
+dropout = 0.2
+bidirectional = True
+lr = 1e-3
+batch_size = 1
+
+device = "cpu"
+
+# Model and data paths
+ckpt_dir = Path("./ckpt/slot/")
+cache_dir = Path("./cache/slot/")
+
+# Load the vocabulary
+with open(cache_dir / "vocab.pkl", "rb") as f:
+    vocab: Vocab = pickle.load(f)
+
+# Load the tag mapping
+tag_idx_path = cache_dir / "tag2idx.json"
+tag2idx: Dict[str, int] = json.loads(tag_idx_path.read_text())
+idx2tag = {idx: tag for tag, idx in tag2idx.items()}
+
+def _idx2tag(idx: int):
+    return idx2tag[idx]
+
+# Create the dataset
+datasets = SeqTaggingClsDataset({}, vocab, tag2idx, max_len)
+
+# Create an uninitialized embedding tensor; the real weights come from the checkpoint below
+shape = (4117, 300)
+embeddings = torch.empty(shape).to(device)
+
+# Create the model
+best_model = SeqTagger(
+    embeddings=embeddings,
+    hidden_size=hidden_size,
+    num_layers=num_layers,
+    dropout=dropout,
+    bidirectional=bidirectional,
+    num_class=len(tag2idx)
+).to(device)
+
+# Define the path to the model checkpoint
+ckpt_path = ckpt_dir / "slot_checkpoint.pth"
+
+# Load the model weights
+checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
+best_model.load_state_dict(checkpoint['model_state_dict'])
+
+# Set the model to evaluation mode
+best_model.eval()
+
+def classify(text: str):
+    # Tokenize the text; collate_fn expects a stringified token list
+    str_text = [str(text.split())]
+    dic_text = {"tokens": str_text, "tags": [None], "id": ["text-0"]}
+
+    encoded_data = datasets.collate_fn(dic_text)
+
+    preds = []
+    mask = encoded_data['encoded_tags']
+    mask = (mask != -1)  # True at real token positions, False at -1 padding
+
+    # Use the trained model to predict each data point
+    for encoded_token in encoded_data['encoded_tokens'].to(device):
+        encoded_token = encoded_token.reshape(1, encoded_token.shape[0])
+
+        outputs = best_model(encoded_token)
+        outputs = torch.argmax(outputs, dim=1)[mask[0]].tolist()
+
+        preds.extend([[_idx2tag(output) for output in outputs]])
+
+    text_tags = []  # (token, tag) pairs for gr.HighlightedText
+    for i, tag in enumerate(preds[0]):
+        if tag == "O":
+            text_tags.extend([(text.split()[i], None), (" ", None)])
+        else:
+            text_tags.extend([(text.split()[i], tag), (" ", None)])
+
+    return text_tags
+
+# Create a Gradio interface
+demo = gr.Interface(
+    classify,
+    gr.Textbox(placeholder="Please enter a text..."),
+    gr.HighlightedText(),
+    interpretation="none",
+    live=False,
+    enable_queue=True,
+    examples=[
+        ["i have three people for august seventh"],
+        ["a table for 2 adults and 4 children please"],
+        ["i have a booking tomorrow for chara conelly at 9pm"],
+        ["me and 4 others will be there at 8:30pm"],
+        ["probably malik belliard has done the booking and it is on in 10 days"],
+        ["i want to book a table for me and my wife tonight at 6 p.m"],
+        ["date 18th of december"]
+    ],
+    title="Slot Tagging",
+    description="This is a demo for slot tagging. Enter a sentence, and it will predict and highlight the slots."
+)
+
+# Launch the Gradio interface
+demo.launch(share=True)
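For readers unfamiliar with `gr.HighlightedText`: the value `classify()` returns is a list of `(token, label)` pairs, where `None` marks untagged spans and each tagged token carries its BIO label. A minimal sketch of what a prediction for the first example sentence could look like (the specific tags shown here are hypothetical, not an actual model output):

```python
# Hypothetical return value of classify("i have three people for august seventh").
# gr.HighlightedText renders each (token, label) pair and leaves None-labeled text plain.
example_output = [
    ("i", None), (" ", None),
    ("have", None), (" ", None),
    ("three", "B-people"), (" ", None),
    ("people", None), (" ", None),
    ("for", None), (" ", None),
    ("august", "B-date"), (" ", None),
    ("seventh", "I-date"), (" ", None),
]
```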
cache/slot/tag2idx.json ADDED
@@ -0,0 +1,11 @@
+{
+    "B-time": 0,
+    "I-date": 1,
+    "I-people": 2,
+    "B-last_name": 3,
+    "O": 4,
+    "I-time": 5,
+    "B-date": 6,
+    "B-people": 7,
+    "B-first_name": 8
+}
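The mapping follows the BIO scheme: `B-` marks the first token of a slot, `I-` marks its continuation, and `O` marks everything else. The helper below is not part of the repo; it is only a sketch of how the predicted BIO tags could be grouped back into `(slot, text)` spans:

```python
# Hypothetical helper (not in the repo): group BIO tags into (slot, text) spans.
def bio_to_spans(tokens, tags):
    spans, current = [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if current:
                spans.append(current)
            current = (tag[2:], [token])           # start a new span
        elif tag.startswith("I-") and current and current[0] == tag[2:]:
            current[1].append(token)               # extend the current span
        else:
            if current:
                spans.append(current)
            current = None
    if current:
        spans.append(current)
    return [(slot, " ".join(words)) for slot, words in spans]

print(bio_to_spans(
    ["a", "table", "for", "2", "people", "on", "august", "seventh"],
    ["O", "O", "O", "B-people", "I-people", "O", "B-date", "I-date"],
))
# [('people', '2 people'), ('date', 'august seventh')]
```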
cache/slot/vocab.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23fcbc41efc0b4c4aaa74d7dbb67c74cd2aea6490d19c3465020abe77b602647
+size 49861
ckpt/slot/slot_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55f20f97d1122d3277b4d69ffc3407108f3f16ab5a9094df2cd8d259ae20db5a
+size 97891750
dataset.py ADDED
@@ -0,0 +1,70 @@
+# dataset.py
+
+from typing import List, Dict
+import torch
+from torch.utils.data import Dataset
+from utils import Vocab
+import numpy as np
+import re
+
+class SeqClsDataset(Dataset):
+    def __init__(
+        self,
+        data: List[Dict],
+        vocab: Vocab,
+        label_mapping: Dict[str, int],
+        max_len: int,
+    ):
+        self.data = data
+        self.vocab = vocab
+        self.label_mapping = label_mapping
+        self._idx2label = {idx: intent for intent, idx in self.label_mapping.items()}
+        self.max_len = max_len
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    def __getitem__(self, index) -> Dict:
+        instance = self.data[index]
+        return instance
+
+    @property
+    def num_classes(self) -> int:
+        return len(self.label_mapping)
+
+    def label2idx(self, label: str):
+        return self.label_mapping[label]
+
+    def idx2label(self, idx: int):
+        return self._idx2label[idx]
+
+
+class SeqTaggingClsDataset(SeqClsDataset):
+    def collate_fn(self, samples: Dict) -> Dict:
+        batch_size = len(samples['tokens'])
+
+        tokens = samples["tokens"]
+        tags = samples["tags"]  # overwritten below with "O" placeholders for inference
+
+        batch_data = self.vocab.token_to_id("[PAD]") * np.ones((batch_size, self.max_len))
+        batch_labels = -1 * np.ones((batch_size, self.max_len))
+
+        # Copy the data into the numpy arrays
+        for j in range(batch_size):
+            tokens[j] = eval(tokens[j])  # parse the stringified token list back into a list
+            cur_len = len(tokens[j])
+            tags[j] = [self.label_mapping["O"]] * cur_len
+
+            batch_data[j][:cur_len] = self.vocab.encode(tokens[j])
+            batch_labels[j][:cur_len] = tags[j]
+
+        # Convert integer index sequences to PyTorch tensors
+        batch_data = torch.LongTensor(batch_data)
+        batch_labels = torch.LongTensor(batch_labels)
+
+        # Create a batch data dictionary
+        batch_data = {
+            "encoded_tokens": batch_data,
+            "encoded_tags": batch_labels
+        }
+        return batch_data
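A small usage sketch (assuming the cached `vocab.pkl` and `tag2idx.json` from this repo are present) showing how `app.py` drives `collate_fn` for a single sentence: tokens are passed as a stringified list, and padding positions come back as `-1` in `encoded_tags`:

```python
import json
import pickle
from pathlib import Path

from dataset import SeqTaggingClsDataset
from utils import Vocab

cache_dir = Path("./cache/slot/")
with open(cache_dir / "vocab.pkl", "rb") as f:
    vocab: Vocab = pickle.load(f)
tag2idx = json.loads((cache_dir / "tag2idx.json").read_text())

dataset = SeqTaggingClsDataset({}, vocab, tag2idx, max_len=256)

# collate_fn expects the tokens as a stringified list, mirroring app.py
batch = dataset.collate_fn({
    "tokens": [str("a table for 2 adults".split())],
    "tags": [None],
    "id": ["demo-0"],
})
print(batch["encoded_tokens"].shape)             # torch.Size([1, 256])
print(int((batch["encoded_tags"] != -1).sum()))  # 5 non-padding positions
```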
model.py ADDED
@@ -0,0 +1,68 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SeqClassifier(nn.Module):
+    def __init__(
+        self,
+        embeddings: torch.Tensor,
+        hidden_size: int,
+        num_layers: int,
+        dropout: float,
+        bidirectional: bool,
+        num_class: int,
+    ) -> None:
+        super(SeqClassifier, self).__init__()
+        self.embed = nn.Embedding.from_pretrained(embeddings, freeze=False)
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.bidirectional = bidirectional
+        self.num_class = num_class
+
+        # Model architecture
+        self.rnn = nn.GRU(
+            input_size=embeddings.size(1),
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+            dropout=dropout,
+            bidirectional=bidirectional,
+            batch_first=True
+        )
+        self.dropout_layer = nn.Dropout(p=self.dropout)
+        self.fc = nn.Linear(self.encoder_output_size, num_class)
+
+    @property
+    def encoder_output_size(self) -> int:
+        # Output dimension of the (optionally bidirectional) RNN
+        if self.bidirectional:
+            return self.hidden_size * 2
+        else:
+            return self.hidden_size
+
+class SeqTagger(SeqClassifier):
+    def __init__(self, embeddings, hidden_size, num_layers, dropout, bidirectional, num_class):
+        super(SeqTagger, self).__init__(embeddings, hidden_size, num_layers, dropout, bidirectional, num_class)
+
+    def forward(self, batch) -> torch.Tensor:
+        # Apply the embedding layer that maps each token to its embedding
+        batch = self.embed(batch)
+
+        # Run the GRU along the sentences of length batch_max_len
+        batch, _ = self.rnn(batch)  # dim: batch_size x max_len x hidden_size
+
+        batch = self.dropout_layer(batch)
+
+        if not self.training:
+            # TODO: remove this branch after train_slot is complete, if the training and predict paths are merged
+            batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size
+
+            # Pass through the fully connected layer
+            batch = self.fc(batch)
+            return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags
+
+        batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size
+
+        # Pass through the fully connected layer
+        batch = self.fc(batch)
+        return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags
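A quick shape check (hyperparameters copied from `app.py`; the embedding tensor here is uninitialized, so only the shapes are meaningful): in eval mode `SeqTagger.forward` flattens the batch, so a `(1, 256)` batch of token ids yields `(256, 9)` per-position tag log-probabilities.

```python
import torch
from model import SeqTagger

# Same hyperparameters as app.py; the embedding weights are uninitialized in this sketch
embeddings = torch.empty((4117, 300))
model = SeqTagger(embeddings, hidden_size=500, num_layers=2,
                  dropout=0.2, bidirectional=True, num_class=9)
model.eval()

token_ids = torch.zeros((1, 256), dtype=torch.long)  # batch_size x max_len
with torch.no_grad():
    log_probs = model(token_ids)
print(log_probs.shape)  # torch.Size([256, 9]): one row of tag log-probabilities per position
```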
requirements.txt ADDED
@@ -0,0 +1,9 @@
+torch==1.12.1
+seqeval==1.2.2
+tqdm
+numpy
+pandas
+scikit-learn==1.1.2
+transformers[torch]
+datasets
+huggingface-hub
utils.py ADDED
@@ -0,0 +1,42 @@
+from typing import Iterable, List
+
+class Vocab:
+    PAD = "[PAD]"
+    UNK = "[UNK]"
+
+    def __init__(self, vocab: Iterable[str]) -> None:
+        self.token2idx = {
+            Vocab.PAD: 0,
+            Vocab.UNK: 1,
+            **{token: i for i, token in enumerate(vocab, 2)},
+        }
+
+    @property
+    def pad_id(self) -> int:
+        return self.token2idx[Vocab.PAD]
+
+    @property
+    def unk_id(self) -> int:
+        return self.token2idx[Vocab.UNK]
+
+    @property
+    def tokens(self) -> List[str]:
+        return list(self.token2idx.keys())
+
+    def token_to_id(self, token: str) -> int:
+        return self.token2idx.get(token, self.unk_id)
+
+    def encode(self, tokens: List[str]) -> List[int]:
+        return [self.token_to_id(token) for token in tokens]
+
+    def encode_batch(
+        self, batch_tokens: List[List[str]], to_len: int = None
+    ) -> List[List[int]]:
+        batch_ids = [self.encode(tokens) for tokens in batch_tokens]
+        to_len = max(len(ids) for ids in batch_ids) if to_len is None else to_len
+        padded_ids = pad_to_len(batch_ids, to_len, self.pad_id)
+        return padded_ids
+
+def pad_to_len(seqs: List[List[int]], to_len: int, padding: int) -> List[List[int]]:
+    paddeds = [seq[:to_len] + [padding] * max(0, to_len - len(seq)) for seq in seqs]
+    return paddeds
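A toy example (a made-up three-word vocabulary, not the cached one) of how `Vocab` maps tokens to ids, falls back to `[UNK]`, and pads batches with `[PAD]`:

```python
from utils import Vocab

vocab = Vocab(["table", "for", "two"])                   # ids: [PAD]=0, [UNK]=1, then tokens from 2
print(vocab.encode(["a", "table", "for", "two"]))        # [1, 2, 3, 4] -- "a" maps to [UNK]
print(vocab.encode_batch([["table"], ["for", "two"]]))   # [[2, 0], [3, 4]] padded to length 2
```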