Spaces:
"good run"
Browse files- README.md +30 -0
- app.py +116 -4
- cache/slot/tag2idx.json +11 -0
- cache/slot/vocab.pkl +3 -0
- ckpt/slot/slot_checkpoint.pth +3 -0
- dataset.py +70 -0
- model.py +68 -0
- requirements.txt +9 -0
- utils.py +42 -0
README.md
CHANGED
````diff
@@ -5,9 +5,39 @@ colorFrom: purple
 colorTo: yellow
 sdk: gradio
 sdk_version: 3.47.1
+python_version: 3.10
 app_file: app.py
+app_port: 7860
+fullWidth: false
 pinned: false
+hf_oauth: false
+disable_embedding: false
 license: apache-2.0
+tags:
+- NLP
+- Slot Tagging
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Slot Tagging
+
+![Demo](https://your-demo-url-here.com)
+
+This is a Slot Tagging demo powered by Gradio. It allows you to enter a sentence, and it will predict and highlight the slots in the text.
+
+## Usage
+
+1. Enter a sentence in the text box.
+2. Click the "Submit" button to see the predicted slots highlighted in the text.
+
+## Getting Started
+
+To run this demo locally, follow these steps:
+
+1. Clone this repository.
+2. Install the required dependencies listed in the `requirements.txt` file.
+3. Run the `app.py` file using Python.
+
+```bash
+python app.py
````
app.py
CHANGED
@@ -1,7 +1,119 @@ — the previous seven-line file (an `import gradio as gr` line, blanks, and four removed lines whose content is not shown) is replaced by the full demo app:
```python
import gradio as gr
import json
import pickle
from pathlib import Path
from utils import Vocab
from model import SeqTagger
from dataset import SeqTaggingClsDataset
from typing import Dict
import torch

# Disable cudnn to ensure the model runs on CPU
torch.backends.cudnn.enabled = False

# Define hyperparameters
max_len = 256
hidden_size = 500
num_layers = 2
dropout = 0.2
bidirectional = True
lr = 1e-3
batch_size = 1

device = "cpu"

# Model and data paths
ckpt_dir = Path("./ckpt/slot/")
cache_dir = Path("./cache/slot/")

# Load the vocabulary
with open(cache_dir / "vocab.pkl", "rb") as f:
    vocab: Vocab = pickle.load(f)

# Load the tag mapping
tag_idx_path = cache_dir / "tag2idx.json"
tag2idx: Dict[str, int] = json.loads(tag_idx_path.read_text())
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

def _idx2tag(idx: int):
    return idx2tag[idx]

# Create the dataset
datasets = SeqTaggingClsDataset({}, vocab, tag2idx, max_len)

# Create an uninitialized tensor with the defined shape
shape = (4117, 300)
embeddings = torch.empty(shape).to(device)

# Create the model
best_model = SeqTagger(
    embeddings=embeddings,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=bidirectional,
    num_class=len(tag2idx)
).to(device)

# Define the path to the model checkpoint
ckpt_path = ckpt_dir / "slot_checkpoint.pth"

# Load the model weights
checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
best_model.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode
best_model.eval()

def classify(text: str):
    # Tokenize the text
    str_text = [str(text.split())]
    dic_text = {"tokens": str_text, "tags": [None], "id": ["text-0"]}

    encoded_data = datasets.collate_fn(dic_text)

    preds = []
    mask = encoded_data['encoded_tags']
    mask = (mask != -1)

    # Use the trained model to predict each data point
    for encoded_token in encoded_data['encoded_tokens'].to(device):
        encoded_token = encoded_token.reshape(1, encoded_token.shape[0])

        outputs = best_model(encoded_token)
        outputs = torch.argmax(outputs, dim=1)[mask[0]].tolist()

        preds.extend([[_idx2tag(output) for output in outputs]])

    text_tags = []
    for i, tag in enumerate(preds[0]):
        if tag == "O":
            text_tags.extend([(text.split()[i], None), (" ", None)])
        else:
            text_tags.extend([(text.split()[i], tag), (" ", None)])

    return text_tags

# Create a Gradio interface
demo = gr.Interface(
    classify,
    gr.Textbox(placeholder="Please enter a text..."),
    gr.HighlightedText(),
    interpretation="none",
    live=False,
    enable_queue=True,
    examples=[
        ["i have three people for august seventh"],
        ["a table for 2 adults and 4 children please"],
        ["i have a booking tomorrow for chara conelly at 9pm"],
        ["me and 4 others will be there at 8:30pm"],
        ["probably malik belliard has done the booking and it is on in 10 days"],
        ["i want to book a table for me and my wife tonight at 6 p.m"],
        ["date 18th of december"]
    ],
    title="Slot Tagging",
    description="This is a demo for slot tagging. Enter a sentence, and it will predict and highlight the slots."
)

# Launch the Gradio interface
demo.launch(share=True)
```
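Note: `classify()` returns (token, label) pairs, one of the input formats `gr.HighlightedText` accepts; untagged tokens and the spacer strings carry a `None` label so they render unhighlighted. A minimal standalone sketch of that contract (the pairs below are illustrative, not actual model output, and this file is not part of the commit):

```python
import gradio as gr

# Illustrative (token, label) pairs in the same shape classify() returns;
# None means "no highlight", strings like "B-people" become colored labels.
pairs = [
    ("a", None), (" ", None),
    ("table", None), (" ", None),
    ("for", None), (" ", None),
    ("2", "B-people"), (" ", None),
    ("adults", "I-people"),
]

demo = gr.Interface(fn=lambda text: pairs, inputs=gr.Textbox(), outputs=gr.HighlightedText())
# demo.launch()
```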
cache/slot/tag2idx.json
ADDED
```json
{
    "B-time": 0,
    "I-date": 1,
    "I-people": 2,
    "B-last_name": 3,
    "O": 4,
    "I-time": 5,
    "B-date": 6,
    "B-people": 7,
    "B-first_name": 8
}
```
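Note: these are BIO tags over five slot types (time, date, people, first_name, last_name): `B-` opens a slot, `I-` continues it, `O` is outside any slot. As an illustration of how such a tag sequence groups into slots — a minimal sketch, not part of this repo:

```python
from typing import List, Tuple

def bio_to_spans(tokens: List[str], tags: List[str]) -> List[Tuple[str, str]]:
    """Group BIO tags into (slot_type, text) spans; assumes well-formed BIO."""
    spans, current, slot = [], [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if current:                      # close any open span
                spans.append((slot, " ".join(current)))
            slot, current = tag[2:], [token]
        elif tag.startswith("I-") and current:
            current.append(token)            # continue the open span
        else:                                # "O" (or a stray I-) closes it
            if current:
                spans.append((slot, " ".join(current)))
            current, slot = [], None
    if current:
        spans.append((slot, " ".join(current)))
    return spans

# bio_to_spans("a table for 2 adults".split(),
#              ["O", "O", "O", "B-people", "I-people"])
# -> [("people", "2 adults")]
```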
cache/slot/vocab.pkl
ADDED (Git LFS pointer)
```
version https://git-lfs.github.com/spec/v1
oid sha256:23fcbc41efc0b4c4aaa74d7dbb67c74cd2aea6490d19c3465020abe77b602647
size 49861
```
ckpt/slot/slot_checkpoint.pth
ADDED (Git LFS pointer)
```
version https://git-lfs.github.com/spec/v1
oid sha256:55f20f97d1122d3277b4d69ffc3407108f3f16ab5a9094df2cd8d259ae20db5a
size 97891750
```
dataset.py
ADDED
```python
# dataset.py

from typing import List, Dict
import torch
from torch.utils.data import Dataset
from utils import Vocab
import numpy as np
import re

class SeqClsDataset(Dataset):
    def __init__(
        self,
        data: List[Dict],
        vocab: Vocab,
        label_mapping: Dict[str, int],
        max_len: int,
    ):
        self.data = data
        self.vocab = vocab
        self.label_mapping = label_mapping
        self._idx2label = {idx: intent for intent, idx in self.label_mapping.items()}
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index) -> Dict:
        instance = self.data[index]
        return instance

    @property
    def num_classes(self) -> int:
        return len(self.label_mapping)

    def label2idx(self, label: str):
        return self.label_mapping[label]

    def idx2label(self, idx: int):
        return self._idx2label[idx]


class SeqTaggingClsDataset(SeqClsDataset):
    def collate_fn(self, samples: List[Dict]) -> Dict:
        batch_size = len(samples['tokens'])

        tokens = samples["tokens"]
        tags = samples["tags"]  # list[str]

        batch_data = self.vocab.token_to_id("[PAD]") * np.ones((batch_size, self.max_len))
        batch_labels = -1 * np.ones((batch_size, self.max_len))

        # Copy the data to the numpy array
        for j in range(batch_size):
            tokens[j] = eval(tokens[j])
            cur_len = len(tokens[j])
            tags[j] = [self.label_mapping["O"]] * cur_len

            batch_data[j][:cur_len] = self.vocab.encode(tokens[j])
            batch_labels[j][:cur_len] = tags[j]

        # Convert integer index sequences to PyTorch tensors
        batch_data = torch.LongTensor(batch_data)
        batch_labels = torch.LongTensor(batch_labels)

        # Create a batch data dictionary
        batch_data = {
            "encoded_tokens": batch_data,
            "encoded_tags": batch_labels
        }
        return batch_data
```
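Note two quirks here: `collate_fn` takes a single dict of parallel lists (exactly the `dic_text` that app.py builds by hand), not the list of per-sample dicts a `DataLoader` would normally pass to a collate function; and it `eval()`s each tokens entry because app.py serialises the token list with `str()`. A sketch of that round trip, under those assumptions:

```python
# Hypothetical illustration of app.py's calling convention for collate_fn.
text = "a table for 2 adults"

# app.py packs the token list as its str() representation...
sample = {"tokens": [str(text.split())], "tags": [None], "id": ["text-0"]}

# ...and collate_fn's eval() turns it back into a list of tokens.
tokens = eval(sample["tokens"][0])
assert tokens == ["a", "table", "for", "2", "adults"]
```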
model.py
ADDED
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SeqClassifier(nn.Module):
    def __init__(
        self,
        embeddings: torch.tensor,
        hidden_size: int,
        num_layers: int,
        dropout: float,
        bidirectional: bool,
        num_class: int,
    ) -> None:
        super(SeqClassifier, self).__init__()
        self.embed = nn.Embedding.from_pretrained(embeddings, freeze=False)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        self.num_class = num_class

        # Model architecture
        self.rnn = nn.GRU(
            input_size=embeddings.size(1),
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=True
        )
        self.dropout_layer = nn.Dropout(p=self.dropout)
        self.fc = nn.Linear(self.encoder_output_size, num_class)

    @property
    def encoder_output_size(self) -> int:
        # Calculate the output dimension of RNN
        if self.bidirectional:
            return self.hidden_size * 2
        else:
            return self.hidden_size

class SeqTagger(SeqClassifier):
    def __init__(self, embeddings, hidden_size, num_layers, dropout, bidirectional, num_class):
        super(SeqTagger, self).__init__(embeddings, hidden_size, num_layers, dropout, bidirectional, num_class)

    def forward(self, batch) -> torch.Tensor:
        # Apply the embedding layer that maps each token to its embedding
        batch = self.embed(batch)

        # Run the GRU along the sentences of length batch_max_len
        batch, _ = self.rnn(batch)  # dim: batch_size x max_len x hidden_size

        batch = self.dropout_layer(batch)

        if not self.training:
            # Remove this block after completing train_slot, if batch and predict should be combined
            batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size

            # Pass through the fully connected layer
            batch = self.fc(batch)
            return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags

        batch = batch.reshape(-1, batch.shape[2])  # dim: batch_size*max_len x hidden_size

        # Pass through the fully connected layer
        batch = self.fc(batch)
        return F.log_softmax(batch, dim=1)  # dim: batch_size*max_len x num_tags
```
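Note: with `bidirectional=True` the GRU emits `hidden_size * 2` features per token, which `encoder_output_size` feeds into the linear layer, and `forward` flattens batch and time before classifying. A quick shape check with random stand-in embeddings (sizes taken from app.py; this snippet is not part of the commit):

```python
import torch
from model import SeqTagger

# Random embeddings stand in for the real pretrained matrix (4117 x 300 in app.py).
embeddings = torch.randn(4117, 300)
model = SeqTagger(embeddings, hidden_size=500, num_layers=2,
                  dropout=0.2, bidirectional=True, num_class=9).eval()

with torch.no_grad():
    out = model(torch.zeros(1, 256, dtype=torch.long))  # (batch, max_len)
print(out.shape)  # torch.Size([256, 9]): (batch_size * max_len, num_tags)
```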
requirements.txt
ADDED
```
torch==1.12.1
seqeval==1.2.2
tqdm
numpy
pandas
scikit-learn==1.1.2
transformers[torch]
datasets
huggingface-hub
```
utils.py
ADDED
```python
from typing import Iterable, List

class Vocab:
    PAD = "[PAD]"
    UNK = "[UNK]"

    def __init__(self, vocab: Iterable[str]) -> None:
        self.token2idx = {
            Vocab.PAD: 0,
            Vocab.UNK: 1,
            **{token: i for i, token in enumerate(vocab, 2)},
        }

    @property
    def pad_id(self) -> int:
        return self.token2idx[Vocab.PAD]

    @property
    def unk_id(self) -> int:
        return self.token2idx[Vocab.UNK]

    @property
    def tokens(self) -> List[str]:
        return list(self.token2idx.keys())

    def token_to_id(self, token: str) -> int:
        return self.token2idx.get(token, self.unk_id)

    def encode(self, tokens: List[str]) -> List[int]:
        return [self.token_to_id(token) for token in tokens]

    def encode_batch(
        self, batch_tokens: List[List[str]], to_len: int = None
    ) -> List[List[int]]:
        batch_ids = [self.encode(tokens) for tokens in batch_tokens]
        to_len = max(len(ids) for ids in batch_ids) if to_len is None else to_len
        padded_ids = pad_to_len(batch_ids, to_len, self.pad_id)
        return padded_ids

def pad_to_len(seqs: List[List[int]], to_len: int, padding: int) -> List[List[int]]:
    paddeds = [seq[:to_len] + [padding] * max(0, to_len - len(seq)) for seq in seqs]
    return paddeds
```
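Note: a quick sketch of `Vocab`'s behaviour (the exact ids depend on vocabulary order, so the numbers here are illustrative):

```python
from utils import Vocab

vocab = Vocab(["table", "for", "adults"])
print(vocab.pad_id, vocab.unk_id)       # 0 1
print(vocab.encode(["table", "quux"]))  # [2, 1] -- unknown tokens map to [UNK]
print(vocab.encode_batch([["table"], ["for", "adults"]]))
# [[2, 0], [3, 4]] -- padded with pad_id to the longest sequence
```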