Upload 12 files

Browse files

Files changed (12) hide show

training/dataset_dict.json +1 -0
training/test/data-00000-of-00001.arrow +3 -0
training/test/dataset_info.json +62 -0
training/test/state.json +13 -0
training/train.py +134 -0
training/train/data-00000-of-00001.arrow +3 -0
training/train/dataset_info.json +62 -0
training/train/google_wellformed_query_dataset.csv +0 -0
training/train/state.json +13 -0
training/validation/data-00000-of-00001.arrow +3 -0
training/validation/dataset_info.json +62 -0
training/validation/state.json +13 -0

training/dataset_dict.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"splits": ["train", "test", "validation"]}

training/test/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9afc5d689789eb1388168b261564e63112bb8f0a9d3f9c96a1cf590f73a9449
+size 190736

training/test/dataset_info.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "builder_name": "google_wellformed_query",
+  "citation": "@misc{faruqui2018identifying,\n      title={Identifying Well-formed Natural Language Questions},\n      author={Manaal Faruqui and Dipanjan Das},\n      year={2018},\n      eprint={1808.09419},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n",
+  "config_name": "default",
+  "dataset_name": "google_wellformed_query",
+  "dataset_size": 1230988,
+  "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.\n",
+  "download_checksums": {
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/train.tsv": {
+      "num_bytes": 805818,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/test.tsv": {
+      "num_bytes": 178070,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/dev.tsv": {
+      "num_bytes": 173131,
+      "checksum": null
+    }
+  },
+  "download_size": 1157019,
+  "features": {
+    "rating": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "content": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://github.com/google-research-datasets/query-wellformedness",
+  "license": "",
+  "size_in_bytes": 2388007,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 857383,
+      "num_examples": 17500,
+      "dataset_name": "google_wellformed_query"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 189499,
+      "num_examples": 3850,
+      "dataset_name": "google_wellformed_query"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 184106,
+      "num_examples": 3750,
+      "dataset_name": "google_wellformed_query"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

training/test/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "007669a06fb24065",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "test"
+}

training/train.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, Dataset
+from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, AlbertConfig
+from datasets import Dataset as HFDataset
+import pandas as pd
+import os
+# Ensure the /model/ directory exists
+model_dir = 'model'
+os.makedirs(model_dir, exist_ok=True)
+# Load datasets from the Arrow files
+train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
+val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
+test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')
+# Convert datasets to pandas DataFrame
+train_df = train_dataset.to_pandas()
+val_df = val_dataset.to_pandas()
+test_df = test_dataset.to_pandas()
+# Remove question marks at the end of each query
+train_df['content'] = train_df['content'].str.rstrip('?')
+val_df['content'] = val_df['content'].str.rstrip('?')
+test_df['content'] = test_df['content'].str.rstrip('?')
+# Convert labels to integers (0 or 1)
+train_df['rating'] = train_df['rating'].apply(lambda x: int(x >= 0.5))
+val_df['rating'] = val_df['rating'].apply(lambda x: int(x >= 0.5))
+test_df['rating'] = test_df['rating'].apply(lambda x: int(x >= 0.5))
+# Initialize ALBERT tokenizer
+tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+# Custom Dataset class for PyTorch
+class QueryDataset(Dataset):
+    def __init__(self, texts, labels, tokenizer, max_length=32):
+        self.texts = texts
+        self.labels = labels
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, idx):
+        text = str(self.texts[idx])
+        label = int(self.labels[idx])  # Ensure label is an integer
+        encoding = self.tokenizer.encode_plus(
+            text,
+            add_special_tokens=True,
+            max_length=self.max_length,
+            padding='max_length',  # Ensure consistent length
+            truncation=True,       # Truncate longer sequences
+            return_attention_mask=True,
+            return_tensors='pt'
+        )
+        return {
+            'input_ids': encoding['input_ids'].flatten(),
+            'attention_mask': encoding['attention_mask'].flatten(),
+            'label': torch.tensor(label, dtype=torch.long)
+        }
+# Prepare datasets
+train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
+val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
+test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)
+# DataLoaders
+batch_size = 128
+train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=batch_size)
+test_loader = DataLoader(test_dataset, batch_size=batch_size)
+# Load ALBERT model
+model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+# Optimizer and loss function
+optimizer = AdamW(model.parameters(), lr=1e-5)
+criterion = nn.CrossEntropyLoss()
+# Training loop
+epochs = 4
+for epoch in range(epochs):
+    model.train()
+    total_loss = 0
+    for batch in train_loader:
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        labels = batch['label'].to(device)
+        optimizer.zero_grad()
+        outputs = model(input_ids, attention_mask=attention_mask)
+        loss = criterion(outputs.logits, labels)
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    avg_loss = total_loss / len(train_loader)
+    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')
+    # Validation step at the end of each epoch
+    model.eval()
+    correct_predictions = 0
+    total_predictions = 0
+    with torch.no_grad():
+        for batch in val_loader:
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            labels = batch['label'].to(device)
+            outputs = model(input_ids, attention_mask=attention_mask)
+            preds = torch.argmax(outputs.logits, dim=1)
+            correct_predictions += (preds == labels).sum().item()
+            total_predictions += labels.size(0)
+    accuracy = correct_predictions / total_predictions
+    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')
+# Save the model, tokenizer, and config to /model/ directory
+model.save_pretrained(model_dir, safe_serialization=True)  # Save model weights in safetensors format
+tokenizer.save_pretrained(model_dir)
+# Update config with correct classifier details
+config = AlbertConfig.from_pretrained('albert-base-v2')
+config.num_labels = 2  # Set the number of labels for classification
+config.save_pretrained(model_dir)
+print(f"Model and all required files saved to {model_dir}")

training/train/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f13a5a3621b3b4b062d3b6f1958162b1c4f6c9235cf3f22b3841f5e4a23704d2
+size 861704

training/train/dataset_info.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "builder_name": "google_wellformed_query",
+  "citation": "@misc{faruqui2018identifying,\n      title={Identifying Well-formed Natural Language Questions},\n      author={Manaal Faruqui and Dipanjan Das},\n      year={2018},\n      eprint={1808.09419},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n",
+  "config_name": "default",
+  "dataset_name": "google_wellformed_query",
+  "dataset_size": 1230988,
+  "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.\n",
+  "download_checksums": {
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/train.tsv": {
+      "num_bytes": 805818,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/test.tsv": {
+      "num_bytes": 178070,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/dev.tsv": {
+      "num_bytes": 173131,
+      "checksum": null
+    }
+  },
+  "download_size": 1157019,
+  "features": {
+    "rating": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "content": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://github.com/google-research-datasets/query-wellformedness",
+  "license": "",
+  "size_in_bytes": 2388007,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 857383,
+      "num_examples": 17500,
+      "dataset_name": "google_wellformed_query"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 189499,
+      "num_examples": 3850,
+      "dataset_name": "google_wellformed_query"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 184106,
+      "num_examples": 3750,
+      "dataset_name": "google_wellformed_query"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

training/train/google_wellformed_query_dataset.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

training/train/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "5aec13d80b0bb552",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}

training/validation/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e07235be93ddd52fa59494a2ec702b7fbdd405e96c6069ba10d8588a45071d6
+size 185352

training/validation/dataset_info.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "builder_name": "google_wellformed_query",
+  "citation": "@misc{faruqui2018identifying,\n      title={Identifying Well-formed Natural Language Questions},\n      author={Manaal Faruqui and Dipanjan Das},\n      year={2018},\n      eprint={1808.09419},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n",
+  "config_name": "default",
+  "dataset_name": "google_wellformed_query",
+  "dataset_size": 1230988,
+  "description": "Google's query wellformedness dataset was created by crowdsourcing well-formedness annotations for 25,100 queries from the Paralex corpus. Every query was annotated by five raters each with 1/0 rating of whether or not the query is well-formed.\n",
+  "download_checksums": {
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/train.tsv": {
+      "num_bytes": 805818,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/test.tsv": {
+      "num_bytes": 178070,
+      "checksum": null
+    },
+    "https://raw.githubusercontent.com/google-research-datasets/query-wellformedness/master/dev.tsv": {
+      "num_bytes": 173131,
+      "checksum": null
+    }
+  },
+  "download_size": 1157019,
+  "features": {
+    "rating": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "content": {
+      "dtype": "string",
+      "_type": "Value"
+    }
+  },
+  "homepage": "https://github.com/google-research-datasets/query-wellformedness",
+  "license": "",
+  "size_in_bytes": 2388007,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 857383,
+      "num_examples": 17500,
+      "dataset_name": "google_wellformed_query"
+    },
+    "test": {
+      "name": "test",
+      "num_bytes": 189499,
+      "num_examples": 3850,
+      "dataset_name": "google_wellformed_query"
+    },
+    "validation": {
+      "name": "validation",
+      "num_bytes": 184106,
+      "num_examples": 3750,
+      "dataset_name": "google_wellformed_query"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

training/validation/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "cc2d3fe0964202f3",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "validation"
+}