shoaibanwar13 commited on
Commit
7be98ad
·
verified ·
1 Parent(s): cac3183

Upload my-model.py

Browse files
Files changed (1) hide show
  1. my-model.py +40 -0
my-model.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune DistilBERT for binary sentiment classification on SST-2.

Loads the pre-trained ``distilbert-base-uncased`` checkpoint, tokenizes the
SST-2 dataset, fine-tunes on a 1,000-example subset for one epoch, evaluates
on 100 validation examples, and saves the resulting model and tokenizer to
``my-small-model``.
"""

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Checkpoint to fine-tune; num_labels=2 configures a binary classification head.
MODEL_NAME = "distilbert-base-uncased"
# Directory the fine-tuned model/tokenizer are saved to (loadable via from_pretrained).
SAVE_DIR = "my-small-model"


def main() -> None:
    """Run the fine-tuning pipeline end to end: load, tokenize, train, save."""
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )

    # SST-2 sentiment dataset; examples carry "sentence" and "label" columns.
    dataset = load_dataset("sst2")

    def tokenize_function(examples):
        # Pad/truncate every example to the model's max length so batches
        # are rectangular without a dynamic-padding collator.
        return tokenizer(examples["sentence"], padding="max_length", truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # transformers >= 4.46; keep the old name only if the installed version
    # still accepts it — confirm against the pinned dependency.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        logging_dir="./logs",
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        # Small, seed-fixed subsets keep the demo run fast and reproducible.
        train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(1000)),
        eval_dataset=tokenized_datasets["validation"].select(range(100)),
    )

    trainer.train()

    # Save both weights and tokenizer so SAVE_DIR is a self-contained,
    # directly loadable checkpoint directory.
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)


if __name__ == "__main__":
    # Guard so importing this module never triggers a training run.
    main()