pizb committed on
Commit
891ef77
·
1 Parent(s): ece63b7

feat: change dataset to custom dataset

Browse files
Files changed (2) hide show
  1. Readme.md +19 -0
  2. article_base_train_test.py +83 -66
Readme.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dataset Structure
2
+
3
+ /custom_vqa_project/
4
+ │
5
+ ├── /dataset/
6
+ │   ├── /images/
7
+ │   │   ├── train/
8
+ │   │   │   ├── image1.jpg
9
+ │   │   │   ├── image2.jpg
10
+ │   │   └── val/
11
+ │   │       ├── image3.jpg
12
+ │   │       └── image4.jpg
13
+ │   ├── train.json # Metadata for the training set
14
+ │   └── val.json # Metadata for the validation set
15
+ │
16
+ ├── /scripts/
17
+ │   └── train.py # Your fine-tuning script
18
+ │
19
+ └── README.md
article_base_train_test.py CHANGED
@@ -1,80 +1,97 @@
1
- from huggingface_hub import notebook_login
2
- from datasets import load_dataset
 
3
  from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
4
  import torch
 
5
  from peft import get_peft_model, LoraConfig
6
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def main():
9
- ds = load_dataset('HuggingFaceM4/VQAv2', split="train", trust_remote_code=True)
10
- cols_remove = ["question_type", "answers", "answer_type", "image_id", "question_id"]
11
- ds = ds.remove_columns(cols_remove)
12
- ds = ds.train_test_split(test_size=0.1)
13
- train_ds = ds["train"]
14
- val_ds = ds["test"]
15
-
16
- model_id = "google/paligemma-3b-pt-224"
17
- processor = PaliGemmaProcessor.from_pretrained(model_id)
18
- image_token = processor.tokenizer.convert_tokens_to_ids("<image>")
19
- device = "cuda"
20
 
21
- bnb_config = BitsAndBytesConfig(
22
  load_in_4bit=True,
23
  bnb_4bit_quant_type="nf4",
24
  bnb_4bit_compute_type=torch.bfloat16
25
- )
26
- lora_config = LoraConfig(
27
- r=8,
28
- target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
29
- task_type="CAUSAL_LM",
30
- )
31
- model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
32
- model = get_peft_model(model, lora_config)
33
- model.print_trainable_parameters()
34
- #trainable params: 11,298,816 || all params: 2,934,634,224 || trainable%: 0.38501616002417344
35
-
36
- args=TrainingArguments(
37
- num_train_epochs=2,
38
- remove_unused_columns=False,
39
- per_device_train_batch_size=16,
40
- gradient_accumulation_steps=4,
41
- warmup_steps=2,
42
- learning_rate=2e-5,
43
- weight_decay=1e-6,
44
- adam_beta2=0.999,
45
- logging_steps=100,
46
- # optim="adamw_hf",
47
- optim="paged_adamw_8bit", # for QLoRA
48
- save_strategy="steps",
49
- save_steps=1000,
50
- push_to_hub=True,
51
- save_total_limit=1,
52
- bf16=True,
53
- report_to=["tensorboard"],
54
- dataloader_pin_memory=False
55
- )
56
-
57
- def collate_fn(examples):
58
- texts = ["answer " + example["question"] for example in examples]
59
- labels= [example['multiple_choice_answer'] for example in examples] # We probably don't need labels?
60
- images = [example["image"].convert("RGB") for example in examples]
61
- tokens = processor(text=texts, images=images, suffix=labels,
62
- return_tensors="pt", padding="longest")
63
 
64
- tokens = tokens.to(torch.bfloat16).to(device)
65
- return tokens
66
-
67
- trainer = Trainer(
68
- model=model,
69
- train_dataset=train_ds,
70
- eval_dataset=val_ds,
71
- data_collator=collate_fn,
72
- args=args
73
- )
74
-
75
- trainer.train()
 
 
 
 
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == "__main__":
79
- notebook_login()
80
- main()
 
1
+ import os
2
+ import json
3
+ from datasets import load_dataset, Dataset
4
  from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
5
  import torch
6
+ from PIL import Image
7
  from peft import get_peft_model, LoraConfig
8
 
9
+ # Function to load custom dataset
10
def load_custom_dataset(json_file, image_folder):
    """Build a Hugging Face ``Dataset`` from a VQA metadata JSON file.

    Args:
        json_file: Path to a JSON file holding an iterable of records
            (presumably a JSON array of objects). Every record must provide
            the keys ``question``, ``image_id``, ``answer`` and
            ``multiple_choice_answer``.
        image_folder: Directory that each record's ``image_id`` is joined
            onto to form the image path.

    Returns:
        A ``Dataset`` with the columns ``question``, ``image``, ``answer``
        and ``multiple_choice_answer``. The ``image`` column stores file
        paths as strings, not decoded images.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # JSON is UTF-8 by specification; being explicit keeps non-ASCII
    # questions/answers from depending on the platform's locale encoding.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Column-oriented layout expected by Dataset.from_dict.
    return Dataset.from_dict({
        'question': [item['question'] for item in data],
        'image': [os.path.join(image_folder, item['image_id']) for item in data],
        'answer': [item['answer'] for item in data],
        'multiple_choice_answer': [item['multiple_choice_answer'] for item in data],
    })
32
+
33
+ # Main training function
34
def main():
    """Fine-tune PaliGemma with QLoRA on the custom VQA dataset.

    Loads the train and validation splits from ``dataset/``, loads
    ``google/paligemma-3b-pt-224`` with 4-bit quantization, wraps it with
    LoRA adapters and runs the Hugging Face ``Trainer``. The model is placed
    on device 0 and batches are moved to ``"cuda"``, so a CUDA GPU is
    required.
    """
    # Load custom dataset
    train_ds = load_custom_dataset('dataset/train.json', 'dataset/images/train')
    val_ds = load_custom_dataset('dataset/val.json', 'dataset/images/val')

    model_id = "google/paligemma-3b-pt-224"
    processor = PaliGemmaProcessor.from_pretrained(model_id)
    device = "cuda"

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        # The option is spelled `bnb_4bit_compute_dtype`; a
        # `bnb_4bit_compute_type` keyword is not a recognised setting, so
        # bf16 compute would not actually be applied with that spelling.
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    lora_config = LoraConfig(
        r=8,
        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )

    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map={"": 0},
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    args = TrainingArguments(
        num_train_epochs=2,
        # Keep the raw columns: collate_fn reads them directly from each row.
        remove_unused_columns=False,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        learning_rate=2e-5,
        weight_decay=1e-6,
        logging_steps=100,
        optim="paged_adamw_8bit",
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=1,
        bf16=True,
        report_to=["tensorboard"],
        dataloader_pin_memory=False,
    )

    # Custom collate function
    def collate_fn(examples):
        """Turn a list of dataset rows into one padded model-input batch."""
        texts = ["answer " + example["question"] for example in examples]
        labels = [example['multiple_choice_answer'] for example in examples]
        images = []
        for example in examples:
            # `examples` is a list of row dicts, so the image path has to be
            # read per row. The context manager closes the file handle once
            # the RGB copy has been made.
            with Image.open(example["image"]) as img:
                images.append(img.convert("RGB"))
        tokens = processor(text=texts, images=images, suffix=labels, return_tensors="pt", padding="longest")
        tokens = tokens.to(torch.bfloat16).to(device)
        return tokens

    trainer = Trainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=collate_fn,
        args=args,
    )

    trainer.train()
95
 
96
  if __name__ == "__main__":
97
+ main()