Commit: larger datasets

Files changed:
- src/calibration_datasets.py (+7 -259)
- src/train_workflow.py (+1 -1)
src/calibration_datasets.py
CHANGED
@@ -1,5 +1,6 @@
 """Prepares the datasets for calibration. Original code gently shared by TheBloke"""

+import os
 from abc import ABC
 import time
 from typing import Dict, List, Optional
@@ -123,10 +124,10 @@ class CalibrationDataset(ABC):
         """Load the Hugging Face dataset at `path`, using the provided kwargs."""

         print(f"Loading HF dataset {path} with params: {kwargs}")
-        data: Dataset = load_dataset(path=path, **kwargs)
+        data: Dataset = load_dataset(path=path, streaming=True, num_proc=len(os.sched_getaffinity(0)), **kwargs)

         limit = limit and min(limit, len(data)) or len(data)
-        return data.
+        return data.shuffle(seed=42).take(range(limit))

     @staticmethod
     def list_with_nls(samples: List[str]) -> List[str]:
@@ -217,10 +218,10 @@ class WikitextDataset(CalibrationDataset):
     dataset = "wikitext"
     dataset_config = {
         "path": "wikitext",
-        "name": "wikitext-
+        "name": "wikitext-103-raw-v1",
         "split": "train"
     }
-    dataset_name = "
+    dataset_name = "Wikitext103 Full"

     def process_samples(self) -> List[str]:
         return [
@@ -234,272 +235,19 @@ class C4Dataset(CalibrationDataset):
     dataset_field = "text"
     dataset_config = {
         "path": "allenai/c4",
-        "data_files": {
-            "train": "en/c4-train.00000-of-01024.json.gz"
-        },
         "split": "train"
     }
     dataset_name = "C4"


-class ThaiDataset(CalibrationDataset):
-    dataset = "thai"
-    dataset_field = "text"
-    dataset_config = {
-        "path": "pbwt/all-thai",
-        "data_files": {
-            "train": "data/train-00000-of-00047-985fbaed08d034cf.parquet"
-        },
-        "split": "train"
-    }
-    dataset_name = "All Thai"
-
-
-class MovieScriptDataset(CalibrationDataset):
-    dataset = "movie-scripts"
-    dataset_field = "full_script"
-    dataset_config = {
-        "path": "jondurbin/cinematika-v0.1",
-        "data_files": { "train": "full_script.parquet" },
-        "split": "train"
-    }
-    dataset_name = "Cinematika Full Scripts"
-
-
-class JapaneseEnglishDataset(CalibrationDataset):
-    dataset = "japanese-english"
-    dataset_config = {
-        "path": "augmxnt/shisa-en-ja-dpo-v1",
-        "split": "train"
-    }
-    dataset_name = "Shisa English Japanese DPO"
-    randomize = True
-
-    def process_samples(self) -> List[str]:
-        def transform_samples(sample):
-            prompt = sample["prompt"]
-            chosen = sample["chosen"]
-            # prompt example: "[INST] <<SYS>>\nYou are a helpful, unbiased, uncensored assistant.\n<</SYS>>\n\nWhat are cardigans made of? Leather or wood? [/INST]"
-
-            try:
-                part1 = prompt.split('\n<</SYS>>\n\n')[1]
-                extracted_text = part1.split(' [/INST]')[0]
-            except Exception as e:
-                print(f"Error extracting text from prompt '{prompt}': {e}")
-                raise
-
-            prompt = extracted_text
-
-            return {"output": f"{prompt} {chosen}"}
-
-        return self.data.map(transform_samples)["output"]
-
-
-class PortugueseDataset(CalibrationDataset):
-    dataset = "portuguese"
-    dataset_config = {
-        "path": "adalbertojunior/portuguese_orca",
-        "split": "train"
-    }
-    dataset_name = "Portuguese Orca"
-    transform_fields = [ "question", "response" ]
-
-
-class MathsDataset(CalibrationDataset):
-    dataset = "maths"
-    dataset_config = {
-        "path": "andersonbcdefg/math",
-        "split": "train"
-    }
-    dataset_name = "CamelAI Math"
-    transform_fields = [ "message_1", "message_2" ]
-
-
-class MedicalDataset(CalibrationDataset):
-    dataset = "medical"
-    dataset_config = {
-        "path": "medalpaca/medical_meadow_wikidoc",
-        "split": "train"
-    }
-    dataset_name = "Medical Medaow WikiDoc"
-    transform_fields = [ "input", "output" ]
-
-
-class OpenInstructDataset(CalibrationDataset):
-    dataset = "open-instruct"
-    dataset_config = {
-        "path": "VMware/open-instruct",
-        "split": "train"
-    }
-    dataset_name = "VMware Open Instruct"
-    transform_fields = [ "instruction", "response" ]
-
-
-class KoreanDataset(CalibrationDataset):
-    dataset = "korean"
-    dataset_config = {
-        "path": "beomi/KoAlpaca-v1.1a",
-        "split": "train"
-    }
-    dataset_name = "Korean Alpaca"
-    transform_fields = [ "instruction", "output" ]
-
-
 class CodeDataset(CalibrationDataset):
     dataset = "code"
-    dataset_field = "output"
-    dataset_config = {
-        "path": "nickrosh/Evol-Instruct-Code-80k-v1",
-        "split": "train"
-    }
-    dataset_name = "Evol Instruct Code"
-
-
-class MultiLanguageDataset(CalibrationDataset):
-    dataset = "multi-language"
-    dataset_field = "text"
-    dataset_config = {
-        "path": "papluca/language-identification",
-        "split": "train"
-    }
-    dataset_name = "Language Identification"
-
-
-class RussianDataset(CalibrationDataset):
-    dataset = "russian"
-    dataset_config = {
-        "path": "Den4ikAI/russian_instructions_2",
-        "split": "train"
-    }
-    dataset_name = "Russian Instructions 2"
-    transform_fields = [ "question", "answer" ]
-
-
-class DutchDataset(CalibrationDataset):
-    dataset = "dutch"
-    dataset_config = {
-        "path": "BramVanroy/dolly-15k-dutch",
-        "split": "train"
-    }
-    dataset_name = "Dolly 15K Dutch"
-    transform_fields = [ "instruction", "context", "response" ]
-    transform_join = "{field1} {field2} {field3}"
-
-
-class VietnameseChineseDataset(CalibrationDataset):
-    dataset = "vietnamesechinese"
-    dataset_config = {
-        "path": "nRuaif/Vietnamese_x_Alpaca",
-        "split": "train"
-    }
-    dataset_name = "Vietnamese and Chinese"
-
-    def get_dataset_url(self) -> None:
-        return None
-
-    def process_samples(self) -> List[str]:
-        samples = self.data["output"]
-        chinese_samples = CalibrationDataset.get_dataset("chinese").get_samples()
-
-        joined_list = samples + chinese_samples
-
-        import random
-        random.shuffle(joined_list)
-
-        return joined_list[:self.dataset_limit]
-
-
-class VietnameseDataset(CalibrationDataset):
-    dataset = "vietnamese"
-    dataset_field = "output"
-    dataset_config = {
-        "path": "nRuaif/Vietnamese_x_Alpaca",
-        "split": "train"
-    }
-    dataset_name = "Alpaca Vietnamese"
-
-
-class ChineseDataset(CalibrationDataset):
-    dataset = "chinese"
-    dataset_config = {
-        "path": "TigerResearch/tigerbot-alpaca-zh-0.5m",
-        "split": "train"
-    }
-    dataset_name = "Tiger Alpaca ZH"
-    transform_fields = [ "instruction", "input", "output" ]
-    transform_join = "{field1} {field2} {field3}"
-
-
-class LatinEnglishDataset(CalibrationDataset):
-    dataset = "latin-english"
-    dataset_config = {
-        "path": "grosenthal/latin_english_parallel",
-        "split": "train"
-    }
-    dataset_name = "Latin English Parallel"
-    transform_fields = [ "la", "en" ]
-    transform_join = "{field1}\n{field2}"
-
-
-class PolishDataset(CalibrationDataset):
-    dataset = "polish"
     dataset_field = "content"
     dataset_config = {
-        "path": "
-        "split": "train"
-    }
-    dataset_name = "Polish News"
-
-
-class JapaneseDataset(CalibrationDataset):
-    dataset = "japanese"
-    dataset_field = "output"
-    dataset_config = {
-        "path": "fujiki/japanese_alpaca_data",
-        "split": "train"
-    }
-    dataset_name = "Alpaca Japanese"
-
-
-class SpanishDataset(CalibrationDataset):
-    dataset = "spanish"
-    dataset_field = "output"
-    dataset_config = {
-        "path": "bertin-project/alpaca-spanish",
-        "split": "train"
-    }
-    dataset_name = "Alpaca Spanish"
-
-
-class GermanDataset(CalibrationDataset):
-    dataset = "german"
-    dataset_config = {
-        "path": "deepset/germanquad",
-        "split": "train"
-    }
-    dataset_name = "German Quad"
-
-    def process_samples(self) -> List[str]:
-        def transform_samples(sample):
-            split_context = sample["context"].split("===")
-            if len(split_context) >= 3:
-                trans_context = split_context[2]
-            else:
-                trans_context = sample["context"]
-            return {"output": trans_context.strip()}
-
-        return self.data.map(transform_samples)["output"]
-
-
-class FrenchDataset(CalibrationDataset):
-    dataset = "french"
-    dataset_field = "text"
-    dataset_config = {
-        "path": "Kant1/French_Wikipedia_articles",
-        "data_files": { "wiki_00.txt" },
+        "path": "bigcode/the-stack",
         "split": "train"
     }
-    dataset_name = "
+    dataset_name = "The Stack"


 def validate_dataset(dataset_name: str, **kwargs):
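Note on the new loading code in the second hunk: `load_dataset(..., streaming=True)` returns an `IterableDataset`, which has no `len()`, so the untouched `limit = limit and min(limit, len(data)) or len(data)` line will not work unchanged; `IterableDataset.take()` also expects an integer count rather than a `range` object, and some `datasets` releases reject `num_proc` when combined with `streaming=True`. A minimal sketch of the streaming pattern, using an illustrative dataset path and limit that are not taken from this repo:

# Sketch only: assumes datasets>=2.x; dataset path and limit are placeholders.
from datasets import load_dataset

limit = 128  # hypothetical number of calibration samples

# streaming=True returns an IterableDataset that yields records lazily (no len()).
data = load_dataset("wikitext", name="wikitext-103-raw-v1", split="train", streaming=True)

# On a streaming dataset, shuffle() draws from a finite buffer and take() wants an int.
subset = data.shuffle(seed=42, buffer_size=10_000).take(limit)

samples = [row["text"] for row in subset]
print(f"collected {len(samples)} samples")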
src/train_workflow.py
CHANGED
@@ -31,7 +31,7 @@ DEFAULT_TRAINING_ARGS = \
     --num_train_epochs 1
     --per_device_train_batch_size 64
     --per_device_eval_batch_size 64
-    --gradient_accumulation_steps
+    --gradient_accumulation_steps 8
     --evaluation_strategy no
     --save_strategy no
     --weight_decay 0.0
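With `--per_device_train_batch_size 64` and `--gradient_accumulation_steps 8`, each optimizer step now accumulates 64 * 8 = 512 samples per device (times the number of devices when training data-parallel). A hedged sketch of how these flags map onto `transformers.TrainingArguments`, with a placeholder `output_dir` that is not part of the repo:

# Sketch only: mirrors the CLI flags above; output_dir is a placeholder.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./out",              # placeholder, not from the repo
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=8,   # effective per-device batch: 64 * 8 = 512
    evaluation_strategy="no",
    save_strategy="no",
    weight_decay=0.0,
)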