dh-mc committed

Commit d028752 · Parent: cf912f1

rtx4090 0-shot
data/Llama3.1-8B-Chinese-Chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Mistral-7B-v0.3-Chinese-Chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Qwen2-7B-Instruct_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/internlm2_5-7b-chat-1m_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/internlm2_5-7b-chat_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
llm_toolkit/eval_shots.py CHANGED

@@ -117,6 +117,7 @@ def evaluate_model_with_num_shots(
         tokenizer=tokenizer,
         chinese_prompt=not use_english_datasets,
         using_p1=False,
+        num_shots=num_shots,
     )
     if len(sys.argv) > 1:
         num = int(sys.argv[1])

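Read together with the logical_reasoning_utils.py hunk below, this change forwards the shot count chosen by evaluate_model_with_num_shots into dataset loading instead of always building a zero-shot prompt. A minimal sketch of the assumed call site is shown here; the enclosing loop and variable names such as data_path are illustrative assumptions, not verbatim from eval_shots.py:

# Hypothetical sketch: how the updated call is presumably issued for each
# shot count being evaluated (only the keyword arguments appear in the diff).
for num_shots in [0, 5, 10]:  # assumed values; the real ones come from the script
    datasets = load_logical_reasoning_dataset(
        data_path,
        tokenizer=tokenizer,
        chinese_prompt=not use_english_datasets,
        using_p1=False,
        num_shots=num_shots,  # new parameter: 0 keeps the original zero-shot prompt
    )
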
llm_toolkit/logical_reasoning_utils.py CHANGED

@@ -263,7 +263,12 @@ def save_results(model_name, results_path, dataset, predictions, debug=False):
 
 
 def load_logical_reasoning_dataset(
-    data_path, tokenizer=None, using_p1=True, chinese_prompt=True, test_data=None
+    data_path,
+    tokenizer=None,
+    using_p1=True,
+    chinese_prompt=True,
+    test_data=None,
+    num_shots=0,
 ):
     postfix = "" if chinese_prompt else "_en"
     train_data_file = data_path + f"/train{postfix}.csv"
@@ -276,7 +281,11 @@ def load_logical_reasoning_dataset(
     )
 
     if tokenizer:
-        reasoning_prompt = get_prompt_template(using_p1, chinese_prompt)
+        reasoning_prompt = (
+            get_prompt_template(using_p1, chinese_prompt)
+            if num_shots == 0
+            else get_few_shot_prompt_template(num_shots, datasets["train"].to_pandas())
+        )
 
         def formatting_prompts_func(examples):
             inputs = examples["text"]

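get_few_shot_prompt_template is referenced here but not defined in this diff, so its exact behavior is not visible. As a rough, hypothetical sketch of the pattern the call implies (a prompt template that prepends num_shots worked examples from the train split, leaving a slot for the current input), one might write:

import pandas as pd

def get_few_shot_prompt_template(num_shots: int, train_df: pd.DataFrame) -> str:
    # Hypothetical sketch only -- the real implementation lives elsewhere in
    # llm_toolkit and may differ. Assumes the train split exposes "text" and
    # "label" columns; only "text" is confirmed by formatting_prompts_func above.
    examples = train_df.head(num_shots)
    shots = "\n\n".join(
        f"Text: {row['text']}\nAnswer: {row['label']}"
        for _, row in examples.iterrows()
    )
    # Leave "{}" as the slot for the test-time input, assuming the returned
    # template is later filled with each example's text like reasoning_prompt.
    return shots + "\n\nText: {}\nAnswer: "

With num_shots=0 (the setting named in this commit), the new branch is skipped entirely and get_prompt_template is used exactly as before.
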
llm_toolkit/translation_engine.py DELETED

@@ -1,130 +0,0 @@
-import os
-import pandas as pd
-import torch
-from unsloth import FastLanguageModel, is_bfloat16_supported
-from trl import SFTTrainer
-from transformers import TrainingArguments, TextStreamer
-from llm_toolkit.translation_utils import *
-from llamafactory.chat import ChatModel
-
-print(f"loading {__file__}")
-
-
-def get_model_names(
-    model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
-):
-    hub_model = model_name.split("/")[-1] + "-MAC-"
-    local_model = "models/" + hub_model
-
-    return {
-        "local": local_model + save_method,
-        "local-gguf": local_model + quantization_method,
-        "hub": hub_model + save_method,
-        "hub-gguf": hub_model + "gguf-" + quantization_method,
-    }
-
-
-def load_model(
-    model_name,
-    max_seq_length=2048,
-    dtype=None,
-    load_in_4bit=False,
-    template="chatml",
-    adapter_name_or_path=None,
-):
-    print(f"loading model: {model_name}")
-
-    if adapter_name_or_path:
-        args = dict(
-            model_name_or_path=model_name,
-            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
-            template=template,  # same to the one in training
-            finetuning_type="lora",  # same to the one in training
-            quantization_bit=4,  # load 4-bit quantized model
-        )
-        chat_model = ChatModel(args)
-        return chat_model.engine.model, chat_model.engine.tokenizer
-
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
-        max_seq_length=max_seq_length,
-        dtype=dtype,
-        load_in_4bit=load_in_4bit,
-        trust_remote_code=True,
-    )
-    FastLanguageModel.for_inference(model)
-
-    return model, tokenizer
-
-
-def test_model(model, tokenizer, prompt):
-    inputs = tokenizer(
-        [prompt],
-        return_tensors="pt",
-    ).to("cuda")
-
-    text_streamer = TextStreamer(tokenizer)
-
-    _ = model.generate(
-        **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
-    )
-
-
-def load_trainer(
-    model,
-    tokenizer,
-    dataset,
-    num_train_epochs,
-    max_seq_length=2048,
-    fp16=False,
-    bf16=False,
-    output_dir="./outputs",
-):
-    model = FastLanguageModel.get_peft_model(
-        model,
-        r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
-        target_modules=[
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "up_proj",
-            "down_proj",
-        ],
-        lora_alpha=16,
-        lora_dropout=0,  # Supports any, but = 0 is optimized
-        bias="none",  # Supports any, but = "none" is optimized
-        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
-        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
-        random_state=3407,
-        use_rslora=False,  # We support rank stabilized LoRA
-        loftq_config=None,  # And LoftQ
-    )
-
-    trainer = SFTTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-        dataset_text_field="text",
-        max_seq_length=max_seq_length,
-        dataset_num_proc=2,
-        packing=False,  # Can make training 5x faster for short sequences.
-        args=TrainingArguments(
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=4,
-            warmup_steps=5,
-            num_train_epochs=num_train_epochs,
-            learning_rate=2e-4,
-            fp16=not is_bfloat16_supported(),
-            bf16=is_bfloat16_supported(),
-            logging_steps=100,
-            optim="adamw_8bit",
-            weight_decay=0.01,
-            lr_scheduler_type="linear",
-            seed=3407,
-            output_dir=output_dir,
-        ),
-    )
-
-    return trainer

llm_toolkit/translation_utils.py DELETED

@@ -1,420 +0,0 @@
-import os
-import re
-import pandas as pd
-import evaluate
-import seaborn as sns
-import matplotlib.pyplot as plt
-from datasets import load_dataset
-from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
-from tqdm import tqdm
-
-print(f"loading {__file__}")
-
-bleu = evaluate.load("bleu")
-rouge = evaluate.load("rouge")
-meteor = evaluate.load("meteor")
-accuracy = evaluate.load("accuracy")
-
-
-def extract_answer(text, debug=False):
-    if text:
-        # Remove the begin and end tokens
-        text = re.sub(
-            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 1:", text)
-
-        text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
-        if debug:
-            print("--------\nstep 2:", text)
-
-        text = re.sub(
-            r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
-        )
-        if debug:
-            print("--------\nstep 3:", text)
-
-    return text
-
-
-def calc_metrics(references, predictions, debug=False):
-    assert len(references) == len(
-        predictions
-    ), f"lengths are difference: {len(references)} != {len(predictions)}"
-
-    predictions = [extract_answer(text) for text in predictions]
-
-    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
-    accuracy = sum(correct) / len(references)
-
-    results = {"accuracy": accuracy}
-    if debug:
-        correct_ids = [i for i, c in enumerate(correct) if c == 1]
-        results["correct_ids"] = correct_ids
-
-    results["meteor"] = meteor.compute(predictions=predictions, references=references)[
-        "meteor"
-    ]
-
-    results["bleu_scores"] = bleu.compute(
-        predictions=predictions, references=references, max_order=4
-    )
-    results["rouge_scores"] = rouge.compute(
-        predictions=predictions, references=references
-    )
-    return results
-
-
-def save_results(model_name, results_path, dataset, predictions, debug=False):
-    if not os.path.exists(results_path):
-        # Get the directory part of the file path
-        dir_path = os.path.dirname(results_path)
-
-        # Create all directories in the path (if they don't exist)
-        os.makedirs(dir_path, exist_ok=True)
-        df = dataset.to_pandas()
-        df.drop(columns=["text", "prompt"], inplace=True)
-    else:
-        df = pd.read_csv(results_path, on_bad_lines="warn")
-
-    df[model_name] = predictions
-
-    if debug:
-        print(df.head(1))
-
-    df.to_csv(results_path, index=False)
-
-
-def load_translation_dataset(data_path, tokenizer=None):
-    train_data_file = data_path.replace(".tsv", "-train.tsv")
-    test_data_file = data_path.replace(".tsv", "-test.tsv")
-
-    if not os.path.exists(train_data_file):
-        print("generating train/test data files")
-        dataset = load_dataset(
-            "csv", data_files=data_path, delimiter="\t", split="train"
-        )
-        print(len(dataset))
-        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
-
-        datasets = dataset.train_test_split(test_size=0.2)
-        print(len(dataset))
-
-        # Convert to pandas DataFrame
-        train_df = pd.DataFrame(datasets["train"])
-        test_df = pd.DataFrame(datasets["test"])
-
-        # Save to TSV
-        train_df.to_csv(train_data_file, sep="\t", index=False)
-        test_df.to_csv(test_data_file, sep="\t", index=False)
-
-    print("loading train/test data files")
-    datasets = load_dataset(
-        "csv",
-        data_files={"train": train_data_file, "test": test_data_file},
-        delimiter="\t",
-    )
-
-    if tokenizer:
-        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
-
-        def formatting_prompts_func(examples):
-            inputs = examples["chinese"]
-            outputs = examples["english"]
-
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are an expert in translating Chinese to English.",
-                },
-                None,
-            ]
-
-            model_name = os.getenv("MODEL_NAME")
-
-            if "mistral" in model_name.lower():
-                messages = messages[1:]
-
-            texts = []
-            prompts = []
-            for input, output in zip(inputs, outputs):
-                prompt = translation_prompt.format(input)
-                messages[-1] = {"role": "user", "content": prompt}
-
-                prompt = tokenizer.apply_chat_template(
-                    messages, tokenize=False, add_generation_prompt=True
-                )
-                prompts.append(prompt)
-                texts.append(prompt + output + tokenizer.eos_token)
-            return {"text": texts, "prompt": prompts}
-
-        datasets = datasets.map(
-            formatting_prompts_func,
-            batched=True,
-        )
-
-    print(datasets)
-    return datasets
-
-
-def eval_model(model, tokenizer, eval_dataset):
-    total = len(eval_dataset)
-    predictions = []
-    for i in tqdm(range(total)):
-        inputs = tokenizer(
-            eval_dataset["prompt"][i : i + 1],
-            return_tensors="pt",
-        ).to("cuda")
-
-        outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
-        decoded_output = tokenizer.batch_decode(outputs)
-        debug = i == 0
-        decoded_output = [
-            extract_answer(output, debug=debug) for output in decoded_output
-        ]
-        predictions.extend(decoded_output)
-
-    return predictions
-
-
-def save_model(
-    model,
-    tokenizer,
-    include_gguf=True,
-    include_merged=True,
-    publish=True,
-):
-    try:
-        token = os.getenv("HF_TOKEN") or None
-        model_name = os.getenv("MODEL_NAME")
-
-        save_method = "lora"
-        quantization_method = "q5_k_m"
-
-        model_names = get_model_names(
-            model_name, save_method=save_method, quantization_method=quantization_method
-        )
-
-        model.save_pretrained(model_names["local"])
-        tokenizer.save_pretrained(model_names["local"])
-
-        if publish:
-            model.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-            tokenizer.push_to_hub(
-                model_names["hub"],
-                token=token,
-            )
-
-        if include_merged:
-            model.save_pretrained_merged(
-                model_names["local"] + "-merged", tokenizer, save_method=save_method
-            )
-            if publish:
-                model.push_to_hub_merged(
-                    model_names["hub"] + "-merged",
-                    tokenizer,
-                    save_method="lora",
-                    token="",
-                )
-
-        if include_gguf:
-            model.save_pretrained_gguf(
-                model_names["local-gguf"],
-                tokenizer,
-                quantization_method=quantization_method,
-            )
-
-            if publish:
-                model.push_to_hub_gguf(
-                    model_names["hub-gguf"],
-                    tokenizer,
-                    quantization_method=quantization_method,
-                    token=token,
-                )
-    except Exception as e:
-        print(e)
-
-
-def get_metrics(df):
-    metrics_df = pd.DataFrame(df.columns.T)[2:]
-    metrics_df.rename(columns={0: "model"}, inplace=True)
-    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/")[-1])
-    metrics_df.reset_index(inplace=True)
-    metrics_df = metrics_df.drop(columns=["index"])
-
-    accuracy = []
-    meteor = []
-    bleu_1 = []
-    rouge_l = []
-    all_metrics = []
-    for col in df.columns[2:]:
-        metrics = calc_metrics(df["english"], df[col], debug=True)
-        print(f"{col}: {metrics}")
-
-        accuracy.append(metrics["accuracy"])
-        meteor.append(metrics["meteor"])
-        bleu_1.append(metrics["bleu_scores"]["bleu"])
-        rouge_l.append(metrics["rouge_scores"]["rougeL"])
-        all_metrics.append(metrics)
-
-    metrics_df["accuracy"] = accuracy
-    metrics_df["meteor"] = meteor
-    metrics_df["bleu_1"] = bleu_1
-    metrics_df["rouge_l"] = rouge_l
-    metrics_df["all_metrics"] = all_metrics
-
-    return metrics_df
-
-
-def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
-    plt.figure(figsize=figsize)
-    df_melted = pd.melt(
-        metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
-    )
-
-    barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)
-
-    # Set different hatches for each model
-    hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]
-
-    # Create a dictionary to map models to hatches
-    model_hatches = {
-        model: hatches[i % len(hatches)]
-        for i, model in enumerate(metrics_df["model"].unique())
-    }
-
-    # Apply hatches based on the model
-    num_vars = len(df_melted["variable"].unique())
-    for i, bar in enumerate(barplot.patches):
-        model = df_melted["model"].iloc[i // num_vars]
-        bar.set_hatch(model_hatches[model])
-
-    # Manually update legend to match the bar hatches
-    handles, labels = barplot.get_legend_handles_labels()
-    for handle, model in zip(handles, metrics_df["model"].unique()):
-        handle.set_hatch(model_hatches[model])
-
-    barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
-    for p in barplot.patches:
-        if p.get_height() == 0:
-            continue
-        barplot.annotate(
-            f"{p.get_height():.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_height()),
-            ha="center",
-            va="center",
-            xytext=(0, 10),
-            textcoords="offset points",
-        )
-
-    barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
-    plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
-    plt.show()
-
-
-def plot_times(perf_df, ylim=0.421):
-    # Adjusted code to put "train-time" bars in red at the bottom
-
-    fig, ax1 = plt.subplots(figsize=(12, 10))
-
-    color_train = "tab:red"
-    color_eval = "orange"
-    ax1.set_xlabel("Models")
-    ax1.set_ylabel("Time (mins)")
-    ax1.set_xticks(range(len(perf_df["model"])))  # Set x-ticks positions
-    ax1.set_xticklabels(perf_df["model"], rotation=90)
-
-    # Plot "train-time" first so it's at the bottom
-    ax1.bar(
-        perf_df["model"],
-        perf_df["train-time(mins)"],
-        color=color_train,
-        label="train-time",
-    )
-
-    # Then, plot "eval-time" on top of "train-time"
-    ax1.bar(
-        perf_df["model"],
-        perf_df["eval-time(mins)"],
-        bottom=perf_df["train-time(mins)"],
-        color=color_eval,
-        label="eval-time",
-    )
-
-    ax1.tick_params(axis="y")
-    ax1.legend(loc="upper left")
-
-    if "meteor" in perf_df.columns:
-        ax2 = ax1.twinx()
-        color_meteor = "tab:blue"
-        ax2.set_ylabel("METEOR", color=color_meteor)
-        ax2.plot(
-            perf_df["model"],
-            perf_df["meteor"],
-            color=color_meteor,
-            marker="o",
-            label="meteor",
-        )
-        ax2.tick_params(axis="y", labelcolor=color_meteor)
-        ax2.legend(loc="upper right")
-        ax2.set_ylim(ax2.get_ylim()[0], ylim)
-
-    # Show numbers in bars
-    for p in ax1.patches:
-        height = p.get_height()
-        if height == 0:  # Skip bars with height 0
-            continue
-        ax1.annotate(
-            f"{height:.2f}",
-            (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
-            ha="center",
-            va="center",
-            xytext=(0, -10),
-            textcoords="offset points",
-        )
-
-    fig.tight_layout()
-    plt.show()
-
-
-def translate_via_llm(text):
-    base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
-    llm = ChatOpenAI(
-        model="gpt-4o",
-        temperature=0,
-        max_tokens=None,
-        timeout=None,
-        max_retries=2,
-        base_url=base_url,
-    )
-
-    prompt = ChatPromptTemplate.from_messages(
-        [
-            (
-                "human",
-                "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
-            ),
-        ]
-    )
-
-    chain = prompt | llm
-    response = chain.invoke(
-        {
-            "input": text,
-        }
-    )
-    return response.content
-
-
-def translate(text, cache_dict):
-    if text in cache_dict:
-        return cache_dict[text]
-    else:
-        translated_text = translate_via_llm(text)
-        cache_dict[text] = translated_text
-        return translated_text