LIFineTuned

Paused

App Files Files Community

alexkueck committed on Jun 19, 2023

Commit

72b1673

1 Parent(s): 4dc9c10

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -195

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import gradio as gr
 import torch
 from utils import *
 from presets import *
 ######################################################################
@@ -19,12 +20,9 @@ base_model = "project-baize/baize-v2-7b"  #load_8bit = False (in load_tokenizer_
 tokenizer,model,device = load_tokenizer_and_model(base_model,False)
 dataset_neu = daten_laden("alexkueck/tis")
-###################################
-#Vorbereiten für das training der neuen Daten
-#Datensets in den Tokenizer schieben...
-def tokenize_function(examples):
-    return tokenizer(examples["text"])
 #alles zusammen auf das neue datenset anwenden - batched = True und 4 Prozesse, um die Berechnung zu beschleunigen. Die "text" - Spalte braucht man anschließend nicht mehr, daher weglassen.
 tokenized_datasets = dataset_neu.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
@@ -36,203 +34,84 @@ tokenized_datasets = dataset_neu.map(tokenize_function, batched=True, num_proc=4
 # block_size = tokenizer.model_max_length
 block_size = 128
-########################################################################
-#Chat KI nutzen, um Text zu generieren...
-def predict(text,
-            chatbotGr,
-            history,
-            top_p,
-            temperature,
-            max_length_tokens,
-            max_context_length_tokens,):
-    if text=="":
-        yield chatbotGr,history,"Empty context."
-        return
-    try:
-        model
-    except:
-        yield [[text,"No Model Found"]],[],"No Model Found"
-        return
-    inputs = generate_prompt_with_history(text,history,tokenizer,max_length=max_context_length_tokens)
-    if inputs is None:
-        yield chatbotGr,history,"Input too long."
-        return
-    else:
-        prompt,inputs=inputs
-        begin_length = len(prompt)
-    input_ids = inputs["input_ids"][:,-max_context_length_tokens:].to(device)
-    torch.cuda.empty_cache()
-    #torch.no_grad() bedeutet, dass für die betreffenden tensoren keine Ableitungen berechnet werden bei der backpropagation
-    #hier soll das NN ja auch nicht geändert werden 8backprop ist nicht nötig), da es um interference-prompts geht!
-    with torch.no_grad():
-        #die vergangenen prompts werden alle als Tupel in history abgelegt sortiert nach 'Human' und 'AI'- dass sind daher auch die stop-words, die den jeweils nächsten Eintrag kennzeichnen
-        for x in greedy_search(input_ids,model,tokenizer,stop_words=["[|Human|]", "[|AI|]"],max_length=max_length_tokens,temperature=temperature,top_p=top_p):
-            if is_stop_word_or_prefix(x,["[|Human|]", "[|AI|]"]) is False:
-                if "[|Human|]" in x:
-                    x = x[:x.index("[|Human|]")].strip()
-                if "[|AI|]" in x:
-                    x = x[:x.index("[|AI|]")].strip()
-                x = x.strip()
-                a, b=   [[y[0],convert_to_markdown(y[1])] for y in history]+[[text, convert_to_markdown(x)]],history + [[text,x]]
-                yield a, b, "Generating..."
-            if shared_state.interrupted:
-                shared_state.recover()
-                try:
-                    yield a, b, "Stop: Success"
-                    return
-                except:
-                    pass
-    del input_ids
-    gc.collect()
-    torch.cuda.empty_cache()
-    try:
-        yield a,b,"Generate: Success"
-    except:
-        pass
-def reset_chat():
-    #id_new = chatbot.new_conversation()
-    #chatbot.change_conversation(id_new)
-    reset_textbox()
-##########################################################
-#Übersetzungs Ki nutzen
-def translate():
-    return "Kommt noch!"
-#Programmcode KI
-def coding():
-    return "Kommt noch!"
 #######################################################################
 #Darstellung mit Gradio
-with open("custom.css", "r", encoding="utf-8") as f:
-    customCSS = f.read()
-with gr.Blocks(theme=small_and_beautiful_theme) as demo:
-    history = gr.State([])
-    user_question = gr.State("")
-    gr.Markdown("KIs am LI - wähle aus, was du bzgl. KI-Bots ausprobieren möchtest!")
-    with gr.Tabs():
-        with gr.TabItem("LI-Chat"):
-            with gr.Row():
-                gr.HTML(title)
-                status_display = gr.Markdown("Erfolg", elem_id="status_display")
-            gr.Markdown(description_top)
-            with gr.Row(scale=1).style(equal_height=True):
-                with gr.Column(scale=5):
-                    with gr.Row(scale=1):
-                        chatbotGr = gr.Chatbot(elem_id="LI_chatbot").style(height="100%")
-                    with gr.Row(scale=1):
-                        with gr.Column(scale=12):
-                            user_input = gr.Textbox(
-                                show_label=False, placeholder="Gib deinen Text / Frage ein."
-                            ).style(container=False)
-                        with gr.Column(min_width=100, scale=1):
-                            submitBtn = gr.Button("Absenden")
-                        with gr.Column(min_width=100, scale=1):
-                            cancelBtn = gr.Button("Stoppen")
-                    with gr.Row(scale=1):
-                        emptyBtn = gr.Button(
-                            "🧹 Neuer Chat",
-                        )
-                with gr.Column():
-                    with gr.Column(min_width=50, scale=1):
-                        with gr.Tab(label="Parameter zum Model"):
-                            gr.Markdown("# Parameters")
-                            top_p = gr.Slider(
-                                minimum=-0,
-                                maximum=1.0,
-                                value=0.95,
-                                step=0.05,
-                                interactive=True,
-                                label="Top-p",
-                            )
-                            temperature = gr.Slider(
-                                minimum=0.1,
-                                maximum=2.0,
-                                value=1,
-                                step=0.1,
-                                interactive=True,
-                                label="Temperature",
-                            )
-                            max_length_tokens = gr.Slider(
-                                minimum=0,
-                                maximum=512,
-                                value=512,
-                                step=8,
-                                interactive=True,
-                                label="Max Generation Tokens",
-                            )
-                            max_context_length_tokens = gr.Slider(
-                                minimum=0,
-                                maximum=4096,
-                                value=2048,
-                                step=128,
-                                interactive=True,
-                                label="Max History Tokens",
-                            )
-            gr.Markdown(description)
-        with gr.TabItem("Übersetzungen"):
-            with gr.Row():
-                    gr.Textbox(
-                                show_label=False, placeholder="Ist noch in Arbeit..."
-                            ).style(container=False)
-        with gr.TabItem("Code-Generierungen"):
-            with gr.Row():
-                    gr.Textbox(
-                                show_label=False, placeholder="Ist noch in Arbeit..."
-                            ).style(container=False)
-    predict_args = dict(
-        fn=predict,
-        inputs=[
-            user_question,
-            chatbotGr,
-            history,
-            top_p,
-            temperature,
-            max_length_tokens,
-            max_context_length_tokens,
-        ],
-        outputs=[chatbotGr, history, status_display],
-        show_progress=True,
-    )
-    #neuer Chat
-    reset_args = dict(
-        #fn=reset_chat, inputs=[], outputs=[user_input, status_display]
-        fn=reset_textbox, inputs=[], outputs=[user_input, status_display]
-    )
-    # Chatbot
-    transfer_input_args = dict(
-        fn=transfer_input, inputs=[user_input], outputs=[user_question, user_input, submitBtn], show_progress=True
-    )
-    #Listener auf Start-Click auf Button oder Return
-    predict_event1 = user_input.submit(**transfer_input_args).then(**predict_args)
-    predict_event2 = submitBtn.click(**transfer_input_args).then(**predict_args)
-    #Listener, Wenn reset...
-    emptyBtn.click(
-        reset_state,
-        outputs=[chatbotGr, history, status_display],
-        show_progress=True,
-    )
-    emptyBtn.click(**reset_args)
-demo.title = "LI Chat"
-#demo.queue(concurrency_count=1).launch(share=True)
-demo.queue(concurrency_count=1).launch(debug=True)

 import torch
 from utils import *
 from presets import *
+from transformers import Trainer, TrainingArguments
 ######################################################################
 tokenizer,model,device = load_tokenizer_and_model(base_model,False)
 dataset_neu = daten_laden("alexkueck/tis")
+#####################################################
+# Helper functions for training
+# They are defined first because the module-level code below passes them
+# to Dataset.map() while the script is still being executed top to bottom.
+#####################################################
+def tokenize_function(examples):
+    """Tokenize the "text" column of a batch of dataset examples.
+
+    Args:
+        examples: Batch handed over by ``Dataset.map(batched=True)``; only
+            its "text" entry is read.
+
+    Returns:
+        The result of calling the module-level ``tokenizer`` on that column.
+    """
+    return tokenizer(examples["text"])
+
+
+def group_texts(examples):
+    """Concatenate a tokenized batch and re-cut it into ``block_size`` chunks.
+
+    Args:
+        examples: Batch of tokenized columns (each value is a list of lists),
+            as produced by mapping ``tokenize_function`` over the dataset.
+
+    Returns:
+        A dict with the same keys, where every value is a list of chunks of
+        exactly ``block_size`` items, plus a "labels" key that is a copy of
+        the "input_ids" chunks. The trailing remainder that does not fill a
+        whole block is dropped.
+    """
+    # Local import so this block does not depend on a file-level import.
+    from itertools import chain
+
+    # Concatenate all sequences per column. chain.from_iterable is linear,
+    # whereas sum(list_of_lists, []) copies the accumulator on every step.
+    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # Drop the small remainder so that every chunk has exactly block_size items.
+    total_length = (total_length // block_size) * block_size
+    # Split every column into consecutive chunks of block_size.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+
+#############################################
+# Prepare the new data for training
+#############################################
 # Apply the tokenizer to the whole new dataset - batched=True and 4 processes to speed up the computation. The "text" column is not needed afterwards, so it is removed.
 tokenized_datasets = dataset_neu.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
 # block_size = tokenizer.model_max_length
 block_size = 128
+# Apply map once more, this time to the already tokenized dataset.
+# Afterwards each sample consists of a chunk of block_size tokens.
+lm_datasets = tokenized_datasets.map(
+    group_texts,
+    batched=True,
+    batch_size=1000,
+    num_proc=4,
+)
+# The data is now cleaned up and prepared for the model.
+# Inspect it e.g. with: tokenizer.decode(lm_datasets["train"][1]["input_ids"])
+####################################################
+# Training
+####################################################
+# Training arguments
+model_name = base_model.split("/")[-1]
+training_args = TrainingArguments(
+    output_dir=f"{model_name}-finetuned-tis",
+    evaluation_strategy="epoch",
+    learning_rate=2e-5,
+    weight_decay=0.01,
+    push_to_hub=True,
+)
+############################################
+def trainieren_neu():
+    """Fine-tune the loaded model on the prepared dataset and push it to the Hub.
+
+    Uses the module-level ``model``, ``training_args`` and the "train" and
+    "validation" splits of ``lm_datasets``.
+
+    Returns:
+        A short status message with the number of training steps and the
+        training loss, so a caller (e.g. a UI callback) has something to show.
+    """
+    # Assemble the trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=lm_datasets["train"],
+        eval_dataset=lm_datasets["validation"],
+    )
+    # Run the training; train() reports the step count and the training loss
+    train_result = trainer.train()
+    # Upload the result to the Hub
+    trainer.push_to_hub()
+    return (
+        f"Training finished after {train_result.global_step} steps "
+        f"(training loss: {train_result.training_loss:.4f}). Model pushed to the Hub."
+    )
 #######################################################################
 #Darstellung mit Gradio
+# Minimal UI: a single button that starts the fine-tuning run and a text box
+# that displays whatever the training function returns.
+with gr.Blocks() as demo:
+    output = gr.Textbox(label="Output Box")
+    start_btn = gr.Button("Start")
+    # trainieren_neu takes no arguments, so no input components are wired up.
+    # All arguments are passed by keyword: the previous call mixed a positional
+    # argument after a keyword argument (a SyntaxError) and referenced the
+    # undefined names `greet` and `inputs`.
+    start_btn.click(fn=trainieren_neu, inputs=None, outputs=output, api_name="trainieren_neu")
+demo.launch()