"""Gradio demo: synthesize instruction-response pairs from raw text.

Loads the ``instruction-pretrain/instruction-synthesizer`` causal LM, prompts
it with a user-supplied context, parses the generated text into
question/answer pairs and renders them as Markdown-style text.
"""

import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "instruction-pretrain/instruction-synthesizer"

# Plain-text markers that structure the synthesizer's prompt and output.
# NOTE(review): the previous revision of this file had empty strings in every
# one of these positions (which makes ``str.split('')`` raise ValueError); the
# values below are restored from the synthesizer's published usage example --
# confirm against the model card before relying on them.
BOS_MARKER = "<s>"
CONTEXT_OPEN = "<CON>"
CONTEXT_CLOSE = "</CON>"
QUESTION_MARKER = "<QUE>"
ANSWER_MARKER = "<ANS>"
END_MARKER = "</END>"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def parse_pred(pred: str) -> list:
    """Extract the list of instruction-response pairs from the prediction.

    The prediction is expected to be a sequence of items shaped like
    ``<QUE> question <ANS> answer </END>``.

    Args:
        pred: Decoded text generated by the synthesizer.

    Returns:
        A list of ``{'Q': question, 'A': answer}`` dicts, in generation order.
        Items that are malformed, have an empty answer, or repeat an earlier
        question (compared case-insensitively) are skipped silently.
    """
    qa_chunks = pred.split(END_MARKER)
    if not pred.endswith(END_MARKER):
        # The last chunk was cut off before its end marker (e.g. by the
        # generation length limit), so it is incomplete: drop it.
        qa_chunks = qa_chunks[:-1]

    qa_pairs = []
    seen_questions = set()
    for chunk in qa_chunks:
        parts = chunk.split(ANSWER_MARKER)
        if len(parts) != 2:
            # Must contain exactly one answer marker.
            continue

        question, answer = (part.strip() for part in parts)
        if not question.startswith(QUESTION_MARKER) or not answer:
            continue

        question = question.replace(QUESTION_MARKER, '').strip()
        dedup_key = question.lower()
        if dedup_key in seen_questions:
            continue

        seen_questions.add(dedup_key)
        qa_pairs.append({'Q': question, 'A': answer})

    return qa_pairs


def get_instruction_response_pairs(context: str) -> list:
    """Prompt the synthesizer to generate instruction-response pairs for a context.

    Args:
        context: Raw text to synthesize instruction-response pairs from.

    Returns:
        The parsed pairs, as returned by :func:`parse_pred`.
    """
    prompt = f'{BOS_MARKER} {CONTEXT_OPEN} {context} {CONTEXT_CLOSE}\n\n'
    # add_special_tokens=False: presumably because the prompt already spells
    # out the beginning-of-sequence marker itself -- TODO confirm.
    inputs = tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(model.device)
    # Greedy decoding, capped at 400 newly generated tokens.
    outputs = model.generate(input_ids=inputs, max_new_tokens=400, do_sample=False)[0]

    # Decode only the tokens that follow the prompt.
    pred_start = int(inputs.shape[-1])
    pred = tokenizer.decode(outputs[pred_start:], skip_special_tokens=True)
    return parse_pred(pred)


@spaces.GPU
def generate_pairs(context: str) -> str:
    """Gradio callback: render the synthesized pairs for ``context`` as text.

    Args:
        context: Text entered by the user.

    Returns:
        One numbered "Instruction"/"Response" section per pair, each followed
        by a blank line; an empty string when no valid pair was parsed.
    """
    instruction_response_pairs = get_instruction_response_pairs(context)
    return "".join(
        f"## Instruction {number}:\n{pair['Q']}\n## Response {number}:\n{pair['A']}\n\n"
        for number, pair in enumerate(instruction_response_pairs, start=1)
    )


description = """
## Instruction Pre-Training: Language Models are Supervised Multitask Learners

This demo implements the instruction synthesis approach from the paper
["Instruction Pre-Training: Language Models are Supervised Multitask Learners"](https://huggingface.co/papers/2406.14491).

### Method:
1. An instruction synthesizer is trained on diverse datasets to generate instruction-response pairs from raw text.
2. The synthesizer augments raw pre-training corpora with synthesized instruction-response pairs.
3. Language models are then pre-trained on this augmented data, combining unsupervised and supervised multitask learning.

This approach enhances model performance and generalization, particularly benefiting from further instruction tuning.

Try it out by entering some text below!
"""

examples = [
    "Hugging Face, Inc. is a French-American company incorporated under the Delaware General Corporation Law[1] and based in New York City that develops computation tools for building applications using machine learning. It is most notable for its transformers library built for natural language processing applications and its platform that allows users to share machine learning models and datasets and showcase their work.",
    "In order to make your Space work with ZeroGPU you need to decorate the Python functions that actually require a GPU with @spaces.GPU \n During the time when a decorated function is invoked, the Space will be attributed a GPU, and it will release it upon completion of the function.",
    "A spectre is haunting Europe – the spectre of communism. All the powers of old Europe have entered into a holy alliance to exorcise this spectre: Pope and Tsar, Metternich and Guizot, French Radicals and German police-spies",
]

# Create Gradio interface
iface = gr.Interface(
    fn=generate_pairs,
    inputs=gr.Textbox(lines=5, label="Enter context here"),
    outputs=gr.Textbox(lines=20, label="Generated Instruction-Response Pairs"),
    title="Instruction-Response Pair Generator",
    description=description,
    examples=examples,
)

# Launch the interface (kept at module level so the script behaves exactly as
# before when executed as the Space's entry point).
iface.launch()