import os

import gradio as gr
import torch
from llm2vec import LLM2Vec
from peft import PeftModel
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Disable fused SDP kernels, which conflict with the bidirectional attention
# patch that LLM2Vec applies to the base model.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# Read tokens from environment variables. GROQ_API_KEY is not used below,
# but it is validated here so the app fails fast if it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

if not GROQ_API_KEY or not HF_TOKEN:
    raise ValueError("GROQ_API_KEY and HF_TOKEN must be set as environment variables.")

# Load the tokenizer and the base model (with LLM2Vec's remote-code patch).
tokenizer = AutoTokenizer.from_pretrained("McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp")
config = AutoConfig.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp", trust_remote_code=True
)
model = AutoModel.from_pretrained(
    "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp",
    trust_remote_code=True,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
)

# Apply and merge the MNTP LoRA weights into the base model.
model = PeftModel.from_pretrained(model, "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp")
model = model.merge_and_unload()

# Load the unsupervised SimCSE LoRA weights on top of the merged model.
model = PeftModel.from_pretrained(
    model, "McGill-NLP/LLM2Vec-Sheared-LLaMA-mntp-unsup-simcse"
)

# Wrapper for encoding and pooling operations.
l2v = LLM2Vec(model, tokenizer, pooling_mode="mean", max_length=512)


def encode_texts(input_text):
    # The Textbox delivers a single string, so split it into one text per
    # line and drop blank lines before encoding.
    texts = [line.strip() for line in input_text.split("\n") if line.strip()]
    if not texts:
        return []
    # l2v.encode takes a list of texts and returns a tensor of embeddings;
    # convert it to nested lists so Gradio can serialize it as JSON.
    embeddings = l2v.encode(texts)
    return embeddings.tolist()


# Define the Gradio interface.
iface = gr.Interface(
    fn=encode_texts,
    inputs=gr.Textbox(lines=5, placeholder="Enter texts separated by newlines..."),
    outputs=gr.JSON(),
)

# Launch the Gradio app.
iface.launch(share=True)
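
# A minimal sketch of querying the running app programmatically, assuming the
# gradio_client package is installed and the app is reachable at Gradio's
# default local URL (both are assumptions, not part of this script). For a
# gr.Interface, the default endpoint name is "/predict":
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   embeddings = client.predict("first text\nsecond text", api_name="/predict")
#   print(len(embeddings), "embeddings returned")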