import json
import os
import uuid
from typing import Dict, Iterator, List, Union

import requests
from dotenv import load_dotenv

# Pull environment variables (e.g. AMIGO_BASE_URL) from a local .env file.
load_dotenv()

AVAILABLE_MODELS = [
    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
    "o1-mini",
    "claude-3-sonnet-20240229",
    "gemini-1.5-pro",
    "gemini-1.5-flash",
    "o1-preview",
    "gpt-4o",
]


def API_Inference(
    messages: List[Dict[str, str]],
    model: str = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
    stream: bool = False,
    max_tokens: int = 4000,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> Union[str, Iterator[str], None]:
    """Send an OpenAI-style chat request to the endpoint in AMIGO_BASE_URL.

    Returns the full completion as a string (stream=False), an iterator of
    text chunks (stream=True), or None if the HTTP request fails.
    """
    if model not in AVAILABLE_MODELS:
        raise ValueError(
            f"Model {model} not available. Available models: {', '.join(AVAILABLE_MODELS)}"
        )

    # For this model, drop any caller-supplied system messages and prepend a
    # minimal placeholder system message instead.
    if model == "claude-3-sonnet-20240229":
        messages = [{"role": "system", "content": "."}] + [
            msg for msg in messages if msg["role"] != "system"
        ]

    api_endpoint = os.environ.get("AMIGO_BASE_URL")
    if not api_endpoint:
        raise ValueError("AMIGO_BASE_URL is not set; add it to your environment or .env file.")

    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        # The bearer token is elided in the source; supply one here if the
        # endpoint requires authentication.
        "Authorization": "Bearer ",
        "Content-Type": "application/json",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0"
        ),
        "X-Device-UUID": str(uuid.uuid4()),
    }
    payload = {
        "messages": messages,
        "model": model,
        "max_tokens": max_tokens,
        "stream": stream,
        "presence_penalty": 0,
        "temperature": temperature,
        "top_p": top_p,
    }

    try:
        response = requests.post(api_endpoint, headers=headers, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("An error occurred while making the request:", e)
        return None

    def process_response() -> Iterator[str]:
        # Parse the body as OpenAI-style server-sent events: each non-empty
        # line looks like "data: {...}" and the stream ends with "data: [DONE]".
        for line in response.iter_lines():
            if not line:
                continue
            decoded_line = line.decode("utf-8").strip()
            if not decoded_line.startswith("data: "):
                continue
            data_str = decoded_line[len("data: "):]
            if data_str == "[DONE]":
                break
            try:
                data_json = json.loads(data_str)
                # Each event carries the next text fragment in choices[0].delta.content.
                choices = data_json.get("choices", [])
                if choices:
                    content = choices[0].get("delta", {}).get("content", "")
                    if content:
                        yield content
            except json.JSONDecodeError:
                print(f"Received non-JSON data: {data_str}")

    if stream:
        return process_response()
    # Non-streaming responses are assumed to arrive in the same SSE format,
    # so concatenate the parsed chunks into a single string.
    return "".join(process_response())


if __name__ == "__main__":
    # Example conversation in the OpenAI chat-message format.
    conversation = [
        {"role": "system", "content": "You are a helpful and friendly AI assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Paris"},
        {"role": "user", "content": "Who are you? Are you GPT-4o or gpt-3.5?"},
    ]

    # Non-streaming response: the full completion as one string.
    response = API_Inference(conversation, stream=False, model="claude-3-sonnet-20240229")
    print(response)
    print("--" * 50)

    # Streaming response: an iterator of text chunks, or None if the request
    # failed, so guard before iterating.
    chunks = API_Inference(conversation, stream=True, model="gpt-4o")
    if chunks is not None:
        for chunk in chunks:
            print(chunk, end="", flush=True)
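
# --- Example .env file (a minimal sketch) ---
# load_dotenv() above only needs AMIGO_BASE_URL to be defined; the URL below
# is a placeholder, not the real endpoint. Whatever it points at should be an
# OpenAI-compatible chat-completions route, since process_response expects
# that SSE response shape.
#
#     AMIGO_BASE_URL=https://api.example.com/v1/chat/completions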