import os
from io import BytesIO

import pandas as pd
from dotenv import load_dotenv

# Kept ahead of the `openai` import, exactly as in the original file.
# NOTE(review): presumably so that variables from .env are already in the
# environment when `openai` is imported -- confirm before reordering.
load_dotenv()

import openai  # noqa: E402
import streamlit as st  # noqa: E402

# Set the OpenAI API key from the .streamlit/secrets.toml file.
openai.api_key = st.secrets["OPENAI_API_KEY"]

# Alternative: set the OpenAI API key from the .env file.
# openai.api_key = os.getenv("OPENAI_API_KEY")

# Alternative: read llm-data-cleaner/prompts/gpt4-system-message.txt into
# the variable system_message.
# system_message = open('../prompts/gpt4-system-message.txt', 'r').read()


class OpenAIChatCompletions:
    """Wrapper around ``openai.ChatCompletion`` with optional few-shot samples.

    Attributes:
        model: Name of the chat model passed to the OpenAI API.
        system_message: Optional system prompt placed first in every
            conversation; omitted from the request when falsy.
    """

    def __init__(self, model="gpt-4", system_message=None):
        self.model = model
        self.system_message = system_message

    def openai_chat_completion(self, prompt, n_shot=None):
        """Send ``prompt`` to the chat model and return the raw API response.

        Args:
            prompt: Content of the final user message.
            n_shot: Number of prompt/completion example pairs to insert
                before ``prompt``. ``None`` sends only the system message
                (if any) and the prompt.

        Returns:
            The response returned by ``openai.ChatCompletion.create`` (the
            whole response, not just the completion text).
        """
        messages = []
        if self.system_message:
            messages.append({"role": "system", "content": self.system_message})

        if n_shot is not None:
            messages = self._add_samples(messages, n_samples=n_shot)

        messages.append({"role": "user", "content": prompt})

        return openai.ChatCompletion.create(
            model=self.model,
            messages=messages,
        )

    def predict_jsonl(
        self,
        path_or_buf='../data/cookies_train.jsonl',
        n_samples=None,
        n_shot=None,
    ):
        """Run the model over the prompts stored in a JSON-lines file.

        Args:
            path_or_buf: Anything ``pandas.read_json`` accepts; the data must
                provide ``prompt`` and ``completion`` columns.
            n_samples: When given, only this many rows are used, drawn with
                a fixed seed (``random_state=42``).
            n_shot: Forwarded to :meth:`openai_chat_completion` for every
                prompt.

        Returns:
            A ``(prompts, completions, predictions)`` tuple of three lists of
            equal length; ``predictions`` holds the raw API responses.
        """
        records = pd.read_json(path_or_buf=path_or_buf, lines=True)
        if n_samples is not None:
            records = records.sample(n_samples, random_state=42)

        prompts = records['prompt'].tolist()
        completions = records['completion'].tolist()
        predictions = [
            self.openai_chat_completion(prompt, n_shot=n_shot)
            for prompt in prompts
        ]
        return prompts, completions, predictions

    @staticmethod
    def _add_samples(messages, n_samples=None):
        """Append sampled prompt/completion pairs to ``messages``.

        Each sampled row contributes a ``user`` message (its prompt)
        followed by an ``assistant`` message (its completion).

        Args:
            messages: Message list to extend; it is modified in place.
            n_samples: Number of example pairs to add. ``None`` leaves
                ``messages`` untouched.

        Returns:
            The same ``messages`` list that was passed in.
        """
        if n_samples is None:
            return messages

        samples = OpenAIChatCompletions._sample_jsonl(n_samples=n_samples)
        for prompt, completion in zip(samples['prompt'], samples['completion']):
            messages.append({"role": "user", "content": prompt})
            messages.append({"role": "assistant", "content": completion})
        return messages

    @staticmethod
    def _sample_jsonl(
        path_or_buf='data/cookies_train.jsonl',
        n_samples=5,
    ):
        """Return ``n_samples`` rows sampled from a JSON-lines file.

        Args:
            path_or_buf: File path relative to the base directory chosen
                below.
            n_samples: Number of rows to draw (fixed seed,
                ``random_state=42``).

        Returns:
            A ``pandas.DataFrame`` holding the sampled rows.

        Raises:
            FileNotFoundError: If the resolved file does not exist. A
                message is written to the Streamlit page before the error
                is re-raised.
        """
        cwd = os.getcwd()
        # Running locally (cwd inside "Kaleidoscope Data"): the file sits one
        # directory above cwd. Otherwise (e.g. on HF Spaces) it is resolved
        # against cwd itself.
        if "Kaleidoscope Data" in cwd:
            base_dir = os.path.dirname(cwd)
        else:
            base_dir = cwd
        file_path = os.path.join(base_dir, path_or_buf)

        try:
            with open(file_path, "r", encoding="utf-8") as file:
                jsonl_str = file.read()
        except FileNotFoundError:
            # Tell the user which path was tried, then propagate the real
            # error instead of failing later on an unbound variable.
            st.write(f"File not found: {file_path}")
            raise

        frame = pd.read_json(
            BytesIO(jsonl_str.encode()), lines=True, engine="pyarrow"
        )
        return frame.sample(n_samples, random_state=42)