"""Rank cohort members by embedding similarity to a given member's profile.

The module embeds a query built from a member's name and description with the
OpenAI embeddings API, compares it against the embeddings stored in
``embedded_cohort.csv`` and renders the closest matches as markdown.
"""

import ast

import numpy as np
import pandas as pd
from openai import OpenAI

from cohort_members import cohort_data
from tech_stuff import api_key

client = OpenAI(api_key=api_key)

# System prompt sent verbatim to the chat model by ``build_complementary``.
# It is runtime text: do not reword it without intending to change the output.
_COMPLEMENTARY_SYSTEM_PROMPT = (
    "Follow the Entrepreneur First Edge method to list out as bullet points "
    "the main complementary characteristic of the proposed profiles. "
    "We aim at building the teams that are most likely to success while "
    "making sure profiles don't overlap too much"
)


def cosine_similarity(a, b) -> float:
    """Return the cosine similarity between vectors *a* and *b*.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``. No guard is applied for
    zero-length vectors; a zero norm makes the division undefined.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def _get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of *text* produced by the OpenAI embeddings API.

    Newlines are replaced with spaces before the request is sent.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    try:
        text = text.replace("\n", " ")
    except (AttributeError, TypeError):
        # Best effort: input that is not a str (no usable ``replace``) is
        # forwarded unchanged instead of being rejected here.
        pass
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding


def build_complementary(profile):
    """Ask the chat model to describe a profile complementary to *profile*.

    Args:
        profile: Text describing the profile to complement.

    Returns:
        The message content of the first completion choice.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": _COMPLEMENTARY_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"PROFILE: \n\n{profile}\n\nCOMPLEMENTARY PROFILE:",
            },
        ],
        temperature=1,
        max_tokens=1110,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content


def get_similar_profiles(profile, prefix, k=20) -> str:
    """Return markdown listing the cohort entries most similar to *profile*.

    The query text is ``prefix + profile + cohort_data[profile]``. Every row
    of ``embedded_cohort.csv`` is scored against the query embedding with
    cosine similarity, the top *k* rows are kept, and rows that appear to be
    the queried member itself are dropped.

    Args:
        profile: Key into ``cohort_data``; presumably the member's name
            (TODO confirm against ``cohort_members``).
        prefix: Text prepended to the member's own text to form the query.
        k: Number of top-scoring rows to keep before the self-match filter;
            coerced with ``int``.

    Returns:
        A single string in which each kept entry is prefixed with ``"### "``
        and has every newline doubled.

    Raises:
        KeyError: If *profile* is not a key of ``cohort_data``.
    """
    own_text = profile + cohort_data[profile]
    query = prefix + own_text

    # NOTE(review): the complementary profile is generated but never used
    # below. The call is kept so behaviour (including the API request) is
    # unchanged; either feed its result into the query or remove the call.
    build_complementary(query)

    df = pd.read_csv("embedded_cohort.csv")
    embedding_query = _get_embedding(query, model="text-embedding-3-large")

    # The ``embeddings`` column holds each vector serialised as text.
    # ``ast.literal_eval`` parses Python literals only, so unlike ``eval`` a
    # tampered CSV cell cannot execute arbitrary code.
    df["similarity"] = df.embeddings.apply(
        lambda raw: cosine_similarity(ast.literal_eval(raw), embedding_query)
    )
    top = df.sort_values("similarity", ascending=False).head(int(k))

    # A row is treated as the queried member when the first 20 characters of
    # its Name + Description match those of the member's own text.
    own_key = own_text[:20]
    candidates = (top["Name"] + top["Description"]).to_list()
    matches = [entry for entry in candidates if entry[:20] != own_key]

    return "".join("### " + entry.replace("\n", "\n\n") for entry in matches)