File size: 1,980 Bytes
d0a793d
 
8f309ea
d0a793d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57a9174
 
 
 
 
 
3dd7bab
57a9174
 
 
 
 
 
 
 
 
 
 
 
 
d0a793d
 
 
57a9174
d0a793d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import numpy as np
import pandas as pd
from cohort_members_uk_paris import cohort_data
from tech_stuff import api_key

from openai import OpenAI

# Module-level OpenAI client, created once at import time from the imported
# api_key and reused for every embeddings / chat-completions request.
client = OpenAI(api_key=api_key)

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector (any 1-D sequence accepted by NumPy).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm
        (including empty input) the similarity is undefined, so ``0.0`` is
        returned instead of dividing by zero and producing NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero-length vector has no direction; treat it as "not similar"
        # rather than emitting NaN and a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / denominator

def _get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for *text* from the OpenAI embeddings API.

    Args:
        text: Text to embed. Newlines in a string are replaced by spaces
            before the request is sent. Non-string values are forwarded
            to the API unchanged.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # Only strings can be normalised. An explicit type check keeps the
    # best-effort pass-through for other values without a bare ``except``
    # that would also hide unrelated errors.
    if isinstance(text, str):
        text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

def build_complementary(profile):
    """Ask the chat model to describe a profile complementary to *profile*.

    Sends a fixed system instruction plus a user message wrapping *profile*
    to the ``gpt-4`` chat-completions endpoint.

    Args:
        profile: Text describing the profile, interpolated into the prompt.

    Returns:
        The message content of the first choice in the completion response.
    """
    system_prompt = "Follow the Entrepreneur First Edge method to list out as bullet points the main complementary characteristic of the proposed profiles. We aim at building the teams that are most likely to success while making sure profiles don't overlap too much"
    user_prompt = f"PROFILE: \n\n{profile}\n\nCOMPLEMENTARY PROFILE:"

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=1,
        max_tokens=1110,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content

def get_similar_profiles(profile, prefix, k=20):
    """Return a Markdown listing of the cohort rows most similar to *profile*.

    The query text ``prefix + profile + cohort_data[profile]`` is embedded
    and compared by cosine similarity with the ``embeddings`` column of
    ``embedded_cohort.csv``. The *k* best rows are kept, the row whose text
    starts like the queried member's own entry is dropped, and the rest are
    rendered as ``### ``-prefixed Markdown entries.

    Args:
        profile: Key into ``cohort_data``; also the leading part of the
            member's own text used to filter that member out of the results.
        prefix: Text prepended to the query before it is embedded.
        k: Number of top-ranked rows to keep (coerced with ``int``) before
            the member's own row is filtered out.

    Returns:
        The concatenated Markdown string, or ``""`` when no rows remain.

    Raises:
        KeyError: If *profile* is not a key of ``cohort_data``.
    """
    own_text = profile + cohort_data[profile]
    query = prefix + own_text

    # NOTE(review): the complementary profile is generated but never used;
    # the similarity search below embeds `query` itself. Confirm whether the
    # search was meant to use `complementary`, or drop this costly call.
    complementary = build_complementary(query)

    df = pd.read_csv("embedded_cohort.csv")
    embedding_query = _get_embedding(query, model="text-embedding-3-large")

    # SECURITY: eval() executes whatever is stored in the CSV's `embeddings`
    # column. Only safe while that file is produced by trusted code; consider
    # ast.literal_eval / json.loads if the stored format allows it.
    df["similarity"] = df.embeddings.apply(
        lambda x: cosine_similarity(eval(x), embedding_query)
    )
    top_rows = df.sort_values("similarity", ascending=False).head(int(k))
    raw_results = (top_rows["Name"] + top_rows["Description"]).to_list()

    # Drop the queried member's own row, recognised by its first 20 chars.
    own_prefix = own_text[:20]
    results = [entry for entry in raw_results if entry[:20] != own_prefix]

    # Build the Markdown once, after filtering, so an empty result set
    # yields "" instead of leaving the output variable unassigned.
    return "".join("### " + entry.replace("\n", "\n\n") for entry in results)