terapyon committed on
Commit
59d39d4
·
1 Parent(s): 9d69587

Made data for all data

Browse files
Files changed (2) hide show
  1. src/episode.py +1 -1
  2. src/store.py +54 -2
src/episode.py CHANGED
@@ -87,7 +87,7 @@ def make_df(episode: Episode) -> pd.DataFrame:
87
  data = []
88
  for text in episode.texts:
89
  data.append([episode.id_, text.part, text.start, text.end, text.text])
90
- df = pd.DataFrame(data, columns=["id", "part", "start", "end", "text"])
91
  return df
92
 
93
 
 
87
  data = []
88
  for text in episode.texts:
89
  data.append([episode.id_, text.part, text.start, text.end, text.text])
90
+ df = pd.DataFrame(data, columns=["id", "part", "start", "end_", "text"])
91
  return df
92
 
93
 
src/store.py CHANGED
@@ -1,5 +1,6 @@
1
  from pathlib import Path
2
  import duckdb
 
3
  from config import DUCKDB_FILE
4
 
5
 
@@ -15,7 +16,7 @@ def create_table():
15
  );
16
  """
17
  episodes_create = """CREATE TABLE episodes (
18
- id BIGINT, part BIGINT, start INTERVAL, end_ INTERVAL, text TEXT,
19
  PRIMARY KEY (id, part)
20
  );
21
  """
@@ -38,7 +39,43 @@ def insert_podcast():
38
  SELECT id, title, date, [], length, audio
39
  FROM read_parquet(?);
40
  """
41
- conn.execute(sql, [str(STORE_DIR / 'podcast-title-list-202301-202501.parquet')])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  conn.commit()
43
  conn.close()
44
 
@@ -51,6 +88,21 @@ if __name__ == "__main__":
51
  create_table()
52
  elif args[1] == "podcastinsert":
53
  insert_podcast()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  else:
55
  print("Usage: python store.py create")
56
  sys.exit(1)
 
1
  from pathlib import Path
2
  import duckdb
3
+ from embedding import get_embeddings
4
  from config import DUCKDB_FILE
5
 
6
 
 
16
  );
17
  """
18
  episodes_create = """CREATE TABLE episodes (
19
+ id BIGINT, part BIGINT, start BIGINT, end_ BIGINT, text TEXT,
20
  PRIMARY KEY (id, part)
21
  );
22
  """
 
39
  SELECT id, title, date, [], length, audio
40
  FROM read_parquet(?);
41
  """
42
+ conn.execute(sql, [str(STORE_DIR / 'title-list-202301-202501.parquet')])
43
+ conn.commit()
44
+ conn.close()
45
+
46
+
47
+ def insert_episodes():
48
+ conn = duckdb.connect(DUCKDB_FILE)
49
+ sql = """INSERT INTO episodes
50
+ SELECT id, part, start, end_, text
51
+ FROM read_parquet(?);
52
+ """
53
+ conn.execute(sql, [str(STORE_DIR / 'podcast-*.parquet')])
54
+ conn.commit()
55
+ conn.close()
56
+
57
+
58
+ def embed_store():
59
+ conn = duckdb.connect(DUCKDB_FILE)
60
+ sql_select = """SELECT id, part, text FROM episodes;"""
61
+ data = conn.execute(sql_select).df()
62
+ targets = data["text"].tolist()
63
+ enbeddings = get_embeddings(targets)
64
+ for id_, part, emb in zip(data["id"], data["part"], enbeddings):
65
+ # print(id_, title)
66
+ conn.execute(
67
+ "INSERT INTO embeddings VALUES (?, ?, ?)", (id_, part, emb.tolist())
68
+ )
69
+ conn.commit()
70
+ conn.close()
71
+
72
+
73
+ def create_index():
74
+ conn = duckdb.connect(DUCKDB_FILE)
75
+ conn.execute("LOAD vss;")
76
+ conn.execute("SET hnsw_enable_experimental_persistence=true;")
77
+ conn.execute("""CREATE INDEX embeddings_index
78
+ ON embeddings USING HNSW (embedding);""")
79
  conn.commit()
80
  conn.close()
81
 
 
88
  create_table()
89
  elif args[1] == "podcastinsert":
90
  insert_podcast()
91
+ elif args[1] == "episodeinsert":
92
+ insert_episodes()
93
+ elif args[1] == "embed":
94
+ embed_store()
95
+ elif args[1] == "index":
96
+ create_index()
97
+ elif args[1] == "all":
98
+ create_table()
99
+ insert_podcast()
100
+ insert_episodes()
101
+ embed_store()
102
+ create_index()
103
+ else:
104
+ print("Usage: python store.py all")
105
+ sys.exit(1)
106
  else:
107
  print("Usage: python store.py create")
108
  sys.exit(1)