terapyon committed on
Commit
31e8063
·
1 Parent(s): b1bf02b

make parquet for each episode

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. src/episode.py +9 -5
.gitignore CHANGED
@@ -173,5 +173,6 @@ cython_debug/
173
  db/*
174
  data/*
175
  sample/*
 
176
  .python-version
177
  uv.lock
 
173
  db/*
174
  data/*
175
  sample/*
176
+ store/*
177
  .python-version
178
  uv.lock
src/episode.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
8
 
9
  HERE = Path(__file__).parent
10
  DATA_DIR = HERE.parent / "data"
 
11
  divider_time = timedelta(minutes=5)
12
  RE_PODCAST = re.compile(r"[_-](\d+)[_-]")
13
 
@@ -103,11 +104,14 @@ def get_srt_files():
103
 
104
  def main():
105
  lst = sorted(get_srt_files(), key=lambda x: x["id"])
106
- print(lst)
107
- # for item in lst:
108
- # episode = make_episode(item["id"], item.get("title"), DATA_DIR / item["srt"])
109
- # df = make_df(episode)
110
- # df.to_parquet(DATA_DIR / f"{item['id']}.parquet")
 
 
 
111
 
112
 
113
  if __name__ == "__main__":
 
8
 
9
  HERE = Path(__file__).parent
10
  DATA_DIR = HERE.parent / "data"
11
+ STORE_DIR = HERE.parent / "store"
12
  divider_time = timedelta(minutes=5)
13
  RE_PODCAST = re.compile(r"[_-](\d+)[_-]")
14
 
 
104
 
105
  def main():
106
  lst = sorted(get_srt_files(), key=lambda x: x["id"])
107
+ print(f"{len(lst)=}")
108
+ for item in lst:
109
+ print(item["id"])
110
+ episode = make_episode(item["id"], item.get("title"), DATA_DIR / item["srt"])
111
+ df = make_df(episode)
112
+ # print(df)
113
+ df.to_parquet(STORE_DIR / f"podcast-{item['id']}.parquet")
114
+ # break
115
 
116
 
117
  if __name__ == "__main__":