Spaces:
Running
Running
make parquet for each episode
Browse files
- .gitignore +1 -0
- src/episode.py +9 -5
.gitignore
CHANGED
@@ -173,5 +173,6 @@ cython_debug/
|
|
173 |
db/*
|
174 |
data/*
|
175 |
sample/*
|
|
|
176 |
.python-version
|
177 |
uv.lock
|
|
|
173 |
db/*
|
174 |
data/*
|
175 |
sample/*
|
176 |
+
store/*
|
177 |
.python-version
|
178 |
uv.lock
|
src/episode.py
CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
|
|
8 |
|
9 |
HERE = Path(__file__).parent
|
10 |
DATA_DIR = HERE.parent / "data"
|
|
|
11 |
divider_time = timedelta(minutes=5)
|
12 |
RE_PODCAST = re.compile(r"[_-](\d+)[_-]")
|
13 |
|
@@ -103,11 +104,14 @@ def get_srt_files():
|
|
103 |
|
104 |
def main():
|
105 |
lst = sorted(get_srt_files(), key=lambda x: x["id"])
|
106 |
-
print(lst)
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
111 |
|
112 |
|
113 |
if __name__ == "__main__":
|
|
|
8 |
|
9 |
HERE = Path(__file__).parent
|
10 |
DATA_DIR = HERE.parent / "data"
|
11 |
+
STORE_DIR = HERE.parent / "store"
|
12 |
divider_time = timedelta(minutes=5)
|
13 |
RE_PODCAST = re.compile(r"[_-](\d+)[_-]")
|
14 |
|
|
|
104 |
|
105 |
def main():
|
106 |
lst = sorted(get_srt_files(), key=lambda x: x["id"])
|
107 |
+
print(f"{len(lst)=}")
|
108 |
+
for item in lst:
|
109 |
+
print(item["id"])
|
110 |
+
episode = make_episode(item["id"], item.get("title"), DATA_DIR / item["srt"])
|
111 |
+
df = make_df(episode)
|
112 |
+
# print(df)
|
113 |
+
df.to_parquet(STORE_DIR / f"podcast-{item['id']}.parquet")
|
114 |
+
# break
|
115 |
|
116 |
|
117 |
if __name__ == "__main__":
|