remove logging
- src/extract_questions.py +1 -4
- src/podcast_data.py +4 -7
- src/summarize.py +4 -10
src/extract_questions.py
CHANGED
@@ -86,10 +86,7 @@ if __name__ == "__main__":
         questions.append(episode_questions)
 
     print("*" * 25)
-    print(f"Total prompt tokens: {cb.prompt_tokens}")
-    print(f"Total completion tokens: {cb.completion_tokens}")
-    print(f"Total tokens: {cb.total_tokens}")
-    print(f"Total cost (USD): ${cb.total_cost}")
+    print(cb)
     print("*" * 25)
 
     wandb.log(
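The four per-metric prints collapse into a single print(cb): langchain's OpenAI callback handler formats itself with the token counts and cost, so printing the object yields the same report. A minimal sketch of how cb typically enters scope, assuming the classic langchain API (the with-block itself sits outside the hunk above):

# Minimal sketch, assuming langchain's get_openai_callback context manager;
# the surrounding with-block is not visible in the diff above.
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
with get_openai_callback() as cb:
    llm.predict("Say hello in five words.")

# Printing the handler emits its summary: tokens used, the
# prompt/completion split, and total cost in USD.
print(cb)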
src/podcast_data.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import time
 from dataclasses import asdict
 
@@ -10,8 +9,6 @@ from tqdm import tqdm
 import wandb
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def retry_access_yt_object(url, max_retries=5, interval_secs=5):
     """
@@ -28,7 +25,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
             return yt  # Return the YouTube object if successful.
         except Exception as err:
             last_exception = err  # Keep track of the last exception raised.
-            logger.info(
+            print(
                 f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
             )
             time.sleep(interval_secs)  # Wait for the specified interval before retrying.
@@ -43,7 +40,7 @@ if __name__ == "__main__":
     playlist = Playlist(config.playlist_url)
     playlist_video_urls = playlist.video_urls
 
-    logger.info(f"There are total {len(playlist_video_urls)} videos in the playlist.")
+    print(f"There are total {len(playlist_video_urls)} videos in the playlist.")
 
     video_data = []
     for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
@@ -61,9 +58,9 @@ if __name__ == "__main__":
             curr_video_data["total_words"] = len(transcript.split())
             video_data.append(curr_video_data)
         except:
-            logger.info(f"Failed to scrape {video}")
+            print(f"Failed to scrape {video}")
 
-    logger.info(f"Total podcast episodes scraped: {len(video_data)}")
+    print(f"Total podcast episodes scraped: {len(video_data)}")
 
     df = pd.DataFrame(video_data)
     df.to_csv(config.yt_scraped_data_path, index=False)
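The second and third hunks swap logger.info for print inside a bounded-retry helper around pytube. Pieced together from the context lines above, the full function plausibly looks like the following sketch; everything not visible in the diff is an assumption:

# Hedged reconstruction of retry_access_yt_object from the diff context;
# lines not shown in the hunks above are assumptions.
import time

from pytube import YouTube


def retry_access_yt_object(url, max_retries=5, interval_secs=5):
    last_exception = None
    for i in range(max_retries):
        try:
            yt = YouTube(url)
            _ = yt.title  # touch a lazy property so metadata failures surface here
            return yt  # Return the YouTube object if successful.
        except Exception as err:
            last_exception = err  # Keep track of the last exception raised.
            print(
                f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
            )
            time.sleep(interval_secs)  # Wait for the specified interval before retrying.
    raise last_exception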
src/summarize.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import os
 from dataclasses import asdict
 
@@ -15,8 +14,6 @@ from wandb.integration.langchain import WandbTracer
 
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def get_data(
     artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
@@ -38,7 +35,7 @@ def summarize_episode(episode_df: pd.DataFrame):
     # split the documents
     text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
     docs = text_splitter.split_documents(data)
-    logger.info(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
+    print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
 
     # initialize LLM
     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
@@ -98,12 +95,9 @@ if __name__ == "__main__":
         summary = summarize_episode(episode_data)
         summaries.append(summary["output_text"])
 
-    logger.info("*" * 25)
-    logger.info(f"Total prompt tokens: {cb.prompt_tokens}")
-    logger.info(f"Total completion tokens: {cb.completion_tokens}")
-    logger.info(f"Total tokens: {cb.total_tokens}")
-    logger.info(f"Total cost (USD): ${cb.total_cost}")
-    logger.info("*" * 25)
+    print("*" * 25)
+    print(cb)
+    print("*" * 25)
 
     wandb.log(
         {
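The splitting print this commit converts sits right after TokenTextSplitter chops each transcript into roughly 1000-token chunks. A standalone sketch of that step, assuming the classic langchain API (the Document construction is not shown in the diff):

# Minimal sketch of the token-based splitting step; the Document
# construction and example text are assumptions, not from the diff.
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

data = [
    Document(
        page_content="transcript text " * 2000,  # stand-in for a real episode transcript
        metadata={"title": "Example episode"},
    )
]

text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)  # each chunk inherits the episode metadata
print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")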