Gladiator committed on
Commit
c075310
1 Parent(s): 442e8fa

remove logging

Browse files
src/extract_questions.py CHANGED
@@ -86,10 +86,7 @@ if __name__ == "__main__":
86
  questions.append(episode_questions)
87
 
88
  print("*" * 25)
89
- print(f"Total prompt tokens: {cb.prompt_tokens}")
90
- print(f"Total completion tokens: {cb.completion_tokens}")
91
- print(f"Total tokens: {cb.total_tokens}")
92
- print(f"Total cost (USD): ${cb.total_cost}")
93
  print("*" * 25)
94
 
95
  wandb.log(
 
86
  questions.append(episode_questions)
87
 
88
  print("*" * 25)
89
+ print(cb)
 
 
 
90
  print("*" * 25)
91
 
92
  wandb.log(
src/podcast_data.py CHANGED
@@ -1,4 +1,3 @@
1
- import logging
2
  import time
3
  from dataclasses import asdict
4
 
@@ -10,8 +9,6 @@ from tqdm import tqdm
10
  import wandb
11
  from config import config
12
 
13
- logger = logging.getLogger(__name__)
14
-
15
 
16
  def retry_access_yt_object(url, max_retries=5, interval_secs=5):
17
  """
@@ -28,7 +25,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
28
  return yt # Return the YouTube object if successful.
29
  except Exception as err:
30
  last_exception = err # Keep track of the last exception raised.
31
- logger.warning(
32
  f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
33
  )
34
  time.sleep(interval_secs) # Wait for the specified interval before retrying.
@@ -43,7 +40,7 @@ if __name__ == "__main__":
43
  playlist = Playlist(config.playlist_url)
44
  playlist_video_urls = playlist.video_urls
45
 
46
- logger.info(f"There are total {len(playlist_video_urls)} videos in the playlist.")
47
 
48
  video_data = []
49
  for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
@@ -61,9 +58,9 @@ if __name__ == "__main__":
61
  curr_video_data["total_words"] = len(transcript.split())
62
  video_data.append(curr_video_data)
63
  except:
64
- logger.warning(f"Failed to scrape {video}")
65
 
66
- logger.info(f"Total podcast episodes scraped: {len(video_data)}")
67
 
68
  df = pd.DataFrame(video_data)
69
  df.to_csv(config.yt_scraped_data_path, index=False)
 
 
1
  import time
2
  from dataclasses import asdict
3
 
 
9
  import wandb
10
  from config import config
11
 
 
 
12
 
13
  def retry_access_yt_object(url, max_retries=5, interval_secs=5):
14
  """
 
25
  return yt # Return the YouTube object if successful.
26
  except Exception as err:
27
  last_exception = err # Keep track of the last exception raised.
28
+ print(
29
  f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
30
  )
31
  time.sleep(interval_secs) # Wait for the specified interval before retrying.
 
40
  playlist = Playlist(config.playlist_url)
41
  playlist_video_urls = playlist.video_urls
42
 
43
+ print(f"There are total {len(playlist_video_urls)} videos in the playlist.")
44
 
45
  video_data = []
46
  for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
 
58
  curr_video_data["total_words"] = len(transcript.split())
59
  video_data.append(curr_video_data)
60
  except:
61
+ print(f"Failed to scrape {video}")
62
 
63
+ print(f"Total podcast episodes scraped: {len(video_data)}")
64
 
65
  df = pd.DataFrame(video_data)
66
  df.to_csv(config.yt_scraped_data_path, index=False)
src/summarize.py CHANGED
@@ -1,4 +1,3 @@
1
- import logging
2
  import os
3
  from dataclasses import asdict
4
 
@@ -15,8 +14,6 @@ from wandb.integration.langchain import WandbTracer
15
 
16
  from config import config
17
 
18
- logger = logging.getLogger(__name__)
19
-
20
 
21
  def get_data(
22
  artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
@@ -38,7 +35,7 @@ def summarize_episode(episode_df: pd.DataFrame):
38
  # split the documents
39
  text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
40
  docs = text_splitter.split_documents(data)
41
- logger.info(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
42
 
43
  # initialize LLM
44
  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
@@ -98,12 +95,9 @@ if __name__ == "__main__":
98
  summary = summarize_episode(episode_data)
99
  summaries.append(summary["output_text"])
100
 
101
- logger.info("*" * 25)
102
- logger.info(f"Total prompt tokens: {cb.prompt_tokens}")
103
- logger.info(f"Total completion tokens: {cb.completion_tokens}")
104
- logger.info(f"Total tokens: {cb.total_tokens}")
105
- logger.info(f"Total cost (USD): ${cb.total_cost}")
106
- logger.info("*" * 25)
107
 
108
  wandb.log(
109
  {
 
 
1
  import os
2
  from dataclasses import asdict
3
 
 
14
 
15
  from config import config
16
 
 
 
17
 
18
  def get_data(
19
  artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
 
35
  # split the documents
36
  text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
37
  docs = text_splitter.split_documents(data)
38
+ print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
39
 
40
  # initialize LLM
41
  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
 
95
  summary = summarize_episode(episode_data)
96
  summaries.append(summary["output_text"])
97
 
98
+ print("*" * 25)
99
+ print(cb)
100
+ print("*" * 25)
 
 
 
101
 
102
  wandb.log(
103
  {