remove logging
- src/extract_questions.py +1 -4
- src/podcast_data.py +4 -7
- src/summarize.py +4 -10
src/extract_questions.py
CHANGED
@@ -86,10 +86,7 @@ if __name__ == "__main__":
         questions.append(episode_questions)
 
     print("*" * 25)
-    print(f"Total prompt tokens: {cb.prompt_tokens}")
-    print(f"Total completion tokens: {cb.completion_tokens}")
-    print(f"Total tokens: {cb.total_tokens}")
-    print(f"Total cost (USD): ${cb.total_cost}")
+    print(cb)
     print("*" * 25)
 
     wandb.log(
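The four per-metric prints collapse into a single print(cb): langchain's OpenAI callback handler formats itself with the token counts and cost, so printing the object yields the same report. A minimal sketch of how cb typically enters scope, assuming the classic langchain API (the with-block itself sits outside the hunk above):

# Minimal sketch, assuming langchain's get_openai_callback context manager;
# the surrounding with-block is not visible in the diff above.
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
with get_openai_callback() as cb:
    llm.predict("Say hello in five words.")

# Printing the handler emits its summary: tokens used, the
# prompt/completion split, and total cost in USD.
print(cb)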
src/podcast_data.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import time
 from dataclasses import asdict
 
@@ -10,8 +9,6 @@ from tqdm import tqdm
 import wandb
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def retry_access_yt_object(url, max_retries=5, interval_secs=5):
     """
@@ -28,7 +25,7 @@ def retry_access_yt_object(url, max_retries=5, interval_secs=5):
             return yt  # Return the YouTube object if successful.
         except Exception as err:
             last_exception = err  # Keep track of the last exception raised.
-            logger.info(
+            print(
                 f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
             )
             time.sleep(interval_secs)  # Wait for the specified interval before retrying.
@@ -43,7 +40,7 @@ if __name__ == "__main__":
     playlist = Playlist(config.playlist_url)
     playlist_video_urls = playlist.video_urls
 
-    logger.info(f"There are total {len(playlist_video_urls)} videos in the playlist.")
+    print(f"There are total {len(playlist_video_urls)} videos in the playlist.")
 
     video_data = []
     for video in tqdm(playlist_video_urls, total=len(playlist_video_urls)):
@@ -61,9 +58,9 @@ if __name__ == "__main__":
             curr_video_data["total_words"] = len(transcript.split())
             video_data.append(curr_video_data)
         except:
-            logger.info(f"Failed to scrape {video}")
+            print(f"Failed to scrape {video}")
 
-    logger.info(f"Total podcast episodes scraped: {len(video_data)}")
+    print(f"Total podcast episodes scraped: {len(video_data)}")
 
     df = pd.DataFrame(video_data)
     df.to_csv(config.yt_scraped_data_path, index=False)
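The second and third hunks swap logger.info for print inside a bounded-retry helper around pytube. Pieced together from the context lines above, the full function plausibly looks like the following sketch; everything not visible in the diff is an assumption:

# Hedged reconstruction of retry_access_yt_object from the diff context;
# lines not shown in the hunks above are assumptions.
import time

from pytube import YouTube


def retry_access_yt_object(url, max_retries=5, interval_secs=5):
    last_exception = None
    for i in range(max_retries):
        try:
            yt = YouTube(url)
            _ = yt.title  # touch a lazy property so metadata failures surface here
            return yt  # Return the YouTube object if successful.
        except Exception as err:
            last_exception = err  # Keep track of the last exception raised.
            print(
                f"Failed to create YouTube object or access title. Retrying... ({i+1}/{max_retries})"
            )
            time.sleep(interval_secs)  # Wait for the specified interval before retrying.
    raise last_exception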
src/summarize.py
CHANGED
@@ -1,4 +1,3 @@
-import logging
 import os
 from dataclasses import asdict
 
@@ -15,8 +14,6 @@ from wandb.integration.langchain import WandbTracer
 
 from config import config
 
-logger = logging.getLogger(__name__)
-
 
 def get_data(
     artifact_name: str = "gladiator/gradient_dissent_bot/yt_podcast_data:latest",
@@ -38,7 +35,7 @@ def summarize_episode(episode_df: pd.DataFrame):
     # split the documents
     text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
     docs = text_splitter.split_documents(data)
-    logger.info(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
+    print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")
 
     # initialize LLM
     llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
@@ -98,12 +95,9 @@ if __name__ == "__main__":
         summary = summarize_episode(episode_data)
         summaries.append(summary["output_text"])
 
-    logger.info("*" * 25)
-    logger.info(f"Total prompt tokens: {cb.prompt_tokens}")
-    logger.info(f"Total completion tokens: {cb.completion_tokens}")
-    logger.info(f"Total tokens: {cb.total_tokens}")
-    logger.info(f"Total cost (USD): ${cb.total_cost}")
-    logger.info("*" * 25)
+    print("*" * 25)
+    print(cb)
+    print("*" * 25)
 
     wandb.log(
         {
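The splitting print this commit converts sits right after TokenTextSplitter chops each transcript into roughly 1000-token chunks. A standalone sketch of that step, assuming the classic langchain API (the Document construction is not shown in the diff):

# Minimal sketch of the token-based splitting step; the Document
# construction and example text are assumptions, not from the diff.
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter

data = [
    Document(
        page_content="transcript text " * 2000,  # stand-in for a real episode transcript
        metadata={"title": "Example episode"},
    )
]

text_splitter = TokenTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)  # each chunk inherits the episode metadata
print(f"Number of documents for podcast {data[0].metadata['title']}: {len(docs)}")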