skyvera commited on
Commit
6718307
·
verified ·
1 Parent(s): cff31fb

Initial commit

Browse files
__pycache__/app.cpython-37.pyc ADDED
Binary file (395 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from cs_ai_kt_transcribe_share.kt_transcript import KnowledgeTranscriber
3
+
4
def process_inputs(api_key, drive_folder_link):
    """
    Gradio callback: run the KT transcription pipeline for one Drive folder.

    :param api_key: OpenAI API key typed by the user.
    :param drive_folder_link: Link to the (public) Google Drive folder to process.
    :return: "Success" when processing finishes, otherwise a human-readable
             message describing what went wrong.
    """
    # Local working directory the Drive files are downloaded into and processed from.
    input_folder = 'scripts/ktTranscript/cs_ai_kt_transcribe_share/Input-Folder'

    # Text boxes can deliver None or stray whitespace (e.g. a pasted key with a
    # trailing newline); normalise first so the checks below see the real value.
    api_key = (api_key or "").strip()
    drive_folder_link = (drive_folder_link or "").strip()

    # Fail fast with a clear message instead of a late, opaque backend error.
    if not api_key:
        return "Please provide an OpenAI API key."
    if not drive_folder_link:
        return "Please provide a Google Drive folder link."

    try:
        kt = KnowledgeTranscriber(api_key)
        # The UI always works from video input, so transcription stays enabled.
        kt.process_folder(input_folder, True, drive_folder_link)
        return "Success"
    except ValueError as e:
        # Surface validation problems as the result text.
        return str(e)
14
+
15
# Two free-text inputs (API key, Drive link) feed process_inputs; its return
# value is rendered in a single result box.
demo = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Textbox(
            label="OpenAI Key",
            info="Enter your OpenAI API Key here e.g., sk-xxxxxxxxxxxxxx",
        ),
        gr.Textbox(
            label="Drive Folder Link",
            info="Enter your Drive Folder Link here (it must be public) e.g., https://drive.google.com/drive/folders/your-folder-id",
        ),
    ],
    outputs=gr.Textbox(label="Result"),
    theme=gr.themes.Base(),
)

# Start the web UI with Gradio's default launch settings.
demo.launch()
cs_ai_kt_transcribe_share/.DS_Store ADDED
Binary file (6.15 kB). View file
 
cs_ai_kt_transcribe_share/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=[YOUR_API_KEY]
cs_ai_kt_transcribe_share/.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
cs_ai_kt_transcribe_share/README.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CSAI Knowledge Aggregator
2
+
3
+ ## Introduction
4
+ With the common requirement for various central teams to record, attend, and then process KT recording sessions, or to parse through folders of documents, PDFs, etc., there was a clear need to simplify and automate the gathering of knowledge and to generate value from it more quickly.
5
+
6
+ Created for personal use, this has been split out for other interested parties to make use of. In the current version on the 'share' branch, this tool is tailored specifically for transcribing mp4 KT recordings (or utilizing existing transcripts provided by various video platforms such as Zoom or Loom) to parse out various knowledge outputs and create KB articles. While initially focused on Central Support knowledge capture, with some minor adjustments, it can be tailored for other applications.
7
+
8
+ With a long-term goal of generalized content capture and curation, certain outputs may not be relevant for all use cases. Some parameterization has already been implemented but can be further adjusted.
9
+
10
+ [Ideal KT Input Guidance Runbook](https://docs.google.com/document/d/1uw-nBHeE0egGvlqlFvtl8bNu2x6zCIkAVYH85h38cIk/edit)
11
+
12
+ [Example Outputs](https://drive.google.com/drive/u/0/folders/1nB0ORAuOWca5zuWAVo8TXfuxnwzi09yR)
13
+
14
+
15
+ ### Current Outputs
16
+
17
+ - High-level Summary
18
+ - Topic Specific Summaries
19
+ - Glossary
20
+ - Troubleshooting Steps
21
+ - Word Cloud and Matching Symptoms
22
+ - KB For each Summary and the Troubleshooting Steps
23
+ - Screenshots of any captured Timestamps in Summary/Troubleshooting Steps
24
+
25
+ **Note**: Processing.log is also generated in the working directory.
26
+
27
+ ## Prerequisites
28
+ - Python 3.11
29
+ - [ffmpeg](https://www.ffmpeg.org/) - Pre-req for Pydub's AV manipulation.
30
+
31
+ ## Installation
32
+
33
+ ### Clone the Repository
34
+ ```
35
+ git clone -b share --single-branch https://github.com/trilogy-group/cs-ai-kt-transcribe.git
36
+ ```
37
+
38
+ ### Set up the Python Environment
39
+ Pick your poison
40
+
41
+ ```
42
+ pyenv virtualenv [env_name]
43
+ pyenv activate [env_name]
44
+ ```
45
+ ```
46
+ python3 -m venv [env_name]
47
+ source ./bin/activate
48
+ ```
49
+
50
+ ### Installing Dependencies
51
+
52
+ From your primary venv directory:
53
+
54
+ ```
55
+ ./bin/python -m pip install -r requirements.txt
56
+ ```
57
+
58
+ ### Generate and Populate .env file
59
+
60
+ Within the primary venv directory, create a file named '.env' and populate it with the content below, replacing with your OpenAI API Key:
61
+
62
+ ```
63
+ OPENAI_API_KEY=[YOUR_API_KEY]
64
+ ```
65
+
66
+ ## Usage
67
+
68
+ ### Basic Usage
69
+
70
+ Topic and Transcribe are optional parameters that can be passed in to handle two special-cases - long-form multi-topic videos and skipping transcriptions.
71
+
72
+ By default, the script assumes you are providing video content (.mp4 format) in the input directory for a single topic that requires transcribing. Each video (or transcript, if the optional flag is set to False) within the provided input directory will be processed in sequence. A folder is generated matching the video or transcript file's name, and the various outputs are placed within it. Audio/Video precursor artefacts are placed within a generated "Processed" folder.
73
+
74
+ Once the basic set up above is completed, an Input directory can be generated to store your videos/transcripts to process. Then, run the script using the form below:
75
+
76
+ ```
77
+ kt-transcript.py [--topic [TOPIC]] [--transcribe [TRANSCRIBE]] [input_folder]
78
+ ```
79
+
80
+ #### Example Usage
81
+ ```
82
+ ./bin/python cs-ai-kt-transcribe/kt-transcript.py --topic True --transcribe True ./Input-Folder
83
+ ```
84
+
85
+
86
+ ### Arguments
87
+
88
+ ```
89
+ positional arguments:
90
+ input_folder The folder containing videos/transcripts to process relative to the current working directory.
91
+
92
+ options:
93
+ --topic If set to True, will generate topic-specific summaries in addition to the high-level summary.
94
+ --transcribe If set to False, will skip transcribing and leverage an existing '*_full_transcript.txt' file to generate outputs.
95
+ ```
96
+
97
+ ### Customizing Outputs
98
+
99
+ Within the prompts directory in your pyenv, you will find a selection of prompt files that can be tweaked and adjusted to alter the final behaviours of the LLM processing. The prompts provided are tailored for Kandy, a VoIP Telephony product. While this has limited impact on its ability to parse other content, specialising the Persona segment of the prompt for a particular skillset does produce higher-quality results.
100
+
101
+ While the specific content of the videos being parsed will likely determine the ideal use case, the topic prompt can be altered to provide more targeted/specialized summaries. Note that the "[REPLACE_ME]" placeholder within the topic prompt is handled within the topic processing logic and is not intended to be manually replaced before running. The identified topics are replaced at runtime.
102
+
103
+ If the transcription element is leveraged and you encounter certain terminology/acronyms not properly being captured, you can seed the prompt to improve outputs: [OpenAI Whisper Docs](https://platform.openai.com/docs/guides/speech-to-text/prompting)
cs_ai_kt_transcribe_share/__init__.py ADDED
File without changes
cs_ai_kt_transcribe_share/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (197 Bytes). View file
 
cs_ai_kt_transcribe_share/__pycache__/kt_transcript.cpython-312.pyc ADDED
Binary file (48.3 kB). View file
 
cs_ai_kt_transcribe_share/generate-transcripts.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "generate-transcripts",
4
+ "private_key_id": "e3077532c8a4559587d123bc6e08dd0fae4e88cf",
5
+ "private_key": "[REDACTED - rotate this credential and load it from a secret store instead of committing it]",
6
+ "client_email": "generate-transcripts@generate-transcripts.iam.gserviceaccount.com",
7
+ "client_id": "114541892332257991546",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/generate-transcripts%40generate-transcripts.iam.gserviceaccount.com",
12
+ "universe_domain": "googleapis.com"
13
+ }
cs_ai_kt_transcribe_share/kt_transcript.py ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import openai
4
+ import logging
5
+ import json
6
+ import shutil
7
+ import argparse
8
+ import numpy as np
9
+ from moviepy.editor import VideoFileClip
10
+ from PIL import Image
11
+ from datetime import datetime, timedelta
12
+ from math import ceil
13
+ from dotenv import load_dotenv
14
+ from pydub import AudioSegment
15
+ from tqdm import tqdm
16
+ from google.oauth2 import service_account
17
+ from googleapiclient.discovery import build
18
+ from googleapiclient.http import MediaFileUpload
19
+ from googleapiclient.http import MediaIoBaseDownload
20
+ import shutil
21
+
22
+
23
# Command-line interface. Flag values arrive as strings ("True"/"False")
# because no type/const is configured; downstream code compares against them.
parser = argparse.ArgumentParser(description='Process KT videos in a given folder to generate transcripts and summaries of what was discussed.')
parser.add_argument(
    'input_folder',
    nargs='?',  # Optional
    default='.',  # Use the current working directory if no folder is specified
    help='The folder containing videos to process relative to the current working directory.'
)
parser.add_argument(
    '--topic',
    nargs='?',  # Optional
    default=False,
    help='If set to True, will generate topic-specific summaries in addition to the high-level summary.'
)
parser.add_argument(
    '--transcribe',
    nargs='?',  # Optional
    default=True,
    help='If set to False, will skip transcribing and leverage an existing _full_transcript.txt file to generate outputs.'
)

# Only consume the real command line when this file is the program being run.
# When it is imported (e.g. by a web app or a test runner), sys.argv belongs
# to the host process; parsing it could mis-read the host's arguments or exit
# the interpreter, so fall back to the declared defaults instead.
if __name__ == "__main__":
    args = parser.parse_args()
else:
    args = parser.parse_args([])

# Log to the console and to processing.log inside the input folder.
log_file_path = os.path.join(os.path.abspath(args.input_folder), "processing.log")
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_file_path, mode='a')
    ],
    format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
54
+
55
+
56
class KnowledgeTranscriber(object):
    # Size limit in bytes (5,000,000).
    MAX_SIZE = 5_000_000
    # The same limit expressed in units of 1024 * 1024 bytes.
    MAX_SIZE_MB = MAX_SIZE / 1024 ** 2
    # Bitrate in bits per second (128 kbps).
    BITRATE = 128_000
60
+
61
def __init__(self, api_key):
    """
    Create the OpenAI client and resolve the prompt template paths.

    :param api_key: OpenAI API key used for every request made by this instance.
    """
    self.client = openai.OpenAI(api_key=api_key)

    # Prompt files live in a "prompts" directory next to this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))  # Absolute directory of the script
    prompts_dir = os.path.join(script_dir, "prompts")
    prompts = {
        "summary_prompt": os.path.join(prompts_dir, "1-summary_prompt.txt"),
        "topic_prompt": os.path.join(prompts_dir, "2-topic_prompt.txt"),
        "troubleshooting_prompt": os.path.join(prompts_dir, "3-troubleshooting_prompt.txt"),
        "glossary_prompt": os.path.join(prompts_dir, "4-glossary_prompt.txt"),
        "tags_prompt": os.path.join(prompts_dir, "5-tags_prompt.txt"),
        "article_prompt": os.path.join(prompts_dir, "6-article_prompt.txt")
    }
    # Previously this mapping was built and then dropped when __init__ returned.
    # Keep it on the instance so it is reachable after construction.
    # NOTE(review): confirm how the rest of the class looks up prompt paths.
    self.prompts = prompts
73
+
74
def process_folder(self, folder_path, transcribe_flag, drive_folder_link):
    """
    Download a Google Drive folder, process its contents, and upload the results.

    Steps: download every file from the Drive folder into ``folder_path``,
    process each video (or each existing transcript), sync the resulting
    files back to the same Drive folder, then clean up ``folder_path``.

    :param folder_path: Local folder the files are downloaded to and processed in.
    :param transcribe_flag: Whether videos need transcribing. ``False`` or the
        string "False" (any case) switches to processing existing
        ``*_full_transcript.txt`` files instead of ``.mp4`` videos.
    :param drive_folder_link: URL of the Google Drive folder containing the input files.
    """
    drive_service = self.get_drive_service()

    # Extract the folder ID from the URL.
    folder_id = self.extract_drive_folder_id(drive_folder_link)

    # List all files in the Google Drive folder
    drive_files = self.list_files_in_folder(drive_service, folder_id)
    logging.info(f"Downloading files from Google Drive folder: {folder_id}")

    # TODO: Sharing the app is going to be the last step.

    # The working folder may not exist yet on a fresh machine; both the
    # downloads and the directory listing below need it to be present.
    os.makedirs(folder_path, exist_ok=True)

    # Download each file to the local folder_path
    for file in tqdm(drive_files, desc="Downloading Files"):
        file_name = file['name']
        file_id = file['id']
        local_file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(local_file_path):  # Avoid re-downloading files
            logging.info(f"Downloading file: {file_name}")
            self.download_file(drive_service, file_id, local_file_path)
        else:
            logging.info(f"File already exists: {file_name}")

    # The flag can be a real boolean (programmatic callers) or a string taken
    # from the command line; treat both spellings of "false" as "skip".
    skip_transcription = (
        transcribe_flag is False
        or str(transcribe_flag).strip().lower() == "false"
    )

    logging.info(f"Processing files in folder: {folder_path}")
    for filename in tqdm(os.listdir(folder_path), desc="Processing Files"):
        if skip_transcription:
            if filename.endswith("_full_transcript.txt"):

                # Processing for transcript files generated by Zoom/Loom/etc. already.
                logging.info(f"Processing transcript: {filename}")

                base_name = filename.replace("_full_transcript.txt", "")
                new_folder_path = os.path.join(folder_path, base_name)
                logging.info(f"New folder path: {new_folder_path}")

                # Folder handling outside main video processing functions.
                if not os.path.exists(new_folder_path):
                    os.makedirs(new_folder_path)
                original_path = os.path.join(folder_path, filename)
                new_path = os.path.join(new_folder_path, filename)
                logging.info(f"Moving file from {original_path} to {new_path}")
                shutil.move(original_path, new_path)

                # Generate our outputs
                self.generate_transcript_outputs(new_path)
                self.process_articles(new_folder_path)

                logging.info(f"Processing complete for: {filename}.")

        else:
            if filename.endswith(".mp4"):
                # Process for video files
                logging.info(f"Processing video: {filename}")
                video_path = os.path.join(folder_path, filename)
                self.process_video(video_path, folder_path)
                logging.info(f"Processing complete for: {filename}.")

    # After processing files
    logging.info(f"Processing complete for all files in folder: {folder_path}. Uploading processed files to Google Drive.")

    # Iterate over files in the input folder and upload each to Google Drive
    self.sync_folder_to_drive(drive_service, folder_path, folder_id, is_root=True)

    logging.info(f"Uploading processed files to Google Drive complete for all files in folder: {folder_path}. Success.")

    # Calling the cleanup function
    input_folder_path = os.path.abspath(folder_path)
    self.cleanup_input_folder(input_folder_path)
156
+
157
+
158
def check_and_process(self, file_path, process_func, file_description):
    """
    Run a processing step only when its output file is not already on disk.

    :param file_path: Path of the file the step is expected to produce.
    :param process_func: Zero-argument callable invoked when ``file_path`` is missing.
    :param file_description: Human-readable label for the file, used in log messages.
    """
    # Guard clause: an existing output means the step was already done.
    if os.path.exists(file_path):
        logging.info(f"{file_description} already exists.")
        return

    logging.info(f"Processing {file_description}: {file_path}")
    process_func()
170
+
171
def process_video(self, input_video, folder_path):
    """
    Takes a video path, processes the video into a transcript, and a collection of knowledge outputs.

    Outputs are written to ``<folder_path>/<video base name>_output``. Once
    handled, the extracted audio and the source video are moved into
    ``<folder_path>/Processed``.

    :param input_video: The path to the video to process.
    :param folder_path: The path to the folder containing the video to process.
    """
    base_name = os.path.splitext(os.path.basename(input_video))[0]
    output_folder = os.path.join(folder_path, f"{base_name}_output")
    processed_folder = os.path.join(folder_path, "Processed")
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(processed_folder, exist_ok=True)

    output_audio = os.path.join(output_folder, f"{base_name}.mp3")
    processed_audio = os.path.join(processed_folder, f"{base_name}.mp3")
    processed_video = os.path.join(processed_folder, f"{base_name}.mp4")
    transcript_file = os.path.join(output_folder, f"{base_name}_full_transcript.txt")

    # Checks to avoid re-processing to save time and calls to GPT.

    # Step 1: extract the audio track, unless the mp3 is already present.
    self.check_and_process(
        output_audio,
        lambda: self.video_to_audio(input_video, output_audio),
        "Audio file"
    )

    # Step 2: transcribe the audio, unless a full transcript already exists.
    self.check_and_process(
        transcript_file,
        lambda: self.transcribe_and_combine_audio(output_audio),
        "Transcript file"
    )

    # Any one of these files counts as "outputs already generated".
    # NOTE(review): these names ("<base>_summary.txt", ...) differ from the
    # "<base>_full_transcript_summary.txt" style names read further down -
    # confirm which names generate_transcript_outputs actually writes.
    transcript_outputs_exist = os.path.exists(os.path.join(output_folder, f"{base_name}_summary.txt")) or \
        os.path.exists(os.path.join(output_folder, f"{base_name}_troubleshooting_steps.txt")) or \
        os.path.exists(os.path.join(output_folder, f"{base_name}_glossary.txt")) or \
        os.path.exists(os.path.join(output_folder, f"{base_name}_tags_and_symptoms.txt"))

    if not transcript_outputs_exist:
        logging.info(f"Generating transcript outputs for: {transcript_file}")
        self.generate_transcript_outputs(transcript_file)
    else:
        logging.info("Transcript-related outputs already exist.")

    # Handling the screenshot capture and processing
    logging.info(f"Checking summary file for timestamps: {transcript_file}")
    summary_file = os.path.join(output_folder, f"{base_name}_full_transcript_summary.txt")
    troubleshooting_file = os.path.join(output_folder, f"{base_name}_full_transcript_troubleshooting_steps.txt")
    # Timestamps from both documents are concatenated into one list.
    # NOTE(review): both files are opened unconditionally by find_timestamps -
    # confirm they are always produced by the step above.
    timestamp_list = self.find_timestamps(summary_file) + self.find_timestamps(troubleshooting_file)

    if timestamp_list:
        logging.info(f"Timestamps found in summary file: {summary_file}")
        screenshot_folder = os.path.join(output_folder, "Screenshots")
        os.makedirs(screenshot_folder, exist_ok=True)
        self.parse_and_extract_frames(input_video, screenshot_folder, timestamp_list)
    else:
        logging.info(f"No timestamps found in summary file: {summary_file}")

    # Archive the intermediate audio; skipped when a copy is already in Processed.
    self.check_and_process(
        processed_audio,
        lambda: shutil.move(output_audio, processed_audio),
        "Processed audio"
    )

    # Archive the source video the same way. This must happen after the
    # screenshot step above, which still reads from input_video.
    self.check_and_process(
        processed_video,
        lambda: shutil.move(input_video, processed_video),
        "Processed video"
    )

    # Generate final articles from summary and troubleshooting steps.
    self.process_articles(output_folder)

    logging.info(f"Files saved to: {output_folder}")
    logging.info(f"Processing complete for: {input_video}.")
244
+
245
+
246
def transcribe_and_combine_audio(self, audio_file_path):
    """
    Takes an audio file path, splits the audio into parts if needed, transcribes the audio parts, and combines the transcriptions into a single transcript file.

    Progress is recorded in a ``<audio base>_manifest.txt`` file listing the
    parts already transcribed; the manifest is deleted on success.

    :param audio_file_path: The path to the audio file to process.
    :return: The path to the transcript file.
    """
    base_file_path = os.path.splitext(audio_file_path)[0]
    transcript_file_path = f"{base_file_path}_full_transcript.txt"
    manifest_file_path = f"{base_file_path}_manifest.txt"

    # Load or initialize the manifest for keeping track of processed parts
    if os.path.exists(manifest_file_path):
        with open(manifest_file_path, "r") as manifest_file:
            processed_parts = set(manifest_file.read().splitlines())
    else:
        processed_parts = set()

    # Transcribe each part of the audio file, as needed
    # NOTE(review): parts are ordered by plain string sort; if part names carry
    # un-padded numbers ("part10" vs "part2") the order would be wrong - confirm
    # the naming used by get_or_create_audio_parts.
    parts_to_transcribe = sorted(self.get_or_create_audio_parts(audio_file_path))
    for part in parts_to_transcribe:
        part_transcript_file = f"{part}_transcript.txt"
        if part in processed_parts:
            logging.info(f"Transcription part already exists: {part_transcript_file}")
        else:
            logging.info(f"Transcribing audio part: {part}")
            transcription = self.transcribe_audio_part(part)
            with open(part_transcript_file, "w") as part_file:
                part_file.write(transcription)
            processed_parts.add(part)
            logging.info(f"Transcription complete for: {part}")
            # Persist progress after every part so completed work is recorded.
            with open(manifest_file_path, "w") as manifest_file:
                manifest_file.write("\n".join(sorted(processed_parts)))
            # Check if the part is not the main audio file before removing
            if part != audio_file_path:
                os.remove(part)
                logging.info(f"Removed audio part: {part}")

    # Once all parts have been transcribed, combine them into the full transcript file
    with open(transcript_file_path, "w") as transcript_file:
        for part in parts_to_transcribe:
            logging.info(f"Combining transcript part: {part}")
            part_transcript_file = f"{part}_transcript.txt"
            with open(part_transcript_file, "r") as part_file:
                transcript_file.write(part_file.read() + "\n")
            # The per-part transcript is no longer needed once merged.
            os.remove(part_transcript_file)
            logging.info(f"Removed transcript part: {part_transcript_file}")


    # Now, we need to take the transcript file and adjust the timestamps to account for the audio parts
    with open(transcript_file_path, 'r') as file:
        full_transcript = file.read()

    adjusted_content = self.adjust_timestamps(full_transcript)
    with open(transcript_file_path, 'w') as file:
        file.write(adjusted_content)

    logging.info(f"Transcript saved to: {transcript_file_path}")
    # NOTE(review): this assumes the manifest exists at this point, i.e. at
    # least one part was transcribed now or in an earlier run - confirm.
    os.remove(manifest_file_path)
    return transcript_file_path
305
+
306
def parse_time(self, time_str):
    """
    Convert a timestamp string to seconds.

    Accepts "HH:MM:SS(.mmm)" as well as the shorter "MM:SS(.mmm)" and
    "SS(.mmm)" forms (WebVTT allows the hours field to be omitted).

    :param time_str: The timestamp to convert.
    :return: The number of seconds as a float.
    :raises ValueError: If the string has more than three fields or a field is not numeric.
    """
    fields = time_str.strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Unrecognised timestamp: {time_str!r}")

    # Fold left to right: each step promotes the running total by one unit
    # (hours -> minutes -> seconds) before adding the next field.
    seconds = 0.0
    for field in fields:
        seconds = seconds * 60 + float(field)
    return seconds
310
+
311
def format_time(self, seconds):
    """
    Convert seconds back to a timestamp string.

    :param seconds: A non-negative number of seconds (int or float).
    :return: The timestamp formatted as "HH:MM:SS.mmm".
    """
    # Round to whole milliseconds BEFORE splitting into fields. Rounding the
    # seconds field on its own lets values such as 59.9996 print as "60.000"
    # instead of carrying into the minutes.
    total_ms = int(round(seconds * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    h, remainder = divmod(total_seconds, 3600)
    m, s = divmod(remainder, 60)
    return f"{h:02}:{m:02}:{s:02}.{ms:03}"
317
+
318
def adjust_timestamps(self, vtt_content):
    """
    Takes a VTT content string and adjusts the timestamps to account for the audio parts.

    The input is the concatenation of several VTT documents, each starting
    with its own "WEBVTT" header and its own clock starting at zero. Every
    cue is shifted by the end time of the last cue of the preceding document
    so the timestamps run continuously.

    :param vtt_content: The VTT content to process.
    :return: The adjusted VTT content.
    """
    sections = vtt_content.split("WEBVTT")
    adjusted_sections = []
    time_offset = 0

    for section in sections[1:]:  # Skip the first section as it's likely the header
        adjusted_lines = []
        last_cue_end = None  # shifted end stamp of the most recent cue in this section

        for line in section.strip().split("\n"):
            start, separator, end = line.partition(' --> ')
            if not separator:
                adjusted_lines.append(line)
                continue
            try:
                start_sec = self.parse_time(start) + time_offset
                end_sec = self.parse_time(end) + time_offset
            except ValueError:
                # Not a cue timing line after all (e.g. caption text that
                # happens to contain an arrow); keep it untouched.
                adjusted_lines.append(line)
                continue
            last_cue_end = self.format_time(end_sec)
            adjusted_lines.append(f"{self.format_time(start_sec)} --> {last_cue_end}")

        # Update the time offset using the last timestamp of the current section.
        # Tracking the cue while iterating (rather than assuming it is the
        # second-to-last line) keeps this working for multi-line captions and
        # for sections without any cue, where the previous offset carries over.
        if last_cue_end is not None:
            time_offset = self.parse_time(last_cue_end)

        adjusted_sections.append('\n'.join(adjusted_lines))

    return "WEBVTT\n\n".join(adjusted_sections)
351
+
352
+
353
def extract_frames_by_range(self, video_path, target_folder, start_time, end_time, fps=1):
    """
    Save frames from a video, sampled at a fixed rate between two timestamps.

    Each frame is written to ``target_folder`` as ``<HH:MM:SS.mmm>.png``, where
    the name is built from the sample time truncated to whole seconds.

    :param video_path: The path to the video to process.
    :param target_folder: The path to the folder to save the extracted frames to.
    :param start_time: The start time to extract frames from in HH:MM:SS.mmm format.
    :param end_time: The end time to extract frames to in HH:MM:SS.mmm format.
    :param fps: The frames per second to extract from the video.
    """
    # Weight of each colon-separated field, left to right.
    field_weights = [3600, 60, 1, 0.001]

    def to_seconds(stamp):
        return sum(weight * float(field) for weight, field in zip(field_weights, stamp.split(":")))

    start_seconds = to_seconds(start_time)
    end_seconds = to_seconds(end_time)

    # Make sure the destination exists before any frame is written.
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with VideoFileClip(video_path) as video:
        # Seconds between two consecutive samples.
        step = 1 / fps

        moment = start_seconds
        while moment < end_seconds:
            pixels = video.get_frame(moment)
            label = self.format_time(int(moment))
            destination = os.path.join(target_folder, f"{label}.png")
            Image.fromarray(np.uint8(pixels)).save(destination)
            moment += step
383
+
384
+
385
def find_timestamps(self, file_path):
    """
    Collects every timestamp mentioned in a text file, in order of appearance.

    Two notations are recognised: "at 00:00:20.360" and "[00:00:28.559]".
    :param file_path: The path to the file to process.
    :return: A list of timestamp strings in HH:MM:SS.mmm format.
    """
    # Two alternatives, each with its own capture group; only one group
    # participates in any given match.
    timestamp_pattern = r'at (\d{2}:\d{2}:\d{2}\.\d{3})|\[(\d{2}:\d{2}:\d{2}\.\d{3})\]'

    with open(file_path, 'r') as handle:
        text = handle.read()

    # Take whichever group matched for each hit
    return [hit.group(1) or hit.group(2) for hit in re.finditer(timestamp_pattern, text)]
407
+
408
def parse_and_extract_frames(self, video_path, target_path, timestamps):
    """
    Takes a video path and a list of timestamps, and extracts frames from the video
    in a window of 5 seconds before to 5 seconds after each timestamp.

    :param video_path: The path to the video to process.
    :param target_path: The path to the folder to save the extracted frames to.
    :param timestamps: A list of timestamps (HH:MM:SS.mmm) to extract frames around.
    """
    time_format = "%H:%M:%S.%f"
    video_start = datetime.strptime("00:00:00.000", time_format)

    # Function to adjust the timestamp by a given number of seconds
    def adjust_timestamp(timestamp, seconds):
        shifted = datetime.strptime(timestamp, time_format) + timedelta(seconds=seconds)
        # Without this clamp a timestamp in the first seconds of the video rolls
        # back to the previous day ("23:59:5x"), which makes the window start
        # after its end so that no frames are extracted at all.
        shifted = max(shifted, video_start)
        # strftime emits microseconds; drop the last three digits for milliseconds
        return shifted.strftime(time_format)[:-3]

    for timestamp in timestamps:
        start_timestamp = adjust_timestamp(timestamp, -5)
        end_timestamp = adjust_timestamp(timestamp, 5)
        self.extract_frames_by_range(video_path, target_path, start_timestamp, end_timestamp)
425
+
426
+
427
def get_or_create_audio_parts(self, audio_file_path):
    """
    Takes an audio file path and splits the audio into parts if needed.

    Parts left over from an earlier run are reused. Only files in the sibling
    ``parts`` directory named ``<audio base name>_part<N>.<ext>`` are treated as
    parts of this audio file, and they are returned in numeric part order.

    :param audio_file_path: The path to the audio file to process.
    :return: A list of paths to the audio parts (or the audio file itself when no split is needed).
    """
    # Check if the audio needs to be split by checking its file size - this is approximate, but close enough
    file_size_mb = os.path.getsize(audio_file_path) / (1024 * 1024)
    parts_directory = os.path.join(os.path.dirname(audio_file_path), "parts")
    os.makedirs(parts_directory, exist_ok=True)

    # Match only this file's parts: the directory can be shared by several audio
    # files, and os.listdir returns names in arbitrary order, which would
    # otherwise mix recordings or scramble the transcript order.
    base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
    part_pattern = re.compile(re.escape(base_name) + r"_part(\d+)\.\w+")
    indexed_parts = []
    for name in os.listdir(parts_directory):
        match = part_pattern.fullmatch(name)
        candidate = os.path.join(parts_directory, name)
        if match and os.path.isfile(candidate):
            indexed_parts.append((int(match.group(1)), candidate))

    # If the audio file has already been split, return the existing parts - else, split the audio file
    if indexed_parts:
        logging.info("Found existing audio parts. Resuming transcription.")
        # Numeric sort so that part10 follows part9 rather than part1
        return [path for _, path in sorted(indexed_parts)]

    logging.info(f"Audio file size: {file_size_mb} MB")
    if file_size_mb > self.MAX_SIZE_MB:
        logging.info(f"Audio file size exceeds maximum size of {self.MAX_SIZE_MB} MB. Splitting audio file into parts.")
        return self.split_audio_file_by_size(audio_file_path)

    logging.info(f"Audio file size is within maximum size of {self.MAX_SIZE_MB} MB. No need to split the audio file.")
    return [audio_file_path]
450
+
451
def split_audio_file_by_size(self, audio_file_path):
    """
    Takes an audio file path and splits the audio into parts based on the maximum size.

    The parts are roughly equal in length and are exported as
    ``<audio base name>_part<N>.mp3`` into a ``parts`` directory next to the
    audio file. That is the directory ``get_or_create_audio_parts`` scans when
    resuming.

    :param audio_file_path: The path to the audio file to process.
    :return: A list of paths to the audio parts, in playback order.
    :raises ValueError: If the audio file contains no audio data.
    """
    logging.info(f"Splitting audio file: {audio_file_path}")
    audio = AudioSegment.from_file(audio_file_path)
    total_ms = len(audio)
    if total_ms == 0:
        # Zero chunks would otherwise end in a ZeroDivisionError below
        raise ValueError(f"Audio file contains no audio data: {audio_file_path}")

    # bytes -> bits, divided by bits per second, gives seconds; then to milliseconds
    max_chunk_duration_ms = ((self.MAX_SIZE * 8) / self.BITRATE) * 1000
    logging.info(f"Max chunk duration: {max_chunk_duration_ms} ms")
    num_chunks = ceil(total_ms / max_chunk_duration_ms)
    logging.info(f"Number of chunks: {num_chunks}")

    parts_directory = os.path.join(os.path.dirname(audio_file_path), "parts")
    os.makedirs(parts_directory, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio_file_path))[0]

    chunk_files = []
    for i in range(num_chunks):
        # Proportional boundaries cover the whole recording; a fixed
        # floor-divided chunk length would drop the trailing milliseconds.
        start_ms = (i * total_ms) // num_chunks
        end_ms = ((i + 1) * total_ms) // num_chunks
        chunk_name = os.path.join(parts_directory, f"{base_name}_part{i}.mp3")
        logging.info(f"Exporting audio chunk: {chunk_name}")
        audio[start_ms:end_ms].export(chunk_name, format="mp3")
        chunk_files.append(chunk_name)
    logging.info(f"Audio file split into {len(chunk_files)} parts.")
    return chunk_files
473
+
474
def video_to_audio(self, input_video, output_audio):
    """
    Extracts the audio track of a video and saves it as a 128k MP3 file.

    Nothing is done when the output file is already present.
    :param input_video: The path to the video file to process (read as mp4).
    :param output_audio: The path to the audio file to save the converted audio to.
    """
    # Guard clause: an earlier run already produced the audio
    if os.path.exists(output_audio):
        logging.info("Audio file already exists")
        return

    soundtrack = AudioSegment.from_file(input_video, "mp4")
    soundtrack.export(output_audio, format="mp3", bitrate="128k")
    logging.info(f"Audio file exported: {output_audio}")
486
+
487
def transcribe_audio_part(self, part):
    """
    Sends one audio file to the whisper-1 transcription endpoint and returns the result.

    :param part: The path to the audio file to process.
    :return: The transcription in the requested "vtt" response format.
    """
    logging.info(f"Transcribing audio part: {part}")
    # A `prompt` argument could be added here to give the model context and
    # domain-specific terms, e.g.
    # prompt="UCaaS, CPaaS, STaaS, DRaaS, BLF, CDR, CIM, GCCH, GVBM, HEPIC, SBC, PSTN, SMB, OrecX, Prov"
    request_options = {"model": "whisper-1", "response_format": "vtt"}
    with open(part, "rb") as audio_file:
        return self.client.audio.transcriptions.create(file=audio_file, **request_options)
503
+
504
def audio_to_transcript(self, input_audio):
    """
    Transcribes an audio file with whisper-1 and stores the result next to it.

    The output file is named ``<audio path without extension>_transcript.txt``.
    :param input_audio: The path to the audio file to process.
    :return: The path to the transcript file.
    """
    logging.info(f"Transcribing audio: {input_audio}")
    # A `prompt` argument could be added here to give the model context and
    # domain-specific terms, e.g.
    # prompt="UCaaS, CPaaS, STaaS, DRaaS, BLF, CDR, CIM, GCCH, GVBM, HEPIC, SBC, PSTN, SMB, OrecX, Prov"
    request_options = {"model": "whisper-1", "response_format": "vtt"}
    with open(input_audio, "rb") as audio_file:
        transcript = self.client.audio.transcriptions.create(file=audio_file, **request_options)
    logging.info("Transcript created")

    stem, _ = os.path.splitext(input_audio)
    output_file = stem + "_transcript.txt"

    # NOTE(review): the transcript is serialised through json.dump, so what is
    # stored is its JSON encoding rather than raw VTT text - confirm that
    # readers of this file expect that.
    with open(output_file, "w") as sink:
        json.dump(transcript, sink, indent=4)

    logging.info(f"Transcript saved to: {output_file}")
    return output_file
528
+
529
+
530
def process_and_save_output(self, base_name, prompt_key, transcript_content, additional_content=None, file_suffix=""):
    """
    Generates one LLM output for a transcript and writes it to ``<base_name>_<file_suffix>.txt``.

    If that file already exists, it is left untouched and no request is sent.
    :param base_name: The base name of the transcript file.
    :param prompt_key: The key of the prompt to use.
    :param transcript_content: The content of the transcript to process.
    :param additional_content: Optional dict with 'summary' and 'topic' entries; when given,
        the conversation is built from the summary prompt, the transcript, the summary and the topic.
    :param file_suffix: The suffix to use for the output file.
    :return: The path to the output file.
    """
    file_name = f"{base_name}_{file_suffix}.txt"

    # Reuse a result produced by an earlier run
    if os.path.exists(file_name):
        logging.info(f"{file_suffix.replace('_', ' ').capitalize()} file already exists: {file_name}")
        return file_name

    # Load the requested prompt, then assemble the conversation
    prompt = self.load_prompt(prompt_key)
    if additional_content:
        summary_prompt = self.load_prompt("summary_prompt")
        conversation_history = self.build_conversation_history(
            summary_prompt,
            transcript_content,
            additional_content['summary'],
            additional_content['topic'],
        )
    else:
        conversation_history = self.build_conversation_history(prompt, transcript_content)

    reply = self.send_conversation(conversation_history)
    raw_text = reply.choices[0].message.content

    # Trim every line, drop the empty ones, and separate the rest with a blank line
    trimmed = (line.strip() for line in raw_text.split('\n'))
    clean_content = '\n\n'.join(line for line in trimmed if line)

    with open(file_name, "w") as sink:
        sink.write(clean_content)
    logging.info(f"{file_suffix.replace('_', ' ').capitalize()} saved to: {file_name}")

    return file_name
565
+
566
def generate_transcript_outputs(self, transcript_file):
    """
    Produces every derived output for a transcript file: the overall summary, one
    summary per topic found in it, troubleshooting steps, a glossary, and tags and symptoms.

    :param transcript_file: The path to the transcript file to process.
    """
    with open(transcript_file, "r") as source:
        transcript_content = source.read()
    base_name = os.path.splitext(transcript_file)[0]

    # Overall summary first; the topic prompts are derived from its text
    self.process_and_save_output(base_name, "summary_prompt", transcript_content, file_suffix="summary")
    with open(f"{base_name}_summary.txt", "r") as source:
        summary_text = source.read()

    # Topic-specific summaries (topic extraction is always performed)
    for index, topic_prompt in enumerate(self.generate_topic_prompts(summary_text)):
        topic_context = {"summary": summary_text, "topic": topic_prompt}
        self.process_and_save_output(base_name, "summary_prompt", transcript_content, topic_context, file_suffix=f"topic{index}_summary")

    # Remaining single-prompt outputs, in the order they are generated
    single_outputs = (
        ("troubleshooting_prompt", "troubleshooting_steps"),
        ("glossary_prompt", "glossary"),
        ("tags_prompt", "tags_and_symptoms"),
    )
    for prompt_key, suffix in single_outputs:
        self.process_and_save_output(base_name, prompt_key, transcript_content, file_suffix=suffix)

    logging.info(f"Transcript outputs saved to: {os.path.splitext(transcript_file)[0]}")
599
+
600
+
601
def extract_topics(self, response_text):
    """
    Takes a response text and extracts the topic lines from it.

    Both "Topic X: Title" and "Topic X [Timestamp Range]: Title" are recognised.
    The second form is the layout the summary prompt asks the model to produce.
    :param response_text: The response text to process.
    :return: A list of the matched topic lines, in order of appearance.
    """
    # The optional bracketed group lets a timestamp range (which itself contains
    # colons) sit between the topic number and the ": " separator.
    pattern = r"Topic \d+(?: \[[^\]\n]*\])?: .+"
    return re.findall(pattern, response_text)
611
+
612
+
613
+ # OpenAI Functions
614
+
615
def generate_topic_prompts(self, response_text):
    """
    Builds one topic prompt per topic found in a response text.

    Each prompt is the "topic_prompt" template with every "[REPLACE_ME]"
    placeholder replaced by the topic.
    :param response_text: The response text to process.
    :return: A list of topic prompts, in the order the topics were found.
    """
    found_topics = self.extract_topics(response_text)
    template = self.load_prompt("topic_prompt")
    return [template.replace("[REPLACE_ME]", topic) for topic in found_topics]
628
+
629
+
630
def load_prompt(self, prompt_key):
    """
    Takes a prompt key and loads the prompt text from the file registered for it.

    :param prompt_key: The key of the prompt to load (looked up in ``self.prompts``).
    :return: The prompt content.
    :raises KeyError: If no prompt is registered under ``prompt_key``.
    """
    prompt_path = self.prompts[prompt_key]
    # Debug-level logging instead of print keeps stdout clean
    logging.debug(f"Loading prompt from: {prompt_path}")
    # Explicit encoding so the prompts read the same regardless of platform locale
    with open(prompt_path, 'r', encoding="utf-8") as file:
        return file.read()
640
+
641
+
642
def send_conversation(self, conversation_history):
    """
    Takes a conversation history and sends it to the OpenAI API to generate a response.

    The request goes through ``self.client``, the same client the transcription
    methods use, so the API key given to this instance is the one that is used.
    NOTE(review): assumes ``self.client`` is an OpenAI v1 client exposing
    ``chat.completions`` - confirm against ``__init__``.

    :param conversation_history: The conversation history (list of role/content messages) to send.
    :return: The response from the LLM
    """
    response = self.client.chat.completions.create(
        model="gpt-4-1106-preview",
        # model="gpt-3.5-turbo-1106",
        messages=conversation_history,
        max_tokens=4096,
        # Zero temperature keeps the output as repeatable as the API allows
        temperature=0.00,
    )
    return response
656
+
657
+
658
def build_conversation_history(self, system_prompt, user_prompt1, assistant_response=None, user_prompt2=None):
    """
    Assembles the list of chat messages for a one-turn or two-turn conversation.

    :param system_prompt: The system prompt to use.
    :param user_prompt1: The first user prompt to use.
    :param assistant_response: The assistant response to use (optional).
    :param user_prompt2: The second user prompt to use (optional).
    :return: The conversation history as a list of {"role", "content"} dicts.
    :raises ValueError: If only one of ``assistant_response`` / ``user_prompt2`` is given.
    """
    # The two optional arguments only make sense as a pair
    if (assistant_response is None) != (user_prompt2 is None):
        raise ValueError("Both 'assistant_response' and 'user_prompt2' must be provided together or not at all.")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt1},
    ]
    if assistant_response is not None:
        messages.extend([
            {"role": "assistant", "content": assistant_response},
            {"role": "user", "content": user_prompt2},
        ])
    return messages
681
+
682
def generate_article(self, input_file):
    """
    Generates an article from the contents of an input file.

    The file text is sent as the user message alongside the "article_prompt"
    system prompt. The reply is returned with each line trimmed, blank lines
    removed, and the remaining lines separated by a blank line.
    :param input_file: The path to the input file to process.
    :return: The article text.
    """
    article_prompt = self.load_prompt("article_prompt")
    with open(input_file, "r") as source:
        source_text = source.read()

    conversation = self.build_conversation_history(article_prompt, source_text)
    reply = self.send_conversation(conversation)
    raw_text = reply.choices[0].message.content

    trimmed = (line.strip() for line in raw_text.split('\n'))
    return '\n\n'.join(line for line in trimmed if line)
697
+
698
def process_articles(self, input_path):
    """
    Generates an article for every summary and troubleshooting-steps file in a folder.

    Files ending in "_summary.txt" or "_troubleshooting_steps.txt" are processed;
    each article is written beside its input as ``<input name>_article.txt``.
    :param input_path: The path to the folder containing input files to process.
    """
    logging.info(f"Processing article inputs in folder: {input_path}")
    article_sources = ("_summary.txt", "_troubleshooting_steps.txt")
    for filename in tqdm(os.listdir(input_path), desc="Processing Files"):
        # Skip anything that is not an article input
        if not filename.endswith(article_sources):
            continue
        logging.info(f"Processing article input: {filename}")
        article = self.generate_article(os.path.join(input_path, filename))
        stem = os.path.splitext(filename)[0]
        output_file = os.path.join(input_path, f"{stem}_article.txt")
        with open(output_file, "w") as sink:
            sink.write(article)
        logging.info(f"Article saved to: {output_file}")
713
+
714
+ # Everything below was added to adapt the existing script to run as a self-service tool,
715
+ # up to the load_dotenv() call.
716
+
717
+
718
def get_drive_service(self):
    """
    Builds a Google Drive v3 service client from a service-account key file.

    The key file 'generate-transcripts.json' is read from the directory that
    contains this module.
    :return: The Drive service object returned by ``build``.
    """
    drive_scopes = ['https://www.googleapis.com/auth/drive']
    module_dir = os.path.dirname(__file__)
    key_file = os.path.join(module_dir, 'generate-transcripts.json')

    credentials = service_account.Credentials.from_service_account_file(key_file, scopes=drive_scopes)
    return build('drive', 'v3', credentials=credentials)
726
+
727
def extract_drive_folder_id(self, drive_link):
    """
    Pulls the folder ID out of a Google Drive folder link.

    The ID is the text after "folders/" up to the next "/" or "?".
    Other Drive link formats are not handled.
    :param drive_link: The Google Drive folder link.
    :return: The folder ID.
    :raises ValueError: If the link contains no "folders/<id>" segment.
    """
    found = re.search(r'folders/([^/?]+)', drive_link)
    if found is None:
        raise ValueError("Invalid Google Drive folder link.")
    return found.group(1)
734
+
735
def list_files_in_folder(self, service, folder_id):
    """
    Lists all non-trashed files directly inside a Google Drive folder.

    Results are fetched page by page (100 per request) until the API stops
    returning a ``nextPageToken``, so folders with more than 100 entries are
    returned in full.
    :param service: Authenticated Google Drive service instance.
    :param folder_id: The ID of the Drive folder to list.
    :return: A list of dicts with the 'id' and 'name' of each file.
    """
    files = []
    page_token = None
    while True:
        response = service.files().list(
            q=f"'{folder_id}' in parents and trashed=false",
            pageSize=100,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token).execute()
        files.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        # No token means this was the last page
        if not page_token:
            return files
741
+
742
def download_file(self, service, file_id, file_path):
    """
    Downloads a Google Drive file to a local path, creating the target directory if needed.

    :param service: Authenticated Google Drive service instance.
    :param file_id: The ID of the Drive file to download.
    :param file_path: The local path to write the file to.
    """
    # Ensure the directory where the file will be saved exists. A bare file name
    # has an empty dirname, and os.makedirs('') would raise, so skip it then.
    target_dir = os.path.dirname(file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    request = service.files().get_media(fileId=file_id)
    with open(file_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        # next_chunk() returns (status, done); the progress status is not used
        while not done:
            _, done = downloader.next_chunk()
752
+
753
def find_or_create_drive_folder(self, service, folder_name, parent_folder_id):
    """
    Returns the ID of a named subfolder of a Drive folder, creating the subfolder if it is missing.

    :param service: Authenticated Google Drive service instance.
    :param folder_name: The name of the folder to find or create.
    :param parent_folder_id: The ID of the Drive folder that should contain it.
    :return: The ID of the existing (first match) or newly created folder.
    """
    # Backslashes and single quotes must be escaped inside a Drive query string
    # literal; otherwise a name such as "Bob's KT" produces a malformed query.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")

    # Check if folder exists
    query = f"mimeType='application/vnd.google-apps.folder' and name='{escaped_name}' and '{parent_folder_id}' in parents and trashed=false"
    response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = response.get('files', [])
    if files:
        # Folder exists, return its ID
        return files[0]['id']

    # Folder doesn't exist, create it (the metadata carries the unescaped name)
    folder_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_folder_id]
    }
    folder = service.files().create(body=folder_metadata, fields='id').execute()
    return folder.get('id')
770
+
771
def upload_file(self, service, file_path, drive_folder_id):
    """
    Uploads a local file into a Google Drive folder under its own base name.

    :param service: Authenticated Google Drive service instance.
    :param file_path: The path of the local file to upload.
    :param drive_folder_id: The ID of the Drive folder to upload into.
    """
    metadata = {
        'name': os.path.basename(file_path),
        'parents': [drive_folder_id],
    }
    payload = MediaFileUpload(file_path, resumable=True)
    created = service.files().create(body=metadata, media_body=payload, fields='id').execute()
    logging.info(f"Uploaded {file_path} to Google Drive with ID {created.get('id')}")
776
+
777
def sync_folder_to_drive(self, service, local_folder_path, drive_parent_folder_id, is_root=True):
    """
    Mirrors a local folder tree into Google Drive.

    At the root, all files directly in the folder are uploaded first and the
    subfolders are processed afterwards. In subfolders, files and directories
    are handled in a single pass in listing order.

    :param service: Authenticated Google Drive service instance.
    :param local_folder_path: Path to the local folder to sync.
    :param drive_parent_folder_id: The Google Drive folder ID to sync with.
    :param is_root: Boolean indicating if the current folder is the root of the sync operation.
    """
    # Root pass: upload the top-level files (e.g. 'processing.log') first
    if is_root:
        for entry in os.listdir(local_folder_path):
            entry_path = os.path.join(local_folder_path, entry)
            if os.path.isfile(entry_path):
                self.upload_file(service, entry_path, drive_parent_folder_id)

    # Main pass: recurse into directories; below the root, also upload files
    for entry in os.listdir(local_folder_path):
        entry_path = os.path.join(local_folder_path, entry)
        if os.path.isdir(entry_path):
            # Mirror the directory on Drive, then sync its contents into it
            subfolder_id = self.find_or_create_drive_folder(service, entry, drive_parent_folder_id)
            self.sync_folder_to_drive(service, entry_path, subfolder_id, is_root=False)
            continue
        # Root files were already uploaded by the pass above
        if not is_root and os.path.isfile(entry_path):
            self.upload_file(service, entry_path, drive_parent_folder_id)
805
+
806
def cleanup_input_folder(self, folder_path):
    """
    Empties a folder by deleting every file, link and subdirectory inside it.

    As a safety measure nothing is deleted unless the path contains the text
    "Input-Folder". Failures on individual items are logged and do not stop
    the clean-up.

    :param folder_path: Path to the folder to clean up.
    """
    # Safety check to prevent accidental deletion of unintended directories
    if "Input-Folder" not in folder_path:
        logging.error("Safety check failed. The folder path does not seem to be correct.")
        return

    for entry in os.listdir(folder_path):
        item_path = os.path.join(folder_path, entry)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                # Plain files and symlinks are removed directly
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                # Directories are removed together with their contents
                shutil.rmtree(item_path)
            logging.info(f"Deleted {item_path}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next item
            logging.error(f"Failed to delete {item_path}. Reason: {e}")
829
+
830
+
831
+
832
+
833
+ # The code above was newly added.
834
# Load environment variables and API key via .env file.
# This runs at import time, so importing the module also reads the .env file.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set

# Example usage: process one input folder when the module is run as a script.
if __name__ == "__main__":
    # NOTE(review): `args` is not defined in the visible part of the file;
    # presumably it is a parsed command line created earlier - confirm.
    input_folder_path = os.path.abspath(args.input_folder)
    transcribe = args.transcribe
    # Assigned but not used below.
    extract_topics = args.topic
    processor = KnowledgeTranscriber(api_key)
    processor.process_folder(input_folder_path, transcribe)
cs_ai_kt_transcribe_share/prompts/1-summary_prompt.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are analyzing a technical transcript from an engineer's video recording of a Knowledge Transfer session. The transcript includes timestamps and occasional references to visual elements displayed on-screen. Your objectives are:
2
+
3
+ Comprehensive Understanding: Thoroughly review the entire transcript, paying close attention to technical details and the context provided by the engineer.
4
+ Topic Segmentation: Identify and separate different topics discussed in the video. Use the timestamps to accurately delineate when each topic starts and ends.
5
+ Detailed Summaries with Visual Cues: For each identified topic, provide a detailed summary. Include the following elements:
6
+
7
+ Timestamps: Clearly mention the timestamps at the beginning of each topic summary.
8
+ Visual References: Highlight any references made by the speaker to the on-screen content. Use phrases like "As shown on the screen at [timestamp]", or "Referring to the display at [timestamp]" to make these moments stand out.
9
+ Technical Accuracy: Ensure that technical details are accurately captured and clearly explained, suitable for use by another engineer.
10
+ Final Format:
11
+
12
+ Video High-Level Overview: Provide a unified summary of the entire transcript, including overarching themes or objectives.
13
+
14
+ Topic Segmentation:
15
+
16
+ Topic 1 [Timestamp Range]: [Topic-specific, detailed summary with visual cues]
17
+ Topic 2 [Timestamp Range]: [Topic-specific, detailed summary with visual cues]
18
+ Continue for each identified topic, maintaining this format.
19
+ Remember, the goal is to create a summary that is both technically comprehensive and easily navigable, with clear references to visual elements and timestamps for effective cross-referencing with video content.
cs_ai_kt_transcribe_share/prompts/2-topic_prompt.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are analyzing a specific segment of a technical transcript from an engineer's video recording. This segment is focused on the topic of [REPLACE_ME]. The transcript includes timestamps which are crucial for this task. Your objectives are:
2
+
3
+ Comprehensive Review with Timestamps: Read the entire transcript thoroughly, paying special attention to the timestamps that correlate with different stages of the troubleshooting process.
4
+
5
+ Dual-Level Summarization with Timestamp References:
6
+
7
+ High-Level Summary with Timestamps: Provide a general overview of the topic of [REPLACE_ME], highlighting key moments in the process. Incorporate timestamps to reference significant points or changes in the troubleshooting procedure. This summary should be understandable to those not deeply versed in the technical details.
8
+ Detailed Technical Overview with Timestamps: Create a step-by-step technical guide detailing the procedures performed in relation to the topic of [REPLACE_ME]. Ensure this includes specific timestamps for each major step or instruction mentioned in the transcript. This detailed guide should serve as a comprehensive technical reference for engineers.
9
+ Format of the Summary with Timestamps:
10
+
11
+ Topic: [REPLACE_ME]
12
+ High-Level Summary: [Non-technical summary with key timestamps highlighted]
13
+ Technical Overview: [In-depth technical guide with specific timestamps for each major step or instruction]
14
+ Your goal is to produce summaries that are not only informative at a general level but also provide detailed technical guidance, with timestamps serving as a navigational tool to correlate the text with specific segments of the video.
cs_ai_kt_transcribe_share/prompts/3-troubleshooting_prompt.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are analyzing a technical transcript from an engineer's Knowledge Transfer session video recording. The transcript may cover multiple topics, and your focus is on identifying and detailing the troubleshooting process. Your objectives are:
2
+
3
+ In-Depth Transcript Review: Thoroughly read the entire transcript to fully grasp the content and context of the recording. Pay attention to technical specifics and nuances in the engineer's explanation.
4
+
5
+ Detailed Troubleshooting Overview: Generate a comprehensive overview of the troubleshooting process discussed in the transcript. This should include:
6
+
7
+ Abstract Overview: Provide an initial abstract of the problem being addressed. Include the technical context and any critical background information.
8
+ Timestamp-Referenced Troubleshooting Steps: Enumerate the steps taken to troubleshoot the problem, ensuring each step is detailed and technically accurate. Reference timestamps wherever possible to indicate when in the video each step is discussed or demonstrated.
9
+ Emphasis on Technical Precision and Clarity:
10
+
11
+ Avoid vague explanations; strive for clarity and specificity in each step.
12
+ Include exact tool names, error codes, system messages, or any relevant technical details as demonstrated or mentioned in the video.
13
+ Output Template:
14
+
15
+ Abstract Overview:
16
+ [Abstract of the issue, including any relevant technical context and background]
17
+
18
+ Troubleshooting Steps:
19
+
20
+ [Step Name/Summary] [Timestamp]
21
+ [Detailed guidance necessary to fulfil the step, including specific actions, tool names, error codes, etc.]
22
+ Continue this format for each troubleshooting step identified in the transcript.
23
+
24
+ Your goal is to create a guide that is both technically comprehensive and easy to follow, providing actionable steps for engineers to replicate the troubleshooting process effectively.
cs_ai_kt_transcribe_share/prompts/4-glossary_prompt.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ You are an expert at Influitive, the Customer Advocacy platform. You are reading a technical transcript of an engineer's video recording of a Knowledge Transfer session. There may be more than one topic discussed within the video, and your goal will be the following:
2
+
3
+ 1. Review the entire transcript to ensure you fully understand the recording.
4
+ 2. Generate a complete glossary of all technical terminology and acronyms identified within the transcript relevant to the issue or its solution.
5
+
6
+ Please include as much technical detail on the actions taken as possible.
cs_ai_kt_transcribe_share/prompts/5-tags_prompt.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ You are an expert at Influitive, the Customer Advocacy platform. You are reading a technical transcript of an engineer's video recording of a Knowledge Transfer session. There may be more than one topic discussed within the video, and your goal will be the following:
2
+
3
+ 1. Review the entire transcript to ensure you fully understand the recording.
4
+ 2. Generate a word cloud tag collection that best represents the discussed issue. Limit this to only the most relevant items, and exclude specific tools or generic things like "error message". Focus on the subject, features, and symptoms related to the topic.
5
+ 3. Generate a reverse-engineered list of symptoms that a customer might report related to this issue that the generated troubleshooting steps might be used to address.
6
+
7
+ Please include as much technical detail as possible.
cs_ai_kt_transcribe_share/prompts/6-article_prompt.txt ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert at transforming text content from legacy support articles into a new standard HTML format. Your task is to meticulously convert the provided legacy content into our ideal template format. The output must be in detailed HTML code, maintaining all original details.
2
+
3
+ Please demonstrate how you would transform a section of a hypothetical legacy article into our desired template. Do not summarize or omit any technical details. The transformation should retain the full depth of the original content in HTML format. For example, convert the following legacy section into the new template format. Remember to include all tags, attributes, and text as found in the original.
4
+
5
+ In addition to fitting the legacy content within the "Information" section, please generate the following and include them in their appropriate location:
6
+
7
+ 1. Tags relevant to the content in the article.
8
+ 2. A short article Overview.
9
+ 3. A selection of frequently asked questions based on the article content.
10
+
11
+ Remember, do not include any markdown wrappers with your answer like ```html; only provide the edited HTML code.
12
+
13
+ Here is an example of the final template:
14
+
15
+ <h1 id="h_01HGKSMPRSDHAY1F7ET44QWYF6" class="title">Example Article Title</h1>
16
+ <p>
17
+ <strong>Tags:&nbsp;</strong>Topic, Domain, Issue, Solution, Feature
18
+ </p>
19
+ <p>&nbsp;</p>
20
+ <h1 id="h_01HGKSMPRS9JVGC6E9EA3DB98K">Overview</h1>
21
+ <p>
22
+ This is an example of an article overview, which details the content of the article
23
+ to help human users and LLMs quickly identify whether the content is relevant
24
+ to the topic they want to review.
25
+ </p>
26
+ <p>&nbsp;</p>
27
+ <h1 id="h_01HGKSMPRSYYE7GMM7DZ0VCKXK">Information</h1>
28
+ <p>
29
+ This is a short description of the focus of the steps to follow and contains
30
+ a simple table of contents linking to the sub-sections for longer-form articles.
31
+ </p>
32
+ <ul>
33
+ <li>
34
+ <a href="#h_01HGEGEACR3P9G6ANV7YMZSSW5">Topic 1</a>
35
+ </li>
36
+ <li>
37
+ <a href="#h_01HGEGECZFCZXP3WAEE6M0AFF4">Topic 2</a>
38
+ </li>
39
+ </ul>
40
+ <p>&nbsp;</p>
41
+ <h3 id="h_01HGEGEACR3P9G6ANV7YMZSSW5">Topic 1</h3>
42
+ <p>
43
+ This is an example of a topic that contains some details about a topic rather
44
+ than any specific process to follow. These topics will provide some information
45
+ to the reader.
46
+ </p>
47
+ <p>&nbsp;</p>
48
+ <h3 id="h_01HGEGECZFCZXP3WAEE6M0AFF4">Topic 2</h3>
49
+ <p>
50
+ This is an example of a topic that has step-by-step guidance.
51
+ </p>
52
+ <p>&nbsp;</p>
53
+ <h4 id="h_01HGKSMPRSTG6RVHMWK6RX7NSM">Step 1</h4>
54
+ <p class="wysiwyg-indent1">
55
+ This is a more detailed description of the actions needed to fulfil the step.
56
+ </p>
57
+ <h4 id="h_01HGKSMPRS0X842R2AG487JGJW">Step 2</h4>
58
+ <p class="wysiwyg-indent1">
59
+ This is a more detailed description of the actions needed to fulfil the step.
60
+ </p>
61
+ <h4 id="h_01HGKSMPRSA7K6PVR6J2VZ1FNS">Step 3</h4>
62
+ <p class="wysiwyg-indent1">
63
+ This is a more detailed description of the actions needed to fulfil the step.
64
+ </p>
65
+ <p>&nbsp;</p>
66
+ <h1 id="h_01HGKSMPRSER9ENY5QWWRZZ863">FAQ</h1>
67
+ <h3 id="h_01HGKSMPRSHXQ9M5E03BY1Z6BS">Question 1</h3>
68
+ <p class="wysiwyg-indent1">This is the answer to the question.</p>
69
+ <h3 id="h_01HGKSMPRSEYMTYT6PNHG2J1HD">Question 2</h3>
70
+ <p class="wysiwyg-indent1">This is the answer to the question.</p>
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ api_key,drive_link,output,flag,username,timestamp
2
+ ,,"{""label"": null, ""confidences"": null}",,,2024-04-04 17:06:24.671339
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ ffprobe
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.6.0
2
+ anyio==4.2.0
3
+ certifi==2023.11.17
4
+ charset-normalizer==3.3.2
5
+ decorator==4.4.2
6
+ distro==1.9.0
7
+ h11==0.14.0
8
+ httpcore==1.0.2
9
+ httpx==0.26.0
10
+ idna==3.6
11
+ imageio==2.33.1
12
+ imageio-ffmpeg==0.4.9
13
+ moviepy==1.0.3
14
+ numpy==1.26.3
15
+ openai==1.8.0
16
+ pillow==10.2.0
17
+ proglog==0.1.10
18
+ pydantic==2.5.3
19
+ pydantic_core==2.14.6
20
+ pydub==0.25.1
21
+ python-dotenv==1.0.0
22
+ requests==2.31.0
23
+ sniffio==1.3.0
24
+ tqdm==4.66.1
25
+ typing_extensions==4.9.0
26
+ urllib3==2.1.0