Initial commit
- __pycache__/app.cpython-37.pyc +0 -0
- app.py +30 -0
- cs_ai_kt_transcribe_share/.DS_Store +0 -0
- cs_ai_kt_transcribe_share/.env +1 -0
- cs_ai_kt_transcribe_share/.gitignore +160 -0
- cs_ai_kt_transcribe_share/README.md +103 -0
- cs_ai_kt_transcribe_share/__init__.py +0 -0
- cs_ai_kt_transcribe_share/__pycache__/__init__.cpython-312.pyc +0 -0
- cs_ai_kt_transcribe_share/__pycache__/kt_transcript.cpython-312.pyc +0 -0
- cs_ai_kt_transcribe_share/generate-transcripts.json +13 -0
- cs_ai_kt_transcribe_share/kt_transcript.py +844 -0
- cs_ai_kt_transcribe_share/prompts/1-summary_prompt.txt +19 -0
- cs_ai_kt_transcribe_share/prompts/2-topic_prompt.txt +14 -0
- cs_ai_kt_transcribe_share/prompts/3-troubleshooting_prompt.txt +24 -0
- cs_ai_kt_transcribe_share/prompts/4-glossary_prompt.txt +6 -0
- cs_ai_kt_transcribe_share/prompts/5-tags_prompt.txt +7 -0
- cs_ai_kt_transcribe_share/prompts/6-article_prompt.txt +70 -0
- flagged/log.csv +2 -0
- packages.txt +2 -0
- requirements.txt +26 -0
__pycache__/app.cpython-37.pyc
ADDED
Binary file (395 Bytes).
app.py
ADDED
@@ -0,0 +1,30 @@
import gradio as gr
from cs_ai_kt_transcribe_share.kt_transcript import KnowledgeTranscriber

def process_inputs(api_key, drive_folder_link):
    try:
        # Assuming drive_link is the folder_path and transcribe_flag is derived or fixed
        folder_path = drive_folder_link  # Example adjustment, actual implementation may vary
        transcribe_flag = True  # Example, adjust based on actual use case
        kt = KnowledgeTranscriber(api_key)  # Create an instance of KnowledgeTranscriber
        kt.process_folder('scripts/ktTranscript/cs_ai_kt_transcribe_share/Input-Folder', True, drive_folder_link)  # Call the method
        return "Success"
    except ValueError as e:
        return str(e)  # Return the exception message as the result

demo = gr.Interface(
    process_inputs,
    [
        gr.Textbox(
            label="OpenAI Key",
            info="Enter your OpenAI API Key here e.g., sk-xxxxxxxxxxxxxx"
        ),
        gr.Textbox(
            label="Drive Folder Link",
            info="Enter your Drive Folder Link here (it must be public) e.g., https://drive.google.com/drive/folders/your-folder-id"
        ),
    ],
    gr.Textbox(label="Result"),
    theme=gr.themes.Base()
)
demo.launch()  # Share your demo with just 1 extra parameter 🚀
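The trailing comment is Gradio's template hint about public sharing; as a minimal sketch of what it refers to (Gradio's documented `share` flag, which this commit does not actually set):

```
demo.launch(share=True)  # also serves a temporary public *.gradio.live link in addition to localhost
```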
cs_ai_kt_transcribe_share/.DS_Store
ADDED
Binary file (6.15 kB).
cs_ai_kt_transcribe_share/.env
ADDED
@@ -0,0 +1 @@
OPENAI_API_KEY=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
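This key is consumed at runtime via python-dotenv (imported in kt_transcript.py); a minimal sketch of that standard pattern, not the project's exact code:

```
import os
from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from a .env file into the process environment
api_key = os.getenv("OPENAI_API_KEY")  # None if the variable is missing
```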
cs_ai_kt_transcribe_share/.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
cs_ai_kt_transcribe_share/README.md
ADDED
@@ -0,0 +1,103 @@
# CSAI Knowledge Aggregator

## Introduction
With various central teams routinely needing to record, attend, and then process KT recording sessions, or to parse through folders of documents, PDFs, and the like, there was a clear need to simplify and automate the gathering of knowledge and to generate value more expediently.

Created for personal use, this has been split out for other interested parties to make use of. In the current version on the 'share' branch, this tool is tailored specifically for transcribing mp4 KT recordings (or utilizing existing transcripts provided by various video platforms such as Zoom or Loom) to parse out various knowledge outputs and create KB articles. While initially focused on Central Support knowledge capture, with some minor adjustments it can be tailored for other applications.

With a long-term goal of generalized content capture and curation, certain outputs may not be relevant for all use cases. Some parameterization has already been implemented but can be further adjusted.

[Ideal KT Input Guidance Runbook](https://docs.google.com/document/d/1uw-nBHeE0egGvlqlFvtl8bNu2x6zCIkAVYH85h38cIk/edit)

[Example Outputs](https://drive.google.com/drive/u/0/folders/1nB0ORAuOWca5zuWAVo8TXfuxnwzi09yR)


### Current Outputs

- High-level Summary
- Topic-Specific Summaries
- Glossary
- Troubleshooting Steps
- Word Cloud and Matching Symptoms
- KB Article for each Summary and the Troubleshooting Steps
- Screenshots of any captured Timestamps in the Summary/Troubleshooting Steps

**Note**: A processing.log file is also generated in the working directory.

## Prerequisites
- Python 3.11
- [ffmpeg](https://www.ffmpeg.org/) - Prerequisite for Pydub's AV manipulation.

## Installation

### Clone the Repository
```
git clone -b share --single-branch https://github.com/trilogy-group/cs-ai-kt-transcribe.git
```

### Set up the Python Environment
Pick your poison:

```
pyenv virtualenv [env_name]
pyenv activate [env_name]
```
```
python3 -m venv [env_name]
source ./bin/activate
```

### Installing Dependencies

From your primary venv directory:

```
./bin/python -m pip install -r requirements.txt
```

### Generate and Populate the .env File

Within the primary venv directory, create a file named '.env' and populate it with the content below, replacing the placeholder with your OpenAI API key:

```
OPENAI_API_KEY=[YOUR_API_KEY]
```

## Usage

### Basic Usage

Topic and Transcribe are optional parameters that can be passed in to handle two special cases: long-form multi-topic videos, and skipping transcription.

By default, the script assumes you are providing video content (.mp4 format) in the input directory for a single topic that requires transcribing. Each video (or transcript, if the optional flag is set to False) within the provided input directory is processed in sequence. A folder is generated matching the video or transcript file's name, and the various outputs are placed within it. Audio/video precursor artefacts are placed within a generated "Processed" folder.

Once the basic setup above is completed, an input directory can be created to store the videos/transcripts to process. Then, run the script using the form below:

```
kt-transcript.py [--topic [TOPIC]] [--transcribe [TRANSCRIBE]] [input_folder]
```

#### Example Usage
```
./bin/python cs-ai-kt-transcribe/kt-transcript.py --topic True --transcribe True ./Input-Folder
```


### Arguments

```
positional arguments:
  input_folder    The folder containing videos/transcripts to process relative to the current working directory.

options:
  --topic         If set to True, will generate topic-specific summaries in addition to the high-level summary.
  --transcribe    If set to False, will skip transcribing and leverage an existing '*_full_transcript.txt' file to generate outputs.
```

### Customizing Outputs

Within the prompts directory in your pyenv, you will find a selection of prompt files that can be tweaked and adjusted to alter the final behaviour of the LLM processing. The prompts provided are tailored for Kandy, a VoIP telephony product. While this has limited impact on its ability to parse other content, specialising the Persona segment of the prompt for a particular skillset does produce higher-quality results.

While the specific content of the videos being parsed will likely determine the ideal use case, the topic prompt can be altered to provide more targeted/specialised summaries. Note that the "[REPLACE_ME]" placeholder within the topic prompt is handled by the topic processing logic and is not intended to be manually replaced before running; the identified topics are substituted at runtime.

If the transcription element is leveraged and you encounter certain terminology/acronyms not being captured properly, you can seed the prompt to improve outputs: [OpenAI Whisper Docs](https://platform.openai.com/docs/guides/speech-to-text/prompting)
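As a minimal sketch of that seeding (mirroring the commented-out `prompt` argument in `kt_transcript.py`; the term list is illustrative):

```
transcript = client.audio.transcriptions.create(
    model="whisper-1",
    file=audio_file,
    response_format="vtt",
    prompt="UCaaS, CPaaS, BLF, CDR, SBC, PSTN",  # seed terms so Whisper spells domain acronyms consistently
)
```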
cs_ai_kt_transcribe_share/__init__.py
ADDED
File without changes
cs_ai_kt_transcribe_share/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (197 Bytes).
cs_ai_kt_transcribe_share/__pycache__/kt_transcript.cpython-312.pyc
ADDED
Binary file (48.3 kB).
cs_ai_kt_transcribe_share/generate-transcripts.json
ADDED
@@ -0,0 +1,13 @@
{
  "type": "service_account",
  "project_id": "generate-transcripts",
  "private_key_id": "e3077532c8a4559587d123bc6e08dd0fae4e88cf",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQDxruvw5+5RF25s\n6MFEGU1N+cRuETi7c0/QW4/P7fpGYgeHNjxxAyjUEM0nXvokFYYWCs+X/BQlxvIG\nTAZRYy8Arq8XkDqo84Nqt4Db5G2zbEz0Hy9OQFMsp+Z3rfy096IplZliQXG/knXB\nWlQKHAFmVJdwvGrS/vDVudGraOvJlWV+KbUlhVUnlvtY9UKwMmkWdy/JNPms75A2\nR3P8WTVQ+tIN+8jhB7jniIG4v7E4pN9ahgLKkVG3cycdOurQ6RahHDUDrW0aiQlx\nZklvbFJ3FYS+8quXxLzy3qh8hvB+VS59Sf+M7Hsmy1TIKP9JTdGxer2htMCxPF+M\nVaPIO2u7AgMBAAECggEADx4thRrE9g6cqjRfDob0pnmGt4ndPiCG8BYiJ1QQEUIF\njYgvfLRBZA6O/rT6ubIc+pj/hlbO3dQVhJ5R0Np+8LB8pGExwirFYDINXfPqhswx\n/h1LRL5SVZtFF9xqz9kjrC0+9H36gwzCz+76Dc3VqpSFCpjGp98pqnzeqDW+dhQC\nsFHbH7ZE8uOC0KhcZJEVoxBYHKUAqBr7KvyCwpkrJR+MgCuXYeyxK1FkZsZdV0/P\n7bU+xH2lYNH0zSorX6B0oen4Weu5hWTg/8GJt1TFht7RQiVpZQDRf56fxw2e9zUi\nBee+sQiOowkyBlcY54bRjDqHv54i06884fQpKjlg3QKBgQD4/EThybaUHZM2WvWg\npNv9+frCheg0tE/+zw4pCJHAOzHI9aFPTkCh4Ha5D3Lfg6mNL8xOnDX73TeL0Rkz\nHALAIw3KFUi5Ol5+QH2A5hmOT64nyBgUPQi1cDic1COjJOQ1YhhDSfqkguUwW+SW\nvPCtF/wZNoVKzF/kBzjqfQ7bRwKBgQD4ffz2+u2ntQcBBisHvj+k1XxCfNZaDkig\nDIpFS0V1oP97M+L63j1jpiYAulzUWZTcvBcgIvc12cZbAELH3KBhpbQacb7BG18x\nYmVdusgkwAB5aTbahWdKBReMeV818U4tBznQ8feSqUemH1NJYTMEn7wHWyzPXClZ\noXXT5zG97QKBgHAluKT2pU07sQYnnEFaYFiaUy9w/zFWEj4UZy1mVbf/MaJxuNGu\nzn/bSb57TeONOz6DzAOQauRiNNW+Php+QGLopxph0drUhqFZTO5VNieajjgO8YGU\nKCsJPAavqsmeIUhGQeV1GfclmjPZADJBMirVBdEZdJpHSd28vw8nexybAoGAGjv6\n4/xT8Nu81I/ZJSUhxP5A8ygI3VMaucL5Kh2rRKLttg0hToj+BDI7WuJSAJ1AiRlW\ne68VWFsYhOz/H+icChMbc5tEn1CLeURBHWcl0DNxGWBRSTrCT98/wDXkJmbGcRyN\nLxEMqTAz/0bZXSrk3v0aNdCbZ3z8ZJkWPnvvFXUCgYAhNt1ZOPp3hJRpaBtvK/cR\nODVWaTv2ZxORU+tbf+Uv9My3Ac2Wsrpzyg2V4DZPyojSyABoH0fnvn+f5ntX55pX\nYoWsHd9SAYQD8km6SuMcwHwTg1FC59djVImlLDQyPhfUUSQNb47iS0cRgfPnR54M\ny8BCLcgxqBbeIrkrXQKjbA==\n-----END PRIVATE KEY-----\n",
  "client_email": "generate-transcripts@generate-transcripts.iam.gserviceaccount.com",
  "client_id": "114541892332257991546",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/generate-transcripts%40generate-transcripts.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}
cs_ai_kt_transcribe_share/kt_transcript.py
ADDED
@@ -0,0 +1,844 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import openai
|
4 |
+
import logging
|
5 |
+
import json
|
6 |
+
import shutil
|
7 |
+
import argparse
|
8 |
+
import numpy as np
|
9 |
+
from moviepy.editor import VideoFileClip
|
10 |
+
from PIL import Image
|
11 |
+
from datetime import datetime, timedelta
|
12 |
+
from math import ceil
|
13 |
+
from dotenv import load_dotenv
|
14 |
+
from pydub import AudioSegment
|
15 |
+
from tqdm import tqdm
|
16 |
+
from google.oauth2 import service_account
|
17 |
+
from googleapiclient.discovery import build
|
18 |
+
from googleapiclient.http import MediaFileUpload
|
19 |
+
from googleapiclient.http import MediaIoBaseDownload
|
20 |
+
import shutil
|
21 |
+
|
22 |
+
|
23 |
+
parser = argparse.ArgumentParser(description='Process KT videos in a given folder to generate transcripts and summaries of what was discussed.')
|
24 |
+
parser.add_argument(
|
25 |
+
'input_folder',
|
26 |
+
nargs='?', # Optional
|
27 |
+
default='.', # Use the current working directory if no folder is specified
|
28 |
+
help='The folder containing videos to process relative to the current working directory.'
|
29 |
+
)
|
30 |
+
parser.add_argument(
|
31 |
+
'--topic',
|
32 |
+
nargs='?', # Optional
|
33 |
+
default=False,
|
34 |
+
help='If set to True, will generate topic-specific summaries in addition to the high-level summary.'
|
35 |
+
)
|
36 |
+
parser.add_argument(
|
37 |
+
'--transcribe',
|
38 |
+
nargs='?', # Optional
|
39 |
+
default=True,
|
40 |
+
help='If set to False, will skip transcribing and leverage an existing _full_transcript.txt file to generate outputs.'
|
41 |
+
)
|
42 |
+
args = parser.parse_args()
|
43 |
+
|
44 |
+
log_file_path = os.path.join(os.path.abspath(args.input_folder), "processing.log")
|
45 |
+
logging.basicConfig(
|
46 |
+
level=logging.INFO,
|
47 |
+
handlers=[
|
48 |
+
logging.StreamHandler(),
|
49 |
+
logging.FileHandler(log_file_path, mode='a')
|
50 |
+
],
|
51 |
+
format='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s',
|
52 |
+
datefmt='%Y-%m-%d %H:%M:%S'
|
53 |
+
)
|
54 |
+
|
55 |
+
|
56 |
+
class KnowledgeTranscriber(object):
|
57 |
+
MAX_SIZE = 5000000 # 5 MB
|
58 |
+
MAX_SIZE_MB = MAX_SIZE / (1024 * 1024) # Convert bytes to MB
|
59 |
+
BITRATE = 128000 # 128 kbps
|
60 |
+
|
61 |
+
def __init__(self, api_key):
|
62 |
+
self.client = openai.OpenAI(api_key=api_key)
|
63 |
+
|
64 |
+
script_dir = os.path.dirname(os.path.abspath(__file__)) # Absolute directory of the script
|
65 |
+
prompts = {
|
66 |
+
"summary_prompt": os.path.join(script_dir, "prompts", "1-summary_prompt.txt"),
|
67 |
+
"topic_prompt": os.path.join(script_dir, "prompts", "2-topic_prompt.txt"),
|
68 |
+
"troubleshooting_prompt": os.path.join(script_dir, "prompts", "3-troubleshooting_prompt.txt"),
|
69 |
+
"glossary_prompt": os.path.join(script_dir, "prompts", "4-glossary_prompt.txt"),
|
70 |
+
"tags_prompt": os.path.join(script_dir, "prompts", "5-tags_prompt.txt"),
|
71 |
+
"article_prompt": os.path.join(script_dir, "prompts", "6-article_prompt.txt")
|
72 |
+
}
|
73 |
+
|
74 |
+
def process_folder(self, folder_path, transcribe_flag, drive_folder_link):
|
75 |
+
"""
|
76 |
+
Takes a folder path and processes all videos or transcripts in the folder.
|
77 |
+
First downloads all videos from the specified Google Drive folder to the local folder path.
|
78 |
+
:param folder_path: The path to the folder containing videos or transcripts to process.
|
79 |
+
:param transcribe_flag: Flag to indicate if transcription is needed.
|
80 |
+
:param drive_folder_id: The ID of the Google Drive folder containing the videos.
|
81 |
+
"""
|
82 |
+
|
83 |
+
###Added from here Initialize Google Drive service
|
84 |
+
drive_service = self.get_drive_service()
|
85 |
+
|
86 |
+
# Extract the folder ID from the URL.
|
87 |
+
folder_id = self.extract_drive_folder_id(drive_folder_link)
|
88 |
+
|
89 |
+
# List all files in the Google Drive folder
|
90 |
+
drive_files = self.list_files_in_folder(drive_service, folder_id)
|
91 |
+
logging.info(f"Downloading files from Google Drive folder: {folder_id}")
|
92 |
+
|
93 |
+
# TODO: Sharing the app is going to be the last step.
|
94 |
+
|
95 |
+
# Download each file to the local folder_path
|
96 |
+
for file in tqdm(drive_files, desc="Downloading Files"):
|
97 |
+
file_name = file['name']
|
98 |
+
file_id = file['id']
|
99 |
+
local_file_path = os.path.join(folder_path, file_name)
|
100 |
+
if not os.path.exists(local_file_path): # Avoid re-downloading files
|
101 |
+
logging.info(f"Downloading file: {file_name}")
|
102 |
+
self.download_file(drive_service, file_id, local_file_path)
|
103 |
+
else:
|
104 |
+
logging.info(f"File already exists: {file_name}")
|
105 |
+
###End of Added
|
106 |
+
|
107 |
+
### All below under this line is same with the original script. Until Next Added from here
|
108 |
+
logging.info(f"Processing files in folder: {folder_path}")
|
109 |
+
for filename in tqdm(os.listdir(folder_path), desc="Processing Files"):
|
110 |
+
if transcribe_flag == "False":
|
111 |
+
if filename.endswith("_full_transcript.txt"):
|
112 |
+
|
113 |
+
# Processing for transcript files generated by Zoom/Loom/etc. already.
|
114 |
+
logging.info(f"Processing transcript: {filename}")
|
115 |
+
|
116 |
+
base_name = filename.replace("_full_transcript.txt", "")
|
117 |
+
new_folder_path = os.path.join(folder_path, base_name)
|
118 |
+
logging.info(f"New folder path: {new_folder_path}")
|
119 |
+
|
120 |
+
# Folder handling outside main video processing functions.
|
121 |
+
if not os.path.exists(new_folder_path):
|
122 |
+
os.makedirs(new_folder_path)
|
123 |
+
original_path = os.path.join(folder_path, filename)
|
124 |
+
new_path = os.path.join(new_folder_path, filename)
|
125 |
+
logging.info(f"Moving file from {original_path} to {new_path}")
|
126 |
+
shutil.move(original_path, new_path)
|
127 |
+
|
128 |
+
# Generate our ouputs
|
129 |
+
self.generate_transcript_outputs(new_path)
|
130 |
+
self.process_articles(new_folder_path)
|
131 |
+
|
132 |
+
logging.info(f"Processing complete for: {filename}.")
|
133 |
+
|
134 |
+
else:
|
135 |
+
if filename.endswith(".mp4"):
|
136 |
+
# Process for video files
|
137 |
+
logging.info(f"Processing video: {filename}")
|
138 |
+
video_path = os.path.join(folder_path, filename)
|
139 |
+
self.process_video(video_path, folder_path)
|
140 |
+
logging.info(f"Processing complete for: {filename}.")
|
141 |
+
|
142 |
+
###Added from here
|
143 |
+
# After processing files
|
144 |
+
logging.info(f"Processing complete for all files in folder: {folder_path}. Uploading processed files to Google Drive.")
|
145 |
+
|
146 |
+
# Iterate over files in the input folder and upload each to Google Drive
|
147 |
+
self.sync_folder_to_drive(drive_service, folder_path, folder_id, is_root=True)
|
148 |
+
|
149 |
+
logging.info(f"Uploading processed files to Google Drive complete for all files in folder: {folder_path}. Success.")
|
150 |
+
|
151 |
+
# Calling the cleanup function
|
152 |
+
input_folder_path = os.path.abspath(folder_path)
|
153 |
+
self.cleanup_input_folder(input_folder_path)
|
154 |
+
|
155 |
+
###End of Added
|
156 |
+
|
157 |
+
|
158 |
+
def check_and_process(self, file_path, process_func, file_description):
|
159 |
+
"""
|
160 |
+
Validates if a file exists and processes it if it doesn't.
|
161 |
+
:param file_path: The path to the file to check.
|
162 |
+
:param process_func: The function to call to process the file if it doesn't exist.
|
163 |
+
:param file_description: A description of the file to use in logging.
|
164 |
+
"""
|
165 |
+
if not os.path.exists(file_path):
|
166 |
+
logging.info(f"Processing {file_description}: {file_path}")
|
167 |
+
process_func()
|
168 |
+
else:
|
169 |
+
logging.info(f"{file_description} already exists.")
|
170 |
+
|
171 |
+
def process_video(self, input_video, folder_path):
|
172 |
+
"""
|
173 |
+
Takes a video path, processes the video into a transcript, and a collection of knowledge outputs.
|
174 |
+
:param input_video: The path to the video to process.
|
175 |
+
:param folder_path: The path to the folder containing the video to process.
|
176 |
+
"""
|
177 |
+
base_name = os.path.splitext(os.path.basename(input_video))[0]
|
178 |
+
output_folder = os.path.join(folder_path, f"{base_name}_output")
|
179 |
+
processed_folder = os.path.join(folder_path, "Processed")
|
180 |
+
os.makedirs(output_folder, exist_ok=True)
|
181 |
+
os.makedirs(processed_folder, exist_ok=True)
|
182 |
+
|
183 |
+
output_audio = os.path.join(output_folder, f"{base_name}.mp3")
|
184 |
+
processed_audio = os.path.join(processed_folder, f"{base_name}.mp3")
|
185 |
+
processed_video = os.path.join(processed_folder, f"{base_name}.mp4")
|
186 |
+
transcript_file = os.path.join(output_folder, f"{base_name}_full_transcript.txt")
|
187 |
+
|
188 |
+
# Checks to avoid re-processing to save time and calls to GPT.
|
189 |
+
|
190 |
+
self.check_and_process(
|
191 |
+
output_audio,
|
192 |
+
lambda: self.video_to_audio(input_video, output_audio),
|
193 |
+
"Audio file"
|
194 |
+
)
|
195 |
+
|
196 |
+
self.check_and_process(
|
197 |
+
transcript_file,
|
198 |
+
lambda: self.transcribe_and_combine_audio(output_audio),
|
199 |
+
"Transcript file"
|
200 |
+
)
|
201 |
+
|
202 |
+
transcript_outputs_exist = os.path.exists(os.path.join(output_folder, f"{base_name}_summary.txt")) or \
|
203 |
+
os.path.exists(os.path.join(output_folder, f"{base_name}_troubleshooting_steps.txt")) or \
|
204 |
+
os.path.exists(os.path.join(output_folder, f"{base_name}_glossary.txt")) or \
|
205 |
+
os.path.exists(os.path.join(output_folder, f"{base_name}_tags_and_symptoms.txt"))
|
206 |
+
|
207 |
+
if not transcript_outputs_exist:
|
208 |
+
logging.info(f"Generating transcript outputs for: {transcript_file}")
|
209 |
+
self.generate_transcript_outputs(transcript_file)
|
210 |
+
else:
|
211 |
+
logging.info("Transcript-related outputs already exist.")
|
212 |
+
|
213 |
+
# Handling the screenshot capture and processing
|
214 |
+
logging.info(f"Checking summary file for timestamps: {transcript_file}")
|
215 |
+
summary_file = os.path.join(output_folder, f"{base_name}_full_transcript_summary.txt")
|
216 |
+
troubleshooting_file = os.path.join(output_folder, f"{base_name}_full_transcript_troubleshooting_steps.txt")
|
217 |
+
timestamp_list = self.find_timestamps(summary_file) + self.find_timestamps(troubleshooting_file)
|
218 |
+
|
219 |
+
if timestamp_list:
|
220 |
+
logging.info(f"Timestamps found in summary file: {summary_file}")
|
221 |
+
screenshot_folder = os.path.join(output_folder, "Screenshots")
|
222 |
+
os.makedirs(screenshot_folder, exist_ok=True)
|
223 |
+
self.parse_and_extract_frames(input_video, screenshot_folder, timestamp_list)
|
224 |
+
else:
|
225 |
+
logging.info(f"No timestamps found in summary file: {summary_file}")
|
226 |
+
|
227 |
+
self.check_and_process(
|
228 |
+
processed_audio,
|
229 |
+
lambda: shutil.move(output_audio, processed_audio),
|
230 |
+
"Processed audio"
|
231 |
+
)
|
232 |
+
|
233 |
+
self.check_and_process(
|
234 |
+
processed_video,
|
235 |
+
lambda: shutil.move(input_video, processed_video),
|
236 |
+
"Processed video"
|
237 |
+
)
|
238 |
+
|
239 |
+
# Generate final articles from summary and troubleshooting steps.
|
240 |
+
self.process_articles(output_folder)
|
241 |
+
|
242 |
+
logging.info(f"Files saved to: {output_folder}")
|
243 |
+
logging.info(f"Processing complete for: {input_video}.")
|
244 |
+
|
245 |
+
|
246 |
+
def transcribe_and_combine_audio(self, audio_file_path):
|
247 |
+
"""
|
248 |
+
Takes an audio file path, splits the audio into parts if needed, transcribes the audio parts, and combines the transcriptions into a single transcript file.
|
249 |
+
:param audio_file_path: The path to the audio file to process.
|
250 |
+
:return: The path to the transcript file.
|
251 |
+
"""
|
252 |
+
base_file_path = os.path.splitext(audio_file_path)[0]
|
253 |
+
transcript_file_path = f"{base_file_path}_full_transcript.txt"
|
254 |
+
manifest_file_path = f"{base_file_path}_manifest.txt"
|
255 |
+
|
256 |
+
# Load or initialize the manifest for keeping track of processed parts
|
257 |
+
if os.path.exists(manifest_file_path):
|
258 |
+
with open(manifest_file_path, "r") as manifest_file:
|
259 |
+
processed_parts = set(manifest_file.read().splitlines())
|
260 |
+
else:
|
261 |
+
processed_parts = set()
|
262 |
+
|
263 |
+
# Transcribe each part of the audio file, as needed
|
264 |
+
parts_to_transcribe = sorted(self.get_or_create_audio_parts(audio_file_path))
|
265 |
+
for part in parts_to_transcribe:
|
266 |
+
part_transcript_file = f"{part}_transcript.txt"
|
267 |
+
if part in processed_parts:
|
268 |
+
logging.info(f"Transcription part already exists: {part_transcript_file}")
|
269 |
+
else:
|
270 |
+
logging.info(f"Transcribing audio part: {part}")
|
271 |
+
transcription = self.transcribe_audio_part(part)
|
272 |
+
with open(part_transcript_file, "w") as part_file:
|
273 |
+
part_file.write(transcription)
|
274 |
+
processed_parts.add(part)
|
275 |
+
logging.info(f"Transcription complete for: {part}")
|
276 |
+
with open(manifest_file_path, "w") as manifest_file:
|
277 |
+
manifest_file.write("\n".join(sorted(processed_parts)))
|
278 |
+
# Check if the part is not the main audio file before removing
|
279 |
+
if part != audio_file_path:
|
280 |
+
os.remove(part)
|
281 |
+
logging.info(f"Removed audio part: {part}")
|
282 |
+
|
283 |
+
# Once all parts have been transcribed, combine them into the full transcript file
|
284 |
+
with open(transcript_file_path, "w") as transcript_file:
|
285 |
+
for part in parts_to_transcribe:
|
286 |
+
logging.info(f"Combining transcript part: {part}")
|
287 |
+
part_transcript_file = f"{part}_transcript.txt"
|
288 |
+
with open(part_transcript_file, "r") as part_file:
|
289 |
+
transcript_file.write(part_file.read() + "\n")
|
290 |
+
os.remove(part_transcript_file)
|
291 |
+
logging.info(f"Removed transcript part: {part_transcript_file}")
|
292 |
+
|
293 |
+
|
294 |
+
# Now, we need to take the transcript file and adjust the timestamps to account for the audio parts
|
295 |
+
with open(transcript_file_path, 'r') as file:
|
296 |
+
full_transcript = file.read()
|
297 |
+
|
298 |
+
adjusted_content = self.adjust_timestamps(full_transcript)
|
299 |
+
with open(transcript_file_path, 'w') as file:
|
300 |
+
file.write(adjusted_content)
|
301 |
+
|
302 |
+
logging.info(f"Transcript saved to: {transcript_file_path}")
|
303 |
+
os.remove(manifest_file_path)
|
304 |
+
return transcript_file_path
|
305 |
+
|
306 |
+
def parse_time(self, time_str):
|
307 |
+
"""Convert a timestamp string to seconds."""
|
308 |
+
h, m, s = map(float, time_str.split(':'))
|
309 |
+
return h * 3600 + m * 60 + s
|
310 |
+
|
311 |
+
def format_time(self, seconds):
|
312 |
+
"""Convert seconds back to a timestamp string."""
|
313 |
+
h = int(seconds // 3600)
|
314 |
+
m = int((seconds % 3600) // 60)
|
315 |
+
s = seconds % 60
|
316 |
+
return f"{h:02}:{m:02}:{s:06.3f}"
|
317 |
+
|
318 |
+
def adjust_timestamps(self, vtt_content):
|
319 |
+
"""
|
320 |
+
Takes a VTT content string and adjusts the timestamps to account for the audio parts.
|
321 |
+
:param vtt_content: The VTT content to process.
|
322 |
+
:return: The adjusted VTT content.
|
323 |
+
"""
|
324 |
+
sections = vtt_content.split("WEBVTT")
|
325 |
+
adjusted_sections = []
|
326 |
+
time_offset = 0
|
327 |
+
|
328 |
+
for section in sections[1:]: # Skip the first section as it's likely the header
|
329 |
+
lines = section.strip().split("\n")
|
330 |
+
adjusted_lines = []
|
331 |
+
|
332 |
+
for line in lines:
|
333 |
+
if '-->' in line:
|
334 |
+
start, end = line.split(' --> ')
|
335 |
+
start_sec = self.parse_time(start) + time_offset
|
336 |
+
end_sec = self.parse_time(end) + time_offset
|
337 |
+
adjusted_line = f"{self.format_time(start_sec)} --> {self.format_time(end_sec)}"
|
338 |
+
adjusted_lines.append(adjusted_line)
|
339 |
+
else:
|
340 |
+
adjusted_lines.append(line)
|
341 |
+
|
342 |
+
# Update the time offset using the last timestamp of the current section
|
343 |
+
if adjusted_lines:
|
344 |
+
last_time = adjusted_lines[-2] # The second last line contains the last timestamp
|
345 |
+
_, end = last_time.split(' --> ')
|
346 |
+
time_offset = self.parse_time(end)
|
347 |
+
|
348 |
+
adjusted_sections.append('\n'.join(adjusted_lines))
|
349 |
+
|
350 |
+
return "WEBVTT\n\n".join(adjusted_sections)
|
351 |
+
|
352 |
+
|
353 |
+
def extract_frames_by_range(self, video_path, target_folder, start_time, end_time, fps=1):
|
354 |
+
"""
|
355 |
+
Takes a video path, a start time, and an end time, and extracts frames from the video between the given timestamps.
|
356 |
+
:param video_path: The path to the video to process.
|
357 |
+
:param target_folder: The path to the folder to save the extracted frames to.
|
358 |
+
:param start_time: The start time to extract frames from in HH:MM:SS.mmm format.
|
359 |
+
:param end_time: The end time to extract frames to in HH:MM:SS.mmm format.
|
360 |
+
:param fps: The frames per second to extract from the video.
|
361 |
+
"""
|
362 |
+
|
363 |
+
# Convert start_time and end_time from HH:MM:SS.mmm to seconds
|
364 |
+
start_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], start_time.split(":")))
|
365 |
+
end_seconds = sum(x * float(t) for x, t in zip([3600, 60, 1, 0.001], end_time.split(":")))
|
366 |
+
|
367 |
+
# Create the target folder if it doesn't exist
|
368 |
+
if not os.path.exists(target_folder):
|
369 |
+
os.makedirs(target_folder)
|
370 |
+
|
371 |
+
with VideoFileClip(video_path) as video:
|
372 |
+
# Calculate the interval between frames based on the desired fps
|
373 |
+
interval = 1 / fps
|
374 |
+
|
375 |
+
# Adjust the loop to iterate over the desired timestamp range
|
376 |
+
current_time = start_seconds
|
377 |
+
while current_time < end_seconds:
|
378 |
+
frame = video.get_frame(current_time)
|
379 |
+
timestamp = self.format_time(int(current_time))
|
380 |
+
frame_path = os.path.join(target_folder, f"{timestamp}.png")
|
381 |
+
Image.fromarray(np.uint8(frame)).save(frame_path)
|
382 |
+
current_time += interval
|
383 |
+
|
384 |
+
|
385 |
+
def find_timestamps(self, file_path):
|
386 |
+
"""
|
387 |
+
Takes a file path and finds the timestamps within the file.
|
388 |
+
Searches for timestamps in the format "at 00:00:20.360" and "[00:00:28.559]".
|
389 |
+
:param file_path: The path to the file to process.
|
390 |
+
:return: A list of timestamps.
|
391 |
+
"""
|
392 |
+
# Updated pattern to match both "at 00:00:20.360" and "[00:00:28.559]"
|
393 |
+
timestamp_pattern = r'at (\d{2}:\d{2}:\d{2}\.\d{3})|\[(\d{2}:\d{2}:\d{2}\.\d{3})\]'
|
394 |
+
|
395 |
+
timestamps = []
|
396 |
+
|
397 |
+
with open(file_path, 'r') as file:
|
398 |
+
content = file.read()
|
399 |
+
# Find all matches and process them to flatten the list and remove None
|
400 |
+
raw_matches = re.findall(timestamp_pattern, content)
|
401 |
+
for match in raw_matches:
|
402 |
+
# match is a tuple where one group is the timestamp and the other is empty
|
403 |
+
timestamp = match[0] if match[0] else match[1]
|
404 |
+
timestamps.append(timestamp)
|
405 |
+
|
406 |
+
return timestamps
|
407 |
+
|
408 |
+
def parse_and_extract_frames(self, video_path, target_path, timestamps):
|
409 |
+
"""
|
410 |
+
Takes a video path and a list of timestamps, and extracts frames from the video around the given timestamps.
|
411 |
+
:param video_path: The path to the video to process.
|
412 |
+
:param target_path: The path to the folder to save the extracted frames to.
|
413 |
+
:param timestamps: A list of timestamps to extract frames around.
|
414 |
+
"""
|
415 |
+
# Function to adjust the timestamp by a given number of seconds
|
416 |
+
def adjust_timestamp(timestamp, seconds):
|
417 |
+
timestamp_dt = datetime.strptime(timestamp, "%H:%M:%S.%f")
|
418 |
+
adjusted_timestamp = timestamp_dt + timedelta(seconds=seconds)
|
419 |
+
return adjusted_timestamp.strftime("%H:%M:%S.%f")[:-3]
|
420 |
+
|
421 |
+
for timestamp in timestamps:
|
422 |
+
start_timestamp = adjust_timestamp(timestamp, -5)
|
423 |
+
end_timestamp = adjust_timestamp(timestamp, 5)
|
424 |
+
self.extract_frames_by_range(video_path, target_path, start_timestamp, end_timestamp)
|
425 |
+
|
426 |
+
|
427 |
+
def get_or_create_audio_parts(self, audio_file_path):
|
428 |
+
"""
|
429 |
+
Takes an audio file path and splits the audio into parts if needed.
|
430 |
+
:param audio_file_path: The path to the audio file to process.
|
431 |
+
:return: A list of paths to the audio parts.
|
432 |
+
"""
|
433 |
+
# Check if the audio needs to be split by checking its file size - this is approximate, but close enough for gov work
|
434 |
+
file_size_mb = os.path.getsize(audio_file_path) / (1024 * 1024)
|
435 |
+
parts_directory = os.path.join(os.path.dirname(audio_file_path), "parts")
|
436 |
+
os.makedirs(parts_directory, exist_ok=True)
|
437 |
+
|
438 |
+
# If the audio file has already been split, return the existing parts - else, split the audio file
|
439 |
+
existing_parts = [os.path.join(parts_directory, f) for f in os.listdir(parts_directory) if os.path.isfile(os.path.join(parts_directory, f))]
|
440 |
+
if existing_parts:
|
441 |
+
logging.info("Found existing audio parts. Resuming transcription.")
|
442 |
+
return existing_parts
|
443 |
+
logging.info(f"Audio file size: {file_size_mb} MB")
|
444 |
+
if file_size_mb > self.MAX_SIZE_MB:
|
445 |
+
logging.info(f"Audio file size exceeds maximum size of {self.MAX_SIZE_MB} MB. Splitting audio file into parts.")
|
446 |
+
return self.split_audio_file_by_size(audio_file_path)
|
447 |
+
else:
|
448 |
+
logging.info(f"Audio file size is within maximum size of {self.MAX_SIZE_MB} MB. No need to split the audio file.")
|
449 |
+
return [audio_file_path]
|
450 |
+
|
451 |
+
def split_audio_file_by_size(self, audio_file_path):
|
452 |
+
"""
|
453 |
+
Takes an audio file path and splits the audio into parts based on the maximum size.
|
454 |
+
:param audio_file_path: The path to the audio file to process.
|
455 |
+
:return: A list of paths to the audio parts.
|
456 |
+
"""
|
457 |
+
logging.info(f"Splitting audio file: {audio_file_path}")
|
458 |
+
audio = AudioSegment.from_file(audio_file_path)
|
459 |
+
max_chunk_duration_ms = ((self.MAX_SIZE * 8) / self.BITRATE) * 1000
|
460 |
+
logging.info(f"Max chunk duration: {max_chunk_duration_ms} ms")
|
461 |
+
num_chunks = ceil(len(audio) / max_chunk_duration_ms)
|
462 |
+
logging.info(f"Number of chunks: {num_chunks}")
|
463 |
+
chunk_length = len(audio) // num_chunks
|
464 |
+
chunks = [audio[i * chunk_length: (i + 1) * chunk_length] for i in range(num_chunks)]
|
465 |
+
chunk_files = []
|
466 |
+
for i, chunk in enumerate(chunks):
|
467 |
+
chunk_name = f"{os.path.splitext(audio_file_path)[0]}_part{i}.mp3"
|
468 |
+
logging.info(f"Exporting audio chunk: {chunk_name}")
|
469 |
+
chunk.export(chunk_name, format="mp3")
|
470 |
+
chunk_files.append(chunk_name)
|
471 |
+
logging.info(f"Audio file split into {len(chunk_files)} parts.")
|
472 |
+
return chunk_files
|
473 |
+
|
474 |
+
def video_to_audio(self, input_video, output_audio):
|
475 |
+
"""
|
476 |
+
Takes a video file path and strips out the audio to save as an MP3 file.
|
477 |
+
:param input_video: The path to the video file to process.
|
478 |
+
:param output_audio: The path to the audio file to save the converted audio to.
|
479 |
+
"""
|
480 |
+
if not os.path.exists(output_audio):
|
481 |
+
video = AudioSegment.from_file(input_video, "mp4")
|
482 |
+
video.export(output_audio, format="mp3", bitrate="128k")
|
483 |
+
logging.info(f"Audio file exported: {output_audio}")
|
484 |
+
else:
|
485 |
+
logging.info("Audio file already exists")
|
486 |
+
|
487 |
+
def transcribe_audio_part(self, part):
|
488 |
+
"""
|
489 |
+
Takes an audio file part path and transcribes the audio into text via whisper LLM.
|
490 |
+
:param part: The path to the audio file to process.
|
491 |
+
:return: The transcribed text.
|
492 |
+
"""
|
493 |
+
logging.info(f"Transcribing audio part: {part}")
|
494 |
+
with open(part, "rb") as audio_file:
|
495 |
+
transcript = self.client.audio.transcriptions.create(
|
496 |
+
model="whisper-1",
|
497 |
+
file=audio_file,
|
498 |
+
response_format="vtt"
|
499 |
+
# This prompt can be used to help the LLM understand the context of the audio and certain terms of art that may be used.
|
500 |
+
#,prompt="UCaaS, CPaaS, STaaS, DRaaS, BLF, CDR, CIM, GCCH, GVBM, HEPIC, SBC, PSTN, SMB, OrecX, Prov"
|
501 |
+
)
|
502 |
+
return transcript
|
503 |
+
|
504 |
+
def audio_to_transcript(self, input_audio):
|
505 |
+
"""
|
506 |
+
Takes an audio file path and transcribes the audio into text via whisper LLM.
|
507 |
+
:param input_audio: The path to the audio file to process.
|
508 |
+
:return: The path to the transcript file.
|
509 |
+
"""
|
510 |
+
logging.info(f"Transcribing audio: {input_audio}")
|
511 |
+
with open(input_audio, "rb") as audio_file:
|
512 |
+
transcript = self.client.audio.transcriptions.create(
|
513 |
+
model="whisper-1",
|
514 |
+
file=audio_file,
|
515 |
+
response_format="vtt"
|
516 |
+
# This prompt can be used to help the LLM understand the context of the audio and certain terms of art that may be used.
|
517 |
+
#,prompt="UCaaS, CPaaS, STaaS, DRaaS, BLF, CDR, CIM, GCCH, GVBM, HEPIC, SBC, PSTN, SMB, OrecX, Prov"
|
518 |
+
)
|
519 |
+
logging.info("Transcript created")
|
520 |
+
base_name = os.path.splitext(input_audio)[0]
|
521 |
+
output_file = f"{base_name}_transcript.txt"
|
522 |
+
|
523 |
+
with open(output_file, "w") as f:
|
524 |
+
json.dump(transcript, f, indent=4)
|
525 |
+
|
526 |
+
logging.info(f"Transcript saved to: {output_file}")
|
527 |
+
return output_file
|
528 |
+
|
529 |
+
|
530 |
+
def process_and_save_output(self, base_name, prompt_key, transcript_content, additional_content=None, file_suffix=""):
|
531 |
+
"""
|
532 |
+
Takes a transcript and generates the output for a given prompt.
|
533 |
+
:param base_name: The base name of the transcript file.
|
534 |
+
:param prompt_key: The key of the prompt to use.
|
535 |
+
:param transcript_content: The content of the transcript to process.
|
536 |
+
:param additional_content: Additional content to use in the prompt.
|
537 |
+
:param file_suffix: The suffix to use for the output file.
|
538 |
+
:return: The path to the output file.
|
539 |
+
"""
|
540 |
+
file_name = f"{base_name}_{file_suffix}.txt"
|
541 |
+
|
542 |
+
# Check if the file already exists
|
543 |
+
if os.path.exists(file_name):
|
544 |
+
logging.info(f"{file_suffix.replace('_', ' ').capitalize()} file already exists: {file_name}")
|
545 |
+
return file_name
|
546 |
+
|
547 |
+
# Load and process the prompt
|
548 |
+
prompt = self.load_prompt(prompt_key)
|
549 |
+
if additional_content:
|
550 |
+
conversation_history = self.build_conversation_history(self.load_prompt("summary_prompt"), transcript_content, additional_content['summary'], additional_content['topic'])
|
551 |
+
else:
|
552 |
+
conversation_history = self.build_conversation_history(prompt, transcript_content)
|
553 |
+
|
554 |
+
response = self.send_conversation(conversation_history)
|
555 |
+
content = response.choices[0].message.content
|
556 |
+
clean_lines = [line.strip() for line in content.split('\n') if line.strip() != '']
|
557 |
+
clean_content = '\n\n'.join(clean_lines)
|
558 |
+
|
559 |
+
# Write the processed content to the file
|
560 |
+
with open(file_name, "w") as f:
|
561 |
+
f.write(clean_content)
|
562 |
+
logging.info(f"{file_suffix.replace('_', ' ').capitalize()} saved to: {file_name}")
|
563 |
+
|
564 |
+
return file_name
|
565 |
+
|
566 |
+
def generate_transcript_outputs(self, transcript_file):
|
567 |
+
"""
|
568 |
+
Takes a transcript file and generates the summary outputs.
|
569 |
+
:param transcript_file: The path to the transcript file to process.
|
570 |
+
"""
|
571 |
+
with open(transcript_file, "r") as file:
|
572 |
+
transcript_content = file.read()
|
573 |
+
base_name = os.path.splitext(transcript_file)[0]
|
574 |
+
|
575 |
+
# Generate the summary
|
576 |
+
self.process_and_save_output(base_name, "summary_prompt", transcript_content, file_suffix="summary")
|
577 |
+
with open(f"{base_name}_summary.txt", "r") as file:
|
578 |
+
summary_file = file.read()
|
579 |
+
|
580 |
+
# Generate topic specific summaries
|
581 |
+
topic_prompts = self.generate_topic_prompts(summary_file)
|
582 |
+
# if script run with --topic, generate topic specific summaries.
|
583 |
+
# if extract_topics:
|
584 |
+
#Topic extraction assumed to be true
|
585 |
+
for i, topic_prompt in enumerate(topic_prompts):
|
586 |
+
additional_content = {"summary": summary_file, "topic": topic_prompt}
|
587 |
+
self.process_and_save_output(base_name, "summary_prompt", transcript_content, additional_content, file_suffix=f"topic{i}_summary")
|
588 |
+
|
589 |
+
# Generate the troubleshooting steps
|
590 |
+
self.process_and_save_output(base_name, "troubleshooting_prompt", transcript_content, file_suffix="troubleshooting_steps")
|
591 |
+
|
592 |
+
# Generate the glossary
|
593 |
+
self.process_and_save_output(base_name, "glossary_prompt", transcript_content, file_suffix="glossary")
|
594 |
+
|
595 |
+
# Generate the tags and symptoms
|
596 |
+
self.process_and_save_output(base_name, "tags_prompt", transcript_content, file_suffix="tags_and_symptoms")
|
597 |
+
|
598 |
+
logging.info(f"Transcript outputs saved to: {os.path.splitext(transcript_file)[0]}")
|
599 |
+
|
600 |
+
|
601 |
+
def extract_topics(self, response_text):
|
602 |
+
"""
|
603 |
+
Takes a response text and extracts the topics from it.
|
604 |
+
:param response_text: The response text to process.
|
605 |
+
:return: A list of topics.
|
606 |
+
"""
|
607 |
+
# Regular expression to match the pattern "Topic X: Title"
|
608 |
+
pattern = r"Topic \d+: .+"
|
609 |
+
topics = re.findall(pattern, response_text)
|
610 |
+
return topics
|
611 |
+
|
612 |
+
|
613 |
+
# OpenAI Functions
|
614 |
+
|
615 |
+
def generate_topic_prompts(self, response_text):
|
616 |
+
"""
|
617 |
+
Takes a response text and generates the topic prompts.
|
618 |
+
:param response_text: The response text to process.
|
619 |
+
:return: A list of topic prompts.
|
620 |
+
"""
|
621 |
+
topics = self.extract_topics(response_text)
|
622 |
+
base_prompt = self.load_prompt("topic_prompt")
|
623 |
+
topic_prompts = []
|
624 |
+
for topic in topics:
|
625 |
+
modified_prompt = base_prompt.replace("[REPLACE_ME]", topic)
|
626 |
+
topic_prompts.append(modified_prompt)
|
627 |
+
return topic_prompts
|
628 |
+
|
629 |
+
|
630 |
+
def load_prompt(self,prompt_key):
|
631 |
+
"""
|
632 |
+
Takes a prompt key and loads the prompt from the prompts folder.
|
633 |
+
:param prompt_key: The key of the prompt to load.
|
634 |
+
:return: The prompt content.
|
635 |
+
"""
|
636 |
+
prompt_path = self.prompts[prompt_key]
|
637 |
+
print(f"Loading prompt from: {prompt_path}") # Debugging line
|
638 |
+
with open(self.prompts[prompt_key], 'r') as file:
|
639 |
+
return file.read()
|
640 |
+
|
641 |
+
|
642 |
+
def send_conversation(self, conversation_history):
|
643 |
+
"""
|
644 |
+
Takes a conversation history and sends it to the OpenAI API to generate a response.
|
645 |
+
:param conversation_history: The conversation history to send.
|
646 |
+
:return: The response from the LLM
|
647 |
+
"""
|
648 |
+
response = openai.chat.completions.create(
|
649 |
+
model="gpt-4-1106-preview",
|
650 |
+
#model="gpt-3.5-turbo-1106",
|
651 |
+
messages=conversation_history,
|
652 |
+
max_tokens=4096,
|
653 |
+
temperature=0.00,
|
654 |
+
)
|
655 |
+
return response
|
656 |
+
|
657 |
+
|
658 |
+
def build_conversation_history(self, system_prompt, user_prompt1, assistant_response=None, user_prompt2=None):
|
659 |
+
"""
|
660 |
+
Takes a system prompt, user prompt, and optional assistant response and user prompt and builds a conversation history.
|
661 |
+
:param system_prompt: The system prompt to use.
|
662 |
+
:param user_prompt1: The first user prompt to use.
|
663 |
+
:param assistant_response: The assistant response to use.
|
664 |
+
:param user_prompt2: The second user prompt to use.
|
665 |
+
:return: The conversation history.
|
666 |
+
"""
|
667 |
+
conversation_history = [
|
668 |
+
{"role": "system", "content": system_prompt},
|
669 |
+
{"role": "user", "content": user_prompt1}
|
670 |
+
]
|
671 |
+
# Check if both or none of the optional parameters are provided
|
672 |
+
if (assistant_response is not None and user_prompt2 is not None) or (assistant_response is None and user_prompt2 is None):
|
673 |
+
# Append the optional prompts if both are provided
|
674 |
+
if assistant_response is not None:
|
675 |
+
conversation_history.append({"role": "assistant", "content": assistant_response})
|
676 |
+
conversation_history.append({"role": "user", "content": user_prompt2})
|
677 |
+
else:
|
678 |
+
raise ValueError("Both 'assistant_response' and 'user_prompt2' must be provided together or not at all.")
|
679 |
+
|
680 |
+
return conversation_history
|
681 |
+
|
682 |
+
def generate_article(self, input_file):
|
683 |
+
"""
|
684 |
+
Takes an input file path and generates a article from it.
|
685 |
+
:param input_file_path: The path to the input file to process.
|
686 |
+
:return: The article.
|
687 |
+
"""
|
688 |
+
article_prompt = self.load_prompt("article_prompt")
|
689 |
+
with open(input_file, "r") as file:
|
690 |
+
file_content = file.read()
|
691 |
+
article_convo = self.build_conversation_history(article_prompt, file_content)
|
692 |
+
response = self.send_conversation(article_convo)
|
693 |
+
content = response.choices[0].message.content
|
694 |
+
clean_lines = [line.strip() for line in content.split('\n') if line.strip() != '']
|
695 |
+
clean_content = '\n\n'.join(clean_lines)
|
696 |
+
return clean_content
|
697 |
+
|
698 |
+
def process_articles(self, input_path):
|
699 |
+
"""
|
700 |
+
Takes a path to a folder containing input files and generates articles from them.
|
701 |
+
:param input_path: The path to the folder containing input files to process.
|
702 |
+
"""
|
703 |
+
logging.info(f"Processing article inputs in folder: {input_path}")
|
704 |
+
for filename in tqdm(os.listdir(input_path), desc="Processing Files"):
|
705 |
+
if filename.endswith("_summary.txt") or filename.endswith("_troubleshooting_steps.txt"):
|
706 |
+
logging.info(f"Processing article input: {filename}")
|
707 |
+
input_file = os.path.join(input_path, filename)
|
708 |
+
article = self.generate_article(input_file)
|
709 |
+
output_file = os.path.join(input_path, f"{os.path.splitext(filename)[0]}_article.txt")
|
710 |
+
with open(output_file, "w") as f:
|
711 |
+
f.write(article)
|
712 |
+
logging.info(f"Article saved to: {output_file}")
|
713 |
+
|
714 |
+
# Everything below is added to adjust existing script to run self service.
|
715 |
+
# Until load_dotenv()
|
716 |
+
|
717 |
+
|
718 |
+
def get_drive_service(self):
|
719 |
+
SCOPES = ['https://www.googleapis.com/auth/drive']
|
720 |
+
SERVICE_ACCOUNT_FILE = os.path.join(os.path.dirname(__file__), 'generate-transcripts.json')
|
721 |
+
|
722 |
+
credentials = service_account.Credentials.from_service_account_file(
|
723 |
+
SERVICE_ACCOUNT_FILE, scopes=SCOPES)
|
724 |
+
|
725 |
+
return build('drive', 'v3', credentials=credentials)
|
726 |
+
|
727 |
+
def extract_drive_folder_id(self, drive_link):
|
728 |
+
# This can be expanded to handle various Google Drive link formats
|
729 |
+
match = re.search(r'folders/([^/?]+)', drive_link)
|
730 |
+
if match:
|
731 |
+
return match.group(1)
|
732 |
+
else:
|
733 |
+
raise ValueError("Invalid Google Drive folder link.")
|
734 |
+
|
735 |
+
def list_files_in_folder(self, service, folder_id):
|
736 |
+
results = service.files().list(
|
737 |
+
q=f"'{folder_id}' in parents and trashed=false",
|
738 |
+
pageSize=100,
|
739 |
+
fields="nextPageToken, files(id, name)").execute()
|
740 |
+
return results.get('files', [])
|
741 |
+
|
742 |
+
    def download_file(self, service, file_id, file_path):
        # Ensure the directory where the file will be saved exists
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # MediaIoBaseDownload streams the file in chunks, keeping memory
        # bounded even for long video recordings.
        request = service.files().get_media(fileId=file_id)
        with open(file_path, 'wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                _, done = downloader.next_chunk()

    def find_or_create_drive_folder(self, service, folder_name, parent_folder_id):
        # Check if folder exists
        query = f"mimeType='application/vnd.google-apps.folder' and name='{folder_name}' and '{parent_folder_id}' in parents and trashed=false"
        response = service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
        files = response.get('files', [])
        if files:
            # Folder exists, return its ID
            return files[0]['id']
        else:
            # Folder doesn't exist, create it
            folder_metadata = {
                'name': folder_name,
                'mimeType': 'application/vnd.google-apps.folder',
                'parents': [parent_folder_id]
            }
            folder = service.files().create(body=folder_metadata, fields='id').execute()
            return folder.get('id')

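    # Caveat: folder_name is interpolated into the query above unescaped, so a
    # name containing a single quote would break the match. A defensive sketch,
    # using the Drive v3 query escaping rule:
    #
    #     safe_name = folder_name.replace("'", "\\'")
    #     query = (f"mimeType='application/vnd.google-apps.folder' "
    #              f"and name='{safe_name}' "
    #              f"and '{parent_folder_id}' in parents and trashed=false")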
    def upload_file(self, service, file_path, drive_folder_id):
        file_metadata = {'name': os.path.basename(file_path), 'parents': [drive_folder_id]}
        media = MediaFileUpload(file_path, resumable=True)
        file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        logging.info(f"Uploaded {file_path} to Google Drive with ID {file.get('id')}")

    def sync_folder_to_drive(self, service, local_folder_path, drive_parent_folder_id, is_root=True):
        """
        Synchronize a local folder structure and its files with Google Drive.

        :param service: Authenticated Google Drive service instance.
        :param local_folder_path: Path to the local folder to sync.
        :param drive_parent_folder_id: The Google Drive folder ID to sync with.
        :param is_root: Boolean indicating if the current folder is the root of the sync operation.
        """
        # If it's the root directory, upload files directly in it, then handle directories
        if is_root:
            for item_name in os.listdir(local_folder_path):
                item_path = os.path.join(local_folder_path, item_name)
                if os.path.isfile(item_path):
                    # Uploads 'processing.log' and any other files directly under the root
                    self.upload_file(service, item_path, drive_parent_folder_id)

        # Process directories and their contents
        for item_name in os.listdir(local_folder_path):
            item_path = os.path.join(local_folder_path, item_name)
            if os.path.isdir(item_path):
                # It's a directory, find or create a corresponding folder on Drive
                drive_folder_id = self.find_or_create_drive_folder(service, item_name, drive_parent_folder_id)
                # Recursively sync the subfolder
                self.sync_folder_to_drive(service, item_path, drive_folder_id, is_root=False)
            elif os.path.isfile(item_path) and not is_root:
                # For files in subdirectories, upload them to their respective folder on Google Drive
                self.upload_file(service, item_path, drive_parent_folder_id)

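    # Usage sketch (variable names here are illustrative):
    #
    #     service = self.get_drive_service()
    #     folder_id = self.extract_drive_folder_id(drive_folder_link)
    #     self.sync_folder_to_drive(service, local_output_path, folder_id)
    #
    # Root-level files are uploaded first, then each subdirectory is mirrored
    # recursively into a matching Drive folder.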
    def cleanup_input_folder(self, folder_path):
        """
        Deletes all files and folders under the specified folder_path.

        :param folder_path: Path to the folder to clean up.
        """
        # Safety check to prevent accidental deletion of unintended directories
        if "Input-Folder" in folder_path:
            # List all items in the folder
            for item_name in os.listdir(folder_path):
                item_path = os.path.join(folder_path, item_name)
                try:
                    # Check if it's a file and delete it
                    if os.path.isfile(item_path) or os.path.islink(item_path):
                        os.unlink(item_path)
                    # Else, it's a directory, delete the directory tree
                    elif os.path.isdir(item_path):
                        shutil.rmtree(item_path)
                    logging.info(f"Deleted {item_path}")
                except Exception as e:
                    logging.error(f"Failed to delete {item_path}. Reason: {e}")
        else:
            logging.error("Safety check failed. The folder path does not seem to be correct.")

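    # A plausible end-to-end flow for the self-service app (an assumption: the
    # actual orchestration lives in process_folder, defined earlier in this
    # file and invoked from app.py; local_input is an illustrative path):
    #
    #     service = self.get_drive_service()
    #     folder_id = self.extract_drive_folder_id(drive_folder_link)
    #     for f in self.list_files_in_folder(service, folder_id):
    #         self.download_file(service, f['id'], os.path.join(local_input, f['name']))
    #     # ... transcribe / summarize / generate articles ...
    #     self.sync_folder_to_drive(service, local_input, folder_id)
    #     self.cleanup_input_folder(local_input)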
# Above is the newly added code.
# Load environment variables and the API key via the .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

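# A minimal sketch of the argument parsing the block below relies on
# (assumption: the actual parser is defined earlier in the script and
# exposes `input_folder`, `transcribe`, and `topic`):
#
#     import argparse
#     parser = argparse.ArgumentParser(description="Generate KT transcripts.")
#     parser.add_argument("input_folder", help="Folder containing input recordings")
#     parser.add_argument("--transcribe", action="store_true")
#     parser.add_argument("--topic", action="store_true")
#     args = parser.parse_args()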
# Example usage
if __name__ == "__main__":
    # `args` must be supplied by an argparse parser defined earlier in the script.
    input_folder_path = os.path.abspath(args.input_folder)
    transcribe = args.transcribe
    extract_topics = args.topic  # Currently unused; process_folder only receives the transcribe flag here.
    processor = KnowledgeTranscriber(api_key)
    processor.process_folder(input_folder_path, transcribe)
cs_ai_kt_transcribe_share/prompts/1-summary_prompt.txt
ADDED
@@ -0,0 +1,19 @@
You are analyzing a technical transcript from an engineer's video recording of a Knowledge Transfer session. The transcript includes timestamps and occasional references to visual elements displayed on-screen. Your objectives are:

Comprehensive Understanding: Thoroughly review the entire transcript, paying close attention to technical details and the context provided by the engineer.
Topic Segmentation: Identify and separate the different topics discussed in the video. Use the timestamps to accurately delineate when each topic starts and ends.
Detailed Summaries with Visual Cues: For each identified topic, provide a detailed summary. Include the following elements:

Timestamps: Clearly state the timestamps at the beginning of each topic summary.
Visual References: Highlight any references made by the speaker to the on-screen content. Use phrases like "As shown on the screen at [timestamp]" or "Referring to the display at [timestamp]" to make these moments stand out.
Technical Accuracy: Ensure that technical details are accurately captured and clearly explained, suitable for use by another engineer.

Final Format:

Video High-Level Overview: Provide a unified summary of the entire transcript, including overarching themes or objectives.

Topic Segmentation:

Topic 1 [Timestamp Range]: [Topic-specific, detailed summary with visual cues]
Topic 2 [Timestamp Range]: [Topic-specific, detailed summary with visual cues]
Continue for each identified topic, maintaining this format.

Remember, the goal is to create a summary that is both technically comprehensive and easily navigable, with clear references to visual elements and timestamps for effective cross-referencing with the video content.
cs_ai_kt_transcribe_share/prompts/2-topic_prompt.txt
ADDED
@@ -0,0 +1,14 @@
You are analyzing a specific segment of a technical transcript from an engineer's video recording. This segment is focused on the topic of [REPLACE_ME]. The transcript includes timestamps, which are crucial for this task. Your objectives are:

Comprehensive Review with Timestamps: Read the entire transcript thoroughly, paying special attention to the timestamps that correlate with the different stages of the procedure being described.

Dual-Level Summarization with Timestamp References:

High-Level Summary with Timestamps: Provide a general overview of the topic of [REPLACE_ME], highlighting key moments in the process. Incorporate timestamps to reference significant points or changes in the procedure. This summary should be understandable to those not deeply versed in the technical details.
Detailed Technical Overview with Timestamps: Create a step-by-step technical guide detailing the procedures performed in relation to the topic of [REPLACE_ME]. Ensure this includes specific timestamps for each major step or instruction mentioned in the transcript. This detailed guide should serve as a comprehensive technical reference for engineers.

Format of the Summary with Timestamps:

Topic: [REPLACE_ME]
High-Level Summary: [Non-technical summary with key timestamps highlighted]
Technical Overview: [In-depth technical guide with specific timestamps for each major step or instruction]

Your goal is to produce summaries that are not only informative at a general level but also provide detailed technical guidance, with timestamps serving as a navigational tool to correlate the text with specific segments of the video.
cs_ai_kt_transcribe_share/prompts/3-troubleshooting_prompt.txt
ADDED
@@ -0,0 +1,24 @@
You are analyzing a technical transcript from an engineer's Knowledge Transfer session video recording. The transcript may cover multiple topics, and your focus is on identifying and detailing the troubleshooting process. Your objectives are:

In-Depth Transcript Review: Thoroughly read the entire transcript to fully grasp the content and context of the recording. Pay attention to technical specifics and nuances in the engineer's explanation.

Detailed Troubleshooting Overview: Generate a comprehensive overview of the troubleshooting process discussed in the transcript. This should include:

Abstract Overview: Provide an initial abstract of the problem being addressed. Include the technical context and any critical background information.
Timestamp-Referenced Troubleshooting Steps: Enumerate the steps taken to troubleshoot the problem, ensuring each step is detailed and technically accurate. Reference timestamps wherever possible to indicate when in the video each step is discussed or demonstrated.

Emphasis on Technical Precision and Clarity:

Avoid vague explanations; strive for clarity and specificity in each step.
Include exact tool names, error codes, system messages, or any relevant technical details as demonstrated or mentioned in the video.

Output Template:

Abstract Overview:
[Abstract of the issue, including any relevant technical context and background]

Troubleshooting Steps:

[Step Name/Summary] [Timestamp]
[Detailed guidance necessary to fulfil the step, including specific actions, tool names, error codes, etc.]
Continue this format for each troubleshooting step identified in the transcript.

Your goal is to create a guide that is both technically comprehensive and easy to follow, providing actionable steps for engineers to replicate the troubleshooting process effectively.
cs_ai_kt_transcribe_share/prompts/4-glossary_prompt.txt
ADDED
@@ -0,0 +1,6 @@
You are an expert at Influitive, the Customer Advocacy platform. You are reading a technical transcript of an engineer's video recording of a Knowledge Transfer session. There may be more than one topic discussed within the video, and your goal will be the following:

1. Review the entire transcript to ensure you fully understand the recording.
2. Generate a complete glossary of all technical terminology and acronyms identified within the transcript relevant to the issue or its solution.

Please include as much technical detail on the actions taken as possible.
cs_ai_kt_transcribe_share/prompts/5-tags_prompt.txt
ADDED
@@ -0,0 +1,7 @@
You are an expert at Influitive, the Customer Advocacy platform. You are reading a technical transcript of an engineer's video recording of a Knowledge Transfer session. There may be more than one topic discussed within the video, and your goal will be the following:

1. Review the entire transcript to ensure you fully understand the recording.
2. Generate a word-cloud tag collection that best represents the discussed issue. Limit this to only the most relevant items, and exclude specific tools or generic items like "error message". Focus on the subject, features, and symptoms related to the topic.
3. Generate a reverse-engineered list of symptoms that a customer might report for this issue; that is, symptoms the generated troubleshooting steps could be used to address.

Please include as much technical detail as possible.
cs_ai_kt_transcribe_share/prompts/6-article_prompt.txt
ADDED
@@ -0,0 +1,70 @@
You are an expert at transforming text content from legacy support articles into a new standard HTML format. Your task is to meticulously convert the provided legacy content into our ideal template format. The output must be in detailed HTML code, maintaining all original details.

Please demonstrate how you would transform a section of a hypothetical legacy article into our desired template. Do not summarize or omit any technical details. The transformation should retain the full depth of the original content in HTML format. For example, convert the following legacy section into the new template format. Remember to include all tags, attributes, and text as found in the original.

In addition to fitting the legacy content within the "Information" section, please generate the following and include them in their appropriate locations:

1. Tags relevant to the content in the article.
2. A short article Overview.
3. A selection of frequently asked questions based on the article content.

Remember, do not include any markdown wrappers such as ```html in your answer; only provide the edited HTML code.

Here is an example of the final template:

<h1 id="h_01HGKSMPRSDHAY1F7ET44QWYF6" class="title">Example Article Title</h1>
<p>
  <strong>Tags: </strong>Topic, Domain, Issue, Solution, Feature
</p>
<p> </p>
<h1 id="h_01HGKSMPRS9JVGC6E9EA3DB98K">Overview</h1>
<p>
  This is an example of an article overview, which details the content of the article
  to help human users and LLMs quickly identify whether the content is relevant
  to the topic they want to review.
</p>
<p> </p>
<h1 id="h_01HGKSMPRSYYE7GMM7DZ0VCKXK">Information</h1>
<p>
  This is a short description of the focus of the steps to follow and contains
  a simple table of contents linking to the sub-sections for longer-form articles.
</p>
<ul>
  <li>
    <a href="#h_01HGEGEACR3P9G6ANV7YMZSSW5">Topic 1</a>
  </li>
  <li>
    <a href="#h_01HGEGECZFCZXP3WAEE6M0AFF4">Topic 2</a>
  </li>
</ul>
<p> </p>
<h3 id="h_01HGEGEACR3P9G6ANV7YMZSSW5">Topic 1</h3>
<p>
  This is an example of a topic that contains some details about a topic rather
  than any specific process to follow. These topics will provide some information
  to the reader.
</p>
<p> </p>
<h3 id="h_01HGEGECZFCZXP3WAEE6M0AFF4">Topic 2</h3>
<p>
  This is an example of a topic that has step-by-step guidance.
</p>
<p> </p>
<h4 id="h_01HGKSMPRSTG6RVHMWK6RX7NSM">Step 1</h4>
<p class="wysiwyg-indent1">
  This is a more detailed description of the actions needed to fulfil the step.
</p>
<h4 id="h_01HGKSMPRS0X842R2AG487JGJW">Step 2</h4>
<p class="wysiwyg-indent1">
  This is a more detailed description of the actions needed to fulfil the step.
</p>
<h4 id="h_01HGKSMPRSA7K6PVR6J2VZ1FNS">Step 3</h4>
<p class="wysiwyg-indent1">
  This is a more detailed description of the actions needed to fulfil the step.
</p>
<p> </p>
<h1 id="h_01HGKSMPRSER9ENY5QWWRZZ863">FAQ</h1>
<h3 id="h_01HGKSMPRSHXQ9M5E03BY1Z6BS">Question 1</h3>
<p class="wysiwyg-indent1">This is the answer to the question.</p>
<h3 id="h_01HGKSMPRSEYMTYT6PNHG2J1HD">Question 2</h3>
<p class="wysiwyg-indent1">This is the answer to the question.</p>
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
api_key,drive_link,output,flag,username,timestamp
,,"{""label"": null, ""confidences"": null}",,,2024-04-04 17:06:24.671339
packages.txt
ADDED
@@ -0,0 +1,2 @@
ffmpeg
ffprobe
requirements.txt
ADDED
@@ -0,0 +1,26 @@
annotated-types==0.6.0
anyio==4.2.0
certifi==2023.11.17
charset-normalizer==3.3.2
decorator==4.4.2
distro==1.9.0
h11==0.14.0
httpcore==1.0.2
httpx==0.26.0
idna==3.6
imageio==2.33.1
imageio-ffmpeg==0.4.9
moviepy==1.0.3
numpy==1.26.3
openai==1.8.0
pillow==10.2.0
proglog==0.1.10
pydantic==2.5.3
pydantic_core==2.14.6
pydub==0.25.1
python-dotenv==1.0.0
requests==2.31.0
sniffio==1.3.0
tqdm==4.66.1
typing_extensions==4.9.0
urllib3==2.1.0
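# Note: the application code also imports gradio, google-api-python-client,
# and google-auth (service_account, MediaIoBaseDownload, MediaFileUpload).
# Gradio is supplied by the Space runtime, but the Google client libraries
# may need to be pinned here if they are not preinstalled.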