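"""Gradio Space: chunked speech transcription/translation for Indic languages,
with optional summarization and LLM-based question answering over the result."""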
import os
import json
import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import traceback
import sys
from audio_processing import AudioProcessor
import spaces
from chunkedTranscriber import ChunkedTranscriber

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)


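# Model loaders: each returns a ready pipeline, or None on failure so the
# calling handler can report the problem instead of crashing.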
def load_qa_model():
    """Load question-answering model"""
    try:
        model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
        qa_pipeline = pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
            token=os.getenv("HF_TOKEN"),  # HF token from the environment, in case the model repo is gated
        )
        return qa_pipeline
    except Exception as e:
        logger.error(f"Failed to load Q&A model: {str(e)}")
        return None


def load_summarization_model():
    """Load summarization model"""
    try:
        summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=0 if torch.cuda.is_available() else -1
        )
        return summarizer
    except Exception as e:
        logger.error(f"Failed to load summarization model: {str(e)}")
        return None


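# The `@spaces.GPU(duration=120)` decorator requests a ZeroGPU allocation for
# each call, with `duration` indicating the expected per-call GPU time.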
@spaces.GPU(duration=120)
def process_audio(audio_file, translate=False):
    """Transcribe an audio file in chunks, optionally translating to English."""
    transcriber = ChunkedTranscriber(chunk_size=5, overlap=1)
    # Honour the UI checkbox instead of forcing translation on.
    _translation, _output = transcriber.transcribe_audio(audio_file, translate=translate)
    return _translation, _output


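# The summarization and Q&A models are loaded lazily inside their GPU-decorated
# handlers, so weights are only fetched when the corresponding feature is used.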
@spaces.GPU(duration=120)
def summarize_text(text):
    """Summarize text"""
    try:
        summarizer = load_summarization_model()
        if summarizer is None:
            return "Summarization model could not be loaded."
        logger.info("Successfully loaded summarization model")

        summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}")
        return "Error occurred during summarization."


@spaces.GPU(duration=120)
def answer_question(context, question):
    """Answer questions about the text"""
    try:
        qa_pipeline = load_qa_model()
        if qa_pipeline is None:
            return "Q&A model could not be loaded."
        if not question:
            return "Please enter a question."

        messages = [
            {"role": "system", "content": """
Analyze a translated transcript of a conversation that may contain multiple speakers and summarize the information in a structured intelligence document.

The input format will include word-level or sentence-level timestamps, each indicating the speaker ID, language, and translated text.

# Input Format Overview

Word-Level Timestamps Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Word>"
```
Example:
```
0.01-0.02 - Speaker 1 - Language: English - Translated Text: "Proceed"
0.02-0.025 - Speaker 1 - Language: English - Translated Text: "with"
0.025-0.032 - Speaker 2 - Language: English - Translated Text: "caution"
```

Optional Sentence-Level Structure Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Sentence>"
```
Example with Sentence Grouping:
```
0.01-0.05 - Speaker 1 - Language: English - Translated Text: "Proceed with caution."
0.06-0.12 - Speaker 2 - Language: English - Translated Text: "All systems are ready."
```

# Intelligence Summary Document Structure

Use the format below to create a structured summary for each conversation transcript received:

### 1. Top-Level Status & Assessment:
- **Threat Level Assessment**:
  - Choose one:
    - Completely Innocuous
    - Likely Innocuous
    - Unclear — Requires Investigation
    - Likely Dangerous — Immediate Action
    - Likely Dangerous — Delayed Action
    - 100% Dangerous — Immediate Action
    - 100% Dangerous — Delayed Action
- **Humanitarian Alert**: Identify any indications of distress, coercion, or need for assistance, such as signs of duress or requests for help.

### 2. Basic Metadata:
- **Number of Speakers**: Total and unique speakers detected.
- **Languages**: List of languages used, with indication of who spoke which language.
- **Location**: Actual or inferred locations of participants.
- **Communication Medium**: Identify the method of interaction (e.g., phone call, direct conversation).

### 3. Conversation Overview:
- **Summary**: Concise breakdown of the main points and context.
- **Alarming Keywords**: Identify any concerning words, including but not limited to keywords like "kill," "attack," "weapon," etc.
- **Suspicious or Cryptic Phrases**: Statements that appear coded or unclear in the context of the discussion.

### 4. In-Depth Analysis:
- **Network Connections**: Identify mentions of additional individuals or groups involved.
- **Intent & Emotional Tone Detection**: Analyze emotional cues (e.g., anger, fear, calmness, urgency). Identify signs of deception or tension.
- **Behavioral Patterns**: Highlight repeated themes, phrases, or signals of planning and coordination.
- **Code Words & Cryptic Language**: Detect terms that may indicate hidden or covert meaning.
- **Geolocation References**: Point out any inferences regarding regional language or place names.
- **Sentiment on Strategic Issues**: Identify any indication of radical, dissenting, or anti-national views that could imply unrest or extremism.

### 5. Resource Mentions & Operational Logistics:
- **Resource & Asset Mentions**: List any mention of tools, weapons, vehicles, or supply logistics.
- **Behavioral Deviations**: Identify shifts in tone, speech, or demeanor suggesting stress, coercion, urgency, or preparation.

### 6. Prioritization, Recommendations & Actionables:
- **High-Risk Alert Priority**: Identify whether the conversation should be flagged for further attention.
- **Recommended Actions**:
  - **Surveillance**: Suggest surveillance if concerning patterns or keywords are detected.
  - **Intervention**: Recommend intervention for urgent/high-risk cases.
  - **Humanitarian Assistance**: Suggest immediate support for any signs of distress.
  - **Follow-Up Analysis**: Identify statements that need deeper review for clarity or to understand potential hidden meanings.

# Steps

1. Analyze the input conversation for participant information and context.
2. Fill in each section of the Intelligence Summary Document structure.
3. Ensure all details, especially those related to potential risk factors or alerts, are captured and highlighted clearly.

# Output Format

Provide one structured Intelligence Summary Document for the conversation in either plain text format or structured JSON.

# JSON Format Example:
```json
{
  "Top-Level Status & Assessment": {
    "Threat Level Assessment": "Unclear - Requires Investigation",
    "Humanitarian Alert": "No distress signals detected."
  },
  "Basic Metadata": {
    "Number of Speakers": 2,
    "Languages": {
      "Speaker 1": "English",
      "Speaker 2": "English"
    },
    "Location": "Unknown",
    "Communication Medium": "Direct conversation"
  },
  "Conversation Overview": {
    "Summary": "A cautious approach was suggested by Speaker 1, followed by an assurance from Speaker 2 that systems are ready.",
    "Alarming Keywords": [],
    "Suspicious or Cryptic Phrases": []
  },
  "In-Depth Analysis": {
    "Network Connections": "None identified",
    "Intent & Emotional Tone Detection": "Calm, precautionary tone",
    "Behavioral Patterns": "Speaker 1 expressing concern, Speaker 2 providing assurance",
    "Code Words & Cryptic Language": [],
    "Geolocation References": [],
    "Sentiment on Strategic Issues": "No radical or dissenting sentiment detected"
  },
  "Resource Mentions & Operational Logistics": {
    "Resource & Asset Mentions": [],
    "Behavioral Deviations": "None noted"
  },
  "Prioritization, Recommendations & Actionables": {
    "High-Risk Alert Priority": "Low",
    "Recommended Actions": {
      "Surveillance": "No further surveillance needed.",
      "Intervention": "Not required.",
      "Humanitarian Assistance": "Not required.",
      "Follow-Up Analysis": "No unusual phrases detected requiring review."
    }
  }
}
```

# Notes

- Ensure that you mark any ambiguous segments as requiring further investigation.
- Pay attention to emotional tone shifts or sudden changes in behavior.
- If any direct or implied threat is detected, prioritize appropriately using the provided classifications.
- Err on the side of caution: if there is even a remote possibility that something requires human attention, flag it.
"""} |
|
{"role": "user", "content": f"Context: {''.join(item['translated'] for item in context if 'translated' in item)}\n\nQuestion: {question}"} |
|
] |
|
|
|
        # The chat-style pipeline returns the full message list in
        # `generated_text`; keep only the newly generated assistant reply.
        response = qa_pipeline(messages, max_new_tokens=256)[0]['generated_text'][-1]['content']
        return response
    except Exception as e:
        logger.error(f"Q&A failed: {str(e)}")
        return f"Error occurred during Q&A process: {str(e)}"


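# --- Gradio UI --------------------------------------------------------------
# Audio input and controls on the left, transcription/translation outputs on
# the right; summarization and Q&A operate on the transcription textbox.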
with gr.Blocks() as iface:
    gr.Markdown("# Automatic Speech Recognition for Indic Languages")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath")
            translate_checkbox = gr.Checkbox(label="Enable Translation")
            process_button = gr.Button("Process Audio")

        with gr.Column():
            full_text_output = gr.Textbox(label="Full Text", lines=5)
            translation_output = gr.Textbox(label="Transcription/Translation", lines=10)

    with gr.Row():
        with gr.Column():
            summarize_button = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary", lines=3)

        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about the transcription")
            answer_button = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=3)

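    # Wire the buttons to their handlers. process_audio returns
    # (translation, full_text), matching the outputs list below.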
    process_button.click(
        process_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[translation_output, full_text_output]
    )

    summarize_button.click(
        summarize_text,
        inputs=[translation_output],
        outputs=[summary_output]
    )

    answer_button.click(
        answer_question,
        inputs=[translation_output, question_input],
        outputs=[answer_output]
    )

gr.Markdown(f""" |
|
## System Information |
|
- Device: {"CUDA" if torch.cuda.is_available() else "CPU"} |
|
- CUDA Available: {"Yes" if torch.cuda.is_available() else "No"} |
|
|
|
## Features |
|
- Automatic language detection |
|
- High-quality transcription using MMS |
|
- Optional translation to English |
|
- Text summarization |
|
- Question answering |
|
""") |
|
|
|
if __name__ == "__main__":
    iface.launch()