# File size: 6,394 Bytes
# Revision: f255904
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os
import warnings
import torch
import soundfile as sf
from scipy.signal import resample
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import streamlit as st
import io
import numpy as np

# Suppress warnings globally
warnings.filterwarnings("ignore")

# Setup models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
whisper_model_id = "openai/whisper-medium"
flan_t5_model_id = "google/flan-t5-large"


# Streamlit re-executes this script on every widget interaction. Caching the
# loaders keeps the (large) models in memory instead of re-reading them on
# each rerun.
@st.cache_resource
def _load_whisper(model_id, target_device):
    """Load the Whisper model, its processor and an ASR pipeline.

    Args:
        model_id: Hugging Face model identifier of the Whisper checkpoint.
        target_device: Device string handed to the pipeline (e.g. "cpu").

    Returns:
        Tuple ``(model, processor, asr_pipeline)``.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
    processor = AutoProcessor.from_pretrained(model_id)
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        device=target_device
    )
    return model, processor, asr_pipeline


@st.cache_resource
def _load_flan_t5(model_id):
    """Load the FLAN-T5 tokenizer and model.

    Args:
        model_id: Hugging Face model identifier of the FLAN-T5 checkpoint.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_id)
    model = T5ForConditionalGeneration.from_pretrained(model_id)
    return tokenizer, model


# Load both model stacks under the same error handling so that any loading
# failure is reported in the UI and halts the script.
try:
    whisper_model, whisper_processor, whisper_pipe = _load_whisper(whisper_model_id, device)
    flan_t5_tokenizer, flan_t5_model = _load_flan_t5(flan_t5_model_id)
except ImportError as e:
    st.error(f"ImportError: {e}")
    st.stop()
except Exception as e:
    st.error(f"An error occurred while loading models: {e}")
    st.stop()

# Function to resample audio to 16000 Hz
def resample_audio(audio_data, original_sample_rate, target_sample_rate=16000):
    """Resample ``audio_data`` from one sample rate to another.

    Args:
        audio_data: Sequence/array of samples; its ``len()`` is taken as the
            number of input frames.
        original_sample_rate: Sample rate of ``audio_data`` in Hz.
        target_sample_rate: Desired sample rate in Hz (default 16000).

    Returns:
        The output of ``scipy.signal.resample`` with the frame count scaled
        by ``target_sample_rate / original_sample_rate`` (truncated to int).
    """
    # Multiply first, then divide, so the truncated frame count is computed
    # in the same floating-point order as before.
    scaled_frames = len(audio_data) * float(target_sample_rate)
    target_frames = int(scaled_frames / original_sample_rate)
    return resample(audio_data, target_frames)

# Function to transcribe audio files
def transcribe_audio(audio_file):
    """Transcribe an audio file to text with the Whisper pipeline.

    The audio is decoded with soundfile, downmixed to mono, resampled to
    16 kHz when needed and passed to ``whisper_pipe`` as raw samples.

    Args:
        audio_file: Path or file-like object readable by ``soundfile.read``.

    Returns:
        The transcribed text, or the string "Error during transcription" if
        anything fails (the error is also shown via ``st.error``).
    """
    try:
        # Read the audio file
        audio_data, sample_rate = sf.read(audio_file, dtype="float32")

        # The ASR pipeline expects single-channel audio; soundfile returns
        # multi-channel data as (frames, channels), so average the channels.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Resample if necessary
        if sample_rate != 16000:
            audio_data = resample_audio(audio_data, sample_rate, 16000)

        audio_data = np.asarray(audio_data, dtype=np.float32)

        # The pipeline runs its own feature extraction and must be given raw
        # samples; feeding it the processor's output makes it reject the input.
        # NOTE(review): recordings longer than Whisper's ~30 s window may need
        # chunked inference (e.g. chunk_length_s) — confirm against the
        # installed transformers version.
        result = whisper_pipe({"raw": audio_data, "sampling_rate": 16000})
        return result['text']
    except Exception as e:
        st.error(f"Error in audio transcription: {e}")
        return "Error during transcription"

# Function to extract text and questions from PDF
def extract_text_from_pdf(pdf_file):
    """Pull the raw text and the numbered question lines out of a PDF.

    A line counts as a question when, after stripping whitespace, it is
    non-empty and its first character is a digit.

    Args:
        pdf_file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        Tuple ``(text, questions)``: the page texts concatenated without a
        separator, and the list of stripped question lines. If an error
        occurs it is reported via ``st.error`` and whatever was collected
        up to that point is returned.
    """
    page_chunks = []
    questions = []
    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Pages without extractable text contribute nothing.
                if not page_text:
                    continue
                page_chunks.append(page_text)
                for raw_line in page_text.split("\n"):
                    candidate = raw_line.strip()
                    if candidate and candidate[0].isdigit():
                        questions.append(candidate)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    return "".join(page_chunks), questions

# Function to generate form data with FLAN-T5
def generate_form_data(text, questions):
    responses = []
    for question in questions:
        input_text = f"""The following text is a transcript from an audio recording. Read the text and answer the following question in a complete sentence.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""

        inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
        with torch.no_grad():
            outputs = flan_t5_model.generate(**inputs, max_length=100)

        generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

        if not generated_text.strip():
            generated_text = "The answer to this question is not present in the script."
        elif len(generated_text.strip()) < 10:
            input_text = f"""Based on the following transcript, provide a more detailed answer to the question.\n\nText: {text}\n\nQuestion: {question}\n\nAnswer:"""
            inputs = flan_t5_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
            outputs = flan_t5_model.generate(**inputs, max_length=100)
            generated_text = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

        responses.append(f"Question: {question}\nAnswer: {generated_text.strip()}")

    return "\n\n".join(responses)

# Function to save responses to PDF
def save_responses_to_pdf(responses, output_pdf_path):
    """Write one "File N" section per response into a PDF document.

    Args:
        responses: Iterable of response strings; newlines inside a response
            become line breaks in the PDF.
        output_pdf_path: Destination handed to ``SimpleDocTemplate``.
    """
    # Local import so this function is self-contained.
    from xml.sax.saxutils import escape

    document = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()

    response_style = ParagraphStyle(
        name='ResponseStyle',
        parent=styles['BodyText'],
        fontSize=10,
        spaceAfter=12
    )

    content = []
    for index, response in enumerate(responses, start=1):
        heading = Paragraph(f"<b>File {index}:</b>", styles['Heading2'])
        # Paragraph parses XML-like markup, so characters such as "<" or "&"
        # coming from the questions/answers must be escaped BEFORE the
        # newline -> <br/> substitution adds intentional markup.
        safe_response = escape(response).replace("\n", "<br/>")
        response_text = Paragraph(safe_response, response_style)

        content.append(heading)
        content.append(Spacer(1, 6))
        content.append(response_text)
        content.append(Spacer(1, 18))

    document.build(content)

# Streamlit UI
st.title("FillUp by Umar Majeed")

# Upload audio files
audio_files = st.file_uploader("Upload multiple audio files", type=["wav", "mp3"], accept_multiple_files=True)

# Upload PDF file
pdf_file = st.file_uploader("Upload a PDF file", type="pdf")

if st.button("Process"):
    if audio_files and pdf_file:
        responses = []
        pdf_text, pdf_questions = extract_text_from_pdf(pdf_file)

        # Without detected questions every answer block would be empty.
        if not pdf_questions:
            st.warning("No numbered questions were found in the PDF.")

        # Transcribe each recording and answer the PDF's questions from it.
        for audio_file in audio_files:
            transcribed_text = transcribe_audio(audio_file)
            form_data = generate_form_data(transcribed_text, pdf_questions)
            responses.append(form_data)
            st.write(f"File {len(responses)}:\n{form_data}\n")

        # Build the PDF in memory: a fixed path under /tmp would be shared by
        # all concurrent sessions and does not exist on every platform.
        pdf_buffer = io.BytesIO()
        save_responses_to_pdf(responses, pdf_buffer)
        st.write("Responses have been generated. You can download the result below.")

        st.download_button(
            label="Download PDF",
            data=pdf_buffer.getvalue(),
            file_name="response_output.pdf",
            mime="application/pdf"
        )
    else:
        st.error("Please upload both audio files and a PDF file.")