sanchit-gandhi/whisper-jax · How to save the transcribed text and timestamp as an SRT file

Aug 24, 2023

Your website can transcribe the text and timestamps of videos, which is very impressive. Could you add a function to save the result as an SRT file?

CuddleMaster

Sep 21, 2023

did you figure it out?

CuddleMaster

Sep 25, 2023

•

edited Sep 25, 2023

Here's a Python script that does this, for anyone who needs it. Just change the input_file location:

import re

# Function to convert timestamp to seconds
def time_to_seconds(timestamp):
    """Convert a colon-separated timestamp string to seconds.

    Accepts ``SS.mmm``, ``MM:SS.mmm`` and ``HH:MM:SS.mmm``. Each field is
    parsed as a float and every field is worth 60 times the field to its
    right.

    Args:
        timestamp: Timestamp text such as ``"21:32.545"`` or
            ``"01:03:52.328"``.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If a field is not a valid number or there are more
            than three colon-separated fields.
    """
    fields = timestamp.split(':')
    if len(fields) > 3:
        raise ValueError(f"too many fields in timestamp: {timestamp!r}")

    total = 0.0
    for field in fields:
        # Shift what has been accumulated so far one base-60 place left.
        total = total * 60 + float(field)
    return total

# Function to format time in seconds as SRT time (hh:mm:ss,ms)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    The value is rounded to the nearest millisecond before it is split
    into fields. Truncating the fractional part instead loses a
    millisecond whenever the float sits just below the intended value
    (for example ``8.2`` would come out as ``00:00:08,199``).

    Args:
        seconds: Non-negative number of seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    total_milliseconds = round(seconds * 1000)
    whole_seconds, milliseconds = divmod(total_milliseconds, 1000)
    minutes, whole_seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

# Input and output file names
input_file = r"C:\Documents\Python scripts\input.txt"
output_file = r"C:\Documents\Python scripts\output.srt"

# Matches a cue prefix such as "[00:05.120 -> 00:09.480]" and captures the
# start and end timestamps.
timestamp_pattern = re.compile(r'\[(\d+:\d+\.\d+) -> (\d+:\d+\.\d+)]')

# Encodings are explicit so non-ASCII transcripts do not depend on the
# platform default; "utf-8-sig" also tolerates a leading byte-order mark.
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1.
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = time_to_seconds(time_match.group(1))
        end_time = time_to_seconds(time_match.group(2))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete bracketed subtitle text such as "[Music]".
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")

Adas93

Sep 25, 2023

const fs = require('fs');

/**
 * Convert Whisper-style transcript text into SRT subtitle text.
 *
 * Every line of the form "[MM:SS.mmm -> MM:SS.mmm] text" becomes one
 * numbered SRT cue; lines that do not match are ignored.
 *
 * @param {string} data - Full contents of the transcript file.
 * @returns {string} The SRT document ('' when no line matches).
 */
function convertWhisperToSrt(data) {
    // Accept " -> " as well as " --> " so the transcript does not have to
    // be edited by hand before conversion.
    const cuePattern = /\[(\d+:\d+\.\d+) --?> (\d+:\d+\.\d+)](.*)/;
    const cues = [];

    // Split on LF or CRLF so Windows line endings are handled too.
    for (const line of data.split(/\r?\n/)) {
        const match = line.match(cuePattern);
        if (!match) {
            continue;
        }

        const startTime = formatTime(match[1]);
        const endTime = formatTime(match[2]);
        const text = match[3].trim();

        // Cues are numbered from 1 in the order they appear.
        cues.push(`${cues.length + 1}\n${startTime} --> ${endTime}\n${text}\n\n`);
    }

    return cues.join('');
}

/**
 * Format a transcript timestamp as an SRT timestamp "HH:MM:SS,mmm".
 *
 * Accepts "MM:SS.mmm" and "HH:MM:SS.mmm". Missing hours default to "00",
 * and the fractional part is padded or cut to exactly three digits.
 *
 * @param {string} timestamp - For example "21:32.545" or "01:03:52.328".
 * @returns {string} For example "00:21:32,545".
 */
function formatTime(timestamp) {
    const parts = timestamp.split(':');

    // Work from the right: seconds are always last, hours are optional.
    const secondsAndMillis = parts.pop().split('.');
    const seconds = secondsAndMillis[0];
    const minutes = parts.pop() || '0';
    const hours = parts.pop() || '0';

    // A timestamp without a fractional part yields "000" instead of throwing.
    const milliseconds = (secondsAndMillis[1] || '').padEnd(3, '0').slice(0, 3);

    return `${pad(hours)}:${pad(minutes)}:${pad(seconds)},${milliseconds}`;
}

/**
 * Left-pad a value with zeros.
 *
 * @param {number|string} number - Value to pad; converted with toString().
 * @param {number} [length=2] - Minimum width of the result.
 * @returns {string} The value as a string, at least `length` characters wide.
 */
function pad(number, length = 2) {
    const digits = number.toString();
    return digits.padStart(length, '0');
}

// Read the transcript file and convert its contents
const inputFile = "whisper.txt";
try {
    const data = fs.readFileSync(inputFile, 'utf8');
    const srtOutput = convertWhisperToSrt(data);

    // Write the converted subtitles to the output file
    const outputFileName = "german2.srt";
    fs.writeFileSync(outputFileName, srtOutput, 'utf8');

    // Polish: 'File "<name>" has been saved in SRT format.'
    console.log(`Plik "${outputFileName}" został zapisany w formacie SRT.`);
} catch (err) {
    // The message has to be a template literal: without the backticks this
    // line is a syntax error and ${err.message} is never interpolated.
    // Polish: 'Error while reading/writing files: <message>'
    console.error(`Błąd podczas odczytu/zapisu plików: ${err.message}`);
}
Here is a Node.js script that converts a txt file to SRT. Remember to change the "->" arrows to "-->" in the txt file first; after that it will work easily.

ArthurParkerhouse

Nov 8, 2023

I just copy and paste the output into a text file, and then I load that text file into the free Subtitle Edit program - https://www.nikse.dk/subtitleedit - Then I run "Fix common errors" to have it auto-fix any issues and then export it as an SRT.

MeroyTruman

Jan 22

I adjusted CuddleMaster's code above to match both formats, (Hours:Minutes:Seconds.Milliseconds) (01:03:52.328) and (Minutes:Seconds.Milliseconds) (21:32.545),
and removed the time conversion:

import re

# Input and output file names
input_file = r"./input.txt"
output_file = r"./output.srt"

# Matches "[start -> end]" where each timestamp is either
# Hours:Minutes:Seconds.Milliseconds (01:03:52.328) or
# Minutes:Seconds.Milliseconds (21:32.545).
timestamp_pattern = re.compile(
    r'\[((?:\d+:){1,2}\d+\.\d+) -> ((?:\d+:){1,2}\d+\.\d+)]'
)


def _to_srt_timestamp(stamp):
    """Return *stamp* in SRT form: comma before the milliseconds, hours present.

    The hour field is detected by counting colons rather than by string
    length, so the result does not depend on how many digits each field has.
    """
    stamp = stamp.replace(".", ",")
    if stamp.count(":") == 1:
        # Only minutes and seconds are present: add hour '00:'
        stamp = "00:" + stamp
    return stamp


# Encodings are explicit so non-ASCII transcripts do not depend on the
# platform default; "utf-8-sig" also tolerates a leading byte-order mark.
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1.
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = _to_srt_timestamp(time_match.group(1))
        end_time = _to_srt_timestamp(time_match.group(2))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete bracketed subtitle text such as "[Music]".
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{start_time} --> {end_time}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")

DJStomp

7 days ago

•

edited 7 days ago

Here's a Python script that does this, for anyone who needs it. Just change the input_file location:

import re

# Function to convert timestamp to seconds
def time_to_seconds(timestamp):
    """Convert a colon-separated timestamp string to seconds.

    Accepts ``SS.mmm``, ``MM:SS.mmm`` and ``HH:MM:SS.mmm``. Each field is
    parsed as a float and every field is worth 60 times the field to its
    right.

    Args:
        timestamp: Timestamp text such as ``"21:32.545"`` or
            ``"01:03:52.328"``.

    Returns:
        The total number of seconds as a float.

    Raises:
        ValueError: If a field is not a valid number or there are more
            than three colon-separated fields.
    """
    fields = timestamp.split(':')
    if len(fields) > 3:
        raise ValueError(f"too many fields in timestamp: {timestamp!r}")

    total = 0.0
    for field in fields:
        # Shift what has been accumulated so far one base-60 place left.
        total = total * 60 + float(field)
    return total

# Function to format time in seconds as SRT time (hh:mm:ss,ms)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    The value is rounded to the nearest millisecond before it is split
    into fields. Truncating the fractional part instead loses a
    millisecond whenever the float sits just below the intended value
    (for example ``8.2`` would come out as ``00:00:08,199``).

    Args:
        seconds: Non-negative number of seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:02,500"`` for ``62.5``.
    """
    total_milliseconds = round(seconds * 1000)
    whole_seconds, milliseconds = divmod(total_milliseconds, 1000)
    minutes, whole_seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

# Input and output file names
input_file = r"C:\Documents\Python scripts\input.txt"
output_file = r"C:\Documents\Python scripts\output.srt"

# Matches a cue prefix such as "[00:05.120 -> 00:09.480]" and captures the
# start and end timestamps.
timestamp_pattern = re.compile(r'\[(\d+:\d+\.\d+) -> (\d+:\d+\.\d+)]')

# Encodings are explicit so non-ASCII transcripts do not depend on the
# platform default; "utf-8-sig" also tolerates a leading byte-order mark.
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1.
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = time_to_seconds(time_match.group(1))
        end_time = time_to_seconds(time_match.group(2))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete bracketed subtitle text such as "[Music]".
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")

Good lookin' out!

I adjusted CuddleMaster's code above to match both formats, (Hours:Minutes:Seconds.Milliseconds) (01:03:52.328) and (Minutes:Seconds.Milliseconds) (21:32.545),
and removed the time conversion:

import re

# Input and output file names
input_file = r"./input.txt"
output_file = r"./output.srt"

# Matches "[start -> end]" where each timestamp is either
# Hours:Minutes:Seconds.Milliseconds (01:03:52.328) or
# Minutes:Seconds.Milliseconds (21:32.545).
timestamp_pattern = re.compile(
    r'\[((?:\d+:){1,2}\d+\.\d+) -> ((?:\d+:){1,2}\d+\.\d+)]'
)


def _to_srt_timestamp(stamp):
    """Return *stamp* in SRT form: comma before the milliseconds, hours present.

    The hour field is detected by counting colons rather than by string
    length, so the result does not depend on how many digits each field has.
    """
    stamp = stamp.replace(".", ",")
    if stamp.count(":") == 1:
        # Only minutes and seconds are present: add hour '00:'
        stamp = "00:" + stamp
    return stamp


# Encodings are explicit so non-ASCII transcripts do not depend on the
# platform default; "utf-8-sig" also tolerates a leading byte-order mark.
with open(input_file, "r", encoding="utf-8-sig") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    # SRT cues are numbered from 1.
    subtitle_index = 1

    for line in infile:
        time_match = timestamp_pattern.search(line)
        if not time_match:
            # Skip lines without time information
            continue

        start_time = _to_srt_timestamp(time_match.group(1))
        end_time = _to_srt_timestamp(time_match.group(2))

        # Remove only the matched timestamp. A greedy r'\[.*\]' would also
        # delete bracketed subtitle text such as "[Music]".
        text = (line[:time_match.start()] + line[time_match.end():]).strip()

        outfile.write(f"{subtitle_index}\n")
        outfile.write(f"{start_time} --> {end_time}\n")
        outfile.write(f"{text}\n\n")
        subtitle_index += 1

print(f"Conversion completed. The output has been saved to {output_file}")

Nice, thanks