Spaces:

GeorgiosIoannouCoder
/

cuny-tech-prep-tutorial-1

Sleeping

File size: 7,384 Bytes

#############################################################################################################################
# Filename   : app.py
# Description: A Streamlit application that turns an image into an audio story.
# Author     : Georgios Ioannou
#
# Copyright © 2024 by Georgios Ioannou
#############################################################################################################################
# Import libraries.


import os  # Load environment variable(s).
import requests  # Send HTTP POST requests to the Hugging Face Inference API.
import streamlit as st  # Build the GUI of the application.

from langchain.chat_models import ChatOpenAI  # Access to OpenAI gpt-3.5-turbo model.
from langchain.chains import LLMChain  # Chain to run queries against LLMs.
# A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
from langchain.prompts import PromptTemplate
from transformers import pipeline  # Access to Hugging Face models.


#############################################################################################################################
# Load environment variable(s).

# Hugging Face access token taken from the process environment.
# The value is None when the variable is not set.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")


#############################################################################################################################
# Function to apply local CSS.


def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page.

    The stylesheet text is wrapped in a ``<style>`` tag and rendered through
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_name: Path of the CSS file to load.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8. Without an explicit encoding, open() falls
    # back to the platform locale, which can fail on or garble non-ASCII CSS.
    with open(file_name, encoding="utf-8") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)


#############################################################################################################################
# Return the text generated by the model for the image.
# Using pipeline.


@st.cache_resource(show_spinner=False)
def _load_image_captioner(model):
    """Build the "image-to-text" pipeline for `model` once and reuse it.

    Cached with st.cache_resource so the model is not reloaded on every call.
    """
    return pipeline("image-to-text", model=model)


def img_to_text(image_path, model="Salesforce/blip-image-captioning-base"):
    """Return the caption generated by an image-to-text model for an image.

    Uses the Hugging Face "image-to-text" task (https://huggingface.co/tasks).

    Args:
        image_path: Image input handed to the pipeline (the app passes a
            local file path).
        model: Hugging Face model id to use. Backup models that can be
            passed instead of the default:
            "nlpconnect/vit-gpt2-image-captioning" and
            "Salesforce/blip-image-captioning-large".

    Returns:
        The "generated_text" field of the first result produced by the
        pipeline.
    """
    captioner = _load_image_captioner(model)

    # The pipeline returns a list of result dicts; the caption lives in the
    # first one.
    return captioner(image_path)[0]["generated_text"]


#############################################################################################################################
# Return the story generated by the model for the scenario.
# Using Langchain.


def generate_story(scenario, personality):
    """Generate a short story about `scenario` told in the voice of `personality`.

    The prompt is filled in with both arguments and sent to the OpenAI
    "gpt-3.5-turbo" chat model through a LangChain LLMChain.

    Args:
        scenario: Text describing the scene the story is based on.
        personality: Name of the person the story teller should sound like.

    Returns:
        The text produced by the chain's `predict` call.
    """
    # The wording below can be adapted to other tasks, e.g. writing song
    # lyrics instead of a story.
    prompt_text = """
    You are a story teller.
    You must sound like {personality}.
    The story should be less than 50 words.
    Generate a story based on the above constraints and the following scenario: {scenario}.
    """

    story_prompt = PromptTemplate(
        template=prompt_text,
        input_variables=["scenario", "personality"],
    )

    # temperature=0 is the least creative setting; raise it for more
    # creative output.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    # verbose=True prints intermediate values to the console.
    chain = LLMChain(llm=chat_model, prompt=story_prompt, verbose=True)

    # predict() formats the prompt with the keyword arguments and passes it
    # to the LLM.
    return chain.predict(scenario=scenario, personality=personality)


#############################################################################################################################
# Return the speech generated by the model for the story.
# Using inference api.


def text_to_speech(story, output_path="audio.flac", timeout=120):
    """Synthesize speech for `story` and save the audio to `output_path`.

    Sends the text to the Hugging Face Inference API model
    "espnet/kan-bayashi_ljspeech_vits" (backup model: "facebook/mms-tts-eng")
    and writes the response body to disk.

    Args:
        story: Text to convert to speech.
        output_path: File the returned audio bytes are written to.
        timeout: Seconds to wait for the API before giving up.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within `timeout`.
    """
    api_url = (
        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    )

    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payload = {"inputs": story}

    # Without a timeout, requests would wait indefinitely on a stalled call.
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    # Fail loudly on an error response; otherwise the error body would be
    # written into the audio file and handed to the player.
    response.raise_for_status()

    with open(output_path, "wb") as audio_file:
        audio_file.write(response.content)


#############################################################################################################################
# Main function to create the Streamlit web application.


def main():
    """Render the Streamlit page and run the image -> story -> audio flow.

    The page shows a title, subtitle and logo, lets the user pick a
    personality and upload an image, then captions the image, turns the
    caption into a story, converts the story to speech, and displays the
    caption, story and audio. Any exception raised along the way is shown
    with st.error instead of propagating. The GitHub footer is rendered in
    every case.
    """
    import tempfile  # Local import: only this function needs it.

    try:
        # Page title and favicon.

        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")

        # Load CSS.

        local_css("styles/style.css")

        # Title.

        title = """<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -4rem">
                    Turn Image to Audio Story</h1>"""
        st.markdown(title, unsafe_allow_html=True)

        # Subtitle.

        subtitle = """<h2 align="center" style="font-family: monospace; font-size: 1.5rem; margin-top: -2rem">
                    CUNY Tech Prep Tutorial 1</h2>"""
        st.markdown(subtitle, unsafe_allow_html=True)

        # Image, placed in the middle one of three columns.

        _, center_column, _ = st.columns(3)
        with center_column:
            st.image(image="./ctp.png")

        # Define the personalities for the dropdown menu.

        personalities = [
            "Donald Trump",
            "Abraham Lincoln",
            "Aristotle",
            "Cardi B",
            "Kanye West",
        ]
        personality = st.selectbox("Select a personality:", personalities)

        # Upload an image.

        uploaded_file = st.file_uploader("Choose an image:")

        if uploaded_file is not None:
            # Display the uploaded image.

            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)

            # Save the upload to a private temporary file. Writing it to the
            # working directory under the client-supplied name would let an
            # upload overwrite application files such as app.py or ctp.png.
            # Only the extension of the original name is kept.

            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_image:
                temp_image.write(uploaded_file.getvalue())
                image_path = temp_image.name

            try:
                # Spinner to keep the application interactive.
                with st.spinner(text="Model Inference..."):
                    # Model inference.

                    scenario = img_to_text(image_path)
                    story = generate_story(scenario=scenario, personality=personality)
                    text_to_speech(story)

                    # Display the scenario and story.

                    with st.expander("Scenario"):
                        st.write(scenario)
                    with st.expander("Story"):
                        st.write(story)
            finally:
                # The temporary image is no longer needed once inference has
                # finished or failed.
                os.remove(image_path)

            # Display the audio.

            st.audio("audio.flac")
    except Exception as e:
        # Display any errors.

        st.error(e)

    # GitHub repository of author.

    st.markdown(
        """
            <p align="center" style="font-family: monospace; color: #FAF9F6; font-size: 1rem;"><b> Check out our
            <a href="https://github.com/GeorgiosIoannouCoder/" style="color: #FAF9F6;"> GitHub repository</a></b>
            </p>
    """,
        unsafe_allow_html=True,
    )


#############################################################################################################################


# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()