GeorgiosIoannouCoder committed
Commit 29bab07 · verified · 1 Parent(s): 75f74a2

Create app.py

Files changed (1):
app.py +191 -0
app.py ADDED
@@ -0,0 +1,191 @@
+ #############################################################################################################################
+ # Filename : app.py
+ # Description: A Streamlit application to turn an image into an audio story.
+ # Author : Georgios Ioannou
+ #
+ # Copyright © 2024 by Georgios Ioannou
+ #############################################################################################################################
+ # Import libraries.
+
+
+ import os  # Load environment variable(s).
+ import requests  # Send HTTP POST requests to Hugging Face models for inference.
+ import streamlit as st  # Build the GUI of the application.
+
+ from dotenv import find_dotenv, load_dotenv  # Load environment variables.
+ from langchain.chat_models import ChatOpenAI  # Access to OpenAI gpt-3.5-turbo model.
+ from langchain.chains import LLMChain  # Chain to run queries against LLMs.
+ # A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
+ from langchain.prompts import PromptTemplate
+ from transformers import pipeline  # Access to Hugging Face models.
+
+
+ #############################################################################################################################
+ # Load environment variable(s).
+
+ load_dotenv(find_dotenv())
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+
+ #############################################################################################################################
+ # Function to apply local CSS.
+
+
+ def local_css(file_name):
+     with open(file_name) as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+
+ #############################################################################################################################
+ # Return the text generated by the model for the image.
+ # Using pipeline.
+
+
+ def img_to_text(image_path):
+     # https://huggingface.co/tasks
+     # Task used here : "image-to-text".
+     # Model used here: "Salesforce/blip-image-captioning-base".
+     # Backup model: "nlpconnect/vit-gpt2-image-captioning".
+
+     image_to_text = pipeline(
+         "image-to-text", model="Salesforce/blip-image-captioning-base"
+     )
+     # image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+
+     scenario = image_to_text(image_path)[0]["generated_text"]
+
+     return scenario
+
+
+ #############################################################################################################################
+ # Return the story generated by the model for the scenario.
+ # Using LangChain.
+
+
+ def generate_story(scenario, personality):
+     # Model used here: "gpt-3.5-turbo".
+
+     # The template can be customized to meet one's needs, e.g.:
+     # generate a story or generate the lyrics of a song.
+
+     template = """
+     You are a storyteller.
+     You must sound like {personality}.
+     The story should be less than 50 words.
+     Generate a story based on the above constraints and the following scenario: {scenario}.
+     """
+
+     prompt = PromptTemplate(
+         template=template, input_variables=["scenario", "personality"]
+     )
+
+     story_llm = LLMChain(
+         llm=ChatOpenAI(
+             model_name="gpt-3.5-turbo", temperature=0
+         ),  # Increasing the temperature makes the model more creative and can make inference take longer.
+         prompt=prompt,
+         verbose=True,  # Print intermediate values to the console.
+     )
+
+     story = story_llm.predict(
+         scenario=scenario, personality=personality
+     )  # Format prompt with kwargs and pass to LLM.
+
+     return story
+
+
+ #############################################################################################################################
+ # Generate the speech for the story and save it to audio.flac.
+ # Using the Inference API.
+
+
+ def text_to_speech(story):
+     # Model used here: "espnet/kan-bayashi_ljspeech_vits".
+     # Backup model: "facebook/mms-tts-eng".
+
+     API_URL = (
+         "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
+     )
+     # API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"
+
+     headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+     payload = {"inputs": story}
+
+     response = requests.post(API_URL, headers=headers, json=payload)
+
+     with open("audio.flac", "wb") as file:
+         file.write(response.content)
+
+
+ #############################################################################################################################
+ # Main function to create the Streamlit web application.
+
+
+ def main():
+     try:
+         # Page title and favicon.
+
+         st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")
+
+         # Load CSS.
+
+         local_css("styles/style.css")
+
+         # Title.
+
+         title = f"""<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -6rem">
+         Turn Image to Audio Story</h1>"""
+         st.markdown(title, unsafe_allow_html=True)
+
+         # Define the personalities for the dropdown menu.
+
+         personalities = [
+             "Donald Trump",
+             "Abraham Lincoln",
+             "Aristotle",
+             "Cardi B",
+             "Kanye West",
+         ]
+         personality = st.selectbox("Select a personality:", personalities)
+
+         # Upload an image.
+
+         uploaded_file = st.file_uploader("Choose an image:")
+
+         if uploaded_file is not None:
+             # Display the uploaded image.
+
+             bytes_data = uploaded_file.getvalue()
+             with open(uploaded_file.name, "wb") as file:
+                 file.write(bytes_data)
+             st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)
+
+             with st.spinner(text="Model Inference..."):  # Spinner to keep the application interactive.
+                 # Model inference.
+
+                 scenario = img_to_text(uploaded_file.name)
+                 story = generate_story(scenario=scenario, personality=personality)
+                 text_to_speech(story)
+
+             # Display the scenario and story.
+
+             with st.expander("Scenario"):
+                 st.write(scenario)
+             with st.expander("Story"):
+                 st.write(story)
+
+             # Display the audio.
+
+             st.audio("audio.flac")
+     except Exception as e:
+         # Display any errors.
+
+         st.error(e)
+
+
+ #############################################################################################################################
+
+
+ if __name__ == "__main__":
+     main()
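
For quick local testing, the three helpers can also be exercised outside of Streamlit. The sketch below is a hypothetical smoke test, not part of this commit: the script name, the example.jpg path, and the chosen personality are placeholders, and it assumes app.py is importable from the working directory with HUGGINGFACEHUB_API_TOKEN and OPENAI_API_KEY (required by ChatOpenAI) available via the same .env file.

# smoke_test.py - illustrative sketch only, not part of this commit.
from app import generate_story, img_to_text, text_to_speech

if __name__ == "__main__":
    scenario = img_to_text("example.jpg")  # Caption a local sample image (placeholder path).
    story = generate_story(scenario=scenario, personality="Aristotle")  # Short story in the chosen voice.
    text_to_speech(story)  # Writes audio.flac to the working directory.
    print("Scenario:", scenario)
    print("Story:", story)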
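
A possible hardening, sketched below and not part of the commit: the Hugging Face Inference API can answer with a JSON error body (for example while the model is still loading) rather than audio bytes, and text_to_speech would then write that error into audio.flac. Raising on a non-200 status instead lets the try/except in main() surface the problem in the UI. The function name and parameters here are illustrative.

# Hypothetical variant of text_to_speech with a status check.
import requests

def text_to_speech_checked(story, api_url, token):
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(api_url, headers=headers, json={"inputs": story})
    response.raise_for_status()  # Propagate API errors (e.g. model loading, bad token) to the caller.
    with open("audio.flac", "wb") as file:
        file.write(response.content)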