Spaces:

ad4r5hgs
/

image-audio-gen

Sleeping

App Files Files Community

image-audio-gen / app.py

ad4r5hgs

Rename main.py to app.py

1a648c8 verified 7 months ago

raw

history blame contribute delete

3.67 kB

	import streamlit as st
	from clarifai.client.model import Model
	import base64
	from dotenv import load_dotenv
	from PIL import Image
	from io import BytesIO

	load_dotenv()
	import os

	# Credentials are read from the process environment at import time; either
	# value is None when its variable is unset.
	clarifai_pat = os.environ.get("CLARIFAI_PAT")
	# NOTE(review): this value is later passed to the Clarifai-hosted models in
	# main(); confirm that the "COHERE" variable holds the key they expect.
	cohere_api_key = os.environ.get("COHERE")

	def generate_image(user_description, api_key):
	    """Generate a comic-style image through Clarifai's DALL-E 3 model and save it.

	    Args:
	        user_description: Free-text description appended to the comic-artist
	            prompt.
	        api_key: Clarifai personal access token used to authenticate the call.

	    Returns:
	        The path of the file that was written, always "generated_image.png".
	    """
	    prompt = f"You are a professional comic artist. Based on the below user's description and content, create a proper story comic: {user_description}"
	    inference_params = dict(quality="standard", size="1024x1024")

	    # Hand the token to the client through its `pat` argument rather than
	    # appending it to the model URL as a query string: a secret embedded in a
	    # URL can surface in logs and error messages, and the URL is meant to
	    # identify the model only.
	    model = Model(
	        url="https://clarifai.com/openai/dall-e/models/dall-e-3",
	        pat=api_key,
	    )
	    model_prediction = model.predict_by_bytes(
	        prompt.encode(), input_type="text", inference_params=inference_params
	    )

	    # NOTE(review): despite the field name, the value is written to disk
	    # unmodified as the image file content -- presumably raw bytes; confirm.
	    image_bytes = model_prediction.outputs[0].data.image.base64
	    with open("generated_image.png", "wb") as f:
	        f.write(image_bytes)
	    return "generated_image.png"

	def understand_image(base64_image, api_key):
	    """Ask the Clarifai-hosted GPT-4 Vision model to write a story about an image.

	    Args:
	        base64_image: Base64 text of the image, forwarded as the
	            ``image_base64`` inference parameter.
	        api_key: Key forwarded as the ``api_key`` inference parameter.
	            NOTE(review): presumably a provider key consumed by the hosted
	            model; confirm which credential it expects.

	    Returns:
	        The raw text of the first prediction output.
	    """
	    story_prompt = "Analyze the content of this image and write a creative, engaging story that brings the scene to life. Describe the characters, setting, and actions in a way that would captivate a young audience:"
	    params = {
	        "temperature": 0.2,
	        "image_base64": base64_image,
	        "api_key": api_key,
	    }
	    vision_model = Model(
	        "https://clarifai.com/openai/chat-completion/models/gpt-4-vision"
	    )
	    response = vision_model.predict_by_bytes(
	        story_prompt.encode(),
	        input_type="text",
	        inference_params=params,
	    )
	    first_output = response.outputs[0]
	    return first_output.data.text.raw

	def encode_image(image_path):
	    """Read the file at ``image_path`` and return its contents as base64 text.

	    Args:
	        image_path: Path of the file to read in binary mode.

	    Returns:
	        The base64 encoding of the file's bytes, decoded to a UTF-8 string.
	    """
	    with open(image_path, mode="rb") as handle:
	        raw_bytes = handle.read()
	    encoded = base64.b64encode(raw_bytes)
	    return str(encoded, "utf-8")

	def text_to_speech(input_text, api_key):
	    """Synthesize speech for ``input_text`` with the Clarifai-hosted OpenAI TTS model.

	    Args:
	        input_text: Text to be spoken.
	        api_key: Key forwarded as the ``api_key`` inference parameter.

	    Returns:
	        The ``audio.base64`` field of the first prediction output.
	    """
	    params = {"voice": "alloy", "speed": 1.0, "api_key": api_key}
	    tts_model = Model("https://clarifai.com/openai/tts/models/openai-tts-1")
	    response = tts_model.predict_by_bytes(
	        input_text.encode(),
	        input_type="text",
	        inference_params=params,
	    )
	    return response.outputs[0].data.audio.base64

	def main():
	    """Render the Streamlit page and run the image -> story -> audio pipeline.

	    The sidebar collects a description and a "Generate Image" button. When the
	    button is pressed with a non-empty description, the left column generates
	    and shows the comic image; the right column then produces a story from
	    that image and plays it back as audio.
	    """
	    st.set_page_config(page_title="Interactive Media Creator", layout="wide")
	    st.title("Interactive Media Creator")

	    with st.sidebar:
	        st.header("Controls")
	        image_description = st.text_area("Description for Image Generation", height=100)
	        generate_image_btn = st.button("Generate Image")

	    col1, col2 = st.columns(2)

	    # Evaluate the trigger once so both columns act on the same decision.
	    should_generate = bool(generate_image_btn and image_description)
	    # Defined up front so the story column can tell whether an image exists.
	    image_path = None

	    with col1:
	        st.header("Comic Art")
	        if should_generate:
	            with st.spinner("Generating image..."):
	                image_path = generate_image(image_description, clarifai_pat)
	            if image_path:
	                st.image(
	                    image_path,
	                    caption="Generated Comic Image",
	                    use_column_width=True,
	                )
	                st.success("Image generated!")
	            else:
	                st.error("Failed to generate image.")

	    with col2:
	        st.header("Story")
	        # Only continue when an image was actually produced; encoding a
	        # missing path would otherwise crash right after the error above.
	        if should_generate and image_path:
	            with st.spinner("Creating a story..."):
	                base64_image = encode_image(image_path)
	                # NOTE(review): the COHERE key is sent to OpenAI models
	                # hosted on Clarifai -- confirm this is the intended key.
	                understood_text = understand_image(base64_image, cohere_api_key)
	                audio_base64 = text_to_speech(understood_text, cohere_api_key)
	            st.audio(audio_base64, format="audio/mp3")
	            st.success("Audio generated from image understanding!")

	# Script entry point: build the page only when this file is executed as the
	# main module, not when it is imported.
	if __name__ == "__main__":
	main()