from langchain.llms import HuggingFacePipeline
import torch

from components import pexels, utils
import os, gc
import gradio as gr
from transformers import VitsModel, AutoTokenizer, pipeline
import torch

# Text-to-speech model and its tokenizer are both loaded from the same
# Hugging Face checkpoint (presumably the MMS Indonesian voice — confirm).
_TTS_CHECKPOINT = "facebook/mms-tts-ind"

model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)

# Pexels API key read from the environment; None when the variable is unset.
pexels_api_key = os.environ.get('pexels_api_key')

def _resolve_dimensions(choice):
    """Translate a "Video Dimension" dropdown label into render settings.

    Args:
        choice: The label selected in the UI.

    Returns:
        A tuple ``(orientation, height, width)``. Any value that is not one
        of the portrait or landscape labels falls back to a 1080 x 1080
        square.
    """
    if choice == "Shorts/Reels/TikTok (1080 x 1920)":
        # NOTE(review): "potrait" is kept exactly as it was; it looks like a
        # typo of "portrait", but pexels.generate_videos may rely on this
        # spelling — confirm before changing it.
        return "potrait", 1920, 1080
    # The dropdown offers "Facebook/Youtube Videos (1920 x 1080)". The shorter
    # "Youtube Videos (1920 x 1080)" label that was checked previously is
    # still accepted so existing callers keep working.
    if choice in (
        "Facebook/Youtube Videos (1920 x 1080)",
        "Youtube Videos (1920 x 1080)",
    ):
        return "landscape", 1080, 1920
    return "square", 1080, 1080


def pred(product_name, orientation):
    """Build a video for the given text and return its captions and path.

    Args:
        product_name: Text entered in the UI; forwarded unchanged to
            ``pexels.generate_videos``.
        orientation: The selected "Video Dimension" dropdown label.

    Returns:
        A two-item list: the generated sentences joined with newlines, and
        the path to ``new_filename.mp4`` inside the generated folder.
    """
    orientation, height, width = _resolve_dimensions(orientation)

    folder_name, sentences = pexels.generate_videos(
        product_name, pexels_api_key, orientation, height, width, model, tokenizer
    )
    # Presumably frees memory held by the generation step before combining.
    gc.collect()
    utils.combine_videos(folder_name)

    vid = os.path.join(folder_name, "Final_Ad_Video.mp4")
    # NOTE(review): "x.wav" is a fixed path relative to the working directory,
    # and "new_filename.mp4" is assumed to be written into folder_name by
    # utils.combine_audio_video — confirm against components.utils.
    spe = "x.wav"
    utils.combine_audio_video(folder_name, vid, spe)

    return ["\n".join(sentences), os.path.join(folder_name, "new_filename.mp4")]
    

# UI definition. Components are created in display order: header, dimension
# picker, text input, captions output, video output, submit button.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Content [Video] Generator
        Create a short video based on your text input using AI
        ### Note : the video generation takes about 2-4 minutes 
        """
    )

    # The selected label is handed to `pred` as its second argument.
    dimension = gr.Dropdown(
        choices=[
            "Shorts/Reels/TikTok (1080 x 1920)",
            "Facebook/Youtube Videos (1920 x 1080)",
            "Square (1080 x 1080)",
        ],
        label="Video Dimension",
        info="Choose dimension",
    )
    product_name = gr.Textbox(label="text story", lines=15, max_lines=100)

    # Outputs filled by `pred`: the caption text and the finished video.
    captions = gr.Textbox(label="captions")
    video = gr.Video()

    btn = gr.Button(value="Submit")
    btn.click(
        fn=pred,
        inputs=[product_name, dimension],
        outputs=[captions, video],
    )


demo.launch()