File size: 2,897 Bytes
9d0e777
 
064d0ae
9d0e777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ac7c1e
 
064d0ae
 
cb536a9
9d0e777
1cf8caa
9d0e777
 
 
38ab966
9d0e777
 
 
 
d54999f
38ab966
9d0e777
cb536a9
 
 
 
 
 
 
 
 
 
 
 
56c2f3a
cb536a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import torch
import torch.nn as nn
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer


class _MLPVectorProjector(nn.Module):
    """Project an input through ``width`` independent bias-free MLPs.

    Each branch starts with a linear layer mapping ``input_hidden_size`` to
    ``lm_hidden_size`` and is followed by ``num_layers - 1`` pairs of
    (GELU, linear ``lm_hidden_size`` -> ``lm_hidden_size``). The outputs of
    all branches are concatenated along the second-to-last dimension.
    """

    def __init__(
        self, input_hidden_size: int, lm_hidden_size: int, num_layers: int, width: int
    ):
        super().__init__()

        def build_branch() -> nn.Sequential:
            # First layer changes the feature size; the rest keep it fixed.
            layers = [nn.Linear(input_hidden_size, lm_hidden_size, bias=False)]
            for _ in range(num_layers - 1):
                layers += [
                    nn.GELU(),
                    nn.Linear(lm_hidden_size, lm_hidden_size, bias=False),
                ]
            return nn.Sequential(*layers)

        self.mlps = nn.ModuleList([build_branch() for _ in range(width)])

    def forward(self, x):
        """Run every branch on ``x`` and concatenate the results on dim -2."""
        branch_outputs = [branch(x) for branch in self.mlps]
        return torch.cat(branch_outputs, dim=-2)

# Checkpoint identifier shared by the tokenizer and the language model.
model_name = "microsoft/phi-2"

# Both objects are loaded once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
phi2_text = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)


def textMode(text, count):
    """Generate a continuation of ``text`` with the module-level Phi-2 model.

    Args:
        text: Prompt to continue.
        count: Maximum number of new tokens to generate. Anything ``int()``
            accepts is allowed (the UI passes the textbox content as a string).

    Returns:
        The first decoded sequence, with any trailing ``<|endoftext|>``
        marker(s) and trailing newlines removed.

    Raises:
        ValueError: If ``count`` is a string that is not a valid integer.
        TypeError: If ``count`` is of a type ``int()`` cannot convert.
    """
    eos_marker = "<|endoftext|>"
    count = int(count)
    inputs = tokenizer(text, return_tensors="pt", return_attention_mask=False)
    generated_ids = phi2_text.generate(
        **inputs,
        max_new_tokens=count,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    decoded = tokenizer.batch_decode(generated_ids)[0]

    # str.rstrip(chars) treats its argument as a *set of characters*, so
    # rstrip("<|endoftext|>") would also eat legitimate trailing letters such
    # as "t", "e" or "d". Remove only the literal marker instead.
    while decoded.endswith(eos_marker):
        decoded = decoded[: -len(eos_marker)]
    return decoded.rstrip("\n")
        


def imageMode(image, question):
    """Placeholder handler for the image tab; both inputs are ignored."""
    placeholder_reply = "In progress"
    return placeholder_reply

def audioMode(audio):
    """Placeholder handler for the audio tab; the input is ignored."""
    placeholder_reply = "In progress"
    return placeholder_reply


interface_title = "TSAI-ERA-V1 - Capstone - Multimodal GPT Demo"

# One tab per modality; each tab holds its inputs, a submit button and an
# output textbox. Handlers are wired to the buttons after the layout is built.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(f"## **{interface_title}**")
    gr.Markdown("Choose text mode/image mode/audio mode for generation")

    with gr.Tab("Text mode"):
        text_input = gr.Textbox(placeholder="Enter a prompt", label="Input")
        # textMode forwards this value as max_new_tokens, so the unit is
        # tokens, not characters.
        text_input_count = gr.Textbox(
            placeholder="Enter the maximum number of new tokens to generate",
            label="Count",
        )
        text_button = gr.Button("Submit")
        text_output = gr.Textbox(label="Chat GPT like text")

    with gr.Tab("Image mode"):
        with gr.Row():
            image_input = gr.Image()
            image_text_input = gr.Textbox(
                placeholder="Enter a question/prompt around the image",
                label="Question/Prompt",
            )
        image_button = gr.Button("Submit")
        image_text_output = gr.Textbox(label="Answer")

    with gr.Tab("Audio mode"):
        audio_input = gr.Audio()
        audio_button = gr.Button("Submit")
        audio_text_output = gr.Textbox(label="Chat GPT like text")

    # Connect each submit button to its handler.
    text_button.click(textMode, inputs=[text_input, text_input_count], outputs=text_output)
    image_button.click(imageMode, inputs=[image_input, image_text_input], outputs=image_text_output)
    audio_button.click(audioMode, inputs=audio_input, outputs=audio_text_output)

demo.launch()