diff --git a/examples/examples_.DS_Store b/.DS_Store similarity index 83% rename from examples/examples_.DS_Store rename to .DS_Store index b72bbc3351faf9fba73c538cfa032af5e0f6778a..c32883842752acf742ff9e4a68f0123bb24f31f0 100644 Binary files a/examples/examples_.DS_Store and b/.DS_Store differ diff --git a/app.py b/app.py index 126ec74d50a7668fa63ad601f143f3505c9ff9d4..e34994af0fb3769a0d110ba1176cc3287fd07fd4 100644 --- a/app.py +++ b/app.py @@ -7,788 +7,36 @@ import base64 import json import gradio as gr import numpy as np -from gradio import processing_utils -import requests -from packaging import version -from PIL import Image, ImageDraw +from PIL import Image import functools import emoji from langchain_community.chat_models import ChatOpenAI from langchain.schema import HumanMessage from caption_anything.model import CaptionAnything -from caption_anything.utils.image_editing_utils import create_bubble_frame from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter, image_resize from caption_anything.utils.parser import parse_augment from caption_anything.captioner import build_captioner -from caption_anything.text_refiner import build_text_refiner from caption_anything.segmenter import build_segmenter -from chatbox import ConversationBot, build_chatbot_tools, get_new_image_name +from backend.chatbox import ConversationBot, build_chatbot_tools, get_new_image_name from segment_anything import sam_model_registry import easyocr import re -import edge_tts from langchain import __version__ -import torch -from transformers import AutoProcessor, SiglipModel -import faiss -from huggingface_hub import hf_hub_download -from datasets import load_dataset import pandas as pd import requests -import spaces -# Print the current version of LangChain -print(f"Current LangChain version: {__version__}") -print("testing testing") - - - -# import tts - -############################################################################### -############# this part is for 3D generate ############# -############################################################################### - - -# import spaces # -# import threading - -# lock = threading.Lock() import os -# import uuid -# from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler -# from diffusers.utils import export_to_video -# from safetensors.torch import load_file -#from diffusers.models.modeling_outputs import Transformer2DModelOutput - - -import random -import uuid import json -from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler - - - - -import imageio import numpy as np -import torch -import rembg from PIL import Image -from torchvision.transforms import v2 -from pytorch_lightning import seed_everything -from omegaconf import OmegaConf -from einops import rearrange, repeat -from tqdm import tqdm -from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler - -from src.utils.train_util import instantiate_from_config -from src.utils.camera_util import ( - FOV_to_intrinsics, - get_zero123plus_input_cameras, - get_circular_camera_poses, -) -from src.utils.mesh_util import save_obj, save_glb -from src.utils.infer_util import remove_background, resize_foreground, images_to_video - -import tempfile -from functools import partial - -from huggingface_hub import hf_hub_download - - - - -# def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False): -# """ -# Get the rendering camera parameters. 
-# """ -# c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation) -# if is_flexicubes: -# cameras = torch.linalg.inv(c2ws) -# cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1) -# else: -# extrinsics = c2ws.flatten(-2) -# intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2) -# cameras = torch.cat([extrinsics, intrinsics], dim=-1) -# cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1) -# return cameras - - -# def images_to_video(images, output_path, fps=30): -# # images: (N, C, H, W) -# os.makedirs(os.path.dirname(output_path), exist_ok=True) -# frames = [] -# for i in range(images.shape[0]): -# frame = (images[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8).clip(0, 255) -# assert frame.shape[0] == images.shape[2] and frame.shape[1] == images.shape[3], \ -# f"Frame shape mismatch: {frame.shape} vs {images.shape}" -# assert frame.min() >= 0 and frame.max() <= 255, \ -# f"Frame value out of range: {frame.min()} ~ {frame.max()}" -# frames.append(frame) -# imageio.mimwrite(output_path, np.stack(frames), fps=fps, codec='h264') - - -# ############################################################################### -# # Configuration. -# ############################################################################### - -# import shutil - -# def find_cuda(): -# # Check if CUDA_HOME or CUDA_PATH environment variables are set -# cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') - -# if cuda_home and os.path.exists(cuda_home): -# return cuda_home - -# # Search for the nvcc executable in the system's PATH -# nvcc_path = shutil.which('nvcc') - -# if nvcc_path: -# # Remove the 'bin/nvcc' part to get the CUDA installation path -# cuda_path = os.path.dirname(os.path.dirname(nvcc_path)) -# return cuda_path - -# return None - -# cuda_path = find_cuda() - -# if cuda_path: -# print(f"CUDA installation found at: {cuda_path}") -# else: -# print("CUDA installation not found") - -# config_path = 'configs/instant-nerf-base.yaml' -# config = OmegaConf.load(config_path) -# config_name = os.path.basename(config_path).replace('.yaml', '') -# model_config = config.model_config -# infer_config = config.infer_config - -# IS_FLEXICUBES = True if config_name.startswith('instant-mesh') else False - -# device = torch.device('cuda') - -# # load diffusion model -# print('Loading diffusion model ...') -# pipeline = DiffusionPipeline.from_pretrained( -# "sudo-ai/zero123plus-v1.2", -# custom_pipeline="zero123plus", -# torch_dtype=torch.float16, -# ) -# pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config( -# pipeline.scheduler.config, timestep_spacing='trailing' -# ) - -# # load custom white-background UNet -# unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model") -# state_dict = torch.load(unet_ckpt_path, map_location='cpu') -# pipeline.unet.load_state_dict(state_dict, strict=True) - -# pipeline = pipeline.to(device) - -# # load reconstruction model -# print('Loading reconstruction model ...') -# model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_nerf_base.ckpt", repo_type="model") -# model0 = instantiate_from_config(model_config) -# state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict'] -# state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k} -# model0.load_state_dict(state_dict, strict=True) - -# model0 = model0.to(device) - -# 
print('Loading Finished!') - - -# def check_input_image(input_image): -# if input_image is None: -# raise gr.Error("No image uploaded!") -# image = None -# else: -# image = Image.open(input_image) -# return image - -# def preprocess(input_image, do_remove_background): - -# rembg_session = rembg.new_session() if do_remove_background else None - -# if do_remove_background: -# input_image = remove_background(input_image, rembg_session) -# input_image = resize_foreground(input_image, 0.85) - -# return input_image - +from backend.prompts import generate_prompt +from backend.recommendation import RecommendationConfig, ImageRecommender +from backend.gpt_service import get_gpt_response, get_artistinfo, get_yearinfo +from backend.texttospeech.tts import texttospeech -# # @spaces.GPU -# def generate_mvs(input_image, sample_steps, sample_seed): +recommendation_config = RecommendationConfig() +recommender = ImageRecommender(recommendation_config) -# seed_everything(sample_seed) - -# # sampling -# z123_image = pipeline( -# input_image, -# num_inference_steps=sample_steps -# ).images[0] - -# show_image = np.asarray(z123_image, dtype=np.uint8) -# show_image = torch.from_numpy(show_image) # (960, 640, 3) -# show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2) -# show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3) -# show_image = Image.fromarray(show_image.numpy()) - -# return z123_image, show_image - - -# # @spaces.GPU -# def make3d(images): - -# global model0 -# if IS_FLEXICUBES: -# model0.init_flexicubes_geometry(device) -# model0 = model0.eval() - -# images = np.asarray(images, dtype=np.float32) / 255.0 -# images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float() # (3, 960, 640) -# images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2) # (6, 3, 320, 320) - -# input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device) -# render_cameras = get_render_cameras(batch_size=1, radius=2.5, is_flexicubes=IS_FLEXICUBES).to(device) - -# images = images.unsqueeze(0).to(device) -# images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1) - -# mesh_fpath = tempfile.NamedTemporaryFile(suffix=f".obj", delete=False).name -# print(mesh_fpath) -# mesh_basename = os.path.basename(mesh_fpath).split('.')[0] -# mesh_dirname = os.path.dirname(mesh_fpath) -# video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4") -# mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb") - -# with torch.no_grad(): -# # get triplane -# planes = model0.forward_planes(images, input_cameras) - -# # # get video -# # chunk_size = 20 if IS_FLEXICUBES else 1 -# # render_size = 384 - -# # frames = [] -# # for i in tqdm(range(0, render_cameras.shape[1], chunk_size)): -# # if IS_FLEXICUBES: -# # frame = model.forward_geometry( -# # planes, -# # render_cameras[:, i:i+chunk_size], -# # render_size=render_size, -# # )['img'] -# # else: -# # frame = model.synthesizer( -# # planes, -# # cameras=render_cameras[:, i:i+chunk_size], -# # render_size=render_size, -# # )['images_rgb'] -# # frames.append(frame) -# # frames = torch.cat(frames, dim=1) - -# # images_to_video( -# # frames[0], -# # video_fpath, -# # fps=30, -# # ) - -# # print(f"Video saved to {video_fpath}") - -# # get mesh -# mesh_out = model0.extract_mesh( -# planes, -# use_texture_map=False, -# **infer_config, -# ) - -# vertices, faces, vertex_colors = mesh_out -# vertices = vertices[:, [1, 2, 0]] - -# save_glb(vertices, faces, vertex_colors, 
mesh_glb_fpath) -# save_obj(vertices, faces, vertex_colors, mesh_fpath) - -# print(f"Mesh saved to {mesh_fpath}") - -# return mesh_fpath, mesh_glb_fpath - - -############################################################################### -############# above part is for 3D generate ############# -############################################################################### - - -############################################################################### -############# This part is for sCLIP ############# -############################################################################### - -# download model and dataset -hf_hub_download("merve/siglip-faiss-wikiart", "siglip_10k_latest.index", local_dir="./") -hf_hub_download("merve/siglip-faiss-wikiart", "wikiart_10k_latest.csv", local_dir="./") - -# read index, dataset and load siglip model and processor -index = faiss.read_index("./siglip_10k_latest.index") -df = pd.read_csv("./wikiart_10k_latest.csv") -device = torch.device('cuda' if torch.cuda.is_available() else "cpu") -processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") -slipmodel = SiglipModel.from_pretrained("google/siglip-base-patch16-224").to(device) - - -def read_image_from_url(url): - response = requests.get(url) - img = Image.open(BytesIO(response.content)).convert("RGB") - return img - -#@spaces.GPU -def extract_features_siglip(image): - with torch.no_grad(): - inputs = processor(images=image, return_tensors="pt").to(device) - image_features = slipmodel.get_image_features(**inputs) - return image_features - -@spaces.GPU -def infer(crop_image_path,full_image_path,state,language,task_type=None): - print("task type",task_type) - style_gallery_output = [] - item_gallery_output=[] - - if task_type=="task 1": - item_gallery_output.append("recomendation_pic/1.8.jpg") - item_gallery_output.append("recomendation_pic/1.9.jpg") - input_image = Image.open(full_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 2) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - style_gallery_output.append(img_retrieved) - if language=="English": - msg="🖼️ Please refer to the section below to see the recommended results." - else: - msg="🖼️ 请到下方查看推荐结果。" - state+=[(None,msg)] - - return item_gallery_output, style_gallery_output,state,state - elif task_type=="task 2": - item_gallery_output.append("recomendation_pic/2.8.jpg") - item_gallery_output.append("recomendation_pic/2.9.png") - input_image = Image.open(full_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 2) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - style_gallery_output.append(img_retrieved) - if language=="English": - msg="🖼️ Please refer to the section below to see the recommended results." 
- else: - msg="🖼️ 请到下方查看推荐结果。" - state+=[(None,msg)] - - return item_gallery_output, style_gallery_output,state,state - - elif task_type=="task 3": - item_gallery_output.append("recomendation_pic/3.8.png") - item_gallery_output.append("recomendation_pic/basket-2.png") - input_image = Image.open(full_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 2) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - style_gallery_output.append(img_retrieved) - if language=="English": - msg="🖼️ Please refer to the section below to see the recommended results." - else: - msg="🖼️ 请到下方查看推荐结果。" - state+=[(None,msg)] - - return item_gallery_output, style_gallery_output,state,state - - elif crop_image_path: - input_image = Image.open(crop_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 2) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - item_gallery_output.append(img_retrieved) - - input_image = Image.open(full_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 2) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - style_gallery_output.append(img_retrieved) - if language=="English": - msg="🖼️ Please refer to the section below to see the recommended results." - else: - msg="🖼️ 请到下方查看推荐结果。" - state+=[(None,msg)] - - return item_gallery_output, style_gallery_output,state,state - else: - input_image = Image.open(full_image_path).convert("RGB") - input_features = extract_features_siglip(input_image.convert("RGB")) - input_features = input_features.detach().cpu().numpy() - input_features = np.float32(input_features) - faiss.normalize_L2(input_features) - distances, indices = index.search(input_features, 4) - for i,v in enumerate(indices[0]): - sim = -distances[0][i] - image_url = df.iloc[v]["Link"] - img_retrieved = read_image_from_url(image_url) - style_gallery_output.append(img_retrieved) - if language=="English": - msg="🖼️ Please refer to the section below to see the recommended results." 
- else: - msg="🖼️ 请到下方查看推荐结果。" - state+=[(None,msg)] - - return item_gallery_output, style_gallery_output,state,state - - - -############################################################################### -############# Above part is for sCLIP ############# -############################################################################### - - -############################################################################### -############# this part is for text to image ############# -############################################################################### - -# # Use environment variables for flexibility -MODEL_ID = os.getenv("MODEL_ID", "sd-community/sdxl-flash") MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096")) -USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" -ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" -BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1")) # Allow generating multiple images at once - -# # Determine device and load model outside of function for efficiency -# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -# pipe = StableDiffusionXLPipeline.from_pretrained( -# MODEL_ID, -# torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, -# use_safetensors=True, -# add_watermarker=False, -# ).to(device) -# pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - -# # Torch compile for potential speedup (experimental) -# if USE_TORCH_COMPILE: -# pipe.compile() - -# # CPU offloading for larger RAM capacity (experimental) -# if ENABLE_CPU_OFFLOAD: -# pipe.enable_model_cpu_offload() - -MAX_SEED = np.iinfo(np.int32).max - -# def save_image(img): -# unique_name = str(uuid.uuid4()) + ".png" -# img.save(unique_name) -# return unique_name - -# def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: -# if randomize_seed: -# seed = random.randint(0, MAX_SEED) -# return seed - -# @spaces.GPU(duration=30, queue=False) -# def generate( -# prompt: str, -# negative_prompt: str = "", -# use_negative_prompt: bool = False, -# seed: int = 1, -# width: int = 200, -# height: int = 200, -# guidance_scale: float = 3, -# num_inference_steps: int = 30, -# randomize_seed: bool = False, -# num_images: int = 4, # Number of images to generate -# use_resolution_binning: bool = True, -# progress=gr.Progress(track_tqdm=True), -# ): -# seed = int(randomize_seed_fn(seed, randomize_seed)) -# generator = torch.Generator(device=device).manual_seed(seed) - -# # Improved options handling -# options = { -# "prompt": [prompt] * num_images, -# "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None, -# "width": width, -# "height": height, -# "guidance_scale": guidance_scale, -# "num_inference_steps": num_inference_steps, -# "generator": generator, -# "output_type": "pil", -# } - -# # Use resolution binning for faster generation with less VRAM usage -# # if use_resolution_binning: -# # options["use_resolution_binning"] = True - -# # Generate images potentially in batches -# images = [] -# for i in range(0, num_images, BATCH_SIZE): -# batch_options = options.copy() -# batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE] -# if "negative_prompt" in batch_options: -# batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE] -# images.extend(pipe(**batch_options).images) - -# image_paths = [save_image(img) for img in images] -# return image_paths, seed - -# examples = [ -# "a cat eating a piece of cheese", -# "a ROBOT riding a BLUE horse on Mars, 
photorealistic, 4k", -# "Ironman VS Hulk, ultrarealistic", -# "Astronaut in a jungle, cold color palette, oil pastel, detailed, 8k", -# "An alien holding a sign board containing the word 'Flash', futuristic, neonpunk", -# "Kids going to school, Anime style" -# ] - - - - -############################################################################### -############# above part is for text to image ############# -############################################################################### - - -print("4") - -css = """ -#warning {background-color: #FFCCCB} -.tools_button { - display: flex; - flex-direction: column; - align-items: center; - justify-content: center; - background: white; - border: none !important; - box-shadow: none !important; - text-align: center; - color: black; -} - -.tools_button_clicked { - display: flex; - flex-direction: column; - align-items: center; - justify-content: center; - background: white; - border: none !important; - box-shadow: none !important; - text-align: center; - color: rgb(18,150,219); -} - -.tools_button_add { - display: flex; - flex-direction: column; - align-items: center; - justify-content: center; - background: white; - border: none !important; - box-shadow: none !important; - text-align: center; - color: rgb(18,150,219); -} - - -.info_btn { - background: rgb(245, 245, 245) !important; - border: none !important; - box-shadow: none !important; - font-size: 15px !important; - min-width: 6rem !important; - max-width: 10rem !important; -} - -.info_btn_interact { - background: rgb(217, 217, 217) !important; - box-shadow: none !important; - font-size: 15px !important; - min-width: 6rem !important; - max-width: 10rem !important; -} - -.function_button { - background: rgb(227, 226, 226) !important; - border: none !important; - box-shadow: none !important; -} - -.function_button_rec { - background: rgb(189, 189, 189) !important; - border: none !important; - box-shadow: none !important; -} - -.small_button { - font-size: 12px !important; - padding: 2px 8px !important; - min-width: 60px !important; - height: 30px !important; -} - -#tool_box {max-width: 50px} - -""" -filtered_language_dict = { - 'English': {'female': 'en-US-JennyNeural', 'male': 'en-US-GuyNeural'}, - 'Chinese': {'female': 'zh-CN-XiaoxiaoNeural', 'male': 'zh-CN-YunxiNeural'}, - 'French': {'female': 'fr-FR-DeniseNeural', 'male': 'fr-FR-HenriNeural'}, - 'Spanish': {'female': 'es-MX-DaliaNeural', 'male': 'es-MX-JorgeNeural'}, - 'Arabic': {'female': 'ar-SA-ZariyahNeural', 'male': 'ar-SA-HamedNeural'}, - 'Portuguese': {'female': 'pt-BR-FranciscaNeural', 'male': 'pt-BR-AntonioNeural'}, - 'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'} -} - -focus_map = { -"Describe":0, -"D+Analysis":1, -"DA+Interprete":2, -"Judge":3 -} - - - -# prompt_list = [ -# [ -# 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', -# 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact that describes the theme and content you see, and one analysis of the techniques used in the work (shape, color, texture, form principles) as markdown outline with appropriate emojis. 
Each point listed is to be in {language} language, with a response length of about {length} words.', -# "Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact that describes the theme and content, one analysis of the techniques used in the work (shape, color, texture, form principles), and one interpretation that explores the deeper meaning and the artist's intentions (thoughts, emotions, concepts) as a markdown outline with appropriate emojis. Each point listed is to be in {language} language, with a response length of about {length} words.", -# 'Wiki_caption: {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.' -# ], -# [ -# "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", -# "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis of the techniques used in the work (shape, color, texture, form principles) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", -# "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact, one analysis of the techniques used in the work (shape, color, texture, form principles), and one interpretation that explores the deeper meaning and the artist's intentions (thoughts, emotions, concepts) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. 
Each point listed is to be in {language} language, with a response length of about {length} words.", -# "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", -# ], -# [ -# 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', -# "When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis of the techniques used in the work (shape, color, texture, form principles). Each point should be formatted as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", -# "When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis of the techniques used in the work (shape, color, texture, form principles) and and one interpretation that explores the deeper meaning and the artist's intentions (thoughts, emotions, concepts). Each point should be formatted as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. 
Each point listed is to be in {language} language, with a response length of about {length} words.", -# 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', -# ] -# ] - -prompt_list = [ - [ - 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'Wiki_caption: {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.' - ], - [ - "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", - "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. 
You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", - "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact, one analysis, and one interpret from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", - "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", - ], - [ - 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. 
Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', - 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', - ] -] - -recommendation_prompt=[ - - [ - ''' - First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - Recommendation reason: {{Recommendation based on {{object}} in the painting you saw earlier. Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points. }} - Each bullet point should be in {language} language, with a response length of about {length} words. - ''', - ''' - When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier. - - First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - - Recommendation reason: {{I'm the creator of that painting you saw earlier. I'm an artist. and I'm recommending this painting based on the fact that the {{object}} I've drawn also appear in the painting you're looking at. }} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. 
Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I. - - Each bullet point should be in {language} language, with a response length of about {length} words. - - ''', - ''' - When generating answers, you should tell people that you are the object itself that was selected in the painting, and generate text in the tone and manner in which you are the object - - First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - - Recommendation reason: {{I'm the {{object}} in the painting you were looking at earlier, and I'm recommending this painting based on the fact that I'm also present in the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the object of this painting and start every sentence with I. - - Each bullet point should be in {language} language, with a response length of about {length} words. - - '''], - - [ - ''' - First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - Recommendation reason: {{Recommendation based on the painting {{name}}.Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points.}} - Each bullet point should be in {language} language, with a response length of about {length} words. - ''', - ''' - When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier. - - First identify what the creator of the first painting is, you save yourself as the parameter: {artist}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - - Recommendation reason: {{I'm the creator of that painting you saw earlier, {artist}. I'm an artist. and I'm recommending this painting based on the fact that the painting you're looking at is similar to the one you just saw of me.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I. - - Each bullet point should be in {language} language, with a response length of about {length} words. 
- - ''', - ''' - When generating answers, you should tell people that I am the painting you were looking at earlier itself, and generate text in the tone and manner in which you are the painting were looking at earlier. - - First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: - - Recommendation reason: {{I'm the painting {{name}} you were looking at earlier, and I'm recommending this painting based on the fact that I'm similar to the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the painting were looking at earlier and start every sentence with I. - - Each bullet point should be in {language} language, with a response length of about {length} words. - - '''], - - - - -] - -gpt_state = 0 -VOICE = "en-GB-SoniaNeural" -article = """ -
By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
-Gradio demo for EyeSee Anything in Art, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. """ examples = [ - ["test_images/1.The Ambassadors.jpg","test_images/task1.jpg","task 1"], - ["test_images/2.Football Players.jpg","test_images/task2.jpg","task 2"], - ["test_images/3-square.jpg","test_images/task3.jpg","task 3"], - # ["test_images/test4.jpg"], - # ["test_images/test5.jpg"], - # ["test_images/Picture5.png"], - - ] + ["assets/test_images/1.The Ambassadors.jpg","assets/test_images/task1.jpg","task 1"], + ["assets/test_images/2.Football Players.jpg","assets/test_images/task2.jpg","task 2"], + ["assets/test_images/3-square.jpg","assets/test_images/task3.jpg","task 3"]] with gr.Blocks( css=css, @@ -1910,7 +671,7 @@ def create_ui(): ) as iface: #display in the chatbox state = gr.State([]) - # expoer in log + # export in log log_state=gr.State([]) # history log for gpt history_log=gr.State([]) @@ -1929,7 +690,6 @@ def create_ui(): input_mask_state = gr.State(np.zeros((1, 1))) input_points_state = gr.State([]) input_labels_state = gr.State([]) - #store the selected image new_crop_save_path = gr.State(None) image_input_nobackground = gr.State(None) artist=gr.State(None) @@ -1938,17 +698,12 @@ def create_ui(): point_prompt = gr.State("Positive") log_list=gr.State([]) gender=gr.State('female') - # store the whole image path image_path=gr.State('') pic_index=gr.State(None) - recomended_state=gr.State([]) - + recomended_state=gr.State([]) recomended_path=gr.State(None) recomended_type=gr.State(None) - - - with gr.Row(): with gr.Column(scale=6): @@ -1970,21 +725,7 @@ def create_ui(): label="Check to autoplay audio", value=False, elem_classes="custom-autoplay",visible=False) output_audio = gr.HTML( label="Synthesised Audio", elem_classes="custom-output", visible=False) - with gr.Tab("Base(GPT Power)",visible=False) as base_tab: - image_input_base = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650) - with gr.Row(): - name_label_base = gr.Button(value="Name: ",elem_classes="info_btn") - artist_label_base = gr.Button(value="Artist: ",elem_classes="info_btn_interact") - year_label_base = gr.Button(value="Year: ",elem_classes="info_btn_interact") - material_label_base = gr.Button(value="Style: ",elem_classes="info_btn") - with gr.Tab("Base2",visible=False) as base_tab2: - image_input_base_2 = gr.Image(type="pil", interactive=True, elem_classes="image_upload",height=650) - with gr.Row(): - name_label_base2 = gr.Button(value="Name: ",elem_classes="info_btn") - artist_label_base2 = gr.Button(value="Artist: ",elem_classes="info_btn_interact") - year_label_base2 = gr.Button(value="Year: ",elem_classes="info_btn_interact") - material_label_base2 = gr.Button(value="Style: ",elem_classes="info_btn") with gr.Row(): with gr.Column(scale=1,min_width=50,visible=False) as instruct: @@ -2009,26 +750,10 @@ def create_ui(): focus_dda = gr.Button(value="Judge",interactive=True,elem_classes="function_button") recommend_btn = gr.Button(value="Recommend",interactive=True,elem_classes="function_button_rec") - # focus_asso = gr.Button(value="Associate",interactive=True,elem_classes="function_button",variant="primary") + - - with gr.Tab("Trajectory (beta)", visible=False) as traj_tab: - # sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=10, - # elem_id="image_sketcher") - sketcher_input = gr.ImageEditor(type="pil", interactive=True - ) - with gr.Row(): - name_label_traj = 
gr.Button(value="Name: ") - artist_label_traj = gr.Button(value="Artist: ") - year_label_traj = gr.Button(value="Year: ") - material_label_traj = gr.Button(value="Material: ") - # example_image_traj = gr.Image(type="pil", interactive=False, visible=False) - with gr.Row(): - clear_button_sketcher = gr.Button(value="Clear Sketch", interactive=True) - submit_button_sketcher = gr.Button(value="Submit", interactive=True) - with gr.Column(visible=False,scale=4) as modules_need_gpt1: with gr.Row(visible=False): sentiment = gr.Radio( @@ -2044,15 +769,7 @@ def create_ui(): label="Factuality", interactive=True, ) - # length = gr.Slider( - # minimum=10, - # maximum=80, - # value=10, - # step=1, - # interactive=True, - # label="Generated Caption Length", - # ) - # 是否启用wiki内容整合到caption中 + enable_wiki = gr.Radio( choices=["Yes", "No"], value="No", @@ -2068,7 +785,7 @@ def create_ui(): with gr.Column(scale=4): with gr.Column(visible=True) as module_key_input: openai_api_key = gr.Textbox( - value="sk-proj-bxHhgjZV8TVgd1IupZrUT3BlbkFJvrthq6zIxpZVk3vwsvJ9", + value="", placeholder="Input openAI API key", show_label=False, label="OpenAI API Key", @@ -2076,15 +793,10 @@ def create_ui(): type="password") with gr.Row(): enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary') - # disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True, - # variant='primary') + with gr.Column(visible=False) as module_notification_box: notification_box = gr.Textbox(lines=1, label="Notification", max_lines=5, show_label=False) - - # with gr.Column() as modules_need_gpt0: - # with gr.Column(visible=False) as modules_need_gpt2: - # paragraph_output = gr.Textbox(lines=16, label="Describe Everything", max_lines=16) - # cap_everything_button = gr.Button(value="Caption Everything in a Paragraph", interactive=True) + with gr.Column(visible=False) as modules_not_need_gpt2: with gr.Blocks(): chatbot = gr.Chatbot(label="Chatbox", elem_classes="chatbot",likeable=True,height=750,bubble_full_width=False) @@ -2103,72 +815,23 @@ def create_ui(): scale=5, interactive=True) - - - - - - - - - - # TTS interface hidden initially - with gr.Column(visible=False) as tts_interface: - input_text = gr.Textbox(label="Text Prompt", value="Hello, World !, here is an example of light voice cloning. 
Try to upload your best audio samples quality") - input_language = gr.Dropdown(label="Language", choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn"], value="en") - input_audio = gr.Audio(label="Reference Audio", type="filepath", value="examples/female.wav") - input_mic = gr.Audio(sources="microphone", type="filepath", label="Use Microphone for Reference") - use_mic = gr.Checkbox(label="Check to use Microphone as Reference", value=False) - agree = gr.Checkbox(label="Agree", value=True) - output_waveform = gr.Video(label="Waveform Visual") - # output_audio = gr.HTML(label="Synthesised Audio") - - with gr.Row(): - submit_tts = gr.Button(value="Submit", interactive=True) - clear_tts = gr.Button(value="Clear", interactive=True) with gr.Row(): with gr.Column(scale=6): with gr.Row(): with gr.Column(visible=False) as recommend: - - # sort_rec=gr.Dropdown(["1", "2", "3", "4"], visible=False, - # value=[], - # multiselect=True, - # label="Score", info="Please sort the pictures according to your preference" - # ) gallery_result = gr.Gallery( label="Object-based Recommendation", height="auto", columns=2, - interactive=False - # columns=4, - # rows=2, - # show_label=False, - # allow_preview=True, - # object_fit="contain", - # height="auto", - # preview=True, - # show_share_button=True, - # show_download_button=True - ) + interactive=False) style_gallery_result = gr.Gallery( label="Style-based Recommendation", height="auto", columns=2, - interactive=False - # columns=4, - # rows=2, - # show_label=False, - # allow_preview=True, - # object_fit="contain", - # height="auto", - # preview=True, - # show_share_button=True, - # show_download_button=True - ) + interactive=False) with gr.Column(scale=3,visible=False) as reco_preview: selected_image = gr.Image(label="Selected Image", interactive=False) @@ -2198,109 +861,7 @@ def create_ui(): examples=examples, inputs=[example_image,task_instuction,task_type], ) - - - - - - ############################################################################### - ############# this part is for text to image ############# - ############################################################################### - - with gr.Row(variant="panel",visible=False) as text2image_model: - - with gr.Column(): - with gr.Column(): - gr.Radio(["Other Paintings by the Artist"], label="Artist", info="Who is the artist?🧑🎨"), - gr.Radio(["Oil Painting","Printmaking","Watercolor Painting","Drawing"], label="Art Forms", info="What are the art forms?🎨"), - gr.Radio(["Renaissance", "Baroque", "Impressionism","Modernism"], label="Period", info="Which art period?⏳"), - # to be done - gr.Dropdown( - ["ran", "swam", "ate", "slept"], value=["swam", "slept"], multiselect=True, label="Items", info="Which items are you interested in?" 
- ) - - with gr.Row(): - prompt = gr.Text( - label="Prompt", - show_label=False, - max_lines=1, - placeholder="Enter your prompt", - container=False, - ) - run_button = gr.Button("Run") - - with gr.Accordion("Advanced options", open=False): - num_images = gr.Slider( - label="Number of Images", - minimum=1, - maximum=4, - step=1, - value=4, - ) - with gr.Row(): - use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=True) - negative_prompt = gr.Text( - label="Negative prompt", - max_lines=5, - lines=4, - placeholder="Enter a negative prompt", - value="(deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, (mutated hands and fingers:1.4), disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation, NSFW", - visible=True, - ) - seed = gr.Slider( - label="Seed", - minimum=0, - maximum=MAX_SEED, - step=1, - value=0, - ) - randomize_seed = gr.Checkbox(label="Randomize seed", value=True) - with gr.Row(): - width = gr.Slider( - label="Width", - minimum=100, - maximum=MAX_IMAGE_SIZE, - step=64, - value=1024, - ) - height = gr.Slider( - label="Height", - minimum=100, - maximum=MAX_IMAGE_SIZE, - step=64, - value=1024, - ) - with gr.Row(): - guidance_scale = gr.Slider( - label="Guidance Scale", - minimum=0.1, - maximum=6, - step=0.1, - value=3.0, - ) - num_inference_steps = gr.Slider( - label="Number of inference steps", - minimum=1, - maximum=15, - step=1, - value=8, - ) - # with gr.Column(): - # result = gr.Gallery( - # label="Result", - # height="auto", - # columns=4 - # # columns=4, - # # rows=2, - # # show_label=False, - # # allow_preview=True, - # # object_fit="contain", - # # height="auto", - # # preview=True, - # # show_share_button=True, - # # show_download_button=True - # ) - + with gr.Row(visible=False) as export: chat_log_file = gr.File(label="Download Chat Log",scale=5) @@ -2321,59 +882,15 @@ def create_ui(): interactive=True, label="Generated Caption Length", ) - - # auto_play = gr.Checkbox( - # label="Check to autoplay audio", value=False, elem_classes="custom-autoplay" - # ) - # output_audio = gr.HTML( - # label="Synthesised Audio", elem_classes="custom-output" - # ) - - - # gr.Examples( - # examples=examples, - # inputs=prompt, - # cache_examples=False - # ) - - use_negative_prompt.change( - fn=lambda x: gr.update(visible=x), - inputs=use_negative_prompt, - outputs=negative_prompt, - api_name=False, - ) - - # gr.on( - # triggers=[ - # prompt.submit, - # negative_prompt.submit, - # run_button.click, - # ], - # fn=generate, - # inputs=[ - # prompt, - # negative_prompt, - # use_negative_prompt, - # seed, - # width, - # height, - # guidance_scale, - # num_inference_steps, - # randomize_seed, - # num_images - # ], - # outputs=[result, seed], - # api_name="run", - # ) recommend_btn.click( - fn=infer, + fn=recommender.infer, inputs=[new_crop_save_path,image_path,state,language,task_type], outputs=[gallery_result,style_gallery_result,chatbot,state] ) gallery_result.select( - item_associate, + recommender.item_associate, inputs=[new_crop_save_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state], outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path, recomended_type], @@ -2381,160 +898,18 @@ def create_ui(): ) style_gallery_result.select( - style_associate, + recommender.style_associate, 
inputs=[image_path,openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,artist_label], - outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path,recomended_type], - - - ) + outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score,selected_image,recomended_path,recomended_type]) selected_image.select( get_recommendation, inputs=[new_crop_save_path,image_path, openai_api_key,language,auto_play,length,log_state,sort_rec,naritive,recomended_state,recomended_type,artist_label,recomended_path], outputs=[recommend_bot,recomended_state,output_audio,log_state,pic_index,recommend_score]) - ############################################################################### - ############# above part is for text to image ############# - ############################################################################### - - - ############################################################################### - # this part is for 3d generate. - ############################################################################### - - # with gr.Row(variant="panel",visible=False) as d3_model: - # with gr.Column(): - # with gr.Row(): - # input_image = gr.Image( - # label="Input Image", - # image_mode="RGBA", - # sources="upload", - # #width=256, - # #height=256, - # type="pil", - # elem_id="content_image", - # ) - # processed_image = gr.Image( - # label="Processed Image", - # image_mode="RGBA", - # #width=256, - # #height=256, - # type="pil", - # interactive=False - # ) - # with gr.Row(): - # with gr.Group(): - # do_remove_background = gr.Checkbox( - # label="Remove Background", value=True - # ) - # sample_seed = gr.Number(value=42, label="Seed Value", precision=0) - - # sample_steps = gr.Slider( - # label="Sample Steps", - # minimum=30, - # maximum=75, - # value=75, - # step=5 - # ) - - # with gr.Row(): - # submit = gr.Button("Generate", elem_id="generate", variant="primary") - - # with gr.Row(variant="panel"): - # gr.Examples( - # examples=[ - # os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples")) - # ], - # inputs=[input_image], - # label="Examples", - # cache_examples=False, - # examples_per_page=16 - # ) - - # with gr.Column(): - - # with gr.Row(): - - # with gr.Column(): - # mv_show_images = gr.Image( - # label="Generated Multi-views", - # type="pil", - # width=379, - # interactive=False - # ) - - # # with gr.Column(): - # # output_video = gr.Video( - # # label="video", format="mp4", - # # width=379, - # # autoplay=True, - # # interactive=False - # # ) - - # with gr.Row(): - # with gr.Tab("OBJ"): - # output_model_obj = gr.Model3D( - # label="Output Model (OBJ Format)", - # interactive=False, - # ) - # gr.Markdown("Note: Downloaded .obj model will be flipped. Export .glb instead or manually flip it before usage.") - # with gr.Tab("GLB"): - # output_model_glb = gr.Model3D( - # label="Output Model (GLB Format)", - # interactive=False, - # ) - # gr.Markdown("Note: The model shown here has a darker appearance. 
Download to get correct results.") - - - - - # mv_images = gr.State() chatbot.like(print_like_dislike, inputs=[state,log_state], outputs=[log_state,chatbot]) - - # submit.click(fn=check_input_image, inputs=[new_crop_save_path], outputs=[processed_image]).success( - # fn=generate_mvs, - # inputs=[processed_image, sample_steps, sample_seed], - # outputs=[mv_images, mv_show_images] - - # ).success( - # fn=make3d, - # inputs=[mv_images], - # outputs=[output_model_obj, output_model_glb] - # ) - - ############################################################################### - # above part is for 3d generate. - ############################################################################### - - - def clear_tts_fields(): - return [gr.update(value=""), gr.update(value=""), None, None, gr.update(value=False), gr.update(value=True), None, None] - - # submit_tts.click( - # tts.predict, - # inputs=[input_text, input_language, input_audio, input_mic, use_mic, agree], - # outputs=[output_waveform, output_audio], - # queue=True - # ) - - clear_tts.click( - clear_tts_fields, - inputs=None, - outputs=[input_text, input_language, input_audio, input_mic, use_mic, agree, output_waveform, output_audio], - queue=False - ) - - - - - # clear_button_sketcher.click( - # lambda x: (x), - # [origin_image], - # [sketcher_input], - # queue=False, - # show_progress=False - # ) + recommend_score.select( get_recommendationscore, @@ -2542,73 +917,27 @@ def create_ui(): outputs=[log_state], ) - - - - + openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key], outputs=[export, modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt, - modules_not_need_gpt2, tts_interface, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3,sort_rec,reco_preview]) + modules_not_need_gpt2, module_key_input ,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3,sort_rec,reco_preview]) enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key], outputs=[export,modules_need_gpt1, modules_need_gpt3, modules_not_need_gpt, - modules_not_need_gpt2, tts_interface,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3,sort_rec,reco_preview]) - - # disable_chatGPT_button.click(init_wo_openai_api_key, - # outputs=[export,modules_need_gpt1, modules_need_gpt3, - # modules_not_need_gpt, - # modules_not_need_gpt2, tts_interface,module_key_input, module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row]) + modules_not_need_gpt2,module_key_input,module_notification_box, text_refiner, visual_chatgpt, notification_box,top_row,recommend,reco_reasons,instruct,modules_not_need_gpt3,sort_rec,reco_preview]) - # artist_label_base2.click( - # get_artistinfo, - # inputs=[artist_label_base2,openai_api_key,state,language,auto_play,length], - # outputs=[chatbot,state,output_audio] - # ) artist_label.click( get_artistinfo, inputs=[artist_label,openai_api_key,state,language,auto_play,length,log_state], outputs=[chatbot,state,output_audio,log_state] ) - # artist_label_traj.click( - # get_artistinfo, - # inputs=[artist_label_traj,openai_api_key,state,language,auto_play,length], - # outputs=[chatbot,state,output_audio] - # ) - - # year_label_base2.click( - # get_yearinfo, - # inputs=[year_label_base2,openai_api_key,state,language,auto_play,length], - # 
outputs=[chatbot,state,output_audio] - # ) + year_label.click( get_yearinfo, inputs=[year_label,openai_api_key,state,language,auto_play,length,log_state], outputs=[chatbot,state,output_audio,log_state] ) - # year_label_traj.click( - # get_yearinfo, - # inputs=[year_label_traj,openai_api_key,state,language,auto_play,length], - # outputs=[chatbot,state,output_audio] - # ) - - - # enable_chatGPT_button.click( - # lambda: (None, [], [], [[], [], []], "", "", ""), - # [], - # [image_input, chatbot, state, click_state, paragraph_output, origin_image], - # queue=False, - # show_progress=False - # ) - # openai_api_key.submit( - # lambda: (None, [], [], [[], [], []], "", "", ""), - # [], - # [image_input, chatbot, state, click_state, paragraph_output, origin_image], - # queue=False, - # show_progress=False - # ) - - # cap_everything_button.click(cap_everything, [paragraph, visual_chatgpt, language,auto_play], - # [paragraph_output,output_audio]) + def reset_and_add(origin_image): new_prompt = "Positive" new_add_icon = "assets/icons/plus-square-blue.png" @@ -2625,14 +954,7 @@ def create_ui(): show_progress=False ) clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt]) - # clear_button_image.click( - # lambda: (None, [], [], [[], [], []], "", "", ""), - # [], - # [image_input, chatbot, state, click_state, paragraph, origin_image], - # queue=False, - # show_progress=False - # ) - # clear_button_image.click(clear_chat_memory, inputs=[visual_chatgpt]) + clear_button_text.click( lambda: ([], [], [[], [], [], []],[]), [], @@ -2651,52 +973,21 @@ def create_ui(): ) image_input.clear(clear_chat_memory, inputs=[visual_chatgpt]) - - # image_input.change( - # lambda: ([], [], [[], [], []], [], []), - # [], - # [chatbot, state, click_state, history_log, log_state], - # queue=False, - # show_progress=False - # ) - - - - - # image_input_base.upload(upload_callback, [image_input_base, state, visual_chatgpt,openai_api_key,language,naritive], - # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2, - # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \ - # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \ - # paragraph,artist,gender,image_path]) - - # image_input_base_2.upload(upload_callback, [image_input_base_2, state, visual_chatgpt,openai_api_key,language,naritive], - # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2, - # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \ - # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \ - # paragraph,artist,gender,image_path]) + image_input.upload(upload_callback, [image_input, state, log_state,task_type, visual_chatgpt,openai_api_key,language,naritive,history_log,auto_play,session_type], - [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2, - image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \ - 
name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \ - paragraph,artist,gender,image_path,log_state,history_log,output_audio]) + [chatbot, state, origin_image, click_state, image_input,image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,\ + paragraph,artist,gender,image_path,log_state,history_log,output_audio]) - # sketcher_input.upload(upload_callback, [sketcher_input, state, visual_chatgpt,openai_api_key], - # [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2, - # image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \ - # name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \ - # paragraph,artist]) chat_input.submit(chat_input_callback, [visual_chatgpt, chat_input, click_state, state, aux_state,language,auto_play,gender,openai_api_key,image_path,log_state,history_log,naritive], [chatbot, state, aux_state,output_audio,log_state,history_log]) - # chat_input.submit(lambda: "", None, chat_input) + chat_input.submit(lambda: {"text": ""}, None, chat_input) example_image.change(upload_callback, [example_image, state, log_state, task_type, visual_chatgpt, openai_api_key,language,naritive,history_log,auto_play,session_type], - [chatbot, state, origin_image, click_state, image_input, image_input_base, sketcher_input,image_input_base_2, - image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,name_label_base, artist_label_base, year_label_base, material_label_base, \ - name_label_base2, artist_label_base2, year_label_base2, material_label_base2,name_label_traj, artist_label_traj, year_label_traj, material_label_traj, \ + [chatbot, state, origin_image, click_state, image_input, image_embedding, original_size, input_size,name_label,artist_label,year_label,material_label,\ paragraph,artist,gender,image_path, log_state,history_log,output_audio]) example_image.change(clear_chat_memory, inputs=[visual_chatgpt]) @@ -2704,41 +995,7 @@ def create_ui(): lambda:([],[],[],None,[],gr.update(value="Preview")), [], [gallery_result,style_gallery_result,recommend_bot,new_crop_save_path,chatbot,recommend_type]) - - # def on_click_tab_selected(): - # if gpt_state ==1: - # print(gpt_state) - # print("using gpt") - # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2 - # else: - # print("no gpt") - # print("gpt_state",gpt_state) - # return [gr.update(visible=False)]+[gr.update(visible=True)]+[gr.update(visible=False)]*2 - - # def on_base_selected(): - # if gpt_state ==1: - # print(gpt_state) - # print("using gpt") - # return [gr.update(visible=True)]*2+[gr.update(visible=False)]*2 - # else: - # print("no gpt") - # return [gr.update(visible=False)]*4 - - # traj_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2]) - # click_tab.select(on_click_tab_selected, outputs=[modules_need_gpt1,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt2]) - # base_tab.select(on_base_selected, outputs=[modules_need_gpt0,modules_need_gpt2,modules_not_need_gpt2,modules_need_gpt1]) - # base_tab2.select(on_base_selected, outputs=[modules_not_need_gpt2,modules_not_need_gpt2,modules_need_gpt0,modules_need_gpt1]) - - def 
print_reason(): - print("reason") - - - - - - - image_input.select( inference_click, inputs=[ @@ -2830,19 +1087,6 @@ def create_ui(): ) - - - - - # submit_button_sketcher.click( - # inference_traject, - # inputs=[ - # origin_image,sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state, - # original_size, input_size, text_refiner,focus_type_sketch,paragraph,openai_api_key,auto_play,Input_sketch - # ], - # outputs=[chatbot, state, sketcher_input,output_audio,new_crop_save_path], - # show_progress=False, queue=True - # ) export_button.click( export_chat_log, @@ -2869,21 +1113,6 @@ def create_ui(): [], [image_input, chatbot, state, click_state, paragraph, origin_image,history_log,log_state,task_instuction,task_type,gallery_result,style_gallery_result,recommend_bot] ) - - # upvote_btn.click( - # handle_liked, - # inputs=[state,like_res], - # outputs=[chatbot,like_res] - # ) - - # downvote_btn.click( - # handle_disliked, - # inputs=[state,dislike_res], - # outputs=[chatbot,dislike_res] - # ) - - - return iface @@ -2891,6 +1120,5 @@ def create_ui(): if __name__ == '__main__': print("main") iface = create_ui() - # iface.queue(api_open=False, max_size=10) - iface.queue(api_open=False, max_size=40) + iface.queue(api_open=False, max_size=10) iface.launch(server_name="0.0.0.0") \ No newline at end of file diff --git a/assets/UI.png b/assets/UI.png deleted file mode 100644 index b15f2f7910d2e5bca5deb20fb1e69440e6b603b7..0000000000000000000000000000000000000000 --- a/assets/UI.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bce7f8b8b11832a98d85ecf7755274df5860d9b5eb35738dabbb2e585d70ddd4 -size 2643805 diff --git a/assets/caption_anything_logo.png b/assets/caption_anything_logo.png deleted file mode 100644 index 3ef08b1d76c8a941701d5ba72d30a6cd21d5d689..0000000000000000000000000000000000000000 Binary files a/assets/caption_anything_logo.png and /dev/null differ diff --git a/assets/demo1.jpg b/assets/demo1.jpg deleted file mode 100644 index 7d8d3a3b46d6907c94ee75bc92eb8e6af0b7ed40..0000000000000000000000000000000000000000 --- a/assets/demo1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a3bf5f8e4e8a79824f06916cdd41c94c23c5159abf3ecd5045732f27dd358f2 -size 1874272 diff --git a/assets/demo1.png b/assets/demo1.png deleted file mode 100644 index c33d26fcdeabb07652162b71579151446518736d..0000000000000000000000000000000000000000 --- a/assets/demo1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bd22e897705a8cebb3f1fc2ddf857eeeb1736b7b627cf8c24ed84c17728a4cc -size 1791363 diff --git a/assets/demo1.svg b/assets/demo1.svg deleted file mode 100644 index b8435a71ad9f9d5b66d7f7e991ef490271fb6c62..0000000000000000000000000000000000000000 --- a/assets/demo1.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/assets/demo2.png b/assets/demo2.png deleted file mode 100644 index 172b1229e49fe9a1805d2ac969d72cc1e9e1987b..0000000000000000000000000000000000000000 Binary files a/assets/demo2.png and /dev/null differ diff --git a/assets/demo2.svg b/assets/demo2.svg deleted file mode 100644 index bd7f5f1c2400efbc6eb0122c46c109c1514bdfab..0000000000000000000000000000000000000000 --- a/assets/demo2.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/assets/qingming.gif b/assets/qingming.gif deleted file mode 100644 index 778f326042752e5dde85dfe5627568fffff8f3aa..0000000000000000000000000000000000000000 --- 
a/assets/qingming.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc052aad5ab86a9a0ac1483853f2370686add2a4b0a5088be86598bec01b533e -size 4640388 diff --git a/assets/times_with_simsun.ttf b/assets/times_with_simsun.ttf deleted file mode 100644 index 0213c4b5dd14af52f642645aa01e4503569f11b4..0000000000000000000000000000000000000000 --- a/assets/times_with_simsun.ttf +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b15a12dd4bba4a48885c279a1d16590b652773f02137a7e62ede3411970c59f -size 11066612 diff --git a/assets/title.png b/assets/title.png deleted file mode 100644 index 1d51c5c88e82323424175e43d86ec147492dee72..0000000000000000000000000000000000000000 Binary files a/assets/title.png and /dev/null differ diff --git a/assets/title.svg b/assets/title.svg deleted file mode 100644 index 87fcc5fe890c431b9ea6488172b5539b6a959695..0000000000000000000000000000000000000000 --- a/assets/title.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/chatbox.py b/backend/chatbox.py similarity index 100% rename from chatbox.py rename to backend/chatbox.py diff --git a/backend/gpt_service/__init__.py b/backend/gpt_service/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..91c96e328dfedb1660fcbd4bfd8598ff544b0987 --- /dev/null +++ b/backend/gpt_service/__init__.py @@ -0,0 +1,4 @@ +from .utils import get_gpt_response +from .info_queries import get_artistinfo, get_yearinfo + +__all__ = ['get_gpt_response', 'get_artistinfo', 'get_yearinfo'] \ No newline at end of file diff --git a/backend/gpt_service/info_queries.py b/backend/gpt_service/info_queries.py new file mode 100644 index 0000000000000000000000000000000000000000..a5b19f0c2a581741bc4db4a8a770c26a2787b896 --- /dev/null +++ b/backend/gpt_service/info_queries.py @@ -0,0 +1,39 @@ +import re +import emoji +from .utils import get_gpt_response + +async def get_artistinfo(artist_name, api_key, state, language, autoplay, length, log_state, texttospeech_fn): + prompt = ( + f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, " + "covering their biography, major works, artistic style, significant contributions to the art world, " + "and any major awards or recognitions they have received. Start your response with 'Artist Background: '." + ) + + res = get_gpt_response(api_key, None, prompt) + state = state + [(None, res)] + read_info = re.sub(r'[#[\]!*]', '', res) + read_info = emoji.replace_emoji(read_info, replace="") + log_state = log_state + [(res, None)] + + if autoplay: + audio_output = await texttospeech_fn(read_info, language) + return state, state, audio_output, log_state + return state, state, None, log_state + +async def get_yearinfo(year, api_key, state, language, autoplay, length, log_state, texttospeech_fn): + prompt = ( + f"Provide a concise summary of about {length} words in {language} on the art historical period " + f"associated with the year {year}, covering its major characteristics, influential artists, " + "notable works, and its significance in the broader context of art history. Start your response with 'History Background: '."
+ ) + + res = get_gpt_response(api_key, None, prompt) + log_state = log_state + [(res, None)] + state = state + [(None, res)] + read_info = re.sub(r'[#[\]!*]', '', res) + read_info = emoji.replace_emoji(read_info, replace="") + + if autoplay: + audio_output = await texttospeech_fn(read_info, language) + return state, state, audio_output, log_state + return state, state, None, log_state \ No newline at end of file diff --git a/backend/gpt_service/utils.py b/backend/gpt_service/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..84477f6645a066bd888f7e43d1806f71dfc3f7b5 --- /dev/null +++ b/backend/gpt_service/utils.py @@ -0,0 +1,75 @@ +import json +import requests +import base64 + + +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + +def get_gpt_response(api_key, image_path, prompt, history=None): + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + # Keep only the most recent turns so the request stays within the context budget + if history: + if len(history) > 4: + history = history[-4:] + else: + history = [] + + messages = history[:] + base64_images = [] + + if image_path: + if isinstance(image_path, list): + for img in image_path: + base64_image = encode_image(img) + base64_images.append(base64_image) + else: + base64_image = encode_image(image_path) + base64_images.append(base64_image) + + # Attach the text prompt plus one image_url entry per encoded image + user_content = [{"type": "text", "text": prompt}] + for base64_image in base64_images: + user_content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }) + messages.append({ + "role": "user", + "content": user_content + }) + else: + messages.append({ + "role": "user", + "content": prompt + }) + + payload = { + "model": "gpt-4o", + "messages": messages, + "max_tokens": 600 + } + + + # Sending the request to the OpenAI API + response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) + result = response.json() + print("gpt result",result) + try: + content = result['choices'][0]['message']['content'] + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + return content + except (KeyError, IndexError, json.JSONDecodeError) as e: + return json.dumps({"error": "Failed to parse model output", "details": str(e)}) diff --git a/backend/prompts/__init__.py b/backend/prompts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d6d7f6e187c10b742014ac570b9aa9665fdb198 --- /dev/null +++ b/backend/prompts/__init__.py @@ -0,0 +1,3 @@ +from .generate_prompt import generate_prompt + +__all__ = ['generate_prompt'] \ No newline at end of file diff --git a/backend/prompts/generate_prompt.py b/backend/prompts/generate_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..9303b2dba20edd3a910ffc4aac5005c8eeec2b9d --- /dev/null +++ b/backend/prompts/generate_prompt.py @@ -0,0 +1,23 @@ +from .prompt_templates import PromptTemplates + +def generate_prompt(focus_type, paragraph, length, sentiment, factuality, language, narrative): + mapped_value = PromptTemplates.FOCUS_MAP.get(focus_type, -1) + narrative_value = PromptTemplates.NARRATIVE_MAPPING[narrative] + + controls = { + 'length': length, + 'sentiment': sentiment, + 'factuality': factuality, + 'language': language + } + + if mapped_value != -1: + prompt = PromptTemplates.ANALYSIS_PROMPTS[narrative_value][mapped_value].format( + Wiki_caption=paragraph, + length=controls['length'], + sentiment=controls['sentiment'], + language=controls['language'] + ) +
else: + prompt = "Invalid focus type." + return prompt \ No newline at end of file diff --git a/backend/prompts/prompt_templates.py b/backend/prompts/prompt_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..03da2b27bb1633e7481e0e55af1155c602b18fe9 --- /dev/null +++ b/backend/prompts/prompt_templates.py @@ -0,0 +1,91 @@ +class PromptTemplates: + FOCUS_MAP = { + "Describe": 0, + "D+Analysis": 1, + "DA+Interprete": 2, + "Judge": 3 + } + + NARRATIVE_MAPPING = { + "Narrator": 0, + "Artist": 1, + "In-Situ": 2 + } + + ANALYSIS_PROMPTS = [ + [ + 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', + 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', + 'Wiki_caption: {Wiki_caption}, you have to help me understand what is about the selected object and list one fact and one analysis and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.', + 'Wiki_caption: {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Each point listed is to be in {language} language, with a response length of about {length} words.' + ], + [ + "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", + "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. 
Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", + "When generating the answer, you should tell others that you are the creator of this painting and generate the text in the tone and manner as if you are the creator of this painting. You have to help me understand what is about the selected object and list one fact, one analysis, and one interpret from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", + "When generating the answer, you should tell others that you are one of the creators of these paintings and generate the text in the tone and manner as if you are the creator of the painting. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.", + ], + [ + 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact (describes the selected object but does not include analysis) as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', + 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. 
Each point listed is to be in {language} language, with a response length of about {length} words.', + 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. You have to help me understand what is about the selected object and list one fact and one analysis from art appreciation perspective and one interpret as markdown outline with appropriate emojis that describes what you see according to the image and {Wiki_caption}. Please generate the above points in the tone and manner as if you are the object and start every sentence with I. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', + 'When generating answers, you should tell people that you are the object itself that was selected, and generate text in the tone and manner in which you are the object or the person. According to image and wiki_caption {Wiki_caption}, You have to help me understand what is about the selected object and list one object judgement and one whole art judgement(how successful do you think the artist was?) as markdown outline with appropriate emojis that describes what you see according to the image and wiki caption. Please generate the above points in the tone and manner as if you are the object of this painting and start every sentence with I. Each point listed is to be in {language} language, with a response length of about {length} words.', + ] + ] + + RECOMMENDATION_PROMPTS = [ + + [ + ''' + First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + Recommendation reason: {{Recommendation based on {{object}} in the painting you saw earlier. Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points. }} + Each bullet point should be in {language} language, with a response length of about {length} words. + ''', + ''' + When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier. + + First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + + Recommendation reason: {{I'm the creator of that painting you saw earlier. I'm an artist. and I'm recommending this painting based on the fact that the {{object}} I've drawn also appear in the painting you're looking at. }} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I. 
+ + Each bullet point should be in {language} language, with a response length of about {length} words. + + ''', + ''' + When generating answers, you should tell people that you are the object itself that was selected in the painting, and generate text in the tone and manner in which you are the object + + First identify what the object of the first painting is, you save yourself as the parameter: {{object}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + + Recommendation reason: {{I'm the {{object}} in the painting you were looking at earlier, and I'm recommending this painting based on the fact that I'm also present in the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the object of this painting and start every sentence with I. + + Each bullet point should be in {language} language, with a response length of about {length} words. + + '''], + + [ + ''' + First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + Recommendation reason: {{Recommendation based on the painting {{name}}.Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate in three points.}} + Each bullet point should be in {language} language, with a response length of about {length} words. + ''', + ''' + When generating answers, you should tell people that I am the creator of painting you were looking at earlier itself, and generate text in the tone and manner in which you are the creator of painting were looking at earlier. + + First identify what the creator of the first painting is, you save yourself as the parameter: {artist}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + + Recommendation reason: {{I'm the creator of that painting you saw earlier, {artist}. I'm an artist. and I'm recommending this painting based on the fact that the painting you're looking at is similar to the one you just saw of me.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the creator of painting were looking at earlier and start every sentence with I. + + Each bullet point should be in {language} language, with a response length of about {length} words. + + ''', + ''' + When generating answers, you should tell people that I am the painting you were looking at earlier itself, and generate text in the tone and manner in which you are the painting were looking at earlier. 
+ +            First identify what the name of the first painting is, you save yourself as the parameter: {{name}}, do not need to tell me, the following will use the parameter. I want you to write the recommendation reason according to the following content, as a markdown outline with appropriate emojis that describe what you see according to the painting: + +            Recommendation reason: {{I'm the painting {{name}} you were looking at earlier, and I'm recommending this painting based on the fact that I'm similar to the one you're looking at.}} Detailed analysis: Based on the recommendation reason and the relationship between the two paintings, explain why you recommend another painting. Please generate the three points in the tone and manner as if you are the painting were looking at earlier and start every sentence with I. + +            Each bullet point should be in {language} language, with a response length of about {length} words. + +            '''], +] diff --git a/backend/recommendation/__init__.py b/backend/recommendation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5500f122db81827464e5ae7f4e70c3c2cd8e81ff --- /dev/null +++ b/backend/recommendation/__init__.py @@ -0,0 +1,4 @@ +from .config import RecommendationConfig +from .recommender import ImageRecommender + +__all__ = ['RecommendationConfig', 'ImageRecommender'] \ No newline at end of file diff --git a/backend/recommendation/config.py b/backend/recommendation/config.py new file mode 100644 index 0000000000000000000000000000000000000000..fd3639fa79d983f086add616115d8207c3f0579b --- /dev/null +++ b/backend/recommendation/config.py @@ -0,0 +1,23 @@ +import torch +from transformers import AutoProcessor, SiglipModel +from huggingface_hub import hf_hub_download +import faiss +import pandas as pd + +class RecommendationConfig: + def __init__(self): + # Download the prebuilt SigLIP/FAISS index and the matching WikiArt metadata + hf_hub_download("merve/siglip-faiss-wikiart", "siglip_10k_latest.index", local_dir="./") + hf_hub_download("merve/siglip-faiss-wikiart", "wikiart_10k_latest.csv", local_dir="./") + + self.index = faiss.read_index("./siglip_10k_latest.index") + self.df = pd.read_csv("./wikiart_10k_latest.csv") + + self.device = torch.device('cuda' if torch.cuda.is_available() else "cpu") + self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") + self.model = SiglipModel.from_pretrained("google/siglip-base-patch16-224").to(self.device) + + def get_messages(self, language): + return { + "English": "🖼️ Please refer to the section below to see the recommended results.", + "Chinese": "🖼️ 请到下方查看推荐结果。" + }[language] \ No newline at end of file diff --git a/backend/recommendation/recommender.py b/backend/recommendation/recommender.py new file mode 100644 index 0000000000000000000000000000000000000000..87918b499cfa96ff0e6f004e5fd3bb04edf428ab --- /dev/null +++ b/backend/recommendation/recommender.py @@ -0,0 +1,107 @@ +import torch +import numpy as np +import faiss  # needed for faiss.normalize_L2 in process_image +from PIL import Image +from io import BytesIO +import requests +import spaces +import gradio as gr +import re +import emoji +from ..prompts.prompt_templates import PromptTemplates + +class ImageRecommender: + def __init__(self, config): + self.config = config + + def read_image_from_url(self, url): + response = requests.get(url) + img = Image.open(BytesIO(response.content)).convert("RGB") + return img + + def extract_features_siglip(self, image): + with torch.no_grad(): + inputs = self.config.processor(images=image, return_tensors="pt").to(self.config.device) + image_features = self.config.model.get_image_features(**inputs) + return
image_features + + def process_image(self, image_path, num_results=2): + input_image = Image.open(image_path).convert("RGB") + input_features = self.extract_features_siglip(input_image) + input_features = input_features.detach().cpu().numpy() + input_features = np.float32(input_features) + faiss.normalize_L2(input_features) + + distances, indices = self.config.index.search(input_features, num_results) + gallery_output = [] + + for i, v in enumerate(indices[0]): + sim = -distances[0][i] + image_url = self.config.df.iloc[v]["Link"] + img_retrieved = self.read_image_from_url(image_url) + gallery_output.append(img_retrieved) + + return gallery_output + + @spaces.GPU + def infer(self, crop_image_path, full_image_path, state, language, task_type=None): + style_gallery_output = [] + item_gallery_output = [] + + if crop_image_path: + item_gallery_output = self.process_image(crop_image_path, 2) + style_gallery_output = self.process_image(full_image_path, 2) + else: + style_gallery_output = self.process_image(full_image_path, 4) + + msg = self.config.get_messages(language) + state += [(None, msg)] + + return item_gallery_output, style_gallery_output, state, state + + async def item_associate(self, new_crop, openai_api_key, language, autoplay, length, + log_state, sort_score, narrative, state, evt: gr.SelectData): + rec_path = evt._data['value']['image']['path'] + return ( + state, + state, + None, + log_state, + None, + gr.update(value=[]), + rec_path, + rec_path, + "Item" + ) + + async def style_associate(self, image_path, openai_api_key, language, autoplay, + length, log_state, sort_score, narrative, state, artist, + evt: gr.SelectData): + rec_path = evt._data['value']['image']['path'] + return ( + state, + state, + None, + log_state, + None, + gr.update(value=[]), + rec_path, + rec_path, + "Style" + ) + + def generate_recommendation_prompt(self, recommend_type, narrative, language, length, artist=None): + + narrative_value = PromptTemplates.NARRATIVE_MAPPING[narrative] + prompt_type = 0 if recommend_type == "Item" else 1 + + if narrative_value == 1 and recommend_type == "Style": + return PromptTemplates.RECOMMENDATION_PROMPTS[prompt_type][narrative_value].format( + language=language, + length=length, + artist=artist[8:] if artist else "" + ) + else: + return PromptTemplates.RECOMMENDATION_PROMPTS[prompt_type][narrative_value].format( + language=language, + length=length + ) diff --git a/backend/texttospeech/tts.py b/backend/texttospeech/tts.py new file mode 100644 index 0000000000000000000000000000000000000000..62cecd358d98fcc3a1c450b6609852e425837b05 --- /dev/null +++ b/backend/texttospeech/tts.py @@ -0,0 +1,33 @@ +import edge_tts +import base64 +from io import BytesIO + +filtered_language_dict = { + 'English': {'female': 'en-US-JennyNeural', 'male': 'en-US-GuyNeural'}, + 'Chinese': {'female': 'zh-CN-XiaoxiaoNeural', 'male': 'zh-CN-YunxiNeural'}, + 'French': {'female': 'fr-FR-DeniseNeural', 'male': 'fr-FR-HenriNeural'}, + 'Spanish': {'female': 'es-MX-DaliaNeural', 'male': 'es-MX-JorgeNeural'}, + 'Arabic': {'female': 'ar-SA-ZariyahNeural', 'male': 'ar-SA-HamedNeural'}, + 'Portuguese': {'female': 'pt-BR-FranciscaNeural', 'male': 'pt-BR-AntonioNeural'}, + 'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'} +} + +async def texttospeech(text, language, gender='female'): + try: + voice = filtered_language_dict[language][gender] + communicate = edge_tts.Communicate(text=text, voice=voice, rate="+25%") + file_path = "output.wav" + await communicate.save(file_path) + + with 
open(file_path, "rb") as audio_file: + audio_bytes = BytesIO(audio_file.read()) + audio = base64.b64encode(audio_bytes.read()).decode("utf-8") + print("TTS processing completed.") + + audio_style = 'style="width:210px;"' + audio_player = f'' + return audio_player + + except Exception as e: + print(f"Error in texttospeech: {e}") + return None \ No newline at end of file diff --git a/configs/instant-mesh-base.yaml b/configs/instant-mesh-base.yaml deleted file mode 100644 index ad4f4c0cd0d3c6f4d3038b657a41dab82c048dd1..0000000000000000000000000000000000000000 --- a/configs/instant-mesh-base.yaml +++ /dev/null @@ -1,22 +0,0 @@ -model_config: - target: src.models.lrm_mesh.InstantMesh - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 12 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 40 - rendering_samples_per_ray: 96 - grid_res: 128 - grid_scale: 2.1 - - -infer_config: - unet_path: ckpts/diffusion_pytorch_model.bin - model_path: ckpts/instant_mesh_base.ckpt - texture_resolution: 1024 - render_resolution: 512 \ No newline at end of file diff --git a/configs/instant-mesh-large-train.yaml b/configs/instant-mesh-large-train.yaml deleted file mode 100644 index 2d2822df7d068f2d7d210ad24d41a4817b832fd2..0000000000000000000000000000000000000000 --- a/configs/instant-mesh-large-train.yaml +++ /dev/null @@ -1,67 +0,0 @@ -model: - base_learning_rate: 4.0e-05 - target: src.model_mesh.MVRecon - params: - init_ckpt: logs/instant-nerf-large-train/checkpoints/last.ckpt - input_size: 320 - render_size: 512 - - lrm_generator_config: - target: src.models.lrm_mesh.InstantMesh - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 16 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 80 - rendering_samples_per_ray: 128 - grid_res: 128 - grid_scale: 2.1 - - -data: - target: src.data.objaverse.DataModuleFromConfig - params: - batch_size: 2 - num_workers: 8 - train: - target: src.data.objaverse.ObjaverseData - params: - root_dir: data/objaverse - meta_fname: filtered_obj_name.json - input_image_dir: rendering_random_32views - target_image_dir: rendering_random_32views - input_view_num: 6 - target_view_num: 4 - total_view_n: 32 - fov: 50 - camera_rotation: true - validation: false - validation: - target: src.data.objaverse.ValidationData - params: - root_dir: data/valid_samples - input_view_num: 6 - input_image_size: 320 - fov: 30 - - -lightning: - modelcheckpoint: - params: - every_n_train_steps: 2000 - save_top_k: -1 - save_last: true - callbacks: {} - - trainer: - benchmark: true - max_epochs: -1 - val_check_interval: 1000 - num_sanity_val_steps: 0 - accumulate_grad_batches: 1 - check_val_every_n_epoch: null # if not set this, validation does not run diff --git a/configs/instant-mesh-large.yaml b/configs/instant-mesh-large.yaml deleted file mode 100644 index e296bc89f6d0d0649136ba2ce0e34490f76a5e41..0000000000000000000000000000000000000000 --- a/configs/instant-mesh-large.yaml +++ /dev/null @@ -1,22 +0,0 @@ -model_config: - target: src.models.lrm_mesh.InstantMesh - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 16 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 80 - rendering_samples_per_ray: 128 - grid_res: 128 - 
grid_scale: 2.1 - - -infer_config: - unet_path: ckpts/diffusion_pytorch_model.bin - model_path: ckpts/instant_mesh_large.ckpt - texture_resolution: 1024 - render_resolution: 512 \ No newline at end of file diff --git a/configs/instant-nerf-base.yaml b/configs/instant-nerf-base.yaml deleted file mode 100644 index ded3d484751127d430891fc28eb2de664aecd5e1..0000000000000000000000000000000000000000 --- a/configs/instant-nerf-base.yaml +++ /dev/null @@ -1,21 +0,0 @@ -model_config: - target: src.models.lrm.InstantNeRF - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 12 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 40 - rendering_samples_per_ray: 96 - - -infer_config: - unet_path: ckpts/diffusion_pytorch_model.bin - model_path: ckpts/instant_nerf_base.ckpt - mesh_threshold: 10.0 - mesh_resolution: 256 - render_resolution: 384 \ No newline at end of file diff --git a/configs/instant-nerf-large-train.yaml b/configs/instant-nerf-large-train.yaml deleted file mode 100644 index 0e0cc76a61efc19ff9efdc30d10081679468b5c4..0000000000000000000000000000000000000000 --- a/configs/instant-nerf-large-train.yaml +++ /dev/null @@ -1,65 +0,0 @@ -model: - base_learning_rate: 4.0e-04 - target: src.model.MVRecon - params: - input_size: 320 - render_size: 192 - - lrm_generator_config: - target: src.models.lrm.InstantNeRF - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 16 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 80 - rendering_samples_per_ray: 128 - - -data: - target: src.data.objaverse.DataModuleFromConfig - params: - batch_size: 2 - num_workers: 8 - train: - target: src.data.objaverse.ObjaverseData - params: - root_dir: data/objaverse - meta_fname: filtered_obj_name.json - input_image_dir: rendering_random_32views - target_image_dir: rendering_random_32views - input_view_num: 6 - target_view_num: 4 - total_view_n: 32 - fov: 50 - camera_rotation: true - validation: false - validation: - target: src.data.objaverse.ValidationData - params: - root_dir: data/valid_samples - input_view_num: 6 - input_image_size: 320 - fov: 30 - - -lightning: - modelcheckpoint: - params: - every_n_train_steps: 1000 - save_top_k: -1 - save_last: true - callbacks: {} - - trainer: - benchmark: true - max_epochs: -1 - gradient_clip_val: 1.0 - val_check_interval: 1000 - num_sanity_val_steps: 0 - accumulate_grad_batches: 1 - check_val_every_n_epoch: null # if not set this, validation does not run diff --git a/configs/instant-nerf-large.yaml b/configs/instant-nerf-large.yaml deleted file mode 100644 index 57494b69d74ee78dca2e2cead2ef68ddfd0fd531..0000000000000000000000000000000000000000 --- a/configs/instant-nerf-large.yaml +++ /dev/null @@ -1,21 +0,0 @@ -model_config: - target: src.models.lrm.InstantNeRF - params: - encoder_feat_dim: 768 - encoder_freeze: false - encoder_model_name: facebook/dino-vitb16 - transformer_dim: 1024 - transformer_layers: 16 - transformer_heads: 16 - triplane_low_res: 32 - triplane_high_res: 64 - triplane_dim: 80 - rendering_samples_per_ray: 128 - - -infer_config: - unet_path: ckpts/diffusion_pytorch_model.bin - model_path: ckpts/instant_nerf_large.ckpt - mesh_threshold: 10.0 - mesh_resolution: 256 - render_resolution: 384 \ No newline at end of file diff --git a/configs/zero123plus-finetune.yaml b/configs/zero123plus-finetune.yaml deleted 
file mode 100644 index 52b3394d957b4306670d45dc9a8ebfd52f57b429..0000000000000000000000000000000000000000 --- a/configs/zero123plus-finetune.yaml +++ /dev/null @@ -1,47 +0,0 @@ -model: - base_learning_rate: 1.0e-05 - target: zero123plus.model.MVDiffusion - params: - drop_cond_prob: 0.1 - - stable_diffusion_config: - pretrained_model_name_or_path: sudo-ai/zero123plus-v1.2 - custom_pipeline: ./zero123plus - -data: - target: src.data.objaverse_zero123plus.DataModuleFromConfig - params: - batch_size: 6 - num_workers: 8 - train: - target: src.data.objaverse_zero123plus.ObjaverseData - params: - root_dir: data/objaverse - meta_fname: lvis-annotations.json - image_dir: rendering_zero123plus - validation: false - validation: - target: src.data.objaverse_zero123plus.ObjaverseData - params: - root_dir: data/objaverse - meta_fname: lvis-annotations.json - image_dir: rendering_zero123plus - validation: true - - -lightning: - modelcheckpoint: - params: - every_n_train_steps: 1000 - save_top_k: -1 - save_last: true - callbacks: {} - - trainer: - benchmark: true - max_epochs: -1 - gradient_clip_val: 1.0 - val_check_interval: 1000 - num_sanity_val_steps: 0 - accumulate_grad_batches: 1 - check_val_every_n_epoch: null # if not set this, validation does not run diff --git a/examples/female.wav b/examples/female.wav deleted file mode 100644 index 2de3a40021f95d5b7d770c2eefdc05c7910db6f4..0000000000000000000000000000000000000000 --- a/examples/female.wav +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89a4fa9a16b6463f852cf9424f72c3d3c87aa83010e89db534c53fcd1ae12c02 -size 1002030 diff --git a/examples/male.wav b/examples/male.wav deleted file mode 100644 index 5048a86992432b2c601292dcdcea16333dfb7010..0000000000000000000000000000000000000000 Binary files a/examples/male.wav and /dev/null differ diff --git a/recomendation_pic/1.8.jpg b/recomendation_pic/1.8.jpg deleted file mode 100644 index 49e4394826262767b70d48920ac620654578cedc..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/1.8.jpg and /dev/null differ diff --git a/recomendation_pic/1.9.jpg b/recomendation_pic/1.9.jpg deleted file mode 100644 index 535ce283f464a7a60a46d2cb4ea02c601314c7e1..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/1.9.jpg and /dev/null differ diff --git a/recomendation_pic/2.8.jpg b/recomendation_pic/2.8.jpg deleted file mode 100644 index 8b06a746bfeff1fff8656582b37704dace41b4a3..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/2.8.jpg and /dev/null differ diff --git a/recomendation_pic/2.9.png b/recomendation_pic/2.9.png deleted file mode 100644 index dd3c5f57169039e5f355aa14a781c0dbdffc9640..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/2.9.png and /dev/null differ diff --git a/recomendation_pic/3.8.png b/recomendation_pic/3.8.png deleted file mode 100644 index 358c61c05aba3fddc1bcd495607333cb641e317c..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/3.8.png and /dev/null differ diff --git a/recomendation_pic/3.9.png b/recomendation_pic/3.9.png deleted file mode 100644 index bc9ffe10b463063ddc1b3caee2012094b44fec74..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/3.9.png and /dev/null differ diff --git a/recomendation_pic/basket-2.png b/recomendation_pic/basket-2.png deleted file mode 100644 index 2b6fc47301b3121e97a77f85b37690994075f170..0000000000000000000000000000000000000000 Binary files a/recomendation_pic/basket-2.png and /dev/null 
differ diff --git a/recomendation_pic/readme.md b/recomendation_pic/readme.md deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/test_images/1.The Ambassadors.jpg b/test_images/1.The Ambassadors.jpg deleted file mode 100644 index 6e720fb1946e29d7f6af44004e5fa05656cd4d8a..0000000000000000000000000000000000000000 Binary files a/test_images/1.The Ambassadors.jpg and /dev/null differ diff --git a/test_images/2.Football Players.jpg b/test_images/2.Football Players.jpg deleted file mode 100644 index 3d80fa9c4d9959468553e20da3eb69878429bdb3..0000000000000000000000000000000000000000 Binary files a/test_images/2.Football Players.jpg and /dev/null differ diff --git a/test_images/3-square.jpg b/test_images/3-square.jpg deleted file mode 100644 index 7523f3350546b5650c3d477b0907daca53c9d01d..0000000000000000000000000000000000000000 --- a/test_images/3-square.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2a8f2e93e275b853d47803136cf8a8dc10f62001779a8d903ceb9c3678cc481 -size 1059522 diff --git a/test_images/3.Along the River during the Qingming Festival.jpeg b/test_images/3.Along the River during the Qingming Festival.jpeg deleted file mode 100644 index 98fb54bfec8b6bef21e6d1b0de017baafe8149a3..0000000000000000000000000000000000000000 --- a/test_images/3.Along the River during the Qingming Festival.jpeg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3fc255019acfe629f0838ec225028f32f38b71ebd01a2abcaa8e261eae48a521 -size 1174114 diff --git a/test_images/MUS.png b/test_images/MUS.png deleted file mode 100644 index 20416446d443a8e6460ef49ff37029118a15b8d5..0000000000000000000000000000000000000000 Binary files a/test_images/MUS.png and /dev/null differ diff --git a/test_images/Picture0.png b/test_images/Picture0.png deleted file mode 100644 index 8e2cf70509bbd7a5a2e4046ec24d082fbdfbcd71..0000000000000000000000000000000000000000 Binary files a/test_images/Picture0.png and /dev/null differ diff --git a/test_images/Picture1.png b/test_images/Picture1.png deleted file mode 100644 index 393e8a2c986d6caac4e9d71f6dc9c9ac5bd1e13f..0000000000000000000000000000000000000000 Binary files a/test_images/Picture1.png and /dev/null differ diff --git a/test_images/Picture10.png b/test_images/Picture10.png deleted file mode 100644 index 075c3bb6c9c9c1b7f342e147d44482f4a38c22be..0000000000000000000000000000000000000000 Binary files a/test_images/Picture10.png and /dev/null differ diff --git a/test_images/Picture2.png b/test_images/Picture2.png deleted file mode 100644 index c803f391f152515459d682db0008398f2a198e50..0000000000000000000000000000000000000000 Binary files a/test_images/Picture2.png and /dev/null differ diff --git a/test_images/Picture3.png b/test_images/Picture3.png deleted file mode 100644 index ac0a070011bfa83c30683ca00ffa378e17ed989c..0000000000000000000000000000000000000000 Binary files a/test_images/Picture3.png and /dev/null differ diff --git a/test_images/Picture4.png b/test_images/Picture4.png deleted file mode 100644 index 8a00740915f4e48d9450b676eaffdd6d2041cdd7..0000000000000000000000000000000000000000 Binary files a/test_images/Picture4.png and /dev/null differ diff --git a/test_images/Picture5.png b/test_images/Picture5.png deleted file mode 100644 index 274aa31beec99bf05d673682d94e46f397a07d19..0000000000000000000000000000000000000000 Binary files a/test_images/Picture5.png and /dev/null differ diff --git a/test_images/Picture6.png 
b/test_images/Picture6.png deleted file mode 100644 index 14eb0993fe81d3990a672d81178eee1e495d444a..0000000000000000000000000000000000000000 Binary files a/test_images/Picture6.png and /dev/null differ diff --git a/test_images/Picture7.png b/test_images/Picture7.png deleted file mode 100644 index fce768a1d19dae772ac9955991fac7440f6e5566..0000000000000000000000000000000000000000 Binary files a/test_images/Picture7.png and /dev/null differ diff --git a/test_images/Picture8.png b/test_images/Picture8.png deleted file mode 100644 index 1dee24c28aa37ef63787e6b5794dadb6d02e668b..0000000000000000000000000000000000000000 Binary files a/test_images/Picture8.png and /dev/null differ diff --git a/test_images/Picture9.png b/test_images/Picture9.png deleted file mode 100644 index 4957487c997adebf627ac22ec7b755a336668f36..0000000000000000000000000000000000000000 Binary files a/test_images/Picture9.png and /dev/null differ diff --git a/test_images/ambass.jpg b/test_images/ambass.jpg deleted file mode 100644 index 6e720fb1946e29d7f6af44004e5fa05656cd4d8a..0000000000000000000000000000000000000000 Binary files a/test_images/ambass.jpg and /dev/null differ diff --git a/test_images/img0.png b/test_images/img0.png deleted file mode 100644 index 19c2a9cd512c9a798821511f3dc7e15bf43680ba..0000000000000000000000000000000000000000 Binary files a/test_images/img0.png and /dev/null differ diff --git a/test_images/img1.jpg b/test_images/img1.jpg deleted file mode 100644 index 83c0c9eb9f5026fdb7a7f49fba081d4764ce0515..0000000000000000000000000000000000000000 Binary files a/test_images/img1.jpg and /dev/null differ diff --git a/test_images/img12.jpg b/test_images/img12.jpg deleted file mode 100644 index 20a3789bad40238cc90cca7b8e0049aaad1e1dbd..0000000000000000000000000000000000000000 Binary files a/test_images/img12.jpg and /dev/null differ diff --git a/test_images/img14.jpg b/test_images/img14.jpg deleted file mode 100644 index f60ad955110a5238e80ef93af7bbce03a4322e48..0000000000000000000000000000000000000000 Binary files a/test_images/img14.jpg and /dev/null differ diff --git a/test_images/img2.jpg b/test_images/img2.jpg deleted file mode 100644 index 583f69ec771a6f562e8dd9511b61fb9034a1af64..0000000000000000000000000000000000000000 Binary files a/test_images/img2.jpg and /dev/null differ diff --git a/test_images/img35.webp b/test_images/img35.webp deleted file mode 100644 index 1b934790352d86a013cee6dfc4119701ee676b1d..0000000000000000000000000000000000000000 Binary files a/test_images/img35.webp and /dev/null differ diff --git a/test_images/img36.webp b/test_images/img36.webp deleted file mode 100644 index 59ca50ba437f5eb34c0d2e4657fa4300485a4b05..0000000000000000000000000000000000000000 Binary files a/test_images/img36.webp and /dev/null differ diff --git a/test_images/img5.jpg b/test_images/img5.jpg deleted file mode 100644 index 80e2e7e4b9505a1528b8d319d6b2efcbde16a9cf..0000000000000000000000000000000000000000 Binary files a/test_images/img5.jpg and /dev/null differ diff --git a/test_images/pearl.jpg b/test_images/pearl.jpg deleted file mode 100644 index aaf6ab2480e5c198f2477b44f97fc92677e53805..0000000000000000000000000000000000000000 Binary files a/test_images/pearl.jpg and /dev/null differ diff --git a/test_images/qingming3.jpeg b/test_images/qingming3.jpeg deleted file mode 100644 index 98fb54bfec8b6bef21e6d1b0de017baafe8149a3..0000000000000000000000000000000000000000 --- a/test_images/qingming3.jpeg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:3fc255019acfe629f0838ec225028f32f38b71ebd01a2abcaa8e261eae48a521 -size 1174114 diff --git a/test_images/task1.jpg b/test_images/task1.jpg deleted file mode 100644 index 03d846e27891920676c6c68ea23da257f89fe9b7..0000000000000000000000000000000000000000 Binary files a/test_images/task1.jpg and /dev/null differ diff --git a/test_images/task2.jpg b/test_images/task2.jpg deleted file mode 100644 index 191d8e87cc590f3d427edf1f1377dfc6c3028212..0000000000000000000000000000000000000000 Binary files a/test_images/task2.jpg and /dev/null differ diff --git a/test_images/task3.jpg b/test_images/task3.jpg deleted file mode 100644 index 8b774d507414f1d6ab63fff797981d520ac33012..0000000000000000000000000000000000000000 Binary files a/test_images/task3.jpg and /dev/null differ diff --git a/test_images/task4.jpg b/test_images/task4.jpg deleted file mode 100644 index d7133aa4635a0d2639303cf9709ca29b47cf55b2..0000000000000000000000000000000000000000 Binary files a/test_images/task4.jpg and /dev/null differ diff --git a/test_images/task5.jpg b/test_images/task5.jpg deleted file mode 100644 index c2d01c484524d0e499f26703c866bdb2cac27665..0000000000000000000000000000000000000000 Binary files a/test_images/task5.jpg and /dev/null differ diff --git a/test_images/test1.jpg b/test_images/test1.jpg deleted file mode 100644 index 96edacceefabcf19d35a54a389980d8f7c974e30..0000000000000000000000000000000000000000 --- a/test_images/test1.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75b3da6c93861df6abb2f2a61739b5bf01e32194f7fe531fbaf9b53b51235e2d -size 2108120 diff --git a/test_images/test2.jpg b/test_images/test2.jpg deleted file mode 100644 index 87666bb83945334adc6b62bef5ba332f38c11b2a..0000000000000000000000000000000000000000 Binary files a/test_images/test2.jpg and /dev/null differ diff --git a/test_images/test3.jpg b/test_images/test3.jpg deleted file mode 100644 index 549cc7681e5d81fba7fef0dbd1fd3fd3a060be96..0000000000000000000000000000000000000000 --- a/test_images/test3.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab1dc56d3015fe34faa414be93dc6458a7d36c174baf37f46a1888932a60f84b -size 6357668 diff --git a/test_images/test4.jpg b/test_images/test4.jpg deleted file mode 100644 index aa7d780dcb5d6ab14557b83762f2ac095d18ae12..0000000000000000000000000000000000000000 Binary files a/test_images/test4.jpg and /dev/null differ diff --git a/test_images/test5.jpg b/test_images/test5.jpg deleted file mode 100644 index 0c1e6930b033eeccc13f6edf445da69eaa3128a9..0000000000000000000000000000000000000000 Binary files a/test_images/test5.jpg and /dev/null differ diff --git "a/test_images/\345\233\276\347\211\2072.png" "b/test_images/\345\233\276\347\211\2072.png" deleted file mode 100644 index 465afb2307faf3789a082e39b4dc0fef8d812bf8..0000000000000000000000000000000000000000 --- "a/test_images/\345\233\276\347\211\2072.png" +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9dbffe8755cd74f789cd8cf64bc44bd279d8550f798a2f99fccdfe9bdc94493b -size 1314431 diff --git a/tts.py b/tts.py deleted file mode 100644 index 1a661883fddcbb6ce99329ad6aad5223a15ea444..0000000000000000000000000000000000000000 --- a/tts.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -import sys -from fastapi import Request -import gradio as gr -from TTS.api import TTS -from TTS.utils.manage import ModelManager -from io import BytesIO -import base64 - -model_names = TTS().list_models() -print(model_names.__dict__) 
-print(model_names.__dir__()) - -os.environ["COQUI_TOS_AGREED"] = "1" - -model_name = "tts_models/multilingual/multi-dataset/xtts_v2" -tts = TTS(model_name, gpu=False) -tts.to("cuda") - -def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree): - if agree: - speaker_wav = mic_file_path if use_mic and mic_file_path else audio_file_pth - - if not speaker_wav: - return None, "Please provide a reference audio." - - if len(prompt) < 2: - return None, "Please provide a longer text prompt." - - if len(prompt) > 10000: - return None, "Text length is limited to 10000 characters. Please try a shorter text." - - try: - if language == "fr" and "your" in model_name: - language = "fr-fr" - if "/fr/" in model_name: - language = None - - tts.tts_to_file( - text=prompt, - file_path="output.wav", - speaker_wav=speaker_wav, - language=language - ) - except RuntimeError as e: - if "device-assert" in str(e): - return None, "Runtime error encountered. Please try again later." - else: - raise e - - with open("output.wav", "rb") as audio_file: - audio_bytes = BytesIO(audio_file.read()) - audio = base64.b64encode(audio_bytes.read()).decode("utf-8") - audio_player = f'' - return gr.make_waveform(audio="output.wav"),audio_player - else: - return None, "Please accept the Terms & Conditions." diff --git a/zero123plus/model.py b/zero123plus/model.py deleted file mode 100644 index 1655c45f2df23640d9a9270b6240b3453557599e..0000000000000000000000000000000000000000 --- a/zero123plus/model.py +++ /dev/null @@ -1,272 +0,0 @@ -import os -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import pytorch_lightning as pl -from tqdm import tqdm -from torchvision.transforms import v2 -from torchvision.utils import make_grid, save_image -from einops import rearrange - -from src.utils.train_util import instantiate_from_config -from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, DDPMScheduler, UNet2DConditionModel -from .pipeline import RefOnlyNoisedUNet - - -def scale_latents(latents): - latents = (latents - 0.22) * 0.75 - return latents - - -def unscale_latents(latents): - latents = latents / 0.75 + 0.22 - return latents - - -def scale_image(image): - image = image * 0.5 / 0.8 - return image - - -def unscale_image(image): - image = image / 0.5 * 0.8 - return image - - -def extract_into_tensor(a, t, x_shape): - b, *_ = t.shape - out = a.gather(-1, t) - return out.reshape(b, *((1,) * (len(x_shape) - 1))) - - -class MVDiffusion(pl.LightningModule): - def __init__( - self, - stable_diffusion_config, - drop_cond_prob=0.1, - ): - super(MVDiffusion, self).__init__() - - self.drop_cond_prob = drop_cond_prob - - self.register_schedule() - - # init modules - pipeline = DiffusionPipeline.from_pretrained(**stable_diffusion_config) - pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config( - pipeline.scheduler.config, timestep_spacing='trailing' - ) - self.pipeline = pipeline - - train_sched = DDPMScheduler.from_config(self.pipeline.scheduler.config) - if isinstance(self.pipeline.unet, UNet2DConditionModel): - self.pipeline.unet = RefOnlyNoisedUNet(self.pipeline.unet, train_sched, self.pipeline.scheduler) - - self.train_scheduler = train_sched # use ddpm scheduler during training - - self.unet = pipeline.unet - - # validation output buffer - self.validation_step_outputs = [] - - def register_schedule(self): - self.num_timesteps = 1000 - - # replace scaled_linear schedule with linear schedule as Zero123++ - beta_start = 0.00085 - beta_end = 0.0120 - betas = 
torch.linspace(beta_start, beta_end, 1000, dtype=torch.float32) - - alphas = 1. - betas - alphas_cumprod = torch.cumprod(alphas, dim=0) - alphas_cumprod_prev = torch.cat([torch.ones(1, dtype=torch.float64), alphas_cumprod[:-1]], 0) - - self.register_buffer('betas', betas.float()) - self.register_buffer('alphas_cumprod', alphas_cumprod.float()) - self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev.float()) - - # calculations for diffusion q(x_t | x_{t-1}) and others - self.register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod).float()) - self.register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1 - alphas_cumprod).float()) - - self.register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod).float()) - self.register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1).float()) - - def on_fit_start(self): - device = torch.device(f'cuda:{self.global_rank}') - self.pipeline.to(device) - if self.global_rank == 0: - os.makedirs(os.path.join(self.logdir, 'images'), exist_ok=True) - os.makedirs(os.path.join(self.logdir, 'images_val'), exist_ok=True) - - def prepare_batch_data(self, batch): - # prepare stable diffusion input - cond_imgs = batch['cond_imgs'] # (B, C, H, W) - cond_imgs = cond_imgs.to(self.device) - - # random resize the condition image - cond_size = np.random.randint(128, 513) - cond_imgs = v2.functional.resize(cond_imgs, cond_size, interpolation=3, antialias=True).clamp(0, 1) - - target_imgs = batch['target_imgs'] # (B, 6, C, H, W) - target_imgs = v2.functional.resize(target_imgs, 320, interpolation=3, antialias=True).clamp(0, 1) - target_imgs = rearrange(target_imgs, 'b (x y) c h w -> b c (x h) (y w)', x=3, y=2) # (B, C, 3H, 2W) - target_imgs = target_imgs.to(self.device) - - return cond_imgs, target_imgs - - @torch.no_grad() - def forward_vision_encoder(self, images): - dtype = next(self.pipeline.vision_encoder.parameters()).dtype - image_pil = [v2.functional.to_pil_image(images[i]) for i in range(images.shape[0])] - image_pt = self.pipeline.feature_extractor_clip(images=image_pil, return_tensors="pt").pixel_values - image_pt = image_pt.to(device=self.device, dtype=dtype) - global_embeds = self.pipeline.vision_encoder(image_pt, output_hidden_states=False).image_embeds - global_embeds = global_embeds.unsqueeze(-2) - - encoder_hidden_states = self.pipeline._encode_prompt("", self.device, 1, False)[0] - ramp = global_embeds.new_tensor(self.pipeline.config.ramping_coefficients).unsqueeze(-1) - encoder_hidden_states = encoder_hidden_states + global_embeds * ramp - - return encoder_hidden_states - - @torch.no_grad() - def encode_condition_image(self, images): - dtype = next(self.pipeline.vae.parameters()).dtype - image_pil = [v2.functional.to_pil_image(images[i]) for i in range(images.shape[0])] - image_pt = self.pipeline.feature_extractor_vae(images=image_pil, return_tensors="pt").pixel_values - image_pt = image_pt.to(device=self.device, dtype=dtype) - latents = self.pipeline.vae.encode(image_pt).latent_dist.sample() - return latents - - @torch.no_grad() - def encode_target_images(self, images): - dtype = next(self.pipeline.vae.parameters()).dtype - # equals to scaling images to [-1, 1] first and then call scale_image - images = (images - 0.5) / 0.8 # [-0.625, 0.625] - posterior = self.pipeline.vae.encode(images.to(dtype)).latent_dist - latents = posterior.sample() * self.pipeline.vae.config.scaling_factor - latents = scale_latents(latents) - return latents - - def forward_unet(self, latents, t, prompt_embeds, 
cond_latents): - dtype = next(self.pipeline.unet.parameters()).dtype - latents = latents.to(dtype) - prompt_embeds = prompt_embeds.to(dtype) - cond_latents = cond_latents.to(dtype) - cross_attention_kwargs = dict(cond_lat=cond_latents) - pred_noise = self.pipeline.unet( - latents, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, - )[0] - return pred_noise - - def predict_start_from_z_and_v(self, x_t, t, v): - return ( - extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t - - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v - ) - - def get_v(self, x, noise, t): - return ( - extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise - - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x - ) - - def training_step(self, batch, batch_idx): - # get input - cond_imgs, target_imgs = self.prepare_batch_data(batch) - - # sample random timestep - B = cond_imgs.shape[0] - - t = torch.randint(0, self.num_timesteps, size=(B,)).long().to(self.device) - - # classifier-free guidance - if np.random.rand() < self.drop_cond_prob: - prompt_embeds = self.pipeline._encode_prompt([""]*B, self.device, 1, False) - cond_latents = self.encode_condition_image(torch.zeros_like(cond_imgs)) - else: - prompt_embeds = self.forward_vision_encoder(cond_imgs) - cond_latents = self.encode_condition_image(cond_imgs) - - latents = self.encode_target_images(target_imgs) - noise = torch.randn_like(latents) - latents_noisy = self.train_scheduler.add_noise(latents, noise, t) - - v_pred = self.forward_unet(latents_noisy, t, prompt_embeds, cond_latents) - v_target = self.get_v(latents, noise, t) - - loss, loss_dict = self.compute_loss(v_pred, v_target) - - # logging - self.log_dict(loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) - self.log("global_step", self.global_step, prog_bar=True, logger=True, on_step=True, on_epoch=False) - lr = self.optimizers().param_groups[0]['lr'] - self.log('lr_abs', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False) - - if self.global_step % 500 == 0 and self.global_rank == 0: - with torch.no_grad(): - latents_pred = self.predict_start_from_z_and_v(latents_noisy, t, v_pred) - - latents = unscale_latents(latents_pred) - images = unscale_image(self.pipeline.vae.decode(latents / self.pipeline.vae.config.scaling_factor, return_dict=False)[0]) # [-1, 1] - images = (images * 0.5 + 0.5).clamp(0, 1) - images = torch.cat([target_imgs, images], dim=-2) - - grid = make_grid(images, nrow=images.shape[0], normalize=True, value_range=(0, 1)) - save_image(grid, os.path.join(self.logdir, 'images', f'train_{self.global_step:07d}.png')) - - return loss - - def compute_loss(self, noise_pred, noise_gt): - loss = F.mse_loss(noise_pred, noise_gt) - - prefix = 'train' - loss_dict = {} - loss_dict.update({f'{prefix}/loss': loss}) - - return loss, loss_dict - - @torch.no_grad() - def validation_step(self, batch, batch_idx): - # get input - cond_imgs, target_imgs = self.prepare_batch_data(batch) - - images_pil = [v2.functional.to_pil_image(cond_imgs[i]) for i in range(cond_imgs.shape[0])] - - outputs = [] - for cond_img in images_pil: - latent = self.pipeline(cond_img, num_inference_steps=75, output_type='latent').images - image = unscale_image(self.pipeline.vae.decode(latent / self.pipeline.vae.config.scaling_factor, return_dict=False)[0]) # [-1, 1] - image = (image * 0.5 + 0.5).clamp(0, 1) - outputs.append(image) - outputs = torch.cat(outputs, dim=0).to(self.device) - 
images = torch.cat([target_imgs, outputs], dim=-2) - - self.validation_step_outputs.append(images) - - @torch.no_grad() - def on_validation_epoch_end(self): - images = torch.cat(self.validation_step_outputs, dim=0) - - all_images = self.all_gather(images) - all_images = rearrange(all_images, 'r b c h w -> (r b) c h w') - - if self.global_rank == 0: - grid = make_grid(all_images, nrow=8, normalize=True, value_range=(0, 1)) - save_image(grid, os.path.join(self.logdir, 'images_val', f'val_{self.global_step:07d}.png')) - - self.validation_step_outputs.clear() # free memory - - def configure_optimizers(self): - lr = self.learning_rate - - optimizer = torch.optim.AdamW(self.unet.parameters(), lr=lr) - scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 3000, eta_min=lr/4) - - return {'optimizer': optimizer, 'lr_scheduler': scheduler} diff --git a/zero123plus/pipeline.py b/zero123plus/pipeline.py deleted file mode 100644 index 0088218346b36f07662d051670e51c658df59f1f..0000000000000000000000000000000000000000 --- a/zero123plus/pipeline.py +++ /dev/null @@ -1,406 +0,0 @@ -from typing import Any, Dict, Optional -from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.schedulers import KarrasDiffusionSchedulers - -import numpy -import torch -import torch.nn as nn -import torch.utils.checkpoint -import torch.distributed -import transformers -from collections import OrderedDict -from PIL import Image -from torchvision import transforms -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer - -import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - DiffusionPipeline, - EulerAncestralDiscreteScheduler, - UNet2DConditionModel, - ImagePipelineOutput -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.models.attention_processor import Attention, AttnProcessor, XFormersAttnProcessor, AttnProcessor2_0 -from diffusers.utils.import_utils import is_xformers_available - - -def to_rgb_image(maybe_rgba: Image.Image): - if maybe_rgba.mode == 'RGB': - return maybe_rgba - elif maybe_rgba.mode == 'RGBA': - rgba = maybe_rgba - img = numpy.random.randint(255, 256, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) - img = Image.fromarray(img, 'RGB') - img.paste(rgba, mask=rgba.getchannel('A')) - return img - else: - raise ValueError("Unsupported image type.", maybe_rgba.mode) - - -class ReferenceOnlyAttnProc(torch.nn.Module): - def __init__( - self, - chained_proc, - enabled=False, - name=None - ) -> None: - super().__init__() - self.enabled = enabled - self.chained_proc = chained_proc - self.name = name - - def __call__( - self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, - mode="w", ref_dict: dict = None, is_cfg_guidance = False - ) -> Any: - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - if self.enabled and is_cfg_guidance: - res0 = self.chained_proc(attn, hidden_states[:1], encoder_hidden_states[:1], attention_mask) - hidden_states = hidden_states[1:] - encoder_hidden_states = encoder_hidden_states[1:] - if self.enabled: - if mode == 'w': - ref_dict[self.name] = encoder_hidden_states - elif mode == 'r': - encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(self.name)], dim=1) - elif mode == 'm': - encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict[self.name]], dim=1) - else: - assert False, mode - res = self.chained_proc(attn, hidden_states, encoder_hidden_states, attention_mask) - if self.enabled 
and is_cfg_guidance: - res = torch.cat([res0, res]) - return res - - -class RefOnlyNoisedUNet(torch.nn.Module): - def __init__(self, unet: UNet2DConditionModel, train_sched: DDPMScheduler, val_sched: EulerAncestralDiscreteScheduler) -> None: - super().__init__() - self.unet = unet - self.train_sched = train_sched - self.val_sched = val_sched - - unet_lora_attn_procs = dict() - for name, _ in unet.attn_processors.items(): - if torch.__version__ >= '2.0': - default_attn_proc = AttnProcessor2_0() - elif is_xformers_available(): - default_attn_proc = XFormersAttnProcessor() - else: - default_attn_proc = AttnProcessor() - unet_lora_attn_procs[name] = ReferenceOnlyAttnProc( - default_attn_proc, enabled=name.endswith("attn1.processor"), name=name - ) - unet.set_attn_processor(unet_lora_attn_procs) - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.unet, name) - - def forward_cond(self, noisy_cond_lat, timestep, encoder_hidden_states, class_labels, ref_dict, is_cfg_guidance, **kwargs): - if is_cfg_guidance: - encoder_hidden_states = encoder_hidden_states[1:] - class_labels = class_labels[1:] - self.unet( - noisy_cond_lat, timestep, - encoder_hidden_states=encoder_hidden_states, - class_labels=class_labels, - cross_attention_kwargs=dict(mode="w", ref_dict=ref_dict), - **kwargs - ) - - def forward( - self, sample, timestep, encoder_hidden_states, class_labels=None, - *args, cross_attention_kwargs, - down_block_res_samples=None, mid_block_res_sample=None, - **kwargs - ): - cond_lat = cross_attention_kwargs['cond_lat'] - is_cfg_guidance = cross_attention_kwargs.get('is_cfg_guidance', False) - noise = torch.randn_like(cond_lat) - if self.training: - noisy_cond_lat = self.train_sched.add_noise(cond_lat, noise, timestep) - noisy_cond_lat = self.train_sched.scale_model_input(noisy_cond_lat, timestep) - else: - noisy_cond_lat = self.val_sched.add_noise(cond_lat, noise, timestep.reshape(-1)) - noisy_cond_lat = self.val_sched.scale_model_input(noisy_cond_lat, timestep.reshape(-1)) - ref_dict = {} - self.forward_cond( - noisy_cond_lat, timestep, - encoder_hidden_states, class_labels, - ref_dict, is_cfg_guidance, **kwargs - ) - weight_dtype = self.unet.dtype - return self.unet( - sample, timestep, - encoder_hidden_states, *args, - class_labels=class_labels, - cross_attention_kwargs=dict(mode="r", ref_dict=ref_dict, is_cfg_guidance=is_cfg_guidance), - down_block_additional_residuals=[ - sample.to(dtype=weight_dtype) for sample in down_block_res_samples - ] if down_block_res_samples is not None else None, - mid_block_additional_residual=( - mid_block_res_sample.to(dtype=weight_dtype) - if mid_block_res_sample is not None else None - ), - **kwargs - ) - - -def scale_latents(latents): - latents = (latents - 0.22) * 0.75 - return latents - - -def unscale_latents(latents): - latents = latents / 0.75 + 0.22 - return latents - - -def scale_image(image): - image = image * 0.5 / 0.8 - return image - - -def unscale_image(image): - image = image / 0.5 * 0.8 - return image - - -class DepthControlUNet(torch.nn.Module): - def __init__(self, unet: RefOnlyNoisedUNet, controlnet: Optional[diffusers.ControlNetModel] = None, conditioning_scale=1.0) -> None: - super().__init__() - self.unet = unet - if controlnet is None: - self.controlnet = diffusers.ControlNetModel.from_unet(unet.unet) - else: - self.controlnet = controlnet - DefaultAttnProc = AttnProcessor2_0 - if is_xformers_available(): - DefaultAttnProc = XFormersAttnProcessor - 
self.controlnet.set_attn_processor(DefaultAttnProc()) - self.conditioning_scale = conditioning_scale - - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.unet, name) - - def forward(self, sample, timestep, encoder_hidden_states, class_labels=None, *args, cross_attention_kwargs: dict, **kwargs): - cross_attention_kwargs = dict(cross_attention_kwargs) - control_depth = cross_attention_kwargs.pop('control_depth') - down_block_res_samples, mid_block_res_sample = self.controlnet( - sample, - timestep, - encoder_hidden_states=encoder_hidden_states, - controlnet_cond=control_depth, - conditioning_scale=self.conditioning_scale, - return_dict=False, - ) - return self.unet( - sample, - timestep, - encoder_hidden_states=encoder_hidden_states, - down_block_res_samples=down_block_res_samples, - mid_block_res_sample=mid_block_res_sample, - cross_attention_kwargs=cross_attention_kwargs - ) - - -class ModuleListDict(torch.nn.Module): - def __init__(self, procs: dict) -> None: - super().__init__() - self.keys = sorted(procs.keys()) - self.values = torch.nn.ModuleList(procs[k] for k in self.keys) - - def __getitem__(self, key): - return self.values[self.keys.index(key)] - - -class SuperNet(torch.nn.Module): - def __init__(self, state_dict: Dict[str, torch.Tensor]): - super().__init__() - state_dict = OrderedDict((k, state_dict[k]) for k in sorted(state_dict.keys())) - self.layers = torch.nn.ModuleList(state_dict.values()) - self.mapping = dict(enumerate(state_dict.keys())) - self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())} - - # .processor for unet, .self_attn for text encoder - self.split_keys = [".processor", ".self_attn"] - - # we add a hook to state_dict() and load_state_dict() so that the - # naming fits with `unet.attn_processors` - def map_to(module, state_dict, *args, **kwargs): - new_state_dict = {} - for key, value in state_dict.items(): - num = int(key.split(".")[1]) # 0 is always "layers" - new_key = key.replace(f"layers.{num}", module.mapping[num]) - new_state_dict[new_key] = value - - return new_state_dict - - def remap_key(key, state_dict): - for k in self.split_keys: - if k in key: - return key.split(k)[0] + k - return key.split('.')[0] - - def map_from(module, state_dict, *args, **kwargs): - all_keys = list(state_dict.keys()) - for key in all_keys: - replace_key = remap_key(key, state_dict) - new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}") - state_dict[new_key] = state_dict[key] - del state_dict[key] - - self._register_state_dict_hook(map_to) - self._register_load_state_dict_pre_hook(map_from, with_module=True) - - -class Zero123PlusPipeline(diffusers.StableDiffusionPipeline): - tokenizer: transformers.CLIPTokenizer - text_encoder: transformers.CLIPTextModel - vision_encoder: transformers.CLIPVisionModelWithProjection - - feature_extractor_clip: transformers.CLIPImageProcessor - unet: UNet2DConditionModel - scheduler: diffusers.schedulers.KarrasDiffusionSchedulers - - vae: AutoencoderKL - ramping: nn.Linear - - feature_extractor_vae: transformers.CLIPImageProcessor - - depth_transforms_multi = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]) - ]) - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - vision_encoder: transformers.CLIPVisionModelWithProjection, - feature_extractor_clip: CLIPImageProcessor, - 
feature_extractor_vae: CLIPImageProcessor, - ramping_coefficients: Optional[list] = None, - safety_checker=None, - ): - DiffusionPipeline.__init__(self) - - self.register_modules( - vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - unet=unet, scheduler=scheduler, safety_checker=None, - vision_encoder=vision_encoder, - feature_extractor_clip=feature_extractor_clip, - feature_extractor_vae=feature_extractor_vae - ) - self.register_to_config(ramping_coefficients=ramping_coefficients) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - def prepare(self): - train_sched = DDPMScheduler.from_config(self.scheduler.config) - if isinstance(self.unet, UNet2DConditionModel): - self.unet = RefOnlyNoisedUNet(self.unet, train_sched, self.scheduler).eval() - - def add_controlnet(self, controlnet: Optional[diffusers.ControlNetModel] = None, conditioning_scale=1.0): - self.prepare() - self.unet = DepthControlUNet(self.unet, controlnet, conditioning_scale) - return SuperNet(OrderedDict([('controlnet', self.unet.controlnet)])) - - def encode_condition_image(self, image: torch.Tensor): - image = self.vae.encode(image).latent_dist.sample() - return image - - @torch.no_grad() - def __call__( - self, - image: Image.Image = None, - prompt = "", - *args, - num_images_per_prompt: Optional[int] = 1, - guidance_scale=4.0, - depth_image: Image.Image = None, - output_type: Optional[str] = "pil", - width=640, - height=960, - num_inference_steps=28, - return_dict=True, - **kwargs - ): - self.prepare() - if image is None: - raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.") - assert not isinstance(image, torch.Tensor) - image = to_rgb_image(image) - image_1 = self.feature_extractor_vae(images=image, return_tensors="pt").pixel_values - image_2 = self.feature_extractor_clip(images=image, return_tensors="pt").pixel_values - if depth_image is not None and hasattr(self.unet, "controlnet"): - depth_image = to_rgb_image(depth_image) - depth_image = self.depth_transforms_multi(depth_image).to( - device=self.unet.controlnet.device, dtype=self.unet.controlnet.dtype - ) - image = image_1.to(device=self.vae.device, dtype=self.vae.dtype) - image_2 = image_2.to(device=self.vae.device, dtype=self.vae.dtype) - cond_lat = self.encode_condition_image(image) - if guidance_scale > 1: - negative_lat = self.encode_condition_image(torch.zeros_like(image)) - cond_lat = torch.cat([negative_lat, cond_lat]) - encoded = self.vision_encoder(image_2, output_hidden_states=False) - global_embeds = encoded.image_embeds - global_embeds = global_embeds.unsqueeze(-2) - - if hasattr(self, "encode_prompt"): - encoder_hidden_states = self.encode_prompt( - prompt, - self.device, - num_images_per_prompt, - False - )[0] - else: - encoder_hidden_states = self._encode_prompt( - prompt, - self.device, - num_images_per_prompt, - False - ) - ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1) - encoder_hidden_states = encoder_hidden_states + global_embeds * ramp - cak = dict(cond_lat=cond_lat) - if hasattr(self.unet, "controlnet"): - cak['control_depth'] = depth_image - latents: torch.Tensor = super().__call__( - None, - *args, - cross_attention_kwargs=cak, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - prompt_embeds=encoder_hidden_states, - num_inference_steps=num_inference_steps, - output_type='latent', - width=width, - height=height, - 
**kwargs - ).images - latents = unscale_latents(latents) - if not output_type == "latent": - image = unscale_image(self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]) - else: - image = latents - - image = self.image_processor.postprocess(image, output_type=output_type) - if not return_dict: - return (image,) - - return ImagePipelineOutput(images=image)
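
The voice-cloning path removed with tts.py reduces to a single Coqui XTTS v2 call. A minimal sketch of that call, assuming the `TTS` package is installed and using a hypothetical reference clip `speaker.wav` in place of the app's uploaded/recorded audio:

```python
# Sketch of the XTTS v2 voice-cloning call used by the removed tts.py.
# `speaker.wav` is a placeholder for the reference audio the app would supply.
import os
from TTS.api import TTS

os.environ["COQUI_TOS_AGREED"] = "1"  # XTTS requires accepting the model license

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
tts.tts_to_file(
    text="Hello from the gallery guide.",
    file_path="output.wav",          # synthesized speech written here
    speaker_wav="speaker.wav",       # reference voice to clone
    language="en",
)
```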
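
The removed zero123plus/model.py trains with v-prediction on a linear beta schedule: the target is v = sqrt(ᾱ_t)·ε − sqrt(1−ᾱ_t)·x₀, and the clean latent is recovered as x̂₀ = sqrt(ᾱ_t)·x_t − sqrt(1−ᾱ_t)·v. A self-contained sketch with toy tensor shapes (not the module's real latent sizes) that checks the round trip:

```python
# v-prediction bookkeeping mirroring register_schedule / get_v /
# predict_start_from_z_and_v in the deleted MVDiffusion module (toy shapes).
import torch

betas = torch.linspace(0.00085, 0.0120, 1000, dtype=torch.float32)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
sqrt_ac = alphas_cumprod.sqrt()
sqrt_1m_ac = (1.0 - alphas_cumprod).sqrt()

def gather(coefs, t, shape):
    # broadcast a per-timestep coefficient over a batch of tensors
    return coefs.gather(-1, t).reshape(t.shape[0], *((1,) * (len(shape) - 1)))

x0 = torch.randn(2, 4, 8, 8)          # "clean" latents
noise = torch.randn_like(x0)
t = torch.randint(0, 1000, (2,))

x_t = gather(sqrt_ac, t, x0.shape) * x0 + gather(sqrt_1m_ac, t, x0.shape) * noise
v = gather(sqrt_ac, t, x0.shape) * noise - gather(sqrt_1m_ac, t, x0.shape) * x0
x0_hat = gather(sqrt_ac, t, x_t.shape) * x_t - gather(sqrt_1m_ac, t, x_t.shape) * v
assert torch.allclose(x0_hat, x0, atol=1e-4)  # exact recovery up to float error
```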
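
Both deleted zero123plus modules shift and rescale VAE latents and decoded images around the UNet. The helpers below are copied from the removed code; the assertions only confirm that the scale/unscale pairs are exact inverses:

```python
# Latent/image (un)scaling helpers from the removed zero123plus code.
import torch

def scale_latents(latents):
    return (latents - 0.22) * 0.75

def unscale_latents(latents):
    return latents / 0.75 + 0.22

def scale_image(image):
    return image * 0.5 / 0.8

def unscale_image(image):
    return image / 0.5 * 0.8

x = torch.randn(1, 4, 40, 80)
assert torch.allclose(unscale_latents(scale_latents(x)), x, atol=1e-6)
assert torch.allclose(unscale_image(scale_image(x)), x, atol=1e-6)
```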
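
The reference-only conditioning in the removed zero123plus/pipeline.py hinges on a small handshake: a "write" pass over the noised condition latent stashes each layer's hidden states in a ref_dict, and the main "read" pass concatenates them back onto the key/value states so self-attention also attends to the reference image. A heavily simplified sketch of just that bookkeeping (the real ReferenceOnlyAttnProc wraps an actual attention processor, which is omitted here):

```python
# Simplified "w"/"r" mode handling from ReferenceOnlyAttnProc (no attention run).
import torch

ref_dict = {}

def ref_attn(name, hidden_states, encoder_hidden_states=None, mode="w"):
    if encoder_hidden_states is None:
        encoder_hidden_states = hidden_states
    if mode == "w":        # conditioning pass: remember this layer's states
        ref_dict[name] = encoder_hidden_states
    elif mode == "r":      # denoising pass: attend to the stored states as well
        encoder_hidden_states = torch.cat([encoder_hidden_states, ref_dict.pop(name)], dim=1)
    return hidden_states, encoder_hidden_states

h = torch.randn(1, 64, 320)
ref_attn("down.attn1", h, mode="w")          # write pass over the condition latent
_, kv = ref_attn("down.attn1", h, mode="r")  # read pass during denoising
assert kv.shape[1] == 128                    # token count doubled by the reference states
```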