hyzhang00 committed on
Commit
54ec520
·
1 Parent(s): 1ff4f32
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. examples/examples_.DS_Store → .DS_Store +0 -0
  2. app.py +0 -0
  3. assets/UI.png +0 -3
  4. assets/caption_anything_logo.png +0 -0
  5. assets/demo1.jpg +0 -3
  6. assets/demo1.png +0 -3
  7. assets/demo1.svg +0 -0
  8. assets/demo2.png +0 -0
  9. assets/demo2.svg +0 -0
  10. assets/qingming.gif +0 -3
  11. assets/times_with_simsun.ttf +0 -3
  12. assets/title.png +0 -0
  13. assets/title.svg +0 -1
  14. chatbox.py → backend/chatbox.py +0 -0
  15. backend/gpt_service/__init__.py +4 -0
  16. backend/gpt_service/info_queries.py +39 -0
  17. backend/gpt_service/utils.py +75 -0
  18. backend/prompts/__init__.py +3 -0
  19. backend/prompts/generate_prompt.py +23 -0
  20. backend/prompts/prompt_templates.py +91 -0
  21. backend/recommendation/__init__.py +4 -0
  22. backend/recommendation/config.py +23 -0
  23. backend/recommendation/recommender.py +107 -0
  24. backend/texttospeech/tts.py +33 -0
  25. configs/instant-mesh-base.yaml +0 -22
  26. configs/instant-mesh-large-train.yaml +0 -67
  27. configs/instant-mesh-large.yaml +0 -22
  28. configs/instant-nerf-base.yaml +0 -21
  29. configs/instant-nerf-large-train.yaml +0 -65
  30. configs/instant-nerf-large.yaml +0 -21
  31. configs/zero123plus-finetune.yaml +0 -47
  32. examples/female.wav +0 -3
  33. examples/male.wav +0 -0
  34. recomendation_pic/1.8.jpg +0 -0
  35. recomendation_pic/1.9.jpg +0 -0
  36. recomendation_pic/2.8.jpg +0 -0
  37. recomendation_pic/2.9.png +0 -0
  38. recomendation_pic/3.8.png +0 -0
  39. recomendation_pic/3.9.png +0 -0
  40. recomendation_pic/basket-2.png +0 -0
  41. recomendation_pic/readme.md +0 -0
  42. test_images/1.The Ambassadors.jpg +0 -0
  43. test_images/2.Football Players.jpg +0 -0
  44. test_images/3-square.jpg +0 -3
  45. test_images/3.Along the River during the Qingming Festival.jpeg +0 -3
  46. test_images/MUS.png +0 -0
  47. test_images/Picture0.png +0 -0
  48. test_images/Picture1.png +0 -0
  49. test_images/Picture10.png +0 -0
  50. test_images/Picture2.png +0 -0
examples/examples_.DS_Store → .DS_Store RENAMED
Binary files a/examples/examples_.DS_Store and b/.DS_Store differ
 
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
assets/UI.png DELETED

Git LFS Details

  • SHA256: bce7f8b8b11832a98d85ecf7755274df5860d9b5eb35738dabbb2e585d70ddd4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.64 MB
assets/caption_anything_logo.png DELETED
Binary file (150 kB)
 
assets/demo1.jpg DELETED

Git LFS Details

  • SHA256: 7a3bf5f8e4e8a79824f06916cdd41c94c23c5159abf3ecd5045732f27dd358f2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
assets/demo1.png DELETED

Git LFS Details

  • SHA256: 2bd22e897705a8cebb3f1fc2ddf857eeeb1736b7b627cf8c24ed84c17728a4cc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
assets/demo1.svg DELETED
assets/demo2.png DELETED
Binary file (726 kB)
 
assets/demo2.svg DELETED
assets/qingming.gif DELETED

Git LFS Details

  • SHA256: dc052aad5ab86a9a0ac1483853f2370686add2a4b0a5088be86598bec01b533e
  • Pointer size: 132 Bytes
  • Size of remote file: 4.64 MB
assets/times_with_simsun.ttf DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0b15a12dd4bba4a48885c279a1d16590b652773f02137a7e62ede3411970c59f
- size 11066612
assets/title.png DELETED
Binary file (40.8 kB)
 
assets/title.svg DELETED
chatbox.py → backend/chatbox.py RENAMED
File without changes
backend/gpt_service/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .utils import get_gpt_response
+ from .info_queries import get_artistinfo, get_yearinfo
+
+ __all__ = ['get_gpt_response', 'get_artistinfo', 'get_yearinfo']
backend/gpt_service/info_queries.py ADDED
@@ -0,0 +1,39 @@
+ import re
+ import emoji
+ from .utils import get_gpt_response
+
+ async def get_artistinfo(artist_name, api_key, state, language, autoplay, length, log_state, texttospeech_fn):
+     prompt = (
+         f"Provide a concise summary of about {length} words in {language} on the painter {artist_name}, "
+         "covering his biography, major works, artistic style, significant contributions to the art world, "
+         "and any major awards or recognitions he has received. Start your response with 'Artist Background: '."
+     )
+
+     res = get_gpt_response(api_key, None, prompt)
+     state = state + [(None, res)]
+     read_info = re.sub(r'[#[\]!*]', '', res)
+     read_info = emoji.replace_emoji(read_info, replace="")
+     log_state = log_state + [(res, None)]
+
+     if autoplay:
+         audio_output = await texttospeech_fn(read_info, language)
+         return state, state, audio_output, log_state
+     return state, state, None, log_state
+
+ async def get_yearinfo(year, api_key, state, language, autoplay, length, log_state, texttospeech_fn):
+     prompt = (
+         f"Provide a concise summary of about {length} words in {language} on the art historical period "
+         f"associated with the year {year}, covering its major characteristics, influential artists, "
+         "notable works, and its significance in the broader context of art history with 'History Background: '."
+     )
+
+     res = get_gpt_response(api_key, None, prompt)
+     log_state = log_state + [(res, None)]
+     state = state + [(None, res)]
+     read_info = re.sub(r'[#[\]!*]', '', res)
+     read_info = emoji.replace_emoji(read_info, replace="")
+
+     if autoplay:
+         audio_output = await texttospeech_fn(read_info, language)
+         return state, state, audio_output, log_state
+     return state, state, None, log_state
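For context, a minimal sketch of driving one of these coroutines outside the Gradio app; the key, states, and argument values are illustrative placeholders, and texttospeech comes from backend/texttospeech/tts.py below:

```python
import asyncio
from backend.gpt_service import get_artistinfo
from backend.texttospeech.tts import texttospeech

async def demo():
    # All values besides the artist name are illustrative placeholders.
    state, _, audio_html, log_state = await get_artistinfo(
        artist_name="Claude Monet",
        api_key="sk-...",  # hypothetical OpenAI API key
        state=[], language="English", autoplay=True,
        length=100, log_state=[], texttospeech_fn=texttospeech,
    )
    print(state[-1][1])  # the generated "Artist Background" text

asyncio.run(demo())
```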
backend/gpt_service/utils.py ADDED
@@ -0,0 +1,75 @@
+ import json
+ import requests
+ import base64
+
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ def get_gpt_response(api_key, image_path, prompt, history=None):
+
+     headers = {
+         "Content-Type": "application/json",
+         "Authorization": f"Bearer {api_key}"
+     }
+
+     # Keep only the four most recent history entries.
+     if history:
+         if len(history) > 4:
+             history = history[-4:]
+     else:
+         history = []
+
+     messages = history[:]
+     base64_images = []
+
+     if image_path:
+         if isinstance(image_path, list):
+             for img in image_path:
+                 base64_images.append(encode_image(img))
+         else:
+             base64_images.append(encode_image(image_path))
+
+         # One image_url entry per encoded image; interpolating the whole
+         # list into a single data URL would produce an invalid request.
+         content = [{"type": "text", "text": prompt}]
+         for b64 in base64_images:
+             content.append({
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/jpeg;base64,{b64}"
+                 }
+             })
+         messages.append({"role": "user", "content": content})
+     else:
+         messages.append({
+             "role": "user",
+             "content": prompt
+         })
+
+     payload = {
+         "model": "gpt-4o",
+         "messages": messages,
+         "max_tokens": 600
+     }
+
+     # Send the request to the OpenAI API.
+     response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+     try:
+         result = response.json()
+         print("gpt result", result)
+         content = result['choices'][0]['message']['content']
+         # Strip a ```json fence if the model wrapped its output in one.
+         if content.startswith("```json"):
+             content = content[7:]
+         if content.endswith("```"):
+             content = content[:-3]
+         return content
+     except (KeyError, IndexError, json.JSONDecodeError) as e:
+         return json.dumps({"error": "Failed to parse model output", "details": str(e)})
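A short usage sketch of get_gpt_response; the key and image path are placeholders, not files from this repo:

```python
from backend.gpt_service.utils import get_gpt_response

api_key = "sk-..."  # hypothetical OpenAI API key

# Text-only call: `content` stays a plain string, so no image block is built.
answer = get_gpt_response(api_key, None, "Summarize Impressionism in one sentence.")

# Vision call: a path (or list of paths) is base64-encoded into image_url blocks.
caption = get_gpt_response(api_key, "painting.jpg", "Describe this painting.")
print(answer, caption, sep="\n")
```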
backend/prompts/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .generate_prompt import generate_prompt
+
+ __all__ = ['generate_prompt']
backend/prompts/generate_prompt.py ADDED
@@ -0,0 +1,23 @@
+ from .prompt_templates import PromptTemplates
+
+ def generate_prompt(focus_type, paragraph, length, sentiment, factuality, language, narrative):
+     mapped_value = PromptTemplates.FOCUS_MAP.get(focus_type, -1)
+     narrative_value = PromptTemplates.NARRATIVE_MAPPING[narrative]
+
+     controls = {
+         'length': length,
+         'sentiment': sentiment,
+         'factuality': factuality,
+         'language': language
+     }
+
+     if mapped_value != -1:
+         prompt = PromptTemplates.ANALYSIS_PROMPTS[narrative_value][mapped_value].format(
+             Wiki_caption=paragraph,
+             length=controls['length'],
+             sentiment=controls['sentiment'],
+             language=controls['language']
+         )
+     else:
+         prompt = "Invalid focus type."
+     return prompt
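To illustrate the lookup, a small sketch using keys from FOCUS_MAP and NARRATIVE_MAPPING (the caption text is made up):

```python
from backend.prompts import generate_prompt

prompt = generate_prompt(
    focus_type="D+Analysis",   # FOCUS_MAP -> 1
    paragraph="A seated woman in blue, painted around 1875.",  # used as Wiki_caption
    length=80,
    sentiment="neutral",
    factuality="factual",      # accepted but unused by the current templates
    language="English",
    narrative="Narrator",      # NARRATIVE_MAPPING -> 0
)
print(prompt)
```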
backend/prompts/prompt_templates.py ADDED
@@ -0,0 +1,91 @@
+ class PromptTemplates:
+     FOCUS_MAP = {
+         "Describe": 0,
+         "D+Analysis": 1,
+         "DA+Interprete": 2,
+         "Judge": 3
+     }
+
+     NARRATIVE_MAPPING = {
+         "Narrator": 0,
+         "Artist": 1,
+         "In-Situ": 2
+     }
+
+     ANALYSIS_PROMPTS = [
+         [
+             'Wiki_caption: {Wiki_caption}. Help me understand what the selected object is about, and list one fact (describing the selected object, without analysis) as a markdown outline with appropriate emojis, according to the image and the wiki caption. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'Wiki_caption: {Wiki_caption}. Help me understand what the selected object is about, and list one fact and one analysis as a markdown outline with appropriate emojis, according to the image and the wiki caption. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'Wiki_caption: {Wiki_caption}. Help me understand what the selected object is about, and list one fact, one analysis, and one interpretation as a markdown outline with appropriate emojis, according to the image and the wiki caption. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'Wiki_caption: {Wiki_caption}. Help me understand what the selected object is about, and list one judgement of the object and one judgement of the whole artwork (how successful do you think the artist was?) as a markdown outline with appropriate emojis, according to the image and the wiki caption. Each point listed is to be in {language}, with a response length of about {length} words.'
+         ],
+         [
+             "When generating the answer, tell others that you are the creator of this painting and write in the creator's tone and manner. Help me understand what the selected object is about, and list one fact (describing the selected object, without analysis) as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.",
+             "When generating the answer, tell others that you are the creator of this painting and write in the creator's tone and manner. Help me understand what the selected object is about, and list one fact and one analysis from an art-appreciation perspective as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.",
+             "When generating the answer, tell others that you are the creator of this painting and write in the creator's tone and manner. Help me understand what the selected object is about, and list one fact, one analysis, and one interpretation from an art-appreciation perspective as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.",
+             "When generating the answer, tell others that you are one of the creators of these paintings and write in the creator's tone and manner. According to the image and wiki_caption {Wiki_caption}, help me understand what the selected object is about, and list one judgement of the object and one judgement of the whole artwork (how successful do you think the artist was?) as a markdown outline with appropriate emojis. Generate the points as if you are the creator of this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.",
+         ],
+         [
+             'When generating answers, tell people that you are the selected object itself and write in the tone and manner of that object or person. Help me understand what the selected object is about, and list one fact (describing the selected object, without analysis) as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the object in this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'When generating answers, tell people that you are the selected object itself and write in the tone and manner of that object or person. Help me understand what the selected object is about, and list one fact and one analysis from an art-appreciation perspective as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the object in this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'When generating answers, tell people that you are the selected object itself and write in the tone and manner of that object or person. Help me understand what the selected object is about, and list one fact, one analysis, and one interpretation from an art-appreciation perspective as a markdown outline with appropriate emojis, according to the image and {Wiki_caption}. Generate the points as if you are the object in this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.',
+             'When generating answers, tell people that you are the selected object itself and write in the tone and manner of that object or person. According to the image and wiki_caption {Wiki_caption}, help me understand what the selected object is about, and list one judgement of the object and one judgement of the whole artwork (how successful do you think the artist was?) as a markdown outline with appropriate emojis. Generate the points as if you are the object in this painting and start every sentence with I. Each point listed is to be in {language}, with a response length of about {length} words.',
+         ]
+     ]
+
+     RECOMMENDATION_PROMPTS = [
+         [
+             '''
+             First identify the object in the first painting and store it as the parameter {{object}}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+             Recommendation reason: {{Recommendation based on the {{object}} in the painting you saw earlier. Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Please generate three points.}}
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             ''',
+             '''
+             When generating answers, tell people that you are the creator of the painting they were looking at earlier, and write in that creator's tone and manner.
+
+             First identify the object in the first painting and store it as the parameter {{object}}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+
+             Recommendation reason: {{I'm the creator of that painting you saw earlier. I'm an artist, and I'm recommending this painting because the {{object}} I drew also appears in the painting you're looking at.}} Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Generate the three points as if you are the creator of the earlier painting and start every sentence with I.
+
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             ''',
+             '''
+             When generating answers, tell people that you are the object that was selected in the painting, and write in that object's tone and manner.
+
+             First identify the object in the first painting and store it as the parameter {{object}}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+
+             Recommendation reason: {{I'm the {{object}} in the painting you were looking at earlier, and I'm recommending this painting because I also appear in the one you're looking at.}} Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Generate the three points as if you are the object in this painting and start every sentence with I.
+
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             '''
+         ],
+         [
+             '''
+             First identify the name of the first painting and store it as the parameter {{name}}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+             Recommendation reason: {{Recommendation based on the painting {{name}}. Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Please generate three points.}}
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             ''',
+             '''
+             When generating answers, tell people that you are the creator of the painting they were looking at earlier, and write in that creator's tone and manner.
+
+             First identify the creator of the first painting and store it as the parameter {artist}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+
+             Recommendation reason: {{I'm the creator of that painting you saw earlier, {artist}. I'm an artist, and I'm recommending this painting because it is similar to the one of mine you just saw.}} Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Generate the three points as if you are the creator of the earlier painting and start every sentence with I.
+
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             ''',
+             '''
+             When generating answers, tell people that you are the painting the viewer was looking at earlier, and write in that painting's tone and manner.
+
+             First identify the name of the first painting and store it as the parameter {{name}}; you do not need to tell me, the following will use the parameter. Write the recommendation reason as a markdown outline with appropriate emojis describing what you see in the painting:
+
+             Recommendation reason: {{I'm the painting {{name}} you were looking at earlier, and I'm recommending this painting because I'm similar to the one you're looking at.}} Detailed analysis: based on the recommendation reason and the relationship between the two paintings, explain why you recommend the other painting. Generate the three points as if you are the painting the viewer saw earlier and start every sentence with I.
+
+             Each bullet point should be in {language}, with a response length of about {length} words.
+             '''
+         ]
+     ]
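The two mappings above index ANALYSIS_PROMPTS as a 3x4 grid (narrative x focus). A direct-indexing sketch, equivalent to what generate_prompt does:

```python
from backend.prompts.prompt_templates import PromptTemplates

narrative = PromptTemplates.NARRATIVE_MAPPING["In-Situ"]  # 2
focus = PromptTemplates.FOCUS_MAP["Judge"]                # 3
template = PromptTemplates.ANALYSIS_PROMPTS[narrative][focus]
print(template.format(Wiki_caption="A basket of fruit.", length=60, language="English"))
```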
backend/recommendation/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .config import RecommendationConfig
+ from .recommender import ImageRecommender
+
+ __all__ = ['RecommendationConfig', 'ImageRecommender']
backend/recommendation/config.py ADDED
@@ -0,0 +1,23 @@
+ import torch
+ from transformers import AutoProcessor, SiglipModel
+ from huggingface_hub import hf_hub_download
+ import faiss
+ import pandas as pd
+
+ class RecommendationConfig:
+     def __init__(self):
+         # Fetch the prebuilt FAISS index and image metadata for the WikiArt subset.
+         hf_hub_download("merve/siglip-faiss-wikiart", "siglip_10k_latest.index", local_dir="./")
+         hf_hub_download("merve/siglip-faiss-wikiart", "wikiart_10k_latest.csv", local_dir="./")
+
+         self.index = faiss.read_index("./siglip_10k_latest.index")
+         self.df = pd.read_csv("./wikiart_10k_latest.csv")
+
+         self.device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
+         self.processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
+         self.model = SiglipModel.from_pretrained("google/siglip-base-patch16-224").to(self.device)
+
+     def get_messages(self, language):
+         return {
+             "English": "🖼️ Please refer to the section below to see the recommended results.",
+             "Chinese": "🖼️ 请到下方查看推荐结果。"
+         }[language]
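Note that constructing the config eagerly downloads the FAISS index, the CSV metadata, and the SigLIP weights, so even a minimal sketch like this performs real network I/O on first run:

```python
from backend.recommendation import RecommendationConfig

config = RecommendationConfig()   # downloads index, CSV, and model weights
print(config.index.ntotal)        # number of indexed WikiArt images
print(config.get_messages("English"))
```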
backend/recommendation/recommender.py ADDED
@@ -0,0 +1,107 @@
+ import torch
+ import numpy as np
+ import faiss
+ from PIL import Image
+ from io import BytesIO
+ import requests
+ import spaces
+ import gradio as gr
+ import re
+ import emoji
+ from ..prompts.prompt_templates import PromptTemplates
+
+ class ImageRecommender:
+     def __init__(self, config):
+         self.config = config
+
+     def read_image_from_url(self, url):
+         response = requests.get(url)
+         img = Image.open(BytesIO(response.content)).convert("RGB")
+         return img
+
+     def extract_features_siglip(self, image):
+         with torch.no_grad():
+             inputs = self.config.processor(images=image, return_tensors="pt").to(self.config.device)
+             image_features = self.config.model.get_image_features(**inputs)
+             return image_features
+
+     def process_image(self, image_path, num_results=2):
+         input_image = Image.open(image_path).convert("RGB")
+         input_features = self.extract_features_siglip(input_image)
+         input_features = input_features.detach().cpu().numpy()
+         input_features = np.float32(input_features)
+         faiss.normalize_L2(input_features)
+
+         distances, indices = self.config.index.search(input_features, num_results)
+         gallery_output = []
+
+         for i, v in enumerate(indices[0]):
+             sim = -distances[0][i]  # similarity score (currently unused)
+             image_url = self.config.df.iloc[v]["Link"]
+             img_retrieved = self.read_image_from_url(image_url)
+             gallery_output.append(img_retrieved)
+
+         return gallery_output
+
+     @spaces.GPU
+     def infer(self, crop_image_path, full_image_path, state, language, task_type=None):
+         style_gallery_output = []
+         item_gallery_output = []
+
+         if crop_image_path:
+             item_gallery_output = self.process_image(crop_image_path, 2)
+             style_gallery_output = self.process_image(full_image_path, 2)
+         else:
+             style_gallery_output = self.process_image(full_image_path, 4)
+
+         msg = self.config.get_messages(language)
+         state += [(None, msg)]
+
+         return item_gallery_output, style_gallery_output, state, state
+
+     async def item_associate(self, new_crop, openai_api_key, language, autoplay, length,
+                              log_state, sort_score, narrative, state, evt: gr.SelectData):
+         rec_path = evt._data['value']['image']['path']
+         return (
+             state,
+             state,
+             None,
+             log_state,
+             None,
+             gr.update(value=[]),
+             rec_path,
+             rec_path,
+             "Item"
+         )
+
+     async def style_associate(self, image_path, openai_api_key, language, autoplay,
+                               length, log_state, sort_score, narrative, state, artist,
+                               evt: gr.SelectData):
+         rec_path = evt._data['value']['image']['path']
+         return (
+             state,
+             state,
+             None,
+             log_state,
+             None,
+             gr.update(value=[]),
+             rec_path,
+             rec_path,
+             "Style"
+         )
+
+     def generate_recommendation_prompt(self, recommend_type, narrative, language, length, artist=None):
+
+         narrative_value = PromptTemplates.NARRATIVE_MAPPING[narrative]
+         prompt_type = 0 if recommend_type == "Item" else 1
+
+         if narrative_value == 1 and recommend_type == "Style":
+             return PromptTemplates.RECOMMENDATION_PROMPTS[prompt_type][narrative_value].format(
+                 language=language,
+                 length=length,
+                 artist=artist[8:] if artist else ""  # drop the leading 8-character label from the artist string
+             )
+         else:
+             return PromptTemplates.RECOMMENDATION_PROMPTS[prompt_type][narrative_value].format(
+                 language=language,
+                 length=length
+             )
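A minimal sketch of a style-only recommendation; the image path is a hypothetical local file, and @spaces.GPU is typically a no-op outside a Hugging Face Spaces runtime:

```python
from backend.recommendation import RecommendationConfig, ImageRecommender

recommender = ImageRecommender(RecommendationConfig())

# No cropped region, so infer returns four style-based matches for the full image.
items, styles, state, _ = recommender.infer(
    crop_image_path=None,
    full_image_path="painting.jpg",  # hypothetical local file
    state=[],
    language="English",
)
print(len(items), len(styles))  # 0 4
```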
backend/texttospeech/tts.py ADDED
@@ -0,0 +1,33 @@
+ import edge_tts
+ import base64
+ from io import BytesIO
+
+ filtered_language_dict = {
+     'English': {'female': 'en-US-JennyNeural', 'male': 'en-US-GuyNeural'},
+     'Chinese': {'female': 'zh-CN-XiaoxiaoNeural', 'male': 'zh-CN-YunxiNeural'},
+     'French': {'female': 'fr-FR-DeniseNeural', 'male': 'fr-FR-HenriNeural'},
+     'Spanish': {'female': 'es-MX-DaliaNeural', 'male': 'es-MX-JorgeNeural'},
+     'Arabic': {'female': 'ar-SA-ZariyahNeural', 'male': 'ar-SA-HamedNeural'},
+     'Portuguese': {'female': 'pt-BR-FranciscaNeural', 'male': 'pt-BR-AntonioNeural'},
+     'Cantonese': {'female': 'zh-HK-HiuGaaiNeural', 'male': 'zh-HK-WanLungNeural'}
+ }
+
+ async def texttospeech(text, language, gender='female'):
+     try:
+         voice = filtered_language_dict[language][gender]
+         communicate = edge_tts.Communicate(text=text, voice=voice, rate="+25%")
+         file_path = "output.wav"
+         await communicate.save(file_path)
+
+         with open(file_path, "rb") as audio_file:
+             audio_bytes = BytesIO(audio_file.read())
+         audio = base64.b64encode(audio_bytes.read()).decode("utf-8")
+         print("TTS processing completed.")
+
+         audio_style = 'style="width:210px;"'
+         audio_player = f'<audio src="data:audio/wav;base64,{audio}" controls autoplay {audio_style}></audio>'
+         return audio_player
+
+     except Exception as e:
+         print(f"Error in texttospeech: {e}")
+         return None
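Because texttospeech is a coroutine that returns an HTML audio snippet (or None on failure), a standalone sketch needs an event loop:

```python
import asyncio
from backend.texttospeech.tts import texttospeech

player_html = asyncio.run(texttospeech("Welcome to the gallery.", "English", gender="male"))
if player_html:
    print(player_html[:60], "...")  # base64 data-URL audio player for the chat UI
```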
configs/instant-mesh-base.yaml DELETED
@@ -1,22 +0,0 @@
- model_config:
-   target: src.models.lrm_mesh.InstantMesh
-   params:
-     encoder_feat_dim: 768
-     encoder_freeze: false
-     encoder_model_name: facebook/dino-vitb16
-     transformer_dim: 1024
-     transformer_layers: 12
-     transformer_heads: 16
-     triplane_low_res: 32
-     triplane_high_res: 64
-     triplane_dim: 40
-     rendering_samples_per_ray: 96
-     grid_res: 128
-     grid_scale: 2.1
-
-
- infer_config:
-   unet_path: ckpts/diffusion_pytorch_model.bin
-   model_path: ckpts/instant_mesh_base.ckpt
-   texture_resolution: 1024
-   render_resolution: 512
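These deleted configs all follow the common `target`/`params` convention: `target` names a dotted import path and `params` are passed as constructor keyword arguments. A hypothetical loader sketch of that pattern (InstantMesh ships its own equivalent utility; omegaconf is assumed):

```python
import importlib
from omegaconf import OmegaConf

def instantiate_from_config(cfg):
    # cfg["target"] is "package.module.ClassName"; params become constructor kwargs.
    module_name, cls_name = cfg["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**cfg.get("params", {}))

cfg = OmegaConf.load("configs/instant-mesh-base.yaml")
model = instantiate_from_config(cfg.model_config)
```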
configs/instant-mesh-large-train.yaml DELETED
@@ -1,67 +0,0 @@
- model:
-   base_learning_rate: 4.0e-05
-   target: src.model_mesh.MVRecon
-   params:
-     init_ckpt: logs/instant-nerf-large-train/checkpoints/last.ckpt
-     input_size: 320
-     render_size: 512
-
-     lrm_generator_config:
-       target: src.models.lrm_mesh.InstantMesh
-       params:
-         encoder_feat_dim: 768
-         encoder_freeze: false
-         encoder_model_name: facebook/dino-vitb16
-         transformer_dim: 1024
-         transformer_layers: 16
-         transformer_heads: 16
-         triplane_low_res: 32
-         triplane_high_res: 64
-         triplane_dim: 80
-         rendering_samples_per_ray: 128
-         grid_res: 128
-         grid_scale: 2.1
-
-
- data:
-   target: src.data.objaverse.DataModuleFromConfig
-   params:
-     batch_size: 2
-     num_workers: 8
-     train:
-       target: src.data.objaverse.ObjaverseData
-       params:
-         root_dir: data/objaverse
-         meta_fname: filtered_obj_name.json
-         input_image_dir: rendering_random_32views
-         target_image_dir: rendering_random_32views
-         input_view_num: 6
-         target_view_num: 4
-         total_view_n: 32
-         fov: 50
-         camera_rotation: true
-         validation: false
-     validation:
-       target: src.data.objaverse.ValidationData
-       params:
-         root_dir: data/valid_samples
-         input_view_num: 6
-         input_image_size: 320
-         fov: 30
-
-
- lightning:
-   modelcheckpoint:
-     params:
-       every_n_train_steps: 2000
-       save_top_k: -1
-       save_last: true
-   callbacks: {}
-
-   trainer:
-     benchmark: true
-     max_epochs: -1
-     val_check_interval: 1000
-     num_sanity_val_steps: 0
-     accumulate_grad_batches: 1
-     check_val_every_n_epoch: null  # if this is not set, validation does not run
configs/instant-mesh-large.yaml DELETED
@@ -1,22 +0,0 @@
- model_config:
-   target: src.models.lrm_mesh.InstantMesh
-   params:
-     encoder_feat_dim: 768
-     encoder_freeze: false
-     encoder_model_name: facebook/dino-vitb16
-     transformer_dim: 1024
-     transformer_layers: 16
-     transformer_heads: 16
-     triplane_low_res: 32
-     triplane_high_res: 64
-     triplane_dim: 80
-     rendering_samples_per_ray: 128
-     grid_res: 128
-     grid_scale: 2.1
-
-
- infer_config:
-   unet_path: ckpts/diffusion_pytorch_model.bin
-   model_path: ckpts/instant_mesh_large.ckpt
-   texture_resolution: 1024
-   render_resolution: 512
configs/instant-nerf-base.yaml DELETED
@@ -1,21 +0,0 @@
- model_config:
-   target: src.models.lrm.InstantNeRF
-   params:
-     encoder_feat_dim: 768
-     encoder_freeze: false
-     encoder_model_name: facebook/dino-vitb16
-     transformer_dim: 1024
-     transformer_layers: 12
-     transformer_heads: 16
-     triplane_low_res: 32
-     triplane_high_res: 64
-     triplane_dim: 40
-     rendering_samples_per_ray: 96
-
-
- infer_config:
-   unet_path: ckpts/diffusion_pytorch_model.bin
-   model_path: ckpts/instant_nerf_base.ckpt
-   mesh_threshold: 10.0
-   mesh_resolution: 256
-   render_resolution: 384
configs/instant-nerf-large-train.yaml DELETED
@@ -1,65 +0,0 @@
- model:
-   base_learning_rate: 4.0e-04
-   target: src.model.MVRecon
-   params:
-     input_size: 320
-     render_size: 192
-
-     lrm_generator_config:
-       target: src.models.lrm.InstantNeRF
-       params:
-         encoder_feat_dim: 768
-         encoder_freeze: false
-         encoder_model_name: facebook/dino-vitb16
-         transformer_dim: 1024
-         transformer_layers: 16
-         transformer_heads: 16
-         triplane_low_res: 32
-         triplane_high_res: 64
-         triplane_dim: 80
-         rendering_samples_per_ray: 128
-
-
- data:
-   target: src.data.objaverse.DataModuleFromConfig
-   params:
-     batch_size: 2
-     num_workers: 8
-     train:
-       target: src.data.objaverse.ObjaverseData
-       params:
-         root_dir: data/objaverse
-         meta_fname: filtered_obj_name.json
-         input_image_dir: rendering_random_32views
-         target_image_dir: rendering_random_32views
-         input_view_num: 6
-         target_view_num: 4
-         total_view_n: 32
-         fov: 50
-         camera_rotation: true
-         validation: false
-     validation:
-       target: src.data.objaverse.ValidationData
-       params:
-         root_dir: data/valid_samples
-         input_view_num: 6
-         input_image_size: 320
-         fov: 30
-
-
- lightning:
-   modelcheckpoint:
-     params:
-       every_n_train_steps: 1000
-       save_top_k: -1
-       save_last: true
-   callbacks: {}
-
-   trainer:
-     benchmark: true
-     max_epochs: -1
-     gradient_clip_val: 1.0
-     val_check_interval: 1000
-     num_sanity_val_steps: 0
-     accumulate_grad_batches: 1
-     check_val_every_n_epoch: null  # if this is not set, validation does not run
configs/instant-nerf-large.yaml DELETED
@@ -1,21 +0,0 @@
- model_config:
-   target: src.models.lrm.InstantNeRF
-   params:
-     encoder_feat_dim: 768
-     encoder_freeze: false
-     encoder_model_name: facebook/dino-vitb16
-     transformer_dim: 1024
-     transformer_layers: 16
-     transformer_heads: 16
-     triplane_low_res: 32
-     triplane_high_res: 64
-     triplane_dim: 80
-     rendering_samples_per_ray: 128
-
-
- infer_config:
-   unet_path: ckpts/diffusion_pytorch_model.bin
-   model_path: ckpts/instant_nerf_large.ckpt
-   mesh_threshold: 10.0
-   mesh_resolution: 256
-   render_resolution: 384
configs/zero123plus-finetune.yaml DELETED
@@ -1,47 +0,0 @@
- model:
-   base_learning_rate: 1.0e-05
-   target: zero123plus.model.MVDiffusion
-   params:
-     drop_cond_prob: 0.1
-
-     stable_diffusion_config:
-       pretrained_model_name_or_path: sudo-ai/zero123plus-v1.2
-       custom_pipeline: ./zero123plus
-
- data:
-   target: src.data.objaverse_zero123plus.DataModuleFromConfig
-   params:
-     batch_size: 6
-     num_workers: 8
-     train:
-       target: src.data.objaverse_zero123plus.ObjaverseData
-       params:
-         root_dir: data/objaverse
-         meta_fname: lvis-annotations.json
-         image_dir: rendering_zero123plus
-         validation: false
-     validation:
-       target: src.data.objaverse_zero123plus.ObjaverseData
-       params:
-         root_dir: data/objaverse
-         meta_fname: lvis-annotations.json
-         image_dir: rendering_zero123plus
-         validation: true
-
-
- lightning:
-   modelcheckpoint:
-     params:
-       every_n_train_steps: 1000
-       save_top_k: -1
-       save_last: true
-   callbacks: {}
-
-   trainer:
-     benchmark: true
-     max_epochs: -1
-     gradient_clip_val: 1.0
-     val_check_interval: 1000
-     num_sanity_val_steps: 0
-     accumulate_grad_batches: 1
-     check_val_every_n_epoch: null  # if this is not set, validation does not run
examples/female.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:89a4fa9a16b6463f852cf9424f72c3d3c87aa83010e89db534c53fcd1ae12c02
- size 1002030
examples/male.wav DELETED
Binary file (762 kB)
 
recomendation_pic/1.8.jpg DELETED
Binary file (99 kB)
 
recomendation_pic/1.9.jpg DELETED
Binary file (59.1 kB)
 
recomendation_pic/2.8.jpg DELETED
Binary file (71.6 kB)
 
recomendation_pic/2.9.png DELETED
Binary file (298 kB)
 
recomendation_pic/3.8.png DELETED
Binary file (439 kB)
 
recomendation_pic/3.9.png DELETED
Binary file (739 kB)
 
recomendation_pic/basket-2.png DELETED
Binary file (449 kB)
 
recomendation_pic/readme.md DELETED
File without changes
test_images/1.The Ambassadors.jpg DELETED
Binary file (78 kB)
 
test_images/2.Football Players.jpg DELETED
Binary file (86.1 kB)
 
test_images/3-square.jpg DELETED

Git LFS Details

  • SHA256: e2a8f2e93e275b853d47803136cf8a8dc10f62001779a8d903ceb9c3678cc481
  • Pointer size: 132 Bytes
  • Size of remote file: 1.06 MB
test_images/3.Along the River during the Qingming Festival.jpeg DELETED

Git LFS Details

  • SHA256: 3fc255019acfe629f0838ec225028f32f38b71ebd01a2abcaa8e261eae48a521
  • Pointer size: 132 Bytes
  • Size of remote file: 1.17 MB
test_images/MUS.png DELETED
Binary file (471 kB)
 
test_images/Picture0.png DELETED
Binary file (399 kB)
 
test_images/Picture1.png DELETED
Binary file (452 kB)
 
test_images/Picture10.png DELETED
Binary file (268 kB)
 
test_images/Picture2.png DELETED
Binary file (293 kB)