feature: add intro page, cleanup descriptions
- app.py +4 -2
- image2text.py +12 -6
- intro.md +32 -0
- intro.py +6 -0
- text2image.py +2 -10
- text2patch.py +4 -2
app.py
CHANGED
@@ -1,17 +1,19 @@
 import streamlit as st
 
 import image2text
+import intro
 import text2image
 import text2patch
 
 PAGES = {
+    "Introduction": intro,
     "Text to Image": text2image,
     "Image to Text": image2text,
-    "
+    "Text to Patch": text2patch,
 }
 
 st.sidebar.title("Navigation")
 model = st.sidebar.selectbox("Choose a model", ["koclip-base", "koclip-large"])
-page = st.sidebar.selectbox("
+page = st.sidebar.selectbox("Navigate to...", list(PAGES.keys()))
 
 PAGES[page].app(model)
image2text.py
CHANGED
@@ -14,9 +14,9 @@ def app(model_name):
     st.title("Zero-shot Image Classification")
     st.markdown(
         """
-        This
-
-
+        This demo explores KoCLIP's zero-shot prediction capabilities. The model takes an image and a list of candidate captions from the user and predicts which caption best describes the given image.
+
+        ---
         """
     )
 
@@ -30,6 +30,7 @@ def app(model_name):
 
     with col2:
         captions_count = st.selectbox("Number of labels", options=range(1, 6), index=2)
+        normalize = st.checkbox("Apply Softmax")
         compute = st.button("Classify")
 
     with col1:
@@ -37,7 +38,7 @@ def app(model_name):
         defaults = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]
         for idx in range(captions_count):
            value = defaults[idx] if idx < len(defaults) else ""
-            captions.append(st.text_input(f"Insert
+            captions.append(st.text_input(f"Insert caption {idx+1}", value=value))
 
     if compute:
         if not any([query1, query2]):
@@ -61,8 +62,13 @@ def app(model_name):
            inputs["pixel_values"], axes=[0, 2, 3, 1]
        )
        outputs = model(**inputs)
-
-
+        if normalize:
+            name = "normalized prob"
+            probs = jax.nn.softmax(outputs.logits_per_image, axis=1)
+        else:
+            name = "cosine sim"
+            probs = outputs.logits_per_image
+        chart_data = pd.Series(probs[0], index=captions, name=name)
 
        col1, col2 = st.beta_columns(2)
        with col1:
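For reference, the new "Apply Softmax" toggle switches the chart between the model's raw `logits_per_image` (scaled cosine similarities between the image and each caption, labeled "cosine sim" in the app) and a softmax over the caption axis, which turns those scores into a probability distribution. A minimal sketch of that conversion, using stand-in logits rather than real model outputs:

```python
import jax
import jax.numpy as jnp
import numpy as np
import pandas as pd

# Stand-in logits for one image against three candidate captions
# (in the app these come from outputs.logits_per_image).
logits_per_image = jnp.array([[21.3, 18.7, 19.9]])
captions = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]

# Raw scores, shown when the "Apply Softmax" box is unchecked.
cosine_scores = pd.Series(np.asarray(logits_per_image[0]), index=captions, name="cosine sim")

# Softmax over the caption axis yields values that sum to 1 ("normalized prob").
probs = jax.nn.softmax(logits_per_image, axis=1)
normalized = pd.Series(np.asarray(probs[0]), index=captions, name="normalized prob")

print(cosine_scores)
print(normalized)
```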
intro.md
ADDED
@@ -0,0 +1,32 @@
+# KoCLIP
+
+KoCLIP is a Korean port of OpenAI's CLIP.
+
+## Models
+
+We trained two models, `koclip-base` and `koclip-large`. Both use RoBERTa-large, a fairly large language model, as the text encoder. This decision was motivated by the intuition that annotated Korean datasets are rare; a well-trained language model would be key to producing a performant multimodal pipeline given limited data.
+
+| KoCLIP         | LM                   | ViT                            |
+|----------------|----------------------|--------------------------------|
+| `koclip-base`  | `klue/roberta-large` | `openai/clip-vit-base-patch32` |
+| `koclip-large` | `klue/roberta-large` | `google/vit-large-patch16-224` |
+
+## Data
+
+KoCLIP was fine-tuned on 82,783 images from the [MSCOCO](https://cocodataset.org/#home) 2014 image captioning dataset. Korean translations of the image captions were obtained from [AI Hub](https://aihub.or.kr/keti_data_board/visual_intelligence), an open database maintained by subsidiaries of the Korean Ministry of Science and ICT. Validation metrics were monitored on approximately 40,000 images from the validation set of the same dataset.
+
+While we also considered alternative multilingual image captioning datasets, notably the Wikipedia-based Image Text (WiT) dataset, we found non-trivial discrepancies in how captions were curated in WiT and MSCOCO, and ultimately decided to train on the relatively cleaner MSCOCO captions rather than introduce more noise.
+
+## Demo
+
+We present three demos, each illustrating a different use case of KoCLIP.
+
+* *Image to Text*: Essentially a zero-shot image classification task. Given an input image, the model finds the most likely caption among the text labels provided.
+* *Text to Image*: Essentially an image retrieval task. Given a text query, the model looks up a database of pre-computed image embeddings and retrieves the images that best match the text.
+* *Text to Patch*: Another variant of zero-shot image classification. Given a text and an image, the image is partitioned into subsections, and the model ranks them by their relevance to the text query.
+
+---
+
+We thank the teams at Hugging Face and Google for arranging this wonderful opportunity. It has been a busy yet enormously rewarding week for all of us. We hope you enjoy the demo!
+
+
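To make the "Image to Text" demo described in intro.md concrete, here is a minimal sketch of the zero-shot flow, modeled on image2text.py. The `load_model` import path, the CLIPProcessor-style keyword arguments, and the example image path are assumptions for illustration, not the repo's exact API.

```python
import jax
import jax.numpy as jnp
from PIL import Image

from utils import load_model  # assumed location of the repo's load_model helper

model, processor = load_model("koclip/koclip-base")

image = Image.open("example.jpg")  # any local image
captions = ["귀여운 고양이", "멋있는 강아지", "포동포동한 햄스터"]

# Assumed processor call; image2text.py transposes pixel_values to
# channel-last before the forward pass, so the same is done here.
inputs = processor(text=captions, images=image, return_tensors="jax", padding=True)
inputs["pixel_values"] = jnp.transpose(inputs["pixel_values"], axes=[0, 2, 3, 1])

outputs = model(**inputs)
probs = jax.nn.softmax(outputs.logits_per_image, axis=1)
for caption, p in zip(captions, probs[0]):
    print(f"{caption}: {float(p):.3f}")
```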
intro.py
ADDED
@@ -0,0 +1,6 @@
+import streamlit as st
+
+
+def app(*args):
+    with open("intro.md") as f:
+        st.markdown(f.read())
text2image.py
CHANGED
@@ -17,17 +17,9 @@ def app(model_name):
     st.title("Text to Image Search Engine")
     st.markdown(
         """
-        This
-        5000 images from [MSCOCO](https://cocodataset.org/#home) 2017 validation set was generated using trained KoCLIP
-        vision model. They are ranked based on cosine similarity distance from input Text query embeddings and top 10 images
-        are displayed below.
+        This demo explores KoCLIP's use case as a Korean image search engine. We pre-computed embeddings of 5000 images from the [MSCOCO](https://cocodataset.org/#home) 2017 validation set using KoCLIP's ViT backbone. Then, given a text query from the user, these image embeddings are ranked based on cosine similarity, and the top matches are displayed below.
 
-
-        Korean caption annotations. Korean translation of caption annotations were obtained from [AI Hub](https://aihub.or.kr/keti_data_board/visual_intelligence).
-        Base model `koclip` uses `klue/roberta` as text encoder and `openai/clip-vit-base-patch32` as image encoder.
-        Larger model `koclip-large` uses `klue/roberta` as text encoder and bigger `google/vit-large-patch16-224` as image encoder.
-
-        Example Queries : 컴퓨터하는 고양이(Cat playing on a computer), 길 위에서 달리는 자동차(Car running on the road)
+        Example Queries: 컴퓨터하는 고양이 (Cat playing on a computer), 길 위에서 달리는 자동차 (Car on the road)
         """
     )
 
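The retrieval step text2image.py describes (ranking pre-computed image embeddings against a text query embedding by cosine similarity) can be sketched in a few lines. The function name, the 512-dimensional embedding size, and the random stand-in data are illustrative assumptions:

```python
import jax
import jax.numpy as jnp


def rank_images(text_embedding: jnp.ndarray, image_embeddings: jnp.ndarray, top_k: int = 10):
    """Return indices of the top_k images most similar to the text query."""
    # L2-normalize so the dot product equals cosine similarity.
    text = text_embedding / jnp.linalg.norm(text_embedding)
    images = image_embeddings / jnp.linalg.norm(image_embeddings, axis=1, keepdims=True)
    sims = images @ text  # shape: (num_images,)
    return jnp.argsort(-sims)[:top_k]


# Stand-in data: 5000 pre-computed image embeddings, matching the demo's index size.
image_embeddings = jax.random.normal(jax.random.PRNGKey(0), (5000, 512))
text_embedding = jax.random.normal(jax.random.PRNGKey(1), (512,))
print(rank_images(text_embedding, image_embeddings))
```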
text2patch.py
CHANGED
@@ -25,7 +25,7 @@ def split_image(im, num_rows=3, num_cols=3):
 def app(model_name):
     model, processor = load_model(f"koclip/{model_name}")
 
-    st.title("Patch-based Relevance
+    st.title("Patch-based Relevance Ranking")
     st.markdown(
         """
         Given a piece of text, the CLIP model finds the part of an image that best explains the text.
@@ -37,6 +37,8 @@ def app(model_name):
         which will yield the most relevant image tile from a grid of the image. You can specify how
         granular you want to be with your search by specifying the number of rows and columns that
         make up the image grid.
+
+        ---
         """
     )
 
@@ -46,7 +48,7 @@ def app(model_name):
     )
     query2 = st.file_uploader("or upload an image...", type=["jpg", "jpeg", "png"])
     captions = st.text_input(
-        "Enter
+        "Enter a prompt to query the image.",
        value="이건 서울의 경복궁 사진이다.",
     )
 
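For the Text-to-Patch demo above, the hunk header shows a `split_image(im, num_rows=3, num_cols=3)` helper. A plausible sketch of such a grid split (not the repo's actual implementation) is:

```python
from PIL import Image


def split_image(im: Image.Image, num_rows: int = 3, num_cols: int = 3):
    """Crop an image into num_rows x num_cols tiles, row by row."""
    width, height = im.size
    tile_w, tile_h = width // num_cols, height // num_rows
    tiles = []
    for row in range(num_rows):
        for col in range(num_cols):
            left, upper = col * tile_w, row * tile_h
            tiles.append(im.crop((left, upper, left + tile_w, upper + tile_h)))
    return tiles
```

Each tile would then be embedded with the ViT backbone and scored against the text query's embedding, with the highest-scoring tile surfaced as the most relevant patch.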