Spaces: Build error
Nupur Kumari committed dbc579c
Parent(s): f4d0eb6
update
Browse files:
- app.py +86 -25
- inference.py +1 -0
- trainer.py +54 -24
app.py
CHANGED
@@ -25,8 +25,7 @@ It is recommended to upgrade to GPU in Settings after duplicating this space to
 DETAILDESCRIPTION='''
 Custom Diffusion allows you to fine-tune text-to-image diffusion models, such as Stable Diffusion, given a few images of a new concept (~4-20).
 We fine-tune only a subset of model parameters, namely key and value projection matrices, in the cross-attention layers and the modifier token used to represent the object.
-This also reduces the extra storage for each additional concept to 75MB.
-Our method further allows you to use a combination of concepts. Demo for multiple concepts will be added soon.
+This also reduces the extra storage for each additional concept to 75MB. Our method also allows you to use a combination of concepts. There's still limitations on which compositions work. For more analysis please refer to our [website](https://www.cs.cmu.edu/~custom-diffusion/).
 <center>
 <img src="https://huggingface.co/spaces/nupurkmr9/custom-diffusion/resolve/main/method.jpg" width="600" align="center" >
 </center>
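The description above notes that only the key and value projection matrices of the cross-attention layers (plus the modifier token embedding) are fine-tuned. As a rough standalone sketch of what that parameter subset looks like in diffusers (the model id and the attn2.to_k / attn2.to_v naming are assumptions for illustration, not part of this commit):

```python
# Illustrative only (not part of this commit): selecting the cross-attention
# key/value projection weights that Custom Diffusion fine-tunes. The model id
# and the 'attn2.to_k'/'attn2.to_v' parameter names are assumptions based on
# the standard diffusers UNet2DConditionModel layout.
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    'CompVis/stable-diffusion-v1-4', subfolder='unet')

trainable = []
for name, param in unet.named_parameters():
    # 'attn2' blocks are the cross-attention layers; to_k/to_v are the key and
    # value projection matrices mentioned in the description above.
    if 'attn2.to_k' in name or 'attn2.to_v' in name:
        param.requires_grad_(True)
        trainable.append(name)
    else:
        param.requires_grad_(False)

print(f'{len(trainable)} key/value projection tensors selected for fine-tuning')
```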
@@ -81,27 +80,82 @@ def create_training_demo(trainer: Trainer,
 
         with gr.Row():
             with gr.Box():
+                concept_images_collection = []
+                concept_prompt_collection = []
+                class_prompt_collection = []
+                buttons_collection = []
+                delete_collection = []
+                is_visible = []
+                maximum_concepts = 3
+                row = [None] * maximum_concepts
+                for x in range(maximum_concepts):
+                    ordinal = lambda n: "%d%s" % (n, "tsnrhtdd"[(n // 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])
+                    ordinal_concept = ["<new1> cat", "<new2> wooden pot", "<new3> chair"]
+                    if(x == 0):
+                        visible = True
+                        is_visible.append(gr.State(value=True))
+                    else:
+                        visible = False
+                        is_visible.append(gr.State(value=False))
+
+                    concept_images_collection.append(gr.Files(label=f'''Upload the images for your {ordinal(x+1) if (x>0) else ""} concept''', visible=visible))
+                    with gr.Column(visible=visible) as row[x]:
+                        concept_prompt_collection.append(
+                            gr.Textbox(label=f'''{ordinal(x+1) if (x>0) else ""} concept prompt ''', max_lines=1,
+                                       placeholder=f'''Example: "photo of a {ordinal_concept[x]}"''' )
+                        )
+                        class_prompt_collection.append(
+                            gr.Textbox(label=f'''{ordinal(x+1) if (x>0) else ""} class prompt ''',
+                                       max_lines=1, placeholder=f'''Example: "{ordinal_concept[x][7:]}"''')
+                        )
+                    with gr.Row():
+                        if(x < maximum_concepts-1):
+                            buttons_collection.append(gr.Button(value=f"Add {ordinal(x+2)} concept", visible=visible))
+                        if(x > 0):
+                            delete_collection.append(gr.Button(value=f"Delete {ordinal(x+1)} concept"))
+
+                counter_add = 1
+                for button in buttons_collection:
+                    if(counter_add < len(buttons_collection)):
+                        button.click(lambda:
+                                     [gr.update(visible=True),gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), True, None],
+                                     None,
+                                     [row[counter_add], concept_images_collection[counter_add], buttons_collection[counter_add-1], buttons_collection[counter_add], is_visible[counter_add], concept_images_collection[counter_add]], queue=False)
+                    else:
+                        button.click(lambda:
+                                     [gr.update(visible=True),gr.update(visible=True), gr.update(visible=False), True],
+                                     None,
+                                     [row[counter_add], concept_images_collection[counter_add], buttons_collection[counter_add-1], is_visible[counter_add]], queue=False)
+                    counter_add += 1
+
+                counter_delete = 1
+                for delete_button in delete_collection:
+                    if(counter_delete < len(delete_collection)+1):
+                        if counter_delete == 1:
+                            delete_button.click(lambda:
+                                                [gr.update(visible=False, value=None),gr.update(visible=False), gr.update(visible=True), gr.update(visible=False),False],
+                                                None,
+                                                [concept_images_collection[counter_delete], row[counter_delete], buttons_collection[counter_delete-1], buttons_collection[counter_delete], is_visible[counter_delete]], queue=False)
+                        else:
+                            delete_button.click(lambda:
+                                                [gr.update(visible=False, value=None),gr.update(visible=False), gr.update(visible=True), False],
+                                                None,
+                                                [concept_images_collection[counter_delete], row[counter_delete], buttons_collection[counter_delete-1], is_visible[counter_delete]], queue=False)
+                    counter_delete += 1
                 gr.Markdown('''
+                - We use "\<new1\>" modifier_token in front of the concept, e.g., "\<new1\> cat". For multiple concepts use "\<new2\>", "\<new3\>" etc. Increase the number of steps with more concepts.
+                - For a new concept an e.g. concept prompt is "photo of a \<new1\> cat" and "cat" for class prompt.
+                - For a style concept, use "painting in the style of \<new1\> art" for concept prompt and "art" for class prompt.
+                - Class prompt should be the object category.
+                - If "Train Text Encoder", disable "modifier token" and use any unique text to describe the concept e.g. "ktn cat".
+                ''')
             with gr.Box():
                 gr.Markdown('Training Parameters')
+                with gr.Row():
+                    modifier_token = gr.Checkbox(label='modifier token',
+                                                 value=True)
+                    train_text_encoder = gr.Checkbox(label='Train Text Encoder',
+                                                     value=False)
                 num_training_steps = gr.Number(
                     label='Number of Training Steps', value=1000, precision=0)
                 learning_rate = gr.Number(label='Learning Rate', value=0.00001)
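The one-line `ordinal` helper added in this hunk is the compact ordinal-suffix trick used for the widget labels ("Add 2nd concept", "Delete 3rd concept"). Run standalone, it behaves like this:

```python
# Same expression as in the hunk above, shown standalone to make its output obvious.
ordinal = lambda n: "%d%s" % (n, "tsnrhtdd"[(n // 10 % 10 != 1) * (n % 10 < 4) * n % 10::4])

print([ordinal(n) for n in (1, 2, 3, 4, 11, 21)])
# ['1st', '2nd', '3rd', '4th', '11th', '21st']
```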
@@ -115,6 +169,10 @@ def create_training_demo(trainer: Trainer,
                     label='Number of Gradient Accumulation',
                     value=1,
                     precision=0)
+                num_reg_images = gr.Number(
+                    label='Number of Class Concept images',
+                    value=200,
+                    precision=0)
                 gen_images = gr.Checkbox(label='Generated images as regularization',
                                          value=False)
                 gr.Markdown('''
@@ -122,6 +180,7 @@ def create_training_demo(trainer: Trainer,
                 - Our results in the paper are trained with batch-size 4 (8 including class regularization samples).
                 - Enable gradient checkpointing for lower memory requirements (~14GB) at the expense of slower backward pass.
                 - Note that your trained models will be deleted when the second training is started. You can upload your trained model in the "Upload" tab.
+                - We retrieve real images for class concept using clip_retireval library which can take some time.
                 ''')
 
         run_button = gr.Button('Start Training')
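The last bullet above refers to fetching real class-regularization images with the clip-retrieval library. A minimal sketch of what such a query can look like using clip-retrieval's client; the service URL, index name, and query text are assumptions for illustration, and the Space's training script performs the actual retrieval when `--real_prior` is set:

```python
# Minimal sketch, assuming the public LAION knn service exposed by clip-retrieval.
# Not the code used by this Space; it only illustrates the kind of query behind
# "retrieve real images for class concept".
from clip_retrieval.clip_client import ClipClient

client = ClipClient(
    url='https://knn.laion.ai/knn-service',  # assumed public endpoint
    indice_name='laion5B-L-14',              # assumed index name
    num_images=200,                          # matches the Number of Class Concept images default
)
results = client.query(text='photo of a cat')  # the class prompt, e.g. "cat"
print(results[0].keys())  # entries typically carry 'url', 'caption', 'similarity'
```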
@@ -141,9 +200,6 @@ def create_training_demo(trainer: Trainer,
                         inputs=[
                             base_model,
                             resolution,
-                            concept_images,
-                            concept_prompt,
-                            class_prompt,
                             num_training_steps,
                             learning_rate,
                             train_text_encoder,
@@ -152,8 +208,13 @@ def create_training_demo(trainer: Trainer,
                             batch_size,
                             use_8bit_adam,
                             gradient_checkpointing,
-                            gen_images
+                            gen_images,
+                            num_reg_images,
+                        ] +
+                        concept_images_collection +
+                        concept_prompt_collection +
+                        class_prompt_collection
+                        ,
                         outputs=[
                             training_status,
                             output_files,
inference.py
CHANGED
@@ -75,6 +75,7 @@ class InferencePipeline:
                        height=resolution, width=resolution,
                        eta = eta,
                        generator=generator)  # type: ignore
+        torch.cuda.empty_cache()
         out = out.images
         out = PIL.Image.fromarray(np.hstack([np.array(x) for x in out]))
         return out
trainer.py
CHANGED
@@ -9,6 +9,7 @@ import subprocess
 import gradio as gr
 import PIL.Image
 import torch
+import json
 
 os.environ['PYTHONPATH'] = f'custom-diffusion:{os.getenv("PYTHONPATH", "")}'
 
@@ -45,23 +46,41 @@ class Trainer:
     def cleanup_dirs(self) -> None:
         shutil.rmtree(self.output_dir, ignore_errors=True)
 
+    def prepare_dataset(self, concept_images_collection: list, concept_prompt_collection: list, class_prompt_collection: list, resolution: int) -> None:
         self.instance_data_dir.mkdir(parents=True)
+        concepts_list = []
+
+        for i in range(len(concept_images_collection)):
+            concept_dir = self.instance_data_dir / f'{i}'
+            class_dir = self.class_data_dir / f'{i}'
+            concept_dir.mkdir(parents=True)
+            concept_images = concept_images_collection[i]
+
+            concepts_list.append(
+                {
+                    "instance_prompt": concept_prompt_collection[i],
+                    "class_prompt": class_prompt_collection[i],
+                    "instance_data_dir": f'{concept_dir}',
+                    "class_data_dir": f'{class_dir}'
+                }
+            )
+
+            for i, temp_path in enumerate(concept_images):
+                image = PIL.Image.open(temp_path.name)
+                image = pad_image(image)
+                # image = image.resize((resolution, resolution))
+                image = image.convert('RGB')
+                out_path = concept_dir / f'{i:03d}.jpg'
+                image.save(out_path, format='JPEG', quality=100)
+
+        print(concepts_list)
+        json.dump(concepts_list, open( f'{self.output_dir}/temp.json' , 'w') )
+
+
     def run(
         self,
         base_model: str,
         resolution_s: str,
-        concept_images: list | None,
-        concept_prompt: str,
-        class_prompt: str,
         n_steps: int,
         learning_rate: float,
         train_text_encoder: bool,
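prepare_dataset writes the per-concept metadata to `{self.output_dir}/temp.json`, which run() later hands to the training script via `--concepts_list`. For two uploaded concepts the dumped list looks roughly like this (prompts follow the UI placeholders; directory names are hypothetical, the real paths come from `self.instance_data_dir / i` and `self.class_data_dir / i`):

```python
# Illustrative value of concepts_list for two uploaded concepts.
concepts_list = [
    {
        "instance_prompt": "photo of a <new1> cat",
        "class_prompt": "cat",
        "instance_data_dir": "training_data/0",   # hypothetical path
        "class_data_dir": "class_data/0",         # hypothetical path
    },
    {
        "instance_prompt": "photo of a <new2> wooden pot",
        "class_prompt": "wooden pot",
        "instance_data_dir": "training_data/1",
        "class_data_dir": "class_data/1",
    },
]
```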
@@ -71,32 +90,40 @@ class Trainer:
         use_8bit_adam: bool,
         gradient_checkpointing: bool,
         gen_images: bool,
+        num_reg_images: int,
+        *inputs,
     ) -> tuple[dict, list[pathlib.Path]]:
         if not torch.cuda.is_available():
             raise gr.Error('CUDA is not available.')
 
+        num_concept = 0
+        for i in range(len(inputs) // 3):
+            if inputs[i] != None:
+                num_concept +=1
+
+        print(num_concept, inputs)
+        concept_images_collection = inputs[: num_concept]
+        concept_prompt_collection = inputs[3: 3 + num_concept]
+        class_prompt_collection = inputs[6: 6+num_concept]
         if self.is_running:
             return gr.update(value=self.is_running_message), []
 
+        if concept_images_collection is None:
             raise gr.Error('You need to upload images.')
+        if not concept_prompt_collection:
             raise gr.Error('The concept prompt is missing.')
 
         resolution = int(resolution_s)
 
         self.cleanup_dirs()
+        self.prepare_dataset(concept_images_collection, concept_prompt_collection, class_prompt_collection, resolution)
+        torch.cuda.empty_cache()
         command = f'''
         accelerate launch custom-diffusion/src/diffuser_training.py \
           --pretrained_model_name_or_path={base_model} \
-          --instance_data_dir={self.instance_data_dir} \
           --output_dir={self.output_dir} \
+          --concepts_list={f'{self.output_dir}/temp.json'} \
-          --class_data_dir={self.class_data_dir} \
           --with_prior_preservation --prior_loss_weight=1.0 \
-          --class_prompt="{class_prompt}" \
           --resolution={resolution} \
           --train_batch_size={batch_size} \
           --gradient_accumulation_steps={gradient_accumulation} \
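run() now receives the concept widgets through `*inputs`. Since the UI always registers maximum_concepts = 3 file-upload components, 3 concept prompts, and 3 class prompts (in that order), the hard-coded offsets 3 and 6 in the slices above pick out the prompt groups, and num_concept counts how many upload slots were actually filled. A small standalone illustration with made-up values:

```python
# Hypothetical flat tuple as Gradio would pass it for maximum_concepts = 3 with
# only the first two concepts filled in (real uploads arrive as tempfile
# wrappers, not plain strings; unused prompt boxes come through as '').
inputs = (
    ['cat1.jpg', 'cat2.jpg'], ['pot1.jpg'], None,                 # 3 image uploads
    'photo of a <new1> cat', 'photo of a <new2> wooden pot', '',  # 3 concept prompts
    'cat', 'wooden pot', '',                                      # 3 class prompts
)

# Equivalent to the counting loop in the hunk above.
num_concept = sum(1 for x in inputs[:3] if x is not None)  # -> 2

concept_images_collection = inputs[:num_concept]
concept_prompt_collection = inputs[3:3 + num_concept]
class_prompt_collection = inputs[6:6 + num_concept]
print(num_concept, concept_prompt_collection, class_prompt_collection)
```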
@@ -104,11 +131,14 @@ class Trainer:
           --lr_scheduler="constant" \
           --lr_warmup_steps=0 \
           --max_train_steps={n_steps} \
+          --num_class_images={num_reg_images} \
+          --initializer_token="ktn+pll+ucd" \
+          --scale_lr --hflip
         '''
         if modifier_token:
+            tokens = '+'.join([f'<new{i+1}>' for i in range(num_concept)])
+            command += f' --modifier_token {tokens}'
+
         if not gen_images:
             command += ' --real_prior'
         if use_8bit_adam:
@@ -117,7 +147,7 @@ class Trainer:
             command += f' --train_text_encoder'
         if gradient_checkpointing:
             command += f' --gradient_checkpointing'
+
         with open(self.output_dir / 'train.sh', 'w') as f:
             command_s = ' '.join(command.split())
             f.write(command_s)
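Joined with `' '.join(command.split())` and written to train.sh, the assembled command for a single concept with the defaults shown in this diff comes out roughly as below. The base model, output directory, batch size, and resolution are illustrative, and flags that live in unchanged lines of trainer.py (such as the learning-rate flag) are omitted:

```python
# Roughly what ' '.join(command.split()) produces for one concept with the
# defaults shown in this diff; values marked above as illustrative are not
# taken from the commit.
command_s = (
    'accelerate launch custom-diffusion/src/diffuser_training.py '
    '--pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4 '
    '--output_dir=results --concepts_list=results/temp.json '
    '--with_prior_preservation --prior_loss_weight=1.0 '
    '--resolution=512 --train_batch_size=1 --gradient_accumulation_steps=1 '
    '--lr_scheduler="constant" --lr_warmup_steps=0 --max_train_steps=1000 '
    '--num_class_images=200 --initializer_token="ktn+pll+ucd" '
    '--scale_lr --hflip --modifier_token <new1> --real_prior'
)
```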