BK-Lee committed
Commit c75a3b7 · 1 Parent(s): a56928d
Files changed (2)
  1. app.py +50 -51
  2. trol/load_trol.py +4 -4
app.py CHANGED
@@ -1,5 +1,5 @@
  # A100 Zero GPU
- import spaces
+ # import spaces

  # TroL Package
  import torch
@@ -18,8 +18,8 @@ from transformers import TextIteratorStreamer
  from torchvision.transforms.functional import pil_to_tensor

  # flash attention
- import subprocess
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+ # import subprocess
+ # subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

  # accel
  accel = Accelerator()
@@ -55,7 +55,7 @@ def threading_function(inputs, image_token_number, streamer, device, model, toke
      generation_kwargs.update({'use_cache': True})
      return model.generate(**generation_kwargs)

- @spaces.GPU
+ # @spaces.GPU
  def bot_streaming(message, history, link, temperature, new_max_token, top_p):

      # model selection
@@ -70,53 +70,52 @@ def bot_streaming(message, history, link, temperature, new_max_token, top_p):
          tokenizer = tokenizer_7

      # cpu -> gpu
-     for param in model.parameters():
-         if not param.is_cuda:
-             param.data = param.to(accel.device)
-
-     try:
-         # prompt type -> input prompt
-         image_token_number = None
-         if len(message['files']) == 1:
-             # Image Load
-             image = pil_to_tensor(Image.open(message['files'][0]).convert("RGB"))
-             if "3.8B" not in link:
-                 image_token_number = 1225
-                 image = F.interpolate(image.unsqueeze(0), size=(490, 490), mode='bicubic').squeeze(0)
-             inputs = [{'image': image.to(accel.device), 'question': message['text']}]
-         elif len(message['files']) > 1:
-             raise Exception("No way!")
-         else:
-             inputs = [{'question': message['text']}]
-
-         # Text Generation
-         with torch.inference_mode():
-             # kwargs
-             streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-             # Threading generation
-             thread = Thread(target=threading_function, kwargs=dict(inputs=inputs,
-                                                                    image_token_number=image_token_number,
-                                                                    streamer=streamer,
-                                                                    model=model,
-                                                                    tokenizer=tokenizer,
-                                                                    device=accel.device,
-                                                                    temperature=temperature,
-                                                                    new_max_token=new_max_token,
-                                                                    top_p=top_p))
-             thread.start()
-
-             # generated text
-             generated_text = ""
-             for new_text in streamer:
-                 generated_text += new_text
-             generated_text
-
-         # Text decoding
-         response = output_filtering(generated_text, model)
-
-     except:
-         response = "There may be unsupported format: ex) pdf, video, sound. Only supported is a single image in this version."
+     # for param in model.parameters():
+     #     if not param.is_cuda:
+     #         param.data = param.to(accel.device)
+
+     # prompt type -> input prompt
+     image_token_number = None
+     if len(message['files']) == 1:
+         # Image Load
+         image = pil_to_tensor(Image.open(message['files'][0]).convert("RGB"))
+         if "3.8B" not in link:
+             image_token_number = 1225
+             image = F.interpolate(image.unsqueeze(0), size=(490, 490), mode='bicubic').squeeze(0)
+         inputs = [{'image': image.to(accel.device), 'question': message['text']}]
+     elif len(message['files']) > 1:
+         raise Exception("No way!")
+     else:
+         inputs = [{'question': message['text']}]
+
+     # Text Generation
+     with torch.inference_mode():
+         # kwargs
+         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+         # Threading generation
+         thread = Thread(target=threading_function, kwargs=dict(inputs=inputs,
+                                                                image_token_number=image_token_number,
+                                                                streamer=streamer,
+                                                                model=model,
+                                                                tokenizer=tokenizer,
+                                                                device=accel.device,
+                                                                temperature=temperature,
+                                                                new_max_token=new_max_token,
+                                                                top_p=top_p))
+         thread.start()
+
+         # generated text
+         generated_text = ""
+         for new_text in streamer:
+             generated_text += new_text
+         generated_text
+
+     # Text decoding
+     response = output_filtering(generated_text, model)
+
+     # except:
+     #     response = "There may be unsupported format: ex) pdf, video, sound. Only supported is a single image in this version."

      # private log print
      text = message['text']
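
Not part of the commit, for context only: the streaming path that this change keeps follows the usual transformers pattern of running generate() in a worker thread while the main thread drains a TextIteratorStreamer. A minimal, self-contained sketch of that pattern, using a placeholder gpt2 checkpoint and prompt text rather than the TroL models and Gradio inputs:

# Sketch only: placeholder checkpoint, not the TroL weights used in app.py.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")      # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")   # placeholder model

inputs = tokenizer("Describe the image:", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

generation_kwargs = dict(**inputs,
                         streamer=streamer,
                         max_new_tokens=64,
                         do_sample=True,
                         temperature=0.9,
                         top_p=0.95,
                         use_cache=True,
                         pad_token_id=tokenizer.eos_token_id)

# generate() blocks, so it runs in a background thread while this thread streams text
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

generated_text = ""
for new_text in streamer:   # yields decoded text chunks as they are produced
    generated_text += new_text
print(generated_text)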
trol/load_trol.py CHANGED
@@ -14,14 +14,14 @@ def load_trol(link):
      if link == 'TroL-1.8B':
          from .arch_internlm2.modeling_trol import TroLForCausalLM
          from .arch_internlm2.tokenization_internlm2 import InternLM2Tokenizer as TroLTokenizer
-         bits = 4
+         bits = 16
          path = TROL_1_8B
          bit_quant_skip = ["vit", "vision_proj", "ffn", "output"]

      elif link == 'TroL-3.8B':
          from trol.arch_phi3.modeling_trol import TroLForCausalLM
          from transformers import LlamaTokenizerFast as TroLTokenizer
-         bits = 8
+         bits = 16
          path = TROL_3_8B
          bit_quant_skip = ["vision_model", "vision_proj", "lm_head"]

@@ -64,8 +64,8 @@ def load_trol(link):
      # Loading tokenizer & Loading backbone model (error -> then delete flash attention)
      tok_trol = TroLTokenizer.from_pretrained(path, padding_side='left')
      try:
-         trol = TroLForCausalLM.from_pretrained(path, **huggingface_config)
+         trol = TroLForCausalLM.from_pretrained(path, **huggingface_config).cuda()
      except:
          del huggingface_config["attn_implementation"]
-         trol = TroLForCausalLM.from_pretrained(path, **huggingface_config)
+         trol = TroLForCausalLM.from_pretrained(path, **huggingface_config).cuda()
      return trol, tok_trol
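
Not part of the commit, only a reading of it: with bits forced to 16 here and the flash-attn pip install dropped in app.py, loading comes down to a 16-bit (non-quantized) model moved onto the GPU at load time, with a retry that removes the attn_implementation override when flash-attention is unavailable. A rough sketch of that load path under those assumptions, using generic transformers classes and a hypothetical checkpoint path in place of the TroL-specific ones:

# Sketch only: "trol-checkpoint" is a hypothetical path, and the real code uses
# TroLForCausalLM / TroLTokenizer rather than the Auto classes shown here.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "trol-checkpoint"
huggingface_config = dict(torch_dtype=torch.float16,                 # 16-bit load, no 4/8-bit quantization
                          attn_implementation="flash_attention_2")   # preferred attention backend

tokenizer = AutoTokenizer.from_pretrained(path, padding_side="left")
try:
    model = AutoModelForCausalLM.from_pretrained(path, **huggingface_config).cuda()
except Exception:
    # flash-attn is no longer installed by app.py, so fall back to the default attention
    del huggingface_config["attn_implementation"]
    model = AutoModelForCausalLM.from_pretrained(path, **huggingface_config).cuda()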