import os
# import torch
import transformers
import gradio as gr
# from huggingface_hub import hf_hub_download
from huggingface_hub import snapshot_download
import safetensors
# from transformer_engine.pytorch import fp8_autocast
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
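# Quantize weights to 4-bit with bitsandbytes so the model fits in limited GPU memory (a Tesla T4 here)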
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
import torch
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Tesla T4
# os.environ['HF_HOME'] = '/data/.huggingface'
auth_token = os.environ.get("HF_TOKEN") or True  # fall back to the locally cached Hugging Face login if HF_TOKEN is unset
model_id = "fcastanedo/energy_v1"
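# Every file in the model repo (30 safetensors shards plus tokenizer/config); only referenced by the commented-out per-file download below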
files_to_download = [
"config.json",
"model-00001-of-00030.safetensors",
"model-00002-of-00030.safetensors",
"model-00003-of-00030.safetensors",
"model-00004-of-00030.safetensors",
"model-00005-of-00030.safetensors",
"model-00006-of-00030.safetensors",
"model-00007-of-00030.safetensors",
"model-00008-of-00030.safetensors",
"model-00009-of-00030.safetensors",
"model-00010-of-00030.safetensors",
"model-00011-of-00030.safetensors",
"model-00012-of-00030.safetensors",
"model-00013-of-00030.safetensors",
"model-00014-of-00030.safetensors",
"model-00015-of-00030.safetensors",
"model-00016-of-00030.safetensors",
"model-00017-of-00030.safetensors",
"model-00018-of-00030.safetensors",
"model-00019-of-00030.safetensors",
"model-00020-of-00030.safetensors",
"model-00021-of-00030.safetensors",
"model-00022-of-00030.safetensors",
"model-00023-of-00030.safetensors",
"model-00024-of-00030.safetensors",
"model-00025-of-00030.safetensors",
"model-00026-of-00030.safetensors",
"model-00027-of-00030.safetensors",
"model-00028-of-00030.safetensors",
"model-00029-of-00030.safetensors",
"model-00030-of-00030.safetensors",
"special_tokens_map.json",
"tokenizer.json",
"tokenizer_config.json"
]
'''
# Directory to store downloaded files
model_dir = f"./{model_id}"
os.makedirs(model_dir, exist_ok=True)
'''
# Use /data for persistent storage
model_dir = f"/data/{model_id}"
os.makedirs(model_dir, exist_ok=True)
# snapshot_download(repo_id=model_id, ignore_patterns="*.bin", token=auth_token)
# '''
# Download model to persistent storage (if not already there)
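# Two snapshot_download passes: the first skips *.bin, the second fetches the remaining files (e.g. any *.bin) that the first pass ignored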
if not os.path.exists(model_dir) or not os.listdir(model_dir):
print("Downloading Weights")
snapshot_download(repo_id=model_id, local_dir=model_dir, ignore_patterns="*.bin", token=auth_token)
snapshot_download(repo_id=model_id, local_dir=model_dir, ignore_patterns=["*.safetensors", "*.json"], token=auth_token)
# '''
# snapshot_download(repo_id=model_id, local_dir=model_dir, ignore_patterns=["*.safetensors", "*.json"], token=auth_token)
'''
# Download each file
for file in files_to_download:
hf_hub_download(repo_id=model_id, filename=file, local_dir=model_dir, token=auth_token)
'''
'''
with fp8_autocast():  # Enables FP8 computations
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_dir,
        # state_dict=state_dict,
        torch_dtype=torch.float16  # Load in FP16 first, then convert
    )
'''
# Load the model manually from local files
# model = transformers.AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=torch.int8)
# model = transformers.AutoModelForCausalLM.from_pretrained(model_dir, load_in_4bit=True)
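# bitsandbytes quantizes the weights to 4-bit at load time; a CUDA GPU is required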
model = transformers.AutoModelForCausalLM.from_pretrained(model_dir, quantization_config=quantization_config)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
'''
model.to(dtype=torch.float16) # Load as FP16 first
model = model.half() # Convert to FP8-like (closest possible)
'''
# Create pipeline with manually loaded model & tokenizer
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    # model_kwargs={"torch_dtype": torch.int8},
    tokenizer=tokenizer,
    # device=3,
    # device="cuda",  # not needed: 4-bit bitsandbytes models cannot be moved with .to()/device
    # device_map="auto",
)
'''
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    token=auth_token,
    device=3
    # device_map="auto",
)
'''
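# One-off generation run at startup to sanity-check the model and chat template before launching the UI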
messages = [
    {
        "role": "system",
        "content": "You are an expert in Oil, Gas, and Petroleum for certifications like Petroleum Engineering Certificate (SPE). You will be provided Multiple Choice Questions. Select the correct response out of the four choices."
    },
    {
        "role": "user",
        "content": "Who are you?"
    }
]
prompt = pipeline.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
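# Stop on either the tokenizer's EOS token or the <|eot_id|> end-of-turn token used by Llama-3-style chat templates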
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
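# Gradio callback: rebuilds the chat prompt from the UI-supplied system prompt and user message, then generates a reply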
def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": message}]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    outputs = pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature + 0.1,  # offset keeps temperature strictly positive; do_sample=True rejects temperature=0
        top_p=0.9,
    )
    # Strip the prompt so only the newly generated reply is returned to the UI
    return outputs[0]["generated_text"][len(prompt):]
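# ChatInterface wires chat_function to a web UI; additional_inputs exposes the system prompt, max-token, and temperature controls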
gr.ChatInterface(
    chat_function,
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    chatbot=gr.Chatbot(height=400),
    additional_inputs=[
        gr.Textbox("You are a helpful AI", label="System Prompt"),
        gr.Slider(500, 4000, label="Max New Tokens"),
        gr.Slider(0, 1, label="Temperature")
    ]
).launch()