Spaces:
Running on T4
Update app.py
Browse files
app.py
CHANGED
@@ -350,16 +350,17 @@ def load_models_and_documents():
|
|
350 |
providers=['CPUExecutionProvider']
|
351 |
)
|
352 |
|
353 |
-
st.write('Downloading and Loading Mistral
|
354 |
|
355 |
llm = LLM(
|
356 |
-
model='
|
|
|
357 |
tensor_parallel_size=1,
|
358 |
trust_remote_code=True,
|
359 |
enforce_eager=True,
|
360 |
quantization="awq",
|
361 |
-
gpu_memory_utilization=0.
|
362 |
-
max_model_len=
|
363 |
dtype=torch.float16,
|
364 |
max_num_seqs=128
|
365 |
)
|
|
|
350 |
providers=['CPUExecutionProvider']
|
351 |
)
|
352 |
|
353 |
+
st.write('Downloading and Loading Mistral v0.2 by AWS Prototyping quantized with AWQ and using Outlines + vLLM Engine as backend...')
|
354 |
|
355 |
llm = LLM(
|
356 |
+
model='"aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ"',
|
357 |
+
revision='MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM',
|
358 |
tensor_parallel_size=1,
|
359 |
trust_remote_code=True,
|
360 |
enforce_eager=True,
|
361 |
quantization="awq",
|
362 |
+
gpu_memory_utilization=0.7,
|
363 |
+
max_model_len=12288,
|
364 |
dtype=torch.float16,
|
365 |
max_num_seqs=128
|
366 |
)
|