Feature(MInference): update information
app.py CHANGED
@@ -14,12 +14,15 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 
 DESCRIPTION = """
-# MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention
+# [MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention](https://aka.ms/MInference) (Under Review, ES-FoMo @ ICML'24)
 _Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
 
 <h2 style="text-align: center;"><a href="https://github.com/microsoft/MInference" target="blank"> [Code]</a>
-<a href="https://
-<a href="https://arxiv.org/abs/
+<a href="https://aka.ms/MInference" target="blank"> [Project Page]</a>
+<a href="https://arxiv.org/abs/2407" target="blank"> [Paper]</a></h2>
+
+## News
+- 🧩 We will present **MInference 1.0** at the _**Microsoft Booth**_ and _**ES-FoMo**_ at ICML'24. See you in Vienna!
 
 <font color="brown"><b>This is only a deployment demo. Due to limited GPU resources, we do not provide an online demo. You will need to follow the code below to try MInference locally.</b></font>
 
@@ -55,7 +58,7 @@ h1 {
 """
 
 # Load the tokenizer and model
-model_name = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
+model_name = "gradientai/Llama-3-8B-Instruct-Gradient-1048k" if torch.cuda.is_available() else "Qwen/Qwen2-0.5B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name, torch_dtype="auto", device_map="auto"