NyxKrage's picture
Support gated models (#8)
3930797 verified
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<script>
/**
 * Parse a string of HTML markup into a document.
 *
 * @param {string} str - Markup to parse.
 * @returns {Document} The result of DOMParser#parseFromString with the "text/html" type.
 */
function strToHtml(str) {
  return new DOMParser().parseFromString(str, "text/html");
}
// Short, jQuery-independent function to read an HTML table into an array of objects.
// Kudos to RobG at StackOverflow
/**
 * Convert a table into one object per data row, keyed by the header row.
 *
 * The first row supplies the property names; every following row becomes an
 * object whose values are the trimmed text of its cells.
 *
 * @param {?{rows: ArrayLike<{cells: ArrayLike<object>}>}} table - Table element
 *   (or any object exposing `rows` / `cells` the same way). May be null.
 * @returns {Object<string, string>[]} One record per data row. Empty when the
 *   table is missing or has no rows.
 */
function tableToObj(table) {
  // A failed lookup (e.g. querySelector returning null) or an empty table has
  // no records; return an empty list instead of throwing.
  if (!table || !table.rows || table.rows.length === 0) {
    return [];
  }

  // Trimmed text of a cell; a missing cell or a cell without text yields "".
  const cellText = (cell) => {
    if (!cell) {
      return "";
    }
    const text = cell.textContent || cell.innerText || "";
    return text.trim();
  };

  const rows = Array.from(table.rows);
  // Use the first row for the property names.
  const propNames = Array.from(rows[0].cells, (cell) => cellText(cell));

  // Remaining rows are data. Rows shorter than the header get "" for the
  // columns they lack rather than raising a TypeError.
  return rows.slice(1).map((row) => {
    const obj = {};
    propNames.forEach((name, k) => {
      obj[name] = cellText(row.cells[k]);
    });
    return obj;
  });
}
/**
 * Build display labels of the form "<Product Name> - <memory>" for GPU records.
 *
 * Only the part of the "Memory" field before the first comma is kept.
 *
 * @param {Array<Object<string, string>>} gpus - Records with "Product Name" and "Memory" keys.
 * @returns {string[]} One label per record, in input order.
 */
function formatGpu(gpus) {
  const labels = [];
  for (const gpu of gpus) {
    const [memorySize] = gpu["Memory"].split(",");
    labels.push(`${gpu["Product Name"]} - ${memorySize}`);
  }
  return labels;
}
// Bits per weight used for each GGUF quantization type, looked up by the
// quantization name shown in the "Quantization Size" dropdown.
const gguf_quants = {
  IQ1_S: 1.56,
  IQ2_XXS: 2.06,
  IQ2_XS: 2.31,
  IQ2_S: 2.5,
  IQ2_M: 2.7,
  IQ3_XXS: 3.06,
  IQ3_XS: 3.3,
  Q2_K: 3.35,
  Q3_K_S: 3.5,
  IQ3_S: 3.5,
  IQ3_M: 3.7,
  Q3_K_M: 3.91,
  Q3_K_L: 4.27,
  IQ4_XS: 4.25,
  IQ4_NL: 4.5,
  Q4_0: 4.55,
  Q4_K_S: 4.58,
  Q4_K_M: 4.85,
  Q5_0: 5.54,
  Q5_K_S: 5.54,
  Q5_K_M: 5.69,
  Q6_K: 6.59,
  Q8_0: 8.5,
};
/**
 * Fetch a model's config.json from Hugging Face and attach its parameter count.
 *
 * The parameter count is resolved from the first source that works:
 *   1. `model.safetensors.index.json` (metadata.total_size / 2)
 *   2. `pytorch_model.bin.index.json` (metadata.total_size / 2)
 *   3. the model page HTML (fetched through corsproxy.io), reading the
 *      `data-props` JSON of the safetensors params / model header element.
 *
 * @param {string} hf_model - Repository id, e.g. "org/name".
 * @param {string} hf_token - Access token; an empty value sends no Authorization header.
 * @returns {Promise<object>} The parsed config.json with an added `parameters` field.
 * @throws {Error} When config.json cannot be fetched, or no source yields a parameter count.
 */
async function modelConfig(hf_model, hf_token) {
  // Declared locally: a bare `auth = ...` assignment creates an implicit global
  // and is a ReferenceError in strict-mode / module code.
  const auth = hf_token
    ? { headers: { Authorization: `Bearer ${hf_token}` } }
    : {};

  // Fetch JSON and fail with a readable message on HTTP errors (e.g. 401 for a
  // gated repo without a token) instead of an opaque JSON parse error.
  const fetchJson = async (url) => {
    const response = await fetch(url, auth);
    if (!response.ok) {
      throw new Error(`Request for ${url} failed with HTTP ${response.status}`);
    }
    return response.json();
  };

  // Read metadata.total_size from a checkpoint index file and halve it.
  // NOTE(review): the division by 2 presumably assumes 2 bytes per parameter
  // (16-bit weights) — confirm for repos stored in other dtypes.
  const sizeFromIndex = async (file) => {
    const index = await fetchJson(
      `https://huggingface.co./${hf_model}/resolve/main/${file}`
    );
    const size = index["metadata"]["total_size"] / 2;
    if (!Number.isFinite(size)) {
      throw new Error(`no size in ${file} metadata`);
    }
    return size;
  };

  // Last resort: scrape the parameter count from the model page.
  const sizeFromModelPage = async () => {
    const response = await fetch(
      "https://corsproxy.io/?" + encodeURIComponent(`https://huggingface.co./${hf_model}`)
    );
    const el = document.createElement("html");
    el.innerHTML = await response.text();
    const params_el = el.querySelector('div[data-target="ModelSafetensorsParams"]');
    if (params_el !== null) {
      return JSON.parse(params_el.getAttribute("data-props"))["safetensors"]["total"];
    }
    const header_el = el.querySelector('div[data-target="ModelHeader"]');
    if (header_el === null) {
      throw new Error(`Could not determine the parameter count of ${hf_model}`);
    }
    return JSON.parse(header_el.getAttribute("data-props"))["model"]["safetensors"]["total"];
  };

  const config = await fetchJson(
    `https://huggingface.co./${hf_model}/raw/main/config.json`
  );

  let model_size = 0;
  try {
    model_size = await sizeFromIndex("model.safetensors.index.json");
  } catch {
    // Any failure (missing file, no metadata) falls through to the next source.
    try {
      model_size = await sizeFromIndex("pytorch_model.bin.index.json");
    } catch {
      model_size = await sizeFromModelPage();
    }
  }

  config.parameters = model_size;
  return config;
}
/**
 * Sum the element counts of the llama.cpp input tensors.
 *
 * Mirrors the allocations in github:ggerganov/llama.cpp/llama.cpp:11248, where
 * n_embd is the model's hidden size (convert.py:248):
 *   inp_tokens  1d, n_batch
 *   inp_embd    2d, n_embd x n_batch
 *   inp_pos     1d, n_batch
 *   inp_KQ_mask 2d, n_ctx x n_batch
 *   inp_K_shift 1d, n_ctx
 *   inp_sum     2d, 1 x n_batch
 *
 * @param {number} [context=8192] - Context length (n_ctx).
 * @param {object} model_config - Model config; `hidden_size` is read.
 * @param {number} [bsz=512] - Batch size (n_batch).
 * @returns {number} Sum of the six tensor sizes.
 */
function inputBuffer(context=8192, model_config, bsz=512) {
  const tensorSizes = [
    bsz, // inp_tokens
    model_config["hidden_size"] * bsz, // inp_embd
    bsz, // inp_pos
    context * bsz, // inp_KQ_mask
    context, // inp_K_shift
    bsz, // inp_sum
  ];
  return tensorSizes.reduce((total, size) => total + size);
}
/**
 * Estimate the compute buffer size from the context length and head count.
 *
 * The formula is only defined for a batch size of 512. For any other batch
 * size the user is warned via alert() and the 512-based estimate is returned
 * unchanged.
 *
 * @param {number} [context=8192] - Context length.
 * @param {object} model_config - Model config; `num_attention_heads` is read.
 * @param {number} [bsz=512] - Batch size; only used to decide whether to warn.
 * @returns {number} (context / 1024 * 2 + 0.75) * num_attention_heads * 1024 * 1024
 */
function computeBuffer(context=8192, model_config, bsz=512) {
  if (bsz !== 512) {
    alert("Batch sizes other than 512 are currently not supported for the compute buffer; using batch size 512 for the compute buffer calculation, so the end result will be an overestimation");
  }
  return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024;
}
/**
 * Estimate the size in bytes of the key/value cache.
 *
 * @param {number} [context=8192] - Context length.
 * @param {object} model_config - Model config; reads `num_attention_heads`,
 *   `num_key_value_heads`, `hidden_size` and `num_hidden_layers`.
 * @param {number} [cache_bit=16] - Bits per cached element.
 * @returns {number} 2 * (hidden_size / n_gqa) * num_hidden_layers * context * (cache_bit / 8)
 */
function kvCache(context=8192, model_config, cache_bit=16) {
  const n_head = model_config["num_attention_heads"];
  // Configs without grouped-query attention omit num_key_value_heads; treat
  // that as one KV head per attention head instead of producing NaN.
  const configured_kv = model_config["num_key_value_heads"];
  const n_head_kv = configured_kv == null ? n_head : configured_kv;

  const n_gqa = n_head / n_head_kv;
  const n_embd_gqa = model_config["hidden_size"] / n_gqa;
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context);
  // Factor 2: one cache for keys and one for values.
  const size = 2 * n_elements;
  return size * (cache_bit / 8);
}
/**
 * Total context-dependent size: input buffer + KV cache + compute buffer,
 * rounded to two decimal places.
 *
 * @param {number} [context=8192] - Context length.
 * @param {object} model_config - Model config passed through to the three estimators.
 * @param {number} [bsz=512] - Batch size.
 * @param {number} [cache_bit=16] - Bits per KV cache element.
 * @returns {number} The rounded sum.
 */
function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
  const inputPart = inputBuffer(context, model_config, bsz);
  const cachePart = kvCache(context, model_config, cache_bit);
  const computePart = computeBuffer(context, model_config, bsz);
  const total = inputPart + cachePart + computePart;
  return Number.parseFloat(total.toFixed(2));
}
/**
 * Size in bytes of the quantized weights, rounded to two decimal places.
 *
 * @param {object} model_config - Model config; `parameters` is read.
 * @param {number} [bpw=4.5] - Bits per weight.
 * @returns {number} parameters * bpw / 8, rounded.
 */
function modelSize(model_config, bpw=4.5) {
  const totalBits = model_config["parameters"] * bpw;
  const roundedBytes = (totalBits / 8).toFixed(2);
  return Number.parseFloat(roundedBytes);
}
/**
 * Extract the memory size in GB from a GPU label such as
 * "GeForce RTX 3090 - 24 GB".
 *
 * The memory is taken from the text after the LAST hyphen, so product names
 * that contain hyphens themselves (e.g. "Max-Q") are handled. MB and TB
 * values are converted to GB; a number without a unit is read as GB.
 *
 * @param {string} gpu - GPU label.
 * @returns {number} Memory in GB, or NaN when no number is found.
 */
function parseGpuVramGb(gpu) {
  const separator = gpu.lastIndexOf("-");
  const memory = separator === -1 ? gpu : gpu.slice(separator + 1);
  const match = /(\d+(?:\.\d+)?)\s*(MB|GB|TB)?/i.exec(memory);
  if (match === null) {
    return NaN;
  }
  const amount = Number.parseFloat(match[1]);
  const unit = (match[2] || "GB").toUpperCase();
  if (unit === "MB") {
    return amount / 1024;
  }
  if (unit === "TB") {
    return amount * 1024;
  }
  return amount;
}

/**
 * Read the form, estimate model / context / total memory use, and write the
 * results (in units of 2**30 bytes, two decimals) into the result fields.
 *
 * When a GPU is selected, the total field is coloured green (more than 0.5 GB
 * to spare), yellow (fits with 0.5 GB or less to spare) or red (does not fit).
 * The colour is cleared when no GPU is given or its memory cannot be read.
 * Any error is reported to the user through alert().
 *
 * @param {string} format - "gguf" or "exl2"; selects which option inputs are read.
 * @returns {Promise<void>}
 */
async function calculateSizes(format) {
  try {
    const model_config = await modelConfig(
      document.getElementById("modelsearch").value,
      document.getElementById("hf_token").value
    );
    const context = Number.parseInt(document.getElementById("contextsize").value, 10);
    let bsz = 512;
    let cache_bit = 16;
    let bpw = 0;
    if (format === "gguf") {
      bsz = Number.parseInt(document.getElementById("batchsize").value, 10);
      // The button text may carry surrounding whitespace from the markup.
      const quant = document.getElementById("quantsize").innerText.trim();
      bpw = gguf_quants[quant];
      if (bpw === undefined) {
        throw new Error(`Unknown GGUF quantization "${quant}"`);
      }
    } else if (format === "exl2") {
      cache_bit = Number.parseInt(document.getElementById("kvCache").value, 10);
      bpw = Number.parseFloat(document.getElementById("bpw").value);
    }

    const model_size = modelSize(model_config, bpw);
    const context_size = contextSize(context, model_config, bsz, cache_bit);
    const total_size = (model_size + context_size) / 2 ** 30;
    document.getElementById("resultmodel").innerText = (model_size / 2 ** 30).toFixed(2);
    document.getElementById("resultcontext").innerText = (context_size / 2 ** 30).toFixed(2);
    const result_total_el = document.getElementById("resulttotal");
    result_total_el.innerText = total_size.toFixed(2);

    const gpu = document.getElementById("gpusearch").value.trim();
    const vram = gpu === "" ? NaN : parseGpuVramGb(gpu);
    if (!Number.isFinite(vram)) {
      // No (usable) GPU selected: drop any colour left from an earlier run.
      result_total_el.style.backgroundColor = "";
    } else if (vram - total_size > 0.5) {
      result_total_el.style.backgroundColor = "#bef264";
    } else if (vram - total_size > 0) {
      result_total_el.style.backgroundColor = "#facc15";
    } else {
      result_total_el.style.backgroundColor = "#ef4444";
    }
  } catch (e) {
    alert(e);
  }
}
</script>
<link href="./styles.css" rel="stylesheet">
<title>Can I run it? - LLM VRAM Calculator</title>
</head>
<body class="p-8">
<div x-data="{ format: 'gguf' }" class="flex flex-col max-h-screen items-center mt-16 gap-10">
<h1 class="text-xl font-semibold leading-6 text-gray-900">
LLM Model, Can I run it?
</h1>
<p>
To support gated or private repos, you need to <a href="https://huggingface.co./settings/tokens" style="color: #4444ff"><b>create an authentication token</b></a>, check the box <span style="color: #6e1818"><b>"Read access to contents of all public gated repos you can access"</b></span>, and then enter the token in the field below.
</p>
<div class="flex flex-col gap-10">
<div class="w-auto flex flex-col gap-4">
<!-- Hugging Face Authentication Token -->
<div
class="relative"
x-data="{
results: null,
query: null
}"
>
<label
for="hf_token"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>Huggingface Token (optional)</label
>
<input
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
id="hf_token"
/>
</div>
<!-- GPU Selector -->
<div
class="relative"
x-data="{
results: null,
query: null
}"
>
<label
for="gpusearch"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>GPU (optional)</label
>
<input
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
placeholder="GeForce RTX 3090 - 24 GB"
id="gpusearch"
name="gpusearch"
list="gpulist"
x-model="query"
@keypress.debounce.150ms="results = query === '' ? [] : formatGpu(tableToObj(strToHtml(await fetch('https://corsproxy.io/?https://www.techpowerup.com/gpu-specs/?ajaxsrch=' + query).then(r => r.text())).querySelector('table')))"
/>
<datalist id="gpulist">
<template x-for="item in results">
<option :value="item" x-text="item"></option>
</template>
</datalist>
</div>
<!-- Model Selector -->
<div class="flex flex-row gap-4 relative">
<label
for="modelsearch"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Model (unquantized)
</label>
<div
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
x-data="{
open: false,
value: 'Nexusflow/Starling-LM-7B-beta',
results: null,
toggle() {
if (this.open) {
return this.close()
}
this.$refs.input.focus()
this.open = true
},
close(focusAfter) {
if (! this.open) return
this.open = false
focusAfter && focusAfter.focus()
}
}"
x-on:keydown.escape.prevent.stop="close($refs.input)"
x-id="['model-typeahead']"
class="relative"
>
<!-- Input -->
<input
id="modelsearch"
x-ref="input"
x-on:click="toggle()"
@keypress.debounce.150ms="results = (await
fetch('https://huggingface.co./api/quicksearch?type=model&q=' +
encodeURIComponent(value)).then(r => r.json())).models.filter(m => !m.id.includes('GGUF') && !m.id.includes('AWQ') && !m.id.includes('GPTQ') && !m.id.includes('exl2'));"
:aria-expanded="open"
:aria-controls="$id('model-typeahead')"
x-model="value"
class="flex justify-between items-center gap-2 w-full"
/>
<!-- Panel -->
<div
x-ref="panel"
x-show="open"
x-transition.origin.top.left
x-on:click.outside="close($refs.input)"
:id="$id('model-typeahead')"
style="display: none"
class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10"
>
<template x-for="result in results">
<a
@click="value = result.id; close($refs.input)"
x-text="result.id"
class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"
></a>
</template>
</div>
</div>
</div>
<!-- Context Size Selector -->
<div class="relative">
<label
for="contextsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Context Size
</label>
<input
value="8192"
type="number"
name="contextsize"
id="contextsize"
step="1024"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
/>
</div>
<!-- Quant Format Selector -->
<div class="relative">
<label
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>Quant Format</label
>
<fieldset
x-model="format"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
>
<legend class="sr-only">Quant format</legend>
<div
class="space-y-4 sm:flex sm:items-center sm:space-x-10 sm:space-y-0"
>
<div class="flex items-center">
<input
id="gguf-format"
name="quant-format"
type="radio"
value="gguf"
checked
class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600"
/>
<label
for="gguf-format"
class="ml-3 block text-sm font-medium leading-6 text-gray-900"
>GGUF</label
>
</div>
<div class="flex items-center">
<input
id="exl2-format"
name="quant-format"
type="radio"
value="exl2"
class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600"
/>
<label
for="exl2-format"
class="ml-3 block text-sm font-medium leading-6 text-gray-900"
>EXL2</label
>
</div>
<div class="flex items-center">
<input
id="gptq-format"
name="quant-format"
type="radio"
disabled
value="gptq"
class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600"
/>
<label
for="gptq-format"
class="ml-3 block text-sm font-medium leading-6 text-gray-900"
>GPTQ (coming soon)</label
>
</div>
</div>
</fieldset>
</div>
<!-- EXL2 Options -->
<div x-show="format === 'exl2'" class="flex flex-row gap-4">
<div class="relative flex-grow">
<label
for="bpw"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
BPW
</label>
<input
value="4.5"
type="number"
step="0.01"
id="bpw"
name="bpw"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
/>
</div>
<div
class="flex-shrink relative rounded-md"
>
<div
class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
>
<label
for="kvCache"
class="inline-block bg-white text-xs font-medium text-gray-900"
>
KV Cache
</label>
<select id="kvCache" name="kvCache">
<option value="16">16 bit</option>
<option value="8">8 bit</option>
<option value="4">4 bit</option>
</select>
</div>
</div>
</div>
<!-- GGUF Options -->
<div x-show="format === 'gguf'" class="relative">
<div class="flex flex-row gap-4">
<label
for="quantsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Quantization Size
</label>
<div
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
x-data="{
open: false,
value: '',
toggle() {
if (this.open) {
return this.close()
}
this.$refs.button.focus()
this.open = true
},
close(focusAfter) {
if (! this.open) return
this.open = false
focusAfter && focusAfter.focus()
}
}"
x-on:keydown.escape.prevent.stop="close($refs.button)"
x-id="['dropdown-button']"
class="relative"
>
<!-- Button -->
<button
x-ref="button"
x-on:click="toggle()"
:aria-expanded="open"
:aria-controls="$id('dropdown-button')"
type="button"
id="quantsize"
x-text="value.length === 0 ? 'Q4_K_S' : value"
class="flex justify-between items-center gap-2 w-full"
>
Q4_K_S
<!-- Heroicon: chevron-down -->
<svg
xmlns="http://www.w3.org/2000/svg"
class="h-5 w-5 text-gray-400"
viewBox="0 0 20 20"
fill="currentColor"
>
<path
fill-rule="evenodd"
d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"
clip-rule="evenodd"
/>
</svg>
</button>
<!-- Panel -->
<div
x-data="{ quants: [
'IQ1_S',
'IQ2_XXS',
'IQ2_XS',
'IQ2_S',
'IQ2_M',
'IQ3_XXS',
'IQ3_XS',
'Q2_K',
'Q3_K_S',
'IQ3_S',
'IQ3_M',
'Q3_K_M',
'Q3_K_L',
'IQ4_XS',
'IQ4_NL',
'Q4_0',
'Q4_K_S',
'Q4_K_M',
'Q5_0',
'Q5_K_S',
'Q5_K_M',
'Q6_K',
'Q8_0'
]}"
x-ref="panel"
x-show="open"
x-transition.origin.top.left
x-on:click.outside="close($refs.button)"
:id="$id('dropdown-button')"
style="display: none"
class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10"
>
<template x-for="quant in quants">
<a
@click="value = quant; close($refs.button)"
x-text="quant"
class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"
></a>
</template>
</div>
</div>
<div class="relative">
<label
for="batchsize"
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Batch Size
</label>
<input
value="512"
type="number"
step="128"
id="batchsize"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
/>
</div>
</div>
</div>
<button
type="button"
class="rounded-md bg-slate-800 px-3 py-2 text-sm font-semibold text-white shadow-sm hover:bg-slate-700 focus-visible:outline focus-visible:outline-2 focus-visible:outline-offset-2 focus-visible:outline-indigo-600"
@click="calculateSizes(format)"
>
Submit
</button>
</div>
<div class="w-auto flex flex-col gap-4">
<div class="relative">
<label
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Model Size (GB)
</label>
<div
id="resultmodel"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
>4.20</div>
</div>
<div class="relative">
<label
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Context Size (GB)
</label>
<div
id="resultcontext"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
>6.90</div>
</div>
<div class="relative">
<label
class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
>
Total Size (GB)
</label>
<div
id="resulttotal"
class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
>420.69</div>
</div>
</div>
</div>
</div>
<script
src="https://cdn.jsdelivr.net/npm/[email protected]/dist/cdn.min.js"
></script>
<script defer>
// Run one calculation in GGUF mode when this script executes, so the result
// fields are filled from the form's current values instead of the placeholders.
calculateSizes("gguf")
</script>
</body>
</html>