I love this build

#1
by LaughterOnWater

Win10, 64GB RAM, RTX 3090
openchat-3.6-8b-20240522-Q8_0.gguf
This is an exceptional model. Thank you for creating it!
It answered 47 questions in a single thread before decohering at about 15.7K tokens of context.
Throughput started at 67 tok/s and ended at about 47 tok/s.
Nice bake!

Using this as my preset.json file:

{
  "name": "Llama 3 70B Instruct V3",
  "load_params": {
    "n_ctx": 16384,
    "n_batch": 512,
    "rope_freq_base": 1000000,
    "rope_freq_scale": 0.85,
    "n_gpu_layers": 64,
    "use_mlock": true,
    "main_gpu": 0,
    "tensor_split": [0],
    "seed": -1,
    "f16_kv": true,
    "use_mmap": true,
    "no_kv_offload": false,
    "num_experts_used": 0
  },
  "inference_params": {
    "n_threads": 12,
    "n_predict": -1,
    "top_k": 40,
    "min_p": 0.05,
    "top_p": 0.9,
    "temp": 0.2,
    "repeat_penalty": 1.1,
    "input_prefix": "<|start_header_id|>user<|end_header_id|>\n\n",
    "input_suffix": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "antiprompt": ["<|start_header_id|>", "<|eot_id|>"],
    "pre_prompt": "You are a knowledgeable, efficient, and direct AI assistant. Utilize multi-step reasoning to provide concise answers, focusing on key information. If multiple questions are asked, split them up and address in the order that yields the most logical and accurate response. Offer tactful suggestions to improve outcomes. Engage in productive collaboration with the user.",
    "pre_prompt_suffix": "<|eot_id|>",
    "pre_prompt_prefix": "<|start_header_id|>system<|end_header_id|>\n\n",
    "seed": -1,
    "tfs_z": 1,
    "typical_p": 1,
    "repeat_last_n": 64,
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "n_keep": 0,
    "logit_bias": {},
    "mirostat": 0,
    "mirostat_tau": 5,
    "mirostat_eta": 0.1,
    "memory_f16": true,
    "multiline_input": false,
    "penalize_nl": true
  }
}
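
For anyone not using LM Studio, here's a rough llama-cpp-python sketch of the same settings. This is my own mapping of the preset keys to Llama() arguments, not something from the preset itself; the model path and the example question are placeholders, but the sampling values and prompt template strings match the JSON above.

from llama_cpp import Llama

# Load parameters mirroring "load_params" above (path is hypothetical; adjust to yours).
llm = Llama(
    model_path="openchat-3.6-8b-20240522-Q8_0.gguf",
    n_ctx=16384,
    n_batch=512,
    rope_freq_base=1000000,
    rope_freq_scale=0.85,
    n_gpu_layers=64,
    main_gpu=0,
    use_mlock=True,
    use_mmap=True,
    n_threads=12,
    seed=-1,
)

# Llama 3 chat template assembled from the prefix/suffix strings in the preset.
system_msg = "You are a knowledgeable, efficient, and direct AI assistant."
user_msg = "What causes the cosmic microwave background?"  # example question, mine
prompt = (
    "<|start_header_id|>system<|end_header_id|>\n\n" + system_msg + "<|eot_id|>"
    "<|start_header_id|>user<|end_header_id|>\n\n" + user_msg + "<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)

# Sampling parameters mirroring "inference_params" above.
out = llm(
    prompt,
    max_tokens=-1,  # like n_predict: -1, generate until a stop token or the context limit
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    min_p=0.05,
    repeat_penalty=1.1,
    stop=["<|eot_id|>", "<|start_header_id|>"],  # the antiprompt list
)
print(out["choices"][0]["text"])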

openchat-3.6-8b-20240522-Q4_1.gguf
This bake is much faster, starting at 87 tok/s, but it is demonstrably less knowledgeable than Q8_0 when it comes to cosmology.
Still a lovely build.
