import getDB from "@/utils/getDB"
import Head from "next/head"
import Link from "next/link"
import { useRouter } from "next/router"
import { useEffect, useMemo, useState } from "react"
// import styles from '@/styles/Home.module.css'

export const getStaticProps = async () => {
  const db = await getDB()

  const prompts = await db.all(`SELECT * FROM prompts ORDER BY text ASC`)

  // get all models that have at least 1 result
  const models = await db.all(
    `SELECT * FROM models WHERE id IN (SELECT DISTINCT model FROM results) ORDER BY name ASC`
  )

  return { props: { prompts, models } }
}
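
// Hypothetical sketch of the schema the queries above assume, inferred from
// the columns this page reads (prompts.slug/type/text, models.name/api_id/score,
// results.model); the actual migrations may differ:
//
//   CREATE TABLE prompts (id INTEGER PRIMARY KEY, slug TEXT, type TEXT, text TEXT);
//   CREATE TABLE models (id INTEGER PRIMARY KEY, name TEXT, api_id TEXT, score REAL);
//   CREATE TABLE results (id INTEGER PRIMARY KEY, model INTEGER, prompt INTEGER, output TEXT);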
export default function Home({ prompts, models }) {
  const router = useRouter()

  const [viewBy, setViewBy] = useState(router.query.viewBy || "prompt")

  // store the current view in the URL query so it survives reloads and is shareable
  const changeView = (e, viewBy) => {
    e.preventDefault() // the view links use href="#", so stop the jump to top
    router.push({ query: { viewBy } })
  }

  // keep local state in sync with the query (e.g. back/forward navigation)
  useEffect(() => {
    if (router.query.viewBy) setViewBy(router.query.viewBy)
  }, [router.query.viewBy])

  const types = useMemo(() => {
    return Array.from(new Set(prompts.map((p) => p.type)))
  }, [prompts])
  return (
    <>
      <Head>
        <title>LLM Benchmarks</title>
        <meta
          name="description"
          content="Human-readable benchmarks of 60+ open-source and proprietary LLMs."
        />
        <meta name="viewport" content="width=device-width, initial-scale=1" />
      </Head>
      <main>
        <h1>Crowdsourced LLM Benchmark</h1>
        <br />
        <p>
          Benchmarks like HellaSwag are a bit too abstract for me to get a
          sense of how well models actually perform in real-world workflows.
        </p>
        <br />
        <p>
          I had the idea of writing a script that runs prompts testing basic
          reasoning, instruction following, and creativity against around 60
          models that I could get my hands on through inference APIs.
        </p>
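        {/* a rough, hypothetical sketch of what such a script could look
            like is commented at the bottom of this file */}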
        <br />
        <p>
          The script stored all the answers in a SQLite database; those are
          the raw results you see here.
        </p>
        <br />
        <br />
        <p>
          {`view: `}
          <a href="#" onClick={(e) => changeView(e, "model")}>
            models
          </a>{" "}
          /{" "}
          <a href="#" onClick={(e) => changeView(e, "prompt")}>
            prompts
          </a>
        </p>
        <br />
        {viewBy === "prompt" ? (
          <>
            {types.map((type, k) => (
              <div key={k}>
                <p>{type}:</p>
                <br />
                <ul>
                  {prompts
                    .filter((p) => p.type === type)
                    .map((prompt, i) => (
                      <li key={i}>
                        <pre style={{ maxWidth: 800 }}>
                          {prompt.text}
                          <br />
                          <br />
                          <Link href={`/${prompt.slug}`}>results</Link>
                        </pre>
                      </li>
                    ))}
                </ul>
              </div>
            ))}
          </>
        ) : (
          <ul>
            {/* rank models by score, highest first (copy to avoid mutating props) */}
            {[...models]
              .sort((a, b) => b.score - a.score)
              .map((model, i) => (
                <li key={i}>
                  {model.name} -{" "}
                  <Link
                    href={`/model/${model.api_id
                      .split("/")
                      .pop()
                      .toLowerCase()}`}
                  >
                    results
                  </Link>{" "}
                  - score: {model.score}
                </li>
              ))}
          </ul>
        )}
        <br />
        <br />
        <h3>Notes</h3>
        <br />
        <ul>
          <li>
            I used a temperature of 0 and a max token limit of 240 for each
            test (that's why a lot of answers are cropped). Everything else
            was left at default settings.
          </li>
          <li>
            I made this with a mix of APIs from OpenRouter, TogetherAI,
            OpenAI, Cohere, Aleph Alpha & AI21.
          </li>
          <li>
            <b>This is imperfect.</b> I want to improve this by using better
            stop sequences and prompt formatting tailored to each model. But
            hopefully it can already make picking models a bit easier.
          </li>
          <li>
            Ideas for the future: public votes to compute an Elo rating,
            comparing 2 models side by side, community-submitted prompts
            (open to suggestions).
          </li>
          <li>
            Prompt suggestions, feedback, or just to say hi: vince [at]
            llmonitor.com
          </li>
          <li>
            {`Shameless plug: I'm building an `}
            <a
              href="https://github.com/llmonitor/llmonitor"
              target="_blank"
              rel="noreferrer"
            >
              open-source observability tool for AI devs.
            </a>
          </li>
        </ul>
        <br />
        <br />
        <table style={{ maxWidth: 600 }}>
          <tbody>
            <tr>
              <td>
                <p>
                  Edit: as this got popular, I added an email form to receive
                  notifications for future benchmark results:
                </p>
                <iframe
                  src="https://embeds.beehiiv.com/65bd6af1-2dea-417a-baf2-b65bc27e1610?slim=true"
                  height="52"
                  frameBorder="0"
                  scrolling="no"
                  style={{
                    width: 400,
                    border: "none",
                    transform: "scale(0.8)",
                    transformOrigin: "left",
                  }}
                ></iframe>
                <br />
                <small>(no spam, max 1 email per month)</small>
              </td>
            </tr>
          </tbody>
        </table>
        <br />
      </main>
    </>
  )
}
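
// For reference, a minimal, hypothetical sketch of the kind of collection
// script described in the page copy above. This is an assumption of how it
// could look, not the actual script: the `complete` helper and the `results`
// columns are made up for illustration.
//
// import getDB from "@/utils/getDB"
//
// async function runBenchmarks(models, complete) {
//   const db = await getDB()
//   const prompts = await db.all(`SELECT * FROM prompts`)
//
//   for (const model of models) {
//     for (const prompt of prompts) {
//       // temperature 0 and a 240-token cap, as noted in the page copy
//       const output = await complete(model.api_id, prompt.text, {
//         temperature: 0,
//         max_tokens: 240,
//       })
//       await db.run(
//         `INSERT INTO results (model, prompt, output) VALUES (?, ?, ?)`,
//         model.id,
//         prompt.id,
//         output
//       )
//     }
//   }
// }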