import getDB from "@/utils/getDB"
import Head from "next/head"
import Link from "next/link"
import { useRouter } from "next/router"
import { useEffect, useMemo, useState } from "react"
// import styles from '@/styles/Home.module.css'
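// Build-time props: every prompt, plus the models that have at least one stored answer.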
export const getStaticProps = async () => {
const db = await getDB()
const prompts = await db.all(`SELECT * FROM prompts ORDER BY text ASC`)
// get all models that have at least 1 result
const models = await db.all(
`SELECT * FROM models WHERE id IN (SELECT DISTINCT model FROM results) ORDER BY name ASC`
)
return { props: { prompts, models } }
}
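// Landing page: lists the benchmark prompts grouped by type, or the models ranked by score.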
export default function Home({ prompts, models }) {
const router = useRouter()
const [viewBy, setViewBy] = useState(router.query.viewBy || "prompt")
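// Persist the selected view ("prompt" or "model") in the URL query string so it can be shared.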
const changeView = (viewBy) => {
router.push({ query: { viewBy } })
}
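// Keep local state in sync when the query string changes (e.g. back/forward navigation).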
useEffect(() => {
if (router.query.viewBy) setViewBy(router.query.viewBy)
}, [router.query.viewBy])
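// Distinct prompt categories, used to group the prompt list below.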
const types = useMemo(() => {
return Array.from(new Set(prompts.map((p) => p.type)))
}, [prompts])
return (
<>
<Head>
<title>LLM Benchmarks</title>
<meta
name="description"
content="Human-readable benchmarks of 60+ open-source and proprietary LLMs."
/>
<meta name="viewport" content="width=device-width, initial-scale=1" />
</Head>
<main>
<h1>Crowdsourced LLM Benchmark</h1>
<br />
<p>
Benchmarks like HellaSwag are a bit too abstract for me to get a sense
of how well models perform in real-world workflows.
</p>
<br />
<p>
So I wrote a script that sends prompts testing basic reasoning,
instruction following, and creativity to around 60 models that I could
get my hands on through inference APIs.
</p>
<br />
<p>
The script stored all the answers in a SQLite database, and these pages
show the raw results.
</p>
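{/*
A rough sketch of the kind of benchmark script described above (not the
actual code). It assumes an OpenAI-compatible chat completions endpoint;
API_BASE, API_KEY, the runPrompt helper and the `prompt`/`result` columns
of the results table are placeholders (only `model` appears in the query
used by getStaticProps).

async function runPrompt(db, modelApiId, prompt) {
  const res = await fetch(`${process.env.API_BASE}/v1/chat/completions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.API_KEY}`,
    },
    body: JSON.stringify({
      model: modelApiId,
      messages: [{ role: "user", content: prompt.text }],
      temperature: 0, // deterministic answers
      max_tokens: 240, // hence the cropped answers mentioned in the notes
    }),
  })
  const { choices } = await res.json()
  await db.run(
    "INSERT INTO results (model, prompt, result) VALUES (?, ?, ?)",
    modelApiId,
    prompt.id,
    choices[0].message.content
  )
}
*/}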
<br />
<br />
<p>
{`view: `}
<a href="#" onClick={() => changeView("model")}>
models
</a>{" "}
/
<a href="#" onClick={() => changeView("prompt")}>
prompts
</a>{" "}
</p>
<br />
{viewBy === "prompt" ? (
<>
{types.map((type, k) => (
<div key={k}>
<p>{type}:</p>
<br />
<ul>
{prompts
.filter((p) => p.type === type)
.map((prompt, i) => (
<li key={i}>
<pre style={{ maxWidth: 800 }}>
{prompt.text}
<br />
<br />
<Link href={`/${prompt.slug}`}>results</Link>
</pre>
</li>
))}
</ul>
</div>
))}
</>
) : (
<ul>
{[...models]
// rank models by score, highest first
.sort((a, b) => b.score - a.score)
.map((model, i) => (
<li key={i}>
{model.name} -{" "}
<Link
href={`/model/${model.api_id
.split("/")
.pop()
.toLowerCase()}`}
>
results
</Link>{" "}
- score: {model.score}
</li>
))}
</ul>
)}
<br />
<br />
<h3>Notes</h3>
<br />
<ul>
<li>
I used a temperature of 0 and a max token limit of 240 for each test
(that's why a lot of answers are cropped). All other settings were left
at their defaults.
</li>
<li>
I made this with a mix of APIs from OpenRouter, TogetherAI, OpenAI,
Cohere, Aleph Alpha & AI21.
</li>
<li>
<b>This is imperfect.</b> I want to improve this by using better
stop sequences and prompt formatting tailored to each model. But
hopefully it can already make picking models a bit easier.
</li>
<li>
Ideas for the future: public votes to compute an Elo rating,
side-by-side comparison of two models, and community-submitted prompts
(open to suggestions).
</li>
<li>
Prompt suggestions, feedback or say hi: vince [at] llmonitor.com
</li>
<li>
{`Shameless plug: I'm building an `}
<a href="https://github.com/llmonitor/llmonitor" target="_blank">
open-source observability tool for AI devs.
</a>
</li>
</ul>
<br />
<br />
<table style={{ maxWidth: 600 }}>
<tbody>
<tr>
<td>
<p>
Edit: as this got popular, I added an email form to receive
notifications for future benchmark results:
</p>
<iframe
src="https://embeds.beehiiv.com/65bd6af1-2dea-417a-baf2-b65bc27e1610?slim=true"
height="52"
frameBorder="0"
scrolling="no"
style={{
width: 400,
border: "none",
transform: "scale(0.8)",
transformOrigin: "left",
}}
></iframe>
<br />
<small>(no spam, max 1 email per month)</small>
</td>
</tr>
</tbody>
</table>
<br />
</main>
</>
)
}