import getDB from "@/utils/getDB"
import Head from "next/head"
import Link from "next/link"
import { useRouter } from "next/router"
import { useEffect, useMemo, useState } from "react"
// import styles from '@/styles/Home.module.css'

// getDB returns the SQLite database produced by the collection script
// (a hedged sketch of the assumed helper and schema is in a comment at the bottom of this file)
export const getStaticProps = async () => {
  const db = await getDB()

  const prompts = await db.all(`SELECT * FROM prompts ORDER BY text ASC`)

  // get all models that have at least 1 result
  const models = await db.all(
    `SELECT * FROM models WHERE id IN (SELECT DISTINCT model FROM results) ORDER BY name ASC`
  )

  return { props: { prompts, models } }
}

export default function Home({ prompts, models }) {
  const router = useRouter()

  // "prompt" or "model"; kept in sync with the ?viewBy= query param
  const [viewBy, setViewBy] = useState(router.query.viewBy || "prompt")

  const changeView = (viewBy) => {
    router.push({ query: { viewBy } })
  }

  useEffect(() => {
    if (router.query.viewBy) setViewBy(router.query.viewBy)
  }, [router.query.viewBy])

  // distinct prompt categories, preserving the order the prompts come back in
  const types = useMemo(() => {
    return Array.from(new Set(prompts.map((p) => p.type)))
  }, [prompts])

  return (
    <>
      <Head>
        <title>LLM Benchmarks</title>
      </Head>

      <h1>Crowdsourced LLM Benchmark</h1>


      <p>
        Benchmarks like HellaSwag are a bit too abstract for me to get a sense
        of how well models actually perform in real-world workflows.
      </p>


      <p>
        I had the idea of writing a script that runs prompts testing basic
        reasoning, instruction following, and creativity against around 60
        models that I could get my hands on through inference APIs.
      </p>
      {/* a hedged sketch of such a script is in a comment at the bottom of this file */}


      <p>
        The script stored all the answers in a SQLite database, and those are
        the raw results.
      </p>



      <p>
        {`view: `}
        <a onClick={() => changeView("model")}>models</a>{" "}
        / <a onClick={() => changeView("prompt")}>prompts</a>
      </p>


      {viewBy === "prompt" ? (
        <>
          {types.map((type, k) => (
            <div key={k}>
              <p>{type}:</p>
              <ul>
                {/* the link target below is an assumption; point it at the real prompt route */}
                {prompts
                  .filter((p) => p.type === type)
                  .map((prompt) => (
                    <li key={prompt.id}>
                      <Link href={`/prompts/${prompt.id}`}>{prompt.text}</Link>
                    </li>
                  ))}
              </ul>
            </div>
          ))}
        </>
      ) : (
        <ul>
          {/* the link target below is an assumption; point it at the real model route */}
          {models.map((model) => (
            <li key={model.id}>
              <Link href={`/models/${model.id}`}>{model.name}</Link>
            </li>
          ))}
        </ul>
      )}

      <h3>Notes</h3>




      <p>
        Edit: as this got popular, I added an email form so you can get
        notified of future benchmark results:
      </p>
      {/* email signup form */}


      <p>(no spam, max 1 email per month)</p>

    </>
  )
}
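
/*
  A minimal sketch of the getDB helper imported at the top of this file, which
  lives in @/utils/getDB and is not shown here. It assumes the `sqlite` and
  `sqlite3` npm packages and a local `benchmark.db` file; both the package
  choice and the filename are assumptions.

    import sqlite3 from "sqlite3"
    import { open } from "sqlite"

    let db
    export default async function getDB() {
      // reuse a single connection across getStaticProps calls
      if (!db) {
        db = await open({ filename: "./benchmark.db", driver: sqlite3.Database })
      }
      return db
    }

  The queries in getStaticProps only require a schema roughly like the one
  below. Columns other than prompts.text, prompts.type, models.id, models.name
  and results.model are assumptions.

    CREATE TABLE prompts (id INTEGER PRIMARY KEY, type TEXT, text TEXT);
    CREATE TABLE models  (id INTEGER PRIMARY KEY, name TEXT, api_id TEXT);
    CREATE TABLE results (
      id INTEGER PRIMARY KEY,
      model INTEGER REFERENCES models (id),
      prompt INTEGER REFERENCES prompts (id),
      response TEXT
    );
*/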
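
/*
  A rough sketch of the collection script described in the page copy: it runs
  every prompt against every model through an inference API and stores the
  answers in the same SQLite database. The OpenAI-style endpoint, the
  OPENAI_API_KEY env var and the models.api_id column are assumptions; the
  real script most likely talks to several different providers. Run as an ES
  module (e.g. node collect.mjs) so top-level await works.

    import sqlite3 from "sqlite3"
    import { open } from "sqlite"

    async function ask(model, prompt) {
      const res = await fetch("https://api.openai.com/v1/chat/completions", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
        },
        body: JSON.stringify({
          model,
          messages: [{ role: "user", content: prompt.text }],
        }),
      })
      const json = await res.json()
      return json.choices[0].message.content
    }

    const db = await open({ filename: "./benchmark.db", driver: sqlite3.Database })
    const prompts = await db.all(`SELECT * FROM prompts`)
    const models = await db.all(`SELECT * FROM models`)

    for (const model of models) {
      for (const prompt of prompts) {
        const answer = await ask(model.api_id, prompt)
        // store the raw answer so the site can render it as-is
        await db.run(
          `INSERT INTO results (model, prompt, response) VALUES (?, ?, ?)`,
          model.id,
          prompt.id,
          answer
        )
      }
    }
*/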