# omwdataset / main.py -- omkarenator -- "initial commit" (e137e27) -- 12.6 kB
from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results
# Elements passed to ``fast_app`` through ``hdrs``: charset/viewport metadata,
# the Distill template script, htmx, Plotly, the local stylesheet, and the
# Markdown / syntax-highlighting helpers.
# NOTE(review): the htmx (`@next`) and Plotly (`plotly-latest`) URLs are moving
# CDN tags rather than pinned versions -- confirm that is intended.
_PAGE_HEADERS = (
    Meta(charset="UTF-8"),
    Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
    Script(src="https://distill.pub/template.v2.js"),
    Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
    Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
    Link(rel="stylesheet", href="style.css"),
    MarkdownJS(),
    HighlightJS(langs=["python", "javascript", "html", "css"]),
)

# Application object and route decorator shared by the whole module.
# NOTE(review): `debug=True` is hard-coded -- confirm before deploying.
app, rt = fast_app(debug=True, pico=False, hdrs=_PAGE_HEADERS)
@app.get("/")
def main():
    """Build the landing page served at ``/``.

    The page is a single ``Div`` holding the front matter, the title banner
    (headline plus logo image) and the article body. The article body is a
    table of contents followed by the content returned by ``intro()``. Every
    table-of-contents entry carries ``hx_get``/``hx_target`` attributes aimed
    at the ``#inner-text`` element.
    """
    content_target = "#inner-text"

    # (label, url) pairs for the sub-sections listed under the intro entry.
    intro_anchors = (
        ("Introduction", "/intro#section1"),
        ("Background", "/intro#section2"),
        ("Main Content", "/intro#section3"),
        ("Conclusion", "/intro#section4"),
    )
    # (label, route) pairs for the remaining top-level entries.
    chapters = (
        ("Web Data", "/webdata"),
        ("Curated Sources", "/curated"),
        ("Common Steps", "/common"),
        ("TxT360 Results", "/results"),
    )

    title_banner = D_title(
        H1(
            "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    intro_entry = Div(
        A("TxT360", href="#_self"),
        hx_get="/intro",
        hx_target=content_target,
    )
    intro_section_list = Div(
        Ul(
            *[
                Li(A(label, href=url, hx_get=url, hx_target=content_target))
                for label, url in intro_anchors
            ]
        ),
    )
    chapter_entries = [
        Div(
            A(label, href="#inner-text"),
            hx_get=route,
            hx_target=content_target,
        )
        for label, route in chapters
    ]

    table_of_contents = Nav(
        H3("Table of Contents"),
        intro_entry,
        intro_section_list,
        *chapter_entries,
        role="navigation",
        cls="l-text figcaption",
    )

    return Div(
        D_front_matter(),
        title_banner,
        D_article(
            D_contents(table_of_contents),
            intro(),
        ),
    )
# Body copy for the four sections rendered by ``intro()``. The strings are
# kept verbatim, including their embedded line breaks.
_INTRODUCTION_TEXT = """We are excited to introduce TxT360, a
large-scale, comprehensive, and fully transparent
dataset designed for Large Language Model (LLM)
pre-training. TxT360 is engineered to strike a
balance between the quantity and quality of
pre-training data, pushing the limit on both
fronts. This comprehensive dataset encompasses both
expansive web-based data and highly curated data
sources, making it one of the most robust LLM
pre-training corpora available today. Our web data
component includes 99 snapshots from Common Crawl,
amassing 5.7 trillion tokens and occupying 11 TB of
disk space in jsonl.gz format. On the curated side,
TxT360 integrates one of the most extensive
collections of high-quality sources across multiple
domains, ensuring diverse and rich content referred
to as curated sources, 14 sources across 10
domains. To maintain the highest quality, we
meticulously pre-processed the web data to filter
out low-quality content and conducted thorough
reviews of the curated sources. This process not
only unified their formats but also identified and
rectified any anomalies. Not only do we 100%
open-source our processing scripts, but we also
release the details of our data reviews, revealing
the decision-making processes behind data selection
and quality assurance. This level of transparency
allows researchers and practitioners to fully
understand the dataset’s composition and make
informed decisions when using TxT360 for training.
Additionally, TxT360 includes detailed
documentation and analysis of the data, covering
distribution statistics, domain coverage, and
processing pipeline, which helps users navigate and
utilize the dataset effectively. Overall, TxT360
represents a significant step forward in the
availability and transparency of large-scale
training data for language models, setting a new
standard for dataset quality and openness."""

_BACKGROUND_TEXT = """ The quality and size of a pre-training dataset
play a crucial role in the performance of large
language models (LLMs). The community has
introduced a variety of datasets for this purpose,
including purely web-based datasets like RefinedWeb
[1], RedPajama-Data-V2 [2], DCLM [3], and
FineWeb [4], as well as comprehensive datasets
derived from multiple highly-curated data sources
such as The Pile [5], RedPajama-Data-V1 [6], and
Dolma [7] . It is commonly known that web-based
datasets provide a vast quantity of data, while
highly-curated multi-source datasets consistently
deliver high quality and diversity, both critical
for effective LLM pre-training. However, despite
the advancements in both types of data, each type
of dataset has its limitations. For instance, the
processing scripts for the web dataset, RefinedWeb,
known for its high quality, are not public, and
only about 10% of the entire dataset has been
disclosed. Conversely, the web component of
existing highly-curated multi-source datasets is
relatively small compared to purely web-based
datasets, limiting their coverage and diversity
compared to the scale of information from the
internet. By integrating the extensive reach of
web data with the exceptional quality of curated
sources, TxT360 is crafted to meet and surpass the
rigorous standards required for state-of-the-art
LLM pre-training. """

# NOTE(review): this text talks about FineWeb / FineWeb-Edu rather than
# TxT360 -- it looks like placeholder copy; confirm before publishing.
_MAIN_CONTENT_TEXT = """The performance of a large language model (LLM)
depends heavily on the quality and size of its
pretraining dataset. However, the pretraining
datasets for state-of-the-art open LLMs like Llama
3 and Mixtral are not publicly available and very
little is known about how they were created.
Reading time: 45 min. For the best reading
experience, we recommend not using a mobile phone.
Recently, we released 🍷 FineWeb, a new,
large-scale (15-trillion tokens, 44TB disk space)
dataset for LLM pretraining. FineWeb is derived
from 96 CommonCrawl snapshots and produces
better-performing LLMs than other open pretraining
datasets. To bring more clarity in machine learning
and advance the open understanding of how to train
good quality large language models, we carefully
documented and ablated all of the design choices
used in FineWeb, including in-depth investigations
of deduplication and filtering strategies. The
present long form report is a deep dive in how to
create a large and high-quality web-scale dataset
for LLM pretraining. The dataset itself, 🍷
FineWeb, is available here. We are extremely
thankful to the whole distill.pub team (Christopher
Olah, Shan Carter, Ludwig Schubert in particular)
for creating the template on which we based this
blog post. Thanks also for inspiring us with
exquisitely crafted articles and blog posts. In
this report we also introduce 📚 FineWeb-Edu, a
subset of FineWeb constructed using scalable
automated high-quality annotations for educational
value, and which outperforms all openly accessible
web-datasets on a number of educational benchmarks
such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
is available in two sizes/filtering-level: 1.3
trillion (very high educational content) and 5.4
trillion (high educational content) tokens (all
tokens are measured with GPT2 tokenizer). You can
download it here. Both datasets are released under
the permissive ODC-By 1.0 license TLDR: This blog
covers a discussion on processing and evaluating
data quality at scale, the 🍷 FineWeb recipe
(listing and explaining all of our design choices),
and the process followed to create its 📚
FineWeb-Edu subset."""

# NOTE(review): reads like a stub -- confirm whether final copy is pending.
_CONCLUSION_TEXT = """This is the conclusion section where we
summarize the key points discussed in the blog post
and provide final thoughts."""


@app.get("/intro")
def intro():
    """Return the introduction content as a ``Div`` with ``id="inner-text"``.

    The ``Div`` wraps four ``Section`` elements with ids ``section1`` to
    ``section4``; each one holds an ``H2`` heading and a single ``P``
    paragraph.
    """
    # (element id, heading, paragraph text), in display order.
    section_specs = (
        ("section1", "Introduction", _INTRODUCTION_TEXT),
        ("section2", "Background", _BACKGROUND_TEXT),
        ("section3", "Main Content", _MAIN_CONTENT_TEXT),
        ("section4", "Conclusion", _CONCLUSION_TEXT),
    )
    rendered_sections = [
        Section(H2(heading), P(paragraph), id=anchor)
        for anchor, heading, paragraph in section_specs
    ]
    return Div(*rendered_sections, id="inner-text")
# Routes whose handlers live in the sibling modules, registered in the same
# order as before: (path, handler).
_EXTERNAL_ROUTES = (
    ("/curated", curated.curated),
    ("/curated/{target}", curated.update),
    ("/webdata", web.web_data),
    ("/webdata/{target}", web.update),
    ("/common", common.common_steps),
    ("/results", results.results),
)

for _route_path, _route_handler in _EXTERNAL_ROUTES:
    rt(_route_path)(_route_handler)

# Hand control to FastHTML's `serve()` once every route is registered.
serve()