# omwdataset / data_viewer.py
# omkarenator's picture
# unify data viewer, DV, DV2, DVS
# 87a6313
from fasthtml.common import *
from fasthtml.components import *
import json
import string
import random
import jsonlines
def gen_random_id() -> str:
    """Return a random identifier made of 8 lowercase ASCII letters."""
    letters = random.choices(string.ascii_lowercase, k=8)
    return "".join(letters)
def view_data(
    before,
    after,
    doc_id,
    data_source: str = None,
    data_sources=None,
    target: str = "colcontent",
):
    """Render a side-by-side raw/extracted viewer together with its controls.

    Args:
        before: JSON-serializable object shown under "Raw format".
        after: JSON-serializable object shown under "Extracted format".
        doc_id: Index of the displayed sample; used as the slider value and
            as the text label next to the slider.
        data_source: Currently selected data source; the drop-down option
            equal to it is marked as selected.
        data_sources: Iterable of data source names for the drop-down. When
            ``None`` no drop-down row is rendered.
        target: Element id of the returned container. It is also used to
            build the control names (``data_source_{target}``,
            ``doc_id_{target}``) and the ``/curated/{target}`` request URL.

    Returns:
        A ``Div`` with id ``target`` holding the control form followed by the
        two-column data display.
    """
    endpoint = f"/curated/{target}"
    pre_style = "white-space: pre-wrap; word-break: break-all;"

    if data_sources is not None:
        drop_down = Select(
            *[
                Option(ds, value=ds, selected=(ds == data_source))
                for ds in data_sources
            ],
            name=f"data_source_{target}",
            hx_get=endpoint,
            hx_target=f"#{target}",
            hx_trigger="change",
            hx_swap="innerHTML",
        )
        source_row = Div(Label("Data source: ", drop_down))
    else:
        # No sources to choose from: the form receives None in this slot.
        source_row = None

    # The slider range is fixed at 0-9 here; it is not derived from the data.
    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max="9",
        value=str(doc_id),
        hx_get=endpoint,
        hx_target=f"#{target}",
        hx_trigger="change",
        hx_swap="innerHTML",
        # Send the drop-down's current value along with the slider value.
        hx_include=f'[name="data_source_{target}"]',
    )

    form = Form(
        source_row,
        Div(
            Label("Data sample: ", slider, f"{doc_id}", cls="plotly_slider"),
        ),
        cls="plotly_input_container",
    )

    # ensure_ascii=False keeps non-ASCII text readable instead of emitting
    # \uXXXX escapes, matching what DVS, DV and DV2 do.
    col1 = Div(
        H3("Raw format"),
        Pre(
            json.dumps(before, indent=4, ensure_ascii=False),
            style=pre_style,
        ),
        style="width: 48%; float: left; overflow-x: auto;",
    )
    col2 = Div(
        H3("Extracted format"),
        Pre(
            json.dumps(after, indent=4, ensure_ascii=False),
            style=pre_style,
        ),
        style="width: 48%; float: right; overflow-x: auto;",
    )

    data_display = Div(
        col1,
        col2,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
def DVS(
    left,
    header,
):
    """Render one JSON-serializable object in a short, headed, scrollable box.

    Args:
        left: Object to pretty-print as JSON (4-space indent, non-ASCII
            characters kept as-is).
        header: Heading content placed in an ``H3`` above the box.

    Returns:
        A ``Div`` containing the heading and the 200px-high data display.
    """
    pretty_json = json.dumps(left, indent=4, ensure_ascii=False)
    json_block = Pre(
        pretty_json,
        style="white-space: pre-wrap; word-break: break-all;",
    )
    column = Div(json_block, style="float: left; overflow-x: auto;")
    panel = Div(
        column,
        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(H3(header), panel, style="margin-top: 10px;")
def DV(
    left_file,
    doc_id,
    header,
    target: str = None,
):
    """Load a JSON or JSON Lines file and render one record with a slider.

    Args:
        left_file: Path of the data file. A path ending in ``"jsonl"`` is
            read record by record with ``jsonlines``; anything else is read
            with ``json.load`` as UTF-8.
        doc_id: Index of the record to display; also the slider's value.
        header: Heading shown above the slider. It is sent back with every
            slider request via ``hx_vals``.
        target: Element id of the returned container. When ``None`` a random
            8-letter lowercase id is generated.

    Returns:
        A ``Div`` with id ``target`` holding the header/slider section and
        the pretty-printed record.

    Raises:
        IndexError: If ``doc_id`` is outside the loaded data.
    """
    if target is None:
        target = "".join(random.choices(string.ascii_lowercase, k=8))

    # Context managers close the file handles once the data is loaded;
    # previously the handles were left open.
    if left_file.endswith("jsonl"):
        with jsonlines.open(left_file) as reader:
            left = list(reader)
    else:
        with open(left_file, encoding="utf-8") as fh:
            left = json.load(fh)

    max_doc_id = len(left) - 1

    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max=str(max_doc_id),
        value=str(doc_id),
        hx_get=f"/update/{target}",
        hx_target=f"#{target}",
        hx_trigger="change",
        hx_swap="innerHTML",
        # Extra request values so the handler can rebuild this same view.
        hx_vals=json.dumps({"left_file": f"{left_file}", "header": f"{header}"}),
    )
    form = Div(
        H3(header),
        Label(
            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
        ),
        cls="plotly_input_container",
        style="padding: 20px;",
    )

    col1 = Div(
        Pre(
            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
            style="white-space: pre-wrap; word-break: break-all;",
        ),
        style="float: left; overflow-x: auto;",
    )
    data_display = Div(
        col1,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
def DV2(
    left_file,
    right_file,
    doc_id,
    target: str = None,
):
    """Load two JSON files and render the same record from each side by side.

    Args:
        left_file: Path of the JSON file shown under "Raw format".
        right_file: Path of the JSON file shown under "Extracted format".
        doc_id: Index of the record to display from both files; also the
            slider's value.
        target: Element id of the returned container. When ``None`` a random
            8-letter lowercase id is generated.

    Returns:
        A ``Div`` with id ``target`` holding the slider section and the
        two-column data display.

    Raises:
        IndexError: If ``doc_id`` is outside either loaded data set.
    """
    if target is None:
        target = "".join(random.choices(string.ascii_lowercase, k=8))

    # Context managers close the file handles once the data is loaded;
    # previously the handles were left open.
    with open(left_file, encoding="utf-8") as left_fh:
        left = json.load(left_fh)
    with open(right_file, encoding="utf-8") as right_fh:
        right = json.load(right_fh)

    # The slider range follows the left file only.
    max_doc_id = len(left) - 1

    slider = Input(
        type="range",
        name=f"doc_id_{target}",
        min="0",
        max=str(max_doc_id),
        value=str(doc_id),
        hx_get=f"/update/{target}",
        hx_target=f"#{target}",
        hx_trigger="change",
        hx_swap="innerHTML",
        # Extra request values so the handler can rebuild this same view.
        hx_vals=json.dumps(
            {"left_file": f"{left_file}", "right_file": f"{right_file}"}
        ),
    )
    form = Div(
        Label(
            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
        ),
        cls="plotly_input_container",
        style="padding: 20px;",
    )

    pre_style = "white-space: pre-wrap; word-break: break-all;"
    col1 = Div(
        H3("Raw format", style="margin-top: 0px;"),
        Pre(
            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
            style=pre_style,
        ),
        style="width: 48%; float: left; overflow-x: auto;",
    )
    col2 = Div(
        H3("Extracted format", style="margin-top: 0px;"),
        Pre(
            json.dumps(right[doc_id], indent=4, ensure_ascii=False),
            style=pre_style,
        ),
        style="width: 48%; float: right; overflow-x: auto;",
    )

    data_display = Div(
        col1,
        col2,
        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
    )
    return Div(form, data_display, style="margin-top: 10px;", id=target)
def update(target: str, request):
    """Rebuild a viewer from the query parameters of a slider request.

    Args:
        target: Element id of the viewer being refreshed; also selects the
            ``doc_id_{target}`` query parameter.
        request: Object exposing ``query_params`` with a ``get`` method.

    Returns:
        A one-element tuple holding the ``DV2`` view when both ``left_file``
        and ``right_file`` are present and non-empty; otherwise the ``DV``
        view built from ``left_file`` and ``header``.
    """
    query = request.query_params
    # Falls back to sample 3 when the slider value is absent.
    doc_id = int(query.get(f"doc_id_{target}", 3))

    # NOTE(review): these paths come straight from the query string and are
    # opened by DV/DV2 as-is - confirm that callers are trusted or restrict
    # the allowed locations.
    left_file = query.get("left_file")
    right_file = query.get("right_file")

    if not (left_file and right_file):
        # NOTE(review): left_file may be None here; DV calls
        # left_file.endswith(...) without checking for that.
        return DV(left_file, doc_id, query.get("header"), target)

    # The two-file branch wraps its result in a one-element tuple.
    return (DV2(left_file, right_file, doc_id, target),)