freemt
Update css height: 20rem
6a93cbd
raw
history blame
13.3 kB
"""Run interactively."""
from typing import Tuple # , Optional
import joblib
from random import randint
from textwrap import dedent
from itertools import zip_longest
from sklearn.cluster import DBSCAN
from socket import socket, AF_INET, SOCK_STREAM
import signal
from varname import nameof
from logzero import logger
# import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from tabulate import tabulate
from fastlid import fastlid
import gradio as gr
from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
# from radiobee.plot_df import plot_df
from radiobee.cmat2tset import cmat2tset
# Plot styling: apply seaborn's defaults, then select the "darkgrid" style.
sns.set()
sns.set_style("darkgrid")
# NOTE(review): presumably limits fastlid's language detection to English and
# Chinese -- confirm against the fastlid documentation.
fastlid.set_languages = ["en", "zh"]
# Restore the default SIGINT action so that Ctrl+C is handled by the OS default
# handler instead of any handler installed earlier.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print("Press Ctrl+C to quit\n")
def savelzma(obj, fileloc: str = None):
    """Dump *obj* with joblib to ``data/<fileloc>.lzma``.

    Args:
        obj: the object handed to ``joblib.dump``.
        fileloc: file stem (no directory, no extension). When omitted, the
            stem is taken from ``nameof(obj)``.

    NOTE: per the original author's remark, the ``nameof`` fallback does not
    work as intended, so callers should pass ``fileloc`` explicitly.
    """
    if fileloc is not None:
        stem = fileloc
    else:
        stem = nameof(obj)  # this wont work
    target = f"data/{stem}.lzma"
    joblib.dump(obj, target)
def greet(input):
    """Return a greeting that echoes *input* back to the caller."""
    template = "'Sup yo! (your input: {})"
    return template.format(input)
def upfile1(file1, file2=None) -> Tuple[str, str]:
    """Return the uploaded file's name and a greeting that echoes it.

    Args:
        file1: uploaded file object exposing a ``name`` attribute.
        file2: optional second upload; accepted for interface compatibility
            but not used.

    Returns:
        A ``(file1.name, greeting)`` tuple.
    """
    # The greeting used to interpolate ``input``; with no local of that name it
    # resolved to the builtin function and rendered as
    # "<built-in function input>". Echo the uploaded file's name instead.
    return file1.name, f"'Sup yo! (your input: {file1.name})"
def process_2upoads(file1, file2):
    """Pair up the non-blank lines of two uploads in a dataframe.

    Each upload is converted to text with ``process_upload``; every line is
    stripped and blank lines are dropped. The shorter list is padded with
    empty strings so both columns have the same length.

    Args:
        file1: first upload, passed to ``process_upload``.
        file2: second upload, passed to ``process_upload``.

    Returns:
        A ``pandas.DataFrame`` with columns ``text1`` and ``text2``.
    """

    def _nonblank_lines(upload):
        """Return the stripped, non-empty lines of one upload."""
        stripped = (line.strip() for line in process_upload(upload).splitlines())
        return [line for line in stripped if line]

    rows = list(
        zip_longest(_nonblank_lines(file1), _nonblank_lines(file2), fillvalue="")
    )
    if not rows:
        # ``zip(*[])`` yields nothing, so unpacking it into two names raised
        # ValueError when both uploads were empty; return an empty frame.
        return pd.DataFrame({"text1": [], "text2": []})

    text1, text2 = zip(*rows)
    return pd.DataFrame({"text1": text1, "text2": text2})
if __name__ == "__main__":
    # Interface configuration (inputs / examples / outputs) for the Gradio app.
    # Earlier experiments reassigned ``inputs`` and ``outputs`` several times,
    # constructing components that were immediately discarded; only the final
    # values are kept here.

    # import logzero
    # logzero.loglevel(10)
    logger.debug(" debug ")
    logger.info(" info ")

    # Components are listed in the positional order of ``fn``'s parameters:
    # file1, file2, min_samples, eps.
    inputs = [
        gr.inputs.File(label="file 1"),
        gr.inputs.File(label="file 2", optional=True),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=1,
            default=6,
            # label="suggested min_samples value: 4-8",
        ),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=0.1,
            default=2,
            # label="suggested esp value: 1.7-3",
        ),
    ]

    # Each example row supplies one value per input component, in order.
    examples = [
        ["data/test_zh.txt", "data/test_en.txt", 6, 10, ],
        ["data/test_en.txt", "data/test_zh.txt", 6, 10, ],
        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", 6, 10, ],
        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", 6, 10, ],
        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", 6, 10, ],
        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", 6, 10, ],
    ]

    out1 = gr.outputs.Dataframe(
        headers=None,
        max_rows=12,  # 20
        max_cols=None,
        overflow_row_behaviour="paginate",
        type="auto",
        label="To be aligned",
    )
    # ``fn`` returns (dataframe, plot), matching these two components.
    outputs = [
        out1,
        "plot",
    ]
    # outputs = ["dataframe", "plot", "plot"]  # wont work
def fn(file1, file2, min_samples, eps):
    """Align two uploaded texts and plot the alignment projection.

    For uploaded files, the texts are read, their languages detected with
    ``fastlid``, a similarity matrix is built with ``lists2cmat`` and reduced
    to an (x, y, cos) triple set with ``cmat2tset``. DBSCAN then separates
    clustered points from outliers, and three panels are drawn: the heatmap,
    all points with outliers marked, and the clustered points only.

    When either argument is a plain ``str`` the alignment is bypassed and a
    small demo scatter plot is produced instead.

    Args:
        file1: first upload (object with a ``name`` attribute) or a ``str``.
        file2: second upload, same conventions as ``file1``.
        min_samples: forwarded to ``DBSCAN(min_samples=...)``.
        eps: forwarded to ``DBSCAN(eps=...)``.

    Returns:
        ``(preview, plt)``: a preview of the dataframe (abridged to its first
        and last four rows when longer than eight rows) and the
        ``matplotlib.pyplot`` module holding the current figure.
    """
    logger.debug(" *debug* ")

    # ``getattr`` with a fallback: a plain-str or missing argument has no
    # ``.name``; dereferencing it here used to raise before the bypass check
    # below could run.
    logger.info(
        "file1.name: *%s*, file2.name: *%s*",
        getattr(file1, "name", file1),
        getattr(file2, "name", file2),
    )

    # bypass if file1 or file2 is str input
    if not (isinstance(file1, str) or isinstance(file2, str)):
        text1 = file2text(file1)
        text2 = file2text(file2)
        lang1, _ = fastlid(text1)
        lang2, _ = fastlid(text2)

        df1 = files2df(file1, file2)

        # Drop empty cells before building the similarity matrix.
        lst1 = [elm for elm in df1.text1 if elm]
        lst2 = [elm for elm in df1.text2 if elm]
        len1 = len(lst1)
        len2 = len(lst2)

        cmat = lists2cmat(lst1, lst2)

        tset = pd.DataFrame(cmat2tset(cmat))
        tset.columns = ["x", "y", "cos"]

        logger.debug("lst1: %s", lst1)
        logger.debug("lst2: %s", lst2)
        logger.debug("cmat: %s", cmat)
        logger.debug("tset: %s", tset)

        df_ = tset

        xlabel: str = lang1
        ylabel: str = lang2

        sns.set()
        sns.set_style("darkgrid")

        # 2x2 grid; three of the four cells are used.
        fig = plt.figure()
        gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
        ax2 = fig.add_subplot(gs[0, 0])
        ax0 = fig.add_subplot(gs[0, 1])
        ax1 = fig.add_subplot(gs[1, 0])

        cmap = "viridis_r"
        sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
        ax2.set_xlabel(xlabel)
        ax2.set_ylabel(ylabel)
        ax2.set_title("cos similarity heatmap")

        fig.suptitle("alignment projection")

        # Fit DBSCAN once and derive both masks from the same labels
        # (label -1 marks noise, i.e. outliers).
        labels = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_
        clustered = labels > -1
        outliers = labels < 0

        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)

        # clustered
        df_[clustered].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)

        # outliers
        df_[outliers].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)

        ax0.set_xlabel(xlabel)
        ax0.set_ylabel(ylabel)
        ax0.set_xlim(0, len1)
        ax0.set_ylim(0, len2)
        ax0.set_title("max along columns ('x': outliers)")

        ax1.set_xlabel(xlabel)
        ax1.set_ylabel(ylabel)
        ax1.set_xlim(0, len1)
        ax1.set_ylim(0, len2)
        # Guard the ratio against an empty first list.
        ratio = round(sum(clustered) / len1, 2) if len1 else 0
        ax1.set_title(f"potential aligned pairs ({ratio:.0%})")
    else:
        # Demo plot for plain-text (non-file) input.
        fig, ax1 = plt.subplots()
        df1 = pd.DataFrame(
            [
                [5.1, 3.5, 0],
                [4.9, 3.0, 0],
                [7.0, 3.2, 1],
                [6.4, 3.2, 1],
                [5.9, 3.0, 2],
            ],
            columns=["length", "width", "species"],
        )
        df1.plot.scatter(x="length", y="width", c="DarkBlue", ax=ax1)

    # Preview: first four rows, a "..." filler row, last four rows. Only
    # abridge when the two slices cannot overlap, and size the filler row to
    # the frame's actual column count (a fixed two-cell filler raised
    # ValueError for the three-column demo frame).
    n_edge = 4
    if len(df1) > 2 * n_edge:
        filler = pd.DataFrame([["..."] * len(df1.columns)], columns=df1.columns)
        preview = pd.concat(
            [df1.iloc[:n_edge, :], filler, df1.iloc[-n_edge:, :]],
            ignore_index=True,
        )
    else:
        preview = df1.reset_index(drop=True)

    return preview, plt
if __name__ == "__main__":
    # Find a free port, build the Gradio interface and launch it.
    server_port = 7860

    # try numb times before giving up
    numb = 5
    for _ in range(numb):
        # Use a fresh socket per probe: once ``connect_ex`` succeeds the socket
        # is connected, and a further ``connect_ex`` on it returns an error
        # code that would be misread as "port idle".
        with socket(AF_INET, SOCK_STREAM) as sock:
            sock.settimeout(0.01)  # 10ms
            port_in_use = sock.connect_ex(("127.0.0.1", server_port)) == 0
        if not port_in_use:  # port idle
            break
        # Step by at least 1 so the same busy port is never probed twice in a row.
        server_port = server_port + randint(1, 50)
    else:
        raise SystemExit(f"Tried {numb} times to no avail, giving up...")

    article = dedent(
        """
        ## NB
        * Click "Clear" first for subsequent submits when uploading files.
        * Suggested values : min_samples: 4-8, esp (minimum epsilon): 8-12.
        - Smaller min_samples or larger esp will result in more aligned pairs but also more **false positives** (pairs falsly identified as candidates). On the other hand, larger min_samples or smaller esp values tend to miss 'good' pairs.
        """
    )

    # Size overrides for the image and file widgets (a previous 40rem image
    # height was immediately overwritten and has been dropped).
    css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
    css_file = (
        ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"
    )

    logger.info("running at port %s", server_port)

    iface = gr.Interface(
        fn=fn,
        inputs=inputs,
        outputs=outputs,
        title="radiobee-aligner🔠",
        description="showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
        article=article,
        examples=examples,
        # theme="darkgrass",
        layout="vertical",  # horizontal unaligned
        # height=150,  # 500
        width=900,  # 900
        allow_flagging=False,
        flagging_options=["fatal", "bug", "brainstorm", "excelsior", "paragon"],
        css=f"{css} {css_file}",
    )

    iface.launch(
        # share=False,
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=server_port,
        # show_tips=True,
        enable_queue=True,
    )
# Scratch notes kept in a throwaway string bound to ``_``: old plotting
# snippets, a port-in-use helper, CSS variants and a dynamic-import snippet.
# The string is only assigned; nothing in the visible code reads it.
_ = """
ax = sns.heatmap(cmat, cmap="viridis_r")
ax.invert_yaxis()
ax.set_xlabel(fastlid(df.text1)[0])
ax.set_xlabel(fastlid(df.text2)[0])
# return df, plt
return plt.gca()
https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
iface = gr.Interface(plot_text, "file", "image")
def is_port_in_use(port):
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))
---
css https://huggingface.co./spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"
mod = 'en2zh'
packname = packx.__name__
globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)
"""