Spaces:

mikeee
/

radiobee-aligner

Build error

File size: 13,252 Bytes

"""Run interactively."""
from typing import Tuple  # , Optional

import joblib
from random import randint
from textwrap import dedent
from itertools import zip_longest
from sklearn.cluster import DBSCAN

from socket import socket, AF_INET, SOCK_STREAM
import signal
from varname import nameof
from logzero import logger

# import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from tabulate import tabulate
from fastlid import fastlid

import gradio as gr
from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat

# from radiobee.plot_df import plot_df
from radiobee.cmat2tset import cmat2tset

sns.set()
sns.set_style("darkgrid")
fastlid.set_languages = ["en", "zh"]

signal.signal(signal.SIGINT, signal.SIG_DFL)
print("Press Ctrl+C to quit\n")


def savelzma(obj, fileloc: str = None):
    if fileloc is None:
        fileloc = nameof(obj)  # this wont work
    joblib.dump(obj, f"data/{fileloc}.lzma")


def greet(input):
    """Greet yo."""
    return f"'Sup yo! (your input: {input})"


def upfile1(file1, file2=None) -> Tuple[str, str]:
    """Upload file1, file2."""
    return file1.name, f"'Sup yo! (your input: {input})"


def process_2upoads(file1, file2):
    """Process stuff."""
    # return f"{process_upload(file1)}\n===***\n{process_upload(file2)}"

    text1 = [_.strip() for _ in process_upload(file1).splitlines() if _.strip()]
    text2 = [_.strip() for _ in process_upload(file2).splitlines() if _.strip()]

    text1, text2 = zip(*zip_longest(text1, text2, fillvalue=""))

    df = pd.DataFrame({"text1": text1, "text2": text2})

    # return tabulate(df)
    # return tabulate(df, tablefmt="grid")
    # return tabulate(df, tablefmt='html')

    return df


if __name__ == "__main__":
    _ = """
    fn = process_2upoads
    inputs = ["file", "file"]
    examples = [
        ["data/test_zh.txt", "data/test_en.txt"],
        ["data/test_en.txt", "data/test_zh.txt"],
    ]
    outputs = ["dataframe"]
    # """
    # import logzero
    # logzero.loglevel(10)
    logger.debug(" debug ")
    logger.info(" info ")

    # _ = """
    inputs = [
        gr.inputs.Textbox(
            # placeholder="Input something here",
            default="test text"
        )
    ]
    inputs = ["file", "file"]
    inputs = [
        gr.inputs.File(label="file 1"),
        # gr.inputs.File(file_count="multiple", label="file 2", optional=True),
        gr.inputs.File(label="file 2", optional=True),
    ]
    inputs = [
        gr.inputs.File(label="file 1"),
        gr.inputs.File(label="file 2", optional=True),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=1,
            default=6,
            # label="suggested min_samples value: 4-8",
        ),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=0.1,
            default=2,
            # label="suggested esp value: 1.7-3",
        ),
    ]

    examples = [
        ["data/test_zh.txt", "data/test_en.txt", 6, 10, ],
        ["data/test_en.txt", "data/test_zh.txt", 6, 10, ],
        ["data/shakespeare_zh500.txt", "data/shakespeare_en500.txt", 6, 10, ],
        ["data/shakespeare_en500.txt", "data/shakespeare_zh500.txt", 6, 10, ],
        ["data/hlm-ch1-zh.txt", "data/hlm-ch1-en.txt", 6, 10, ],
        ["data/hlm-ch1-en.txt", "data/hlm-ch1-zh.txt", 6, 10, ],
    ]
    outputs = ["dataframe", "plot"]
    outputs = ["plot"]
    outputs = ["dataframe", "plot"]
    out1 = gr.outputs.Dataframe(
        headers=None,
        max_rows=12,  # 20
        max_cols=None,
        overflow_row_behaviour="paginate",
        type="auto",
        label="To be aligned",
    )
    outputs = [
        out1,
        "plot",
    ]
    # outputs = ["dataframe", "plot", "plot"]  # wont work
    # outputs = ["dataframe"]
    # outputs = ["dataframe", "dataframe", ]

    # def fn(file1, file2):
    def fn(file1, file2, min_samples, eps):
        """Process inputs."""
        logger.debug(" *debug* ")

        # logger.info("file1: *%s*, file2: *%s*", file1, file2)
        logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)

        # bypass if file1 or file2 is str input
        if not (isinstance(file1, str) or isinstance(file2, str)):
            text1 = file2text(file1)
            text2 = file2text(file2)
            lang1, _ = fastlid(text1)
            lang2, _ = fastlid(text2)

            df1 = files2df(file1, file2)

            lst1 = [elm for elm in df1.text1 if elm]
            lst2 = [elm for elm in df1.text2 if elm]
            len1 = len(lst1)
            len2 = len(lst2)

            # this wont work
            # for obj in [text1, text2, df1, lst1, lst2, ]:
            # savelzma(text1) wont work

            # for debugging
            # joblib.dump(text1, f"data/{nameof(text1)}.lzma")
            # joblib.dump(text2, f"data/{nameof(text2)}.lzma")
            # joblib.dump(df1, f"data/{nameof(df1)}.lzma")
            # joblib.dump(lst1, f"data/{nameof(lst1)}.lzma")
            # joblib.dump(lst2, f"data/{nameof(lst2)}.lzma")

            cmat = lists2cmat(lst1, lst2)

            tset = pd.DataFrame(cmat2tset(cmat))
            tset.columns = ["x", "y", "cos"]

            # for debugging, logger.debug logger.info dont show up
            # print("lst1: %s" % lst1)
            # print("lst2: %s" % lst2)
            # print("cmat: %s" % cmat)
            # print("tset: %s" % tset)

            logger.debug("lst1: %s", lst1)
            logger.debug("lst2: %s", lst2)
            logger.debug("cmat: %s", cmat)
            logger.debug("tset: %s", tset)

            # plt0 = plot_df(pd.DataFrame(cmat))
            df_ = tset

            # moved to inputs
            # min_samples: int = 6
            # eps: float = 10

            # ylim: Optional[int] = None
            xlabel: str = lang1
            ylabel: str = lang2

            sns.set()
            sns.set_style("darkgrid")

            # fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
            # fig, ([ax2, ax0], [ax1, ax3]) = plt.subplots(2, 2, figsize=(11.69, 8.27))
            # fig, (ax2, ax0, ax1) = plt.subplots(3)
            # fig, (ax2, ax0, ax1) = plt.subplots(3, figsize=(11.69, 8.27))
            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(36.69, 8.27))
            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3, figsize=(66.69, 22.27))
            # fig, (ax2, ax0, ax1) = plt.subplots(1, 3)
            # fig.subplots_adjust(hspace=.4)

            fig = plt.figure()
            gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
            ax2 = fig.add_subplot(gs[0, 0])
            ax0 = fig.add_subplot(gs[0, 1])
            ax1 = fig.add_subplot(gs[1, 0])

            cmap = "viridis_r"
            sns.heatmap(cmat, cmap=cmap, ax=ax2).invert_yaxis()
            ax2.set_xlabel(xlabel)
            ax2.set_ylabel(ylabel)
            ax2.set_title("cos similarity heatmap")

            fig.suptitle("alignment projection")

            _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
            _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0

            df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)

            # clustered
            df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)

            # outliers
            df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)

            # ax0.set_xlabel("")
            # ax0.set_ylabel("zh")
            ax0.set_xlabel(xlabel)
            ax0.set_ylabel(ylabel)

            ax0.set_xlim(0, len1)
            ax0.set_ylim(0, len2)
            ax0.set_title("max along columns ('x': outliers)")

            # ax1.set_xlabel("en")
            # ax1.set_ylabel("zh")
            ax1.set_xlabel(xlabel)
            ax1.set_ylabel(ylabel)

            ax1.set_xlim(0, len1)
            ax1.set_ylim(0, len2)
            ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")

            # return df, plot_df(pd.DataFrame(cmat))
            # tset.plot.scatter("x", "y", c="cos", cmap="viridis_r")
        else:
            fig, ax1 = plt.subplots()
            df1 = pd.DataFrame(
                [
                    [5.1, 3.5, 0],
                    [4.9, 3.0, 0],
                    [7.0, 3.2, 1],
                    [6.4, 3.2, 1],
                    [5.9, 3.0, 2],
                ],
                columns=["length", "width", "species"],
            )
            df1.plot.scatter(x="length", y="width", c="DarkBlue", ax=ax1)
            # plt_heatmap = plt

        # plt.scatter(df.length, df.width)  # gradio eturn plt.gcf() or plt

        # return df, plt
        # return plt
        # return df, df
        # return df1.iloc[:10, :], plt

        # pd.concat([df0, pd.DataFrame([[".", ".", "..."]], columns=df0.columns)], ignore_index=1)
        # pd.concat([df0.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df0.columns),  df0.iloc[-1:, :]], ignore_index=1)

        # _ = pd.concat([df1.iloc[:4, :], pd.DataFrame([["...", "...", "...", ]], columns=df1.columns), df1.iloc[-2:, :]], ignore_index=True)
        # _ = pd.concat([df.iloc[:2, :], pd.DataFrame([[".", ".", "..."]], columns=df.columns),  df.iloc[-1:, :]], ignore_index=1)

        _ = pd.concat(
            [
                df1.iloc[:4, :],
                pd.DataFrame(
                    [
                        [
                            "...",
                            "...",
                        ]
                    ],
                    columns=df1.columns,
                ),
                df1.iloc[-4:, :],
            ],
            ignore_index=1,
        )

        return _, plt
        # return _, plt

    # """

    server_port = 7860
    with socket(AF_INET, SOCK_STREAM) as sock:
        sock.settimeout(0.01)  # 10ms

        # try numb times before giving up
        numb = 5
        for _ in range(numb):
            if sock.connect_ex(("127.0.0.1", server_port)) != 0:  # port idle
                break
            server_port = server_port + randint(0, 50)
        else:
            raise SystemExit(f"Tried {numb} times to no avail, giving up...")

    article = dedent(
        """
        ## NB
        *   Click "Clear" first for subsequent submits when uploading files.
        *   Suggested values : min_samples: 4-8, esp (minimum epsilon): 8-12.
           -   Smaller min_samples or larger esp will result in more aligned pairs but also more **false positives** (pairs falsly identified as candidates). On the other hand, larger min_samples or smaller esp values tend to miss 'good' pairs.
    """
    )
    css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
    css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
    css_file = (
        ".input_file, .output_file {height: 9rem !important; width: 100% !important;}"
    )

    logger.info("running at port %s", server_port)

    iface = gr.Interface(
        # fn=greet,
        # inputs="text",
        # fn=process_upload,
        # fn=process_2upoads,
        # inputs=["file", "file"],
        # outputs="text",
        # outputs="html",
        fn=fn,
        inputs=inputs,
        outputs=outputs,
        title="radiobee-aligner🔠",
        description="showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
        article=article,
        examples=examples,
        # theme="darkgrass",
        layout="vertical",  # horizontal unaligned
        # height=150,  # 500
        width=900,  # 900
        allow_flagging=False,
        flagging_options=["fatal", "bug", "brainstorm", "excelsior", "paragon"],
        css=f"{css} {css_file}",
    )

    iface.launch(
        # share=False,
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=server_port,
        # show_tips=True,
        enable_queue=True,
    )

_ = """

        ax = sns.heatmap(cmat, cmap="viridis_r")

        ax.invert_yaxis()
        ax.set_xlabel(fastlid(df.text1)[0])
        ax.set_xlabel(fastlid(df.text2)[0])

        # return df, plt
        return plt.gca()

https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
    iface = gr.Interface(plot_text, "file", "image")

def is_port_in_use(port):
    import socket
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(('localhost', port)) == 0

socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))

---
css https://huggingface.co./spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83

css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"

mod = 'en2zh'
packname = packx.__name__

globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)

"""