"""Run interactively."""
# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
from typing import Tuple # , Optional
import sys
from pathlib import Path
import signal
from random import randint
from textwrap import dedent
from itertools import zip_longest
from socket import socket, AF_INET, SOCK_STREAM
from sklearn.cluster import DBSCAN # noqa
import joblib
from varname import nameof
from logzero import logger
# import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt # noqa
# from tabulate import tabulate
from fastlid import fastlid
if "." not in sys.path:
sys.path.insert(0, ".")
import gradio as gr
from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts
from radiobee.cmat2tset import cmat2tset
# from radiobee.plot_df import plot_df
# from radiobee.plot_cmat import plot_cmat
from radiobee.trim_df import trim_df
sns.set()
sns.set_style("darkgrid")
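# restrict fastlid language detection to English and Chinese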
fastlid.set_languages = ["en", "zh"]
signal.signal(signal.SIGINT, signal.SIG_DFL)
print("Press Ctrl+C to quit\n")
def savelzma(obj, fileloc: str = None):
"""Aux funciton."""
if fileloc is None:
fileloc = nameof(obj) # this wont work
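    # joblib picks LZMA compression based on the ".lzma" file extension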
joblib.dump(obj, f"data/{fileloc}.lzma")
def greet(input):
"""Greet yo."""
return f"'Sup yo! (your input: {input})"
def upfile1(file1, file2=None) -> Tuple[str, str]:
"""Upload file1, file2."""
del file2
    return file1.name, f"'Sup yo! (your input: {file1.name})"
def process_2upoads(file1, file2):
"""Process stuff."""
# return f"{process_upload(file1)}\n===***\n{process_upload(file2)}"
text1 = [_.strip() for _ in process_upload(file1).splitlines() if _.strip()]
text2 = [_.strip() for _ in process_upload(file2).splitlines() if _.strip()]
text1, text2 = zip(*zip_longest(text1, text2, fillvalue=""))
df = pd.DataFrame({"text1": text1, "text2": text2})
# return tabulate(df)
# return tabulate(df, tablefmt="grid")
# return tabulate(df, tablefmt='html')
return df
if __name__ == "__main__":
_ = """
fn = process_2upoads
inputs = ["file", "file"]
examples = [
["data/test_zh.txt", "data/test_en.txt"],
["data/test_en.txt", "data/test_zh.txt"],
]
outputs = ["dataframe"]
# """
import logzero
# debug = True
debug = False
if debug:
logzero.loglevel(10)
logger.debug(" debug ")
logger.info(" info ")
# _ = """
inputs = [
gr.inputs.Textbox(
# placeholder="Input something here",
default="test text"
)
]
inputs = ["file", "file"]
inputs = [
gr.inputs.File(label="file 1"),
# gr.inputs.File(file_count="multiple", label="file 2", optional=True),
gr.inputs.File(label="file 2", optional=True),
]
# modi 1
_ = """
tf_type: Literal[linear, sqrt, log, binary] = 'linear'
idf_type: Optional[Literal[standard, smooth, bm25]] = None
dl_type: Optional[Literal[linear, sqrt, log]] = None
    norm: Optional[Literal[l1, l2]] = None
x min_df: int | float = 1
x max_df: int | float = 1.0
# """
input_tf_type = gr.inputs.Dropdown(
["linear", "sqrt", "log", "binary"], default="linear"
)
input_idf_type = gr.inputs.Radio(
["None", "standard", "smooth", "bm25"], default="None"
) # need to convert "None" this to None in fn
input_dl_type = gr.inputs.Radio(
["None", "linear", "sqrt", "log"], default="None"
) # ditto
input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None") # ditto
inputs = [
gr.inputs.File(label="file 1"),
gr.inputs.File(label="file 2", optional=True),
input_tf_type, # modi inputs
input_idf_type,
input_dl_type,
input_norm_type,
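        # eps slider (DBSCAN neighbourhood radius used in gen_pset)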
gr.inputs.Slider(
minimum=1,
maximum=20,
step=0.1,
default=10,
),
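        # min_samples slider (DBSCAN min_samples used in gen_pset)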
gr.inputs.Slider(
minimum=1,
maximum=20,
step=1,
default=6,
),
]
# modi
examples = [
[
"data/test_zh.txt",
"data/test_en.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/test_en.txt",
"data/test_zh.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/shakespeare_zh500.txt",
"data/shakespeare_en500.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/shakespeare_en500.txt",
"data/shakespeare_zh500.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/hlm-ch1-zh.txt",
"data/hlm-ch1-en.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/hlm-ch1-en.txt",
"data/hlm-ch1-zh.txt",
"linear",
"None",
"None",
"None",
10,
6,
],
[
"data/ps-cn.txt",
"data/ps-en.txt",
"linear",
"None",
"None",
"None",
10,
4,
],
]
outputs = ["dataframe", "plot"]
outputs = ["plot"]
outputs = ["dataframe", "plot"]
out_df = gr.outputs.Dataframe(
headers=None,
max_rows=12, # 20
max_cols=None,
overflow_row_behaviour="paginate",
type="auto",
label="To be aligned",
)
out_df_aligned = gr.outputs.Dataframe(
headers=None,
# max_rows=12, # 20
max_cols=3,
overflow_row_behaviour="paginate",
type="auto",
label="aligned pairs",
)
out_file_dl = gr.outputs.File(
label="Click to download csv",
)
out_file_dl_excel = gr.outputs.File(
label="Click to download xlsx",
)
# modi outputs
outputs = [
out_df,
"plot",
out_file_dl,
out_file_dl_excel,
out_df_aligned,
]
# outputs = ["dataframe", "plot", "plot"] # wont work
# outputs = ["dataframe"]
# outputs = ["dataframe", "dataframe", ]
# def fn(file1, file2):
# def fn(file1, file2, min_samples, eps):
def fn(
file1,
file2,
tf_type,
idf_type,
dl_type,
norm,
eps,
min_samples,
):
# modi fn
"""Process inputs and return outputs."""
logger.debug(" *debug* ")
        # convert "None" from the Radio inputs to a real None
        # (rebinding a loop variable would not update idf_type/dl_type/norm)
        idf_type = None if idf_type == "None" else idf_type
        dl_type = None if dl_type == "None" else dl_type
        norm = None if norm == "None" else norm
# logger.info("file1: *%s*, file2: *%s*", file1, file2)
logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)
# bypass if file1 or file2 is str input
# if not (isinstance(file1, str) or isinstance(file2, str)):
text1 = file2text(file1)
text2 = file2text(file2)
lang1, _ = fastlid(text1)
lang2, _ = fastlid(text2)
df1 = files2df(file1, file2)
lst1 = [elm for elm in df1.text1 if elm]
lst2 = [elm for elm in df1.text2 if elm]
# len1 = len(lst1) # noqa
# len2 = len(lst2) # noqa
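        # cmat: pairwise cosine-similarity matrix between the paragraphs of the two texts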
cmat = lists2cmat(
lst1,
lst2,
tf_type=tf_type,
idf_type=idf_type,
dl_type=dl_type,
norm=norm,
)
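        # tset ("triple set"): for each column x of cmat, the row y with the highest cos value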
tset = pd.DataFrame(cmat2tset(cmat))
tset.columns = ["x", "y", "cos"]
df_trimmed = trim_df(df1)
_ = """
df_trimmed = pd.concat(
[
df1.iloc[:4, :],
pd.DataFrame(
[
[
"...",
"...",
]
],
columns=df1.columns,
),
df1.iloc[-4:, :],
],
ignore_index=1,
)
# """
# process lst1, lst2 to obtained df_aligned
        # quick fix for ValueError: not enough values to unpack (expected at least 1, got 0)
        # fixed in gen_pset, but we keep the retry loop here
for min_s in range(min_samples):
logger.info(" min_samples, using %s", min_samples - min_s)
try:
pset = gen_pset(
cmat,
eps=eps,
min_samples=min_samples - min_s,
delta=7,
)
break
except ValueError:
logger.info(" decrease min_samples by %s", min_s + 1)
continue
except Exception as e:
logger.error(e)
continue
else:
# break should happen above when min_samples = 2
raise Exception("bummer, this shouldn't happen, probably another bug")
min_samples = gen_pset.min_samples
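        # gen_pset exposes the min_samples it actually used as a function attribute; keep it for the plot title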
# will result in error message:
# UserWarning: Starting a Matplotlib GUI outside of
# the main thread will likely fail."
_ = """
plot_cmat(
cmat,
eps=eps,
min_samples=min_samples,
xlabel=lang1,
ylabel=lang2,
)
# """
# move plot_cmat's code to the main thread here
# to make it work
xlabel = lang1
ylabel = lang2
len1, len2 = cmat.shape
ylim, xlim = len1, len2
# does not seem to show up
logger.debug(" len1 (ylim): %s, len2 (xlim): %s", len1, len2)
if debug:
print(f" len1 (ylim): {len1}, len2 (xlim): {len2}")
df_ = pd.DataFrame(cmat2tset(cmat))
df_.columns = ["x", "y", "cos"]
sns.set()
sns.set_style("darkgrid")
        # close all existing figures, necessary for HF Spaces
plt.close("all")
# if sys.platform not in ["win32", "linux"]:
plt.switch_backend('Agg') # to cater for Mac, thanks to WhiteFox
# figsize=(13, 8), (339, 212) mm on '1280x800+0+0'
fig = plt.figure(figsize=(13, 8))
# gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
gs = fig.add_gridspec(1, 2, wspace=0.4, hspace=0.58)
ax_heatmap = fig.add_subplot(gs[0, 0]) # ax2
ax0 = fig.add_subplot(gs[0, 1])
# ax1 = fig.add_subplot(gs[1, 0])
cmap = "viridis_r"
sns.heatmap(cmat, cmap=cmap, ax=ax_heatmap).invert_yaxis()
ax_heatmap.set_xlabel(xlabel)
ax_heatmap.set_ylabel(ylabel)
ax_heatmap.set_title("cos similarity heatmap")
fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
_ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
# _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
_x = ~_
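        # DBSCAN labels noise as -1, so _ flags clustered points and _x the outliers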
# max cos along columns
df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)
# outliers
df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
ax0.set_xlabel(xlabel)
ax0.set_ylabel(ylabel)
ax0.set_xlim(xmin=0, xmax=xlim)
ax0.set_ylim(ymin=0, ymax=ylim)
ax0.set_title(
"max along columns ('x': outliers)\n"
"potential aligned pairs (green line)\n"
f"({round(sum(_) / xlim, 2):.0%})"
)
# clustered
# df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
# ax1.set_xlabel(xlabel)
# ax1.set_ylabel(ylabel)
# ax1.set_xlim(0, len1)
# ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
# end of plot_cmat
src_len, tgt_len = cmat.shape
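        # expand the anchor points in pset into a full alignment set (aset) covering every src/tgt index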
aset = gen_aset(pset, src_len, tgt_len)
final_list = align_texts(aset, lst2, lst1) # note the order
# df_aligned = df_trimmed
df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])
# swap text1 text2
df_aligned = df_aligned[["text2", "text1", "likelihood"]]
df_aligned.columns = ["text1", "text2", "likelihood"]
_ = df_aligned.to_csv(index=False)
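        # stem[:-8] below presumably strips the random suffix gradio appends to uploaded temp file names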
file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
file_dl.write_text(_, encoding="utf8")
# file_dl.write_text(_, encoding="gb2312") # no go
file_dl_xlsx = Path(
f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx"
)
df_aligned.to_excel(file_dl_xlsx)
# return df_trimmed, plt
return df_trimmed, plt, file_dl, file_dl_xlsx, df_aligned
# modi outputs
server_port = 7860
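    # probe for a free port: connect_ex() returns 0 when something is already listening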
with socket(AF_INET, SOCK_STREAM) as sock:
sock.settimeout(0.01) # 10ms
# try numb times before giving up
numb = 5
for _ in range(numb):
if sock.connect_ex(("127.0.0.1", server_port)) != 0: # port idle
break
server_port = server_port + randint(0, 50)
else:
raise SystemExit(f"Tried {numb} times to no avail, giving up...")
article = dedent(
"""
## NB
* `radiobee aligner` is a sibling of `bumblebee aligner`. To know more about these aligners, please join qq group `316287378`.
* Uploaded files should be in pure text format (txt, md, csv etc). `docx`, `pdf`, `srt`, `html` etc may be supported later on.
* Click "Clear" first for subsequent submits when uploading files.
* `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
        * Suggested `eps` and `min_samples` values -- `eps` (epsilon): 8-12, `min_samples`: 4-8.
          - A larger `eps` or a smaller `min_samples` will result in more aligned pairs but also more **false positives** (pairs
            falsely identified as candidates). On the other hand, a smaller `eps` or a larger `min_samples` tends to miss
            'good' pairs.
        * If you need a closer look at the plot, right-click on it, select copy-image-address, and open the copied address in a new browser tab.
* `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
"""
)
css_image = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
# css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
    css_input_file = (
        ".input_file {height: 9rem !important; width: 100% !important;}"
    )
    css_output_file = (
        ".output_file {height: 4rem !important; width: 100% !important;}"
    )
logger.info("running at port %s", server_port)
iface = gr.Interface(
# fn=greet,
# inputs="text",
# fn=process_upload,
# fn=process_2upoads,
# inputs=["file", "file"],
# outputs="text",
# outputs="html",
fn=fn,
inputs=inputs,
outputs=outputs,
title="radiobee-aligner🔠",
description="WIP showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
article=article,
examples=examples,
# theme="darkgrass",
theme="grass",
layout="vertical", # horizontal unaligned
# height=150, # 500
width=900, # 900
allow_flagging=True,
flagging_options=[
"fatal",
"bug",
"brainstorm",
"excelsior",
], # "paragon"],
css=f"{css_image} {css_input_file} {css_output_file}",
)
iface.launch(
share=False,
# share=True,
debug=debug,
# server_name="0.0.0.0",
server_name="127.0.0.1",
server_port=server_port,
# show_tips=True,
enable_queue=True,
)
_ = """
ax = sns.heatmap(cmat, cmap="viridis_r")
ax.invert_yaxis()
ax.set_xlabel(fastlid(df.text1)[0])
ax.set_xlabel(fastlid(df.text2)[0])
# return df, plt
return plt.gca()
https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
iface = gr.Interface(plot_text, "file", "image")
def is_port_in_use(port):
import socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
return s.connect_ex(('localhost', port)) == 0
socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))
---
css https://huggingface.co./spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83
css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"
mod = 'en2zh'
packname = packx.__name__
globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)
"""