File size: 2,838 Bytes
dab2de2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Plot pandas.DataFrame with DBSCAN clustering."""
# pylint: disable=invalid-name, too-many-arguments
# import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN

from logzero import logger

# Switch matplotlib to interactive mode when ``get_ipython`` is bound in this
# module's global namespace, which is taken as the sign of an IPython session.
# NOTE(review): ``globals()`` is this module's own namespace, so the check
# presumably only fires when the file is executed inside the session (e.g.
# via ``%run``) rather than imported as a module -- confirm.
if "get_ipython" in globals():
    plt.ion()


# fmt: off
def plot_df(
        df_: pd.DataFrame,
        min_samples: int = 6,
        eps: float = 10,
        ylim: int = None,
        xlabel: str = "en",
        ylabel: str = "zh",
) -> plt:
    # fmt: on
    """Plot df with DBSCAN clustering.

    Draws two side-by-side scatter plots: the left one shows every point
    coloured by ``cos`` with DBSCAN outliers overlaid as red crosses, the
    right one shows only the points DBSCAN assigned to a cluster.

    Args:
        df_: anything ``pandas.DataFrame`` accepts, with at least three
            columns; the first three are used and renamed to
            ``["x", "y", "cos"]``. The caller's object is not modified.
        min_samples: passed to ``DBSCAN(min_samples=...)``.
        eps: passed to ``DBSCAN(eps=...)``.
        ylim: upper y-axis limit for both plots; ignored when falsy
            (``None`` or ``0``).
        xlabel: x-axis label of the right-hand plot.
        ylabel: y-axis label of both plots.

    Returns:
        matplotlib.pyplot: for possible use in gradio, or ``None`` (after
        logging an error) when fewer than three columns are supplied.

    Example:
        plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
        df_ = pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos'])

        # sort 'x', axis 0 changes, index regenerated
        df_s = df_.sort_values('x', axis=0, ignore_index=True)

        # sorting does not seem to impact clustering
        DBSCAN(1.5, min_samples=3).fit(df_).labels_
        DBSCAN(1.5, min_samples=3).fit(df_s).labels_

    """
    df_ = pd.DataFrame(df_)
    if len(df_.columns) < 3:
        logger.error(
            "expected 3 columns DataFram, got: %s, cant proceed, returninng None",
            df_.columns.tolist(),
        )
        return None

    # Take the first three columns by position (label-based selection would
    # pick extra columns when labels are duplicated) and copy them so that
    # renaming below never touches, or warns about, the caller's data.
    df_ = df_.iloc[:, :3].copy()
    df_.columns = ["x", "y", "cos"]

    # Fit once, before any figure is created, so a clustering failure does
    # not leave an orphan figure behind. Labels greater than -1 are treated
    # as clustered points; everything else is an outlier.
    labels = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_
    clustered = labels > -1
    outliers = ~clustered

    sns.set()
    sns.set_style("darkgrid")
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11.69, 8.27))
    fig.suptitle("alignment projection")

    # all points, coloured by similarity
    df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)

    # NOTE(review): scatter on an empty frame is skipped because some pandas
    # versions appear to reject it ("requires x column to be numeric") --
    # confirm against the pinned pandas version.
    # clustered
    if clustered.any():
        df_[clustered].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)

    # outliers
    if outliers.any():
        df_[outliers].plot.scatter(
            "x", "y", c="r", marker="x", alpha=0.6, ax=ax0
        )

    ax0.set_xlabel("")
    ax0.set_ylabel(ylabel)
    ax0.set_title("max similarity along columns (outliers denoted by 'x')")

    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ratio = round(clustered.sum() / len(df_), 2)
    ax1.set_title(f"potential aligned pairs ({ratio:.0%})")

    # Both plots share the same extents: x spans the number of rows.
    xlim = len(df_)
    for axis in (ax0, ax1):
        axis.set_xlim(0, xlim)
        if ylim:
            axis.set_ylim(0, ylim)

    return plt