Spaces:

Yuichiroh
/

UOT

Sleeping

App Files Files Community

4kasha committed on Feb 13

Commit

37d364a

•

1 Parent(s): 94f5fd3

update

Browse files

Files changed (5) hide show

aligner.py +8 -19
app.py +85 -42
otfuncs.py +28 -14
plotools.py +137 -75
requirements.txt +2 -1

aligner.py CHANGED Viewed

@@ -10,10 +10,9 @@ from otfuncs import (
 )
 class Aligner:
-    def __init__(self, ot_type, sinkhorn, chimera, dist_type, weight_type, distortion, thresh, tau, **kwargs):
         self.ot_type = ot_type
         self.sinkhorn = sinkhorn
-        self.chimera = chimera
         self.dist_type = dist_type
         self.weight_type = weight_type
         self.distotion = distortion
@@ -31,20 +30,19 @@ class Aligner:
             self.weight_func = compute_weights_norm
     def compute_alignment_matrixes(self, s1_word_embeddigs, s2_word_embeddigs):
-        P, Cost, log, similarity_matrix = self.compute_optimal_transport(s1_word_embeddigs, s2_word_embeddigs)
         print(log.keys())
         if torch.is_tensor(P):
             P = P.to('cpu').numpy()
         loss = log.get('cost', 'NotImplemented')
-        return P, Cost, loss, similarity_matrix
     def compute_optimal_transport(self, s1_word_embeddigs, s2_word_embeddigs):
         s1_word_embeddigs = s1_word_embeddigs.to(torch.float64)
         s2_word_embeddigs = s2_word_embeddigs.to(torch.float64)
-        C, similarity_matrix = self.dist_func(s1_word_embeddigs, s2_word_embeddigs, self.distotion)
         s1_weights, s2_weights = self.weight_func(s1_word_embeddigs, s2_word_embeddigs)
         if self.ot_type == 'ot':
@@ -64,14 +62,8 @@ class Aligner:
             P = min_max_scaling(P)
         elif self.ot_type == 'pot':
-            if self.chimera:
-                m = self.tau * self.bertscore_F1(s1_word_embeddigs, s2_word_embeddigs)
-                m = min(1.0, m.item())
-            else:
-                m = self.tau
             s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)
-            m = np.min((np.sum(s1_weights), np.sum(s2_weights))) * m
             if self.sinkhorn:
                 P, log = ot.partial.entropic_partial_wasserstein(
@@ -86,10 +78,7 @@ class Aligner:
             P = min_max_scaling(P)
         elif 'uot' in self.ot_type:
-            if self.chimera:
-                tau = self.tau * self.bertscore_F1(s1_word_embeddigs, s2_word_embeddigs)
-            else:
-                tau = self.tau
             if self.ot_type == 'uot':
                 P, log = ot.unbalanced.sinkhorn_stabilized_unbalanced(
@@ -107,7 +96,7 @@ class Aligner:
         elif self.ot_type == 'none':
             P = 1 - C
-        return P, C, log, similarity_matrix
     def convert_to_numpy(self, s1_weights, s2_weights, C):
         if torch.is_tensor(s1_weights):
@@ -116,4 +105,4 @@ class Aligner:
         if torch.is_tensor(C):
             C = C.to('cpu').numpy()
-        return s1_weights, s2_weights, C

 )
 class Aligner:
+    def __init__(self, ot_type, sinkhorn, dist_type, weight_type, distortion, thresh, tau, **kwargs):
         self.ot_type = ot_type
         self.sinkhorn = sinkhorn
         self.dist_type = dist_type
         self.weight_type = weight_type
         self.distotion = distortion
             self.weight_func = compute_weights_norm
     def compute_alignment_matrixes(self, s1_word_embeddigs, s2_word_embeddigs):
+        P, Cost, log, similarity_matrix, relative_distance = self.compute_optimal_transport(s1_word_embeddigs, s2_word_embeddigs)
         print(log.keys())
         if torch.is_tensor(P):
             P = P.to('cpu').numpy()
         loss = log.get('cost', 'NotImplemented')
+        return P, Cost, loss, similarity_matrix, relative_distance
     def compute_optimal_transport(self, s1_word_embeddigs, s2_word_embeddigs):
         s1_word_embeddigs = s1_word_embeddigs.to(torch.float64)
         s2_word_embeddigs = s2_word_embeddigs.to(torch.float64)
+        C, similarity_matrix, relative_distance = self.dist_func(s1_word_embeddigs, s2_word_embeddigs, self.distotion)
         s1_weights, s2_weights = self.weight_func(s1_word_embeddigs, s2_word_embeddigs)
         if self.ot_type == 'ot':
             P = min_max_scaling(P)
         elif self.ot_type == 'pot':
             s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)
+            m = np.min((np.sum(s1_weights), np.sum(s2_weights))) * self.tau
             if self.sinkhorn:
                 P, log = ot.partial.entropic_partial_wasserstein(
             P = min_max_scaling(P)
         elif 'uot' in self.ot_type:
+            tau = self.tau
             if self.ot_type == 'uot':
                 P, log = ot.unbalanced.sinkhorn_stabilized_unbalanced(
         elif self.ot_type == 'none':
             P = 1 - C
+        return P, C, log, similarity_matrix, relative_distance
     def convert_to_numpy(self, s1_weights, s2_weights, C):
         if torch.is_tensor(s1_weights):
         if torch.is_tensor(C):
             C = C.to('cpu').numpy()
+        return s1_weights, s2_weights, C

app.py CHANGED Viewed

@@ -1,44 +1,53 @@
-import streamlit as st
 import random
 import numpy as np
 import torch
 from nltk.tokenize import word_tokenize
-from transformers import AutoTokenizer, AutoModel
 from aligner import Aligner
-from utils import (
-    encode_sentence,
-    centering,
-    convert_to_word_embeddings
 )
-from plotools import plot_align_matrix_heatmap_plotly, plot_similarity_matrix_heatmap_plotly
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(42)
 np.random.seed(42)
 random.seed(42)
 import nltk
-nltk.download('punkt')
 @st.cache_resource
 def init_model(model: str):
     tokenizer = AutoTokenizer.from_pretrained(model)
-    model = AutoModel.from_pretrained(model, output_hidden_states=True).to(device).eval()
     return tokenizer, model
 @st.cache_resource(max_entries=100)
-def init_aligner(ot_type: str, sinkhorn: bool, distortion: float, threshhold: float, tau: float):
     return Aligner(
         ot_type=ot_type,
         sinkhorn=sinkhorn,
-        chimera=False,
         dist_type="cos",
         weight_type="uniform",
         distortion=distortion,
-        thresh=threshhold,
-        tau=tau,
-        div_type="--"
     )
@@ -47,51 +56,70 @@ def main():
     # Sidebar
     st.sidebar.markdown("## Settings & Parameters")
-    model = st.sidebar.selectbox('model', ['microsoft/deberta-v3-base', 'bert-base-uncased'])
     layer = st.sidebar.slider(
-        'layer number for embeddings', 0, 11, value=9,
     )
-    is_centering = st.sidebar.checkbox('centering embeddings', value=True)
     ot_type = st.sidebar.selectbox(
-        'ot_type', ['OT', 'POT', 'UOT'],
-        help="optimal transport algorithm to be used"
     )
     ot_type = ot_type.lower()
     sinkhorn = st.sidebar.checkbox(
-        'sinkhorn', value=True,
-        help="use sinkhorn algorithm"
     )
     distortion = st.sidebar.slider(
-        'distortion: $\kappa$', 0.0, 1.0, value=0.20,
-        help="suppression of off-diagonal alignments"
     )
     tau = st.sidebar.slider(
-        'm / $\\tau$', 0.0, 1.0, value=0.98,
-        help="fraction of fertility to be aligned (fraction of mass to be transported) / penalties"
-    )
     threshhold = st.sidebar.slider(
-        'threshhold: $\lambda$', 0.0, 1.0, value=0.22,
-        help="sparsity of alignment matrix"
-    )
     # Content
-    st.markdown('## Playground: Unbalanced Optimal Transport for Unbalanced Word Alignment')
     col1, col2 = st.columns(2)
     with col1:
-      sent1 = st.text_area(
-          'sentence 1',
-          'By one estimate, fewer than 20,000 lions exist in the wild, a drop of about 40 percent in the past two decades.',
-          help="Initial text"
-      )
     with col2:
-      sent2 = st.text_area(
-          'sentence 2',
-          'Today there are only around 20,000 wild lions left in the world.',
-          help="Text to compare"
-      )
     tokenizer, model = init_model(model)
     aligner = init_aligner(ot_type, sinkhorn, distortion, threshhold, tau)
@@ -115,10 +143,25 @@ def main():
         st.write(f"**word similarity matrix**")
         fig2 = plot_similarity_matrix_heatmap_plotly(similarity_matrix.T, sent1, sent2, cost_matrix.T)
         st.plotly_chart(fig2, use_container_width=True)
     st.divider()
     st.subheader('Refs')
     st.write("Yuki Arase, Han Bao, Sho Yokoi, [Unbalanced Optimal Transport for Unbalanced Word Alignment](https://arxiv.org/abs/2306.04116), ACL2023 [[github](https://github.com/yukiar/OTAlign/tree/main)]")
 if __name__ == '__main__':
-    main()

 import random
 import numpy as np
+import streamlit as st
 import torch
+import umap
 from nltk.tokenize import word_tokenize
+from transformers import AutoModel, AutoTokenizer
 from aligner import Aligner
+# from utils import align_matrix_heatmap, plot_align_matrix_heatmap
+from plotools import (
+    plot_align_matrix_heatmap_plotly,
+    plot_similarity_matrix_heatmap_plotly,
+    show_assignments_plotly,
 )
+from utils import centering, convert_to_word_embeddings, encode_sentence
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(42)
 np.random.seed(42)
 random.seed(42)
 import nltk
+nltk.download("punkt")
 @st.cache_resource
 def init_model(model: str):
     tokenizer = AutoTokenizer.from_pretrained(model)
+    model = (
+        AutoModel.from_pretrained(model, output_hidden_states=True).to(device).eval()
+    )
     return tokenizer, model
 @st.cache_resource(max_entries=100)
+def init_aligner(
+    ot_type: str, sinkhorn: bool, distortion: float, threshhold: float, tau: float
+):
     return Aligner(
         ot_type=ot_type,
         sinkhorn=sinkhorn,
         dist_type="cos",
         weight_type="uniform",
         distortion=distortion,
+        thresh=threshhold,  # 0.25252525252525254
+        tau=tau,  # 0.9803921568627451
+        div_type="--",
     )
     # Sidebar
     st.sidebar.markdown("## Settings & Parameters")
+    model = st.sidebar.selectbox(
+        "model", ["microsoft/deberta-v3-base", "bert-base-uncased"]
+    )
     layer = st.sidebar.slider(
+        "layer number for embeddings",
+        0,
+        11,
+        value=9,
     )
+    is_centering = st.sidebar.checkbox("centering embeddings", value=True)
     ot_type = st.sidebar.selectbox(
+        "ot_type", ["POT", "UOT", "OT"], help="optimal transport algorithm to be used"
     )
     ot_type = ot_type.lower()
     sinkhorn = st.sidebar.checkbox(
+        "sinkhorn", value=True, help="use sinkhorn algorithm"
     )
     distortion = st.sidebar.slider(
+        "distortion: $\kappa$",
+        0.0,
+        1.0,
+        value=0.20,
+        help="suppression of off-diagonal alignments",
     )
     tau = st.sidebar.slider(
+        "m / $\\tau$",
+        0.0,
+        1.0,
+        value=0.98,
+        help="fraction of fertility to be aligned (fraction of mass to be transported) / penalties",
+    )  # with 0.02 interval
     threshhold = st.sidebar.slider(
+        "threshhold: $\lambda$",
+        0.0,
+        1.0,
+        value=0.22,
+        help="sparsity of alignment matrix",
+    )  # with 0.01 interval
+    show_assignments = st.sidebar.checkbox("show assignments", value=True)
+    if show_assignments:
+        n_neighbors = st.sidebar.slider(
+            "n_neighbors", 2, 10, value=8, help="number of neighbors for umap"
+        )
     # Content
+    st.markdown(
+        "## Playground: Unbalanced Optimal Transport for Unbalanced Word Alignment"
+    )
     col1, col2 = st.columns(2)
     with col1:
+        sent1 = st.text_area(
+            "sentence 1",
+            "By one estimate, fewer than 20,000 lions exist in the wild, a drop of about 40 percent in the past two decades.",
+            help="Initial text",
+        )
     with col2:
+        sent2 = st.text_area(
+            "sentence 2",
+            "Today there are only around 20,000 wild lions left in the world.",
+            help="Text to compare",
+        )
     tokenizer, model = init_model(model)
     aligner = init_aligner(ot_type, sinkhorn, distortion, threshhold, tau)
         st.write(f"**word similarity matrix**")
         fig2 = plot_similarity_matrix_heatmap_plotly(similarity_matrix.T, sent1, sent2, cost_matrix.T)
         st.plotly_chart(fig2, use_container_width=True)
+        if show_assignments:
+            st.write(f"**Alignments after UMAP**")
+            word_embeddings = torch.vstack([s1_vec, s2_vec])
+            umap_embeddings = umap.UMAP(
+                n_neighbors=n_neighbors,
+                n_components=2,
+                random_state=42,
+                metric="cosine",
+            ).fit_transform(word_embeddings.detach().numpy())
+            print(umap_embeddings.shape)
+            fig3 = show_assignments_plotly(
+                align_matrix, umap_embeddings, sent1, sent2, thr=threshhold
+            )
+            st.plotly_chart(fig3, use_container_width=True)
     st.divider()
     st.subheader('Refs')
     st.write("Yuki Arase, Han Bao, Sho Yokoi, [Unbalanced Optimal Transport for Unbalanced Word Alignment](https://arxiv.org/abs/2306.04116), ACL2023 [[github](https://github.com/yukiar/OTAlign/tree/main)]")
 if __name__ == '__main__':
+    main()

otfuncs.py CHANGED Viewed

@@ -1,17 +1,22 @@
-import numpy as np
 import torch
 import torch.nn.functional as F
 from ot.backend import get_backend
 device = "cuda" if torch.cuda.is_available() else "cpu"
-def compute_distance_matrix_cosine(s1_word_embeddigs, s2_word_embeddigs, distortion_ratio):
-    sim_matrix = (torch.matmul(F.normalize(s1_word_embeddigs), F.normalize(s2_word_embeddigs).t()) + 1.0) / 2  # Range 0-1
-    C = apply_distortion(sim_matrix, distortion_ratio)
     C = min_max_scaling(C)  # Range 0-1
     C = 1.0 - C  # Convert to distance
-    return C, sim_matrix
 def compute_distance_matrix_l2(s1_word_embeddigs, s2_word_embeddigs, distortion_ratio):
@@ -30,15 +35,20 @@ def apply_distortion(sim_matrix, ratio):
     if (shape[0] < 2 or shape[1] < 2) or ratio == 0.0:
         return sim_matrix
-    pos_x = torch.tensor([[y / float(shape[1] - 1) for y in range(shape[1])] for x in range(shape[0])],
-                         device=device)
-    pos_y = torch.tensor([[x / float(shape[0] - 1) for x in range(shape[0])] for y in range(shape[1])],
-                         device=device)
-    distortion_mask = 1.0 - ((pos_x - pos_y.T) ** 2) * ratio
     sim_matrix = torch.mul(sim_matrix, distortion_mask)
-    return sim_matrix
 def compute_weights_norm(s1_word_embeddigs, s2_word_embeddigs):
@@ -48,8 +58,12 @@ def compute_weights_norm(s1_word_embeddigs, s2_word_embeddigs):
 def compute_weights_uniform(s1_word_embeddigs, s2_word_embeddigs):
-    s1_weights = torch.ones(s1_word_embeddigs.shape[0], dtype=torch.float64, device=device)
-    s2_weights = torch.ones(s2_word_embeddigs.shape[0], dtype=torch.float64, device=device)
     # # Uniform weights to make L2 norm=1
     # s1_weights /= torch.linalg.norm(s1_weights)
@@ -65,4 +79,4 @@ def min_max_scaling(C):
     C_min = nx.min(C)
     C_max = nx.max(C)
     C = (C - C_min + eps) / (C_max - C_min + eps)
-    return C

 import torch
 import torch.nn.functional as F
 from ot.backend import get_backend
 device = "cuda" if torch.cuda.is_available() else "cpu"
+def compute_distance_matrix_cosine(
+    s1_word_embeddigs, s2_word_embeddigs, distortion_ratio
+):
+    sim_matrix = (
+        torch.matmul(F.normalize(s1_word_embeddigs), F.normalize(s2_word_embeddigs).t())
+        + 1.0
+    ) / 2  # Range 0-1
+    C, relative_distance = apply_distortion(sim_matrix, distortion_ratio)
     C = min_max_scaling(C)  # Range 0-1
     C = 1.0 - C  # Convert to distance
+    return C, sim_matrix, relative_distance
 def compute_distance_matrix_l2(s1_word_embeddigs, s2_word_embeddigs, distortion_ratio):
     if (shape[0] < 2 or shape[1] < 2) or ratio == 0.0:
         return sim_matrix
+    pos_x = torch.tensor(
+        [[y / float(shape[1] - 1) for y in range(shape[1])] for x in range(shape[0])],
+        device=device,
+    )
+    pos_y = torch.tensor(
+        [[x / float(shape[0] - 1) for x in range(shape[0])] for y in range(shape[1])],
+        device=device,
+    )
+    relative_distance = (pos_x - pos_y.T) ** 2
+    distortion_mask = 1.0 - relative_distance * ratio
     sim_matrix = torch.mul(sim_matrix, distortion_mask)
+    return sim_matrix, relative_distance
 def compute_weights_norm(s1_word_embeddigs, s2_word_embeddigs):
 def compute_weights_uniform(s1_word_embeddigs, s2_word_embeddigs):
+    s1_weights = torch.ones(
+        s1_word_embeddigs.shape[0], dtype=torch.float64, device=device
+    )
+    s2_weights = torch.ones(
+        s2_word_embeddigs.shape[0], dtype=torch.float64, device=device
+    )
     # # Uniform weights to make L2 norm=1
     # s1_weights /= torch.linalg.norm(s1_weights)
     C_min = nx.min(C)
     C_max = nx.max(C)
     C = (C - C_min + eps) / (C_max - C_min + eps)
+    return C

plotools.py CHANGED Viewed

@@ -8,74 +8,79 @@ def _debug_non_unique_axis_values(sent1: list[str], sent2: list[str]):
         using zero-width-space
     cf. https://github.com/plotly/plotly.js/issues/1516#issuecomment-983090013
     """
-    sent1 = [word + i*'\u200b' for i, word in enumerate(sent1)]
-    sent2 = [word + i*'\u200b' for i, word in enumerate(sent2)]
     return sent1, sent2
 def discrete_colorscale(bvals, colors):
     """
     bvals - list of values bounding intervals/ranges of interest
-    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0 <= k < len(bvals)-1
     returns the plotly discrete colorscale
     ref. https://community.plotly.com/t/colors-for-discrete-ranges-in-heatmaps/7780
     """
-    if len(bvals) != len(colors)+1:
-        raise ValueError('len(boundary values) should be equal to len(colors)+1')
-    bvals = sorted(bvals)
-    nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals]  #normalized values
-    dcolorscale = [] #discrete colorscale
     for k in range(len(colors)):
-        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
-    return dcolorscale
 def plot_align_matrix_heatmap_plotly(align_matrix, sent1, sent2, threshhold, Cost):
     align_matrix = np.where(align_matrix <= threshhold, 0, align_matrix)
     sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
-    _colors = ['#F2F2F2', '#E0F4FA', '#BEE4F0', '#88CCE5', '#33b7df', '#1B88A6', '#105264', '#092E39']
     _ticks = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
     colorscale = discrete_colorscale(_ticks, _colors)
     fig = go.Figure()
-    fig.add_trace(go.Heatmap(
-        z=align_matrix,
-        customdata=Cost,
-        x=sent1,
-        y=sent2,
-        xgap=2,
-        ygap=2,
-        colorscale=colorscale,
-        colorbar=dict(
-            tick0=0,
-            dtick=0.125,
-            outlinewidth=0
-        ),
-        hovertemplate=
-        'x: %{x}<br>' +
-        'y: %{y}<br>' +
-        'P: %{z:.3f}<br>' +
-        'cost: %{customdata:.3f} ',
-        name=''
-    ))
     fig.update_layout(
-        #xaxis=dict(scaleanchor='y'),
-        yaxis=dict(autorange='reversed'),
-        margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
-        plot_bgcolor='rgba(0,0,0,0)',
         font=dict(
             size=16,
         ),
         hoverlabel=dict(
-            bgcolor="#555",
-            font_color="white",
-            font_size=14,
-            font_family="Open Sans"
-        )
     )
     fig.update_xaxes(
         tickangle=-45,
@@ -83,47 +88,104 @@ def plot_align_matrix_heatmap_plotly(align_matrix, sent1, sent2, threshhold, Cos
     return fig
-def plot_similarity_matrix_heatmap_plotly(similarity_matrix, sent1, sent2, Cost):
     sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
     fig = go.Figure()
-    fig.add_trace(go.Heatmap(
-        z=similarity_matrix,
-        customdata=Cost,
-        x=sent1,
-        y=sent2,
-        xgap=2,
-        ygap=2,
-        colorscale="Reds",
-        colorbar=dict(
-            tick0=0,
-            dtick=0.125,
-            outlinewidth=0
-        ),
-        hovertemplate=
-        'x: %{x}<br>' +
-        'y: %{y}<br>' +
-        'cosine: %{z:.3f}<br>' +
-        'cost: %{customdata:.3f} ',
-        name=''
-    ))
     fig.update_layout(
-        #xaxis=dict(scaleanchor='y'),
-        yaxis=dict(autorange='reversed'),
-        margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
-        plot_bgcolor='rgba(0,0,0,0)',
         font=dict(
             size=16,
         ),
         hoverlabel=dict(
-            bgcolor="#555",
-            font_color="white",
-            font_size=14,
-            font_family="Open Sans"
-        )
     )
     fig.update_xaxes(
         tickangle=-45,
     )
-    return fig

         using zero-width-space
     cf. https://github.com/plotly/plotly.js/issues/1516#issuecomment-983090013
     """
+    sent1 = [word + i * "\u200b" for i, word in enumerate(sent1)]
+    sent2 = [word + i * "\u200b" for i, word in enumerate(sent2)]
     return sent1, sent2
 def discrete_colorscale(bvals, colors):
     """
     bvals - list of values bounding intervals/ranges of interest
+    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]],0<=k < len(bvals)-1
     returns the plotly discrete colorscale
     ref. https://community.plotly.com/t/colors-for-discrete-ranges-in-heatmaps/7780
     """
+    if len(bvals) != len(colors) + 1:
+        raise ValueError("len(boundary values) should be equal to  len(colors)+1")
+    bvals = sorted(bvals)
+    nvals = [
+        (v - bvals[0]) / (bvals[-1] - bvals[0]) for v in bvals
+    ]  # normalized values
+    dcolorscale = []  # discrete colorscale
     for k in range(len(colors)):
+        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k + 1], colors[k]]])
+    return dcolorscale
 def plot_align_matrix_heatmap_plotly(align_matrix, sent1, sent2, threshhold, Cost):
     align_matrix = np.where(align_matrix <= threshhold, 0, align_matrix)
     sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
+    _colors = [
+        "#F2F2F2",
+        "#E0F4FA",
+        "#BEE4F0",
+        "#88CCE5",
+        "#33b7df",
+        "#1B88A6",
+        "#105264",
+        "#092E39",
+    ]
     _ticks = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
     colorscale = discrete_colorscale(_ticks, _colors)
     fig = go.Figure()
+    fig.add_trace(
+        go.Heatmap(
+            z=align_matrix,
+            customdata=Cost,
+            x=sent1,
+            y=sent2,
+            xgap=2,
+            ygap=2,
+            colorscale=colorscale,
+            colorbar=dict(tick0=0, dtick=0.125, outlinewidth=0),
+            hovertemplate="x: %{x}<br>"
+            + "y: %{y}<br>"
+            + "P: %{z:.3f}<br>"
+            + "cost: %{customdata:.3f} ",
+            name="",
+        )
+    )
     fig.update_layout(
+        # xaxis=dict(scaleanchor='y'),
+        yaxis=dict(autorange="reversed"),
+        margin={"l": 0, "r": 0, "t": 0, "b": 0},
+        plot_bgcolor="rgba(0,0,0,0)",
         font=dict(
             size=16,
         ),
         hoverlabel=dict(
+            bgcolor="#555", font_color="white", font_size=14, font_family="Open Sans"
+        ),
     )
     fig.update_xaxes(
         tickangle=-45,
     return fig
+def plot_similarity_matrix_heatmap_plotly(
+    similarity_matrix, sent1, sent2, Cost, colorscale="Reds", hover_z="cosine"
+):
     sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
     fig = go.Figure()
+    fig.add_trace(
+        go.Heatmap(
+            z=similarity_matrix,
+            customdata=Cost,
+            x=sent1,
+            y=sent2,
+            xgap=2,
+            ygap=2,
+            colorscale=colorscale,
+            colorbar=dict(tick0=0, dtick=0.125, outlinewidth=0),
+            hovertemplate="x: %{x}<br>"
+            + "y: %{y}<br>"
+            + f"{hover_z}: "
+            + "%{z:.3f}<br>"
+            + "cost: %{customdata:.3f} ",
+            name="",
+        )
+    )
     fig.update_layout(
+        # xaxis=dict(scaleanchor='y'),
+        yaxis=dict(autorange="reversed"),
+        margin={"l": 0, "r": 0, "t": 0, "b": 0},
+        plot_bgcolor="rgba(0,0,0,0)",
         font=dict(
             size=16,
         ),
         hoverlabel=dict(
+            bgcolor="#555", font_color="white", font_size=14, font_family="Open Sans"
+        ),
     )
     fig.update_xaxes(
         tickangle=-45,
     )
+    return fig
+def show_assignments_plotly(P, word_embeddings, sents1, sents2, thr=0):
+    P = np.where(P <= thr, 0, P)
+    s1_end = len(sents1)
+    a = word_embeddings[:s1_end]
+    b = word_embeddings[s1_end:]
+    traces = []
+    sample = 0
+    for i in range(a.shape[0]):
+        for j in range(b.shape[0]):
+            if P[i, j] > 0:
+                sample += 1
+                traces.append(
+                    go.Scatter(
+                        x=[a[i, 0], b[j, 0]],
+                        y=[a[i, 1], b[j, 1]],
+                        mode="lines",
+                        line=dict(color="black", width=P[i, j] * 2),
+                        opacity=P[i, j],
+                        name=f"{sample}",
+                    )
+                )
+    # Draw the source samples
+    traces.append(
+        go.Scatter(
+            x=a[:, 0],
+            y=a[:, 1],
+            mode="markers+text",
+            marker=dict(color="blue", size=8, symbol="cross"),
+            text=sents1,
+            textposition="top center",
+            name="Source samples",
+        )
+    )
+    # Draw the target samples
+    traces.append(
+        go.Scatter(
+            x=b[:, 0],
+            y=b[:, 1],
+            mode="markers+text",
+            marker=dict(color="red", size=8, symbol="x"),
+            text=sents2,
+            textposition="bottom center",
+            name="Target samples",
+        )
+    )
+    layout = go.Layout(
+        showlegend=True,
+        margin=dict(l=0, r=0, t=10, b=0),
+    )
+    fig = go.Figure(data=traces, layout=layout)
+    return fig

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ transformers==4.30.2
 matplotlib==3.7.1
 plotly==5.15.0
 torch==2.0.1
-nltk==3.8.1

 matplotlib==3.7.1
 plotly==5.15.0
 torch==2.0.1
+nltk==3.8.1
+umap-learn==0.5.5