4kasha committed
Commit: 94f5fd3
1 parent: e7088f8

update demo

Files changed:
- aligner.py: +34 -47
- app.py: +45 -91
- otfuncs.py: +68 -0
- plotools.py: +129 -0
- requirements.txt: +4 -3
- utils.py: +64 -100
aligner.py CHANGED

@@ -1,7 +1,7 @@
 import numpy as np
 import torch
 import ot
-from utils import (
+from otfuncs import (
     compute_distance_matrix_cosine,
     compute_distance_matrix_l2,
     compute_weights_norm,
@@ -30,55 +30,36 @@ class Aligner:
         else:
             self.weight_func = compute_weights_norm

-    def compute_alignment_matrixes(self, …
-        self.…
-
-    def get_alignments(self, thresh, assign_cost=False):
-        assert len(self.align_matrixes) > 0
-
-        self.thresh = thresh
-        all_alignments = []
-        for P in self.align_matrixes:
-            alignments = self.matrix_to_alignments(P, assign_cost)
-            all_alignments.append(alignments)
-
-        return all_alignments
-
-    def matrix_to_alignments(self, P, assign_cost):
-        alignments = set()
-        align_pairs = np.transpose(np.nonzero(P > self.thresh))
-        if assign_cost:
-            for i_j in align_pairs:
-                alignments.add('{0}-{1}-{2:.4f}'.format(i_j[0], i_j[1], P[i_j[0], i_j[1]]))
-        else:
-            for i_j in align_pairs:
-                alignments.add('{0}-{1}'.format(i_j[0], i_j[1]))
-
-        return alignments
+    def compute_alignment_matrixes(self, s1_word_embeddigs, s2_word_embeddigs):
+        P, Cost, log, similarity_matrix = self.compute_optimal_transport(s1_word_embeddigs, s2_word_embeddigs)
+        print(log.keys())
+        if torch.is_tensor(P):
+            P = P.to('cpu').numpy()
+        loss = log.get('cost', 'NotImplemented')
+
+        return P, Cost, loss, similarity_matrix

     def compute_optimal_transport(self, s1_word_embeddigs, s2_word_embeddigs):
         s1_word_embeddigs = s1_word_embeddigs.to(torch.float64)
         s2_word_embeddigs = s2_word_embeddigs.to(torch.float64)

-        C = self.dist_func(s1_word_embeddigs, s2_word_embeddigs, self.distotion)
+        C, similarity_matrix = self.dist_func(s1_word_embeddigs, s2_word_embeddigs, self.distotion)
         s1_weights, s2_weights = self.weight_func(s1_word_embeddigs, s2_word_embeddigs)

         if self.ot_type == 'ot':
             s1_weights = s1_weights / s1_weights.sum()
             s2_weights = s2_weights / s2_weights.sum()
-            s1_weights, s2_weights, C = self.…
+            s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)

             if self.sinkhorn:
-                P = ot.bregman.sinkhorn_log(…
+                P, log = ot.bregman.sinkhorn_log(
+                    s1_weights, s2_weights, C,
+                    reg=self.epsilon, stopThr=self.stopThr,
+                    numItermax=self.numItermax, log=True
+                )
             else:
-                P = ot.emd(s1_weights, s2_weights, C)
+                P, log = ot.emd(s1_weights, s2_weights, C, log=True)
             # Min-max normalization
             P = min_max_scaling(P)

@@ -89,16 +70,18 @@ class Aligner:
             else:
                 m = self.tau

-            s1_weights, s2_weights, C = self.…
+            s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)
             m = np.min((np.sum(s1_weights), np.sum(s2_weights))) * m

             if self.sinkhorn:
-                P = ot.partial.entropic_partial_wasserstein(…
+                P, log = ot.partial.entropic_partial_wasserstein(
+                    s1_weights, s2_weights, C,
+                    reg=self.epsilon,
+                    m=m, stopThr=self.stopThr, numItermax=self.numItermax, log=True
+                )
             else:
                 # To cope with round error
-                P = ot.partial.partial_wasserstein(s1_weights, s2_weights, C, m=m)
+                P, log = ot.partial.partial_wasserstein(s1_weights, s2_weights, C, m=m, log=True)
             # Min-max normalization
             P = min_max_scaling(P)

@@ -109,20 +92,24 @@ class Aligner:
                 tau = self.tau

             if self.ot_type == 'uot':
-                P = ot.unbalanced.sinkhorn_stabilized_unbalanced(…
+                P, log = ot.unbalanced.sinkhorn_stabilized_unbalanced(
+                    s1_weights, s2_weights, C, reg=self.epsilon, reg_m=tau,
+                    stopThr=self.stopThr, numItermax=self.numItermax, log=True
+                )
             elif self.ot_type == 'uot-mm':
-                P = ot.unbalanced.mm_unbalanced(…
+                P, log = ot.unbalanced.mm_unbalanced(
+                    s1_weights, s2_weights, C, reg_m=tau, div=self.div_type,
+                    stopThr=self.stopThr, numItermax=self.numItermax, log=True
+                )
             # Min-max normalization
             P = min_max_scaling(P)

         elif self.ot_type == 'none':
             P = 1 - C

-        return P
+        return P, C, log, similarity_matrix

-    def …
+    def convert_to_numpy(self, s1_weights, s2_weights, C):
         if torch.is_tensor(s1_weights):
             s1_weights = s1_weights.to('cpu').numpy()
             s2_weights = s2_weights.to('cpu').numpy()
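Every POT solver call in this commit switches to the library's log=True variants so the demo can surface the transport loss next to the heatmap. A minimal sketch of that return convention, with toy marginals and a toy cost matrix invented for illustration; note that only ot.emd reports a 'cost' key, while sinkhorn_log's log dict carries the dual variables instead, which is why the code above falls back to 'NotImplemented':

# Sketch only (not part of the commit): toy marginals/cost, POT==0.9.0 API.
import numpy as np
import ot

a = np.array([0.5, 0.5])         # source marginal (sums to 1)
b = np.array([0.3, 0.3, 0.4])    # target marginal (sums to 1)
C = np.random.rand(2, 3)         # toy cost matrix

# Exact OT: the log dict includes the transport cost under 'cost'.
P, log = ot.emd(a, b, C, log=True)
print(P.shape, log['cost'])

# Entropic OT in log-space (the 'sinkhorn' checkbox path): no 'cost' key,
# hence the demo's log.get('cost', 'NotImplemented') fallback.
P, log = ot.bregman.sinkhorn_log(a, b, C, reg=0.1, log=True)
print(log.keys())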
app.py CHANGED

@@ -2,14 +2,22 @@ import streamlit as st
 import random
 import numpy as np
 import torch
+from nltk.tokenize import word_tokenize
 from transformers import AutoTokenizer, AutoModel
 from aligner import Aligner
-from utils import …
+from utils import (
+    encode_sentence,
+    centering,
+    convert_to_word_embeddings
+)
+from plotools import plot_align_matrix_heatmap_plotly, plot_similarity_matrix_heatmap_plotly

 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch.manual_seed(42)
 np.random.seed(42)
 random.seed(42)
+import nltk
+nltk.download('punkt')


 @st.cache_resource
@@ -34,72 +42,6 @@ def init_aligner(ot_type: str, sinkhorn: bool, distortion: float, threshhold: fl
     )


-def encode_sentence(sent, pair, tokenizer, model, layer: int):
-    if pair == None:
-        inputs = tokenizer(sent, padding=False, truncation=False, is_split_into_words=True, return_offsets_mapping=True,
-                           return_tensors="pt")
-        with torch.no_grad():
-            outputs = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device),
-                            inputs['token_type_ids'].to(device))
-    else:
-        inputs = tokenizer(text=sent, text_pair=pair, padding=False, truncation=True,
-                           is_split_into_words=True,
-                           return_offsets_mapping=True, return_tensors="pt")
-        with torch.no_grad():
-            outputs = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device),
-                            inputs['token_type_ids'].to(device))
-
-    return outputs.hidden_states[layer][0], inputs['input_ids'][0], inputs['offset_mapping'][0]
-
-
-def centering(hidden_outputs):
-    """
-    hidden_outputs : [tokens, hidden_size]
-    """
-    # Sum the embeddings over all tokens and compute their mean vector
-    mean_vec = torch.sum(hidden_outputs, dim=0) / hidden_outputs.shape[0]
-    hidden_outputs = hidden_outputs - mean_vec
-    print(hidden_outputs.shape)
-    return hidden_outputs
-
-
-def convert_to_word_embeddings(offset_mapping, token_ids, hidden_tensors, tokenizer, pair):
-    word_idx = -1
-    subword_to_word_conv = np.full((hidden_tensors.shape[0]), -1)
-    # Bug in hugging face tokenizer? Sometimes Metaspace is inserted
-    metaspace = getattr(tokenizer.decoder, "replacement", None)
-    metaspace = tokenizer.decoder.prefix if metaspace is None else metaspace
-    tokenizer_bug_idxes = [i for i, x in enumerate(tokenizer.convert_ids_to_tokens(token_ids)) if
-                           x == metaspace]
-
-    for subw_idx, offset in enumerate(offset_mapping):
-        if subw_idx in tokenizer_bug_idxes:
-            continue
-        elif offset[0] == offset[1]:  # Special token
-            continue
-        elif offset[0] == 0:
-            word_idx += 1
-            subword_to_word_conv[subw_idx] = word_idx
-        else:
-            subword_to_word_conv[subw_idx] = word_idx
-
-    word_embeddings = torch.vstack(
-        ([torch.mean(hidden_tensors[subword_to_word_conv == word_idx], dim=0) for word_idx in range(word_idx + 1)]))
-    print(word_embeddings.shape)
-
-    if pair:
-        sep_tok_indices = [i for i, x in enumerate(token_ids) if x == tokenizer.sep_token_id]
-        s2_start_idx = subword_to_word_conv[
-            sep_tok_indices[0] + np.argmax(subword_to_word_conv[sep_tok_indices[0]:] > -1)]
-
-        s1_word_embeddigs = word_embeddings[0:s2_start_idx, :]
-        s2_word_embeddigs = word_embeddings[s2_start_idx:, :]
-
-        return s1_word_embeddigs, s2_word_embeddigs
-    else:
-        return word_embeddings
-
-
 def main():
     st.set_page_config(layout="wide")

@@ -107,21 +49,30 @@ def main():
     st.sidebar.markdown("## Settings & Parameters")
     model = st.sidebar.selectbox('model', ['microsoft/deberta-v3-base', 'bert-base-uncased'])
     layer = st.sidebar.slider(
-        …
+        'layer number for embeddings', 0, 11, value=9,
     )
     is_centering = st.sidebar.checkbox('centering embeddings', value=True)
-    ot_type = st.sidebar.selectbox(…
+    ot_type = st.sidebar.selectbox(
+        'ot_type', ['OT', 'POT', 'UOT'],
+        help="optimal transport algorithm to be used"
+    )
     ot_type = ot_type.lower()
-    sinkhorn = st.sidebar.checkbox(…
+    sinkhorn = st.sidebar.checkbox(
+        'sinkhorn', value=True,
+        help="use sinkhorn algorithm"
+    )
     distortion = st.sidebar.slider(
-        …
+        'distortion: $\kappa$', 0.0, 1.0, value=0.20,
+        help="suppression of off-diagonal alignments"
     )
     tau = st.sidebar.slider(
-        …
+        'm / $\\tau$', 0.0, 1.0, value=0.98,
+        help="fraction of fertility to be aligned (fraction of mass to be transported) / penalties"
+    )
     threshhold = st.sidebar.slider(
-        …
+        'threshhold: $\lambda$', 0.0, 1.0, value=0.22,
+        help="sparsity of alignment matrix"
+    )

     # Content
     st.markdown('## Playground: Unbalanced Optimal Transport for Unbalanced Word Alignment')
@@ -130,39 +81,42 @@ def main():

     with col1:
         sent1 = st.text_area(
-            …
+            'sentence 1',
+            'By one estimate, fewer than 20,000 lions exist in the wild, a drop of about 40 percent in the past two decades.',
+            help="Initial text"
         )
     with col2:
         sent2 = st.text_area(
-            …
+            'sentence 2',
+            'Today there are only around 20,000 wild lions left in the world.',
+            help="Text to compare"
        )

     tokenizer, model = init_model(model)
     aligner = init_aligner(ot_type, sinkhorn, distortion, threshhold, tau)

     with st.container():
-        st.write("word alignment matrix")
-
         if sent1 != '' and sent2 != '':
-            sent1 = sent1.lower()
-            sent2 = sent2.lower()
+            sent1 = word_tokenize(sent1.lower())
+            sent2 = word_tokenize(sent2.lower())
+            print(sent1)
+            print(sent2)
             hidden_output, input_id, offset_map = encode_sentence(sent1, sent2, tokenizer, model, layer=layer)
             if is_centering:
                 hidden_output = centering(hidden_output)
             s1_vec, s2_vec = convert_to_word_embeddings(offset_map, input_id, hidden_output, tokenizer, pair=True)
-            aligner.compute_alignment_matrixes(…
-            align_matrix…
-            print(align_matrix.shape)
+            align_matrix, cost_matrix, loss, similarity_matrix = aligner.compute_alignment_matrixes(s1_vec, s2_vec)
+            print(align_matrix.shape, cost_matrix.shape)

-            fig = …
+            st.write(f"**word alignment matrix** (loss: :blue[{loss}])")
+            fig = plot_align_matrix_heatmap_plotly(align_matrix.T, sent1, sent2, threshhold, cost_matrix.T)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.write(f"**word similarity matrix**")
+            fig2 = plot_similarity_matrix_heatmap_plotly(similarity_matrix.T, sent1, sent2, cost_matrix.T)
+            st.plotly_chart(fig2, use_container_width=True)

     st.divider()
-    st.markdown("Note that the centering in this demo is applied only to the input sentences, so the variance may be large.")
     st.subheader('Refs')
     st.write("Yuki Arase, Han Bao, Sho Yokoi, [Unbalanced Optimal Transport for Unbalanced Word Alignment](https://arxiv.org/abs/2306.04116), ACL2023 [[github](https://github.com/yukiar/OTAlign/tree/main)]")
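The app now pre-tokenizes both inputs with NLTK before handing word lists to the HF tokenizer (is_split_into_words=True), so the heatmap axes are labeled word by word. A quick sketch of that step, reusing the demo's own default sentence (output shown approximately):

# Sketch of the new pre-tokenization step; requires the punkt models.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

sent = 'Today there are only around 20,000 wild lions left in the world.'
print(word_tokenize(sent.lower()))
# ['today', 'there', 'are', 'only', 'around', '20,000', 'wild', 'lions',
#  'left', 'in', 'the', 'world', '.']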
otfuncs.py ADDED

@@ -0,0 +1,68 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from ot.backend import get_backend
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+def compute_distance_matrix_cosine(s1_word_embeddigs, s2_word_embeddigs, distortion_ratio):
+    sim_matrix = (torch.matmul(F.normalize(s1_word_embeddigs), F.normalize(s2_word_embeddigs).t()) + 1.0) / 2  # Range 0-1
+    C = apply_distortion(sim_matrix, distortion_ratio)
+    C = min_max_scaling(C)  # Range 0-1
+    C = 1.0 - C  # Convert to distance
+
+    return C, sim_matrix
+
+
+def compute_distance_matrix_l2(s1_word_embeddigs, s2_word_embeddigs, distortion_ratio):
+    C = torch.cdist(s1_word_embeddigs, s2_word_embeddigs, p=2)
+    C = min_max_scaling(C)  # Range 0-1
+    C = 1.0 - C  # Convert to similarity
+    C = apply_distortion(C, distortion_ratio)
+    C = min_max_scaling(C)  # Range 0-1
+    C = 1.0 - C  # Convert to distance
+
+    return C
+
+
+def apply_distortion(sim_matrix, ratio):
+    shape = sim_matrix.shape
+    if (shape[0] < 2 or shape[1] < 2) or ratio == 0.0:
+        return sim_matrix
+
+    pos_x = torch.tensor([[y / float(shape[1] - 1) for y in range(shape[1])] for x in range(shape[0])],
+                         device=device)
+    pos_y = torch.tensor([[x / float(shape[0] - 1) for x in range(shape[0])] for y in range(shape[1])],
+                         device=device)
+    distortion_mask = 1.0 - ((pos_x - pos_y.T) ** 2) * ratio
+
+    sim_matrix = torch.mul(sim_matrix, distortion_mask)
+
+    return sim_matrix
+
+
+def compute_weights_norm(s1_word_embeddigs, s2_word_embeddigs):
+    s1_weights = torch.norm(s1_word_embeddigs, dim=1)
+    s2_weights = torch.norm(s2_word_embeddigs, dim=1)
+    return s1_weights, s2_weights
+
+
+def compute_weights_uniform(s1_word_embeddigs, s2_word_embeddigs):
+    s1_weights = torch.ones(s1_word_embeddigs.shape[0], dtype=torch.float64, device=device)
+    s2_weights = torch.ones(s2_word_embeddigs.shape[0], dtype=torch.float64, device=device)
+
+    # # Uniform weights to make L2 norm=1
+    # s1_weights /= torch.linalg.norm(s1_weights)
+    # s2_weights /= torch.linalg.norm(s2_weights)
+
+    return s1_weights, s2_weights
+
+
+def min_max_scaling(C):
+    eps = 1e-10
+    # Min-max scaling for stabilization
+    nx = get_backend(C)
+    C_min = nx.min(C)
+    C_max = nx.max(C)
+    C = (C - C_min + eps) / (C_max - C_min + eps)
+    return C
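min_max_scaling is written against ot.backend.get_backend, so the same code rescales both torch tensors and the NumPy arrays the solvers return. A small sketch of the cosine path (toy embeddings invented; assumes this commit's otfuncs.py is on the import path):

# Sketch: toy float64 embeddings; the cosine path returns (cost, raw similarity).
import numpy as np
import torch
from otfuncs import compute_distance_matrix_cosine, min_max_scaling, device

s1 = torch.randn(4, 8, dtype=torch.float64, device=device)   # 4 "words" x 8 dims
s2 = torch.randn(6, 8, dtype=torch.float64, device=device)

C, sim = compute_distance_matrix_cosine(s1, s2, distortion_ratio=0.2)
print(C.shape, float(C.min()), float(C.max()))   # (4, 6), ~0.0, 1.0 after scaling

# The same scaling works on NumPy arrays thanks to get_backend:
print(min_max_scaling(np.array([[1.0, 3.0], [2.0, 4.0]])))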
plotools.py ADDED

@@ -0,0 +1,129 @@
+import numpy as np
+import plotly.graph_objects as go
+
+
+def _debug_non_unique_axis_values(sent1: list[str], sent2: list[str]):
+    """
+    solution:
+    using zero-width-space
+    cf. https://github.com/plotly/plotly.js/issues/1516#issuecomment-983090013
+    """
+    sent1 = [word + i*'\u200b' for i, word in enumerate(sent1)]
+    sent2 = [word + i*'\u200b' for i, word in enumerate(sent2)]
+
+    return sent1, sent2
+
+
+def discrete_colorscale(bvals, colors):
+    """
+    bvals - list of values bounding intervals/ranges of interest
+    colors - list of rgb or hex colorcodes for values in [bvals[k], bvals[k+1]], 0 <= k < len(bvals)-1
+    returns the plotly discrete colorscale
+    ref. https://community.plotly.com/t/colors-for-discrete-ranges-in-heatmaps/7780
+    """
+    if len(bvals) != len(colors)+1:
+        raise ValueError('len(boundary values) should be equal to len(colors)+1')
+    bvals = sorted(bvals)
+    nvals = [(v-bvals[0])/(bvals[-1]-bvals[0]) for v in bvals]  # normalized values
+
+    dcolorscale = []  # discrete colorscale
+    for k in range(len(colors)):
+        dcolorscale.extend([[nvals[k], colors[k]], [nvals[k+1], colors[k]]])
+    return dcolorscale
+
+
+def plot_align_matrix_heatmap_plotly(align_matrix, sent1, sent2, threshhold, Cost):
+    align_matrix = np.where(align_matrix <= threshhold, 0, align_matrix)
+    sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
+    _colors = ['#F2F2F2', '#E0F4FA', '#BEE4F0', '#88CCE5', '#33b7df', '#1B88A6', '#105264', '#092E39']
+    _ticks = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
+
+    colorscale = discrete_colorscale(_ticks, _colors)
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Heatmap(
+        z=align_matrix,
+        customdata=Cost,
+        x=sent1,
+        y=sent2,
+        xgap=2,
+        ygap=2,
+        colorscale=colorscale,
+        colorbar=dict(
+            tick0=0,
+            dtick=0.125,
+            outlinewidth=0
+        ),
+        hovertemplate=
+        'x: %{x}<br>' +
+        'y: %{y}<br>' +
+        'P: %{z:.3f}<br>' +
+        'cost: %{customdata:.3f} ',
+        name=''
+    ))
+    fig.update_layout(
+        # xaxis=dict(scaleanchor='y'),
+        yaxis=dict(autorange='reversed'),
+        margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
+        plot_bgcolor='rgba(0,0,0,0)',
+        font=dict(
+            size=16,
+        ),
+        hoverlabel=dict(
+            bgcolor="#555",
+            font_color="white",
+            font_size=14,
+            font_family="Open Sans"
+        )
+    )
+    fig.update_xaxes(
+        tickangle=-45,
+    )
+    return fig
+
+
+def plot_similarity_matrix_heatmap_plotly(similarity_matrix, sent1, sent2, Cost):
+    sent1, sent2 = _debug_non_unique_axis_values(sent1, sent2)
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Heatmap(
+        z=similarity_matrix,
+        customdata=Cost,
+        x=sent1,
+        y=sent2,
+        xgap=2,
+        ygap=2,
+        colorscale="Reds",
+        colorbar=dict(
+            tick0=0,
+            dtick=0.125,
+            outlinewidth=0
+        ),
+        hovertemplate=
+        'x: %{x}<br>' +
+        'y: %{y}<br>' +
+        'cosine: %{z:.3f}<br>' +
+        'cost: %{customdata:.3f} ',
+        name=''
+    ))
+    fig.update_layout(
+        # xaxis=dict(scaleanchor='y'),
+        yaxis=dict(autorange='reversed'),
+        margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
+        plot_bgcolor='rgba(0,0,0,0)',
+        font=dict(
+            size=16,
+        ),
+        hoverlabel=dict(
+            bgcolor="#555",
+            font_color="white",
+            font_size=14,
+            font_family="Open Sans"
+        )
+    )
+    fig.update_xaxes(
+        tickangle=-45,
+    )
+    return fig
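Plotly treats heatmap axes as categorical and merges duplicate labels, so a sentence with a repeated word would silently lose rows or columns; _debug_non_unique_axis_values sidesteps this by suffixing the i-th label with i zero-width spaces, unique to plotly but invisible to the reader. A toy sketch (random matrix invented; assumes this commit's plotools.py is importable):

import numpy as np
from plotools import plot_align_matrix_heatmap_plotly

# 'the' repeats in both sentences; without the zero-width-space trick the
# duplicate labels would collapse into a single axis category.
sent1 = ['the', 'cat', 'saw', 'the', 'dog']
sent2 = ['the', 'dog', 'was', 'seen']
P = np.random.rand(len(sent2), len(sent1))    # z is indexed (y, x), like align_matrix.T in app.py
fig = plot_align_matrix_heatmap_plotly(P, sent1, sent2, threshhold=0.2, Cost=1 - P)
fig.show()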
requirements.txt CHANGED

@@ -1,8 +1,9 @@
 POT==0.9.0
 sentencepiece==0.1.99
-streamlit==1.…
+streamlit==1.27.2
 tokenizers==0.13.3
 transformers==4.30.2
 matplotlib==3.7.1
-…
-torch==2.0.1
+plotly==5.15.0
+torch==2.0.1
+nltk==3.8.1
utils.py CHANGED

@@ -1,105 +1,69 @@
 import numpy as np
 import torch
-import torch.nn.functional as F
-from ot.backend import get_backend

 device = "cuda" if torch.cuda.is_available() else "cpu"

-def …
-
-import matplotlib.pyplot as plt
-from mpl_toolkits.axes_grid1 import make_axes_locatable
-
-def plot_align_matrix_heatmap(align_matrix, sent1, sent2, thresh, **kwargs):
-
-    align_matrix = np.where(align_matrix <= thresh, 0, align_matrix)
-
-    fig, ax = plt.subplots(figsize=(10, 6))
-    sns.set(font='sans-serif', style="ticks")
-
-    _color = ['#F2F2F2', '#E0F4FA', '#BEE4F0', '#88CCE5', '#33b7df', '#1B88A6', '#105264', '#092E39']
-    _ticks = [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
-
-    divider = make_axes_locatable(ax)
-    cbar_ax = divider.append_axes("right", size="2.5%", pad=0.1)
-    fig.add_axes(cbar_ax)
-    ax = sns.heatmap(
-        align_matrix,
-        xticklabels=sent1,
-        yticklabels=sent2,
-        cmap=_color,
-        linewidths=1,
-        square=True,
-        ax=ax,
-        cbar_ax=cbar_ax,
-        **kwargs
-    )
-    ax.collections[0].colorbar.ax.yaxis.set_ticks(_ticks, minor=False)
-    ax.collections[0].colorbar.set_ticklabels(_ticks)
-    cax = ax.collections[0].colorbar.ax
-    cax.tick_params(which='major', length=3, labelsize=5)
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
-    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
-    return fig
+def encode_sentence(sent, pair, tokenizer, model, layer: int):
+    if pair == None:
+        inputs = tokenizer(sent, padding=False, truncation=False, is_split_into_words=True, return_offsets_mapping=True,
+                           return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device),
+                            inputs['token_type_ids'].to(device))
+    else:
+        inputs = tokenizer(text=sent, text_pair=pair, padding=False, truncation=True,
+                           is_split_into_words=True,
+                           return_offsets_mapping=True, return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device),
+                            inputs['token_type_ids'].to(device))
+
+    return outputs.hidden_states[layer][0], inputs['input_ids'][0], inputs['offset_mapping'][0]
+
+
+def centering(hidden_outputs):
+    """
+    hidden_outputs : [tokens, hidden_size]
+    """
+    # Sum the embeddings over all tokens and compute their mean vector
+    mean_vec = torch.sum(hidden_outputs, dim=0) / hidden_outputs.shape[0]
+    hidden_outputs = hidden_outputs - mean_vec
+    print(hidden_outputs.shape)
+    return hidden_outputs
+
+
+def convert_to_word_embeddings(offset_mapping, token_ids, hidden_tensors, tokenizer, pair):
+    word_idx = -1
+    subword_to_word_conv = np.full((hidden_tensors.shape[0]), -1)
+    # Bug in hugging face tokenizer? Sometimes Metaspace is inserted
+    metaspace = getattr(tokenizer.decoder, "replacement", None)
+    metaspace = tokenizer.decoder.prefix if metaspace is None else metaspace
+    tokenizer_bug_idxes = [i for i, x in enumerate(tokenizer.convert_ids_to_tokens(token_ids)) if
+                           x == metaspace]
+
+    for subw_idx, offset in enumerate(offset_mapping):
+        if subw_idx in tokenizer_bug_idxes:
+            continue
+        elif offset[0] == offset[1]:  # Special token
+            continue
+        elif offset[0] == 0:
+            word_idx += 1
+            subword_to_word_conv[subw_idx] = word_idx
+        else:
+            subword_to_word_conv[subw_idx] = word_idx
+
+    word_embeddings = torch.vstack(
+        ([torch.mean(hidden_tensors[subword_to_word_conv == word_idx], dim=0) for word_idx in range(word_idx + 1)]))
+    print(word_embeddings.shape)
+
+    if pair:
+        sep_tok_indices = [i for i, x in enumerate(token_ids) if x == tokenizer.sep_token_id]
+        s2_start_idx = subword_to_word_conv[
+            sep_tok_indices[0] + np.argmax(subword_to_word_conv[sep_tok_indices[0]:] > -1)]
+
+        s1_word_embeddigs = word_embeddings[0:s2_start_idx, :]
+        s2_word_embeddigs = word_embeddings[s2_start_idx:, :]
+
+        return s1_word_embeddigs, s2_word_embeddigs
+    else:
+        return word_embeddings
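convert_to_word_embeddings mean-pools subword vectors back into word vectors using the tokenizer's offset mapping: an offset starting at character 0 opens a new word, and offsets with equal start and end mark special tokens. A self-contained toy sketch of that pooling idea (offsets and tensors invented, not the exact function):

# Toy sketch of the subword-to-word mean pooling used above.
import numpy as np
import torch

hidden = torch.arange(12, dtype=torch.float64).reshape(6, 2)  # 6 subwords, dim 2
offsets = [(0, 0), (0, 3), (3, 5), (0, 4), (0, 2), (0, 0)]    # [CLS], 'foo'+'##bar', 'word', 'x', [SEP]

word_idx = -1
sub2word = np.full(len(offsets), -1)
for i, (s, e) in enumerate(offsets):
    if s == e:        # special token ([CLS]/[SEP]): skip
        continue
    if s == 0:        # offset restarts at character 0 -> new word begins
        word_idx += 1
    sub2word[i] = word_idx

# Average the subword vectors belonging to each word.
words = torch.vstack([hidden[sub2word == w].mean(dim=0) for w in range(word_idx + 1)])
print(sub2word, words.shape)  # [-1  0  0  1  2 -1] torch.Size([3, 2])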