|
"""Find potential duplicates in the data using cosine similarity.""" |
|
import streamlit as st |
|
from sentence_transformers.util import cos_sim |
|
|
|
from src.subpages.page import Context, Page |
|
|
|
|
|
@st.cache() |
|
def get_sims(texts: list[str], sentence_encoder): |
|
embeddings = sentence_encoder.encode(texts, batch_size=8, convert_to_numpy=True) |
|
return cos_sim(embeddings, embeddings) |
|
|
|
|
|
class FindDuplicatesPage(Page): |
|
name = "Find Duplicates" |
|
icon = "fingerprint" |
|
|
|
def _get_widget_defaults(self): |
|
return { |
|
"cutoff": 0.95, |
|
} |
|
|
|
def render(self, context: Context): |
|
st.title("Find Duplicates") |
|
with st.expander("💡", expanded=True): |
|
st.write("Find potential duplicates in the data using cosine similarity.") |
|
|
|
cutoff = st.slider("Similarity threshold", min_value=0.0, max_value=1.0, key="cutoff") |
|
|
|
|
|
|
|
|
|
texts = [" ".join(ts) for ts in context.split["tokens"]] |
|
sims = get_sims(texts, context.sentence_encoder) |
|
|
|
candidates = [] |
|
for i in range(len(sims)): |
|
for j in range(i + 1, len(sims)): |
|
if sims[i][j] >= cutoff: |
|
candidates.append((sims[i][j], i, j)) |
|
candidates.sort(reverse=False) |
|
|
|
for (sim, i, j) in candidates[:100]: |
|
st.markdown(f"**Possible duplicate ({i}, {j}, sim: {sim:.3f}):**") |
|
st.markdown("* " + " ".join(context.split["tokens"][i])) |
|
st.markdown("* " + " ".join(context.split["tokens"][j])) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|