raynardj's picture
👜 baseline
67eeae3
raw
history blame
1.34 kB
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from forgebox.cosine import CosineSearch
import numpy as np
# Model identifier handed to SentenceTransformer when the encoder is loaded;
# it also appears in the loading spinner message.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Build the SentenceTransformer identified by ``TAG``.

    A Streamlit spinner naming the model is displayed while the model is
    being constructed.

    Returns:
        The ``SentenceTransformer`` instance created from ``TAG``.
    """
    # NOTE(review): st.cache is reported as deprecated in newer Streamlit
    # releases (replaced by st.cache_resource / st.cache_data) - confirm
    # against the Streamlit version this app is pinned to before migrating.
    with st.spinner(f"Loading Transformer:{TAG}"):
        return SentenceTransformer(TAG)


# Module-level encoder used both to index the book and to embed queries.
encoder = load_encoder()
@st.cache(allow_output_mutation=True)
def load_book():
    """Read the book's sentences from ``grand_historian.csv``.

    A Streamlit spinner is displayed while the CSV file is being read.

    Returns:
        list: the values of the CSV's ``sentence`` column.
    """
    with st.spinner("📚 Loading Book..."):
        book = pd.read_csv("grand_historian.csv")
    return list(book.sentence)


# Every sentence of the book; search results are looked up in this list.
all_lines = load_book()
@st.cache(allow_output_mutation=True)
def encode_book():
    """Embed every line of the book and wrap the vectors for searching.

    Uses the module-level ``encoder`` and ``all_lines``. A Streamlit spinner
    is displayed while the sentences are being encoded.

    Returns:
        CosineSearch: search object built over the encoded sentences.
    """
    with st.spinner("Encoding sentences for book《Records of the Grand Historian》"):
        embeddings = encoder.encode(
            all_lines,
            batch_size=64,
            show_progress_bar=True,
        )
        return CosineSearch(embeddings)


# Search index over the whole book, queried by search().
cosine = encode_book()
def search(text, top_k=5):
    """Look up the book sentences that best match ``text``.

    The query is embedded with the module-level ``encoder`` and passed to the
    module-level ``cosine`` search object; the leading entries of the result
    select lines from ``all_lines``.

    Args:
        text: the search key entered by the user.
        top_k: number of leading results to keep. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        pandas.DataFrame: a single ``sentence`` column holding at most
        ``top_k`` lines of the book, in the order given by ``cosine``.
    """
    query_vec = encoder.encode(text)  # encode the search key
    # The result is used as positions into all_lines; presumably ordered
    # most-similar-first - TODO confirm against forgebox's CosineSearch.
    order = cosine(query_vec)
    # Index the list directly: only the rows that are kept get touched,
    # instead of converting the whole book to a NumPy array on every query.
    top_sentences = [all_lines[i] for i in order[:top_k]]
    return pd.DataFrame({"sentence": top_sentences})
# Query input; the label asks the user to search in vernacular Chinese.
keyword = st.text_input("用白话搜", "")

# Search only when the button is pressed and the query box is not empty.
# st.button is evaluated first, so the button is rendered on every run.
if st.button("搜索") and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        df = search(keyword)
        st.table(df)