|
import streamlit as st |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer |
|
from forgebox.cosine import CosineSearch |
|
import numpy as np |
|
|
|
# Model identifier handed to SentenceTransformer below
# (presumably a Hugging Face Hub repo id — verify before changing).
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"


# NOTE(review): `st.cache` is deprecated in newer Streamlit releases in favour
# of `st.cache_resource` / `st.cache_data`; migrate once the minimum supported
# Streamlit version is known.
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Construct the sentence encoder named by ``TAG``.

    A Streamlit spinner is displayed while the model is being built.

    Returns:
        The ``SentenceTransformer`` instance created from ``TAG``.
    """
    with st.spinner(f"Loading Transformer:{TAG}"):
        return SentenceTransformer(TAG)


# Module-level encoder shared by the book-encoding and query-encoding steps.
encoder = load_encoder()
|
|
|
@st.cache(allow_output_mutation=True)
def load_book():
    """Read the book CSV and return its sentences.

    Loads ``grand_historian.csv`` from the current working directory while a
    Streamlit spinner is shown.

    Returns:
        A list containing the values of the CSV's ``sentence`` column, in
        file order.
    """
    with st.spinner(f"📚 Loading Book..."):
        book_frame = pd.read_csv("grand_historian.csv")
    return list(book_frame.sentence)


# Every sentence of the book; used both for encoding and for result lookup.
all_lines = load_book()
|
|
|
@st.cache(allow_output_mutation=True)
def encode_book():
    """Embed every book sentence and wrap the vectors in a ``CosineSearch``.

    Reads the module-level ``encoder`` and ``all_lines``. A Streamlit spinner
    is shown while the sentences are encoded in batches of 64.

    Returns:
        A ``CosineSearch`` built from the encoded sentence vectors.
    """
    spinner_text = f"Encoding sentences for book《Records of the Grand Historian》"
    with st.spinner(spinner_text):
        embeddings = encoder.encode(
            all_lines,
            batch_size=64,
            show_progress_bar=True,
        )
        return CosineSearch(embeddings)


# Search index over the whole book, queried by `search`.
cosine = encode_book()
|
|
|
def search(text, top_k=5):
    """Return the book sentences ranked closest to ``text``.

    The query is embedded with the module-level ``encoder``, ranked by the
    module-level ``cosine`` index, and the leading results are looked up in
    ``all_lines``.

    Args:
        text: Query string to embed.
        top_k: Maximum number of sentences to return. Defaults to 5, which
            was previously a hard-coded limit.

    Returns:
        A ``pandas.DataFrame`` with a single ``"sentence"`` column holding at
        most ``top_k`` sentences, in the order produced by ``cosine``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_vec = encoder.encode(text)
    # Assumes `cosine(...)` yields corpus indices ordered best-match-first
    # — TODO confirm against forgebox's CosineSearch.
    ranking = cosine(query_vec)

    # Look the hits up directly in the list: converting the whole corpus to a
    # NumPy string array on every query copies every sentence just to pick a
    # handful of rows.
    best_sentences = [all_lines[idx] for idx in ranking[:top_k]]
    return pd.DataFrame({"sentence": best_sentences})
|
|
|
# Query box; the label and button text are user-facing and kept verbatim.
keyword = st.text_input("用白话搜", "")

# The button is evaluated first (so it is always rendered); the search only
# runs when it was clicked AND the query is non-empty.
if st.button("搜索") and keyword:
    searching_message = f"🔍 Searching for {keyword}"
    with st.spinner(searching_message):
        df = search(keyword)
        # Render the matching sentences as a static table.
        st.table(df)
|
|