File size: 2,076 Bytes
67eeae3
 
 
 
 
 
121f3c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67eeae3
 
 
 
 
 
 
 
 
 
 
 
 
 
efbcb42
67eeae3
 
 
 
 
 
cfa2da9
67eeae3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
from forgebox.cosine import CosineSearch
import numpy as np

from PIL import Image
# Sidebar illustration; the relative path is resolved against the process's
# working directory when the script runs.
image = Image.open('shiji.png')


# Main-page heading (Markdown H2). The Chinese text reads roughly:
# "Cross classical/modern Chinese search: search the Shiji in plain language".
st.markdown("""
## 🍻 跨古/现代文搜索: 用白话搜史记
""")

# Sidebar: the picture first, then a short description with reference and
# related-project links.
# NOTE(review): the Hugging Face links use the host "huggingface.co." (with a
# trailing dot) and one link label reads "Chines" — confirm whether both are
# intended before publishing.
st.sidebar.image(image, use_column_width=True)
st.sidebar.markdown("""
Search《Records of the Grand Historian》 with modern Chinese
### References
* Model trained [here, please hit ⭐️](https://github.com/raynardj/yuan)
* [Trained crossed language BERT](https://huggingface.co./raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn)
### Related projects
* Read more [ancient books(almost all) with a translator](https://huggingface.co./spaces/raynardj/duguwen-classical-chinese-to-morden-translate)
* [Modern Chines to classical Chinese translator](https://huggingface.co./spaces/raynardj/modern-chinese-to-ancient-translate-wenyanwen)
""")

# Model identifier handed to SentenceTransformer in load_encoder().
# NOTE(review): the spelling "classicical" also appears in the sidebar model
# link, so it looks like the published id — confirm before "correcting" it.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"

@st.cache(allow_output_mutation=True)
def load_encoder():
    """Construct the SentenceTransformer named by ``TAG``.

    A spinner labelled with the model tag is displayed while the model is
    being built. The function is wrapped in ``st.cache`` with
    ``allow_output_mutation=True``, so Streamlit's cache decides when the
    body actually runs.

    Returns:
        The ``SentenceTransformer`` instance created from ``TAG``.
    """
    spinner_text = f"Loading Transformer:{TAG}"
    with st.spinner(spinner_text):
        return SentenceTransformer(TAG)


# Sentence encoder used by search() to embed each query.
encoder = load_encoder()

@st.cache(allow_output_mutation=True)
def load_book():
    """Read ``grand_historian.csv`` and return its ``sentences`` column.

    A spinner is displayed while the CSV file is parsed.

    Returns:
        A list holding the values of the ``sentences`` column, in file order.
    """
    with st.spinner("📚 Loading Book..."):
        book_frame = pd.read_csv("grand_historian.csv")
    sentences = list(book_frame.sentences)
    return sentences


# Every sentence of the book; search() indexes into this list.
all_lines = load_book()

@st.cache(allow_output_mutation=True)
def encode_book():
    """Load the vectors stored in ``vec.npy`` and wrap them in a CosineSearch.

    Despite the function name and the spinner text, nothing is encoded here:
    the array is read from disk as-is and handed to ``CosineSearch``.

    Returns:
        The ``CosineSearch`` object built over the loaded array.
    """
    with st.spinner("Encoding sentences for book《Records of the Grand Historian》"):
        book_vectors = np.load('vec.npy')
        return CosineSearch(book_vectors)


# Search index over the book vectors; called by search() with a query vector.
cosine = encode_book()

def search(text, top_k=5):
    """Return the first ``top_k`` book sentences in the ranking for ``text``.

    Args:
        text: Query string; it is embedded with the module-level ``encoder``.
        top_k: Number of leading entries of the ranking to keep. Defaults to
            5, the value that was previously hard-coded.

    Returns:
        A DataFrame with a single ``sentence`` column, in ranking order.
    """
    query_vec = encoder.encode(text)  # embed the search key
    # The result is used as positions into ``all_lines`` (it is not a
    # distance array).
    # NOTE(review): presumably ordered most-similar first by forgebox's
    # CosineSearch — confirm against its documentation.
    order = cosine(query_vec)
    # Read the hits straight from the list. Converting the entire corpus to a
    # NumPy array on every query costs time and memory proportional to the
    # whole book just to fetch a handful of rows.
    hits = [all_lines[idx] for idx in order[:top_k]]
    return pd.DataFrame({"sentence": hits})

# Query box (label: "search in plain language") with an explicit search
# button (label: "Search"); the matching sentences are rendered as a table.
keyword = st.text_input("用白话搜", "")
search_clicked = st.button("搜索")
if search_clicked and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        st.table(search(keyword))