Spaces: Runtime error
Pavankalyan committed
Commit • 5602ea6
1 Parent(s): d42d3ef
Upload 5 files
- Responses.csv +0 -0
- corpus.pt +3 -0
- data_url.csv +56 -0
- load_data.py +138 -0
- meta_data.json +1 -0
Responses.csv
ADDED
The diff for this file is too large to render.
See raw diff
corpus.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8388b97d5d8045bc3a9fc29f6793001123368ed44de62ff9a26d5f33f4a3ff56
+size 5803755
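Not part of the committed diff: a minimal sketch of how the committed embedding tensor can be inspected, assuming Git LFS has materialized corpus.pt locally (otherwise only the three-line pointer above exists on disk). The shape interpretation follows load_data.py, which saves the bi-encoder output with torch.save.

import torch

# Load the precomputed passage embeddings saved by load_data.py (assumption: LFS object is present).
corpus_embeddings = torch.load("corpus.pt", map_location="cpu")
print(corpus_embeddings.shape)  # (num_passages, embedding_dim); typically 768 for DistilBERT-based bi-encoders
print(corpus_embeddings.dtype)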
data_url.csv
ADDED
@@ -0,0 +1,56 @@
+file_name,link
+0,ccservices_new.txt,https://www.cc.iitb.ac.in/
+1,file1.txt,https://drive.google.com/file/d/1NaYLmXUc1DzhxZ3ps9fBp9ZN5i1qrtZy/view?usp=sharing
+2,file10.txt,https://drive.google.com/file/d/1sIvXZFTs051Si7CC37b35gnpN4BRuKmf/view?usp=sharing
+3,file11.txt,https://drive.google.com/file/d/1dbQcZsco3mRjBZNz8BDdXurzbKuWmbTE/view?usp=sharing
+4,file12.txt,https://drive.google.com/file/d/17ky3Ayl29Q1mWAYqivV-5MXVJ4FUVkbK/view?usp=sharing
+5,file13.txt,https://drive.google.com/file/d/1bZ1ZORsXS3bsDeQupEwdkdicbcfdsuzZ/view?usp=sharing
+6,file14.txt,https://drive.google.com/file/d/1eTICcbswfGC3yFl4hcajy6JahRjwOFTD/view?usp=sharing
+7,file15.txt,https://drive.google.com/file/d/1JQo8NkjtdWtrAXScSMC6Bp0xG6-EAbCo/view?usp=sharing
+8,file17.txt,https://drive.google.com/file/d/1-r5uvlzFcNsChsXzqvpLjTVQ3FWhDZub/view?usp=sharing
+9,file18.txt,https://drive.google.com/file/d/1eV3DfnbSspxT_GJ755iemXSuacqK0OMK/view?usp=sharing
+10,file2.txt,https://drive.google.com/file/d/1zn6hPZe1fJ3otvWvXEyq2V991sTfYo1v/view?usp=sharing
+11,file20_new.txt,https://drive.google.com/file/d/11rJhoC34ONDXPo7r9U6ynlLFIp_CFqjG/view?usp=sharing
+12,file21_new.txt,https://drive.google.com/file/d/1Rc91IlWYC0l3u0GMlFN8u6P1Lqg3bp43/view?usp=sharing
+13,file22.txt,https://drive.google.com/file/d/155w3J-KcmnTtpRIZEMqU8Sy2w_85nTlA/view?usp=sharing
+14,file23.txt,https://drive.google.com/file/d/1h1TCBnl7__c63F9IQR0P8wvsIUGxIbU4/view?usp=sharing
+15,file24.txt,https://drive.google.com/file/d/1FKa2uXENxCeUPm0j51uYVmUQfnUOtAvU/view?usp=sharing
+16,file3.txt,https://drive.google.com/file/d/1yMO7AclQWfvuIgRvP-ACEAIpwIayIRrV/view?usp=sharing
+17,file4.txt,https://drive.google.com/file/d/1Dz7vH8vZggfezr5sgpYI1ZsV41VOk23o/view?usp=sharing
+18,file5.txt,https://drive.google.com/file/d/1BCpQwWA5RL1adaZkJnv7U7-qWGLoPi4d/view?usp=sharing
+19,file6.txt,https://drive.google.com/file/d/1pP5kK9r350pmYIWJl6aRdMEe06XMf3ny/view?usp=sharing
+20,file7.txt,https://drive.google.com/file/d/1LBFWcznes0Xyth9HzFepJezBtl2O0ItJ/view?usp=sharing
+21,file8.txt,https://drive.google.com/file/d/1F56u2ro-qmCnwjV8jsjQOOLYY4aOu0Ed/view?usp=sharing
+22,file9.txt,https://drive.google.com/file/d/19pQBbTLy-7OI4hKwoaQ9pehsVKpLWCmH/view?usp=sharing
+23,getting_started_new.txt,https://www.cc.iitb.ac.in/
+24,HallmanagerDuties.txt,https://gymkhana.iitb.ac.in/hostels/#/hall-manager-duties
+25,howto_new.txt,https://www.cc.iitb.ac.in/
+26,ismp0.txt,https://smp.gymkhana.iitb.ac.in/incoming_introduction.php
+27,ismp1.txt,https://smp.gymkhana.iitb.ac.in/incoming_things_to_do.php
+28,ismp10.txt,https://smp.gymkhana.iitb.ac.in/academics_cpi.php
+29,ismp11.txt,https://smp.gymkhana.iitb.ac.in/academics_bc.php
+30,ismp12.txt,https://smp.gymkhana.iitb.ac.in/academics_idddp.php
+31,ismp13.txt,https://smp.gymkhana.iitb.ac.in/academics_semex.php
+32,ismp14.txt,https://smp.gymkhana.iitb.ac.in/life_intro.php
+33,ismp15.txt,https://smp.gymkhana.iitb.ac.in/life_intro.php
+34,ismp16.txt,https://smp.gymkhana.iitb.ac.in/life_campus.php
+35,ismp17.txt,https://smp.gymkhana.iitb.ac.in/life_support.php
+36,ismp18.txt,https://smp.gymkhana.iitb.ac.in/life_culture.php
+37,ismp19.txt,https://smp.gymkhana.iitb.ac.in/extra_curricular.php
+38,ismp2.txt,https://smp.gymkhana.iitb.ac.in/incoming_accomadation.php
+39,ismp20.txt,https://smp.gymkhana.iitb.ac.in/extra_gym_sports.php#gymkhana
+40,ismp21.txt,https://smp.gymkhana.iitb.ac.in/extra_culture.php
+41,ismp22.txt,https://smp.gymkhana.iitb.ac.in/extra_media.php
+42,ismp23.txt,https://smp.gymkhana.iitb.ac.in/extra_gym_sports.php#sports
+43,ismp24.txt,https://smp.gymkhana.iitb.ac.in/extra_technical.php
+44,ismp25.txt,https://smp.gymkhana.iitb.ac.in/extra_ibs.php
+45,ismp3.txt,https://smp.gymkhana.iitb.ac.in/incoming_scholarships.php
+46,ismp4.txt,https://smp.gymkhana.iitb.ac.in/incoming_fee_structure.php
+47,ismp5.txt,https://smp.gymkhana.iitb.ac.in/incoming_letter_to_parents.php
+48,ismp6.txt,https://smp.gymkhana.iitb.ac.in/academics.php
+49,ismp7.txt,https://smp.gymkhana.iitb.ac.in/academics_intro.php
+50,ismp8.txt,https://smp.gymkhana.iitb.ac.in/academics.php#curriculum
+51,ismp9.txt,https://smp.gymkhana.iitb.ac.in/academics.php#departments
+52,network_new.txt,https://www.cc.iitb.ac.in/
+53,policies_new.txt,https://www.cc.iitb.ac.in/
+54,SWC.txt,https://www.iitb.ac.in/swc/en/about-student-wellness-centre
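Not part of the committed diff: a minimal sketch of how load_data.py reads this mapping with pd.read_csv. Because the header row has one fewer field than the data rows, pandas treats the unnamed leading column (0-54) as the index, leaving 'file_name' and 'link' as the two columns.

import pandas as pd

# Load the file-to-URL mapping the same way get_final_data() does.
data_lis = pd.read_csv("data_url.csv")
print(len(data_lis))               # 55 files
print(data_lis.columns.tolist())   # ['file_name', 'link']
print(data_lis.loc[0, "link"])     # https://www.cc.iitb.ac.in/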
load_data.py
ADDED
@@ -0,0 +1,138 @@
+import pandas as pd
+import os
+import json
+import re
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import torch
+import time
+import textwrap
+
+model_bi_encoder = "msmarco-distilbert-base-tas-b"
+model_cross_encoder = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+
+bi_encoder = SentenceTransformer(model_bi_encoder)
+bi_encoder.max_seq_length = 512
+cross_encoder = CrossEncoder(model_cross_encoder)
+
+def collect_data(data_lis, meta_count):
+    # Return only the files (and links) added after the last indexed position.
+    # reset_index keeps positional lookups in make_data valid when meta_count > 0.
+    new_files = data_lis['file_name'][meta_count:].reset_index(drop=True)
+    new_links = data_lis['link'][meta_count:].reset_index(drop=True)
+    return new_files, new_links
+
+def merge_text(text_list):
+    # Merge passages of 30 words or fewer into the following passage so that
+    # every corpus entry carries enough context for retrieval.
+    i = 0; j = 1
+    k = len(text_list)
+    while j < k:
+        if len(text_list[i].split()) <= 30:
+            text_list[j] = text_list[i] + " " + text_list[j]
+            text_list[i] = " "
+        i += 1; j += 1
+    return [accepted for accepted in text_list if accepted != " "]
+
+def make_data(new_files, new_links, local_path):
+    # Split each new file into non-empty lines, merge short ones, and pair
+    # every resulting passage with the source URL of its file.
+    text = []; links = []
+    for doc in range(len(new_files)):
+        sub_text = []; sub_link = []
+        with open(os.path.join(local_path, new_files[doc]), encoding='utf-8') as f:
+            for line in f.readlines():
+                temp_text = re.sub("\\n", "", line)
+                if temp_text != "":
+                    sub_text.append(temp_text)
+        sub_text = merge_text(sub_text)
+        sub_link = [new_links[doc] for i in range(len(sub_text))]
+        text.extend(sub_text)
+        links.extend(sub_link)
+    return text, links
+
+def get_final_data():
+
+    # Define all the paths
+    meta_path = "meta_data.json"
+    data_lis_path = "data_url.csv"
+    local_path = "Data_final"
+    data_path = "Responses.csv"
+    corpus_path = "corpus.pt"
+
+    # Load the list of data files
+    data_lis = pd.read_csv(data_lis_path)
+
+    # Load the Responses.csv file, creating an empty one on the first run
+    if not(os.path.exists(data_path)):
+        fresh_text = []
+        fresh_link = []
+        fresh_data = {
+            "text": fresh_text,
+            "links": fresh_link
+        }
+        fresh_data = pd.DataFrame(fresh_data)
+        fresh_data.to_csv(data_path)
+    data = pd.read_csv(data_path)
+
+    # Check for any new files; if present, add them to Responses.csv
+    # and update corpus.pt accordingly
+    act_count = len(data_lis['file_name'])
+    with open(meta_path, "r") as jsonFile:
+        meta_data = json.load(jsonFile)
+    meta_count = meta_data["data"]["count"]
+
+    if meta_count != act_count:
+        meta_data["data"]["count"] = act_count
+        with open(meta_path, "w") as jsonFile:
+            json.dump(meta_data, jsonFile)
+        new_files, new_links = collect_data(data_lis, meta_count)
+        text, links = make_data(new_files, new_links, local_path)
+        df = {
+            "text": text,
+            "links": links
+        }
+        df = pd.DataFrame(df)
+        # ignore_index keeps corpus_id lookups aligned after appending new rows
+        data = pd.concat([data, df], ignore_index=True)
+        data.to_csv("Responses.csv")
+        if not(os.path.exists(corpus_path)):
+            corpus_embeddings = bi_encoder.encode(data["text"], convert_to_tensor=True, show_progress_bar=True)
+            torch.save(corpus_embeddings, corpus_path)
+        else:
+            corpus_embeddings = torch.load(corpus_path)
+            new_embeddings = bi_encoder.encode(df["text"], convert_to_tensor=True, show_progress_bar=True)
+            corpus_embeddings = torch.cat((corpus_embeddings, new_embeddings), 0)
+            torch.save(corpus_embeddings, corpus_path)
+
+    corpus_embeddings = torch.load(corpus_path)
+    return corpus_embeddings, data
+
+
+def search(query):
+    # Retrieve top-k candidates with the bi-encoder, re-rank them with the
+    # cross-encoder, and return the five best passages with their source links.
+    corpus_embeddings, data = get_final_data()
+    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+    top_k = 20
+    #be = time.process_time()
+    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+    #print("Time taken by Bi-encoder:" + str(time.process_time() - be))
+
+    hits = hits[0]
+    cross_inp = [[query, data['text'][hit['corpus_id']]] for hit in hits]
+
+    #ce = time.process_time()
+    cross_scores = cross_encoder.predict(cross_inp)
+    #print("Time taken by Cross-encoder:" + str(time.process_time() - ce))
+
+    # Sort results by the cross-encoder scores
+    for idx in range(len(cross_scores)):
+        hits[idx]['cross-score'] = cross_scores[idx]
+
+    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    result_table = list()
+    for hit in hits[0:5]:
+        ans = "{}".format(data['text'][hit['corpus_id']].replace("\n", " "))
+        #print(ans)
+        cs = "{}".format(hit['cross-score'])
+        #print(cs)
+        sc = "{}".format(hit['score'])
+        #print(sc)
+        corr_link = "{}".format(data['links'][hit['corpus_id']])
+        wrapper = textwrap.TextWrapper(width=50)
+        ans = wrapper.fill(text=ans)
+        result_table.append([ans, str(cs), str(sc), str(corr_link)])
+
+    return result_table
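Not part of the committed diff: a minimal usage sketch for the retrieve-and-rerank pipeline above. It assumes the Data_final/ folder with the raw .txt files sits next to load_data.py, as get_final_data() expects; the query string is a made-up example. search() returns up to five rows of [answer, cross-encoder score, bi-encoder score, source link], all as strings.

from load_data import search

# Hypothetical query; any natural-language question works.
results = search("How do I connect to the campus network?")
for answer, cross_score, bi_score, link in results:
    print(cross_score, bi_score, link)
    print(answer)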
meta_data.json
ADDED
@@ -0,0 +1 @@
+{"data": {"count": 55}}
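Not part of the committed diff: a small sketch of the role this count plays, following get_final_data(). The stored count is compared against the number of rows in data_url.csv; a mismatch triggers reading the newly added files and extending corpus.pt.

import json
import pandas as pd

with open("meta_data.json") as f:
    meta_count = json.load(f)["data"]["count"]    # 55 in this commit
act_count = len(pd.read_csv("data_url.csv"))      # also 55 here, so no re-indexing is triggered
print(meta_count == act_count)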