Pavankalyan commited on
Commit
5602ea6
1 Parent(s): d42d3ef

Upload 5 files

Browse files
Files changed (5) hide show
  1. Responses.csv +0 -0
  2. corpus.pt +3 -0
  3. data_url.csv +56 -0
  4. load_data.py +138 -0
  5. meta_data.json +1 -0
Responses.csv ADDED
The diff for this file is too large to render. See raw diff
 
corpus.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8388b97d5d8045bc3a9fc29f6793001123368ed44de62ff9a26d5f33f4a3ff56
3
+ size 5803755
data_url.csv ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ file_name,link
2
+ 0,ccservices_new.txt,https://www.cc.iitb.ac.in/
3
+ 1,file1.txt,https://drive.google.com/file/d/1NaYLmXUc1DzhxZ3ps9fBp9ZN5i1qrtZy/view?usp=sharing
4
+ 2,file10.txt,https://drive.google.com/file/d/1sIvXZFTs051Si7CC37b35gnpN4BRuKmf/view?usp=sharing
5
+ 3,file11.txt,https://drive.google.com/file/d/1dbQcZsco3mRjBZNz8BDdXurzbKuWmbTE/view?usp=sharing
6
+ 4,file12.txt,https://drive.google.com/file/d/17ky3Ayl29Q1mWAYqivV-5MXVJ4FUVkbK/view?usp=sharing
7
+ 5,file13.txt,https://drive.google.com/file/d/1bZ1ZORsXS3bsDeQupEwdkdicbcfdsuzZ/view?usp=sharing
8
+ 6,file14.txt,https://drive.google.com/file/d/1eTICcbswfGC3yFl4hcajy6JahRjwOFTD/view?usp=sharing
9
+ 7,file15.txt,https://drive.google.com/file/d/1JQo8NkjtdWtrAXScSMC6Bp0xG6-EAbCo/view?usp=sharing
10
+ 8,file17.txt,https://drive.google.com/file/d/1-r5uvlzFcNsChsXzqvpLjTVQ3FWhDZub/view?usp=sharing
11
+ 9,file18.txt,https://drive.google.com/file/d/1eV3DfnbSspxT_GJ755iemXSuacqK0OMK/view?usp=sharing
12
+ 10,file2.txt,https://drive.google.com/file/d/1zn6hPZe1fJ3otvWvXEyq2V991sTfYo1v/view?usp=sharing
13
+ 11,file20_new.txt,https://drive.google.com/file/d/11rJhoC34ONDXPo7r9U6ynlLFIp_CFqjG/view?usp=sharing
14
+ 12,file21_new.txt,https://drive.google.com/file/d/1Rc91IlWYC0l3u0GMlFN8u6P1Lqg3bp43/view?usp=sharing
15
+ 13,file22.txt,https://drive.google.com/file/d/155w3J-KcmnTtpRIZEMqU8Sy2w_85nTlA/view?usp=sharing
16
+ 14,file23.txt,https://drive.google.com/file/d/1h1TCBnl7__c63F9IQR0P8wvsIUGxIbU4/view?usp=sharing
17
+ 15,file24.txt,https://drive.google.com/file/d/1FKa2uXENxCeUPm0j51uYVmUQfnUOtAvU/view?usp=sharing
18
+ 16,file3.txt,https://drive.google.com/file/d/1yMO7AclQWfvuIgRvP-ACEAIpwIayIRrV/view?usp=sharing
19
+ 17,file4.txt,https://drive.google.com/file/d/1Dz7vH8vZggfezr5sgpYI1ZsV41VOk23o/view?usp=sharing
20
+ 18,file5.txt,https://drive.google.com/file/d/1BCpQwWA5RL1adaZkJnv7U7-qWGLoPi4d/view?usp=sharing
21
+ 19,file6.txt,https://drive.google.com/file/d/1pP5kK9r350pmYIWJl6aRdMEe06XMf3ny/view?usp=sharing
22
+ 20,file7.txt,https://drive.google.com/file/d/1LBFWcznes0Xyth9HzFepJezBtl2O0ItJ/view?usp=sharing
23
+ 21,file8.txt,https://drive.google.com/file/d/1F56u2ro-qmCnwjV8jsjQOOLYY4aOu0Ed/view?usp=sharing
24
+ 22,file9.txt,https://drive.google.com/file/d/19pQBbTLy-7OI4hKwoaQ9pehsVKpLWCmH/view?usp=sharing
25
+ 23,getting_started_new.txt,https://www.cc.iitb.ac.in/
26
+ 24,HallmanagerDuties.txt,https://gymkhana.iitb.ac.in/hostels/#/hall-manager-duties
27
+ 25,howto_new.txt,https://www.cc.iitb.ac.in/
28
+ 26,ismp0.txt,https://smp.gymkhana.iitb.ac.in/incoming_introduction.php
29
+ 27,ismp1.txt,https://smp.gymkhana.iitb.ac.in/incoming_things_to_do.php
30
+ 28,ismp10.txt,https://smp.gymkhana.iitb.ac.in/academics_cpi.php
31
+ 29,ismp11.txt,https://smp.gymkhana.iitb.ac.in/academics_bc.php
32
+ 30,ismp12.txt,https://smp.gymkhana.iitb.ac.in/academics_idddp.php
33
+ 31,ismp13.txt,https://smp.gymkhana.iitb.ac.in/academics_semex.php
34
+ 32,ismp14.txt,https://smp.gymkhana.iitb.ac.in/life_intro.php
35
+ 33,ismp15.txt,https://smp.gymkhana.iitb.ac.in/life_intro.php
36
+ 34,ismp16.txt,https://smp.gymkhana.iitb.ac.in/life_campus.php
37
+ 35,ismp17.txt,https://smp.gymkhana.iitb.ac.in/life_support.php
38
+ 36,ismp18.txt,https://smp.gymkhana.iitb.ac.in/life_culture.php
39
+ 37,ismp19.txt,https://smp.gymkhana.iitb.ac.in/extra_curricular.php
40
+ 38,ismp2.txt,https://smp.gymkhana.iitb.ac.in/incoming_accomadation.php
41
+ 39,ismp20.txt,https://smp.gymkhana.iitb.ac.in/extra_gym_sports.php#gymkhana
42
+ 40,ismp21.txt,https://smp.gymkhana.iitb.ac.in/extra_culture.php
43
+ 41,ismp22.txt,https://smp.gymkhana.iitb.ac.in/extra_media.php
44
+ 42,ismp23.txt,https://smp.gymkhana.iitb.ac.in/extra_gym_sports.php#sports
45
+ 43,ismp24.txt,https://smp.gymkhana.iitb.ac.in/extra_technical.php
46
+ 44,ismp25.txt,https://smp.gymkhana.iitb.ac.in/extra_ibs.php
47
+ 45,ismp3.txt,https://smp.gymkhana.iitb.ac.in/incoming_scholarships.php
48
+ 46,ismp4.txt,https://smp.gymkhana.iitb.ac.in/incoming_fee_structure.php
49
+ 47,ismp5.txt,https://smp.gymkhana.iitb.ac.in/incoming_letter_to_parents.php
50
+ 48,ismp6.txt,https://smp.gymkhana.iitb.ac.in/academics.php
51
+ 49,ismp7.txt,https://smp.gymkhana.iitb.ac.in/academics_intro.php
52
+ 50,ismp8.txt,https://smp.gymkhana.iitb.ac.in/academics.php#curriculum
53
+ 51,ismp9.txt,https://smp.gymkhana.iitb.ac.in/academics.php#departments
54
+ 52,network_new.txt,https://www.cc.iitb.ac.in/
55
+ 53,policies_new.txt,https://www.cc.iitb.ac.in/
56
+ 54,SWC.txt,https://www.iitb.ac.in/swc/en/about-student-wellness-centre
load_data.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import json
4
+ import re
5
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
6
+ import torch
7
+ import time
8
+ import textwrap
9
+
# Retrieval stack, loaded once at import time: a fast bi-encoder for
# candidate retrieval and a slower cross-encoder for re-ranking.
model_bi_encoder = "msmarco-distilbert-base-tas-b"
model_cross_encoder = "cross-encoder/ms-marco-MiniLM-L-12-v2"

bi_encoder = SentenceTransformer(model_bi_encoder)
# Raise the input cap to 512 tokens so longer passages are encoded in full.
bi_encoder.max_seq_length = 512
cross_encoder = CrossEncoder(model_cross_encoder)
16
+
17
def collect_data(data_lis, meta_count):
    """Return the files and links added since the last sync, as plain lists.

    Parameters
    ----------
    data_lis : pandas.DataFrame
        Table with 'file_name' and 'link' columns (contents of data_url.csv).
    meta_count : int
        Number of rows already processed; only rows after this are returned.

    Returns
    -------
    tuple[list, list]
        (new file names, new links), both positionally indexable from 0.
    """
    # .tolist() is the fix: a raw Series slice keeps its original labels
    # (meta_count, meta_count + 1, ...), so downstream positional access
    # starting at 0 (e.g. new_files[doc] in make_data) would raise KeyError
    # whenever meta_count > 0.
    new_files = data_lis['file_name'][meta_count:].tolist()
    new_links = data_lis['link'][meta_count:].tolist()
    return new_files, new_links
21
+
22
def merge_text(text_list):
    """Coalesce short text chunks into their successors.

    Walks the list pairwise: whenever a chunk has 30 words or fewer it is
    prepended (space-joined) onto the next chunk and its old slot is marked
    with a `" "` sentinel. Marked slots are dropped from the returned list.
    Note: mutates ``text_list`` in place (callers rebind the result).
    """
    total = len(text_list)
    for cur in range(1, total):
        prev = cur - 1
        # A chunk of <= 30 words is too small to stand alone; fold it forward.
        if len(text_list[prev].split()) <= 30:
            text_list[cur] = text_list[prev] + " " + text_list[cur]
            text_list[prev] = " "
    return [chunk for chunk in text_list if chunk != " "]
31
+
32
def make_data(new_files, new_links, local_path):
    """Read each new document and flatten it into parallel chunk/link lists.

    Every file under ``local_path`` is split into lines, newlines are
    stripped, blank lines are discarded, short lines are coalesced via
    ``merge_text``, and each resulting chunk is paired with the source URL
    of the document it came from.

    Returns (text_chunks, links) — two lists of equal length.
    """
    all_text = []
    all_links = []
    for idx in range(len(new_files)):
        with open(os.path.join(local_path, new_files[idx]), encoding='utf-8') as handle:
            stripped = [raw.replace("\n", "") for raw in handle]
        # Blank lines carry no content; drop them before merging.
        chunks = merge_text([line for line in stripped if line != ""])
        all_text.extend(chunks)
        all_links.extend([new_links[idx]] * len(chunks))
    return all_text, all_links
46
+
47
def get_final_data():
    """Build (or incrementally update) the response corpus and its embeddings.

    Reads the document list from data_url.csv, appends any files added since
    the last run (count tracked in meta_data.json) to Responses.csv, and keeps
    corpus.pt — the bi-encoder embeddings — in sync with the responses.

    Returns
    -------
    tuple
        (corpus_embeddings as a torch.Tensor, DataFrame with 'text'/'links').
    """
    # Define all the paths.
    meta_path = "meta_data.json"
    data_lis_path = "data_url.csv"
    local_path = "Data_final"
    data_path = "Responses.csv"
    corpus_path = "corpus.pt"

    # Load the list of data files.
    data_lis = pd.read_csv(data_lis_path)

    # Create an empty Responses.csv on the first run so the read below succeeds.
    if not os.path.exists(data_path):
        pd.DataFrame({"text": [], "links": []}).to_csv(data_path)
    data = pd.read_csv(data_path)

    # Check for any new files; if present, add them to Responses.csv and
    # bring corpus.pt up to date accordingly.
    act_count = len(data_lis['file_name'])
    with open(meta_path, "r") as jsonFile:
        meta_data = json.load(jsonFile)
    meta_count = meta_data["data"]["count"]

    if meta_count != act_count:
        meta_data["data"]["count"] = act_count
        with open(meta_path, "w") as jsonFile:
            json.dump(meta_data, jsonFile)
        new_files, new_links = collect_data(data_lis, meta_count)
        text, links = make_data(new_files, new_links, local_path)
        df = pd.DataFrame({"text": text, "links": links})
        # ignore_index prevents duplicate row labels after concat; duplicate
        # labels would make the label-based lookups in search() ambiguous.
        data = pd.concat([data, df], ignore_index=True)
        # Fixed: was a hard-coded "Responses.csv" — keep one source of truth.
        data.to_csv(data_path)
        if not os.path.exists(corpus_path):
            # First-ever embedding pass: encode the whole corpus.
            corpus_embeddings = bi_encoder.encode(
                data["text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
            torch.save(corpus_embeddings, corpus_path)
        else:
            # Incremental pass: encode only the new rows and append.
            corpus_embeddings = torch.load(corpus_path)
            new_embeddings = bi_encoder.encode(
                df["text"].tolist(), convert_to_tensor=True, show_progress_bar=True)
            corpus_embeddings = torch.cat((corpus_embeddings, new_embeddings), 0)
            torch.save(corpus_embeddings, corpus_path)

    corpus_embeddings = torch.load(corpus_path)
    return corpus_embeddings, data
102
+
103
+
104
def search(query):
    """Retrieve and re-rank the best-matching passages for a query.

    The bi-encoder selects the top 20 candidate passages from the corpus via
    semantic search, the cross-encoder re-scores each (query, passage) pair,
    and the 5 highest cross-encoder hits are returned.

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    list[list[str]]
        Rows of [wrapped answer text, cross-encoder score, bi-encoder score,
        source link], best match first.
    """
    corpus_embeddings, data = get_final_data()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    top_k = 20
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)[0]

    # Re-score the candidates with the slower but more accurate cross-encoder.
    cross_inp = [[query, data['text'][hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for idx, score in enumerate(cross_scores):
        hits[idx]['cross-score'] = score

    # Sort results by the cross-encoder scores.
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    # Hoisted out of the loop: one wrapper instance serves every row.
    wrapper = textwrap.TextWrapper(width=50)
    result_table = []
    for hit in hits[0:5]:
        ans = wrapper.fill(text=data['text'][hit['corpus_id']].replace("\n", " "))
        cross_score = str(hit['cross-score'])
        bi_score = str(hit['score'])
        corr_link = str(data['links'][hit['corpus_id']])
        result_table.append([ans, cross_score, bi_score, corr_link])

    return result_table
meta_data.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"data": {"count": 55}}