Spaces:

HEHEBOIOG
/

DiBotIE

Sleeping

App Files Files Community

Muhammad Haris committed on Jun 26, 2024

Commit

380d8a4

verified ·

1 Parent(s): 261d5bd

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -11

app.py CHANGED Viewed

@@ -8,12 +8,7 @@ import torch
 import gdown
 import os
-# file_id = '1P3Nz6f3KG0m0kO_2pEfnVIhgP8Bvkl4v'
-# url = f'https://drive.google.com/uc?id={file_id}'
-# excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
-# Download the file from Hugging Face Spaces
 url = 'https://huggingface.co/datasets/HEHEBOIBOT/PharmEvoDiabetesData/raw/main/medical_data.csv'
 excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
@@ -27,7 +22,7 @@ except UnicodeDecodeError:
 # TF-IDF Vectorization
 vectorizer = TfidfVectorizer(stop_words='english')
-X_tfidf = vectorizer.fit_transform(medical_df['Questions'])
 # Load pre-trained GPT-2 model and tokenizer
 model_name = "sshleifer/tiny-gpt2"
@@ -47,11 +42,11 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_
     # Find the most similar question using semantic similarity
     question_embedding = sbert_model.encode(question, convert_to_tensor=True)
-    similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df['Questions'].tolist(), convert_to_tensor=True)).flatten()
     max_sim_index = similarities.argmax().item()
     # LLM response generation
-    input_text = "DiBot: " + medical_df.iloc[max_sim_index]['Questions']
     input_ids = tokenizer.encode(input_text, return_tensors="pt")
     attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
     pad_token_id = tokenizer.eos_token_id
@@ -61,7 +56,7 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_
     # Compare similarities and choose the best response
     if tfidf_similarities.max() > 0.5:
         tfidf_index = tfidf_similarities.argmax()
-        return medical_df.iloc[tfidf_index]['Answers']
     else:
         return lm_generated_response
@@ -85,4 +80,4 @@ if user_input:
 # Display the chat messages
 for message in st.session_state.messages:
     with st.chat_message(message["role"]):
-        st.markdown(message["content"])

 import gdown
 import os
+# Download the CSV file from the Hugging Face dataset repository
 url = 'https://huggingface.co/datasets/HEHEBOIBOT/PharmEvoDiabetesData/raw/main/medical_data.csv'
 excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
 # TF-IDF Vectorization
 vectorizer = TfidfVectorizer(stop_words='english')
+X_tfidf = vectorizer.fit_transform(medical_df.iloc[:, 0])  # Accessing first column by index
 # Load pre-trained GPT-2 model and tokenizer
 model_name = "sshleifer/tiny-gpt2"
     # Find the most similar question using semantic similarity
     question_embedding = sbert_model.encode(question, convert_to_tensor=True)
+    similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df.iloc[:, 0].tolist(), convert_to_tensor=True)).flatten()
     max_sim_index = similarities.argmax().item()
     # LLM response generation
+    input_text = "DiBot: " + medical_df.iloc[max_sim_index][0]
     input_ids = tokenizer.encode(input_text, return_tensors="pt")
     attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
     pad_token_id = tokenizer.eos_token_id
     # Compare similarities and choose the best response
     if tfidf_similarities.max() > 0.5:
         tfidf_index = tfidf_similarities.argmax()
+        return medical_df.iloc[tfidf_index][1]  # Assuming 'Answers' is in the second column (index 1)
     else:
         return lm_generated_response
 # Display the chat messages
 for message in st.session_state.messages:
     with st.chat_message(message["role"]):
+        st.markdown(message["content"])