import streamlit as st
import GPTHelper
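# GPTHelper is this project's local module (see the GitHub link in the sidebar). As used
# below, gpt35_rephrase(fact) returns a PubMed query string and gpt35_check_fact(evidence, fact)
# returns one of 'Entails', 'Contradicts' or 'Undetermined'.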
from sentence_transformers import CrossEncoder
from pymed import PubMed
import pandas as pd
import plotly.express as px
import logging
from langdetect import detect
from typing import Dict, List
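
# Streamlit reruns this script top-to-bottom on every interaction;
# st.session_state persists the input-validation flag across those reruns.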
if "valid_inputs_received" not in st.session_state:
st.session_state["valid_inputs_received"] = False
def get_articles(query, fetcher) -> Dict[str, List[str]]:
    # Fetches articles using pymed. Increasing max_results results in longer loading times.
    results = fetcher.query(query, max_results=50)
    conclusions = []
    titles = []
    links = []
    for article in results:
        article_id = 0  # Fallback so the except block can still log an ID if parsing fails.
        try:
            article_id = article.pubmed_id[:8]  # pymed sometimes wrongly returns a long list of IDs; use only the first.
            # Square brackets can cause the cross-encoder to misinterpret the string as a list.
            title = article.title.replace('[', '(').replace(']', ')')
            conclusion = article.conclusions
            abstract = article.abstract
            article_url = f'https://pubmed.ncbi.nlm.nih.gov/{article_id}/'
            article_link = f'<a href="{article_url}" style="color: black; font-size: 16px; ' \
                           f'text-decoration: underline;">PubMed ID: {article_id}</a>'  # Link injected into the plotly chart.
            if conclusion:
                conclusion = conclusion.replace('[', '(').replace(']', ')')
                conclusions.append(title + '\n' + conclusion)  # The title is prepended to improve relevance ranking.
                titles.append(title)
                links.append(article_link)
            elif abstract:
                # Not all articles provide conclusions; the abstract is used as a fallback.
                abstract = abstract.replace('[', '(').replace(']', ')')
                conclusions.append(title + '\n' + abstract)
                titles.append(title)
                links.append(article_link)
        except Exception as e:
            logging.warning(f"Error reading article {article_id}:", exc_info=e)
    return {
        "Conclusions": conclusions,
        "Links": links
    }

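# st.cache_resource caches heavyweight objects (the model and the PubMed client)
# across reruns and sessions, so each is created only once per server process.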
@st.cache_resource
def load_cross_encoder():
    # The pretrained cross-encoder model used for reranking. Can be substituted with a different one.
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return cross_encoder


@st.cache_resource
def load_pubmed_fetcher():
    # NCBI asks API clients to identify themselves via tool/email; the address here is a placeholder.
    pubmed = PubMed(tool="PubmedFactChecker", email="[email protected]")
    return pubmed

def run_ui():
    # This function controls the whole app flow.
    st.set_page_config(page_title="PUBMED FACT-CHECKER", page_icon="📖")
    sidebar = st.sidebar
    sidebar.title('ABOUT')
    sidebar.write("""
        The PubMed fact-checker app enables users to verify biomedical claims by comparing them against
        research papers available on PubMed. \n
        As the number of self-proclaimed experts continues to rise,
        so does the risk of harmful misinformation. This app showcases the potential of Large Language Models
        to provide accurate and valuable information to people.
    """)
    sidebar.title('EXAMPLES')
    sidebar.write('Try one of the examples below to see the PubMed fact-checker in action.')
    st.title('PubMed FACT CHECKER')
    with st.form(key="fact_form"):
        fact = st.text_input('Fact:', placeholder='Enter your fact')
        submitted = st.form_submit_button("Fact-Check")
    if sidebar.button('Mediterranean diet helps with weight loss.', use_container_width=True):
        submitted = True
        fact = 'Mediterranean diet helps with weight loss.'
    if sidebar.button('Low Carb High Fat diet is healthy in long term.', use_container_width=True):
        submitted = True
        fact = 'Low Carb High Fat diet is healthy in long term.'
    if sidebar.button('Vaccines are a cause of autism.', use_container_width=True):
        submitted = True
        fact = 'Vaccines are a cause of autism.'
    sidebar.title('HOW IT WORKS')
    sidebar.write('Source code and in-depth app description available at:')
    sidebar.info('**GitHub: [@jacinthes](https://github.com/jacinthes/slovene-nli-benchmark)**', icon="💻")
    sidebar.title('DISCLAIMER')
    sidebar.write('This project is meant for educational and research purposes. \n'
                  'PubMed fact-checker may provide inaccurate information.')
    if not submitted and not st.session_state.valid_inputs_received:
        st.stop()
    elif submitted and not fact:
        st.warning('Please enter your fact before fact-checking.')
        st.session_state.valid_inputs_received = False
        st.stop()
    elif submitted and detect(fact) != 'en':
        st.warning('Please enter valid text in English. For short inputs, language detection is sometimes inaccurate.')
        st.session_state.valid_inputs_received = False
        st.stop()
    elif submitted and len(fact) >= 75:
        st.warning('To ensure accurate searching, please keep your fact under 75 characters.')
        st.session_state.valid_inputs_received = False
        st.stop()
    elif submitted or st.session_state.valid_inputs_received:
        pubmed_query = GPTHelper.gpt35_rephrase(fact)  # Call GPT-3.5 to rephrase the fact as a PubMed query.
        pubmed = load_pubmed_fetcher()
        with st.spinner('Fetching articles...'):
            articles = get_articles(pubmed_query, pubmed)
        article_conclusions = articles['Conclusions']
        article_links = articles['Links']
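        # The cross-encoder scores each (fact, conclusion) pair jointly; this is slower
        # than embedding-based retrieval but more accurate for reranking a short list.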
        cross_inp = [[fact, conclusion] for conclusion in article_conclusions]
        with st.spinner('Assessing article relevancy...'):
            cross_encoder = load_cross_encoder()
            cross_scores = cross_encoder.predict(cross_inp)  # Calculate relevancy using the defined cross-encoder.
        df = pd.DataFrame({
            'Link': article_links,
            'Conclusion': article_conclusions,
            'Score': cross_scores
        })
        df.sort_values(by=['Score'], ascending=False, inplace=True)
        df = df[df['Score'] > 0]  # Only keep articles with a relevancy score above 0.
        if df.shape[0] == 0:  # If no relevant article is found, inform the user.
            st.info(
                "Unfortunately, I couldn't find anything for your search.\n"
                "Don't let that discourage you, I have over 35 million citations in my database.\n"
                "I am sure your next search will be more successful."
            )
            st.stop()
        df = df.head(10)  # Keep only the 10 most relevant articles, to control OpenAI costs and load time.
        progress_text = "Assessing the validity of the fact based on relevant research papers."
        fact_checking_bar = st.progress(0, text=progress_text)
        step = 100 / df.shape[0]
        percent_complete = 0
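        # st.progress accepts a float in [0.0, 1.0]; each article advances the bar by an equal share.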
        predictions = []
        for index, row in df.iterrows():
            prediction = GPTHelper.gpt35_check_fact(row['Conclusion'], fact)  # Prompt GPT-3.5 to fact-check.
            # For output purposes, the labels True, False and Undetermined are used.
            if prediction == 'Entails':
                predictions.append('True')
            elif prediction == 'Contradicts':
                predictions.append('False')
            elif prediction == 'Undetermined':
                predictions.append(prediction)
            else:
                # Fall back to Undetermined so predictions stays aligned with the DataFrame rows.
                logging.warning(f'Unexpected prediction: {prediction}')
                predictions.append('Undetermined')
            percent_complete += step / 100
            fact_checking_bar.progress(round(percent_complete, 2), text=progress_text)
        fact_checking_bar.empty()
        df['Prediction'] = predictions
        # Prepare the DataFrame for the plotly sunburst chart.
        totals = df.groupby('Prediction').size().to_dict()
        df['Total'] = df['Prediction'].map(totals)
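        # path=['Prediction', 'Link'] nests each article under its verdict: verdicts form
        # the inner ring of the chart, the clickable PubMed links form the outer ring.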
        fig = px.sunburst(df, path=['Prediction', 'Link'], values='Total', height=600, width=600, color='Prediction',
                          color_discrete_map={
                              'False': "#FF8384",
                              'True': "#A5D46A",
                              'Undetermined': "#FFDF80"
                          }
                          )
        fig.update_layout(
            margin=dict(l=20, r=20, t=20, b=20),
            font_size=32,
            font_color='#000000'
        )
        st.write(f'According to PubMed "{fact}" is:')
        st.plotly_chart(fig, use_container_width=True)


if __name__ == "__main__":
    run_ui()
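
# A minimal way to run this app locally, assuming the file is saved as app.py and a
# GPTHelper module exposing gpt35_rephrase/gpt35_check_fact is on the import path:
#   streamlit run app.py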