|
|
|
|
|
import requests |
|
import spacy_udpipe |
|
import streamlit as st |
|
from spacy import displacy |
|
|
|
@st.cache_resource
def _load_pipeline():
    """Download the Arabic UDPipe model and build the local NER pipeline.

    Streamlit re-executes the script on every widget interaction; caching the
    pipeline as a resource means the download check, model load and
    ``span_marker`` setup run once per server process instead of per rerun.

    Returns:
        The pipeline returned by ``spacy_udpipe.load("ar")`` with a
        ``span_marker`` component added.
    """
    spacy_udpipe.download("ar")
    pipeline = spacy_udpipe.load("ar")
    pipeline.add_pipe(
        "span_marker",
        config={"model": "iahlt/span-marker-xlm-roberta-base-ar"},
    )
    return pipeline


# Shared pipeline used by the script entry point for local annotation.
nlp = _load_pipeline()
|
|
|
|
|
# Highlight colour for each entity label, passed to displaCy as the "colors"
# option. Labels that are spelling variants of one another share a colour, so
# the table is written as (colour, labels) groups and flattened in order.
DEFAULT_LABEL_COLORS = {
    label: color
    for color, labels in (
        ("#17A2B8", ("ORG", "ORGS", "ORGANIZATION")),
        ("#FA9F42", ("PRODUCT", "COMMERCIAL_ITEM", "DUC")),
        ("#FFC107", ("GPE",)),
        ("#28A745", ("LOC", "LOCATION")),
        ("#0069B4", ("PERSON", "PER", "PERS")),
        ("#FA8B1B", ("TTL", "TITLE")),
        ("#c887fb", ("NORP",)),
        ("#721817", ("FAC",)),
        ("#2B4162", ("EVENT", "EVE")),
        ("#C880B7", ("LAW",)),
        ("#437F97", ("LANGUAGE", "ANG")),
        ("#0B6E4F", ("WORK_OF_ART", "WOA")),
        ("#849324", ("DATE", "TIME", "TIMEX")),
        ("#6C757D", ("MONEY",)),
        ("#FD151B", ("QUANTITY", "ORDINAL", "CARDINAL")),
        ("#F1D302", ("PERCENT",)),
        ("#e7d2e4", ("MISC",)),
        ("#ff8197", ("OTHER",)),
    )
    for label in labels
}
|
|
|
def get_html(html: str) -> str:
    """Wrap rendered markup in a bordered, right-to-left container.

    Args:
        html: Markup to embed. Newlines are replaced with spaces before
            embedding.

    Returns:
        A ``<style>`` rule for ``mark.entity`` followed by a wrapper
        ``<div>`` containing the flattened markup.
    """
    # ``direction: rtl`` has to live inside the style attribute. The closing
    # quote used to come before it, which left the declaration outside the
    # attribute as stray text in the tag.
    wrapper = (
        '<div style="overflow-x: auto; border: 1px solid #e6e9ef; '
        'border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; '
        'direction: rtl">{}</div>'
    )
    style = "<style>mark.entity { display: inline-block }</style>"

    flattened = html.replace("\n", " ")
    return f"{style}{wrapper.format(flattened)}"
|
|
|
|
|
def page_init():
    """Write the demo's page heading via ``st.header``."""
    title = "Named Entity Recognition Demo"
    st.header(title)
|
|
|
|
|
@st.cache_data
def get_html_from_server(text):
    """Annotate *text* with the remote NER service and return display HTML.

    The text is sent to the HTTP endpoint in ``base_url``, the returned
    entities are rendered with displaCy in manual mode, and the markup is
    wrapped by ``get_html``. Decorated with ``st.cache_data``.

    Args:
        text: Raw text to annotate; surrounding whitespace is stripped
            before the request is made.

    Returns:
        HTML string produced by ``get_html``.

    Raises:
        requests.HTTPError: The service answered with a 4xx/5xx status.
        requests.Timeout: The service did not respond within the timeout.
    """
    base_url = "https://ne-api.iahlt.org/api/arabic/ner/?text={}"
    # requests waits indefinitely by default; bound the wait so a stalled
    # service cannot hang the app.
    request_timeout = 30

    def get_entities(text):
        """Fetch entities for *text*, dropping those tagged ``"O"``.

        Returns an empty list for blank input; otherwise returns the
        service's JSON answer with ``"ents"`` rewritten to
        ``start``/``end``/``label`` dicts.
        """
        text = text.strip()
        if not text:
            return []
        response = requests.get(
            base_url.format(requests.utils.quote(text)),
            timeout=request_timeout,
        )
        # Fail with a clear HTTP error instead of a confusing JSON/KeyError
        # further down when the service reports a failure.
        response.raise_for_status()
        answer = response.json()
        answer["ents"] = [
            {
                "start": ent["start"],
                "end": ent["end"],
                "label": ent["entity_group"],
            }
            for ent in answer["ents"]
            if ent["entity_group"] != "O"
        ]
        return answer

    def render_entities(text):
        """Render the fetched entities as displaCy entity markup."""
        entities = get_entities(text)
        html = displacy.render(
            entities,
            style="ent",
            options={"direction": "rtl", "colors": DEFAULT_LABEL_COLORS},
            manual=True,
        )
        # NOTE(review): presumably meant to flip displaCy's direction
        # attribute; as written it also rewrites any literal "ltr" inside
        # the annotated text itself. Confirm before relying on it.
        return html.replace("ltr", "rtl")

    return get_html(render_entities(text))
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: draw the input form and, once "Annotate" is
    # pressed, show the local pipeline's entities above the remote service's.
    page_init()

    sample_text = """
تمكن البطل الملاكم "محمد عيسى" القناص من الفوز في مباراته ببطولة دبي وذلك بعد انهائه النزال بالضربة القاضية. حيث يواصل البطل محمد عيسى مسيرته بتسلَّق الرُّتَب والألقاب ليصل لملاكمة الاحتراف.
""".strip()

    text = st.text_area("Text", sample_text, height=200, max_chars=1000)
    btn = st.button("Annotate")

    # Page-wide CSS. Fixes relative to the earlier stylesheet:
    #   * `color` is the CSS property for text colour (`font-color` is not a
    #     CSS property and was ignored);
    #   * the `+` in the stylesheet URL only encodes a space, so the family
    #     name to reference is 'Scheherazade New';
    #   * `.entities` now uses the same family, the only web font this page
    #     loads.
    style = """
    <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Scheherazade+New">
    <style>
    .stTextArea textarea {
        font-size: 20px;
        color: black;
        font-family: 'Scheherazade New';
        direction: rtl;
    }
    .entities {
        font-size: 16px;
        font-family: 'Scheherazade New';
        direction: rtl;
    }
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    </style>
    """
    st.write(style, unsafe_allow_html=True)

    if text and btn:
        # Local annotation with the module-level pipeline.
        doc = nlp(text)
        rendered = displacy.render(
            doc,
            style="ent",
            options={"direction": "rtl", "colors": DEFAULT_LABEL_COLORS},
            manual=False,
        )

        local_html = get_html(rendered)
        # Remote annotation of the same text.
        iahlt_html = get_html_from_server(text)

        # Both fragments contain no newlines (get_html flattens them), so
        # they are safe to interpolate into this multi-line template.
        html = f"""
        <div style="display: flex; flex-direction: row; justify-content: space-between; direction: rtl">
        <div>
        <h3>WikiANN-trained model results</h3>
        {local_html}
        </div>
        </div>
        <div style="display: flex; flex-direction: row; justify-content: space-between; direction: rtl">
        <div>
        <h3>IAHLT results</h3>
        {iahlt_html}
        </div>
        </div>
        """
        st.write(html, unsafe_allow_html=True)

    else:
        st.write("")
|
|