|
import streamlit as st |
|
import pandas as pd |
|
import json |
|
import nltk |
|
|
|
import re |
|
|
|
# Fetch the "punkt" resource from NLTK; presumably required by the
# nltk.sent_tokenize call in analyze_document.
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead -- confirm
# against the pinned NLTK version.
nltk.download("punkt")

# Sentence -> perception scores, looked up by the exact sentence string.
# JSON is read explicitly as UTF-8 so non-ASCII sentence keys decode the same
# way on every platform instead of depending on the locale default encoding.
with open("_perception_cache.json", encoding="utf-8") as f:
    s2s = json.load(f)

# Sentence -> frame analysis (read via the "sociofillmore" key).
with open("_analysis_cache.json", encoding="utf-8") as f:
    s2f = json.load(f)

# One row per writer; the script reads the "Email Address", demographic and
# "Writing exercise: ..." columns from this table.
db = pd.read_excel("data.xlsx")

# Frame names that get_frame_analysis keeps; all other frames are ignored.
FRAMES_OF_INTEREST = [
    "Abusing",
    "Attack",
    "Hit_target",
    "Quarreling",
    "Use_firearm",
    "Death",
    "Dead_or_alive",
    "Experience_bodily_harm",
    "Cause_harm",
    "Killing",
    "Event",
    "Catastrophe",
    "Offenses",
]
|
|
|
|
|
def get_frame_analysis(s, analysis_cache=None, frames_of_interest=None):
    """Collect the frames of interest annotated for sentence ``s``.

    The sentence is looked up in the analysis cache; the ``"fn_structures"``
    of the first entry under ``"sociofillmore"`` are scanned and every
    structure whose ``"frame"`` is one of the frames of interest becomes one
    row with the columns ``"frame"``, ``"trigger"`` (target tokens joined by
    spaces) and one column per role (role tokens joined by spaces).

    Args:
        s: Sentence string used as the cache key.
        analysis_cache: Mapping from sentence to its analysis. Defaults to
            the module-level ``s2f`` cache.
        frames_of_interest: Iterable of frame names to keep. Defaults to the
            module-level ``FRAMES_OF_INTEREST``.

    Returns:
        A ``pandas.DataFrame`` with one row per matching frame, or ``None``
        when the sentence is not cached, has no analysis entries, or contains
        none of the frames of interest.
    """
    if analysis_cache is None:
        analysis_cache = s2f
    if frames_of_interest is None:
        frames_of_interest = FRAMES_OF_INTEREST

    entry = analysis_cache.get(s)
    if entry is None:
        return None

    # An empty analysis list means there is nothing to report; indexing [0]
    # unconditionally would raise IndexError here.
    parses = entry["sociofillmore"]
    if not parses:
        return None

    wanted = frozenset(frames_of_interest)
    rows = []
    for structure in parses[0]["fn_structures"]:
        if structure["frame"] not in wanted:
            continue
        row = {
            "frame": structure["frame"],
            "trigger": " ".join(structure["target"]["tokens_str"]),
        }
        # Each role is a (name, span) pair; roles are applied after the fixed
        # columns, so a role with the same name overrides them (as before).
        row.update(
            {role[0]: " ".join(role[1]["tokens_str"]) for role in structure["roles"]}
        )
        rows.append(row)

    return pd.DataFrame(rows) if rows else None
|
|
|
def analyze_document(doc, max_sentences=20):
    """Split a document into sentences and look up their cached analyses.

    Args:
        doc: Text of one writing exercise. Missing values (``None``/NaN) are
            treated as an empty document; other non-string values are
            converted with ``str`` before tokenizing.
        max_sentences: Only the first ``max_sentences`` sentences are looked
            up in the caches (default 20).

    Returns:
        A tuple ``(sentences, perception_tables, frame_tables)``.
        ``sentences`` is the full list of sentences. The two table lists are
        aligned with ``sentences[:max_sentences]``: ``perception_tables``
        holds the ``s2s`` cache entry for each sentence (``None`` when the
        sentence is not cached) and ``frame_tables`` holds the result of
        ``get_frame_analysis`` for each sentence (which may also be ``None``).
    """
    if pd.isna(doc):
        sentences = []
    else:
        # A cell holding e.g. a number would make the tokenizer raise; use
        # its string form instead.
        text = doc if isinstance(doc, str) else str(doc)
        sentences = nltk.sent_tokenize(text, language="english")

    analysed = sentences[:max_sentences]
    perception_tables = [s2s.get(sentence) for sentence in analysed]
    frame_tables = [get_frame_analysis(sentence) for sentence in analysed]
    return sentences, perception_tables, frame_tables
|
|
|
|
|
# ---- Page title and raw data ----
st.write("# LorentzFillmore: WATCH YOUR LANGUAGE")

st.dataframe(db)

# ---- Perception scores per writing exercise ----
st.write("## Writing Exercises & Perception Scores")

text_columns = [col for col in db.columns if col.startswith("Writing exercise:")]
selected_column = st.selectbox(label="Writing exercise:", options=text_columns)
aggregate_sentences = st.checkbox(label="Aggregate over sentences?")

# Build one row per writer (aggregated) or one row per sentence.
perception_rows = []
for _, row in db.iterrows():
    sentences, perception_tables, _ = analyze_document(row[selected_column])

    # Demographic columns shared by every row produced for this writer.
    writer_info = {
        "writer": row["Email Address"],
        "gender": row["I identify as ..."],
        "language": row["What is your native language?"],
        "background": row["What is your background?"],
    }

    if aggregate_sentences:
        # Sentences missing from the perception cache come back as None;
        # they must be dropped before averaging, otherwise building the
        # DataFrame from a mix of dicts and None fails.
        scored = [pt for pt in perception_tables if pt is not None]
        perception_row = {**writer_info, "text": sentences}
        perception_row.update(pd.DataFrame(scored).mean().to_dict())
        perception_rows.append(perception_row)
    else:
        # zip stops at the shorter list, so only sentences that were
        # actually looked up get a row.
        for s, pt in zip(sentences, perception_tables):
            perception_row = {**writer_info, "text": s}
            if pt is not None:
                perception_row.update(pd.Series(pt).to_dict())
            perception_rows.append(perception_row)

perception_df = pd.DataFrame(perception_rows)

dimension = st.selectbox(label="Which dimension of perception?", options=["blame", "cause", "focus"])
dim_cols = [col for col in perception_df.columns if col.startswith(dimension)]

if perception_df.empty:
    # Without rows there are no "writer"/"text" columns to select.
    st.info("No text available for the selected writing exercise.")
else:
    dim_df = (
        perception_df[["writer", "text"] + dim_cols]
        .style.background_gradient(subset=dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
    )
    st.dataframe(dim_df)

st.write("### Analysis by demographic attribute")

demo_attrib = st.selectbox("Select demographic attribute:", options=["writer", "gender", "language", "background"])
perc_attrib = st.selectbox("Select perception attribute", options=dim_cols)
if perc_attrib is None:
    # No score columns for this dimension: nothing to aggregate or plot.
    st.info("No perception scores available for the selected dimension.")
else:
    st.plotly_chart(perception_df.groupby(demo_attrib).agg({perc_attrib: "mean"}).plot.bar(backend="plotly"))
|
|
|
|
|
# ---- Difference in mean perception between consecutive versions ----
st.write("## Comparing versions")


def _mean_perception(tables):
    """Average the perception tables, ignoring sentences without scores."""
    return pd.DataFrame([t for t in tables if t is not None]).mean().to_dict()


# Accept multi-digit version numbers. A column name without any version
# number is treated as version 0, so the warning below is shown instead of
# failing on a missing regex match.
version_match = re.search(r"version (\d+)", selected_column)
v_number = int(version_match.group(1)) if version_match else 0
prev_version = (
    re.sub(r"version (\d+)", f"version {v_number - 1}", selected_column)
    if v_number >= 2
    else None
)

if v_number < 2:
    st.warning("To compare versions, select a writing exercise with version number 2 or higher.")
elif prev_version not in text_columns:
    # Reported to the user rather than asserted: asserts are stripped under
    # ``python -O`` and otherwise surface as a raw traceback.
    st.error(f"Cannot compare versions: column _{prev_version}_ was not found in the data.")
else:
    st.info(f"Comparing _{selected_column.replace('Writing exercise: ', '')}_ ↔️ _{prev_version.replace('Writing exercise: ', '')}_")

    perception_diff_rows = []
    for _, row in db.iterrows():
        sentences, perception_tables, _ = analyze_document(row[selected_column])
        prev_sentences, prev_perception_tables, _ = analyze_document(row[prev_version])

        perception_diff_row = {
            "writer": row["Email Address"],
            "gender": row["I identify as ..."],
            "language": row["What is your native language?"],
            "background": row["What is your background?"],
            f"text_v{v_number - 1}": prev_sentences,
            f"text_v{v_number}": sentences,
        }

        perc_new = _mean_perception(perception_tables)
        perc_old = _mean_perception(prev_perception_tables)
        for k, v in perc_new.items():
            # A score that only exists in the newer version counts as "no change".
            perception_diff_row[k] = v - perc_old[k] if k in perc_old else 0
        perception_diff_rows.append(perception_diff_row)

    perception_diff_df = pd.DataFrame(perception_diff_rows)

    # Only use score columns that actually exist in the diff table.
    diff_cols = [col for col in dim_cols if col in perception_diff_df.columns]

    if perception_diff_df.empty:
        st.info("No text available to compare for the selected writing exercise.")
    else:
        dim_diff_df = (
            perception_diff_df[["writer", f"text_v{v_number - 1}", f"text_v{v_number}"] + diff_cols]
            .style.background_gradient(subset=diff_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
        )
        st.dataframe(dim_diff_df)

    st.write("### Analysis by demographic attribute")

    demo_attrib_diff = st.selectbox("Select demographic attribute for diff:", options=["writer", "gender", "language", "background"])
    perc_attrib_diff = st.selectbox("Select perception attribute for diff", options=diff_cols)
    if perc_attrib_diff is None:
        st.info("No perception scores available to compare.")
    else:
        st.plotly_chart(perception_diff_df.groupby(demo_attrib_diff).agg({perc_attrib_diff: "mean"}).plot.bar(backend="plotly"))
|
|
|
|
|
# ---- Per-writer, per-sentence frame and perception analysis ----
st.write("## Frame analysis")
only_sentences_with_relevant_frames = st.checkbox("Only analyze sentences containing relevant frames?")

# Writers are listed by the string form of their address, so the lookup
# below compares against that same string form; comparing against the raw
# column would find no row for non-string or missing addresses.
writer_labels = db["Email Address"].map(str)
selected_writer = st.selectbox(
    label="Select a writer:",
    options=sorted(writer_labels),
)

st.write("----")

writer_matches = db[writer_labels == selected_writer]

if writer_matches.empty:
    # Happens when the table has no rows (the select box then returns None).
    st.info("No writer available for frame analysis.")
else:
    writer_row = writer_matches.iloc[0]

    st.write("### Text information")
    st.dataframe(writer_row)

    sentences, perception_tables, frame_tables = analyze_document(writer_row[selected_column])

    st.write("### Analysis of the entire document")
    # Sentences without cached perception scores are None and are left out
    # of the document-level average.
    scored_tables = [pt for pt in perception_tables if pt is not None]
    mean_perception = pd.DataFrame(scored_tables).mean().to_frame(name="mean perception")
    st.dataframe(mean_perception.style.highlight_max(axis=0), width=500)

    st.write("---")
    st.write("### Analysis by sentence")

    shown_sentences = sentences[:20]
    per_sentence = zip(shown_sentences, perception_tables, frame_tables)
    for number, (s, perception_table, frame_analysis_df) in enumerate(per_sentence, start=1):
        if frame_analysis_df is None and only_sentences_with_relevant_frames:
            continue

        st.write(f"#### Sentence #{number:02}/{len(shown_sentences)}\n*{s}*")
        if s not in s2s or s not in s2f:
            st.write("(Analysis not found)")
            continue

        st.write("##### Perception")
        styled_perception = (
            pd.Series(perception_table)
            .to_frame(name="predicted perception")
            .style.highlight_max(axis=0)
        )
        st.dataframe(styled_perception, width=500)

        if frame_analysis_df is not None:
            st.write("##### Relevant frames")
            st.dataframe(frame_analysis_df, width=750)