# Gosse Minnema
# First public, anonymized version
# 2922ea1
import streamlit as st
import pandas as pd
import json
import nltk
import re
# Download the "punkt" resource from NLTK; the sentence splitting done with
# nltk.sent_tokenize further down presumably depends on it.
nltk.download("punkt")

# Pre-computed, per-sentence model outputs keyed by the exact sentence string:
#   s2s -> perception scores for the sentence
#   s2f -> frame-semantic analysis for the sentence
# The files are read as UTF-8 explicitly so that the sentence keys decode
# identically on every platform instead of depending on the locale default.
with open("_perception_cache.json", encoding="utf-8") as f:
    s2s = json.load(f)
with open("_analysis_cache.json", encoding="utf-8") as f:
    s2f = json.load(f)

# Spreadsheet of responses; the script below reads the "Email Address" column,
# three demographic columns and the "Writing exercise: ..." text columns.
db = pd.read_excel("data.xlsx")

# Frame names that get_frame_analysis keeps; every other frame is ignored.
FRAMES_OF_INTEREST = [
    "Abusing",
    "Attack",
    "Hit_target",
    "Quarreling",
    "Use_firearm",
    "Death",
    "Dead_or_alive",
    "Experience_bodily_harm",
    "Cause_harm",
    "Killing",
    "Event",
    "Catastrophe",
    "Offenses",
]
def get_frame_analysis(s):
    """Tabulate the frames of interest found in sentence ``s``.

    The sentence is looked up in the ``s2f`` analysis cache, and every frame
    structure whose frame name is listed in ``FRAMES_OF_INTEREST`` becomes one
    row with the columns ``frame``, ``trigger`` (the target tokens joined by
    spaces) and one column per role (the role tokens joined by spaces).

    Returns:
        A ``pandas.DataFrame`` with one row per retained frame, or ``None``
        when the sentence is not cached or has no frame of interest.
    """
    if s not in s2f:
        return None

    rows = []
    for structure in s2f[s]["sociofillmore"][0]["fn_structures"]:
        frame_name = structure["frame"]
        if frame_name not in FRAMES_OF_INTEREST:
            continue

        row = {
            "frame": frame_name,
            "trigger": " ".join(structure["target"]["tokens_str"]),
        }
        # Each role is a pair-like item: name first, token container second.
        for role in structure["roles"]:
            row[role[0]] = " ".join(role[1]["tokens_str"])
        rows.append(row)

    if not rows:
        return None
    return pd.DataFrame(rows)
def analyze_document(doc, max_sentences=20):
    """Split a document into sentences and fetch their cached analyses.

    Args:
        doc: The document text, or a missing value (``None``/NaN), which is
            treated as an empty document.
        max_sentences: How many leading sentences to analyse. ``None``
            analyses every sentence. Defaults to 20.

    Returns:
        A tuple ``(sentences, perception_tables, frame_tables)``:

        * ``sentences`` - all sentences of the document (not truncated);
        * ``perception_tables`` - for each analysed sentence, its entry in the
          ``s2s`` cache, or ``None`` if the sentence is not cached;
        * ``frame_tables`` - for each analysed sentence, the result of
          ``get_frame_analysis`` (a DataFrame or ``None``).

        The two table lists are aligned with ``sentences[:max_sentences]``,
        so they can be shorter than ``sentences``.
    """
    if pd.isna(doc):
        sentences = []
    else:
        sentences = nltk.sent_tokenize(doc, language="english")

    analysed = sentences[:max_sentences]
    perception_tables = [s2s.get(sentence) for sentence in analysed]
    frame_tables = [get_frame_analysis(sentence) for sentence in analysed]
    return sentences, perception_tables, frame_tables
# ---------------------------------------------------------------------------
# Page header and raw data
# ---------------------------------------------------------------------------
st.write("# LorentzFillmore: WATCH YOUR LANGUAGE")
st.dataframe(db)

# ---------------------------------------------------------------------------
# Perception scores per writer (optionally averaged over sentences)
# ---------------------------------------------------------------------------
st.write("## Writing Exercises & Perception Scores")
text_columns = [col for col in db.columns if col.startswith("Writing exercise:")]
selected_column = st.selectbox(label="Writing exercise:", options=text_columns)
aggregate_sentences = st.checkbox(label="Aggregate over sentences?")

perception_rows = []
for _, row in db.iterrows():
    sentences, perception_tables, _ = analyze_document(row[selected_column])

    # Metadata shared by every output row produced for this writer.
    writer_info = {
        "writer": row["Email Address"],
        "gender": row["I identify as ..."],
        "language": row["What is your native language?"],
        "background": row["What is your background?"],
    }

    if aggregate_sentences:
        # One row per writer: average the scores over the analysed sentences.
        # Sentences missing from the perception cache are represented by
        # None; they are dropped first because a list mixing dicts and None
        # cannot be turned into a DataFrame.
        cached_tables = [pt for pt in perception_tables if pt is not None]
        perception_row = {**writer_info, "text": sentences}
        perception_row.update(pd.DataFrame(cached_tables).mean().to_dict())
        perception_rows.append(perception_row)
    else:
        # One row per analysed sentence; zip stops at the shorter list, so
        # sentences beyond the analysed prefix are skipped.
        for s, pt in zip(sentences, perception_tables):
            perception_row = {**writer_info, "text": s}
            # An uncached sentence (pt is None) contributes no score columns.
            perception_row.update(pd.Series(pt).to_dict())
            perception_rows.append(perception_row)

perception_df = pd.DataFrame(perception_rows)

# Show only the score columns whose name starts with the chosen dimension.
dimension = st.selectbox(label="Which dimension of perception?", options=["blame", "cause", "focus"])
dim_cols = [col for col in perception_df.columns if col.startswith(dimension)]
dim_df = (
    perception_df[["writer", "text"] + dim_cols]
    .style.background_gradient(subset=dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
)
st.dataframe(dim_df)

# Bar chart of the mean score of one perception column per demographic group.
st.write("### Analysis by demographic attribute")
demo_attrib = st.selectbox("Select demographic attribute:", options=["writer", "gender", "language", "background"])
perc_attrib = st.selectbox("Select perception attribute", options=dim_cols)
st.plotly_chart(perception_df.groupby(demo_attrib).agg({perc_attrib: "mean"}).plot.bar(backend="plotly"))
# ---------------------------------------------------------------------------
# Difference in perception scores between a text version and its predecessor
# ---------------------------------------------------------------------------
st.write("## Comparing versions")

# The selected column name is expected to contain "version <N>". \d+ (rather
# than a single \d) keeps version numbers of 10 and above intact.
version_match = re.search(r"version (\d+)", selected_column)
v_number = int(version_match.group(1)) if version_match else None

prev_version = None
if v_number is not None and v_number >= 2:
    prev_version = re.sub(r"version (\d+)", f"version {v_number - 1}", selected_column)

if prev_version is None:
    # No version number in the column name, or nothing earlier to compare to.
    st.warning("To compare versions, select a writing exercise with version number 2 or higher.")
elif prev_version not in text_columns:
    # Reported in the page instead of asserted: assert statements are
    # stripped when Python runs with -O.
    st.error(f"Cannot compare versions: column '{prev_version}' was not found in the data.")
else:
    st.info(f"Comparing _{selected_column.replace('Writing exercise: ', '')}_ ↔️ _{prev_version.replace('Writing exercise: ', '')}_")

    perception_diff_rows = []
    for _, row in db.iterrows():
        sentences, perception_tables, _ = analyze_document(row[selected_column])
        prev_sentences, prev_perception_tables, _ = analyze_document(row[prev_version])

        perception_diff_row = {
            "writer": row["Email Address"],
            "gender": row["I identify as ..."],
            "language": row["What is your native language?"],
            "background": row["What is your background?"],
            f"text_v{v_number - 1}": prev_sentences,
            f"text_v{v_number}": sentences,
        }

        # Mean score per perception column for each version. Sentences that
        # are missing from the perception cache (None entries) are left out,
        # since a list mixing dicts and None cannot become a DataFrame.
        perc_new = pd.DataFrame(
            [pt for pt in perception_tables if pt is not None]
        ).mean().to_dict()
        perc_old = pd.DataFrame(
            [pt for pt in prev_perception_tables if pt is not None]
        ).mean().to_dict()

        # New minus old; a score with no counterpart in the old version is
        # reported as "no change" (0).
        for k, v in perc_new.items():
            perception_diff_row[k] = v - perc_old[k] if k in perc_old else 0
        perception_diff_rows.append(perception_diff_row)

    perception_diff_df = pd.DataFrame(perception_diff_rows)
    dim_diff_df = (
        perception_diff_df[["writer", f"text_v{v_number - 1}", f"text_v{v_number}"] + dim_cols]
        .style.background_gradient(subset=dim_cols, axis=None, vmin=-2, vmax=2, cmap="YlGnBu")
    )
    st.dataframe(dim_diff_df)

    # Bar chart of the mean score difference per demographic group.
    st.write("### Analysis by demographic attribute")
    demo_attrib_diff = st.selectbox("Select demographic attribute for diff:", options=["writer", "gender", "language", "background"])
    perc_attrib_diff = st.selectbox("Select perception attribute for diff", options=dim_cols)
    st.plotly_chart(perception_diff_df.groupby(demo_attrib_diff).agg({perc_attrib_diff: "mean"}).plot.bar(backend="plotly"))
# ---------------------------------------------------------------------------
# Per-writer view: document-level mean scores and sentence-by-sentence detail
# ---------------------------------------------------------------------------
st.write("## Frame analysis")
only_sentences_with_relevant_frames = st.checkbox("Only analyze sentences containing relevant frames?")

# The options are the addresses rendered as strings, so the lookup below
# compares against the same string rendering; comparing the raw column would
# never match a value that is not already a string.
writer_ids = db["Email Address"].astype(str)
selected_writer = st.selectbox(
    label="Select a writer:",
    options=sorted(writer_ids),
)
st.write("----")
# If several rows share an address, the first one is shown.
writer_row = db[writer_ids == selected_writer].iloc[0]

st.write("### Text information")
st.dataframe(writer_row)

sentences, perception_tables, frame_tables = analyze_document(writer_row[selected_column])

st.write("### Analysis of the entire document")
# Sentences missing from the perception cache are represented by None; they
# are dropped before averaging because a list mixing dicts and None cannot be
# turned into a DataFrame.
cached_tables = [pt for pt in perception_tables if pt is not None]
mean_perception = pd.DataFrame(cached_tables).mean().to_frame(name="mean perception")
st.dataframe(mean_perception.style.highlight_max(axis=0), width=500)
st.write("---")

st.write("### Analysis by sentence")
# Only the first 20 sentences are shown, matching the default number of
# sentences for which analyze_document returns tables.
shown_sentences = sentences[:20]
for si, s in enumerate(shown_sentences):
    frame_analysis_df = frame_tables[si]
    if frame_analysis_df is None and only_sentences_with_relevant_frames:
        continue

    st.write(f"#### Sentence #{1+si:02}/{len(shown_sentences)}\n*{s}*")

    # A sentence must be present in both caches to be displayed in detail.
    if s not in s2s or s not in s2f:
        st.write("(Analysis not found)")
        continue

    st.write("##### Perception")
    perception_table = perception_tables[si]
    perception_df = pd.Series(perception_table).to_frame(name="predicted perception").style.highlight_max(axis=0)
    st.dataframe(perception_df, width=500)

    if frame_analysis_df is not None:
        st.write("##### Relevant frames")
        st.dataframe(frame_analysis_df, width=750)