from collections import defaultdict
from itertools import count
from operator import itemgetter
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import htbuilder
import streamlit as st
from htbuilder import span, div, script, style, link, styles, HtmlElement, br
from htbuilder.units import px
from spacy.tokens import Doc

palette = [
    "#66c2a5",
    "#fc8d62",
    "#8da0cb",
    "#e78ac3",
    "#a6d854",
    "#ffd92f",
    "#e5c494",
    "#b3b3b3",
]
inactive_color = "#BBB"


def local_stylesheet(path):
    with open(path) as f:
        css = f.read()
    return style()(
        css
    )


def remote_stylesheet(url):
    return link(
        href=url
    )


def local_script(path):
    with open(path) as f:
        code = f.read()
    return script()(
        code
    )


def remote_script(url):
    return script(
        src=url
    )


def get_color(sent_idx):
    return palette[sent_idx % len(palette)]


def hex_to_rgb(hex):
    hex = hex.replace("#", "")
    return tuple(int(hex[i:i + 2], 16) for i in (0, 2, 4))


def color_with_opacity(hex_color, opacity):
    rgb = hex_to_rgb(hex_color)
    return f"rgba({rgb[0]},{rgb[1]},{rgb[2]},{opacity:.2f})"


class Component:

    def show(self, width=None, height=None, scrolling=True, **kwargs):
        out = div(style=styles(
            **kwargs
        ))(self.html())
        html = str(out)
        st.components.v1.html(html, width=width, height=height, scrolling=scrolling)

    def html(self):
        raise NotImplementedError


class MainView(Component):

    def __init__(
        self,
        document: Doc,
        summaries: List[Doc],
        semantic_alignments: Optional[List[Dict]],
        lexical_alignments: Optional[List[Dict]],
        layout: str,
        scroll: bool,
        gray_out_stopwords: bool
    ):
        self.document = document
        self.summaries = summaries
        self.semantic_alignments = semantic_alignments
        self.lexical_alignments = lexical_alignments
        self.layout = layout
        self.scroll = scroll
        self.gray_out_stopwords = gray_out_stopwords

    def html(self):
        # Add document elements
        if self.document._.name == 'Document':
            document_name = 'Source Document'
        else:
            document_name = self.document._.name + ' summary'
        doc_header = div(
            id_="document-header"
        )(
            document_name
        )
        doc_elements = []

        # Add document content, which comprises multiple elements, one for each summary. Only the element
        # corresponding to the selected summary will be visible.
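
        # Note: MultiUnderline is not imported above and is assumed to be defined elsewhere in this
        # module. Based on its use below (mu.markup(token_elements, spans)), it appears to mark up the
        # given token elements with colored underlines for the supplied spans.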
        mu = MultiUnderline()
        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = {}
            for sent_idx, sent in enumerate(summary.sents):
                for token in sent:
                    token_idx_to_sent_idx[token.i] = sent_idx
            is_selected_summary = (summary_idx == 0)  # By default, first summary is selected

            # Map each document token to its semantic matches: (summary token index, similarity) pairs
            if self.semantic_alignments is not None:
                doc_token_idx_to_matches = defaultdict(list)
                semantic_alignment = self.semantic_alignments[summary_idx]
                for summary_token_idx, matches in semantic_alignment.items():
                    for doc_token_idx, sim in matches:
                        doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))
            else:
                doc_token_idx_to_matches = {}

            token_elements = []
            for doc_token_idx, doc_token in enumerate(self.document):
                if doc_token.is_stop or doc_token.is_punct:
                    classes = ["stopword"]
                    if self.gray_out_stopwords:
                        classes.append("grayed-out")
                    el = span(
                        _class=" ".join(classes)
                    )(
                        doc_token.text
                    )
                else:
                    matches = doc_token_idx_to_matches.get(doc_token_idx)
                    if matches:
                        # Color the document token by its most similar summary token
                        summary_token_idx, sim = max(matches, key=itemgetter(1))
                        sent_idx = token_idx_to_sent_idx[summary_token_idx]
                        color_primary = get_color(sent_idx)
                        highlight_color_primary = color_with_opacity(color_primary, sim)
                        props = {
                            'data-highlight-id': str(doc_token_idx),
                            'data-primary-color': highlight_color_primary
                        }
                        match_classes = []
                        for summary_token_idx, sim in matches:
                            sent_idx = token_idx_to_sent_idx[summary_token_idx]
                            match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
                            color = color_with_opacity(get_color(sent_idx), sim)
                            props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
                        props["data-match-classes"] = " ".join(match_classes)
                        el = self._highlight(
                            doc_token.text,
                            highlight_color_primary,
                            color_primary,
                            match_classes + ["annotation-hidden"],
                            **props
                        )
                    else:
                        el = doc_token.text
                token_elements.append(el)

            # Collect lexical-alignment spans to underline in the document view
            spans = []
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for summary_span, doc_spans in lexical_alignment.items():
                    summary_span_start, summary_span_end = summary_span
                    span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
                    sent_idx = token_idx_to_sent_idx[summary_span_start]
                    for doc_span_start, doc_span_end in doc_spans:
                        spans.append((
                            doc_span_start,
                            doc_span_end,
                            sent_idx,
                            get_color(sent_idx),
                            span_id
                        ))
            token_elements = mu.markup(token_elements, spans)

            classes = ["main-doc", "bordered"]
            if self.scroll:
                classes.append("scroll")
            main_doc = div(
                _class=" ".join(classes)
            )(
                token_elements
            )

            classes = ["doc"]
            if is_selected_summary:
                classes.append("display")
            else:
                classes.append("nodisplay")
            doc_elements.append(
                div(
                    **{
                        "class": " ".join(classes),
                        "data-index": summary_idx
                    }
                )(
                    main_doc,
                    div(_class="proxy-doc"),
                    div(_class="proxy-scroll")
                )
            )

        # Add summary elements
        summary_title = "Summary"
        summary_header = div(
            id_="summary-header"
        )(
            summary_title,
            div(id_="summary-header-gap"),
        )

        summary_items = []
        for summary_idx, summary in enumerate(self.summaries):
            token_idx_to_sent_idx = {}
            for sent_idx, sent in enumerate(summary.sents):
                for token in sent:
                    token_idx_to_sent_idx[token.i] = sent_idx

            spans = []
            matches_ngram = [False] * len(list(summary))
            if self.lexical_alignments is not None:
                lexical_alignment = self.lexical_alignments[summary_idx]
                for summary_span in lexical_alignment.keys():
                    start, end = summary_span
                    matches_ngram[start:end] = [True] * (end - start)
                    span_id = f"{summary_idx}-{start}-{end}"
                    sent_idx = token_idx_to_sent_idx[start]
                    spans.append((
                        start,
                        end,
                        sent_idx,
                        get_color(sent_idx),
                        span_id
                    ))

            if self.semantic_alignments is not None:
                semantic_alignment = self.semantic_alignments[summary_idx]
            else:
                semantic_alignment = {}

            token_elements = []
            for token_idx, token in enumerate(summary):
                if token.is_stop or token.is_punct:
                    classes = ["stopword"]
                    if self.gray_out_stopwords:
                        classes.append("grayed-out")
                    el = span(
                        _class=" ".join(classes)
                    )(
                        token.text
                    )
                else:
                    classes = []
                    if token.ent_iob_ in ('I', 'B'):
                        classes.append("entity")
                    if matches_ngram[token_idx]:
                        classes.append("matches-ngram")
                    matches = semantic_alignment.get(token_idx)
                    if matches:
                        top_match = max(matches, key=itemgetter(1))
                        top_sim = max(top_match[1], 0)
                        top_doc_token_idx = top_match[0]
                        props = {
                            "data-highlight-id": f"{summary_idx}-{token_idx}",
                            "data-top-doc-highlight-id": str(top_doc_token_idx),
                            "data-top-doc-sim": f"{top_sim:.2f}",
                        }
                        classes.extend([
                            "annotation-hidden",
                            f"summary-highlight-{summary_idx}-{token_idx}"
                        ])
                        sent_idx = token_idx_to_sent_idx[token_idx]
                        el = self._highlight(
                            token.text,
                            color_with_opacity(get_color(sent_idx), top_sim),
                            color_with_opacity(get_color(sent_idx), 1),
                            classes,
                            **props
                        )
                    else:
                        if classes:
                            el = span(_class=" ".join(classes))(token.text)
                        else:
                            el = token.text
                token_elements.append(el)

            token_elements = mu.markup(token_elements, spans)

            classes = ["summary-item"]
            if summary_idx == 0:  # Default is for first summary to be selected
                classes.append("selected")
            summary_items.append(
                div(
                    **{"class": ' '.join(classes), "data-index": summary_idx}
                )(
                    div(_class="name")(summary._.name),
                    div(_class="content")(token_elements)
                )
            )

        classes = ["summary-list", "bordered"]
        if self.scroll:
            classes.append("scroll")
        if self.lexical_alignments is not None:
            classes.append("has-lexical-alignment")
        if self.semantic_alignments is not None:
            classes.append("has-semantic-alignment")
        summary_list = div(
            _class=" ".join(classes)
        )(
            summary_items
        )

        annotation_key = \
            """