textgraphs

Sleeping

File size: 5,993 Bytes

91eaff6

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Sample application to demo the `TextGraphs` library.

see copyright/license https://huggingface.co./spaces/DerwenAI/textgraphs/blob/main/README.md
"""

import asyncio
import sys  # pylint: disable=W0611
import traceback
import time
import typing

from icecream import ic  # pylint: disable=E0401
from pyinstrument import Profiler  # pylint: disable=E0401
import matplotlib.pyplot as plt  # pylint: disable=E0401
import pandas as pd  # pylint: disable=E0401

import textgraphs


if __name__ == "__main__":
    SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
After the war, Werner fled to America to become famous.
"""

    ## set up
    ## NB: profiler raises handler exceptions when `concur = False`
    debug: bool = False  # True
    concur: bool = True  # False
    profile: bool = True  # False

    if profile:
        profiler: Profiler = Profiler()
        profiler.start()

    try:
        start_time: float = time.time()

        tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
            factory = textgraphs.PipelineFactory(
                spacy_model = textgraphs.SPACY_MODEL,
                ner = None, #textgraphs.NERSpanMarker(),
                kg = textgraphs.KGWikiMedia(
                    spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
                    dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
                    dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
                    wikidata_api = textgraphs.WIKIDATA_API,
                ),
                infer_rels = [
                    textgraphs.InferRel_OpenNRE(
                        model = textgraphs.OPENNRE_MODEL,
                        max_skip = textgraphs.MAX_SKIP,
                        min_prob = textgraphs.OPENNRE_MIN_PROB,
                    ),
                    textgraphs.InferRel_Rebel(
                        lang = "en_XX",
                        mrebel_model = textgraphs.MREBEL_MODEL,
                    ),
                ],
            ),
        )

        duration: float = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: set up")


        ## NLP parse
        start_time = time.time()

        pipe: textgraphs.Pipeline = tg.create_pipeline(
            SRC_TEXT.strip(),
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: parse text")


        ## collect graph elements from the parse
        start_time = time.time()

        tg.collect_graph_elements(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: collect elements")


        ## perform entity linking
        start_time = time.time()

        tg.perform_entity_linking(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: entity linking")


        ## perform concurrent relation extraction
        start_time = time.time()

        if concur:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            inferred_edges: list = loop.run_until_complete(
                tg.infer_relations_async(
                    pipe,
                    debug = debug,
                )
            )
        else:
            inferred_edges = tg.infer_relations(
                pipe,
                debug = debug,
            )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: relation extraction")

        n_list: list = list(tg.nodes.values())

        df_rel: pd.DataFrame = pd.DataFrame.from_dict([
            {
                "src": n_list[edge.src_node].text,
                "dst": n_list[edge.dst_node].text,
                "rel": pipe.kg.normalize_prefix(edge.rel),
                "weight": edge.prob,
            }
            for edge in inferred_edges
        ])

        ic(df_rel)


        ## construct the _lemma graph_
        start_time = time.time()

        tg.construct_lemma_graph(
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: construct graph")


        ## rank the extracted phrases
        start_time = time.time()

        tg.calc_phrase_ranks(
            pr_alpha = textgraphs.PAGERANK_ALPHA,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: rank phrases")


        ## show the extracted phrase results
        ic(tg.get_phrases_as_df())

        if debug:  # pylint: disable=W0101
            for key, node in tg.nodes.items():
                print(key, node)

            for key, edge in tg.edges.items():
                print(key, edge)

    except Exception as ex:  # pylint: disable=W0718
        ic(ex)
        traceback.print_exc()


    ## transform graph data to a _graph of relations_
    start_time = time.time()

    gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
        tg,
    )

    gor.seeds(
        debug = False,  # True
    )

    gor.construct_gor(
        debug = False,  # True
    )

    _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
        debug = False,  # True
    )

    duration = round(time.time() - start_time, 3)
    print(f"{duration:7.3f} sec: graph of relations")

    gor.render_gor_plt(_scores)
    plt.show()

    #sys.exit(0)


    ######################################################################
    ## stack profiler report
    if profile:
        profiler.stop()
        profiler.print()

    ## output lemma graph as JSON
    with open("lemma.json", "w", encoding = "utf-8") as fp:
        fp.write(tg.dump_lemma_graph())