add initial files
- app.py +64 -0
- requirements.txt +2 -0
- weakly_supervised_parser/__init__.py +0 -0
- weakly_supervised_parser/inference.py +145 -0
- weakly_supervised_parser/model/__init__.py +0 -0
- weakly_supervised_parser/model/data_module_loader.py +79 -0
- weakly_supervised_parser/model/span_classifier.py +95 -0
- weakly_supervised_parser/model/trainer.py +128 -0
- weakly_supervised_parser/settings.py +33 -0
- weakly_supervised_parser/tree/__init__.py +0 -0
- weakly_supervised_parser/tree/evaluate.py +221 -0
- weakly_supervised_parser/tree/helpers.py +177 -0
- weakly_supervised_parser/utils/__init__.py +0 -0
- weakly_supervised_parser/utils/cky_algorithm.py +91 -0
- weakly_supervised_parser/utils/create_inside_outside_strings.py +40 -0
- weakly_supervised_parser/utils/distant_supervision.py +40 -0
- weakly_supervised_parser/utils/populate_chart.py +95 -0
- weakly_supervised_parser/utils/prepare_dataset.py +165 -0
app.py
ADDED
@@ -0,0 +1,64 @@
import gradio
import benepar
import spacy
import nltk

from huggingface_hub import hf_hub_url, cached_download

from weakly_supervised_parser.tree.evaluate import calculate_F1_for_spans, tree_to_spans
from weakly_supervised_parser.inference import Predictor
from weakly_supervised_parser.model.trainer import InsideOutsideStringClassifier

benepar.download('benepar_en3')

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})

inside_model = InsideOutsideStringClassifier(model_name_or_path="roberta-base", max_seq_length=256)
fetch_url_inside_model = hf_hub_url(repo_id="nickil/weakly-supervised-parsing", filename="inside_model.onnx", revision="main")
inside_model.load_model(pre_trained_model_path=cached_download(fetch_url_inside_model))

# outside_model = InsideOutsideStringClassifier(model_name_or_path="roberta-base", max_seq_length=64)
# outside_model.load_model(pre_trained_model_path=TRAINED_MODEL_PATH + "outside_model.onnx")

# inside_outside_model = InsideOutsideStringClassifier(model_name_or_path="roberta-base", max_seq_length=256)
# inside_outside_model.load_model(pre_trained_model_path=TRAINED_MODEL_PATH + "inside_outside_model.onnx")


def predict(sentence, model):
    gold_standard = list(nlp(sentence).sents)[0]._.parse_string
    if model == "inside":
        best_parse = Predictor(sentence=sentence).obtain_best_parse(predict_type="inside", model=inside_model, scale_axis=1, predict_batch_size=128)
    elif model == "outside":
        best_parse = Predictor(sentence=sentence).obtain_best_parse(predict_type="outside", model=outside_model, scale_axis=1, predict_batch_size=128)
    elif model == "inside-outside":
        best_parse = Predictor(sentence=sentence).obtain_best_parse(predict_type="inside_outside", model=inside_outside_model, scale_axis=1, predict_batch_size=128)
    sentence_f1 = calculate_F1_for_spans(tree_to_spans(gold_standard), tree_to_spans(best_parse))
    return gold_standard, best_parse, sentence_f1


iface = gradio.Interface(
    title="Co-training an Unsupervised Constituency Parser with Weak Supervision",
    description="Demo for the repository - [weakly-supervised-parsing](https://github.com/Nickil21/weakly-supervised-parsing) (ACL Findings 2022)",
    theme="default",
    article="""<h4 class='text-lg font-semibold my-2'>Note</h4>
    - We use a strong supervised parsing model `benepar_en3` which is based on T5-small to compute the gold parse.<br>
    - Sentence F1 score corresponds to the macro F1 score.
    """,
    allow_flagging="never",
    fn=predict,
    inputs=[
        gradio.inputs.Textbox(label="Sentence", placeholder="Enter a sentence in English"),
        gradio.inputs.Radio(["inside", "outside", "inside-outside"], default="inside", label="Choose Model"),
    ],
    outputs=[
        gradio.outputs.Textbox(label="Gold Parse Tree"),
        gradio.outputs.Textbox(label="Predicted Parse Tree"),
        gradio.outputs.Textbox(label="F1 score"),
    ],
    examples=[
        ["Russia 's war on Ukraine unsettles investors expecting carve-out deal uptick for 2022 .", "inside-outside"],
        ["Bitcoin community under pressure to cut energy use .", "inside"],
    ],
)
iface.launch(share=True)
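For reference, the `predict` function above is the callback the Gradio interface wires to the submit button. A minimal sketch of calling it directly (assuming the inside model loaded as above; the sentence is just one of the demo examples):

gold, parse, f1 = predict("Bitcoin community under pressure to cut energy use .", model="inside")
print(gold)   # benepar's bracketed parse, used as the gold standard
print(parse)  # best parse decoded from the inside model's span scores
print(f1)     # sentence-level (macro) F1 between the two span sets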
requirements.txt
ADDED
@@ -0,0 +1,2 @@
spacy==3.1.4
benepar==0.2.0
weakly_supervised_parser/__init__.py
ADDED
File without changes
weakly_supervised_parser/inference.py
ADDED
@@ -0,0 +1,145 @@
from argparse import ArgumentParser
from loguru import logger

from weakly_supervised_parser.settings import TRAINED_MODEL_PATH
from weakly_supervised_parser.utils.prepare_dataset import DataLoaderHelper
from weakly_supervised_parser.utils.populate_chart import PopulateCKYChart
from weakly_supervised_parser.tree.evaluate import calculate_F1_for_spans, tree_to_spans
from weakly_supervised_parser.model.trainer import InsideOutsideStringClassifier
from weakly_supervised_parser.settings import PTB_TEST_SENTENCES_WITHOUT_PUNCTUATION_PATH, PTB_TEST_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH


class Predictor:
    def __init__(self, sentence):
        self.sentence = sentence
        self.sentence_list = sentence.split()

    def obtain_best_parse(self, predict_type, model, scale_axis, predict_batch_size, return_df=False):
        unique_tokens_flag, span_scores, df = PopulateCKYChart(sentence=self.sentence).fill_chart(predict_type=predict_type,
                                                                                                  model=model,
                                                                                                  scale_axis=scale_axis,
                                                                                                  predict_batch_size=predict_batch_size)

        if unique_tokens_flag:
            best_parse = "(S " + " ".join(["(S " + item + ")" for item in self.sentence_list]) + ")"
            logger.info("BEST PARSE", best_parse)
        else:
            best_parse = PopulateCKYChart(sentence=self.sentence).best_parse_tree(span_scores)
        if return_df:
            return best_parse, df
        return best_parse


def process_test_sample(index, sentence, gold_file_path, predict_type, model, scale_axis, predict_batch_size, return_df=False):
    best_parse, df = Predictor(sentence=sentence).obtain_best_parse(predict_type=predict_type,
                                                                    model=model,
                                                                    scale_axis=scale_axis,
                                                                    predict_batch_size=predict_batch_size,
                                                                    return_df=True)
    gold_standard = DataLoaderHelper(input_file_object=gold_file_path)
    sentence_f1 = calculate_F1_for_spans(tree_to_spans(gold_standard[index]), tree_to_spans(best_parse))
    if sentence_f1 < 25.0:
        logger.warning(f"Index: {index} <> F1: {sentence_f1:.2f}")
    else:
        logger.info(f"Index: {index} <> F1: {sentence_f1:.2f}")
    if return_df:
        return best_parse, df
    else:
        return best_parse


def process_co_train_test_sample(index, sentence, gold_file_path, inside_model, outside_model, return_df=False):
    _, df_inside = PopulateCKYChart(sentence=sentence).compute_scores(predict_type="inside", model=inside_model, return_df=True)
    _, df_outside = PopulateCKYChart(sentence=sentence).compute_scores(predict_type="outside", model=outside_model, return_df=True)
    df = df_inside.copy()
    df["scores"] = df_inside["scores"] * df_outside["scores"]
    _, span_scores, df = PopulateCKYChart(sentence=sentence).fill_chart(data=df)
    best_parse = PopulateCKYChart(sentence=sentence).best_parse_tree(span_scores)
    gold_standard = DataLoaderHelper(input_file_object=gold_file_path)
    sentence_f1 = calculate_F1_for_spans(tree_to_spans(gold_standard[index]), tree_to_spans(best_parse))
    if sentence_f1 < 25.0:
        logger.warning(f"Index: {index} <> F1: {sentence_f1:.2f}")
    else:
        logger.info(f"Index: {index} <> F1: {sentence_f1:.2f}")
    return best_parse


def main():
    parser = ArgumentParser(description="Inference Pipeline for the Inside Outside String Classifier", add_help=True)

    group = parser.add_mutually_exclusive_group(required=True)

    group.add_argument("--use_inside", action="store_true", help="Whether to predict using inside model")

    group.add_argument("--use_inside_self_train", action="store_true", help="Whether to predict using inside model with self-training")

    group.add_argument("--use_outside", action="store_true", help="Whether to predict using outside model")

    group.add_argument("--use_inside_outside_co_train", action="store_true", help="Whether to predict using inside-outside model with co-training")

    parser.add_argument("--model_name_or_path", type=str, default="roberta-base", help="Path to the model identifier from huggingface.co/models")

    parser.add_argument("--save_path", type=str, required=True, help="Path to save the final trees")

    parser.add_argument("--scale_axis", choices=[None, 1], default=None, help="Whether to scale axis globally (None) or sequentially (1) across batches during softmax computation")

    parser.add_argument("--predict_batch_size", type=int, help="Batch size during inference")

    parser.add_argument(
        "--inside_max_seq_length", default=256, type=int, help="The maximum total input sequence length after tokenization for the inside model"
    )

    parser.add_argument(
        "--outside_max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization for the outside model"
    )

    args = parser.parse_args()

    if args.use_inside:
        pre_trained_model_path = TRAINED_MODEL_PATH + "inside_model.onnx"
        max_seq_length = args.inside_max_seq_length

    if args.use_inside_self_train:
        pre_trained_model_path = TRAINED_MODEL_PATH + "inside_model_self_trained.onnx"
        max_seq_length = args.inside_max_seq_length

    if args.use_outside:
        pre_trained_model_path = TRAINED_MODEL_PATH + "outside_model.onnx"
        max_seq_length = args.outside_max_seq_length

    if args.use_inside_outside_co_train:
        inside_pre_trained_model_path = "inside_model_co_trained.onnx"
        inside_model = InsideOutsideStringClassifier(model_name_or_path=args.model_name_or_path, max_seq_length=args.inside_max_seq_length)
        inside_model.load_model(pre_trained_model_path=inside_pre_trained_model_path)

        outside_pre_trained_model_path = "outside_model_co_trained.onnx"
        outside_model = InsideOutsideStringClassifier(model_name_or_path=args.model_name_or_path, max_seq_length=args.outside_max_seq_length)
        outside_model.load_model(pre_trained_model_path=outside_pre_trained_model_path)
    else:
        model = InsideOutsideStringClassifier(model_name_or_path=args.model_name_or_path, max_seq_length=max_seq_length)
        model.load_model(pre_trained_model_path=pre_trained_model_path)

    if args.use_inside or args.use_inside_self_train:
        predict_type = "inside"

    if args.use_outside:
        predict_type = "outside"

    with open(args.save_path, "w") as out_file:
        print(type(args.scale_axis))
        test_sentences = DataLoaderHelper(input_file_object=PTB_TEST_SENTENCES_WITHOUT_PUNCTUATION_PATH).read_lines()
        test_gold_file_path = PTB_TEST_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH
        for test_index, test_sentence in enumerate(test_sentences):
            if args.use_inside_outside_co_train:
                best_parse = process_co_train_test_sample(
                    test_index, test_sentence, test_gold_file_path, inside_model=inside_model, outside_model=outside_model
                )
            else:
                best_parse = process_test_sample(test_index, test_sentence, test_gold_file_path, predict_type=predict_type, model=model,
                                                 scale_axis=args.scale_axis, predict_batch_size=args.predict_batch_size)

            out_file.write(best_parse + "\n")


if __name__ == "__main__":
    main()
weakly_supervised_parser/model/__init__.py
ADDED
File without changes
weakly_supervised_parser/model/data_module_loader.py
ADDED
@@ -0,0 +1,79 @@
import pandas as pd

from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningDataModule


class PyTorchDataModule(Dataset):
    """PyTorch Dataset class"""

    def __init__(self, model_name_or_path: str, data: pd.DataFrame, max_seq_length: int = 256):
        """
        Initiates a PyTorch Dataset Module for input data
        """
        self.model_name_or_path = model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)
        self.data = data
        self.max_seq_length = max_seq_length

    def __len__(self):
        """returns length of data"""
        return len(self.data)

    def __getitem__(self, index: int):
        """returns dictionary of input tensors to feed into the model"""

        data_row = self.data.iloc[index]
        sentence = data_row["sentence"]

        sentence_encoding = self.tokenizer(
            sentence,
            max_length=self.max_seq_length,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        out = dict(
            sentence=sentence,
            input_ids=sentence_encoding["input_ids"].flatten(),
            attention_mask=sentence_encoding["attention_mask"].flatten(),
            labels=data_row["label"].flatten(),
        )

        return out


class DataModule(LightningDataModule):
    def __init__(
        self,
        model_name_or_path: str,
        train_df: pd.DataFrame,
        eval_df: pd.DataFrame,
        max_seq_length: int = 256,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        num_workers: int = 16,
        **kwargs
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.train_df = train_df
        self.eval_df = eval_df
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):

        self.train_dataset = PyTorchDataModule(self.model_name_or_path, self.train_df, self.max_seq_length)
        self.eval_dataset = PyTorchDataModule(self.model_name_or_path, self.eval_df, self.max_seq_length)

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.train_batch_size, shuffle=False, num_workers=self.num_workers, pin_memory=True)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.eval_dataset, batch_size=self.eval_batch_size, shuffle=False, num_workers=self.num_workers, pin_memory=True)
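A minimal sketch of how `DataModule` expects to be fed: the column names `sentence` and `label` come from `PyTorchDataModule.__getitem__`, while the toy rows, `num_workers=0`, and the small batch sizes are illustrative only.

import pandas as pd

train_df = pd.DataFrame({"sentence": ["the big dog", "dog the big"], "label": [1, 0]})
eval_df = pd.DataFrame({"sentence": ["a small cat"], "label": [1]})

dm = DataModule(model_name_or_path="roberta-base", train_df=train_df, eval_df=eval_df,
                max_seq_length=32, train_batch_size=2, eval_batch_size=1, num_workers=0)
dm.setup()
batch = next(iter(dm.train_dataloader()))
print(batch["input_ids"].shape)  # (train_batch_size, max_seq_length)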
weakly_supervised_parser/model/span_classifier.py
ADDED
@@ -0,0 +1,95 @@
import torch
import torchmetrics
from torch.optim import AdamW
from pytorch_lightning import LightningModule
from transformers import AutoConfig, AutoModelForSequenceClassification, get_linear_schedule_with_warmup


class LightningModel(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int = 2,
        lr: float = 5e-6,
        train_batch_size: int = 32,
        adam_epsilon=1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        **kwargs
    ):
        super().__init__()

        self.save_hyperparameters()

        self.num_labels = num_labels
        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        self.model.gradient_checkpointing_enable()
        self.lr = lr
        self.train_batch_size = train_batch_size
        self.accuracy = torchmetrics.Accuracy()
        self.f1score = torchmetrics.F1Score(num_classes=2)
        self.mcc = torchmetrics.MatthewsCorrCoef(num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
        loss = outputs[0]
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
        val_loss, logits = outputs[:2]
        preds = torch.argmax(logits, axis=1)
        labels = batch["labels"]
        return {"loss": val_loss, "preds": preds, "labels": labels}

    def validation_epoch_end(self, outputs):
        preds = torch.cat([x["preds"] for x in outputs])
        labels = torch.cat([x["labels"] for x in outputs])
        loss = torch.stack([x["loss"] for x in outputs]).mean()

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_accuracy", self.accuracy(preds, labels.squeeze()), prog_bar=True)
        self.log("val_f1", self.f1score(preds, labels.squeeze()), prog_bar=True)
        self.log("val_mcc", self.mcc(preds, labels.squeeze()), prog_bar=True)
        return loss

    def setup(self, stage=None):
        # Get dataloader by calling it - train_dataloader() is called after setup() by default
        train_loader = self.trainer.datamodule.train_dataloader()

        # Calculate total steps
        tb_size = self.train_batch_size * max(1, self.trainer.gpus)
        ab_size = tb_size * self.trainer.accumulate_grad_batches
        self.total_steps = int((len(train_loader.dataset) / ab_size) * float(self.trainer.max_epochs))

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.lr,
            eps=self.hparams.adam_epsilon,
        )

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]
weakly_supervised_parser/model/trainer.py
ADDED
@@ -0,0 +1,128 @@
import os
import torch
import datasets
import numpy as np
import pandas as pd

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from transformers import AutoTokenizer, logging

from onnxruntime import InferenceSession
from scipy.special import softmax

from weakly_supervised_parser.model.data_module_loader import DataModule
from weakly_supervised_parser.model.span_classifier import LightningModel


# Disable model checkpoint warnings
logging.set_verbosity_error()


class InsideOutsideStringClassifier:
    def __init__(self, model_name_or_path: str, num_labels: int = 2, max_seq_length: int = 256):

        self.model_name_or_path = model_name_or_path
        self.num_labels = num_labels
        self.max_seq_length = max_seq_length

    def fit(
        self,
        train_df: pd.DataFrame,
        eval_df: pd.DataFrame,
        outputdir: str,
        filename: str,
        devices: int = 1,
        enable_progress_bar: bool = True,
        enable_model_summary: bool = False,
        enable_checkpointing: bool = False,
        logger: bool = False,
        accelerator: str = "auto",
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        learning_rate: float = 5e-6,
        max_epochs: int = 10,
        dataloader_num_workers: int = 16,
        seed: int = 42,
    ):

        data_module = DataModule(
            model_name_or_path=self.model_name_or_path,
            train_df=train_df,
            eval_df=eval_df,
            max_seq_length=self.max_seq_length,
            train_batch_size=train_batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=dataloader_num_workers,
        )

        model = LightningModel(
            model_name_or_path=self.model_name_or_path,
            lr=learning_rate,
            num_labels=self.num_labels,
            train_batch_size=train_batch_size,
            eval_batch_size=eval_batch_size,
        )

        seed_everything(seed, workers=True)

        callbacks = []
        callbacks.append(EarlyStopping(monitor="val_loss", patience=2, mode="min", check_finite=True))
        # callbacks.append(ModelCheckpoint(monitor="val_loss", dirpath=outputdir, filename=filename, save_top_k=1, save_weights_only=True, mode="min"))

        trainer = Trainer(
            accelerator=accelerator,
            devices=devices,
            max_epochs=max_epochs,
            callbacks=callbacks,
            enable_progress_bar=enable_progress_bar,
            enable_model_summary=enable_model_summary,
            enable_checkpointing=enable_checkpointing,
            logger=logger,
        )
        trainer.fit(model, data_module)
        trainer.validate(model, data_module.val_dataloader())

        train_batch = next(iter(data_module.train_dataloader()))

        model.to_onnx(
            file_path=f"{outputdir}/{filename}.onnx",
            input_sample=(train_batch["input_ids"].cuda(), train_batch["attention_mask"].cuda()),
            export_params=True,
            opset_version=11,
            input_names=["input", "attention_mask"],
            output_names=["output"],
            dynamic_axes={"input": {0: "batch_size"}, "attention_mask": {0: "batch_size"}, "output": {0: "batch_size"}},
        )

    def load_model(self, pre_trained_model_path):
        self.model = InferenceSession(pre_trained_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def preprocess_function(self, data):
        features = self.tokenizer(
            data["sentence"], max_length=self.max_seq_length, padding="max_length", add_special_tokens=True, truncation=True, return_tensors="np"
        )
        return features

    def process_spans(self, spans, scale_axis):
        spans_dataset = datasets.Dataset.from_pandas(spans)
        processed = spans_dataset.map(self.preprocess_function, batched=True, batch_size=None)
        inputs = {"input": processed["input_ids"], "attention_mask": processed["attention_mask"]}
        with torch.no_grad():
            return softmax(self.model.run(None, inputs)[0], axis=scale_axis)

    def predict_proba(self, spans, scale_axis, predict_batch_size):
        if spans.shape[0] > predict_batch_size:
            output = []
            span_batches = np.array_split(spans, spans.shape[0] // predict_batch_size)
            for span_batch in span_batches:
                output.extend(self.process_spans(span_batch, scale_axis))
            return np.vstack(output)
        else:
            return self.process_spans(spans, scale_axis)

    def predict(self, spans):
        return self.predict_proba(spans).argmax(axis=1)
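A rough usage sketch of this wrapper (the paths, the training DataFrames, and `spans_df` are placeholders rather than files shipped with this commit): `fit` trains the Lightning model and exports it to ONNX, `load_model` opens an exported file with ONNX Runtime, and `predict_proba` expects a DataFrame with a `sentence` column.

import pandas as pd

clf = InsideOutsideStringClassifier(model_name_or_path="roberta-base", max_seq_length=256)
# clf.fit(train_df=train_df, eval_df=eval_df, outputdir="TEMP", filename="inside_model")  # would write TEMP/inside_model.onnx
clf.load_model(pre_trained_model_path="TEMP/inside_model.onnx")  # placeholder path
spans_df = pd.DataFrame({"sentence": ["the big dog", "dog the big"]})  # illustrative span strings
probs = clf.predict_proba(spans=spans_df, scale_axis=1, predict_batch_size=128)
constituent_probability = probs[:, 1]  # column 1 is the score the rest of the repo treats as "is a constituent"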
weakly_supervised_parser/settings.py
ADDED
@@ -0,0 +1,33 @@
PROJECT_DIR = "weakly_supervised_parser/"
PTB_TREES_ROOT_DIR = "data/PROCESSED/english/trees/"
PTB_SENTENCES_ROOT_DIR = "data/PROCESSED/english/sentences/"

PTB_TRAIN_SENTENCES_WITH_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-train-sentences-with-punctuation.txt"
PTB_VALID_SENTENCES_WITH_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-valid-sentences-with-punctuation.txt"
PTB_TEST_SENTENCES_WITH_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-test-sentences-with-punctuation.txt"

PTB_TRAIN_SENTENCES_WITHOUT_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-train-sentences-without-punctuation.txt"
PTB_VALID_SENTENCES_WITHOUT_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-valid-sentences-without-punctuation.txt"
PTB_TEST_SENTENCES_WITHOUT_PUNCTUATION_PATH = PTB_SENTENCES_ROOT_DIR + "ptb-test-sentences-without-punctuation.txt"

PTB_TRAIN_GOLD_WITH_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-train-gold-with-punctuation.txt"
PTB_VALID_GOLD_WITH_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-valid-gold-with-punctuation.txt"
PTB_TEST_GOLD_WITH_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-test-gold-with-punctuation.txt"

PTB_TRAIN_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-train-gold-without-punctuation.txt"
PTB_VALID_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-valid-gold-without-punctuation.txt"
PTB_TEST_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "ptb-test-gold-without-punctuation.txt"

PTB_TRAIN_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH = PTB_TREES_ROOT_DIR + "ptb-train-gold-without-punctuation-aligned.txt"
PTB_VALID_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH = PTB_TREES_ROOT_DIR + "ptb-valid-gold-without-punctuation-aligned.txt"
PTB_TEST_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH = PTB_TREES_ROOT_DIR + "ptb-test-gold-without-punctuation-aligned.txt"

YOON_KIM_TRAIN_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "Yoon_Kim/ptb-train-gold-filtered.txt"
YOON_KIM_VALID_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "Yoon_Kim/ptb-valid-gold-filtered.txt"
YOON_KIM_TEST_GOLD_WITHOUT_PUNCTUATION_PATH = PTB_TREES_ROOT_DIR + "Yoon_Kim/ptb-test-gold-filtered.txt"

# Predictions
PTB_SAVE_TREES_PATH = "TEMP/predictions/english/"

# Training
TRAINED_MODEL_PATH = PROJECT_DIR + "/model/TRAINED_MODEL/"
weakly_supervised_parser/tree/__init__.py
ADDED
File without changes
weakly_supervised_parser/tree/evaluate.py
ADDED
@@ -0,0 +1,221 @@
import argparse
import collections
import os
import subprocess

import nltk


def tree_to_spans(tree, keep_labels=False, keep_leaves=False, keep_whole_span=False):
    if isinstance(tree, str):
        tree = nltk.Tree.fromstring(tree)

    length = len(tree.pos())
    queue = collections.deque(tree.treepositions())
    stack = [(queue.popleft(), 0)]
    j = 0
    spans = []
    while stack != []:
        (p, i) = stack[-1]
        if not queue or queue[0][:-1] != p:
            if isinstance(tree[p], nltk.tree.Tree):
                if j - i > 1:
                    spans.append((tree[p].label(), (i, j)))
            else:
                j = i + 1
            stack.pop()
        else:
            q = queue.popleft()
            stack.append((q, j))
    if not keep_whole_span:
        spans = [span for span in spans if span[1] != (0, length)]
    if not keep_labels:
        spans = [span[1] for span in spans]
    return spans


def test_tree_to_spans():
    assert [(0, 2), (0, 3), (0, 4)] == tree_to_spans("(S (S (S (S (S 1) (S 2)) (S 3)) (S 4)) (S 5))", keep_labels=False)
    assert [] == tree_to_spans("(S 1)", keep_labels=False)
    assert [] == tree_to_spans("(S (S 1) (S 2))", keep_labels=False)
    assert [(1, 3)] == tree_to_spans("(S (S 1) (S (S 2) (S 3)))", keep_labels=False)
    assert [("S", (1, 3))] == tree_to_spans("(S (S 1) (S (S 2) (S 3)))", keep_labels=True)


def get_F1_score_intermediates(gold_spans, pred_spans):
    """Get intermediate results for calculating the F1 score"""
    n_true_positives = 0
    gold_span_counter = collections.Counter(gold_spans)
    pred_span_counter = collections.Counter(pred_spans)
    unique_spans = set(gold_spans + pred_spans)
    for span in unique_spans:
        n_true_positives += min(gold_span_counter[span], pred_span_counter[span])
    return n_true_positives, len(gold_spans), len(pred_spans)


def calculate_F1_score_from_intermediates(n_true_positives, n_golds, n_predictions, precision_recall_f_score=False):
    """Calculate F1 score"""
    if precision_recall_f_score:
        zeros = (0, 0, 0)
    else:
        zeros = 0
    if n_golds == 0:
        return 100 if n_predictions == 0 else zeros
    if n_true_positives == 0 or n_predictions == 0:
        return zeros
    recall = n_true_positives / n_golds
    precision = n_true_positives / n_predictions
    F1 = 2 * precision * recall / (precision + recall)
    if precision_recall_f_score:
        return precision, recall, F1 * 100
    return F1 * 100


def calculate_F1_for_spans(gold_spans, pred_spans, precision_recall_f_score=False):
    # CHANGE THIS LATER
    # gold_spans = list(set(gold_spans))
    ###################################
    tp, n_gold, n_pred = get_F1_score_intermediates(gold_spans, pred_spans)
    if precision_recall_f_score:
        p, r, F1 = calculate_F1_score_from_intermediates(tp, len(gold_spans), len(pred_spans), precision_recall_f_score=precision_recall_f_score)
        return p, r, F1
    F1 = calculate_F1_score_from_intermediates(tp, len(gold_spans), len(pred_spans))
    return F1


def test_calculate_F1_for_spans():
    pred = [(0, 1)]
    gold = [(0, 1)]
    assert calculate_F1_for_spans(gold, pred) == 100
    pred = [(0, 0)]
    gold = [(0, 1)]
    assert calculate_F1_for_spans(gold, pred) == 0
    pred = [(0, 0), (0, 1)]
    gold = [(0, 1), (1, 1)]
    assert calculate_F1_for_spans(gold, pred) == 50
    pred = [(0, 0), (0, 0)]
    gold = [(0, 0), (0, 0), (0, 1)]
    assert calculate_F1_for_spans(gold, pred) == 80
    pred = [(0, 0), (1, 0)]
    gold = [(0, 0), (0, 0), (0, 1)]
    assert calculate_F1_for_spans(gold, pred) == 40


def read_lines_from_file(filepath, len_limit):
    with open(filepath, "r") as f:
        for line in f:
            tree = nltk.Tree.fromstring(line)
            if len_limit is not None and len(tree.pos()) > len_limit:
                continue
            yield line.strip()


def read_spans_from_file(filepath, len_limit):
    for line in read_lines_from_file(filepath, len_limit):
        yield tree_to_spans(line, keep_labels=False, keep_leaves=False, keep_whole_span=False)


def calculate_corpus_level_F1_for_spans(gold_list, pred_list):
    n_true_positives = 0
    n_golds = 0
    n_predictions = 0
    for gold_spans, pred_spans in zip(gold_list, pred_list):
        n_tp, n_g, n_p = get_F1_score_intermediates(gold_spans, pred_spans)
        n_true_positives += n_tp
        n_golds += n_g
        n_predictions += n_p
    F1 = calculate_F1_score_from_intermediates(n_true_positives, n_golds, n_predictions)
    return F1


def calculate_sentence_level_F1_for_spans(gold_list, pred_list):
    f1_scores = []
    for gold_spans, pred_spans in zip(gold_list, pred_list):
        f1 = calculate_F1_for_spans(gold_spans, pred_spans)
        f1_scores.append(f1)
    F1 = sum(f1_scores) / len(f1_scores)
    return F1


def parse_evalb_results_from_file(filepath):
    i_th_score = 0
    score_of_all_length = None
    score_of_length_10 = None
    prefix_of_the_score_line = "Bracketing FMeasure ="

    with open(filepath, "r") as f:
        for line in f:
            if line.startswith(prefix_of_the_score_line):
                i_th_score += 1
                if i_th_score == 1:
                    score_of_all_length = float(line.split()[-1])
                elif i_th_score == 2:
                    score_of_length_10 = float(line.split()[-1])
                else:
                    raise ValueError("Too many lines for F score")
    return score_of_all_length, score_of_length_10


def execute_evalb(gold_file, pred_file, out_file, len_limit):
    EVALB_PATH = "model/EVALB/"
    subprocess.run("{} -p {} {} {} > {}".format(EVALB_PATH + "/evalb", EVALB_PATH + "unlabelled.prm", gold_file, pred_file, out_file), shell=True)


def calculate_evalb_F1_for_file(gold_file, pred_file, len_limit):
    evalb_out_file = pred_file + ".evalb_out"
    execute_evalb(gold_file, pred_file, evalb_out_file, len_limit)
    F1_len_all, F1_len_10 = parse_evalb_results_from_file(evalb_out_file)
    if len_limit is None:
        return F1_len_all
    elif len_limit == 10:
        return F1_len_10
    else:
        raise ValueError(f"Unexpected len_limit: {len_limit}")


def calculate_sentence_level_F1_for_file(gold_file, pred_file, len_limit):
    gold_list = list(read_spans_from_file(gold_file, len_limit))
    pred_list = list(read_spans_from_file(pred_file, len_limit))
    F1 = calculate_sentence_level_F1_for_spans(gold_list, pred_list)
    return F1


def calculate_corpus_level_F1_for_file(gold_file, pred_file, len_limit):
    gold_list = list(read_spans_from_file(gold_file, len_limit))
    pred_list = list(read_spans_from_file(pred_file, len_limit))
    F1 = calculate_corpus_level_F1_for_spans(gold_list, pred_list)
    return F1


def evaluate_prediction_file(gold_file, pred_file, len_limit):
    corpus_F1 = calculate_corpus_level_F1_for_file(gold_file, pred_file, len_limit)
    sentence_F1 = calculate_sentence_level_F1_for_file(gold_file, pred_file, len_limit)
    # evalb_F1 = calculate_evalb_F1_for_file(gold_file, pred_file, len_limit)

    print("=====> Evaluation Results <=====")
    print(f"Length constraint: f{len_limit}")
    print(f"Micro F1: {corpus_F1:.2f}, Macro F1: {sentence_F1:.2f}")  # , evalb_F1))
    print("=====> Evaluation Results <=====")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gold_file", "-g", help="path to gold file")
    parser.add_argument("--pred_file", "-p", help="path to prediction file")
    parser.add_argument(
        "--len_limit", default=None, type=int, choices=(None, 10, 20, 30, 40, 50, 100), help="length constraint for evaluation, 10 or None"
    )
    args = parser.parse_args()

    return args


def main():
    args = parse_args()
    evaluate_prediction_file(args.gold_file, args.pred_file, args.len_limit)


if __name__ == "__main__":
    main()

# python helper/evaluate.py -g TEMP/preprocessed_dev.txt -p TEMP/pred_dev_m_None.txt
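To make the two aggregate metrics concrete, here is a small traced example (the spans are illustrative): the corpus-level score pools true positives, gold counts, and prediction counts before computing one F1, while the sentence-level score averages the per-sentence F1 values.

gold_list = [[(0, 2)], [(0, 2), (1, 3)]]
pred_list = [[(0, 2)], [(2, 4), (1, 3)]]

print(calculate_corpus_level_F1_for_spans(gold_list, pred_list))    # 2 TP / 3 gold / 3 pred -> ~66.67
print(calculate_sentence_level_F1_for_spans(gold_list, pred_list))  # mean of 100 and 50 -> 75.0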
weakly_supervised_parser/tree/helpers.py
ADDED
@@ -0,0 +1,177 @@
import nltk
from collections import Counter
from weakly_supervised_parser.tree.evaluate import tree_to_spans


class Tree(object):
    def __init__(self, label, children, word):
        self.label = label
        self.children = children
        self.word = word

    def __str__(self):
        return self.linearize()

    def linearize(self):
        if not self.children:
            return f"({self.label} {self.word})"
        return f"({self.label} {' '.join(c.linearize() for c in self.children)})"

    def spans(self, start=0):
        if not self.children:
            return [(start, start + 1)]
        span_list = []
        position = start
        for c in self.children:
            cspans = c.spans(start=position)
            span_list.extend(cspans)
            position = cspans[0][1]
        return [(start, position)] + span_list

    def spans_labels(self, start=0):
        if not self.children:
            return [(start, start + 1, self.label)]
        span_list = []
        position = start
        for c in self.children:
            cspans = c.spans_labels(start=position)
            span_list.extend(cspans)
            position = cspans[0][1]
        return [(start, position, self.label)] + span_list


def extract_sentence(sentence):
    t = nltk.Tree.fromstring(sentence)
    return " ".join(item[0] for item in t.pos())


def get_constituents(sample_string, want_spans_mapping=False, whole_sentence=True, labels=False):
    t = nltk.Tree.fromstring(sample_string)
    if want_spans_mapping:
        spans = tree_to_spans(t, keep_labels=True)
        return dict(Counter(item[1] for item in spans))
    spans = tree_to_spans(t, keep_labels=True)
    sentence = extract_sentence(sample_string).split()

    labeled_consituents_lst = []
    constituents = []
    for span in spans:
        labeled_consituents = {}
        labeled_consituents["labels"] = span[0]
        i, j = span[1][0], span[1][1]
        constituents.append(" ".join(sentence[i:j]))
        labeled_consituents["constituent"] = " ".join(sentence[i:j])
        labeled_consituents_lst.append(labeled_consituents)

    # Add original sentence
    if whole_sentence:
        constituents = constituents + [" ".join(sentence)]

    if labels:
        return labeled_consituents_lst

    return constituents


def get_distituents(sample_string):
    sentence = extract_sentence(sample_string).split()

    def get_all_combinations(sentence):
        L = sentence.split()
        N = len(L)
        out = []
        for n in range(2, N):
            for i in range(N - n + 1):
                out.append((i, i + n))
        return out

    combinations = get_all_combinations(extract_sentence(sample_string))
    constituents = list(get_constituents(sample_string, want_spans_mapping=True).keys())
    spans = [item for item in combinations if item not in constituents]
    distituents = []
    for span in spans:
        i, j = span[0], span[1]
        distituents.append(" ".join(sentence[i:j]))
    return distituents


def get_leaves(tree):
    if not tree.children:
        return [tree]
    leaves = []
    for c in tree.children:
        leaves.extend(get_leaves(c))
    return leaves


def unlinearize(string):
    """
    (TOP (S (NP (PRP He)) (VP (VBD was) (ADJP (JJ right))) (. .)))
    """
    tokens = string.replace("(", " ( ").replace(")", " ) ").split()

    def read_tree(start):
        if tokens[start + 2] != "(":
            return Tree(tokens[start + 1], None, tokens[start + 2]), start + 4
        i = start + 2
        children = []
        while tokens[i] != ")":
            tree, i = read_tree(i)
            children.append(tree)
        return Tree(tokens[start + 1], children, None), i + 1

    tree, _ = read_tree(0)
    return tree


def recall_by_label(gold_standard, best_parse):
    correct = {}
    total = {}
    for tree1, tree2 in zip(gold_standard, best_parse):
        try:
            leaves1, leaves2 = get_leaves(tree1["tree"]), get_leaves(tree2["tree"])
            for l1, l2 in zip(leaves1, leaves2):
                assert l1.word.lower() == l2.word.lower(), f"{l1.word} =/= {l2.word}"
            spanlabels = tree1["tree"].spans_labels()
            spans = tree2["tree"].spans()

            for (i, j, label) in spanlabels:
                if j - i != 1:
                    if label not in correct:
                        correct[label] = 0
                        total[label] = 0
                    if (i, j) in spans:
                        correct[label] += 1
                    total[label] += 1
        except Exception as e:
            print(e)
    acc = {}
    for label in total.keys():
        acc[label] = correct[label] / total[label]
    return acc


def label_recall_output(gold_standard, best_parse):
    best_parse_trees = []
    gold_standard_trees = []
    for t1, t2 in zip(gold_standard, best_parse):
        gold_standard_trees.append({"tree": unlinearize(t1)})
        best_parse_trees.append({"tree": unlinearize(t2)})

    dct = recall_by_label(gold_standard=gold_standard_trees, best_parse=best_parse_trees)
    labels = ["SBAR", "NP", "VP", "PP", "ADJP", "ADVP"]
    l = [{label: f"{recall * 100:.2f}"} for label, recall in dct.items() if label in labels]
    df = pd.DataFrame([item.values() for item in l], index=[item.keys() for item in l], columns=["recall"])
    df.index = df.index.map(lambda x: list(x)[0])
    df_out = df.reindex(labels)
    return df_out


if __name__ == "__main__":
    import pandas as pd
    from weakly_supervised_parser.utils.prepare_dataset import PTBDataset
    from weakly_supervised_parser.settings import PTB_TEST_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH, PTB_SAVE_TREES_PATH

    best_parse = PTBDataset(PTB_SAVE_TREES_PATH + "inside_model_predictions.txt").retrieve_all_sentences()
    gold_standard = PTBDataset(PTB_TEST_GOLD_WITHOUT_PUNCTUATION_ALIGNED_PATH).retrieve_all_sentences()
    print(label_recall_output(gold_standard, best_parse))
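A small traced example of `get_constituents` (the bracketing is illustrative): single-word spans and the whole-sentence span are dropped by `tree_to_spans`, and the full sentence is appended back because `whole_sentence=True` by default.

tree = "(S (NP (DT The) (NN cat)) (VP (VBD sat)))"
print(get_constituents(tree))
# ['The cat', 'The cat sat']
print(get_constituents(tree, labels=True))
# [{'labels': 'NP', 'constituent': 'The cat'}]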
weakly_supervised_parser/utils/__init__.py
ADDED
File without changes
weakly_supervised_parser/utils/cky_algorithm.py
ADDED
@@ -0,0 +1,91 @@
import re
import numpy as np
from weakly_supervised_parser.tree.helpers import Tree


def CKY(sent_all, prob_s, label_s, verbose=False):
    r"""
    choose tree with maximum expected number of constituents,
    or max \sum_{(i,j) \in tree} p((i,j) is constituent)
    """

    def backpt_to_tree(sent, backpt, label_table):
        def to_tree(i, j):
            if j - i == 1:
                return Tree(sent[i], None, sent[i])
            else:
                k = backpt[i][j]
                return Tree(label_table[i][j], [to_tree(i, k), to_tree(k, j)], None)

        return to_tree(0, len(sent))

    def to_table(value_s, i_s, j_s):
        table = [[None for _ in range(np.max(j_s) + 1)] for _ in range(np.max(i_s) + 1)]
        for value, i, j in zip(value_s, i_s, j_s):
            table[i][j] = value
        return table

    # produce list of spans to pass to is_constituent, while keeping track of which sentence
    sent_s, i_s, j_s = [], [], []
    idx_all = []
    for sent in sent_all:
        start = len(sent_s)
        for i in range(len(sent)):
            for j in range(i + 1, len(sent) + 1):
                sent_s.append(sent)
                i_s.append(i)
                j_s.append(j)
        idx_all.append((start, len(sent_s)))

    # feed spans to is_constituent
    # prob_s, label_s = self.is_constituent(sent_s, i_s, j_s, verbose = verbose)

    # given span probs, perform CKY to get best tree for each sentence.
    tree_all, prob_all = [], []
    for sent, idx in zip(sent_all, idx_all):
        # first, use tables to keep track of things
        k, l = idx
        prob, label = prob_s[k:l], label_s[k:l]
        i, j = i_s[k:l], j_s[k:l]

        prob_table = to_table(prob, i, j)
        label_table = to_table(label, i, j)

        # perform cky using scores and backpointers
        score_table = [[None for _ in range(len(sent) + 1)] for _ in range(len(sent))]
        backpt_table = [[None for _ in range(len(sent) + 1)] for _ in range(len(sent))]
        for i in range(len(sent)):  # base case: single words
            score_table[i][i + 1] = 1
        for j in range(2, len(sent) + 1):
            for i in range(j - 2, -1, -1):
                best, argmax = -np.inf, None
                for k in range(i + 1, j):  # find splitpoint
                    score = score_table[i][k] + score_table[k][j]
                    if score > best:
                        best, argmax = score, k
                score_table[i][j] = best + prob_table[i][j]
                backpt_table[i][j] = argmax

        tree = backpt_to_tree(sent, backpt_table, label_table)
        tree_all.append(tree)
        prob_all.append(prob_table)

    return tree_all, prob_all


def get_best_parse(sentence, spans):
    flattened_scores = []
    for i in range(spans.shape[0]):
        for j in range(spans.shape[1]):
            if i > j:
                continue
            else:
                flattened_scores.append(spans[i, j])
    prob_s, label_s = flattened_scores, ["S"] * len(flattened_scores)
    # print(prob_s, label_s)
    trees, _ = CKY(sent_all=sentence, prob_s=prob_s, label_s=label_s)
    s = str(trees[0])
    # Replace previous occurrence of string
    out = re.sub(r"(?<![^\s()])([^\s()]+)(?=\s+\1(?![^\s()]))", "S", s)
    # best_parse = "(ROOT " + out + ")"
    return out  # best_parse
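Written out, the objective in the `CKY` docstring is to pick, among all binary trees $T$ over the sentence, $T^{*} = \arg\max_{T} \sum_{(i,j) \in T} p\big((i,j)\ \text{is a constituent}\big)$. The dynamic program above solves this with the recurrence $\text{score}(i, j) = p(i, j) + \max_{i < k < j} \big[\text{score}(i, k) + \text{score}(k, j)\big]$ and the base case $\text{score}(i, i+1) = 1$, which correspond to the split-point loop and to `score_table[i][i + 1] = 1` in the code.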
weakly_supervised_parser/utils/create_inside_outside_strings.py
ADDED
@@ -0,0 +1,40 @@
class InsideOutside:
    def __init__(self, sentence):
        self.sentence = sentence.split()
        self.sentence_length = len(self.sentence)

    def calculate_inside(self, idx_start, idx_end):
        # get inside string
        return self.sentence[idx_start:idx_end]

    def calculate_outside(self, idx_start, idx_end):
        # get outside string
        if idx_start == 0 and idx_end == self.sentence_length:
            left_outside = ["<s>"]  # bos_token roberta # ["[UNK]"]
            right_outside = ["</s>"]  # eos_token roberta # ["[UNK]"]
        elif idx_start == 0:
            left_outside = ["<s>"]  # ["[UNK]"]
            right_outside = self.sentence[idx_end:]
        elif idx_end == self.sentence_length:
            left_outside = self.sentence[:idx_start]
            right_outside = ["</s>"]  # ["[UNK]"]
        else:
            left_outside = self.sentence[:idx_start]
            right_outside = self.sentence[idx_end:]
        return left_outside, right_outside

    def create_inside_outside_matrix(self, ngram):
        i, j = ngram[0][0], ngram[0][-1]
        inside_string = self.calculate_inside(i, j)
        outside_string = self.calculate_outside(i, j)
        output_dict = {
            "span": ngram[0],
            "inside_string": " ".join(inside_string),
            "left_outside_string": " ".join(outside_string[0]),
            "right_outside_string": " ".join(outside_string[-1]),
        }
        inside_string_template = output_dict["inside_string"]
        outside_string_template = (
            output_dict["left_outside_string"].split()[-1] + " " + "<mask>" + " " + output_dict["right_outside_string"].split()[0]
        )
        return output_dict, inside_string_template, outside_string_template
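A traced example of the inside/outside decomposition (the sentence and span are illustrative; `ngram[0]` is the `(start, end)` span exactly as the method above indexes it):

io = InsideOutside(sentence="the cat sat on the mat")
output_dict, inside_template, outside_template = io.create_inside_outside_matrix(((1, 3),))
print(inside_template)   # "cat sat"           (the span itself)
print(outside_template)  # "the <mask> on"     (one context word on each side around a mask)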
weakly_supervised_parser/utils/distant_supervision.py
ADDED
@@ -0,0 +1,40 @@
from collections import defaultdict, Counter
from nltk.corpus import stopwords


class RuleBasedHeuristic:
    def __init__(self, sentence=None, corpus=None):
        self.sentence = sentence
        self.corpus = corpus

    def add_contiguous_titlecase_words(self, row):
        matches = []
        dd = defaultdict(list)
        count = 0
        for i, j in zip(row, row[1:]):
            if j[0] - i[0] == 1:
                dd[count].append(i[-1] + " " + j[-1])
            else:
                count += 1
        for key, value in dd.items():
            if len(value) > 1:
                out = value[0]
                inter = ""
                for item in value[1:]:
                    inter += " " + item.split()[-1]
                matches.append(out + inter)
            else:
                matches.extend(value)
        return matches

    def augment_using_most_frequent_starting_token(self, N=1):
        first_token = []
        for sentence in self.corpus:
            first_token.append(sentence.split()[0])
        return Counter(first_token).most_common(N)

    def get_top_tokens(self, top_most_common_ptb=None):
        out = set(stopwords.words("english"))
        if top_most_common_ptb:
            out.update([token for token, counts in self.augment_using_most_frequent_starting_token(N=top_most_common_ptb)])
        return out
|
weakly_supervised_parser/utils/populate_chart.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import numpy as np

from datasets.utils import set_progress_bar_enabled

from weakly_supervised_parser.utils.prepare_dataset import NGramify
from weakly_supervised_parser.utils.create_inside_outside_strings import InsideOutside
from weakly_supervised_parser.utils.cky_algorithm import get_best_parse
from weakly_supervised_parser.utils.distant_supervision import RuleBasedHeuristic
from weakly_supervised_parser.utils.prepare_dataset import PTBDataset
from weakly_supervised_parser.settings import PTB_TRAIN_SENTENCES_WITHOUT_PUNCTUATION_PATH

# Disable Dataset.map progress bar
set_progress_bar_enabled(False)

# ptb = PTBDataset(data_path=PTB_TRAIN_SENTENCES_WITHOUT_PUNCTUATION_PATH)
# ptb_top_100_common = [item.lower() for item in RuleBasedHeuristic(corpus=ptb.retrieve_all_sentences()).get_top_tokens(top_most_common_ptb=100)]
ptb_top_100_common = ['this', 'myself', 'shouldn', 'not', 'analysts', 'same', 'mightn', 'we', 'american', 'the', 'another', 'until', "aren't", 'when', 'if', 'am', 'over', 'ma', 'as', 'of', 'with', 'even', 'couldn', 'not', "needn't", 'where', 'there', 'isn', 'however', 'my', 'sales', 'here', 'at', 'yours', 'into', 'wouldn', 'officials', 'no', "hasn't", 'to', 'wasn', 'any', 'ours', 'out', 'each', "wasn't", 'is', 'and', 'me', 'off', 'once', "it's", 'they', 'most', 'also', 'through', 'hasn', 'our', 'or', 'after', "weren't", 'about', 'mr.', 'first', 'haven', 'needn', 'have', "isn't", 'now', "didn't", 'on', 'theirs', 'these', 'before', 'there', 'was', 'which', 'those', 'having', 'do', 'most', 'own', 'among', 'because', 'for', "should've", "shan't", 'so', 'being', 'few', 'too', 'to', 'at', 'people', 'her', 'meanwhile', 'both', 'down', 'doesn', 'below', 'mustn', 'an', 'two', 'more', 'japanese', 'ford', "you'd", 'about', 'but', 'doing', 'itself', 've', 'under', 'what', 'again', 'then', 'your', 'himself', 'now', 'against', 'just', 'does', 'net', "couldn't", 'that', 'he', 'revenue', 'because', 'yesterday', 'them', 'i', 'their', 'all', 'under', 'up', "haven't", 'while', "won't", 'it', 'more', 'it', 'ain', 'him', 'still', 'a', 'he', 'despite', 'should', 'during', 'nor', "shouldn't", 'such', "doesn't", 'are', "that'll", 'since', 'yourselves', 'such', 'those', 'after', 'weren', "you're", 'd', 'like', 'did', 'hadn', 'themselves', 'its', 'but', 'been', 's', "don't", 'these', 'they', 'this', 'his', "mightn't", 'moreover', 'how', 'new', 'above', 'ourselves', 'so', 'why', 'between', 'their', 'general', "wouldn't", 'who', 'i', 'in', 'don', 'shan', 'u.s.', 'ibm', 'separately', 'had', 'you', 'federal', 'if', 'our', 'and', 'only', 'y', 'many', 'one', 'no', 'though', 'won', 'last', 'from', 'each', 'traders', 'john', 'further', 'hers', 'both', "you've", "you'll", 'that', 'all', 'its', 'only', 'here', 'according', "mustn't", 'while', 'in', 'what', 'didn', 'when', 'some', 'on', 'can', 'yourself', 'herself', 'than', 'with', 'has', 'she', 'during', 'will', 'of', 'thus', 'you', 'very', 'o', 'investors', 'a', 'ms.', 'japan', 'were', 'the', 'we', 'm', 'as', 'll', 'be', 'by', 'other', 'yet', 'whom', 'some', 'indeed', 'other', "she's", "hadn't", 'by', 'earlier', 'for', 'instead', 'she', 'an', 't', 're', 'his', 'then', 'aren', 'although']
# ptb_most_common_first_token = RuleBasedHeuristic(corpus=ptb.retrieve_all_sentences()).augment_using_most_frequent_starting_token(N=1)[0][0].lower()
ptb_most_common_first_token = "the"


class PopulateCKYChart:
    def __init__(self, sentence):
        self.sentence = sentence
        self.sentence_list = sentence.split()
        self.sentence_length = len(sentence.split())
        self.span_scores = np.zeros((self.sentence_length + 1, self.sentence_length + 1), dtype=float)
        self.all_spans = NGramify(self.sentence).generate_ngrams(single_span=True, whole_span=True)

    def compute_scores(self, model, predict_type, scale_axis, predict_batch_size, chunks=128):
        inside_strings = []
        outside_strings = []
        inside_scores = []
        outside_scores = []

        for span in self.all_spans:
            _, inside_string, outside_string = InsideOutside(sentence=self.sentence).create_inside_outside_matrix(span)
            inside_strings.append(inside_string)
            outside_strings.append(outside_string)

        data = pd.DataFrame({"inside_sentence": inside_strings, "outside_sentence": outside_strings, "span": [span[0] for span in self.all_spans]})

        if predict_type == "inside":

            if data.shape[0] > chunks:
                data_chunks = np.array_split(data, data.shape[0] // chunks)
                for data_chunk in data_chunks:
                    inside_scores.extend(model.predict_proba(spans=data_chunk.rename(columns={"inside_sentence": "sentence"})[["sentence"]],
                                                             scale_axis=scale_axis,
                                                             predict_batch_size=predict_batch_size)[:, 1])
            else:
                inside_scores.extend(model.predict_proba(spans=data.rename(columns={"inside_sentence": "sentence"})[["sentence"]],
                                                         scale_axis=scale_axis,
                                                         predict_batch_size=predict_batch_size)[:, 1])

            data["inside_scores"] = inside_scores
            data.loc[
                (data["inside_sentence"].str.lower().str.startswith(ptb_most_common_first_token))
                & (data["inside_sentence"].str.lower().str.split().str.len() == 2)
                & (~data["inside_sentence"].str.lower().str.split().str[-1].isin(RuleBasedHeuristic().get_top_tokens())),
                "inside_scores",
            ] = 1

            is_upper_or_title = all([item.istitle() or item.isupper() for item in self.sentence.split()])
            is_stop = any([item for item in self.sentence.split() if item.lower() in ptb_top_100_common])

            flags = is_upper_or_title and not is_stop

            data["scores"] = data["inside_scores"]

        elif predict_type == "outside":
            outside_scores.extend(model.predict_proba(spans=data.rename(columns={"outside_sentence": "sentence"})[["sentence"]],
                                                      scale_axis=scale_axis,
                                                      predict_batch_size=predict_batch_size)[:, 1])
            data["outside_scores"] = outside_scores
            flags = False
            data["scores"] = data["outside_scores"]

        return flags, data

    def fill_chart(self, model, predict_type, scale_axis, predict_batch_size, data=None):
        if data is None:
            flags, data = self.compute_scores(model, predict_type, scale_axis, predict_batch_size)
        for span in self.all_spans:
            for i in range(0, self.sentence_length):
                for j in range(i + 1, self.sentence_length + 1):
                    if span[0] == (i, j):
                        self.span_scores[i, j] = data.loc[data["span"] == span[0], "scores"].item()
self.span_scores[i, j] = data.loc[data["span"] == span[0], "scores"].item()
|
90 |
+
return flags, self.span_scores, data
|
91 |
+
|
92 |
+
def best_parse_tree(self, span_scores):
|
93 |
+
span_scores_cky_format = span_scores[:-1, 1:]
|
94 |
+
best_parse = get_best_parse(sentence=[self.sentence_list], spans=span_scores_cky_format)
|
95 |
+
return best_parse
|
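For orientation, a minimal usage sketch of PopulateCKYChart follows. It is not part of the module: `inside_model` stands in for an already-loaded, trained InsideOutsideStringClassifier and the sentence is illustrative only.

# Hypothetical driver: score all spans of one sentence with a trained
# inside-string classifier, then decode the best parse from the chart.
chart = PopulateCKYChart(sentence="Investors welcomed the modest recovery")
flags, span_scores, data = chart.fill_chart(model=inside_model, predict_type="inside", scale_axis=1, predict_batch_size=128)
best_parse = chart.best_parse_tree(span_scores)  # bracketed tree returned by the CKY decoder
print(best_parse)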
weakly_supervised_parser/utils/prepare_dataset.py
ADDED
@@ -0,0 +1,165 @@
import csv
import pandas as pd

from sklearn.model_selection import train_test_split

from weakly_supervised_parser.utils.process_ptb import punctuation_words, currency_tags_words
from weakly_supervised_parser.utils.distant_supervision import RuleBasedHeuristic


filterchars = punctuation_words + currency_tags_words
filterchars = [char for char in filterchars if char not in list(",;-") and char not in "``" and char not in "''"]


class NGramify:
    def __init__(self, sentence):
        self.sentence = sentence.split()
        self.sentence_length = len(self.sentence)
        self.ngrams = []

    def generate_ngrams(self, single_span=True, whole_span=True):
        # number of substrings possible is N*(N+1)/2
        # exclude substring or spans of length 1 and length N
        if single_span:
            start = 1
        else:
            start = 2
        if whole_span:
            end = self.sentence_length + 1
        else:
            end = self.sentence_length
        for n in range(start, end):
            for i in range(self.sentence_length - n + 1):
                self.ngrams.append(((i, i + n), self.sentence[i : i + n]))
        return self.ngrams

    def generate_all_possible_spans(self):
        for n in range(2, self.sentence_length):
            for i in range(self.sentence_length - n + 1):
                if i > 0 and (i + n) < self.sentence_length:
                    self.ngrams.append(
                        (
                            (i, i + n),
                            " ".join(self.sentence[i : i + n]),
                            " ".join(self.sentence[0:i])
                            + " ("
                            + " ".join(self.sentence[i : i + n])
                            + ") "
                            + " ".join(self.sentence[i + n : self.sentence_length]),
                        )
                    )
                elif i == 0:
                    self.ngrams.append(
                        (
                            (i, i + n),
                            " ".join(self.sentence[i : i + n]),
                            "(" + " ".join(self.sentence[i : i + n]) + ") " + " ".join(self.sentence[i + n : self.sentence_length]),
                        )
                    )
                elif (i + n) == self.sentence_length:
                    self.ngrams.append(
                        (
                            (i, i + n),
                            " ".join(self.sentence[i : i + n]),
                            " ".join(self.sentence[0:i]) + " (" + " ".join(self.sentence[i : i + n]) + ")",
                        )
                    )
        return self.ngrams


class DataLoaderHelper:
    def __init__(self, input_file_object=None, output_file_object=None):
        self.input_file_object = input_file_object
        self.output_file_object = output_file_object

    def read_lines(self):
        with open(self.input_file_object, "r") as f:
            lines = f.read().splitlines()
        return lines

    def __getitem__(self, index):
        return self.read_lines()[index]

    def write_lines(self, keys, values):
        with open(self.output_file_object, "w", newline="\n") as output_file:
            dict_writer = csv.DictWriter(output_file, keys, delimiter="\t")
            dict_writer.writeheader()
            dict_writer.writerows(values)


class PTBDataset:
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path, sep="\t", header=None, names=["sentence"])
        self.data["sentence"] = self.data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data["sentence"].loc[index]

    def retrieve_all_sentences(self, N=None):
        if N:
            return self.data["sentence"].iloc[:N].tolist()
        return self.data["sentence"].tolist()

    def preprocess(self):
        # Remove punctuation and currency tokens from every sentence.
        self.data["sentence"] = self.data["sentence"].apply(
            lambda row: " ".join([sentence for sentence in row.split() if sentence not in filterchars])
        )
        return self.data

    def seed_bootstrap_constituent(self):
        # Whole sentences and contiguous title-case spans serve as positive (constituent) examples.
        whole_span_slice = self.data["sentence"]
        func = lambda x: RuleBasedHeuristic().add_contiguous_titlecase_words(
            row=[(index, character) for index, character in enumerate(x) if character.istitle() or "'" in character]
        )
        titlecase_matches = [item for sublist in self.data["sentence"].str.split().apply(func).tolist() for item in sublist if len(item.split()) > 1]
        titlecase_matches_df = pd.Series(titlecase_matches)
        titlecase_matches_df = titlecase_matches_df[~titlecase_matches_df.str.split().str[0].str.contains("'")].str.replace("''", "")
        most_frequent_start_token = RuleBasedHeuristic(corpus=self.retrieve_all_sentences()).augment_using_most_frequent_starting_token(N=1)[0][0]
        most_frequent_start_token_df = titlecase_matches_df[titlecase_matches_df.str.startswith(most_frequent_start_token)].str.lower()
        constituent_samples = pd.DataFrame(dict(sentence=pd.concat([whole_span_slice, titlecase_matches_df, most_frequent_start_token_df]), label=1))
        return constituent_samples

    def seed_bootstrap_distituent(self):
        # Right-truncated sentence prefixes serve as negative (distituent) examples.
        avg_sent_len = int(self.data["sentence"].str.split().str.len().mean())
        last_but_one_slice = self.data["sentence"].str.split().str[:-1].str.join(" ")
        last_but_two_slice = self.data[self.data["sentence"].str.split().str.len() > avg_sent_len + 10]["sentence"].str.split().str[:-2].str.join(" ")
        last_but_three_slice = (
            self.data[self.data["sentence"].str.split().str.len() > avg_sent_len + 20]["sentence"].str.split().str[:-3].str.join(" ")
        )
        last_but_four_slice = (
            self.data[self.data["sentence"].str.split().str.len() > avg_sent_len + 30]["sentence"].str.split().str[:-4].str.join(" ")
        )
        last_but_five_slice = (
            self.data[self.data["sentence"].str.split().str.len() > avg_sent_len + 40]["sentence"].str.split().str[:-5].str.join(" ")
        )
        last_but_six_slice = self.data[self.data["sentence"].str.split().str.len() > avg_sent_len + 50]["sentence"].str.split().str[:-6].str.join(" ")
        distituent_samples = pd.DataFrame(
            dict(
                sentence=pd.concat(
                    [
                        last_but_one_slice,
                        last_but_two_slice,
                        last_but_three_slice,
                        last_but_four_slice,
                        last_but_five_slice,
                        last_but_six_slice,
                    ]
                ),
                label=0,
            )
        )
        return distituent_samples

    def train_validation_split(self, seed, test_size=0.5, shuffle=True):
        # Combine the bootstrapped examples, deduplicate, and split into train/validation sets.
        self.preprocess()
        bootstrap_constituent_samples = self.seed_bootstrap_constituent()
        bootstrap_distituent_samples = self.seed_bootstrap_distituent()
        df = pd.concat([bootstrap_constituent_samples, bootstrap_distituent_samples], ignore_index=True)
        df = df.drop_duplicates(subset=["sentence"]).dropna(subset=["sentence"])
        df["sentence"] = df["sentence"].str.strip()
        df = df[df["sentence"].str.split().str.len() > 1]
        train, validation = train_test_split(df, test_size=test_size, random_state=seed, shuffle=shuffle)
        return train.head(8000), validation.head(2000)
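For orientation, a short usage sketch of the helpers above. It is not part of the module; the file path and seed are placeholders, not values taken from the repository.

# Span enumeration: each entry pairs (start, end) token indices with the covered
# tokens, e.g. ((0, 2), ["The", "cat"]); PopulateCKYChart writes scores at these indices.
spans = NGramify("The cat sat on the mat").generate_ngrams(single_span=True, whole_span=True)

# Bootstrap training data: whole sentences and contiguous title-case spans become
# constituent examples (label=1), right-truncated prefixes become distituent examples (label=0).
ptb = PTBDataset(data_path="ptb_train_sentences.txt")  # placeholder path: one sentence per line
train_df, validation_df = ptb.train_validation_split(seed=42)
print(len(spans), train_df.shape, validation_df.shape)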