Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Gut more server things
Browse files- server/config.py +0 -20
- server/conftest.py +0 -0
- server/data_processing/README.md +0 -35
- server/data_processing/__init__.py +0 -12
- server/data_processing/convenience_corpus.py +0 -71
- server/data_processing/corpus_data_wrapper.py +0 -147
- server/data_processing/create_corpus.py +0 -28
- server/data_processing/create_faiss.py +0 -78
- server/data_processing/create_hdf5.py +0 -71
- server/data_processing/index_wrapper.py +0 -88
- server/data_processing/sentence_data_wrapper.py +0 -331
- server/data_processing/sentence_extracting.py +0 -181
- server/main.py +0 -1
- server/utils/path_fixes.py +1 -0
- server/utils/token_processing.py +0 -1
server/config.py
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
This file stores the main configuration variables to run a server.
|
3 |
-
"""
|
4 |
-
from pathlib import Path
|
5 |
-
import utils.path_fixes as pf
|
6 |
-
import os
|
7 |
-
|
8 |
-
ROOT = Path(os.path.abspath(__file__)).parent
|
9 |
-
CORPORA = ROOT / "corpora"
|
10 |
-
|
11 |
-
# Change this to indicate what data is loaded for searching
|
12 |
-
RESOURCE_DIR = CORPORA / "gpt2" / "woz"
|
13 |
-
MODEL_VERSION = "gpt2"
|
14 |
-
# RESOURCE_DIR = CORPORA / "woz_bert-base-cased"
|
15 |
-
# MODEL_VERSION = "bert-base-cased"
|
16 |
-
|
17 |
-
# Below are DEFAULTS. Change only if you changed the way embeddings and contexts are stored and created
|
18 |
-
CORPUS = RESOURCE_DIR / "data.hdf5"
|
19 |
-
EMBEDDING_FAISS = RESOURCE_DIR / "embedding_faiss"
|
20 |
-
CONTEXT_FAISS = RESOURCE_DIR / "context_faiss"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/conftest.py
DELETED
File without changes
|
server/data_processing/README.md
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
# Creating an annotated corpus
|
2 |
-
This module contains the code necessary for extracting and labeling a corpus with semantic data.
|
3 |
-
|
4 |
-
## Known limitations
|
5 |
-
Please note the following:
|
6 |
-
|
7 |
-
- There are many cases in which BPE tokenization and spaCy's built-in tokenization do not align. To remedy this, contractions that would break the BPE tokenization (defined by spaCy's hard-coded exceptions in `spacy.lang.en.TOKENIZER_EXCEPTIONS` and `spacy.lang.tokenizer_exceptions.BASE_EXCEPTIONS`) are instead decomposed into the full words the contractions represent.
|
8 |
-
- Large corpus files require a LOT of hard drive space to store all the attentions and representations at every layer for every head. When tackling a corpus the size of The Wizard of Oz (207 KB), make sure you have at least 9 GB of free space. For the validation set of WikiText-2 (1.1 MB), you will need 47 GB.
|
9 |
-
|
10 |
-
## Getting Started
|
11 |
-
The raw Wizard of Oz text used to create the annotated corpus can be found [here](http://www.gutenberg.org/ebooks/55). A small version of Wikipedia (WikiText-2) can be found [here](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
|
12 |
-
|
13 |
-
### Environment
|
14 |
-
Because this module depends on code written in other parts of this repo, we will need to make those files available on the PYTHONPATH. There are several ways to do this, but the easiest way is to do the following:
|
15 |
-
|
16 |
-
1. `conda activate exbert` (Assuming you have taken the time to sort out the conda dependencies)
|
17 |
-
2. `cd server`
|
18 |
-
3. `pip install -e .`
|
19 |
-
|
20 |
-
This essentially makes this repository a local pip package, allowing you access to all packages inside of `server/` whenever the conda environment is active. For instance, if writing your own scripts or running a jupyter notebook, the top level `utils/token_processing` module will be available as `import utils.token_processing as tp`.
|
21 |
-
|
22 |
-
### Overview
|
23 |
-
To create your own dataset from scratch, you will need a large text file whose contents are in English. This repo currently does not support other languages.
|
24 |
-
|
25 |
-
1. Run `python create_corpus.py -f <FNAME>.txt -o <OUTDIR>`. This will create, in `<OUTDIR>`, the following files:
|
26 |
-
- `embeddings/` - A folder containing the `<FNAME>.hdf5` file and all the `<layer_**>.faiss` files needed to index into the embeddings. NOTE: These files can be quite large
|
27 |
-
- `headContext/` - A folder containing the `<FNAME>.hdf5` file and all the `<layer_**>.faiss` files needed to index into the head embeddings/context. NOTE: These files can be quite large
|
28 |
-
|
29 |
-
If you want to overwrite existing files in the output directory, add the `--force` flag onto the `create_corpus.py` command above.
|
30 |
-
|
31 |
-
### Running the individual scripts
|
32 |
-
2. Run `python create_hdf5.py -f <FNAME>.txt -o <OUTDIR>`
|
33 |
-
3. Run `python create_faiss.py -d <OUTDIR>`. This will assume the creation of the `embeddings` and `headContexts` folders inside of `<OUTDIR>`
|
34 |
-
|
35 |
-
You will then need to link these corpora into your application. In the `config.py` file, change the `RESOURCE_DIR` to point at `<OUTDIR>`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/__init__.py
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
from .corpus_data_wrapper import CorpusDataWrapper
|
2 |
-
from .convenience_corpus import ConvenienceCorpus, from_model
|
3 |
-
from .index_wrapper import Indexes, ContextIndexes
|
4 |
-
from .sentence_data_wrapper import TokenH5Data, SentenceH5Data
|
5 |
-
|
6 |
-
__all__ = [
|
7 |
-
'CorpusDataWrapper',
|
8 |
-
'Indexes',
|
9 |
-
'ContextIndexes',
|
10 |
-
'TokenH5Data',
|
11 |
-
'SentenceH5Data'
|
12 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/convenience_corpus.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
from .corpus_data_wrapper import CorpusDataWrapper
|
3 |
-
from .index_wrapper import Indexes, ContextIndexes
|
4 |
-
from config import CORPORA
|
5 |
-
from utils.f import memoize, delegates, GetAttr
|
6 |
-
from typing import List
|
7 |
-
|
8 |
-
def get_dir_names(path: Path) -> List[str]:
|
9 |
-
available = [g.name for g in filter(lambda g: g.is_dir(), path.glob("*"))]
|
10 |
-
return available
|
11 |
-
|
12 |
-
|
13 |
-
@memoize
|
14 |
-
def from_model(model_name, corpus_name):
|
15 |
-
"""Get the convenience corpus wrapper for a model and a corpus"""
|
16 |
-
model_dir = Path(CORPORA) / model_name
|
17 |
-
available = get_dir_names(model_dir)
|
18 |
-
if not model_dir.exists() or len(available) == 0:
|
19 |
-
raise FileNotFoundError("There are no corpora present for this model")
|
20 |
-
|
21 |
-
base_dir = model_dir / corpus_name
|
22 |
-
|
23 |
-
if not base_dir.exists():
|
24 |
-
raise FileNotFoundError(f"Desired corpus '{corpus_name}' not available")
|
25 |
-
|
26 |
-
return ConvenienceCorpus(base_dir)
|
27 |
-
|
28 |
-
def files_available(base_dir, glob_pattern="*.faiss"):
|
29 |
-
"""Determine whether the base_dir contains indexed files"""
|
30 |
-
if not base_dir.exists() or len(list(base_dir.glob(glob_pattern))) == 0:
|
31 |
-
return False
|
32 |
-
|
33 |
-
return True
|
34 |
-
class ConvenienceCorpus(GetAttr):
|
35 |
-
def __init__(self, base_dir):
|
36 |
-
bd = Path(base_dir)
|
37 |
-
self.base_dir = bd
|
38 |
-
self.model_dir = bd.parent
|
39 |
-
self.available_corpora = get_dir_names(self.model_dir)
|
40 |
-
|
41 |
-
self.model_name = self.model_dir.name
|
42 |
-
self.corpus_name = bd.name
|
43 |
-
self.name = f"{self.model_name}_{self.corpus_name}"
|
44 |
-
|
45 |
-
self.corpus_f = bd / 'data.hdf5'
|
46 |
-
self.embedding_dir = bd / 'embedding_faiss'
|
47 |
-
self.context_dir = bd / 'context_faiss'
|
48 |
-
|
49 |
-
# Define whether these different files exist or not
|
50 |
-
if not self.corpus_f.exists():
|
51 |
-
raise FileNotFoundError("Main HDF5 file does not exist")
|
52 |
-
|
53 |
-
self.embeddings_available = files_available(self.embedding_dir)
|
54 |
-
self.contexts_available = files_available(self.context_dir)
|
55 |
-
|
56 |
-
self.corpus = CorpusDataWrapper(self.corpus_f, self.name)
|
57 |
-
self.embedding_faiss = Indexes(self.embedding_dir)
|
58 |
-
self.context_faiss = ContextIndexes(self.context_dir)
|
59 |
-
|
60 |
-
self.default = self.corpus # Almost acts like an inherited class, but is rather a composed class
|
61 |
-
|
62 |
-
def search_embeddings(self, layer, query, k):
|
63 |
-
D, I = self.embedding_faiss.search(layer, query, k)
|
64 |
-
return self.find2d(I)[0]
|
65 |
-
|
66 |
-
def search_contexts(self, layer, heads, query, k):
|
67 |
-
D, I = self.context_faiss.search(layer, heads, query, k)
|
68 |
-
return self.find2d(I)[0]
|
69 |
-
|
70 |
-
def __repr__(self):
|
71 |
-
return f"ConvenienceCorpus({self.name})"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/corpus_data_wrapper.py
DELETED
@@ -1,147 +0,0 @@
|
|
1 |
-
import h5py
|
2 |
-
import numpy as np
|
3 |
-
from functools import partial
|
4 |
-
from utils.gen_utils import map_nlist, vround
|
5 |
-
import regex as re
|
6 |
-
from spacyface.simple_spacy_token import SimpleSpacyToken
|
7 |
-
from data_processing.sentence_data_wrapper import SentenceH5Data, TokenH5Data
|
8 |
-
from utils.f import ifnone
|
9 |
-
|
10 |
-
ZERO_BUFFER = 12 # Number of decimal places each index takes
|
11 |
-
main_key = r"{:0" + str(ZERO_BUFFER) + r"}"
|
12 |
-
|
13 |
-
def to_idx(idx:int):
|
14 |
-
return main_key.format(idx)
|
15 |
-
|
16 |
-
def zip_len_check(*iters):
|
17 |
-
"""Zip iterables with a check that they are all the same length"""
|
18 |
-
if len(iters) < 2:
|
19 |
-
raise ValueError(f"Expected at least 2 iterables to combine. Got {len(iters)} iterables")
|
20 |
-
n = len(iters[0])
|
21 |
-
for i in iters:
|
22 |
-
n_ = len(i)
|
23 |
-
if n_ != n:
|
24 |
-
raise ValueError(f"Expected all iterations to have len {n} but found {n_}")
|
25 |
-
|
26 |
-
return zip(*iters)
|
27 |
-
|
28 |
-
class CorpusDataWrapper:
|
29 |
-
"""A wrapper for both the token embeddings and the head context.
|
30 |
-
|
31 |
-
This class allows access into an HDF5 file designed according to the data_processing module's contents as if it were
|
32 |
-
an in-memory dictionary.
|
33 |
-
"""
|
34 |
-
|
35 |
-
def __init__(self, fname, name=None):
|
36 |
-
"""Open an hdf5 file of the format designed and provide easy access to its contents"""
|
37 |
-
|
38 |
-
# For iterating through the dataset
|
39 |
-
self.__curr = 0
|
40 |
-
|
41 |
-
self.__name = ifnone(name, "CorpusData")
|
42 |
-
self.fname = fname
|
43 |
-
self.data = h5py.File(fname, 'r')
|
44 |
-
|
45 |
-
main_keys = self.data.keys()
|
46 |
-
self.__len = len(main_keys)
|
47 |
-
|
48 |
-
assert self.__len > 0, "Cannot process an empty file"
|
49 |
-
|
50 |
-
embeds = self[0].embeddings
|
51 |
-
self.embedding_dim = embeds.shape[-1]
|
52 |
-
self.n_layers = embeds.shape[0] - 1 # 1 was added for the input layer
|
53 |
-
self.refmap, self.total_vectors = self._init_vector_map()
|
54 |
-
|
55 |
-
def __del__(self):
|
56 |
-
try: self.data.close()
|
57 |
-
|
58 |
-
# If run as a script, won't be able to close because of an import error
|
59 |
-
except ImportError: pass
|
60 |
-
|
61 |
-
except AttributeError:
|
62 |
-
print(f"Never successfully loaded {self.fname}")
|
63 |
-
|
64 |
-
def __iter__(self):
|
65 |
-
return self
|
66 |
-
|
67 |
-
def __len__(self):
|
68 |
-
return self.__len
|
69 |
-
|
70 |
-
def __next__(self):
|
71 |
-
if self.__curr >= self.__len:
|
72 |
-
self.__curr = 0
|
73 |
-
raise StopIteration
|
74 |
-
|
75 |
-
out = self[self.__curr]
|
76 |
-
self.__curr += 1
|
77 |
-
return out
|
78 |
-
|
79 |
-
def __getitem__(self, idx):
|
80 |
-
"""Index into the embeddings"""
|
81 |
-
if isinstance(idx, slice):
|
82 |
-
|
83 |
-
start = idx.start or 0
|
84 |
-
step = idx.step or 1
|
85 |
-
stop = idx.stop or (self.__len - 1)
|
86 |
-
stop = min(stop, self.__len)
|
87 |
-
|
88 |
-
i = start
|
89 |
-
out = []
|
90 |
-
while i < stop:
|
91 |
-
out.append(self[i])
|
92 |
-
i += step
|
93 |
-
|
94 |
-
return out
|
95 |
-
|
96 |
-
elif isinstance(idx, int):
|
97 |
-
if idx < 0: i = self.__len + idx
|
98 |
-
else: i = idx
|
99 |
-
|
100 |
-
key = to_idx(i)
|
101 |
-
return SentenceH5Data(self.data[key])
|
102 |
-
|
103 |
-
else:
|
104 |
-
raise NotImplementedError
|
105 |
-
|
106 |
-
def __repr__(self):
|
107 |
-
return f"{self.__name}: containing {self.__len} items"
|
108 |
-
|
109 |
-
def _init_vector_map(self):
|
110 |
-
"""Create main hashmap for all vectors to get their metadata.
|
111 |
-
|
112 |
-
TODO Initialization is a little slow... Should this be stored in a separate hdf5 file?
|
113 |
-
|
114 |
-
This doesn't change. Check for special hdf5 file and see if it exists already. If it does, open it.
|
115 |
-
If not, create it
|
116 |
-
"""
|
117 |
-
refmap = {}
|
118 |
-
print("Initializing reference map for embedding vector...")
|
119 |
-
n_vec = 0
|
120 |
-
for z, sentence in enumerate(self):
|
121 |
-
for i in range(len(sentence)):
|
122 |
-
refs = TokenH5Data(sentence, i)
|
123 |
-
refmap[n_vec] = refs
|
124 |
-
n_vec += 1
|
125 |
-
|
126 |
-
return refmap, n_vec
|
127 |
-
|
128 |
-
def extract(self, layer):
|
129 |
-
"""Extract embeddings from a particular layer from the dataset
|
130 |
-
|
131 |
-
For all examples
|
132 |
-
"""
|
133 |
-
embeddings = []
|
134 |
-
for i, embeds in enumerate(self):
|
135 |
-
embeddings.append(embeds[layer])
|
136 |
-
|
137 |
-
out = np.vstack(embeddings)
|
138 |
-
return out
|
139 |
-
|
140 |
-
def find(self, vec_num):
|
141 |
-
"""Find a vector's metadata (by id) in the hdf5 file. Needed to find sentence info and other attr"""
|
142 |
-
return self.refmap[vec_num]
|
143 |
-
|
144 |
-
def find2d(self, idxs):
|
145 |
-
"""Find a vector's metadata in the hdf5 file. Needed to find sentence info and other attr"""
|
146 |
-
out = [[self.refmap[i] for i in idx] for idx in idxs]
|
147 |
-
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/create_corpus.py
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
from pathlib import Path
|
3 |
-
|
4 |
-
def parse_args():
|
5 |
-
parser = argparse.ArgumentParser()
|
6 |
-
parser.add_argument('-f', '--file', help="Path to .txt file to analyze and annotate")
|
7 |
-
parser.add_argument("-o", "--outdir", help="Path of output directory inside of which to place <model>/<corpus>/ directory containing hdf5 and faiss files")
|
8 |
-
parser.add_argument("-n", "--name", default=None, help="Name the corpus with a code name. If not given, default to the name of the provided .txt file")
|
9 |
-
parser.add_argument("--force", action="store_true", help="If given, overwrite existing hdf5 and faiss files.")
|
10 |
-
parser.add_argument("-m", "--model", help="Specify the huggingface model to use for attentions")
|
11 |
-
parser.add_argument("--nomask", action='store_false', help="INCLUDE attentions from special tokens like [CLS] and [SEP]. By default, ignore these attentions")
|
12 |
-
|
13 |
-
return parser.parse_args()
|
14 |
-
|
15 |
-
if __name__ == "__main__":
|
16 |
-
from utils.f import ifnone
|
17 |
-
import create_hdf5
|
18 |
-
import create_faiss
|
19 |
-
|
20 |
-
args = parse_args()
|
21 |
-
|
22 |
-
f = Path(args.file)
|
23 |
-
corpus_name = ifnone(args.name, f.stem)
|
24 |
-
output_dir = Path(args.outdir) / args.model / corpus_name
|
25 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
26 |
-
|
27 |
-
create_hdf5.main(args.file, output_dir, args.force, args.model, args.nomask)
|
28 |
-
create_faiss.main(output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/create_faiss.py
DELETED
@@ -1,78 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
import faiss
|
3 |
-
import numpy as np
|
4 |
-
from data_processing.corpus_data_wrapper import CorpusDataWrapper
|
5 |
-
from data_processing.index_wrapper import LAYER_TEMPLATE
|
6 |
-
import argparse
|
7 |
-
|
8 |
-
# Get model from base_dir
|
9 |
-
# Use that information to get the model's configuration
|
10 |
-
# From this, get the special tokens associated with that model
|
11 |
-
# Have flag to allow model's special tokens to be ignored
|
12 |
-
# Test what items match 'bert-base-cased'
|
13 |
-
|
14 |
-
def parse_args():
|
15 |
-
parser = argparse.ArgumentParser()
|
16 |
-
parser.add_argument("-d", "--directory", help="Path to the directory that contains the 'embeddings' and 'headContext' folders")
|
17 |
-
|
18 |
-
args = parser.parse_args()
|
19 |
-
return args
|
20 |
-
|
21 |
-
def train_indexes(ce:CorpusDataWrapper, stepsize=100, drop_null=True):
|
22 |
-
"""
|
23 |
-
|
24 |
-
Parameters:
|
25 |
-
===========
|
26 |
-
- ce: Wrapper around HDF5 file for easy access to data
|
27 |
-
- stepsize: How many sentences to train with at once
|
28 |
-
- drop_null: Don't index the embeddings of special tokens (e.g., [CLS] and [SEP]) whose spacy POS are null
|
29 |
-
"""
|
30 |
-
NUM_LAYERS = ce.n_layers # want to account for the input layer, which for attentions + contexts is all value 0
|
31 |
-
|
32 |
-
embedding_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]
|
33 |
-
context_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]
|
34 |
-
|
35 |
-
for ix in range(0, len(ce), stepsize):
|
36 |
-
cdata = ce[ix:ix+stepsize]
|
37 |
-
|
38 |
-
if drop_null:
|
39 |
-
embeddings = np.concatenate([c.zero_special_embeddings for c in cdata], axis=1)
|
40 |
-
contexts = np.concatenate([c.zero_special_contexts for c in cdata], axis=1)
|
41 |
-
else:
|
42 |
-
embeddings = np.concatenate([c.embeddings for c in cdata], axis=1)
|
43 |
-
contexts = np.concatenate([c.contexts for c in cdata], axis=1)
|
44 |
-
|
45 |
-
for i in range(NUM_LAYERS):
|
46 |
-
embedding_indexes[i].add(embeddings[i])
|
47 |
-
context_indexes[i].add(contexts[i])
|
48 |
-
|
49 |
-
return embedding_indexes, context_indexes
|
50 |
-
|
51 |
-
def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
|
52 |
-
"""Save the faiss index into a file for each index in idxs"""
|
53 |
-
|
54 |
-
base_dir = Path(outdir)
|
55 |
-
if not base_dir.exists(): base_dir.mkdir(exist_ok=True, parents=True)
|
56 |
-
|
57 |
-
out_name = str(base_dir / base_name)
|
58 |
-
for i, idx in enumerate(idxs):
|
59 |
-
name = out_name.format(i)
|
60 |
-
print(f"Saving to {name}")
|
61 |
-
faiss.write_index(idx, name)
|
62 |
-
|
63 |
-
def main(basedir):
|
64 |
-
base = Path(basedir)
|
65 |
-
h5_fname = base / 'data.hdf5'
|
66 |
-
corpus = CorpusDataWrapper(h5_fname)
|
67 |
-
embedding_faiss, context_faiss = train_indexes(corpus)
|
68 |
-
|
69 |
-
context_faiss_dir = base / "context_faiss"
|
70 |
-
embedding_faiss_dir = base / "embedding_faiss"
|
71 |
-
save_indexes(embedding_faiss, embedding_faiss_dir)
|
72 |
-
save_indexes(context_faiss, context_faiss_dir)
|
73 |
-
|
74 |
-
if __name__ == "__main__":
|
75 |
-
# Creating the indices for both the context and embeddings
|
76 |
-
args = parse_args()
|
77 |
-
|
78 |
-
main(args.directory)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/create_hdf5.py
DELETED
@@ -1,71 +0,0 @@
|
|
1 |
-
import numpy as np
|
2 |
-
import torch
|
3 |
-
import h5py
|
4 |
-
import pickle
|
5 |
-
import argparse
|
6 |
-
from pathlib import Path
|
7 |
-
|
8 |
-
from data_processing.sentence_extracting import extract_chars, extract_lines
|
9 |
-
from data_processing.corpus_data_wrapper import CorpusDataWrapper, to_idx
|
10 |
-
from transformer_details import from_pretrained
|
11 |
-
|
12 |
-
MIN_SENTENCE_CHARLEN = 24
|
13 |
-
|
14 |
-
def parse_args():
|
15 |
-
parser = argparse.ArgumentParser()
|
16 |
-
parser.add_argument("-f", "--file", help="Path to .pckl file of unique sentences from a corpus.")
|
17 |
-
parser.add_argument("-o", "--outdir", help="Path of directory in which to store the analyzed sentences as a .hdf5")
|
18 |
-
parser.add_argument("-m", "--model", default="bert-base-cased", help="Which pretrained transformer model to use. See 'transformer_details.py' for supported models")
|
19 |
-
parser.add_argument("--nomask", action='store_false', help="By default, ignore attentions to special tokens like '[CLS]' and '[SEP]'. If given, include these attentions")
|
20 |
-
parser.add_argument("--force", action="store_true", help="If given, overwrite existing hdf5 files.")
|
21 |
-
|
22 |
-
args = parser.parse_args()
|
23 |
-
return args
|
24 |
-
|
25 |
-
def main(infile, outdir, force, model_name, mask_attentions):
|
26 |
-
outdir = Path(outdir)
|
27 |
-
outdir.mkdir(parents=True, exist_ok=True)
|
28 |
-
data_outfile = outdir / "data.hdf5"
|
29 |
-
f = h5py.File(data_outfile, 'a')
|
30 |
-
if force: f.clear()
|
31 |
-
|
32 |
-
extractor = from_pretrained(model_name)
|
33 |
-
|
34 |
-
# if "gpt" in model_name:
|
35 |
-
# mask_attentions = False
|
36 |
-
|
37 |
-
print_every = 50
|
38 |
-
long_strings = extract_chars(infile, 10000)
|
39 |
-
cutoff_sent = ""
|
40 |
-
i = 0
|
41 |
-
for strip in long_strings:
|
42 |
-
sentences = [sent.text for sent in extractor.aligner.spacy_nlp(strip).sents]
|
43 |
-
fixed_sentences = [cutoff_sent + sentences[0]] + sentences[1:-1]
|
44 |
-
|
45 |
-
# This leads to the possibility that there will be an input that is two sentences long. This is ok.
|
46 |
-
cutoff_sent = sentences[-1]
|
47 |
-
for s in fixed_sentences:
|
48 |
-
if len(s) < MIN_SENTENCE_CHARLEN: continue
|
49 |
-
if ((i + 1) % print_every) == 0: print(f"Starting sentence {i+1}: \n", s)
|
50 |
-
|
51 |
-
try:
|
52 |
-
out = extractor.att_from_sentence(s, mask_attentions=mask_attentions)
|
53 |
-
|
54 |
-
except Exception as e:
|
55 |
-
print(f"Error {e} occured at sentence {i}:\n{s}\n\n Skipping, not creating hdf5 grp")
|
56 |
-
continue
|
57 |
-
|
58 |
-
content = out.to_hdf5_content()
|
59 |
-
meta = out.to_hdf5_meta()
|
60 |
-
grp = f.create_group(to_idx(i))
|
61 |
-
for k,v in content.items(): grp.create_dataset(k, data=v)
|
62 |
-
for k, v in meta.items(): grp.attrs[k] = v
|
63 |
-
|
64 |
-
i += 1 # Increment to mark the next sentence
|
65 |
-
|
66 |
-
print("FINISHED CORPUS PROCESSING SUCCESSFULLY")
|
67 |
-
|
68 |
-
if __name__ == "__main__":
|
69 |
-
args = parse_args()
|
70 |
-
|
71 |
-
main(args.file, args.outdir, args.force, args.model, args.nomask)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/index_wrapper.py
DELETED
@@ -1,88 +0,0 @@
|
|
1 |
-
from functools import partial
|
2 |
-
import faiss
|
3 |
-
import numpy as np
|
4 |
-
from pathlib import Path
|
5 |
-
from typing import Iterable
|
6 |
-
from utils.f import memoize
|
7 |
-
from transformers import AutoConfig
|
8 |
-
|
9 |
-
@memoize
|
10 |
-
def get_config(model_name):
|
11 |
-
return AutoConfig.from_pretrained(model_name)
|
12 |
-
|
13 |
-
FAISS_LAYER_PATTERN = 'layer_*.faiss'
|
14 |
-
LAYER_TEMPLATE = 'layer_{:02d}.faiss'
|
15 |
-
|
16 |
-
def create_mask(head_size:int , n_heads:int, selected_heads:Iterable[int]):
|
17 |
-
"""Create a masked vector of size (head_size * n_heads), where 0 indicates we don't care about the contribution of that head 1 indicates that we do care
|
18 |
-
|
19 |
-
Parameters:
|
20 |
-
-----------
|
21 |
-
head_size: Hidden dimension of the heads
|
22 |
-
n_heads: Number of heads the model has
|
23 |
-
selected_heads: Which heads we don't want to zero out
|
24 |
-
"""
|
25 |
-
|
26 |
-
mask = np.zeros(n_heads)
|
27 |
-
for h in selected_heads:
|
28 |
-
mask[int(h)] = 1
|
29 |
-
|
30 |
-
return np.repeat(mask, head_size)
|
31 |
-
|
32 |
-
class Indexes:
|
33 |
-
"""Wrapper around the faiss indices to make searching for a vector simpler and faster.
|
34 |
-
|
35 |
-
Assumes there are files in the folder matching the pattern input
|
36 |
-
"""
|
37 |
-
def __init__(self, folder, pattern=FAISS_LAYER_PATTERN):
|
38 |
-
self.base_dir = Path(folder)
|
39 |
-
self.n_layers = len(list(self.base_dir.glob(pattern))) - 1 # Subtract final output
|
40 |
-
self.indexes = [None] * (self.n_layers + 1) # Initialize empty list, adding 1 for input
|
41 |
-
self.pattern = pattern
|
42 |
-
self.__init_indexes()
|
43 |
-
|
44 |
-
# Extract model name from folder hierarchy
|
45 |
-
self.model_name = self.base_dir.parent.parent.stem
|
46 |
-
self.config = get_config(self.model_name)
|
47 |
-
self.nheads = self.config.num_attention_heads
|
48 |
-
self.hidden_size = self.config.hidden_size
|
49 |
-
assert (self.hidden_size % self.nheads) == 0, "Number of heads does not divide cleanly into the hidden size. Aborting"
|
50 |
-
self.head_size = int(self.config.hidden_size / self.nheads)
|
51 |
-
|
52 |
-
|
53 |
-
def __getitem__(self, v):
|
54 |
-
"""Slices not allowed, but index only"""
|
55 |
-
return self.indexes[v]
|
56 |
-
|
57 |
-
def __init_indexes(self):
|
58 |
-
for fname in self.base_dir.glob(self.pattern):
|
59 |
-
print(fname)
|
60 |
-
idx = fname.stem.split('_')[-1]
|
61 |
-
self.indexes[int(idx)] = faiss.read_index(str(fname))
|
62 |
-
|
63 |
-
def search(self, layer, query, k):
|
64 |
-
"""Search a given layer for the query vector. Return k results"""
|
65 |
-
return self[layer].search(query, k)
|
66 |
-
|
67 |
-
|
68 |
-
class ContextIndexes(Indexes):
|
69 |
-
"""Special index enabling masking of particular heads before searching"""
|
70 |
-
|
71 |
-
def __init__(self, folder, pattern=FAISS_LAYER_PATTERN):
|
72 |
-
super().__init__(folder, pattern)
|
73 |
-
|
74 |
-
self.head_mask = partial(create_mask, self.head_size, self.nheads)
|
75 |
-
|
76 |
-
# Int -> [Int] -> np.Array -> Int -> (np.Array(), )
|
77 |
-
def search(self, layer:int, heads:list, query:np.ndarray, k:int):
|
78 |
-
"""Search the embeddings for the context layer, masking by selected heads"""
|
79 |
-
assert max(heads) < self.nheads, "max of selected heads must be lest than nheads. Are you indexing by 1 instead of 0?"
|
80 |
-
assert min(heads) >= 0, "What is a negative head?"
|
81 |
-
|
82 |
-
unique_heads = list(set(heads))
|
83 |
-
mask_vector = self.head_mask(unique_heads)
|
84 |
-
mask_vector = mask_vector.reshape(query.shape)
|
85 |
-
|
86 |
-
new_query = (query * mask_vector).astype(np.float32)
|
87 |
-
|
88 |
-
return self[layer].search(new_query, k)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/sentence_data_wrapper.py
DELETED
@@ -1,331 +0,0 @@
|
|
1 |
-
import h5py
|
2 |
-
import numpy as np
|
3 |
-
from functools import partial
|
4 |
-
from utils.gen_utils import map_nlist, vround
|
5 |
-
import regex as re
|
6 |
-
from spacyface.simple_spacy_token import SimpleSpacyToken
|
7 |
-
|
8 |
-
ZERO_BUFFER = 12 # Number of decimal places each index takes
|
9 |
-
main_key = r"{:0" + str(ZERO_BUFFER) + r"}"
|
10 |
-
suppl_attn_key = r"{:0" + str(ZERO_BUFFER) + r"}_attn"
|
11 |
-
|
12 |
-
def zip_len_check(*iters):
|
13 |
-
"""Zip iterables with a check that they are all the same length"""
|
14 |
-
if len(iters) < 2:
|
15 |
-
raise ValueError(f"Expected at least 2 iterables to combine. Got {len(iters)} iterables")
|
16 |
-
n = len(iters[0])
|
17 |
-
for i in iters:
|
18 |
-
n_ = len(i)
|
19 |
-
if n_ != n:
|
20 |
-
raise ValueError(f"Expected all iterations to have len {n} but found {n_}")
|
21 |
-
|
22 |
-
return zip(*iters)
|
23 |
-
|
24 |
-
class SentenceH5Data:
|
25 |
-
def __init__(self, grp):
|
26 |
-
self.grp = grp
|
27 |
-
|
28 |
-
@property
|
29 |
-
def n_layers(self):
|
30 |
-
return self.embeddings.shape[0] - 1 # 1 was added at the input, not a hidden layer
|
31 |
-
|
32 |
-
@property
|
33 |
-
def sentence(self):
|
34 |
-
return self.grp.attrs['sentence']
|
35 |
-
|
36 |
-
@property
|
37 |
-
def embeddings(self):
|
38 |
-
return self.grp['embeddings'][:]
|
39 |
-
|
40 |
-
@property
|
41 |
-
def zero_special_embeddings(self):
|
42 |
-
out = self.embeddings.copy()
|
43 |
-
out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
|
44 |
-
return out
|
45 |
-
|
46 |
-
@property
|
47 |
-
def contexts(self):
|
48 |
-
return self.grp['contexts'][:]
|
49 |
-
|
50 |
-
@property
|
51 |
-
def zero_special_contexts(self):
|
52 |
-
out = self.contexts.copy()
|
53 |
-
out[:, self.mask_is_special] = np.zeros(out[:, self.mask_is_special].shape)
|
54 |
-
return out
|
55 |
-
|
56 |
-
@property
|
57 |
-
def attentions(self):
|
58 |
-
"""Return all attentions, including [CLS] and [SEP]
|
59 |
-
|
60 |
-
Note that if the hdf5 is created with CLS and SEP attentions, it will have CLS and SEP attentions"""
|
61 |
-
return self.grp['attentions'][:] # Converts to numpy array
|
62 |
-
|
63 |
-
@property
|
64 |
-
def mask_is_special(self):
|
65 |
-
return np.logical_or(self.deps == '', self.poss == '')
|
66 |
-
|
67 |
-
@property
|
68 |
-
def tokens(self):
|
69 |
-
return self.grp.attrs['token']
|
70 |
-
|
71 |
-
@property
|
72 |
-
def poss(self):
|
73 |
-
return self.grp.attrs['pos']
|
74 |
-
|
75 |
-
@property
|
76 |
-
def deps(self):
|
77 |
-
return self.grp.attrs['dep']
|
78 |
-
|
79 |
-
@property
|
80 |
-
def is_ents(self):
|
81 |
-
return self.grp.attrs['is_ent']
|
82 |
-
|
83 |
-
@property
|
84 |
-
def heads(self):
|
85 |
-
"""Not the attention heads, but rather the head word of the orig sentence"""
|
86 |
-
return self.grp.attrs['head']
|
87 |
-
|
88 |
-
@property
|
89 |
-
def norms(self):
|
90 |
-
return self.grp.attrs['norm']
|
91 |
-
|
92 |
-
@property
|
93 |
-
def tags(self):
|
94 |
-
return self.grp.attrs['tag']
|
95 |
-
|
96 |
-
@property
|
97 |
-
def lemmas(self):
|
98 |
-
return self.grp.attrs['lemma']
|
99 |
-
|
100 |
-
def __len__(self):
|
101 |
-
return len(self.tokens)
|
102 |
-
|
103 |
-
def __repr__(self):
|
104 |
-
sent_len = 40
|
105 |
-
if len(self.sentence) > sent_len: s = self.sentence[:(sent_len - 3)] + '...'
|
106 |
-
else: s = self.sentence
|
107 |
-
return f"SentenceH5Data({s})"
|
108 |
-
|
109 |
-
class TokenH5Data(SentenceH5Data):
|
110 |
-
"""A wrapper around the HDF5 file storage information allowing easy access to information about each
|
111 |
-
processed sentence.
|
112 |
-
|
113 |
-
Sometimes, an index of -1 is used to represent the entire object in memory
|
114 |
-
"""
|
115 |
-
def __init__(self, grp, index):
|
116 |
-
"""Represents returned from the refmap of the CorpusEmbedding class"""
|
117 |
-
if type(grp) == SentenceH5Data: super().__init__(grp.grp)
|
118 |
-
elif type(grp) == h5py._hl.group.Group: super().__init__(grp)
|
119 |
-
self.index = index
|
120 |
-
|
121 |
-
@property
|
122 |
-
def embedding(self):
|
123 |
-
return self.embeddings[:, self.index, :]
|
124 |
-
|
125 |
-
@property
|
126 |
-
def context(self):
|
127 |
-
return self.contexts[:, self.index, :]
|
128 |
-
|
129 |
-
@property
|
130 |
-
def attentions_out(self):
|
131 |
-
"""Access all attention OUT of this token"""
|
132 |
-
output = self.attentions[:,:, self.index, :]
|
133 |
-
return output
|
134 |
-
|
135 |
-
@property
|
136 |
-
def attentions_in(self):
|
137 |
-
"""Access all attention INTO this token"""
|
138 |
-
new_attention = self.attentions.transpose((0,1,3,2))
|
139 |
-
return new_attention[:,:, self.index, :]
|
140 |
-
|
141 |
-
def _select_from_attention(self, layer, heads):
|
142 |
-
if type(heads) is int:
|
143 |
-
heads = [heads]
|
144 |
-
|
145 |
-
# Select layer and heads
|
146 |
-
modified_attentions = self.attentions[layer, heads].mean(0)
|
147 |
-
attentions_out = modified_attentions
|
148 |
-
attentions_in = modified_attentions.transpose()
|
149 |
-
return attentions_out, attentions_in
|
150 |
-
|
151 |
-
def _calc_offset_single(self, attention):
|
152 |
-
"""Get offset to location of max attention"""
|
153 |
-
curr_idx = self.index
|
154 |
-
max_atts = np.argmax(attention)
|
155 |
-
return max_atts - curr_idx
|
156 |
-
|
157 |
-
# Define metadata properties.
|
158 |
-
# Right now, needs manual curation of fields from SimpleSpacyToken. Ideally, this is automated
|
159 |
-
|
160 |
-
@property
|
161 |
-
def token(self):
|
162 |
-
return self.tokens[self.index]
|
163 |
-
|
164 |
-
@property
|
165 |
-
def pos(self):
|
166 |
-
return self.poss[self.index]
|
167 |
-
|
168 |
-
@property
|
169 |
-
def dep(self):
|
170 |
-
return self.deps[self.index]
|
171 |
-
|
172 |
-
@property
|
173 |
-
def is_ent(self):
|
174 |
-
return bool(self.is_ents[self.index])
|
175 |
-
|
176 |
-
@property
|
177 |
-
def norm(self):
|
178 |
-
return self.norms[self.index]
|
179 |
-
|
180 |
-
@property
|
181 |
-
def head(self):
|
182 |
-
return self.heads[self.index]
|
183 |
-
|
184 |
-
@property
|
185 |
-
def lemma(self):
|
186 |
-
return self.lemmas[self.index]
|
187 |
-
|
188 |
-
@property
|
189 |
-
def tag(self):
|
190 |
-
return self.tags[self.index]
|
191 |
-
|
192 |
-
def to_json(self, layer, heads, top_k=5, ndigits=4):
|
193 |
-
"""
|
194 |
-
Convert token information and attention to return to frontend
|
195 |
-
|
196 |
-
Require layer, heads, and top_k to convert the attention into value to return to frontend.
|
197 |
-
|
198 |
-
Output:
|
199 |
-
{
|
200 |
-
sentence: str
|
201 |
-
index: number
|
202 |
-
match: str
|
203 |
-
is_match: bool
|
204 |
-
is_next_word: bool
|
205 |
-
matched_att: {
|
206 |
-
in: { att: number[]
|
207 |
-
, offset_to_max: number
|
208 |
-
, loc_of_max: float
|
209 |
-
}
|
210 |
-
out: { att: number[]
|
211 |
-
, offset_to_max: number
|
212 |
-
, loc_of_max: float
|
213 |
-
}
|
214 |
-
},
|
215 |
-
matched_att_plus_1: {
|
216 |
-
in: { att: number[]
|
217 |
-
, offset_to_max: number
|
218 |
-
}
|
219 |
-
out: { att: number[]
|
220 |
-
, offset_to_max: number
|
221 |
-
}
|
222 |
-
}
|
223 |
-
tokens: List[
|
224 |
-
{ token: string
|
225 |
-
, pos: string
|
226 |
-
, dep: string
|
227 |
-
, is_ent: boolean
|
228 |
-
, inward: number[]
|
229 |
-
, outward: number[]
|
230 |
-
}
|
231 |
-
]
|
232 |
-
}
|
233 |
-
"""
|
234 |
-
keys = [
|
235 |
-
"token",
|
236 |
-
"pos",
|
237 |
-
"dep",
|
238 |
-
"is_ent",
|
239 |
-
"inward",
|
240 |
-
"outward",
|
241 |
-
]
|
242 |
-
|
243 |
-
token_arr = []
|
244 |
-
matched_attentions = {}
|
245 |
-
N = len(self)
|
246 |
-
|
247 |
-
# Iterate through the following
|
248 |
-
tokens = self.tokens.tolist()
|
249 |
-
poss = [p.lower() for p in self.poss.tolist()]
|
250 |
-
deps = [d.lower() for d in self.deps.tolist()]
|
251 |
-
ents = self.is_ents.tolist()
|
252 |
-
attentions_out, attentions_in = self._select_from_attention(layer, heads)
|
253 |
-
|
254 |
-
matched_att_plus_1 = None
|
255 |
-
next_index = None
|
256 |
-
|
257 |
-
for i, tok_info in enumerate(zip_len_check(
|
258 |
-
tokens
|
259 |
-
, poss
|
260 |
-
, deps
|
261 |
-
, ents
|
262 |
-
, attentions_out.tolist()
|
263 |
-
, attentions_in.tolist())):
|
264 |
-
|
265 |
-
def get_interesting_attentions():
|
266 |
-
return {
|
267 |
-
"in": {
|
268 |
-
"att": att_in,
|
269 |
-
"offset_to_max": self._calc_offset_single(att_in).item(),
|
270 |
-
# "loc_of_max": np.argmax(att_in), # Broken
|
271 |
-
},
|
272 |
-
"out": {
|
273 |
-
"att": att_out,
|
274 |
-
"offset_to_max": self._calc_offset_single(att_out).item(),
|
275 |
-
# "loc_of_max": np.argmax(att_out), # Broken
|
276 |
-
}
|
277 |
-
}
|
278 |
-
|
279 |
-
|
280 |
-
# Perform rounding of attentions
|
281 |
-
rounder = partial(round, ndigits=ndigits)
|
282 |
-
att_out = map_nlist(rounder, tok_info[-2])
|
283 |
-
att_in = map_nlist(rounder, tok_info[-1])
|
284 |
-
|
285 |
-
obj = {k: v for (k, v) in zip_len_check(keys, tok_info)}
|
286 |
-
|
287 |
-
IS_LAST_TOKEN = i == (N-1)
|
288 |
-
|
289 |
-
if (i == self.index) or ((i - 1) == self.index):
|
290 |
-
interesting_attentions = get_interesting_attentions()
|
291 |
-
|
292 |
-
if i == self.index:
|
293 |
-
obj['is_match'] = True
|
294 |
-
matched_attentions = interesting_attentions
|
295 |
-
|
296 |
-
elif (i-1) == self.index:
|
297 |
-
matched_att_plus_1 = interesting_attentions
|
298 |
-
obj['is_next_word'] = True
|
299 |
-
next_index = i
|
300 |
-
|
301 |
-
# Edge case for final iteration through sentence
|
302 |
-
|
303 |
-
else:
|
304 |
-
obj['is_match'] = False
|
305 |
-
obj['is_next_word'] = False
|
306 |
-
|
307 |
-
if (IS_LAST_TOKEN and (matched_att_plus_1 is None)):
|
308 |
-
print("Saving matched_att_plus_1 to: ", interesting_attentions)
|
309 |
-
obj['is_next_word'] = True
|
310 |
-
matched_att_plus_1 = get_interesting_attentions()
|
311 |
-
next_index = i
|
312 |
-
|
313 |
-
token_arr.append(obj)
|
314 |
-
|
315 |
-
next_token = self.tokens[next_index]
|
316 |
-
|
317 |
-
obj = {
|
318 |
-
"sentence": self.sentence,
|
319 |
-
"index": self.index,
|
320 |
-
"match": self.token,
|
321 |
-
"next_index": next_index,
|
322 |
-
"match_plus_1": next_token,
|
323 |
-
"matched_att": matched_attentions,
|
324 |
-
"matched_att_plus_1": matched_att_plus_1,
|
325 |
-
"tokens": token_arr,
|
326 |
-
}
|
327 |
-
|
328 |
-
return obj
|
329 |
-
|
330 |
-
def __repr__(self):
|
331 |
-
return f"{self.token}: [{self.pos}, {self.dep}, {self.is_ent}]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/data_processing/sentence_extracting.py
DELETED
@@ -1,181 +0,0 @@
|
|
1 |
-
"""Extractor functions to retrieve sentences by character chunks from a file
|
2 |
-
|
3 |
-
This script contains the logic that allows the user to process and filter
|
4 |
-
sentences of the original corpus. By default, this considers a minimum sentence
|
5 |
-
length, and removes newlines and multiple consecutive spaces.
|
6 |
-
|
7 |
-
Configuration for existing functionality is at the top of the file. Feel free to
|
8 |
-
add new processing and/or filter functions. The "process_line" and "filter_line"
|
9 |
-
functions contain the pipeline for processing the scripts as needed.
|
10 |
-
|
11 |
-
"""
|
12 |
-
import regex as re
|
13 |
-
import argparse
|
14 |
-
from pathlib import Path
|
15 |
-
from functools import partial
|
16 |
-
from typing import Union
|
17 |
-
|
18 |
-
MIN_LINE_LENGTH = 8 # words
|
19 |
-
|
20 |
-
def parse_args():
    """Parse the command-line arguments of the sentence extraction script.

    Returns:
        The parsed namespace with attributes `file` and `outdir`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="Path to .txt file to analyze and annotate")
    # The script writes `<input stem>.txt` into this directory (see `main`), not a pickle.
    parser.add_argument("-o", "--outdir", help="Path of directory in which to store the extracted sentences as a .txt file")
    return parser.parse_args()
|
28 |
-
|
29 |
-
# ============================================================
|
30 |
-
# Helper functions
|
31 |
-
# ============================================================
|
32 |
-
# String -> String
|
33 |
-
def replace_newlines(s:str) -> str:
    """Replace every run of one or more newline characters with a single space."""
    newline_run = r"\n+"
    return re.sub(newline_run, " ", s)
|
35 |
-
|
36 |
-
# String -> String
|
37 |
-
def replace_multispace(s:str) -> str:
    """Replace every run of whitespace characters with a single space."""
    whitespace_run = r"\s+"
    return re.sub(whitespace_run, " ", s)
|
39 |
-
|
40 |
-
def is_short_sentence(s:str, min_len=8) -> bool:
    """Returns True if the sentence has less than `min_len` number of words

    Words are whitespace-separated tokens. Leading/trailing whitespace does not
    contribute empty "words" (splitting on a literal ' ' used to count them, so a
    line with a trailing space passed the filter with one word too few).
    """
    return len(s.split()) < min_len
|
43 |
-
|
44 |
-
def contains_char(char:str, s:str) -> bool:
    """Return True if `char` occurs in `s` (substring membership test)."""
    return char in s
|
46 |
-
|
47 |
-
# ============================================================
|
48 |
-
# Compilation functions
|
49 |
-
# ============================================================
|
50 |
-
|
51 |
-
def process_line(line:str) -> str:
    """Normalize the whitespace of a chunk of file.

    Newlines are turned into spaces first, then every run of whitespace is
    collapsed into one space.

    Args:
        line: Chunk of text

    Returns:
        Input that has been stripped of newlines and multiple consecutive spaces.
    """
    without_newlines = replace_newlines(line)
    return replace_multispace(without_newlines)
|
62 |
-
|
63 |
-
def filter_line(line:str) -> bool:
    """Decide whether a processed line should be kept.

    A line is kept when it is not shorter than MIN_LINE_LENGTH words. Redefine
    this function with other helper checks if a different filter is wanted;
    return True for lines to keep.
    """
    too_short = is_short_sentence(line, MIN_LINE_LENGTH)
    return False if too_short else True
|
71 |
-
|
72 |
-
# ============================================================
|
73 |
-
# Main Logic
|
74 |
-
# ============================================================
|
75 |
-
|
76 |
-
def read_outcomes(chars:str) -> Union[str, None]:
    """Turn a raw chunk of characters into a processed line, or reject it.

    Args:
        chars: Chunk of text to process

    Returns:
        The processed chunk of text, or None if it does not pass `filter_line`.

    Raises:
        StopIteration: If `chars` is the empty string "", which the readers
            produce once the end of the file has been reached.
    """
    if chars == '':
        raise StopIteration

    cleaned = process_line(chars)
    return cleaned if filter_line(cleaned) else None
|
95 |
-
|
96 |
-
def get_chars(n:int, f) -> Union[str, None]:
    """Read up to `n` characters from the opened file `f` and process them.

    Args:
        n: Number of characters to read from the opened file
        f: Opened file from the return of `open(fname)`

    Returns:
        The processed chunk of text, or None if it does not pass the filtering

    Raises:
        Nothing of its own; the StopIteration raised by `read_outcomes` at the
        end of the file propagates to the caller.
    """
    return read_outcomes(f.read(n))
|
112 |
-
|
113 |
-
def get_line(f):
    """Read the next line of the open file `f` and process it.

    Three outcomes are possible:

    1. StopIteration propagates when the file has reached its end
    2. The processed line is returned if it passes the filter
    3. None is returned if the line does not pass the filter
    """
    # readline() yields "" only at end of file; a blank line is still "\n".
    return read_outcomes(f.readline())
|
122 |
-
|
123 |
-
def read_on(reader, f):
    """Repeatedly apply `reader` to the open file `f` and yield what it returns.

    Args:
        reader: A unary function of signature (f: _io.TextIOWrapper) -> str
        f: An opened file, as returned by `open(fname)`

    Yields:
        Every non-None result of `reader(f)`, until `reader` raises
        StopIteration to signal the end of the file.
    """
    while True:
        try:
            item = reader(f)
        except StopIteration:
            return

        # None marks a chunk that was filtered out; skip it and keep reading.
        if item is None:
            continue
        yield item
|
141 |
-
|
142 |
-
|
143 |
-
def extract_chars(infile, n=10000):
    """Yield processed chunks of `n` characters from a file.

    Args:
        infile: Path of the text file to read.
        n: Number of characters per chunk.

    Returns:
        A generator of processed chunks that pass the filter. The file is opened
        immediately (so a bad path fails at call time, as before) and is closed
        once the generator is exhausted or closed.
    """
    reader = partial(get_chars, n)
    src = open(infile, 'r')

    def _stream():
        # The original `src.close()` sat after `return` and never ran; tying the
        # handle to the generator's lifetime releases it reliably.
        with src:
            yield from read_on(reader, src)

    return _stream()
|
149 |
-
|
150 |
-
|
151 |
-
def extract_lines(infile):
    """Given a file, yield the processed lines from that file

    Args:
        infile: Path of the text file to read.

    Returns:
        A generator of processed lines that pass the filter. The file is opened
        immediately (so a bad path fails at call time, as before) and is closed
        once the generator is exhausted or closed.
    """
    src = open(infile, 'r')

    def _stream():
        # The original `src.close()` sat after `return` and never ran; tying the
        # handle to the generator's lifetime releases it reliably.
        with src:
            yield from read_on(get_line, src)

    return _stream()
|
156 |
-
|
157 |
-
|
158 |
-
def extract_sentences_to_file(infile, outfname:str):
    """Extract sentences from a file into a new file indicated by `outfname`.

    Args:
        infile: Path of the text file to read.
        outfname: Path of the file to create. It is opened in exclusive mode
            ('x'), so an existing file raises FileExistsError.

    Returns:
        `outfname`, so callers can report where the sentences were written.
    """
    # `with` closes the output even if reading or writing fails part-way.
    with open(outfname, 'x') as out:
        for line in extract_lines(infile):
            out.write(line + "\n")

    return outfname
|
168 |
-
|
169 |
-
def main(infile, outdir):
    """Create `outdir` if needed and save the processed sentences of `infile` into it.

    Args:
        infile: Path of the text file to process.
        outdir: Directory that receives the output; created with parents if missing.

    Returns:
        Path of the written file, `outdir / (<stem of infile> + '.txt')`.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / (Path(infile).stem + '.txt')

    extract_sentences_to_file(infile, outfile)

    # Return the path computed here rather than relying on the helper's return value.
    return outfile
|
178 |
-
|
179 |
-
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the extraction.
    cli_args = parse_args()
    main(infile=cli_args.file, outdir=cli_args.outdir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/main.py
CHANGED
@@ -5,7 +5,6 @@ from flask_cors import CORS
|
|
5 |
from flask import render_template, redirect, send_from_directory
|
6 |
|
7 |
import utils.path_fixes as pf
|
8 |
-
import config
|
9 |
from utils.f import ifnone
|
10 |
|
11 |
from data_processing import from_model
|
|
|
5 |
from flask import render_template, redirect, send_from_directory
|
6 |
|
7 |
import utils.path_fixes as pf
|
|
|
8 |
from utils.f import ifnone
|
9 |
|
10 |
from data_processing import from_model
|
server/utils/path_fixes.py
CHANGED
@@ -5,6 +5,7 @@ FAISS_LAYER_PATTERN = 'layer_*.faiss'
|
|
5 |
LAYER_TEMPLATE = 'layer_{:02d}.faiss'
|
6 |
|
7 |
ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
|
|
|
8 |
DATA_DIR = ROOT_DIR / 'server' / 'data'
|
9 |
DATASET_DIR = Path.home() / 'Datasets'
|
10 |
ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
|
|
|
5 |
LAYER_TEMPLATE = 'layer_{:02d}.faiss'
|
6 |
|
7 |
ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
|
8 |
+
# `ROOT` is not defined in this module (it lived in the deleted server/config.py, where it
# pointed at the server directory); build the same location from ROOT_DIR instead.
CORPORA = ROOT_DIR / 'server' / 'corpora'
|
9 |
DATA_DIR = ROOT_DIR / 'server' / 'data'
|
10 |
DATASET_DIR = Path.home() / 'Datasets'
|
11 |
ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
|
server/utils/token_processing.py
CHANGED
@@ -5,7 +5,6 @@ If adding more metadata, modify the definitions in `to_spacy_meta` and `meta_to_
|
|
5 |
import h5py
|
6 |
import numpy as np
|
7 |
import spacy
|
8 |
-
import config
|
9 |
from transformers.tokenization_bert import BertTokenizer
|
10 |
from .f import flatten_, assoc, memoize, GetAttr
|
11 |
|
|
|
5 |
import h5py
|
6 |
import numpy as np
|
7 |
import spacy
|
|
|
8 |
from transformers.tokenization_bert import BertTokenizer
|
9 |
from .f import flatten_, assoc, memoize, GetAttr
|
10 |
|