"""Convert FrameNet (1.5 or 1.7) full-text documents and exemplar sentences into
Concrete communications, written out as .tar.gz archives."""
import argparse
import os
from collections import defaultdict
from typing import Dict, Any

from concrete.util import CommunicationWriterTGZ
from nltk.corpus import framenet, framenet15
from tqdm import tqdm

from sftp.data_reader.concrete_srl import concrete_doc
from tools.framenet.fn_util import framenet_split, Sentence as TokSentence


def process_sentence(sent) -> Dict[str, Any]:
    # Convert one annotated sentence into a dict of raw text, token spans, and frame annotations.
    ret = {'sentence': sent.text, 'tokenization': list(), 'annotations': list()}
    tok_sent = TokSentence(sent.text)
    for token in tok_sent.tokens:
        # Record one (start, end) span per token.
        ret['tokenization'].append((token.idx, token.idx_end - 1))

    def process_one_ann_set(ann_set):
        # Each annotation set becomes an event with a frame label, a target span,
        # and its frame elements as children.
        ret['annotations'].append(event := {'label': ann_set.frame.name, 'children': (arg_list := list())})
        target_list = list()
        for tar_start, tar_end in ann_set.Target:
            # Collect every token index covered by the (possibly discontinuous) target.
            target_list.extend(
                list(range(tok_sent.span(tar_start, tar_end)[0], tok_sent.span(tar_start, tar_end)[1] + 1))
            )
        target_list.sort()
        event['span'] = (target_list[0], target_list[-1])
        for fe_start, fe_end, fe_name in ann_set.FE[0]:
            # FE[0] holds the frame elements that have overt spans in the sentence.
            fe_start, fe_end = tok_sent.span(fe_start, fe_end)
            arg_list.append({
                'span': (fe_start, fe_end),
                'label': fe_name
            })

    # Full-text sentences store their annotation sets under `annotationSet`;
    # exemplar sentences carry the target annotation at the top level.
    if 'annotationSet' in sent:
        for ann_item in sent.annotationSet:
            if 'Target' not in ann_item:
                continue
            process_one_ann_set(ann_item)
    if 'Target' in sent:
        process_one_ann_set(sent)

    return ret


def process_doc(docs, dst_path: str):
    # One Concrete communication per document, all packed into a single .tar.gz archive.
    writer = CommunicationWriterTGZ(dst_path)
    for doc in tqdm(docs):
        sentences = list()
        for sent in doc.sentence:
            sentences.append(process_sentence(sent))
        comm = concrete_doc(sentences, doc.filename)
        writer.write(comm, comm.id + '.concrete')
    writer.close()


def process_exemplar(dst_path, fn):
    bar = tqdm()
    raw_annotations = list()
    print('Loading exemplars...')
    try:
        for ann_sent in fn.annotations(full_text=False):
            if 'Target' not in ann_sent:
                continue
            bar.update()
            raw_annotations.append(ann_sent)
    except RuntimeError:
        # NLTK may raise while iterating exemplar annotations; keep whatever was loaded so far.
        pass
    finally:
        bar.close()

    # All exemplar sentences are bundled into a single communication.
    char_idx_offset = 0
    sentences = list()
    for sent in raw_annotations:
        sentences.append(process_sentence(sent))
        char_idx_offset += len(sent.text) + 1
    comm = concrete_doc(sentences, 'exemplar')
    CommunicationWriterTGZ(dst_path).write(comm, 'exemplar.concrete')


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'dst', metavar='DESTINATION', type=str,
        help='Destination folder path.'
    )
    parser.add_argument(
        '-v', metavar='VERSION', default='1.7', type=str, choices=['1.5', '1.7'],
        help='Version of FrameNet. Either 1.5 or 1.7.'
    )
    args = parser.parse_args()
    fn = framenet if args.v == '1.7' else framenet15
    os.makedirs(args.dst, exist_ok=True)

    # Group full-text documents into train/dev/test according to the standard split.
    doc_group = defaultdict(list)
    for doc in fn.docs():
        if doc.filename in framenet_split['dev']:
            doc_group['dev'].append(doc)
        elif doc.filename in framenet_split['test']:
            doc_group['test'].append(doc)
        else:
            doc_group['train'].append(doc)
    for sp in framenet_split:
        print(f'Loaded {len(doc_group[sp])} docs for {sp}.')

    for sp in framenet_split:
        process_doc(doc_group[sp], dst_path=os.path.join(args.dst, f'{sp}.tar.gz'))

    process_exemplar(os.path.join(args.dst, 'exemplar.tar.gz'), fn)


if __name__ == '__main__':
    run()