#!/usr/bin/env python
# -*- coding: utf-8 -*-
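"""Shared helpers: converting tokenized sentences into a tf.data.Dataset,
a numerically stable softmax, thin filesystem wrappers, and a Hugging Face
Hub download helper."""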

from tensorflow import data, TensorSpec, int64, int32
from math import exp
from os import makedirs
from shutil import rmtree, move, copytree
from huggingface_hub import hf_hub_download
import os


def get_features(tokenizer, sentences, labels):
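    """Encode sentences into fixed-length padded features and pair them
    with their integer labels in a tf.data.Dataset."""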
    features = []
    for i, sentence in enumerate(sentences):
        inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=tokenizer.model_max_length,
            # Truncate over-long inputs so the length assertion below holds.
            truncation=True
        )
        input_ids, token_type_ids = \
            inputs['input_ids'], inputs['token_type_ids']
        padding_length = tokenizer.model_max_length - len(input_ids)

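        # Pad manually to exactly model_max_length tokens, on whichever
        # side the tokenizer pads.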
        if tokenizer.padding_side == 'right':
            attention_mask = [1] * len(input_ids) + [0] * padding_length
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            token_type_ids = token_type_ids + \
                [tokenizer.pad_token_type_id] * padding_length
        else:
            attention_mask = [0] * padding_length + [1] * len(input_ids)
            input_ids = [tokenizer.pad_token_id] * padding_length + input_ids
            token_type_ids = \
                [tokenizer.pad_token_type_id] * padding_length + token_type_ids

        assert tokenizer.model_max_length \
            == len(attention_mask) \
            == len(input_ids) \
            == len(token_type_ids)

        feature = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'label': int(labels[i])
        }

        features.append(feature)

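    # Stream the encoded features through a generator so tf.data can
    # build the dataset lazily.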
    def gen():
        for feature in features:
            yield (
                {
                    'input_ids': feature['input_ids'],
                    'attention_mask': feature['attention_mask'],
                    'token_type_ids': feature['token_type_ids'],
                },
                feature['label'],
            )

    dataset = data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': TensorSpec(shape=[None], dtype=int32),
                'attention_mask': TensorSpec(shape=[None], dtype=int32),
                'token_type_ids': TensorSpec(shape=[None], dtype=int32),
            },
            TensorSpec(shape=[], dtype=int64),
        ),
    )

    return dataset
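
# Usage sketch (assumes the `transformers` package, which this module does
# not itself import):
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#     dataset = get_features(tokenizer, ['an example sentence'], [1])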


def softmax(values):
    # Shift by the max so exp() cannot overflow for large inputs;
    # e.g. softmax([1.0, 2.0, 3.0]) ~ (0.090, 0.245, 0.665).
    max_value = max(values)
    exps = [exp(value - max_value) for value in values]
    exps_sum = sum(exps)
    return tuple(value / exps_sum for value in exps)


def make_dir(path):
    makedirs(path, exist_ok=True)


def remove_dir(path):
    rmtree(path)


def copy_dir(source_path, target_path):
    copytree(source_path, target_path)


def move_dir(source_path, target_path):
    move(source_path, target_path)


def download_from_hub(repo_id, filename, revision=None, cache_dir=None):
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        revision=revision,
        cache_dir=cache_dir,
    )

    if cache_dir is not None:
        # The hub cache stores the download under a hash-based name next to
        # '.lock' and '.json' bookkeeping files; rename the payload to the
        # requested filename and remove the bookkeeping files.
        for f in os.listdir(cache_dir):
            if f.endswith('.lock'):
                name = f[:-len('.lock')]
                os.rename(os.path.join(cache_dir, name),
                          os.path.join(cache_dir, filename))
                os.remove(os.path.join(cache_dir, name + '.lock'))
                os.remove(os.path.join(cache_dir, name + '.json'))