pantelis-ninja commited on
Commit
f378b77
·
1 Parent(s): ef82fa0

add code for huggingface endpoints capability

Browse files
handler.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ from transformers import AutoConfig, AutoTokenizer
3
+ from src.models import DNikudModel, ModelConfig
4
+ from src.running_params import BATCH_SIZE, MAX_LENGTH_SEN
5
+ from src.utiles_data import Nikud
6
+ from src.models_utils import predict_single
7
+ import torch
8
+ import os
9
+
10
+
11
+ class EndpointHandler:
12
+ def __init__(self, path=""):
13
+ self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
+
15
+ self.tokenizer = AutoTokenizer.from_pretrained("tau/tavbert-he")
16
+ dir_model_config = os.path.join("models", "config.yml")
17
+ self.config = ModelConfig.load_from_file(dir_model_config)
18
+ self.model = DNikudModel(
19
+ self.config,
20
+ len(Nikud.label_2_id["nikud"]),
21
+ len(Nikud.label_2_id["dagesh"]),
22
+ len(Nikud.label_2_id["sin"]),
23
+ device=self.DEVICE,
24
+ ).to(self.DEVICE)
25
+
26
+ def back_2_text(self, labels, text):
27
+ nikud = Nikud()
28
+ new_line = ""
29
+ for indx_char, c in enumerate(text):
30
+ new_line += (
31
+ c
32
+ + nikud.id_2_char(labels[0][1][1], "dagesh")
33
+ + nikud.id_2_char(labels[0][1][2], "sin")
34
+ + nikud.id_2_char(labels[0][1][0], "nikud")
35
+ )
36
+ print(indx_char, c)
37
+ print(labels)
38
+ return new_line
39
+
40
+ def predict_single_text(
41
+ self,
42
+ text,
43
+ ):
44
+ data = self.tokenizer(text, return_tensors="pt")
45
+ all_labels = predict_single(self.model, data, self.DEVICE)
46
+ return all_labels
47
+
48
+ def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
49
+ """
50
+ data args:
51
+ """
52
+
53
+ # get inputs
54
+ inputs = data.pop("text", data)
55
+
56
+ # run normal prediction
57
+ prediction = self.predict_single_text(inputs)
58
+
59
+ # result = []
60
+ # for pred in prediction:
61
+ # result.append(self.back_2_text(pred, inputs))
62
+ result = self.back_2_text(prediction, inputs)
63
+ return result
main.py ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general
2
+ import argparse
3
+ import os
4
+ import sys
5
+ from datetime import datetime
6
+ import logging
7
+ from logging.handlers import RotatingFileHandler
8
+ from pathlib import Path
9
+
10
+ # ML
11
+ import torch
12
+ import torch.nn as nn
13
+ from transformers import AutoConfig, AutoTokenizer
14
+
15
+ # DL
16
+ from src.models import DNikudModel, ModelConfig
17
+ from src.models_utils import training, evaluate, predict
18
+ from src.plot_helpers import (
19
+ generate_plot_by_nikud_dagesh_sin_dict,
20
+ generate_word_and_letter_accuracy_plot,
21
+ )
22
+ from src.running_params import BATCH_SIZE, MAX_LENGTH_SEN
23
+ from src.utiles_data import (
24
+ NikudDataset,
25
+ Nikud,
26
+ create_missing_folders,
27
+ extract_text_to_compare_nakdimon,
28
+ )
29
+
30
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
31
+ assert DEVICE == "cuda"
32
+
33
+
34
+ def get_logger(
35
+ log_level, name_func, date_time=datetime.now().strftime("%d_%m_%y__%H_%M")
36
+ ):
37
+ log_location = os.path.join(
38
+ os.path.join(Path(__file__).parent, "logging"),
39
+ f"log_model_{name_func}_{date_time}",
40
+ )
41
+ create_missing_folders(log_location)
42
+
43
+ log_format = "%(asctime)s %(levelname)-8s Thread_%(thread)-6d ::: %(funcName)s(%(lineno)d) ::: %(message)s"
44
+ logger = logging.getLogger("algo")
45
+ logger.setLevel(getattr(logging, log_level))
46
+ cnsl_log_formatter = logging.Formatter(log_format)
47
+ cnsl_handler = logging.StreamHandler()
48
+ cnsl_handler.setFormatter(cnsl_log_formatter)
49
+ cnsl_handler.setLevel(log_level)
50
+ logger.addHandler(cnsl_handler)
51
+
52
+ create_missing_folders(log_location)
53
+
54
+ file_location = os.path.join(log_location, "Diacritization_Model_DEBUG.log")
55
+ file_log_formatter = logging.Formatter(log_format)
56
+
57
+ SINGLE_LOG_SIZE = 2 * 1024 * 1024 # in Bytes
58
+ MAX_LOG_FILES = 20
59
+ file_handler = RotatingFileHandler(
60
+ file_location, mode="a", maxBytes=SINGLE_LOG_SIZE, backupCount=MAX_LOG_FILES
61
+ )
62
+ file_handler.setFormatter(file_log_formatter)
63
+ file_handler.setLevel(log_level)
64
+ logger.addHandler(file_handler)
65
+
66
+ return logger
67
+
68
+
69
+ def evaluate_text(
70
+ path,
71
+ dnikud_model,
72
+ tokenizer_tavbert,
73
+ logger,
74
+ plots_folder=None,
75
+ batch_size=BATCH_SIZE,
76
+ ):
77
+ path_name = os.path.basename(path)
78
+
79
+ msg = f"evaluate text: {path_name} on D-nikud Model"
80
+ logger.debug(msg)
81
+
82
+ if os.path.isfile(path):
83
+ dataset = NikudDataset(
84
+ tokenizer_tavbert, file=path, logger=logger, max_length=MAX_LENGTH_SEN
85
+ )
86
+ elif os.path.isdir(path):
87
+ dataset = NikudDataset(
88
+ tokenizer_tavbert, folder=path, logger=logger, max_length=MAX_LENGTH_SEN
89
+ )
90
+ else:
91
+ raise Exception("input path doesnt exist")
92
+
93
+ dataset.prepare_data(name="evaluate")
94
+ mtb_dl = torch.utils.data.DataLoader(dataset.prepered_data, batch_size=batch_size)
95
+
96
+ word_level_correct, letter_level_correct_dev = evaluate(
97
+ dnikud_model, mtb_dl, plots_folder, device=DEVICE
98
+ )
99
+
100
+ msg = (
101
+ f"Dnikud Model\n{path_name} evaluate\nLetter level accuracy:{letter_level_correct_dev}\n"
102
+ f"Word level accuracy: {word_level_correct}"
103
+ )
104
+ logger.debug(msg)
105
+
106
+
107
+ def predict_text(
108
+ text_file,
109
+ tokenizer_tavbert,
110
+ output_file,
111
+ logger,
112
+ dnikud_model,
113
+ compare_nakdimon=False,
114
+ ):
115
+ dataset = NikudDataset(
116
+ tokenizer_tavbert, file=text_file, logger=logger, max_length=MAX_LENGTH_SEN
117
+ )
118
+
119
+ dataset.prepare_data(name="prediction")
120
+ mtb_prediction_dl = torch.utils.data.DataLoader(
121
+ dataset.prepered_data, batch_size=BATCH_SIZE
122
+ )
123
+ all_labels = predict(dnikud_model, mtb_prediction_dl, DEVICE)
124
+ text_data_with_labels = dataset.back_2_text(labels=all_labels)
125
+
126
+ if output_file is None:
127
+ for line in text_data_with_labels:
128
+ print(line)
129
+ else:
130
+ with open(output_file, "w", encoding="utf-8") as f:
131
+ if compare_nakdimon:
132
+ f.write(extract_text_to_compare_nakdimon(text_data_with_labels))
133
+ else:
134
+ f.write(text_data_with_labels)
135
+
136
+
137
+ def predict_folder(
138
+ folder,
139
+ output_folder,
140
+ logger,
141
+ tokenizer_tavbert,
142
+ dnikud_model,
143
+ compare_nakdimon=False,
144
+ ):
145
+ create_missing_folders(output_folder)
146
+
147
+ for filename in os.listdir(folder):
148
+ file_path = os.path.join(folder, filename)
149
+
150
+ if filename.lower().endswith(".txt") and os.path.isfile(file_path):
151
+ output_file = os.path.join(output_folder, filename)
152
+ predict_text(
153
+ file_path,
154
+ output_file=output_file,
155
+ logger=logger,
156
+ tokenizer_tavbert=tokenizer_tavbert,
157
+ dnikud_model=dnikud_model,
158
+ compare_nakdimon=compare_nakdimon,
159
+ )
160
+ elif (
161
+ os.path.isdir(file_path) and filename != ".git" and filename != "README.md"
162
+ ):
163
+ sub_folder = file_path
164
+ sub_folder_output = os.path.join(output_folder, filename)
165
+ predict_folder(
166
+ sub_folder,
167
+ sub_folder_output,
168
+ logger,
169
+ tokenizer_tavbert,
170
+ dnikud_model,
171
+ compare_nakdimon=compare_nakdimon,
172
+ )
173
+
174
+
175
+ def update_compare_folder(folder, output_folder):
176
+ create_missing_folders(output_folder)
177
+
178
+ for filename in os.listdir(folder):
179
+ file_path = os.path.join(folder, filename)
180
+
181
+ if filename.lower().endswith(".txt") and os.path.isfile(file_path):
182
+ output_file = os.path.join(output_folder, filename)
183
+ with open(file_path, "r", encoding="utf-8") as f:
184
+ text_data_with_labels = f.read()
185
+ with open(output_file, "w", encoding="utf-8") as f:
186
+ f.write(extract_text_to_compare_nakdimon(text_data_with_labels))
187
+ elif os.path.isdir(file_path) and filename != ".git":
188
+ sub_folder = file_path
189
+ sub_folder_output = os.path.join(output_folder, filename)
190
+ update_compare_folder(sub_folder, sub_folder_output)
191
+
192
+
193
+ def check_files_excepted(folder):
194
+ for filename in os.listdir(folder):
195
+ file_path = os.path.join(folder, filename)
196
+
197
+ if filename.lower().endswith(".txt") and os.path.isfile(file_path):
198
+ try:
199
+ x = NikudDataset(None, file=file_path)
200
+ except:
201
+ print(f"failed in file: {filename}")
202
+ elif os.path.isdir(file_path) and filename != ".git":
203
+ check_files_excepted(file_path)
204
+
205
+
206
+ def do_predict(
207
+ input_path, output_path, tokenizer_tavbert, logger, dnikud_model, compare_nakdimon
208
+ ):
209
+ if os.path.isdir(input_path):
210
+ predict_folder(
211
+ input_path,
212
+ output_path,
213
+ logger,
214
+ tokenizer_tavbert,
215
+ dnikud_model,
216
+ compare_nakdimon=compare_nakdimon,
217
+ )
218
+ elif os.path.isfile(input_path):
219
+ predict_text(
220
+ input_path,
221
+ output_file=output_path,
222
+ logger=logger,
223
+ tokenizer_tavbert=tokenizer_tavbert,
224
+ dnikud_model=dnikud_model,
225
+ compare_nakdimon=compare_nakdimon,
226
+ )
227
+ else:
228
+ raise Exception("Input file not exist")
229
+
230
+
231
+ def evaluate_folder(folder_path, logger, dnikud_model, tokenizer_tavbert, plots_folder):
232
+ msg = f"evaluate sub folder: {folder_path}"
233
+ logger.info(msg)
234
+
235
+ evaluate_text(
236
+ folder_path,
237
+ dnikud_model=dnikud_model,
238
+ tokenizer_tavbert=tokenizer_tavbert,
239
+ logger=logger,
240
+ plots_folder=plots_folder,
241
+ batch_size=BATCH_SIZE,
242
+ )
243
+
244
+ msg = f"\n***************************************\n"
245
+ logger.info(msg)
246
+
247
+ for sub_folder_name in os.listdir(folder_path):
248
+ sub_folder_path = os.path.join(folder_path, sub_folder_name)
249
+
250
+ if (
251
+ not os.path.isdir(sub_folder_path)
252
+ or sub_folder_path == ".git"
253
+ or "not_use" in sub_folder_path
254
+ or "NakdanResults" in sub_folder_path
255
+ ):
256
+ continue
257
+
258
+ evaluate_folder(
259
+ sub_folder_path, logger, dnikud_model, tokenizer_tavbert, plots_folder
260
+ )
261
+
262
+
263
+ def do_evaluate(
264
+ input_path,
265
+ logger,
266
+ dnikud_model,
267
+ tokenizer_tavbert,
268
+ plots_folder,
269
+ eval_sub_folders=False,
270
+ ):
271
+ msg = f"evaluate all_data: {input_path}"
272
+ logger.info(msg)
273
+
274
+ evaluate_text(
275
+ input_path,
276
+ dnikud_model=dnikud_model,
277
+ tokenizer_tavbert=tokenizer_tavbert,
278
+ logger=logger,
279
+ plots_folder=plots_folder,
280
+ batch_size=BATCH_SIZE,
281
+ )
282
+
283
+ msg = f"\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n"
284
+ logger.info(msg)
285
+
286
+ if eval_sub_folders:
287
+ for sub_folder_name in os.listdir(input_path):
288
+ sub_folder_path = os.path.join(input_path, sub_folder_name)
289
+
290
+ if (
291
+ not os.path.isdir(sub_folder_path)
292
+ or sub_folder_path == ".git"
293
+ or "not_use" in sub_folder_path
294
+ or "NakdanResults" in sub_folder_path
295
+ ):
296
+ continue
297
+
298
+ evaluate_folder(
299
+ sub_folder_path, logger, dnikud_model, tokenizer_tavbert, plots_folder
300
+ )
301
+
302
+
303
+ def do_train(
304
+ logger,
305
+ plots_folder,
306
+ dir_model_config,
307
+ tokenizer_tavbert,
308
+ dnikud_model,
309
+ output_trained_model_dir,
310
+ data_folder,
311
+ n_epochs,
312
+ checkpoints_frequency,
313
+ learning_rate,
314
+ batch_size,
315
+ ):
316
+ msg = "Loading data..."
317
+ logger.debug(msg)
318
+
319
+ dataset_train = NikudDataset(
320
+ tokenizer_tavbert,
321
+ folder=os.path.join(data_folder, "train"),
322
+ logger=logger,
323
+ max_length=MAX_LENGTH_SEN,
324
+ is_train=True,
325
+ )
326
+ dataset_dev = NikudDataset(
327
+ tokenizer=tokenizer_tavbert,
328
+ folder=os.path.join(data_folder, "dev"),
329
+ logger=logger,
330
+ max_length=dataset_train.max_length,
331
+ is_train=True,
332
+ )
333
+ dataset_test = NikudDataset(
334
+ tokenizer=tokenizer_tavbert,
335
+ folder=os.path.join(data_folder, "test"),
336
+ logger=logger,
337
+ max_length=dataset_train.max_length,
338
+ is_train=True,
339
+ )
340
+
341
+ dataset_train.show_data_labels(plots_folder=plots_folder)
342
+
343
+ msg = f"Max length of data: {dataset_train.max_length}"
344
+ logger.debug(msg)
345
+
346
+ msg = (
347
+ f"Num rows in train data: {len(dataset_train.data)}, "
348
+ f"Num rows in dev data: {len(dataset_dev.data)}, "
349
+ f"Num rows in test data: {len(dataset_test.data)}"
350
+ )
351
+ logger.debug(msg)
352
+
353
+ msg = "Loading tokenizer and prepare data..."
354
+ logger.debug(msg)
355
+
356
+ dataset_train.prepare_data(name="train")
357
+ dataset_dev.prepare_data(name="dev")
358
+ dataset_test.prepare_data(name="test")
359
+
360
+ mtb_train_dl = torch.utils.data.DataLoader(
361
+ dataset_train.prepered_data, batch_size=batch_size
362
+ )
363
+ mtb_dev_dl = torch.utils.data.DataLoader(
364
+ dataset_dev.prepered_data, batch_size=batch_size
365
+ )
366
+
367
+ if not os.path.isfile(dir_model_config):
368
+ our_model_config = ModelConfig(dataset_train.max_length)
369
+ our_model_config.save_to_file(dir_model_config)
370
+
371
+ optimizer = torch.optim.Adam(dnikud_model.parameters(), lr=learning_rate)
372
+
373
+ msg = "training..."
374
+ logger.debug(msg)
375
+
376
+ criterion_nikud = nn.CrossEntropyLoss(ignore_index=Nikud.PAD_OR_IRRELEVANT).to(
377
+ DEVICE
378
+ )
379
+ criterion_dagesh = nn.CrossEntropyLoss(ignore_index=Nikud.PAD_OR_IRRELEVANT).to(
380
+ DEVICE
381
+ )
382
+ criterion_sin = nn.CrossEntropyLoss(ignore_index=Nikud.PAD_OR_IRRELEVANT).to(DEVICE)
383
+
384
+ training_params = {
385
+ "n_epochs": n_epochs,
386
+ "checkpoints_frequency": checkpoints_frequency,
387
+ }
388
+ (
389
+ best_model_details,
390
+ best_accuracy,
391
+ epochs_loss_train_values,
392
+ steps_loss_train_values,
393
+ loss_dev_values,
394
+ accuracy_dev_values,
395
+ ) = training(
396
+ dnikud_model,
397
+ mtb_train_dl,
398
+ mtb_dev_dl,
399
+ criterion_nikud,
400
+ criterion_dagesh,
401
+ criterion_sin,
402
+ training_params,
403
+ logger,
404
+ output_trained_model_dir,
405
+ optimizer,
406
+ device=DEVICE,
407
+ )
408
+
409
+ generate_plot_by_nikud_dagesh_sin_dict(
410
+ epochs_loss_train_values, "Train epochs loss", "Loss", plots_folder
411
+ )
412
+ generate_plot_by_nikud_dagesh_sin_dict(
413
+ steps_loss_train_values, "Train steps loss", "Loss", plots_folder
414
+ )
415
+ generate_plot_by_nikud_dagesh_sin_dict(
416
+ loss_dev_values, "Dev epochs loss", "Loss", plots_folder
417
+ )
418
+ generate_plot_by_nikud_dagesh_sin_dict(
419
+ accuracy_dev_values, "Dev accuracy", "Accuracy", plots_folder
420
+ )
421
+ generate_word_and_letter_accuracy_plot(
422
+ accuracy_dev_values, "Accuracy", plots_folder
423
+ )
424
+
425
+ msg = "Done"
426
+ logger.info(msg)
427
+
428
+
429
+ if __name__ == "__main__":
430
+ tokenizer_tavbert = AutoTokenizer.from_pretrained("tau/tavbert-he")
431
+
432
+ parser = argparse.ArgumentParser(
433
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
434
+ description="""Predict D-nikud""",
435
+ )
436
+ parser.add_argument(
437
+ "-l",
438
+ "--log",
439
+ dest="log_level",
440
+ choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
441
+ default="DEBUG",
442
+ help="Set the logging level",
443
+ )
444
+ parser.add_argument(
445
+ "-m",
446
+ "--output_model_dir",
447
+ type=str,
448
+ default="models",
449
+ help="save directory for model",
450
+ )
451
+ subparsers = parser.add_subparsers(
452
+ help="sub-command help", dest="command", required=True
453
+ )
454
+
455
+ parser_predict = subparsers.add_parser("predict", help="diacritize a text files ")
456
+ parser_predict.add_argument("input_path", help="input file or folder")
457
+ parser_predict.add_argument("output_path", help="output file")
458
+ parser_predict.add_argument(
459
+ "-ptmp",
460
+ "--pretrain_model_path",
461
+ type=str,
462
+ default=os.path.join(Path(__file__).parent, "models", "Dnikud_best_model.pth"),
463
+ help="pre-train model path - use only if you want to use trained model weights",
464
+ )
465
+ parser_predict.add_argument(
466
+ "-c",
467
+ "--compare",
468
+ dest="compare_nakdimon",
469
+ default=False,
470
+ help="predict text for comparing with Nakdimon",
471
+ )
472
+ parser_predict.set_defaults(func=do_predict)
473
+
474
+ parser_evaluate = subparsers.add_parser("evaluate", help="evaluate D-nikud")
475
+ parser_evaluate.add_argument("input_path", help="input file or folder")
476
+ parser_evaluate.add_argument(
477
+ "-ptmp",
478
+ "--pretrain_model_path",
479
+ type=str,
480
+ default=os.path.join(Path(__file__).parent, "models", "Dnikud_best_model.pth"),
481
+ help="pre-train model path - use only if you want to use trained model weights",
482
+ )
483
+ parser_evaluate.add_argument(
484
+ "-df",
485
+ "--plots_folder",
486
+ dest="plots_folder",
487
+ default=os.path.join(Path(__file__).parent, "plots"),
488
+ help="set the debug folder",
489
+ )
490
+ parser_evaluate.add_argument(
491
+ "-es",
492
+ "--eval_sub_folders",
493
+ dest="eval_sub_folders",
494
+ default=False,
495
+ help="accuracy calculation includes the evaluation of sub-folders "
496
+ "within the input_path folder, providing independent assessments "
497
+ "for each subfolder.",
498
+ )
499
+ parser_evaluate.set_defaults(func=do_evaluate)
500
+
501
+ # train --n_epochs 20
502
+
503
+ parser_train = subparsers.add_parser("train", help="train D-nikud")
504
+ parser_train.add_argument(
505
+ "-ptmp",
506
+ "--pretrain_model_path",
507
+ type=str,
508
+ default=None,
509
+ help="pre-train model path - use only if you want to use trained model weights",
510
+ )
511
+ parser_train.add_argument(
512
+ "--learning_rate", type=float, default=0.001, help="Learning rate"
513
+ )
514
+ parser_train.add_argument("--batch_size", type=int, default=32, help="batch_size")
515
+ parser_train.add_argument(
516
+ "--n_epochs", type=int, default=10, help="number of epochs"
517
+ )
518
+ parser_train.add_argument(
519
+ "--data_folder",
520
+ dest="data_folder",
521
+ default=os.path.join(Path(__file__).parent, "data"),
522
+ help="Set the debug folder",
523
+ )
524
+ parser_train.add_argument(
525
+ "--checkpoints_frequency",
526
+ type=int,
527
+ default=1,
528
+ help="checkpoints frequency for save the model",
529
+ )
530
+ parser_train.add_argument(
531
+ "-df",
532
+ "--plots_folder",
533
+ dest="plots_folder",
534
+ default=os.path.join(Path(__file__).parent, "plots"),
535
+ help="Set the debug folder",
536
+ )
537
+ parser_train.set_defaults(func=do_train)
538
+
539
+ args = parser.parse_args()
540
+ kwargs = vars(args).copy()
541
+ date_time = datetime.now().strftime("%d_%m_%y__%H_%M")
542
+ logger = get_logger(kwargs["log_level"], args.command, date_time)
543
+
544
+ del kwargs["log_level"]
545
+
546
+ kwargs["tokenizer_tavbert"] = tokenizer_tavbert
547
+ kwargs["logger"] = logger
548
+
549
+ msg = "Loading model..."
550
+ logger.debug(msg)
551
+
552
+ if args.command in ["evaluate", "predict"] or (
553
+ args.command == "train" and args.pretrain_model_path is not None
554
+ ):
555
+ dir_model_config = os.path.join("models", "config.yml")
556
+ config = ModelConfig.load_from_file(dir_model_config)
557
+
558
+ dnikud_model = DNikudModel(
559
+ config,
560
+ len(Nikud.label_2_id["nikud"]),
561
+ len(Nikud.label_2_id["dagesh"]),
562
+ len(Nikud.label_2_id["sin"]),
563
+ device=DEVICE,
564
+ ).to(DEVICE)
565
+ state_dict_model = dnikud_model.state_dict()
566
+ state_dict_model.update(torch.load(args.pretrain_model_path))
567
+ dnikud_model.load_state_dict(state_dict_model)
568
+ else:
569
+ base_model_name = "tau/tavbert-he"
570
+ config = AutoConfig.from_pretrained(base_model_name)
571
+ dnikud_model = DNikudModel(
572
+ config,
573
+ len(Nikud.label_2_id["nikud"]),
574
+ len(Nikud.label_2_id["dagesh"]),
575
+ len(Nikud.label_2_id["sin"]),
576
+ pretrain_model=base_model_name,
577
+ device=DEVICE,
578
+ ).to(DEVICE)
579
+
580
+ if args.command == "train":
581
+ output_trained_model_dir = os.path.join(
582
+ kwargs["output_model_dir"], "latest", f"output_models_{date_time}"
583
+ )
584
+ create_missing_folders(output_trained_model_dir)
585
+ dir_model_config = os.path.join(kwargs["output_model_dir"], "config.yml")
586
+ kwargs["dir_model_config"] = dir_model_config
587
+ kwargs["output_trained_model_dir"] = output_trained_model_dir
588
+ del kwargs["pretrain_model_path"]
589
+ del kwargs["output_model_dir"]
590
+ kwargs["dnikud_model"] = dnikud_model
591
+
592
+ del kwargs["command"]
593
+ del kwargs["func"]
594
+ args.func(**kwargs)
595
+
596
+ sys.exit(0)
models/config.yml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _commit_hash: 41265b09a862144b2517afdfd46da4388f1380df
2
+ _name_or_path: tau/tavbert-he
3
+ add_cross_attention: false
4
+ architectures:
5
+ - RobertaForMaskedLM
6
+ attention_probs_dropout_prob: 0.1
7
+ bad_words_ids: null
8
+ begin_suppress_tokens: null
9
+ bos_token_id: 0
10
+ chunk_size_feed_forward: 0
11
+ classifier_dropout: null
12
+ cross_attention_hidden_size: null
13
+ decoder_start_token_id: null
14
+ diversity_penalty: 0.0
15
+ do_sample: false
16
+ early_stopping: false
17
+ encoder_no_repeat_ngram_size: 0
18
+ eos_token_id: 2
19
+ exponential_decay_length_penalty: null
20
+ finetuning_task: null
21
+ forced_bos_token_id: null
22
+ forced_eos_token_id: null
23
+ gradient_checkpointing: false
24
+ hidden_act: gelu
25
+ hidden_dropout_prob: 0.1
26
+ hidden_size: 768
27
+ id2label:
28
+ 0: LABEL_0
29
+ 1: LABEL_1
30
+ initializer_range: 0.02
31
+ intermediate_size: 3072
32
+ is_decoder: false
33
+ is_encoder_decoder: false
34
+ label2id:
35
+ LABEL_0: 0
36
+ LABEL_1: 1
37
+ layer_norm_eps: 1.0e-05
38
+ length_penalty: 1.0
39
+ max_length: 512
40
+ max_position_embeddings: 2050
41
+ min_length: 0
42
+ model_type: roberta
43
+ no_repeat_ngram_size: 0
44
+ num_attention_heads: 12
45
+ num_beam_groups: 1
46
+ num_beams: 1
47
+ num_hidden_layers: 12
48
+ num_return_sequences: 1
49
+ output_attentions: false
50
+ output_hidden_states: false
51
+ output_scores: false
52
+ pad_token_id: 1
53
+ position_embedding_type: absolute
54
+ prefix: null
55
+ problem_type: null
56
+ pruned_heads: {}
57
+ remove_invalid_values: false
58
+ repetition_penalty: 1.0
59
+ return_dict: true
60
+ return_dict_in_generate: false
61
+ sep_token_id: null
62
+ suppress_tokens: null
63
+ task_specific_params: null
64
+ temperature: 1.0
65
+ tf_legacy_loss: false
66
+ tie_encoder_decoder: false
67
+ tie_word_embeddings: true
68
+ tokenizer_class: null
69
+ top_k: 50
70
+ top_p: 1.0
71
+ torch_dtype: null
72
+ torchscript: false
73
+ transformers_version: 4.6.0.dev0
74
+ type_vocab_size: 2
75
+ typical_p: 1.0
76
+ use_bfloat16: false
77
+ use_cache: true
78
+ vocab_size: 345
src/models.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general
2
+ import subprocess
3
+ import yaml
4
+
5
+ # ML
6
+ import torch.nn as nn
7
+ from transformers import AutoConfig, RobertaForMaskedLM, PretrainedConfig
8
+
9
+
10
+ class DNikudModel(nn.Module):
11
+ def __init__(self, config, nikud_size, dagesh_size, sin_size, pretrain_model=None, device='cpu'):
12
+ super(DNikudModel, self).__init__()
13
+
14
+ if pretrain_model is not None:
15
+ model_base = RobertaForMaskedLM.from_pretrained(pretrain_model).to(device)
16
+ else:
17
+ model_base = RobertaForMaskedLM(config=config).to(device)
18
+
19
+ self.model = model_base.roberta
20
+ for name, param in self.model.named_parameters():
21
+ param.requires_grad = False
22
+
23
+ self.lstm1 = nn.LSTM(config.hidden_size, config.hidden_size, bidirectional=True, dropout=0.1, batch_first=True)
24
+ self.lstm2 = nn.LSTM(2 * config.hidden_size, config.hidden_size, bidirectional=True, dropout=0.1, batch_first=True)
25
+ self.dense = nn.Linear(2 * config.hidden_size, config.hidden_size)
26
+ self.out_n = nn.Linear(config.hidden_size, nikud_size)
27
+ self.out_d = nn.Linear(config.hidden_size, dagesh_size)
28
+ self.out_s = nn.Linear(config.hidden_size, sin_size)
29
+
30
+ def forward(self, input_ids, attention_mask):
31
+ last_hidden_state = self.model(input_ids, attention_mask=attention_mask).last_hidden_state
32
+ lstm1, _ = self.lstm1(last_hidden_state)
33
+ lstm2, _ = self.lstm2(lstm1)
34
+ dense = self.dense(lstm2)
35
+
36
+ nikud = self.out_n(dense)
37
+ dagesh = self.out_d(dense)
38
+ sin = self.out_s(dense)
39
+
40
+ return nikud, dagesh, sin
41
+
42
+
43
+ def get_git_commit_hash():
44
+ try:
45
+ commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip()
46
+ return commit_hash
47
+ except subprocess.CalledProcessError:
48
+ # This will be raised if you're not in a Git repository
49
+ print("Not inside a Git repository!")
50
+ return None
51
+
52
+
53
+ class ModelConfig(PretrainedConfig):
54
+ def __init__(self, max_length=None, dict=None):
55
+ super(ModelConfig, self).__init__()
56
+ if dict is None:
57
+ self.__dict__.update(AutoConfig.from_pretrained("tau/tavbert-he").__dict__)
58
+ self.max_length = max_length
59
+ self._commit_hash = get_git_commit_hash()
60
+ else:
61
+ self.__dict__.update(dict)
62
+
63
+ def print(self):
64
+ print(self.__dict__)
65
+
66
+ def save_to_file(self, file_path):
67
+ with open(file_path, "w") as yaml_file:
68
+ yaml.dump(self.__dict__, yaml_file, default_flow_style=False)
69
+
70
+ @classmethod
71
+ def load_from_file(cls, file_path):
72
+ with open(file_path, "r") as yaml_file:
73
+ config_dict = yaml.safe_load(yaml_file)
74
+ return cls(dict=config_dict)
src/models_utils.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general
2
+ import json
3
+ import os
4
+
5
+ # ML
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+
10
+ # visual
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from sklearn.metrics import confusion_matrix
14
+ from tqdm import tqdm
15
+
16
+ from src.running_params import DEBUG_MODE
17
+ from src.utiles_data import Nikud, create_missing_folders
18
+
19
+ CLASSES_LIST = ["nikud", "dagesh", "sin"]
20
+
21
+
22
+ def calc_num_correct_words(input, letter_correct_mask):
23
+ SPACE_TOKEN = 104
24
+ START_SENTENCE_TOKEN = 1
25
+ END_SENTENCE_TOKEN = 2
26
+
27
+ correct_words_count = 0
28
+ words_count = 0
29
+ for index in range(input.shape[0]):
30
+ input[index][np.where(input[index] == SPACE_TOKEN)[0]] = 0
31
+ input[index][np.where(input[index] == START_SENTENCE_TOKEN)[0]] = 0
32
+ input[index][np.where(input[index] == END_SENTENCE_TOKEN)[0]] = 0
33
+ words_end_index = np.concatenate(
34
+ (np.array([-1]), np.where(input[index] == 0)[0])
35
+ )
36
+ is_correct_words_array = [
37
+ bool(
38
+ letter_correct_mask[index][
39
+ list(range((words_end_index[s] + 1), words_end_index[s + 1]))
40
+ ].all()
41
+ )
42
+ for s in range(len(words_end_index) - 1)
43
+ if words_end_index[s + 1] - (words_end_index[s] + 1) > 1
44
+ ]
45
+ correct_words_count += np.array(is_correct_words_array).sum()
46
+ words_count += len(is_correct_words_array)
47
+
48
+ return correct_words_count, words_count
49
+
50
+
51
+ def predict(model, data_loader, device="cpu"):
52
+ model.to(device)
53
+
54
+ all_labels = None
55
+ with torch.no_grad():
56
+ for index_data, data in enumerate(data_loader):
57
+ (inputs, attention_mask, labels_demo) = data
58
+ inputs = inputs.to(device)
59
+ attention_mask = attention_mask.to(device)
60
+ labels_demo = labels_demo.to(device)
61
+
62
+ mask_cant_be_nikud = np.array(labels_demo.cpu())[:, :, 0] == -1
63
+ mask_cant_be_dagesh = np.array(labels_demo.cpu())[:, :, 1] == -1
64
+ mask_cant_be_sin = np.array(labels_demo.cpu())[:, :, 2] == -1
65
+
66
+ nikud_probs, dagesh_probs, sin_probs = model(inputs, attention_mask)
67
+
68
+ pred_nikud = np.array(torch.max(nikud_probs, 2).indices.cpu()).reshape(
69
+ inputs.shape[0], inputs.shape[1], 1
70
+ )
71
+ pred_dagesh = np.array(torch.max(dagesh_probs, 2).indices.cpu()).reshape(
72
+ inputs.shape[0], inputs.shape[1], 1
73
+ )
74
+ pred_sin = np.array(torch.max(sin_probs, 2).indices.cpu()).reshape(
75
+ inputs.shape[0], inputs.shape[1], 1
76
+ )
77
+
78
+ pred_nikud[mask_cant_be_nikud] = -1
79
+ pred_dagesh[mask_cant_be_dagesh] = -1
80
+ pred_sin[mask_cant_be_sin] = -1
81
+
82
+ pred_labels = np.concatenate((pred_nikud, pred_dagesh, pred_sin), axis=2)
83
+
84
+ if all_labels is None:
85
+ all_labels = pred_labels
86
+ else:
87
+ all_labels = np.concatenate((all_labels, pred_labels), axis=0)
88
+
89
+ return all_labels
90
+
91
+
92
+ def predict_single(model, data, device="cpu"):
93
+ # model.to(device)
94
+
95
+ all_labels = None
96
+ with torch.no_grad():
97
+ inputs = data["input_ids"].to(device)
98
+ attention_mask = data["attention_mask"].to(device)
99
+
100
+ # mask_cant_be_nikud = np.array(labels_demo.cpu())[:, :, 0] == -1
101
+ # mask_cant_be_dagesh = np.array(labels_demo.cpu())[:, :, 1] == -1
102
+ # mask_cant_be_sin = np.array(labels_demo.cpu())[:, :, 2] == -1
103
+
104
+ nikud_probs, dagesh_probs, sin_probs = model(inputs, attention_mask)
105
+ print(nikud_probs, dagesh_probs, sin_probs)
106
+
107
+ pred_nikud = np.array(torch.max(nikud_probs, 2).indices.cpu()).reshape(
108
+ inputs.shape[0], inputs.shape[1], 1
109
+ )
110
+ pred_dagesh = np.array(torch.max(dagesh_probs, 2).indices.cpu()).reshape(
111
+ inputs.shape[0], inputs.shape[1], 1
112
+ )
113
+ pred_sin = np.array(torch.max(sin_probs, 2).indices.cpu()).reshape(
114
+ inputs.shape[0], inputs.shape[1], 1
115
+ )
116
+
117
+ # pred_nikud[mask_cant_be_nikud] = -1
118
+ # pred_dagesh[mask_cant_be_dagesh] = -1
119
+ # pred_sin[mask_cant_be_sin] = -1
120
+ # print(pred_nikud, pred_dagesh, pred_sin)
121
+ pred_labels = np.concatenate((pred_nikud, pred_dagesh, pred_sin), axis=2)
122
+ print(pred_labels)
123
+ if all_labels is None:
124
+ all_labels = pred_labels
125
+ else:
126
+ all_labels = np.concatenate((all_labels, pred_labels), axis=0)
127
+
128
+ return all_labels
129
+
130
+
131
+ def training(
132
+ model,
133
+ train_loader,
134
+ dev_loader,
135
+ criterion_nikud,
136
+ criterion_dagesh,
137
+ criterion_sin,
138
+ training_params,
139
+ logger,
140
+ output_model_path,
141
+ optimizer,
142
+ device="cpu",
143
+ ):
144
+ max_length = None
145
+ best_accuracy = 0.0
146
+
147
+ logger.info(f"start training with training_params: {training_params}")
148
+ model = model.to(device)
149
+
150
+ criteria = {
151
+ "nikud": criterion_nikud.to(device),
152
+ "dagesh": criterion_dagesh.to(device),
153
+ "sin": criterion_sin.to(device),
154
+ }
155
+
156
+ output_checkpoints_path = os.path.join(output_model_path, "checkpoints")
157
+ create_missing_folders(output_checkpoints_path)
158
+
159
+ train_steps_loss_values = {"nikud": [], "dagesh": [], "sin": []}
160
+ train_epochs_loss_values = {"nikud": [], "dagesh": [], "sin": []}
161
+ dev_loss_values = {"nikud": [], "dagesh": [], "sin": []}
162
+ dev_accuracy_values = {
163
+ "nikud": [],
164
+ "dagesh": [],
165
+ "sin": [],
166
+ "all_nikud_letter": [],
167
+ "all_nikud_word": [],
168
+ }
169
+
170
+ for epoch in tqdm(range(training_params["n_epochs"]), desc="Training"):
171
+ model.train()
172
+ train_loss = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
173
+ relevant_count = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
174
+
175
+ for index_data, data in enumerate(train_loader):
176
+ (inputs, attention_mask, labels) = data
177
+
178
+ if max_length is None:
179
+ max_length = labels.shape[1]
180
+
181
+ inputs = inputs.to(device)
182
+ attention_mask = attention_mask.to(device)
183
+ labels = labels.to(device)
184
+
185
+ optimizer.zero_grad()
186
+ nikud_probs, dagesh_probs, sin_probs = model(inputs, attention_mask)
187
+
188
+ for i, (probs, class_name) in enumerate(
189
+ zip([nikud_probs, dagesh_probs, sin_probs], CLASSES_LIST)
190
+ ):
191
+ reshaped_tensor = (
192
+ torch.transpose(probs, 1, 2)
193
+ .contiguous()
194
+ .view(probs.shape[0], probs.shape[2], probs.shape[1])
195
+ )
196
+ loss = criteria[class_name](reshaped_tensor, labels[:, :, i]).to(device)
197
+
198
+ num_relevant = (labels[:, :, i] != -1).sum()
199
+ train_loss[class_name] += loss.item() * num_relevant
200
+ relevant_count[class_name] += num_relevant
201
+
202
+ loss.backward(retain_graph=True)
203
+
204
+ for i, class_name in enumerate(CLASSES_LIST):
205
+ train_steps_loss_values[class_name].append(
206
+ float(train_loss[class_name] / relevant_count[class_name])
207
+ )
208
+
209
+ optimizer.step()
210
+ if (index_data + 1) % 100 == 0:
211
+ msg = f"epoch: {epoch} , index_data: {index_data + 1}\n"
212
+ for i, class_name in enumerate(CLASSES_LIST):
213
+ msg += f"mean loss train {class_name}: {float(train_loss[class_name] / relevant_count[class_name])}, "
214
+
215
+ logger.debug(msg[:-2])
216
+
217
+ for i, class_name in enumerate(CLASSES_LIST):
218
+ train_epochs_loss_values[class_name].append(
219
+ float(train_loss[class_name] / relevant_count[class_name])
220
+ )
221
+
222
+ for class_name in train_loss.keys():
223
+ train_loss[class_name] /= relevant_count[class_name]
224
+
225
+ msg = f"Epoch {epoch + 1}/{training_params['n_epochs']}\n"
226
+ for i, class_name in enumerate(CLASSES_LIST):
227
+ msg += f"mean loss train {class_name}: {train_loss[class_name]}, "
228
+ logger.debug(msg[:-2])
229
+
230
+ model.eval()
231
+ dev_loss = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
232
+ dev_accuracy = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
233
+ relevant_count = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
234
+ correct_preds = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
235
+ un_masks = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
236
+ predictions = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
237
+ labels_class = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
238
+
239
+ all_nikud_types_correct_preds_letter = 0.0
240
+
241
+ letter_count = 0.0
242
+ correct_words_count = 0.0
243
+ word_count = 0.0
244
+ with torch.no_grad():
245
+ for index_data, data in enumerate(dev_loader):
246
+ (inputs, attention_mask, labels) = data
247
+ inputs = inputs.to(device)
248
+ attention_mask = attention_mask.to(device)
249
+ labels = labels.to(device)
250
+
251
+ nikud_probs, dagesh_probs, sin_probs = model(inputs, attention_mask)
252
+
253
+ for i, (probs, class_name) in enumerate(
254
+ zip([nikud_probs, dagesh_probs, sin_probs], CLASSES_LIST)
255
+ ):
256
+ reshaped_tensor = (
257
+ torch.transpose(probs, 1, 2)
258
+ .contiguous()
259
+ .view(probs.shape[0], probs.shape[2], probs.shape[1])
260
+ )
261
+ loss = criteria[class_name](reshaped_tensor, labels[:, :, i]).to(
262
+ device
263
+ )
264
+ un_masked = labels[:, :, i] != -1
265
+ num_relevant = un_masked.sum()
266
+ relevant_count[class_name] += num_relevant
267
+ _, preds = torch.max(probs, 2)
268
+ dev_loss[class_name] += loss.item() * num_relevant
269
+ correct_preds[class_name] += torch.sum(
270
+ preds[un_masked] == labels[:, :, i][un_masked]
271
+ )
272
+ un_masks[class_name] = un_masked
273
+ predictions[class_name] = preds
274
+ labels_class[class_name] = labels[:, :, i]
275
+
276
+ un_mask_all_or = torch.logical_or(
277
+ torch.logical_or(un_masks["nikud"], un_masks["dagesh"]),
278
+ un_masks["sin"],
279
+ )
280
+
281
+ correct = {
282
+ class_name: (torch.ones(un_mask_all_or.shape) == 1).to(device)
283
+ for class_name in CLASSES_LIST
284
+ }
285
+
286
+ for i, class_name in enumerate(CLASSES_LIST):
287
+ correct[class_name][un_masks[class_name]] = (
288
+ predictions[class_name][un_masks[class_name]]
289
+ == labels_class[class_name][un_masks[class_name]]
290
+ )
291
+
292
+ letter_correct_mask = torch.logical_and(
293
+ torch.logical_and(correct["sin"], correct["dagesh"]),
294
+ correct["nikud"],
295
+ )
296
+ all_nikud_types_correct_preds_letter += torch.sum(
297
+ letter_correct_mask[un_mask_all_or]
298
+ )
299
+
300
+ letter_correct_mask[~un_mask_all_or] = True
301
+ correct_num, total_words_num = calc_num_correct_words(
302
+ inputs.cpu(), letter_correct_mask
303
+ )
304
+
305
+ word_count += total_words_num
306
+ correct_words_count += correct_num
307
+ letter_count += un_mask_all_or.sum()
308
+
309
+ for class_name in CLASSES_LIST:
310
+ dev_loss[class_name] /= relevant_count[class_name]
311
+ dev_accuracy[class_name] = float(
312
+ correct_preds[class_name].double() / relevant_count[class_name]
313
+ )
314
+
315
+ dev_loss_values[class_name].append(float(dev_loss[class_name]))
316
+ dev_accuracy_values[class_name].append(float(dev_accuracy[class_name]))
317
+
318
+ dev_all_nikud_types_accuracy_letter = float(
319
+ all_nikud_types_correct_preds_letter / letter_count
320
+ )
321
+
322
+ dev_accuracy_values["all_nikud_letter"].append(
323
+ dev_all_nikud_types_accuracy_letter
324
+ )
325
+
326
+ word_all_nikud_accuracy = correct_words_count / word_count
327
+ dev_accuracy_values["all_nikud_word"].append(word_all_nikud_accuracy)
328
+
329
+ msg = (
330
+ f"Epoch {epoch + 1}/{training_params['n_epochs']}\n"
331
+ f'mean loss Dev nikud: {train_loss["nikud"]}, '
332
+ f'mean loss Dev dagesh: {train_loss["dagesh"]}, '
333
+ f'mean loss Dev sin: {train_loss["sin"]}, '
334
+ f"Dev all nikud types letter Accuracy: {dev_all_nikud_types_accuracy_letter}, "
335
+ f'Dev nikud letter Accuracy: {dev_accuracy["nikud"]}, '
336
+ f'Dev dagesh letter Accuracy: {dev_accuracy["dagesh"]}, '
337
+ f'Dev sin letter Accuracy: {dev_accuracy["sin"]}, '
338
+ f"Dev word Accuracy: {word_all_nikud_accuracy}"
339
+ )
340
+ logger.debug(msg)
341
+
342
+ save_progress_details(
343
+ dev_accuracy_values,
344
+ train_epochs_loss_values,
345
+ dev_loss_values,
346
+ train_steps_loss_values,
347
+ )
348
+
349
+ if dev_all_nikud_types_accuracy_letter > best_accuracy:
350
+ best_accuracy = dev_all_nikud_types_accuracy_letter
351
+ best_model = {
352
+ "epoch": epoch,
353
+ "model_state_dict": model.state_dict(),
354
+ "optimizer_state_dict": optimizer.state_dict(),
355
+ "loss": loss,
356
+ }
357
+
358
+ if epoch % training_params["checkpoints_frequency"] == 0:
359
+ save_checkpoint_path = os.path.join(
360
+ output_checkpoints_path, f"checkpoint_model_epoch_{epoch + 1}.pth"
361
+ )
362
+ checkpoint = {
363
+ "epoch": epoch,
364
+ "model_state_dict": model.state_dict(),
365
+ "optimizer_state_dict": optimizer.state_dict(),
366
+ "loss": loss,
367
+ }
368
+ torch.save(checkpoint["model_state_dict"], save_checkpoint_path)
369
+
370
+ save_model_path = os.path.join(output_model_path, "best_model.pth")
371
+ torch.save(best_model["model_state_dict"], save_model_path)
372
+ return (
373
+ best_model,
374
+ best_accuracy,
375
+ train_epochs_loss_values,
376
+ train_steps_loss_values,
377
+ dev_loss_values,
378
+ dev_accuracy_values,
379
+ )
380
+
381
+
382
+ def save_progress_details(
383
+ accuracy_dev_values,
384
+ epochs_loss_train_values,
385
+ loss_dev_values,
386
+ steps_loss_train_values,
387
+ ):
388
+ epochs_data_path = "epochs_data"
389
+ create_missing_folders(epochs_data_path)
390
+
391
+ save_dict_as_json(
392
+ steps_loss_train_values, epochs_data_path, "steps_loss_train_values.json"
393
+ )
394
+ save_dict_as_json(
395
+ epochs_loss_train_values, epochs_data_path, "epochs_loss_train_values.json"
396
+ )
397
+ save_dict_as_json(loss_dev_values, epochs_data_path, "loss_dev_values.json")
398
+ save_dict_as_json(accuracy_dev_values, epochs_data_path, "accuracy_dev_values.json")
399
+
400
+
401
+ def save_dict_as_json(dict, file_path, file_name):
402
+ json_data = json.dumps(dict, indent=4)
403
+ with open(os.path.join(file_path, file_name), "w") as json_file:
404
+ json_file.write(json_data)
405
+
406
+
407
+ def evaluate(model, test_data, plots_folder=None, device="cpu"):
408
+ model.to(device)
409
+ model.eval()
410
+
411
+ true_labels = {"nikud": [], "dagesh": [], "sin": []}
412
+ predictions = {"nikud": 0, "dagesh": 0, "sin": 0}
413
+ predicted_labels_2_report = {"nikud": [], "dagesh": [], "sin": []}
414
+ not_masks = {"nikud": 0, "dagesh": 0, "sin": 0}
415
+ correct_preds = {"nikud": 0, "dagesh": 0, "sin": 0}
416
+ relevant_count = {"nikud": 0, "dagesh": 0, "sin": 0}
417
+ labels_class = {"nikud": 0.0, "dagesh": 0.0, "sin": 0.0}
418
+
419
+ all_nikud_types_letter_level_correct = 0.0
420
+ nikud_letter_level_correct = 0.0
421
+ dagesh_letter_level_correct = 0.0
422
+ sin_letter_level_correct = 0.0
423
+
424
+ letters_count = 0.0
425
+ words_count = 0.0
426
+ correct_words_count = 0.0
427
+ with torch.no_grad():
428
+ for index_data, data in enumerate(test_data):
429
+ if DEBUG_MODE and index_data > 100:
430
+ break
431
+
432
+ (inputs, attention_mask, labels) = data
433
+
434
+ inputs = inputs.to(device)
435
+ attention_mask = attention_mask.to(device)
436
+ labels = labels.to(device)
437
+
438
+ nikud_probs, dagesh_probs, sin_probs = model(inputs, attention_mask)
439
+
440
+ for i, (probs, class_name) in enumerate(
441
+ zip([nikud_probs, dagesh_probs, sin_probs], CLASSES_LIST)
442
+ ):
443
+ labels_class[class_name] = labels[:, :, i]
444
+ not_masked = labels_class[class_name] != -1
445
+ num_relevant = not_masked.sum()
446
+ relevant_count[class_name] += num_relevant
447
+ _, preds = torch.max(probs, 2)
448
+ correct_preds[class_name] += torch.sum(
449
+ preds[not_masked] == labels_class[class_name][not_masked]
450
+ )
451
+ predictions[class_name] = preds
452
+ not_masks[class_name] = not_masked
453
+
454
+ if len(true_labels[class_name]) == 0:
455
+ true_labels[class_name] = (
456
+ labels_class[class_name][not_masked].cpu().numpy()
457
+ )
458
+ else:
459
+ true_labels[class_name] = np.concatenate(
460
+ (
461
+ true_labels[class_name],
462
+ labels_class[class_name][not_masked].cpu().numpy(),
463
+ )
464
+ )
465
+
466
+ if len(predicted_labels_2_report[class_name]) == 0:
467
+ predicted_labels_2_report[class_name] = (
468
+ preds[not_masked].cpu().numpy()
469
+ )
470
+ else:
471
+ predicted_labels_2_report[class_name] = np.concatenate(
472
+ (
473
+ predicted_labels_2_report[class_name],
474
+ preds[not_masked].cpu().numpy(),
475
+ )
476
+ )
477
+
478
+ not_mask_all_or = torch.logical_or(
479
+ torch.logical_or(not_masks["nikud"], not_masks["dagesh"]),
480
+ not_masks["sin"],
481
+ )
482
+
483
+ correct_nikud = (torch.ones(not_mask_all_or.shape) == 1).to(device)
484
+ correct_dagesh = (torch.ones(not_mask_all_or.shape) == 1).to(device)
485
+ correct_sin = (torch.ones(not_mask_all_or.shape) == 1).to(device)
486
+
487
+ correct_nikud[not_masks["nikud"]] = (
488
+ predictions["nikud"][not_masks["nikud"]]
489
+ == labels_class["nikud"][not_masks["nikud"]]
490
+ )
491
+ correct_dagesh[not_masks["dagesh"]] = (
492
+ predictions["dagesh"][not_masks["dagesh"]]
493
+ == labels_class["dagesh"][not_masks["dagesh"]]
494
+ )
495
+ correct_sin[not_masks["sin"]] = (
496
+ predictions["sin"][not_masks["sin"]]
497
+ == labels_class["sin"][not_masks["sin"]]
498
+ )
499
+
500
+ letter_correct_mask = torch.logical_and(
501
+ torch.logical_and(correct_sin, correct_dagesh), correct_nikud
502
+ )
503
+ all_nikud_types_letter_level_correct += torch.sum(
504
+ letter_correct_mask[not_mask_all_or]
505
+ )
506
+
507
+ letter_correct_mask[~not_mask_all_or] = True
508
+ total_correct_count, total_words_num = calc_num_correct_words(
509
+ inputs.cpu(), letter_correct_mask
510
+ )
511
+
512
+ words_count += total_words_num
513
+ correct_words_count += total_correct_count
514
+
515
+ letters_count += not_mask_all_or.sum()
516
+
517
+ nikud_letter_level_correct += torch.sum(correct_nikud[not_mask_all_or])
518
+ dagesh_letter_level_correct += torch.sum(correct_dagesh[not_mask_all_or])
519
+ sin_letter_level_correct += torch.sum(correct_sin[not_mask_all_or])
520
+
521
+ for i, name in enumerate(CLASSES_LIST):
522
+ index_labels = np.unique(true_labels[name])
523
+ cm = confusion_matrix(
524
+ true_labels[name], predicted_labels_2_report[name], labels=index_labels
525
+ )
526
+
527
+ vowel_label = [Nikud.id_2_label[name][l] for l in index_labels]
528
+ unique_vowels_names = [
529
+ Nikud.sign_2_name[int(vowel)] for vowel in vowel_label if vowel != "WITHOUT"
530
+ ]
531
+ if "WITHOUT" in vowel_label:
532
+ unique_vowels_names += ["WITHOUT"]
533
+ cm_df = pd.DataFrame(cm, index=unique_vowels_names, columns=unique_vowels_names)
534
+
535
+ # Display confusion matrix
536
+ plt.figure(figsize=(10, 8))
537
+ sns.heatmap(cm_df, annot=True, cmap="Blues", fmt="d")
538
+ plt.title("Confusion Matrix")
539
+ plt.xlabel("True Label")
540
+ plt.ylabel("Predicted Label")
541
+ if plots_folder is None:
542
+ plt.show()
543
+ else:
544
+ plt.savefig(os.path.join(plots_folder, f"Confusion_Matrix_{name}.jpg"))
545
+
546
+ all_nikud_types_letter_level_correct = (
547
+ all_nikud_types_letter_level_correct / letters_count
548
+ )
549
+ all_nikud_types_word_level_correct = correct_words_count / words_count
550
+ nikud_letter_level_correct = nikud_letter_level_correct / letters_count
551
+ dagesh_letter_level_correct = dagesh_letter_level_correct / letters_count
552
+ sin_letter_level_correct = sin_letter_level_correct / letters_count
553
+ print("\n")
554
+ print(f"nikud_letter_level_correct = {nikud_letter_level_correct}")
555
+ print(f"dagesh_letter_level_correct = {dagesh_letter_level_correct}")
556
+ print(f"sin_letter_level_correct = {sin_letter_level_correct}")
557
+ print(f"word_level_correct = {all_nikud_types_word_level_correct}")
558
+
559
+ return all_nikud_types_word_level_correct, all_nikud_types_letter_level_correct
src/plot_helpers.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general
2
+ import os
3
+
4
+ # visual
5
+ import matplotlib.pyplot as plt
6
+
7
+ cols = ["precision", "recall", "f1-score", "support"]
8
+
9
+
10
+ def generate_plot_by_nikud_dagesh_sin_dict(nikud_dagesh_sin_dict, title, y_axis, plot_folder=None):
11
+ # Create a figure and axis
12
+ plt.figure(figsize=(8, 6))
13
+ plt.title(title)
14
+
15
+ ax = plt.gca()
16
+ indexes = list(range(1, len(nikud_dagesh_sin_dict["nikud"]) + 1))
17
+
18
+ # Plot data series with different colors and labels
19
+ ax.plot(indexes, nikud_dagesh_sin_dict["nikud"], color='blue', label='Nikud')
20
+ ax.plot(indexes, nikud_dagesh_sin_dict["dagesh"], color='green', label='Dagesh')
21
+ ax.plot(indexes, nikud_dagesh_sin_dict["sin"], color='red', label='Sin')
22
+
23
+ # Add legend
24
+ ax.legend()
25
+
26
+ # Set labels and title
27
+ ax.set_xlabel('Epoch')
28
+ ax.set_ylabel(y_axis)
29
+
30
+ if plot_folder is None:
31
+ plt.show()
32
+ else:
33
+ plt.savefig(os.path.join(plot_folder, f'{title.replace(" ", "_")}_plot.jpg'))
34
+
35
+
36
+ def generate_word_and_letter_accuracy_plot(word_and_letter_accuracy_dict, title, plot_folder=None):
37
+ # Create a figure and axis
38
+ plt.figure(figsize=(8, 6))
39
+ plt.title(title)
40
+
41
+ ax = plt.gca()
42
+ indexes = list(range(1, len(word_and_letter_accuracy_dict["all_nikud_letter"]) + 1))
43
+
44
+ # Plot data series with different colors and labels
45
+ ax.plot(indexes, word_and_letter_accuracy_dict["all_nikud_letter"], color='blue', label='Letter')
46
+ ax.plot(indexes, word_and_letter_accuracy_dict["all_nikud_word"], color='green', label='Word')
47
+
48
+ # Add legend
49
+ ax.legend()
50
+
51
+ # Set labels and title
52
+ ax.set_xlabel("Epoch")
53
+ ax.set_ylabel("Accuracy")
54
+
55
+ if plot_folder is None:
56
+ plt.show()
57
+ else:
58
+ plt.savefig(os.path.join(plot_folder, 'word_and_letter_accuracy_plot.jpg'))
src/running_params.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ DEBUG_MODE = False
2
+ BATCH_SIZE = 32
3
+ MAX_LENGTH_SEN = 1024
src/utiles_data.py ADDED
@@ -0,0 +1,676 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # general
2
+ import os.path
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import List, Tuple
6
+ from uuid import uuid1
7
+ import re
8
+ import glob2
9
+
10
+ # visual
11
+ import matplotlib
12
+ import matplotlib.pyplot as plt
13
+ from tqdm import tqdm
14
+
15
+ # ML
16
+ import numpy as np
17
+ import torch
18
+ from torch.utils.data import Dataset
19
+
20
+ from src.running_params import DEBUG_MODE, MAX_LENGTH_SEN
21
+
22
+ matplotlib.use("agg")
23
+ unique_key = str(uuid1())
24
+
25
+
26
+ class Nikud:
27
+ """
28
+ 1456 HEBREW POINT SHEVA
29
+ 1457 HEBREW POINT HATAF SEGOL
30
+ 1458 HEBREW POINT HATAF PATAH
31
+ 1459 HEBREW POINT HATAF QAMATS
32
+ 1460 HEBREW POINT HIRIQ
33
+ 1461 HEBREW POINT TSERE
34
+ 1462 HEBREW POINT SEGOL
35
+ 1463 HEBREW POINT PATAH
36
+ 1464 HEBREW POINT QAMATS
37
+ 1465 HEBREW POINT HOLAM
38
+ 1466 HEBREW POINT HOLAM HASER FOR VAV ***EXTENDED***
39
+ 1467 HEBREW POINT QUBUTS
40
+ 1468 HEBREW POINT DAGESH OR MAPIQ
41
+ 1469 HEBREW POINT METEG ***EXTENDED***
42
+ 1470 HEBREW PUNCTUATION MAQAF ***EXTENDED***
43
+ 1471 HEBREW POINT RAFE ***EXTENDED***
44
+ 1472 HEBREW PUNCTUATION PASEQ ***EXTENDED***
45
+ 1473 HEBREW POINT SHIN DOT
46
+ 1474 HEBREW POINT SIN DOT
47
+ """
48
+
49
+ nikud_dict = {
50
+ "SHVA": 1456,
51
+ "REDUCED_SEGOL": 1457,
52
+ "REDUCED_PATAKH": 1458,
53
+ "REDUCED_KAMATZ": 1459,
54
+ "HIRIK": 1460,
55
+ "TZEIRE": 1461,
56
+ "SEGOL": 1462,
57
+ "PATAKH": 1463,
58
+ "KAMATZ": 1464,
59
+ "KAMATZ_KATAN": 1479,
60
+ "HOLAM": 1465,
61
+ "HOLAM HASER VAV": 1466,
62
+ "KUBUTZ": 1467,
63
+ "DAGESH OR SHURUK": 1468,
64
+ "METEG": 1469,
65
+ "PUNCTUATION MAQAF": 1470,
66
+ "RAFE": 1471,
67
+ "PUNCTUATION PASEQ": 1472,
68
+ "SHIN_YEMANIT": 1473,
69
+ "SHIN_SMALIT": 1474,
70
+ }
71
+
72
+ skip_nikud = (
73
+ []
74
+ ) # [nikud_dict["KAMATZ_KATAN"], nikud_dict["HOLAM HASER VAV"], nikud_dict["METEG"], nikud_dict["PUNCTUATION MAQAF"], nikud_dict["PUNCTUATION PASEQ"]]
75
+ sign_2_name = {sign: name for name, sign in nikud_dict.items()}
76
+ sin = [nikud_dict["RAFE"], nikud_dict["SHIN_YEMANIT"], nikud_dict["SHIN_SMALIT"]]
77
+ dagesh = [
78
+ nikud_dict["RAFE"],
79
+ nikud_dict["DAGESH OR SHURUK"],
80
+ ] # note that DAGESH and SHURUK are one and the same
81
+ nikud = []
82
+ for v in nikud_dict.values():
83
+ if v not in sin and v not in skip_nikud:
84
+ nikud.append(v)
85
+ all_nikud_ord = {v for v in nikud_dict.values()}
86
+ all_nikud_chr = {chr(v) for v in nikud_dict.values()}
87
+
88
+ label_2_id = {
89
+ "nikud": {label: i for i, label in enumerate(nikud + ["WITHOUT"])},
90
+ "dagesh": {label: i for i, label in enumerate(dagesh + ["WITHOUT"])},
91
+ "sin": {label: i for i, label in enumerate(sin + ["WITHOUT"])},
92
+ }
93
+ id_2_label = {
94
+ "nikud": {i: label for i, label in enumerate(nikud + ["WITHOUT"])},
95
+ "dagesh": {i: label for i, label in enumerate(dagesh + ["WITHOUT"])},
96
+ "sin": {i: label for i, label in enumerate(sin + ["WITHOUT"])},
97
+ }
98
+
99
+ DAGESH_LETTER = nikud_dict["DAGESH OR SHURUK"]
100
+ RAFE = nikud_dict["RAFE"]
101
+ PAD_OR_IRRELEVANT = -1
102
+
103
+ LEN_NIKUD = len(label_2_id["nikud"])
104
+ LEN_DAGESH = len(label_2_id["dagesh"])
105
+ LEN_SIN = len(label_2_id["sin"])
106
+
107
+ def id_2_char(self, c, class_type):
108
+ if c == -1:
109
+ return ""
110
+
111
+ label = self.id_2_label[class_type][c]
112
+
113
+ if label != "WITHOUT":
114
+ print("Label =", chr(self.id_2_label[class_type][c]))
115
+ return chr(self.id_2_label[class_type][c])
116
+ return ""
117
+
118
+
119
+ class Letters:
120
+ hebrew = [chr(c) for c in range(0x05D0, 0x05EA + 1)]
121
+ VALID_LETTERS = [
122
+ " ",
123
+ "!",
124
+ '"',
125
+ "'",
126
+ "(",
127
+ ")",
128
+ ",",
129
+ "-",
130
+ ".",
131
+ ":",
132
+ ";",
133
+ "?",
134
+ ] + hebrew
135
+ SPECIAL_TOKENS = ["H", "O", "5", "1"]
136
+ ENDINGS_TO_REGULAR = dict(zip("ךםןףץ", "כמנפצ"))
137
+ vocab = VALID_LETTERS + SPECIAL_TOKENS
138
+ vocab_size = len(vocab)
139
+
140
+
141
+ class Letter:
142
+ def __init__(self, letter):
143
+ self.letter = letter
144
+ self.normalized = None
145
+ self.dagesh = None
146
+ self.sin = None
147
+ self.nikud = None
148
+
149
+ def normalize(self, letter):
150
+ if letter in Letters.VALID_LETTERS:
151
+ return letter
152
+ if letter in Letters.ENDINGS_TO_REGULAR:
153
+ return Letters.ENDINGS_TO_REGULAR[letter]
154
+ if letter in ["\n", "\t"]:
155
+ return " "
156
+ if letter in ["‒", "–", "—", "―", "−", "+"]:
157
+ return "-"
158
+ if letter == "[":
159
+ return "("
160
+ if letter == "]":
161
+ return ")"
162
+ if letter in ["´", "‘", "’"]:
163
+ return "'"
164
+ if letter in ["“", "”", "״"]:
165
+ return '"'
166
+ if letter.isdigit():
167
+ if int(letter) == 1:
168
+ return "1"
169
+ else:
170
+ return "5"
171
+ if letter == "…":
172
+ return ","
173
+ if letter in ["ײ", "װ", "ױ"]:
174
+ return "H"
175
+ return "O"
176
+
177
+ def can_dagesh(self, letter):
178
+ return letter in ("בגדהוזטיכלמנספצקשת" + "ךף")
179
+
180
+ def can_sin(self, letter):
181
+ return letter == "ש"
182
+
183
+ def can_nikud(self, letter):
184
+ return letter in ("אבגדהוזחטיכלמנסעפצקרשת" + "ךן")
185
+
186
+ def get_label_letter(self, labels):
187
+ dagesh_sin_nikud = [
188
+ True if self.can_dagesh(self.letter) else False,
189
+ True if self.can_sin(self.letter) else False,
190
+ True if self.can_nikud(self.letter) else False,
191
+ ]
192
+
193
+ labels_ids = {
194
+ "nikud": Nikud.PAD_OR_IRRELEVANT,
195
+ "dagesh": Nikud.PAD_OR_IRRELEVANT,
196
+ "sin": Nikud.PAD_OR_IRRELEVANT,
197
+ }
198
+
199
+ normalized = self.normalize(self.letter)
200
+
201
+ i = 0
202
+ if Nikud.nikud_dict["PUNCTUATION PASEQ"] in labels:
203
+ labels.remove(Nikud.nikud_dict["PUNCTUATION PASEQ"])
204
+ if Nikud.nikud_dict["PUNCTUATION MAQAF"] in labels:
205
+ labels.remove(Nikud.nikud_dict["PUNCTUATION MAQAF"])
206
+ if Nikud.nikud_dict["HOLAM HASER VAV"] in labels:
207
+ labels.remove(Nikud.nikud_dict["HOLAM HASER VAV"])
208
+ if Nikud.nikud_dict["METEG"] in labels:
209
+ labels.remove(Nikud.nikud_dict["METEG"])
210
+ if Nikud.nikud_dict["KAMATZ_KATAN"] in labels:
211
+ labels[labels.index(Nikud.nikud_dict["KAMATZ_KATAN"])] = Nikud.nikud_dict[
212
+ "KAMATZ"
213
+ ]
214
+ for index, (class_name, group) in enumerate(
215
+ zip(
216
+ ["dagesh", "sin", "nikud"],
217
+ [[Nikud.DAGESH_LETTER], Nikud.sin, Nikud.nikud],
218
+ )
219
+ ):
220
+ # notice - order is important: dagesh then sin and then nikud
221
+ if dagesh_sin_nikud[index]:
222
+ if i < len(labels) and labels[i] in group:
223
+ labels_ids[class_name] = Nikud.label_2_id[class_name][labels[i]]
224
+ i += 1
225
+ else:
226
+ labels_ids[class_name] = Nikud.label_2_id[class_name]["WITHOUT"]
227
+
228
+ if (
229
+ np.array(dagesh_sin_nikud).all()
230
+ and len(labels) == 3
231
+ and labels[0] in Nikud.sin
232
+ ):
233
+ labels_ids["nikud"] = Nikud.label_2_id["nikud"][labels[2]]
234
+ labels_ids["dagesh"] = Nikud.label_2_id["dagesh"][labels[1]]
235
+
236
+ if (
237
+ self.can_sin(self.letter)
238
+ and len(labels) == 2
239
+ and labels[1] == Nikud.DAGESH_LETTER
240
+ ):
241
+ labels_ids["dagesh"] = Nikud.label_2_id["dagesh"][labels[1]]
242
+ labels_ids["nikud"] = Nikud.label_2_id[class_name]["WITHOUT"]
243
+
244
+ if (
245
+ self.letter == "ו"
246
+ and labels_ids["dagesh"] == Nikud.DAGESH_LETTER
247
+ and labels_ids["nikud"] == Nikud.label_2_id["nikud"]["WITHOUT"]
248
+ ):
249
+ labels_ids["dagesh"] = Nikud.label_2_id["dagesh"]["WITHOUT"]
250
+ labels_ids["nikud"] = Nikud.DAGESH_LETTER
251
+
252
+ self.normalized = normalized
253
+ self.dagesh = labels_ids["dagesh"]
254
+ self.sin = labels_ids["sin"]
255
+ self.nikud = labels_ids["nikud"]
256
+
257
+ def name_of(self, letter):
258
+ if "א" <= letter <= "ת":
259
+ return letter
260
+ if letter == Nikud.DAGESH_LETTER:
261
+ return "דגש\שורוק"
262
+ if letter == Nikud.KAMATZ:
263
+ return "קמץ"
264
+ if letter == Nikud.PATAKH:
265
+ return "פתח"
266
+ if letter == Nikud.TZEIRE:
267
+ return "צירה"
268
+ if letter == Nikud.SEGOL:
269
+ return "סגול"
270
+ if letter == Nikud.SHVA:
271
+ return "שוא"
272
+ if letter == Nikud.HOLAM:
273
+ return "חולם"
274
+ if letter == Nikud.KUBUTZ:
275
+ return "קובוץ"
276
+ if letter == Nikud.HIRIK:
277
+ return "חיריק"
278
+ if letter == Nikud.REDUCED_KAMATZ:
279
+ return "חטף-קמץ"
280
+ if letter == Nikud.REDUCED_PATAKH:
281
+ return "חטף-פתח"
282
+ if letter == Nikud.REDUCED_SEGOL:
283
+ return "חטף-סגול"
284
+ if letter == Nikud.SHIN_SMALIT:
285
+ return "שין-שמאלית"
286
+ if letter == Nikud.SHIN_YEMANIT:
287
+ return "שין-ימנית"
288
+ if letter.isprintable():
289
+ return letter
290
+ return "לא ידוע ({})".format(hex(ord(letter)))
291
+
292
+
293
+ def text_contains_nikud(text):
294
+ return len(set(text) & Nikud.all_nikud_chr) > 0
295
+
296
+
297
+ def combine_sentences(list_sentences, max_length=0, is_train=False):
298
+ all_new_sentences = []
299
+ new_sen = ""
300
+ index = 0
301
+ while index < len(list_sentences):
302
+ sen = list_sentences[index]
303
+
304
+ if not text_contains_nikud(sen) and (
305
+ "------------------" in sen or sen == "\n"
306
+ ):
307
+ if len(new_sen) > 0:
308
+ all_new_sentences.append(new_sen)
309
+ if not is_train:
310
+ all_new_sentences.append(sen)
311
+ new_sen = ""
312
+ index += 1
313
+ continue
314
+
315
+ if not text_contains_nikud(sen) and is_train:
316
+ index += 1
317
+ continue
318
+
319
+ if len(sen) > max_length:
320
+ update_sen = sen.replace(". ", f". {unique_key}")
321
+ update_sen = update_sen.replace("? ", f"? {unique_key}")
322
+ update_sen = update_sen.replace("! ", f"! {unique_key}")
323
+ update_sen = update_sen.replace("” ", f"” {unique_key}")
324
+ update_sen = update_sen.replace("\t", f"\t{unique_key}")
325
+ part_sentence = update_sen.split(unique_key)
326
+
327
+ good_parts = []
328
+ for p in part_sentence:
329
+ if len(p) < max_length:
330
+ good_parts.append(p)
331
+ else:
332
+ prev = 0
333
+ while prev <= len(p):
334
+ part = p[prev : (prev + max_length)]
335
+ last_space = 0
336
+ if " " in part:
337
+ last_space = part[::-1].index(" ") + 1
338
+ next = prev + max_length - last_space
339
+ part = p[prev:next]
340
+ good_parts.append(part)
341
+ prev = next
342
+ list_sentences = (
343
+ list_sentences[:index] + good_parts + list_sentences[index + 1 :]
344
+ )
345
+ continue
346
+ if new_sen == "":
347
+ new_sen = sen
348
+ elif len(new_sen) + len(sen) < max_length:
349
+ new_sen += sen
350
+ else:
351
+ all_new_sentences.append(new_sen)
352
+ new_sen = sen
353
+
354
+ index += 1
355
+ if len(new_sen) > 0:
356
+ all_new_sentences.append(new_sen)
357
+ return all_new_sentences
358
+
359
+
360
+ class NikudDataset(Dataset):
361
+ def __init__(
362
+ self,
363
+ tokenizer,
364
+ folder=None,
365
+ file=None,
366
+ logger=None,
367
+ max_length=0,
368
+ is_train=False,
369
+ ):
370
+ self.max_length = max_length
371
+ self.tokenizer = tokenizer
372
+ self.is_train = is_train
373
+ if folder is not None:
374
+ self.data, self.origin_data = self.read_data_folder(folder, logger)
375
+ elif file is not None:
376
+ self.data, self.origin_data = self.read_data(file, logger)
377
+ self.prepered_data = None
378
+
379
+ def read_data_folder(self, folder_path: str, logger=None):
380
+ all_files = glob2.glob(f"{folder_path}/**/*.txt", recursive=True)
381
+ msg = f"number of files: " + str(len(all_files))
382
+ if logger:
383
+ logger.debug(msg)
384
+ else:
385
+ print(msg)
386
+ all_data = []
387
+ all_origin_data = []
388
+ if DEBUG_MODE:
389
+ all_files = all_files[0:2]
390
+ for file in all_files:
391
+ if "not_use" in file or "NakdanResults" in file:
392
+ continue
393
+ data, origin_data = self.read_data(file, logger)
394
+ all_data.extend(data)
395
+ all_origin_data.extend(origin_data)
396
+ return all_data, all_origin_data
397
+
398
+ def read_data(self, filepath: str, logger=None) -> List[Tuple[str, list]]:
399
+ msg = f"read file: {filepath}"
400
+ if logger:
401
+ logger.debug(msg)
402
+ else:
403
+ print(msg)
404
+ data = []
405
+ orig_data = []
406
+ with open(filepath, "r", encoding="utf-8") as file:
407
+ file_data = file.read()
408
+ data_list = self.split_text(file_data)
409
+
410
+ for sen in tqdm(data_list, desc=f"Source: {os.path.basename(filepath)}"):
411
+ if sen == "":
412
+ continue
413
+
414
+ labels = []
415
+ text = ""
416
+ text_org = ""
417
+ index = 0
418
+ sentence_length = len(sen)
419
+ while index < sentence_length:
420
+ if (
421
+ ord(sen[index]) == Nikud.nikud_dict["PUNCTUATION MAQAF"]
422
+ or ord(sen[index]) == Nikud.nikud_dict["PUNCTUATION PASEQ"]
423
+ or ord(sen[index]) == Nikud.nikud_dict["METEG"]
424
+ ):
425
+ index += 1
426
+ continue
427
+
428
+ label = []
429
+ l = Letter(sen[index])
430
+ if not (l.letter not in Nikud.all_nikud_chr):
431
+ if sen[index - 1] == "\n":
432
+ index += 1
433
+ continue
434
+ assert l.letter not in Nikud.all_nikud_chr
435
+ if sen[index] in Letters.hebrew:
436
+ index += 1
437
+ while (
438
+ index < sentence_length
439
+ and ord(sen[index]) in Nikud.all_nikud_ord
440
+ ):
441
+ label.append(ord(sen[index]))
442
+ index += 1
443
+ else:
444
+ index += 1
445
+
446
+ l.get_label_letter(label)
447
+ text += l.normalized
448
+ text_org += l.letter
449
+ labels.append(l)
450
+
451
+ data.append((text, labels))
452
+ orig_data.append(text_org)
453
+
454
+ return data, orig_data
455
+
456
+ def split_text(self, file_data):
457
+ file_data = file_data.replace("\n", f"\n{unique_key}")
458
+ data_list = file_data.split(unique_key)
459
+ data_list = combine_sentences(
460
+ data_list, is_train=self.is_train, max_length=MAX_LENGTH_SEN
461
+ )
462
+ return data_list
463
+
464
+ def show_data_labels(self, plots_folder=None):
465
+ nikud = [
466
+ Nikud.id_2_label["nikud"][label.nikud]
467
+ for _, label_list in self.data
468
+ for label in label_list
469
+ if label.nikud != -1
470
+ ]
471
+ dagesh = [
472
+ Nikud.id_2_label["dagesh"][label.dagesh]
473
+ for _, label_list in self.data
474
+ for label in label_list
475
+ if label.dagesh != -1
476
+ ]
477
+ sin = [
478
+ Nikud.id_2_label["sin"][label.sin]
479
+ for _, label_list in self.data
480
+ for label in label_list
481
+ if label.sin != -1
482
+ ]
483
+
484
+ vowels = nikud + dagesh + sin
485
+ unique_vowels, label_counts = np.unique(vowels, return_counts=True)
486
+ unique_vowels_names = [
487
+ Nikud.sign_2_name[int(vowel)]
488
+ for vowel in unique_vowels
489
+ if vowel != "WITHOUT"
490
+ ] + ["WITHOUT"]
491
+ fig, ax = plt.subplots(figsize=(16, 6))
492
+
493
+ bar_positions = np.arange(len(unique_vowels))
494
+ bar_width = 0.15
495
+ ax.bar(bar_positions, list(label_counts), bar_width)
496
+
497
+ ax.set_title("Distribution of Vowels in dataset")
498
+ ax.set_xlabel("Vowels")
499
+ ax.set_ylabel("Count")
500
+ ax.legend(loc="right", bbox_to_anchor=(1, 0.85))
501
+ ax.set_xticks(bar_positions)
502
+ ax.set_xticklabels(unique_vowels_names, rotation=30, ha="right", fontsize=8)
503
+
504
+ if plots_folder is None:
505
+ plt.show()
506
+ else:
507
+ plt.savefig(os.path.join(plots_folder, "show_data_labels.jpg"))
508
+
509
+ def calc_max_length(self, maximum=MAX_LENGTH_SEN):
510
+ if self.max_length > maximum:
511
+ self.max_length = maximum
512
+ return self.max_length
513
+
514
+ def prepare_data(self, name="train"):
515
+ dataset = []
516
+ for index, (sentence, label) in tqdm(
517
+ enumerate(self.data), desc=f"prepare data {name}"
518
+ ):
519
+ encoded_sequence = self.tokenizer.encode_plus(
520
+ sentence,
521
+ add_special_tokens=True,
522
+ max_length=self.max_length,
523
+ padding="max_length",
524
+ truncation=True,
525
+ return_attention_mask=True,
526
+ return_tensors="pt",
527
+ )
528
+ label_lists = [
529
+ [letter.nikud, letter.dagesh, letter.sin] for letter in label
530
+ ]
531
+ label = torch.tensor(
532
+ [
533
+ [
534
+ Nikud.PAD_OR_IRRELEVANT,
535
+ Nikud.PAD_OR_IRRELEVANT,
536
+ Nikud.PAD_OR_IRRELEVANT,
537
+ ]
538
+ ]
539
+ + label_lists[: (self.max_length - 1)]
540
+ + [
541
+ [
542
+ Nikud.PAD_OR_IRRELEVANT,
543
+ Nikud.PAD_OR_IRRELEVANT,
544
+ Nikud.PAD_OR_IRRELEVANT,
545
+ ]
546
+ for i in range(self.max_length - len(label) - 1)
547
+ ]
548
+ )
549
+
550
+ dataset.append(
551
+ (
552
+ encoded_sequence["input_ids"][0],
553
+ encoded_sequence["attention_mask"][0],
554
+ label,
555
+ )
556
+ )
557
+
558
+ self.prepered_data = dataset
559
+
560
+ def back_2_text(self, labels):
561
+ nikud = Nikud()
562
+ all_text = ""
563
+ for indx_sentance, (input_ids, _, label) in enumerate(self.prepered_data):
564
+ new_line = ""
565
+ for indx_char, c in enumerate(self.origin_data[indx_sentance]):
566
+ new_line += (
567
+ c
568
+ + nikud.id_2_char(labels[indx_sentance, indx_char + 1, 1], "dagesh")
569
+ + nikud.id_2_char(labels[indx_sentance, indx_char + 1, 2], "sin")
570
+ + nikud.id_2_char(labels[indx_sentance, indx_char + 1, 0], "nikud")
571
+ )
572
+ all_text += new_line
573
+ return all_text
574
+
575
+ def __len__(self):
576
+ return self.data.shape[0]
577
+
578
+ def __getitem__(self, idx):
579
+ row = self.data[idx]
580
+
581
+
582
+ def get_sub_folders_paths(main_folder):
583
+ list_paths = []
584
+ for filename in os.listdir(main_folder):
585
+ path = os.path.join(main_folder, filename)
586
+ if os.path.isdir(path) and filename != ".git":
587
+ list_paths.append(path)
588
+ list_paths.extend(get_sub_folders_paths(path))
589
+ return list_paths
590
+
591
+
592
+ def create_missing_folders(folder_path):
593
+ # Check if the folder doesn't exist and create it if needed
594
+ if not os.path.exists(folder_path):
595
+ os.makedirs(folder_path)
596
+
597
+
598
+ def info_folder(folder, num_files, num_hebrew_letters):
599
+ """
600
+ Recursively counts the number of files and the number of Hebrew letters in all subfolders of the given folder path.
601
+
602
+ Args:
603
+ folder (str): The path of the folder to be analyzed.
604
+ num_files (int): The running total of the number of files encountered so far.
605
+ num_hebrew_letters (int): The running total of the number of Hebrew letters encountered so far.
606
+
607
+ Returns:
608
+ Tuple[int, int]: A tuple containing the total number of files and the total number of Hebrew letters.
609
+ """
610
+ for filename in os.listdir(folder):
611
+ file_path = os.path.join(folder, filename)
612
+ if filename.lower().endswith(".txt") and os.path.isfile(file_path):
613
+ num_files += 1
614
+ dataset = NikudDataset(None, file=file_path)
615
+ for line in dataset.data:
616
+ for c in line[0]:
617
+ if c in Letters.hebrew:
618
+ num_hebrew_letters += 1
619
+
620
+ elif os.path.isdir(file_path) and filename != ".git":
621
+ sub_folder = file_path
622
+ n1, n2 = info_folder(sub_folder, num_files, num_hebrew_letters)
623
+ num_files += n1
624
+ num_hebrew_letters += n2
625
+ return num_files, num_hebrew_letters
626
+
627
+
628
+ def extract_text_to_compare_nakdimon(text):
629
+ res = text.replace("|", "")
630
+ res = res.replace(
631
+ chr(Nikud.nikud_dict["KUBUTZ"]) + "ו" + chr(Nikud.nikud_dict["METEG"]),
632
+ "ו" + chr(Nikud.nikud_dict["DAGESH OR SHURUK"]),
633
+ )
634
+ res = res.replace(
635
+ chr(Nikud.nikud_dict["HOLAM"]) + "ו" + chr(Nikud.nikud_dict["METEG"]), "ו"
636
+ )
637
+ res = res.replace(
638
+ "ו" + chr(Nikud.nikud_dict["HOLAM"]) + chr(Nikud.nikud_dict["KAMATZ"]),
639
+ "ו" + chr(Nikud.nikud_dict["KAMATZ"]),
640
+ )
641
+ res = res.replace(chr(Nikud.nikud_dict["METEG"]), "")
642
+ res = res.replace(
643
+ chr(Nikud.nikud_dict["KAMATZ"]) + chr(Nikud.nikud_dict["HIRIK"]),
644
+ chr(Nikud.nikud_dict["KAMATZ"]) + "י" + chr(Nikud.nikud_dict["HIRIK"]),
645
+ )
646
+ res = res.replace(
647
+ chr(Nikud.nikud_dict["PATAKH"]) + chr(Nikud.nikud_dict["HIRIK"]),
648
+ chr(Nikud.nikud_dict["PATAKH"]) + "י" + chr(Nikud.nikud_dict["HIRIK"]),
649
+ )
650
+ res = res.replace(chr(Nikud.nikud_dict["PUNCTUATION MAQAF"]), "")
651
+ res = res.replace(chr(Nikud.nikud_dict["PUNCTUATION PASEQ"]), "")
652
+ res = res.replace(
653
+ chr(Nikud.nikud_dict["KAMATZ_KATAN"]), chr(Nikud.nikud_dict["KAMATZ"])
654
+ )
655
+
656
+ res = re.sub(chr(Nikud.nikud_dict["KUBUTZ"]) + "ו" + "(?=[א-ת])", "ו", res)
657
+ res = res.replace(chr(Nikud.nikud_dict["REDUCED_KAMATZ"]) + "ו", "ו")
658
+
659
+ res = res.replace(
660
+ chr(Nikud.nikud_dict["DAGESH OR SHURUK"]) * 2,
661
+ chr(Nikud.nikud_dict["DAGESH OR SHURUK"]),
662
+ )
663
+ res = res.replace("\u05be", "-")
664
+ res = res.replace("יְהוָֹה", "יהוה")
665
+
666
+ return res
667
+
668
+
669
+ def orgenize_data(main_folder, logger):
670
+ x = NikudDataset(None)
671
+ x.delete_files(os.path.join(Path(main_folder).parent, "train"))
672
+ x.delete_files(os.path.join(Path(main_folder).parent, "dev"))
673
+ x.delete_files(os.path.join(Path(main_folder).parent, "test"))
674
+ x.split_data(
675
+ main_folder, main_folder_name=os.path.basename(main_folder), logger=logger
676
+ )