ACMC committed on
Commit
05e7fc2
·
1 Parent(s): ac08d51

Better error handling

Browse files
Files changed (2) hide show
  1. app.py +25 -21
  2. utils.py +98 -48
app.py CHANGED
@@ -19,27 +19,31 @@ logger.setLevel(logging.INFO)
19
  def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
20
  modified_dataset = None
21
  for file in progress.tqdm(files, desc="Processing files"):
22
- if modified_dataset is None:
23
- # First file
24
- modified_dataset = process_chat_file(
25
- file,
26
- do_spelling_correction=do_spelling_correction,
27
- whatsapp_name=whatsapp_name,
28
- datetime_dayfirst=datetime_dayfirst,
29
- message_line_format=message_line_format,
30
- )
31
- else:
32
- # Concatenate the datasets
33
- this_file_dataset = process_chat_file(
34
- file,
35
- do_spelling_correction=do_spelling_correction,
36
- whatsapp_name=whatsapp_name,
37
- datetime_dayfirst=datetime_dayfirst,
38
- message_line_format=message_line_format,
39
- )
40
- modified_dataset = datasets.concatenate_datasets(
41
- [modified_dataset, this_file_dataset]
42
- )
 
 
 
 
43
  return modified_dataset
44
 
45
 
 
19
  def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
20
  modified_dataset = None
21
  for file in progress.tqdm(files, desc="Processing files"):
22
+ try:
23
+ if modified_dataset is None:
24
+ # First file
25
+ modified_dataset = process_chat_file(
26
+ file,
27
+ do_spelling_correction=do_spelling_correction,
28
+ whatsapp_name=whatsapp_name,
29
+ datetime_dayfirst=datetime_dayfirst,
30
+ message_line_format=message_line_format,
31
+ )
32
+ else:
33
+ # Concatenate the datasets
34
+ this_file_dataset = process_chat_file(
35
+ file,
36
+ do_spelling_correction=do_spelling_correction,
37
+ whatsapp_name=whatsapp_name,
38
+ datetime_dayfirst=datetime_dayfirst,
39
+ message_line_format=message_line_format,
40
+ )
41
+ modified_dataset = datasets.concatenate_datasets(
42
+ [modified_dataset, this_file_dataset]
43
+ )
44
+ except Exception as e:
45
+ logger.error(f"Error processing file {file}: {e}")
46
+ raise gr.Error(f"Error processing file {file}: {e}")
47
  return modified_dataset
48
 
49
 
utils.py CHANGED
@@ -91,7 +91,7 @@ def spell_check_conversation_spacy(conversation):
91
  return conversation
92
 
93
 
94
- def remove_whatapp_annotations(conversation):
95
  """
96
  Removes the following annotations from the messages:
97
  - <This message was edited>
@@ -238,45 +238,79 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
238
  logger.exception(example["text"])
239
  raise e
240
 
241
- ds = (
242
- datasets.load_dataset("text", data_files=[file])["train"]
243
- .filter(
 
 
 
 
244
  # Has to begin by date, time, contact name, and contain at least a ':' symbol
245
  lambda x: re.match(
246
  r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
247
  )
248
  )
249
- .map(process_line, remove_columns=["text"])
250
- )
251
-
252
- # Filter out messages that just say '<Media omitted>'
253
- ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
254
-
255
- groups = group_messages(iter(ds))
256
- # Generate the dataset
257
- conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
258
-
259
- # Filter out conversations with less than 5 messages
260
- conversations_ds = conversations_ds.filter(
261
- lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
262
- )
263
-
264
- conversations_ds_without_whatsapp_annotations = conversations_ds.map(
265
- remove_whatapp_annotations,
266
- num_proc=os.cpu_count() - 1,
267
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  if do_spelling_correction:
270
- spell_checked_conversations_ds = (
271
- conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
272
- )
 
 
 
 
273
  else:
274
  spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
275
 
276
  if do_reordering:
277
- reordered_conversations_ds = spell_checked_conversations_ds.map(
278
- swap_messages_if_needed_in_conversation
279
- )
 
 
 
 
280
  else:
281
  reordered_conversations_ds = spell_checked_conversations_ds
282
 
@@ -287,14 +321,22 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
287
  message["contact_name"] = "Other"
288
  return conversation
289
 
290
- changed_contact_name_ds = reordered_conversations_ds.map(
291
- rewrite_contact_name
292
- ) # , num_proc=os.cpu_count() - 1)
293
-
294
- # Filter out conversations with only one contact
295
- changed_contact_name_ds = changed_contact_name_ds.filter(
296
- lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
297
- )
 
 
 
 
 
 
 
 
298
 
299
  return changed_contact_name_ds
300
 
@@ -381,17 +423,25 @@ def transform_conversations_dataset_into_training_examples(
381
  flattened_examples[key] = [d[key] for d in processed_examples]
382
  return flattened_examples
383
 
384
- processed_examples = conversations_ds.map(
385
- process_examples,
386
- remove_columns=["conversations"],
387
- # num_proc=os.cpu_count() - 1,
388
- batched=True,
389
- )
390
-
391
- examples_filtered_by_length = processed_examples.filter(
392
- lambda x: all(
393
- [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
394
  )
395
- )
 
 
 
 
 
 
 
 
 
 
 
 
396
 
397
  return examples_filtered_by_length
 
91
  return conversation
92
 
93
 
94
+ def remove_whatsapp_annotations(conversation):
95
  """
96
  Removes the following annotations from the messages:
97
  - <This message was edited>
 
238
  logger.exception(example["text"])
239
  raise e
240
 
241
+ try:
242
+ ds = datasets.load_dataset("text", data_files=[file])["train"]
243
+ except Exception as e:
244
+ logger.exception(f"Error while loading file {file}")
245
+ raise Exception(f"Error while loading file {file}") from e
246
+ try:
247
+ ds = ds.filter(
248
  # Has to begin by date, time, contact name, and contain at least a ':' symbol
249
  lambda x: re.match(
250
  r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
251
  )
252
  )
253
+ except Exception as e:
254
+ logger.exception(f"Error filtering the lines in file {file} so they match the expected format")
255
+ raise Exception(f"Error filtering the lines in file {file} so they match the expected format") from e
256
+ try:
257
+ ds = ds.map(process_line, remove_columns=["text"])
258
+ except Exception as e:
259
+ logger.exception(f"Error mapping the lines in file {file} to the expected format")
260
+ raise Exception(f"Error mapping the lines in file {file} to the expected format") from e
261
+
262
+ try:
263
+ # Filter out messages that just say '<Media omitted>'
264
+ ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
265
+ except Exception as e:
266
+ logger.exception(f"Error filtering out messages that say '<Media omitted>' in file {file}")
267
+ raise Exception(f"Error filtering out messages that say '<Media omitted>' in file {file}") from e
268
+
269
+ try:
270
+ groups = group_messages(iter(ds))
271
+ # Generate the dataset
272
+ conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
273
+ except Exception as e:
274
+ logger.exception(f"Error grouping the messages in file {file}")
275
+ raise Exception(f"Error grouping the messages in file {file}") from e
276
+
277
+ try:
278
+ # Filter out conversations with less than 5 messages
279
+ conversations_ds = conversations_ds.filter(
280
+ lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
281
+ )
282
+ except Exception as e:
283
+ logger.exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}")
284
+ raise Exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}") from e
285
+
286
+ try:
287
+ conversations_ds_without_whatsapp_annotations = conversations_ds.map(
288
+ remove_whatsapp_annotations,
289
+ num_proc=os.cpu_count() - 1,
290
+ )
291
+ except Exception as e:
292
+ logger.exception(f"Error removing WhatsApp annotations in file {file}")
293
+ raise Exception(f"Error removing WhatsApp annotations in file {file}") from e
294
 
295
  if do_spelling_correction:
296
+ try:
297
+ spell_checked_conversations_ds = (
298
+ conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
299
+ )
300
+ except Exception as e:
301
+ logger.exception(f"Error spell checking the conversations in file {file}")
302
+ raise Exception(f"Error spell checking the conversations in file {file}") from e
303
  else:
304
  spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
305
 
306
  if do_reordering:
307
+ try:
308
+ reordered_conversations_ds = spell_checked_conversations_ds.map(
309
+ swap_messages_if_needed_in_conversation
310
+ )
311
+ except Exception as e:
312
+ logger.exception(f"Error reordering the conversations in file {file}")
313
+ raise Exception(f"Error reordering the conversations in file {file}") from e
314
  else:
315
  reordered_conversations_ds = spell_checked_conversations_ds
316
 
 
321
  message["contact_name"] = "Other"
322
  return conversation
323
 
324
+ try:
325
+ changed_contact_name_ds = reordered_conversations_ds.map(
326
+ rewrite_contact_name
327
+ ) # , num_proc=os.cpu_count() - 1)
328
+ except Exception as e:
329
+ logger.exception(f"Error changing your other contact's names in file {file}")
330
+ raise Exception(f"Error changing your other contact's names in file {file}") from e
331
+
332
+ try:
333
+ # Filter out conversations with only one contact
334
+ changed_contact_name_ds = changed_contact_name_ds.filter(
335
+ lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
336
+ )
337
+ except Exception as e:
338
+ logger.exception(f"Error filtering out conversations with only one contact in file {file}")
339
+ raise Exception(f"Error filtering out conversations with only one contact in file {file}") from e
340
 
341
  return changed_contact_name_ds
342
 
 
423
  flattened_examples[key] = [d[key] for d in processed_examples]
424
  return flattened_examples
425
 
426
+ try:
427
+ processed_examples = conversations_ds.map(
428
+ process_examples,
429
+ remove_columns=["conversations"],
430
+ # num_proc=os.cpu_count() - 1,
431
+ batched=True,
 
 
 
 
432
  )
433
+ except Exception as e:
434
+ logger.exception("Error transforming the conversations dataset into training examples")
435
+ raise Exception("Error transforming the conversations dataset into training examples") from e
436
+
437
+ try:
438
+ examples_filtered_by_length = processed_examples.filter(
439
+ lambda x: all(
440
+ [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
441
+ )
442
+ )
443
+ except Exception as e:
444
+ logger.exception("Error filtering out examples with messages longer than the maximum allowed")
445
+ raise Exception("Error filtering out examples with messages longer than the maximum allowed") from e
446
 
447
  return examples_filtered_by_length