File size: 1,961 Bytes
2f7895b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import xml.etree.ElementTree as ET
import glob
import os
import logging

def _verse_text(verse):
    """Return the words (`<w>` children) of one verse joined by single spaces."""
    words = ("".join(word.itertext()) for word in verse.findall('./w'))
    return " ".join(words).strip()


def _extract_chapters(root):
    """Return the book as a list of chapters, each a list of verse strings."""
    chapters = []
    for chapter in root.findall('.//c'):
        chapters.append([_verse_text(verse) for verse in chapter.findall('./v')])
    return chapters


def process_json_files(start, end, base_path="texts/tanach"):
    """Load the text of numbered book files and return it keyed by book index.

    Despite the historical name, the files read here are XML: for every
    index ``i`` in ``start..end`` (inclusive) the files matching
    ``<base_path>/<i zero-padded to 2 digits>*.xml`` are parsed with
    ElementTree. Chapters are the ``<c>`` elements anywhere below the root,
    verses are their direct ``<v>`` children, and a verse's text is the
    space-joined text of its direct ``<w>`` children.

    Args:
        start: First book index (inclusive).
        end: Last book index (inclusive).
        base_path: Directory containing the XML files.

    Returns:
        A dict mapping each index to ``{"title": ..., "text": ...}`` where
        ``text`` is a list of chapters, each a list of verse strings. The
        title is the text of the first ``names/name`` element, or the file's
        base name when that element is missing or empty. Indexes with no
        readable file map to ``{"title": "No title", "text": []}``.
    """
    results = {}

    for i in range(start, end + 1):
        # Escape the directory so glob metacharacters in it are taken literally.
        file_pattern = f"{glob.escape(base_path)}/{i:02}*.xml"
        # glob order is arbitrary; sort so the outcome is reproducible when
        # several files share the same numeric prefix.
        matching_files = sorted(glob.glob(file_pattern))

        if not matching_files:
            logging.warning(f"No file matching pattern '{file_pattern}' found.")
            results[i] = {"title": "No title", "text": []}
            continue

        for file_name in matching_files:
            try:
                root = ET.parse(file_name).getroot()
            except FileNotFoundError:
                # The file can vanish between glob() and parse().
                logging.warning(f"File {file_name} not found.")
            except ET.ParseError as e:
                logging.warning(f"File {file_name} could not be read as XML: {e}")
            else:
                name_element = root.find('.//names/name')
                if name_element is not None and name_element.text:
                    book_title = name_element.text
                else:
                    book_title = os.path.basename(file_name)
                # The last successfully parsed file (in sorted order) wins.
                results[i] = {
                    "title": book_title,
                    "text": _extract_chapters(root),
                }

        # Only fall back to the placeholder when no file could be parsed, so
        # one unreadable file never discards a good result for the same book.
        results.setdefault(i, {"title": "No title", "text": []})

    return results