|
from typing import List |
|
|
|
from surya.languages import CODE_TO_LANGUAGE, LANGUAGE_TO_CODE |
|
from surya.model.recognition.tokenizer import _tokenize as lang_tokenize |
|
|
|
from marker.ocr.tesseract import LANGUAGE_TO_TESSERACT_CODE, TESSERACT_CODE_TO_LANGUAGE |
|
from marker.settings import settings |
|
|
|
|
|
def langs_to_ids(langs: List[str]):
    """Return the tokenizer's language tokens for the distinct entries of ``langs``.

    Duplicates are dropped while keeping first-seen order, so the sequence
    handed to the tokenizer is the same on every run. (A plain ``set`` would
    iterate strings in an order that depends on per-process hash
    randomization.)

    Args:
        langs: Language identifiers; they are passed through unchanged to
            ``lang_tokenize``.

    Returns:
        The second element of the pair returned by ``lang_tokenize`` when it
        is called with an empty text and the de-duplicated languages.
    """
    # dict keys keep insertion order, giving a stable, order-preserving dedup.
    unique_langs = list(dict.fromkeys(langs))
    # Only the language tokens are needed; the text tokens for "" are ignored.
    _, lang_tokens = lang_tokenize("", unique_langs)
    return lang_tokens
|
|
|
|
|
def replace_langs_with_codes(langs):
    """Replace known language names in ``langs`` with engine-specific codes.

    The list is modified in place and also returned. Entries that are not
    found in the lookup table for the configured engine are left untouched.

    When ``settings.OCR_ENGINE`` is ``"surya"``, each entry is title-cased
    before being looked up in ``LANGUAGE_TO_CODE``. For any other engine the
    entry is looked up as-is in ``LANGUAGE_TO_TESSERACT_CODE``.

    Args:
        langs: Mutable list of language names and/or codes.

    Returns:
        The same list object, with recognized names replaced by codes.
    """
    if settings.OCR_ENGINE == "surya":
        for i, lang in enumerate(langs):
            name = lang.title()
            if name in LANGUAGE_TO_CODE:
                langs[i] = LANGUAGE_TO_CODE[name]
    else:
        for i, lang in enumerate(langs):
            # Test membership in the table that is actually indexed. Checking
            # LANGUAGE_TO_CODE here could raise KeyError for names missing
            # from the Tesseract table, and would skip names that only the
            # Tesseract table knows.
            if lang in LANGUAGE_TO_TESSERACT_CODE:
                langs[i] = LANGUAGE_TO_TESSERACT_CODE[lang]
    return langs
|
|
|
|
|
def validate_langs(langs):
    """Check every entry of ``langs`` against the configured OCR engine's codes.

    The table of accepted codes is ``CODE_TO_LANGUAGE`` when
    ``settings.OCR_ENGINE`` is ``"surya"`` and ``TESSERACT_CODE_TO_LANGUAGE``
    otherwise.

    Raises:
        ValueError: On the first entry that is not a key of the selected table.
    """
    use_surya = settings.OCR_ENGINE == "surya"
    known_codes = CODE_TO_LANGUAGE if use_surya else TESSERACT_CODE_TO_LANGUAGE

    for code in langs:
        if code in known_codes:
            continue
        # The message names the engine whose table rejected the code.
        if use_surya:
            raise ValueError(f"Invalid language code {code} for Surya OCR")
        raise ValueError(f"Invalid language code {code} for Tesseract")