Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
π update logs
Browse files
Signed-off-by: peter szemraj <[email protected]>
app.py
CHANGED
@@ -19,7 +19,8 @@ os.environ[
|
|
19 |
] = "false" # parallelism on tokenizers is buggy with gradio
|
20 |
|
21 |
logging.basicConfig(
|
22 |
-
level=logging.INFO,
|
|
|
23 |
)
|
24 |
|
25 |
import gradio as gr
|
@@ -232,18 +233,20 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
232 |
:param bool lower: whether to lowercase the text
|
233 |
:return str: the text of the file
|
234 |
"""
|
|
|
|
|
235 |
# check if mysterious file object is a list
|
236 |
if isinstance(file_obj, list):
|
237 |
file_obj = file_obj[0]
|
238 |
file_path = Path(file_obj.name)
|
239 |
try:
|
240 |
-
|
241 |
if file_path.suffix == ".txt":
|
242 |
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
243 |
raw_text = f.read()
|
244 |
text = clean(raw_text, lower=lower)
|
245 |
elif file_path.suffix == ".pdf":
|
246 |
-
|
247 |
conversion_stats = convert_PDF_to_Text(
|
248 |
file_path,
|
249 |
ocr_model=ocr_model,
|
@@ -251,18 +254,19 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
251 |
)
|
252 |
text = conversion_stats["converted_text"]
|
253 |
else:
|
254 |
-
|
255 |
text = "ERROR - check file - unknown file type"
|
256 |
|
257 |
return text
|
258 |
except Exception as e:
|
259 |
-
|
260 |
return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
|
261 |
|
262 |
|
263 |
if __name__ == "__main__":
|
264 |
-
logging.
|
265 |
-
|
|
|
266 |
with contextlib.redirect_stdout(None):
|
267 |
ocr_model = ocr_predictor(
|
268 |
"db_resnet50",
|
@@ -271,7 +275,7 @@ if __name__ == "__main__":
|
|
271 |
assume_straight_pages=True,
|
272 |
)
|
273 |
name_to_path = load_example_filenames(_here / "examples")
|
274 |
-
|
275 |
demo = gr.Blocks()
|
276 |
_examples = list(name_to_path.keys())
|
277 |
with demo:
|
@@ -355,7 +359,7 @@ if __name__ == "__main__":
|
|
355 |
minimum=0.5,
|
356 |
maximum=1.0,
|
357 |
label="length penalty",
|
358 |
-
|
359 |
step=0.05,
|
360 |
)
|
361 |
token_batch_length = gr.Radio(
|
@@ -369,7 +373,7 @@ if __name__ == "__main__":
|
|
369 |
minimum=1.0,
|
370 |
maximum=5.0,
|
371 |
label="repetition penalty",
|
372 |
-
|
373 |
step=0.1,
|
374 |
)
|
375 |
no_repeat_ngram_size = gr.Radio(
|
|
|
19 |
] = "false" # parallelism on tokenizers is buggy with gradio
|
20 |
|
21 |
logging.basicConfig(
|
22 |
+
level=logging.INFO,
|
23 |
+
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
|
24 |
)
|
25 |
|
26 |
import gradio as gr
|
|
|
233 |
:param bool lower: whether to lowercase the text
|
234 |
:return str: the text of the file
|
235 |
"""
|
236 |
+
|
237 |
+
logger = logging.getLogger(__name__)
|
238 |
# check if mysterious file object is a list
|
239 |
if isinstance(file_obj, list):
|
240 |
file_obj = file_obj[0]
|
241 |
file_path = Path(file_obj.name)
|
242 |
try:
|
243 |
+
logger.info(f"Loading file:\t{file_path}")
|
244 |
if file_path.suffix == ".txt":
|
245 |
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
246 |
raw_text = f.read()
|
247 |
text = clean(raw_text, lower=lower)
|
248 |
elif file_path.suffix == ".pdf":
|
249 |
+
logger.info(f"loading as PDF file {file_path}")
|
250 |
conversion_stats = convert_PDF_to_Text(
|
251 |
file_path,
|
252 |
ocr_model=ocr_model,
|
|
|
254 |
)
|
255 |
text = conversion_stats["converted_text"]
|
256 |
else:
|
257 |
+
logger.error(f"Unknown file type {file_path.suffix}")
|
258 |
text = "ERROR - check file - unknown file type"
|
259 |
|
260 |
return text
|
261 |
except Exception as e:
|
262 |
+
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
263 |
return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
|
264 |
|
265 |
|
266 |
if __name__ == "__main__":
|
267 |
+
logger = logging.getLogger(__name__)
|
268 |
+
logger.info("Starting app instance")
|
269 |
+
logger.info("Loading OCR model")
|
270 |
with contextlib.redirect_stdout(None):
|
271 |
ocr_model = ocr_predictor(
|
272 |
"db_resnet50",
|
|
|
275 |
assume_straight_pages=True,
|
276 |
)
|
277 |
name_to_path = load_example_filenames(_here / "examples")
|
278 |
+
logger.info(f"Loaded {len(name_to_path)} examples")
|
279 |
demo = gr.Blocks()
|
280 |
_examples = list(name_to_path.keys())
|
281 |
with demo:
|
|
|
359 |
minimum=0.5,
|
360 |
maximum=1.0,
|
361 |
label="length penalty",
|
362 |
+
value=0.7,
|
363 |
step=0.05,
|
364 |
)
|
365 |
token_batch_length = gr.Radio(
|
|
|
373 |
minimum=1.0,
|
374 |
maximum=5.0,
|
375 |
label="repetition penalty",
|
376 |
+
value=1.5,
|
377 |
step=0.1,
|
378 |
)
|
379 |
no_repeat_ngram_size = gr.Radio(
|