From 78a021b1b680a59f6c16ed853f29ca4a028f5740 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=ED=98=84=EC=9A=B0?= Date: Sun, 21 Jan 2024 20:58:25 +0900 Subject: [PATCH 1/6] feat: _ocrThread --- ocrmypdf_easyocr/__init__.py | 61 ++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index ab1d6b2..63bcfa2 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -7,21 +7,25 @@ import logging import os -from multiprocessing import Semaphore +import multiprocessing +import multiprocessing.managers +import threading import cv2 as cv import easyocr import pluggy from ocrmypdf import OcrEngine, hookimpl from ocrmypdf._exec import tesseract +import time from ocrmypdf_easyocr._cv import detect_skew from ocrmypdf_easyocr._easyocr import tidy_easyocr_result from ocrmypdf_easyocr._pdf import easyocr_to_pikepdf -log = logging.getLogger(__name__) +from typing import Optional, Tuple +import numpy.typing as npt -GPU_SEMAPHORE = Semaphore(3) +log = logging.getLogger(__name__) ISO_639_3_2: dict[str, str] = { "afr": "af", @@ -89,12 +93,47 @@ "vie": "vi", } +Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] + +def _ocrThread(q: multiprocessing.Queue[Task], options): + reader: Optional[easyocr.Reader] = None + + # TODO: signal _ocrThread to quit after OCR completes. + while True: + (gray, outputDict, event) = q.get() + + + # Init reader on first OCR attempt: Wait until `options` variable is fully initialized. + # Note: `options` variable is on the same process with the main thread. + try: + if reader is None: + useGPU = options.gpu + languages = [ISO_639_3_2[lang] for lang in options.languages] + reader = easyocr.Reader(languages, useGPU) + outputDict["output"] = reader.readtext(gray) + except Exception as e: + print(e) + outputDict["output"] = "" + finally: + event.set() + @hookimpl def initialize(plugin_manager: pluggy.PluginManager): pass +@hookimpl +def check_options(options): + m = multiprocessing.Manager() + q = multiprocessing.Queue(-1) + t = threading.Thread(target=_ocrThread, args=(q, options), daemon=True) + t.start() + options._easyocr_struct = { + "manager": m, + "queue": q + } + @hookimpl def add_options(parser): easyocr_options = parser.add_argument_group( @@ -143,15 +182,19 @@ def generate_hocr(input_file, output_hocr, output_text, options): @staticmethod def generate_pdf(input_file, output_pdf, output_text, options): - languages = [ISO_639_3_2[lang] for lang in options.languages] - img = cv.imread(os.fspath(input_file)) gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) - with GPU_SEMAPHORE: - reader = easyocr.Reader(languages, gpu=options.gpu) - raw_results = reader.readtext(gray) - results = [tidy_easyocr_result(r) for r in raw_results] + s = options._easyocr_struct + manager: multiprocessing.managers.SyncManager = s["manager"] + queue: multiprocessing.Queue[Task] = s["queue"] + outputDict = manager.dict() + event = manager.Event() + queue.put((gray, outputDict, event)) + event.wait() + raw_results = outputDict["output"] + + results = [tidy_easyocr_result(r) for r in raw_results] text = " ".join([result.text for result in results]) output_text.write_text(text) From 940ce15c31be56ca94fb9a416af1555ab7162084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=ED=98=84=EC=9A=B0?= Date: Mon, 22 Jan 2024 14:24:03 +0900 Subject: [PATCH 2/6] feat(options): add `easyocr_batch_size` and `easyocr_workers` --- ocrmypdf_easyocr/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index 63bcfa2..b1fbc0e 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -110,7 +110,11 @@ def _ocrThread(q: multiprocessing.Queue[Task], options): useGPU = options.gpu languages = [ISO_639_3_2[lang] for lang in options.languages] reader = easyocr.Reader(languages, useGPU) - outputDict["output"] = reader.readtext(gray) + outputDict["output"] = reader.readtext( + gray, + batch_size=options.easyocr_batch_size, + workers=options.easyocr_workers + ) except Exception as e: print(e) outputDict["output"] = "" @@ -140,6 +144,8 @@ def add_options(parser): "EasyOCR", "Advanced control of EasyOCR" ) easyocr_options.add_argument("--easyocr-no-gpu", action="store_false", dest="gpu") + easyocr_options.add_argument("--easyocr-batch-size", type=int, default=4) + easyocr_options.add_argument("--easyocr-workers", type=int, default=0) class EasyOCREngine(OcrEngine): From 29a5c5865280f321c81be589d036a052ba24b407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=ED=98=84=EC=9A=B0?= Date: Mon, 22 Jan 2024 14:24:28 +0900 Subject: [PATCH 3/6] feat: print traceback error message --- ocrmypdf_easyocr/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index b1fbc0e..c39f1c8 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -16,7 +16,7 @@ import pluggy from ocrmypdf import OcrEngine, hookimpl from ocrmypdf._exec import tesseract -import time +import traceback from ocrmypdf_easyocr._cv import detect_skew from ocrmypdf_easyocr._easyocr import tidy_easyocr_result @@ -116,7 +116,7 @@ def _ocrThread(q: multiprocessing.Queue[Task], options): workers=options.easyocr_workers ) except Exception as e: - print(e) + traceback.print_exception(e) outputDict["output"] = "" finally: event.set() From 6e0bf0af769953b6c8982eb9d5314635c811e836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=ED=98=84=EC=9A=B0?= Date: Mon, 22 Jan 2024 14:38:51 +0900 Subject: [PATCH 4/6] feat: use multiple gpu for devices w/ lots of VRAM --- ocrmypdf_easyocr/__init__.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index c39f1c8..e1fe827 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -7,7 +7,6 @@ import logging import os -import multiprocessing import multiprocessing.managers import threading @@ -95,10 +94,10 @@ Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] -def _ocrThread(q: multiprocessing.Queue[Task], options): +def _ocrProcess(q: multiprocessing.Queue[Task], options): reader: Optional[easyocr.Reader] = None - # TODO: signal _ocrThread to quit after OCR completes. + # TODO: signal _ocrProcess to quit after OCR completes. while True: (gray, outputDict, event) = q.get() @@ -112,8 +111,7 @@ def _ocrThread(q: multiprocessing.Queue[Task], options): reader = easyocr.Reader(languages, useGPU) outputDict["output"] = reader.readtext( gray, - batch_size=options.easyocr_batch_size, - workers=options.easyocr_workers + batch_size=options.easyocr_batch_size ) except Exception as e: traceback.print_exception(e) @@ -131,8 +129,14 @@ def initialize(plugin_manager: pluggy.PluginManager): def check_options(options): m = multiprocessing.Manager() q = multiprocessing.Queue(-1) - t = threading.Thread(target=_ocrThread, args=(q, options), daemon=True) - t.start() + ocrProcessList = [] + for _ in range(options.easyocr_workers): + t = multiprocessing.Process(target=_ocrProcess, args=(q, options), daemon=True) + t.start() + ocrProcessList.append(t) + + # TODO : proper cleanup code for `ocrProcessList` + options._easyocr_struct = { "manager": m, "queue": q @@ -145,7 +149,7 @@ def add_options(parser): ) easyocr_options.add_argument("--easyocr-no-gpu", action="store_false", dest="gpu") easyocr_options.add_argument("--easyocr-batch-size", type=int, default=4) - easyocr_options.add_argument("--easyocr-workers", type=int, default=0) + easyocr_options.add_argument("--easyocr-workers", type=int, default=1) class EasyOCREngine(OcrEngine): From 330a7a15f091b613907e1ce7670c68630dcec295 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 9 Nov 2023 23:08:17 -0800 Subject: [PATCH 5/6] Allow rendering word boxes for debug --- ocrmypdf_easyocr/__init__.py | 38 ++++++++++++++++++++---------------- ocrmypdf_easyocr/_pdf.py | 3 ++- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index e1fe827..810ef96 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -6,24 +6,23 @@ from __future__ import annotations import logging -import os import multiprocessing.managers +import os import threading +import traceback +from typing import Optional, Tuple import cv2 as cv import easyocr +import numpy.typing as npt import pluggy from ocrmypdf import OcrEngine, hookimpl from ocrmypdf._exec import tesseract -import traceback from ocrmypdf_easyocr._cv import detect_skew from ocrmypdf_easyocr._easyocr import tidy_easyocr_result from ocrmypdf_easyocr._pdf import easyocr_to_pikepdf -from typing import Optional, Tuple -import numpy.typing as npt - log = logging.getLogger(__name__) ISO_639_3_2: dict[str, str] = { @@ -94,6 +93,7 @@ Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] + def _ocrProcess(q: multiprocessing.Queue[Task], options): reader: Optional[easyocr.Reader] = None @@ -101,7 +101,6 @@ def _ocrProcess(q: multiprocessing.Queue[Task], options): while True: (gray, outputDict, event) = q.get() - # Init reader on first OCR attempt: Wait until `options` variable is fully initialized. # Note: `options` variable is on the same process with the main thread. try: @@ -110,8 +109,7 @@ def _ocrProcess(q: multiprocessing.Queue[Task], options): languages = [ISO_639_3_2[lang] for lang in options.languages] reader = easyocr.Reader(languages, useGPU) outputDict["output"] = reader.readtext( - gray, - batch_size=options.easyocr_batch_size + gray, batch_size=options.easyocr_batch_size ) except Exception as e: traceback.print_exception(e) @@ -137,19 +135,20 @@ def check_options(options): # TODO : proper cleanup code for `ocrProcessList` - options._easyocr_struct = { - "manager": m, - "queue": q - } + options._easyocr_struct = {"manager": m, "queue": q} + @hookimpl def add_options(parser): - easyocr_options = parser.add_argument_group( - "EasyOCR", "Advanced control of EasyOCR" - ) + easyocr_options = parser.add_argument_group("EasyOCR", "EasyOCR options") easyocr_options.add_argument("--easyocr-no-gpu", action="store_false", dest="gpu") easyocr_options.add_argument("--easyocr-batch-size", type=int, default=4) easyocr_options.add_argument("--easyocr-workers", type=int, default=1) + easyocr_options.add_argument( + "--easyocr-debug-suppress-images", + action="store_true", + dest="easyocr_debug_suppress_images", + ) class EasyOCREngine(OcrEngine): @@ -208,8 +207,13 @@ def generate_pdf(input_file, output_pdf, output_text, options): text = " ".join([result.text for result in results]) output_text.write_text(text) - # easyocr_to_pdf(input_file, 1.0, results, output_pdf) - easyocr_to_pikepdf(input_file, 1.0, results, output_pdf) + easyocr_to_pikepdf( + input_file, + 1.0, + results, + output_pdf, + boxes=options.easyocr_debug_suppress_images, + ) @hookimpl diff --git a/ocrmypdf_easyocr/_pdf.py b/ocrmypdf_easyocr/_pdf.py index a436207..92169b0 100644 --- a/ocrmypdf_easyocr/_pdf.py +++ b/ocrmypdf_easyocr/_pdf.py @@ -274,6 +274,7 @@ def easyocr_to_pikepdf( image_scale: float, results: Iterable[EasyOCRResult], output_pdf: Path, + boxes: bool, ): """Convert EasyOCR results to a PDF with text annotations (no images). @@ -302,7 +303,7 @@ def easyocr_to_pikepdf( Font=Dictionary({"/f-0-0": register_glyphlessfont(pdf)}) ) - cs = generate_text_content_stream(results, scale, height, boxes=False) + cs = generate_text_content_stream(results, scale, height, boxes=boxes) pdf.pages[0].Contents = pdf.make_stream(unparse_content_stream(cs)) pdf.save(output_pdf) From 2b925307f20ea67025d3bb2699ce7dbdda339427 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 29 Jan 2024 01:32:07 -0800 Subject: [PATCH 6/6] Implement child process cleanup & PEP8 --- ocrmypdf_easyocr/__init__.py | 77 ++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index 810ef96..ec34109 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -10,14 +10,16 @@ import os import threading import traceback -from typing import Optional, Tuple +from pathlib import Path +from typing import Optional, Sequence, Tuple import cv2 as cv import easyocr import numpy.typing as npt import pluggy -from ocrmypdf import OcrEngine, hookimpl +from ocrmypdf import Executor, OcrEngine, PdfContext, hookimpl from ocrmypdf._exec import tesseract +from ocrmypdf.builtin_plugins.optimize import optimize_pdf as default_optimize_pdf from ocrmypdf_easyocr._cv import detect_skew from ocrmypdf_easyocr._easyocr import tidy_easyocr_result @@ -91,29 +93,31 @@ "vie": "vi", } -Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] +Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] | None -def _ocrProcess(q: multiprocessing.Queue[Task], options): +def _ocr_process(q: multiprocessing.Queue[Task], options): reader: Optional[easyocr.Reader] = None - # TODO: signal _ocrProcess to quit after OCR completes. while True: - (gray, outputDict, event) = q.get() + message = q.get() + if message is None: + return # exit process + gray, output_dict, event = message # Init reader on first OCR attempt: Wait until `options` variable is fully initialized. # Note: `options` variable is on the same process with the main thread. try: if reader is None: - useGPU = options.gpu + use_gpu = options.gpu languages = [ISO_639_3_2[lang] for lang in options.languages] - reader = easyocr.Reader(languages, useGPU) - outputDict["output"] = reader.readtext( + reader = easyocr.Reader(languages, use_gpu) + output_dict["output"] = reader.readtext( gray, batch_size=options.easyocr_batch_size ) except Exception as e: traceback.print_exception(e) - outputDict["output"] = "" + output_dict["output"] = "" finally: event.set() @@ -123,19 +127,50 @@ def initialize(plugin_manager: pluggy.PluginManager): pass +class ProcessList: + def __init__(self, plist): + self.process_list = plist + + def __getstate__(self): + return [] + + @hookimpl def check_options(options): m = multiprocessing.Manager() q = multiprocessing.Queue(-1) - ocrProcessList = [] + ocr_process_list = [] for _ in range(options.easyocr_workers): - t = multiprocessing.Process(target=_ocrProcess, args=(q, options), daemon=True) + t = multiprocessing.Process(target=_ocr_process, args=(q, options), daemon=True) t.start() - ocrProcessList.append(t) - - # TODO : proper cleanup code for `ocrProcessList` + ocr_process_list.append(t) options._easyocr_struct = {"manager": m, "queue": q} + options._easyocr_plist = ProcessList(ocr_process_list) + + +@hookimpl +def optimize_pdf( + input_pdf: Path, + output_pdf: Path, + context: PdfContext, + executor: Executor, + linearize: bool, +) -> tuple[Path, Sequence[str]]: + options = context.options + for _ in range(options.easyocr_workers): + q = options._easyocr_struct["queue"] + q.put(None) # send stop message + for p in options._easyocr_plist.process_list: + p.join(3.0) # clean up child processes but don't wait forever + + return default_optimize_pdf( + input_pdf=input_pdf, + output_pdf=output_pdf, + context=context, + executor=executor, + linearize=linearize, + ) @hookimpl @@ -194,14 +229,14 @@ def generate_pdf(input_file, output_pdf, output_text, options): img = cv.imread(os.fspath(input_file)) gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) - s = options._easyocr_struct - manager: multiprocessing.managers.SyncManager = s["manager"] - queue: multiprocessing.Queue[Task] = s["queue"] - outputDict = manager.dict() + sync_data = options._easyocr_struct + manager: multiprocessing.managers.SyncManager = sync_data["manager"] + queue: multiprocessing.Queue[Task] = sync_data["queue"] + output_dict = manager.dict() event = manager.Event() - queue.put((gray, outputDict, event)) + queue.put((gray, output_dict, event)) event.wait() - raw_results = outputDict["output"] + raw_results = output_dict["output"] results = [tidy_easyocr_result(r) for r in raw_results] text = " ".join([result.text for result in results])