diff --git a/ocrmypdf_easyocr/__init__.py b/ocrmypdf_easyocr/__init__.py index 81218e2..ec34109 100644 --- a/ocrmypdf_easyocr/__init__.py +++ b/ocrmypdf_easyocr/__init__.py @@ -6,14 +6,20 @@ from __future__ import annotations import logging +import multiprocessing.managers import os -from multiprocessing import Semaphore +import threading +import traceback +from pathlib import Path +from typing import Optional, Sequence, Tuple import cv2 as cv import easyocr +import numpy.typing as npt import pluggy -from ocrmypdf import OcrEngine, hookimpl +from ocrmypdf import Executor, OcrEngine, PdfContext, hookimpl from ocrmypdf._exec import tesseract +from ocrmypdf.builtin_plugins.optimize import optimize_pdf as default_optimize_pdf from ocrmypdf_easyocr._cv import detect_skew from ocrmypdf_easyocr._easyocr import tidy_easyocr_result @@ -21,8 +27,6 @@ log = logging.getLogger(__name__) -GPU_SEMAPHORE = Semaphore(3) - ISO_639_3_2: dict[str, str] = { "afr": "af", "alb": "sq", @@ -89,16 +93,92 @@ "vie": "vi", } +Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] | None + + +def _ocr_process(q: multiprocessing.Queue[Task], options): + reader: Optional[easyocr.Reader] = None + + while True: + message = q.get() + if message is None: + return # exit process + gray, output_dict, event = message + + # Init reader on first OCR attempt: Wait until `options` variable is fully initialized. + # Note: `options` variable is on the same process with the main thread. + try: + if reader is None: + use_gpu = options.gpu + languages = [ISO_639_3_2[lang] for lang in options.languages] + reader = easyocr.Reader(languages, use_gpu) + output_dict["output"] = reader.readtext( + gray, batch_size=options.easyocr_batch_size + ) + except Exception as e: + traceback.print_exception(e) + output_dict["output"] = "" + finally: + event.set() + @hookimpl def initialize(plugin_manager: pluggy.PluginManager): pass +class ProcessList: + def __init__(self, plist): + self.process_list = plist + + def __getstate__(self): + return [] + + +@hookimpl +def check_options(options): + m = multiprocessing.Manager() + q = multiprocessing.Queue(-1) + ocr_process_list = [] + for _ in range(options.easyocr_workers): + t = multiprocessing.Process(target=_ocr_process, args=(q, options), daemon=True) + t.start() + ocr_process_list.append(t) + + options._easyocr_struct = {"manager": m, "queue": q} + options._easyocr_plist = ProcessList(ocr_process_list) + + +@hookimpl +def optimize_pdf( + input_pdf: Path, + output_pdf: Path, + context: PdfContext, + executor: Executor, + linearize: bool, +) -> tuple[Path, Sequence[str]]: + options = context.options + for _ in range(options.easyocr_workers): + q = options._easyocr_struct["queue"] + q.put(None) # send stop message + for p in options._easyocr_plist.process_list: + p.join(3.0) # clean up child processes but don't wait forever + + return default_optimize_pdf( + input_pdf=input_pdf, + output_pdf=output_pdf, + context=context, + executor=executor, + linearize=linearize, + ) + + @hookimpl def add_options(parser): easyocr_options = parser.add_argument_group("EasyOCR", "EasyOCR options") easyocr_options.add_argument("--easyocr-no-gpu", action="store_false", dest="gpu") + easyocr_options.add_argument("--easyocr-batch-size", type=int, default=4) + easyocr_options.add_argument("--easyocr-workers", type=int, default=1) easyocr_options.add_argument( "--easyocr-debug-suppress-images", action="store_true", @@ -146,15 +226,19 @@ def generate_hocr(input_file, output_hocr, output_text, options): @staticmethod def generate_pdf(input_file, output_pdf, output_text, options): - languages = [ISO_639_3_2[lang] for lang in options.languages] - img = cv.imread(os.fspath(input_file)) gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY) - with GPU_SEMAPHORE: - reader = easyocr.Reader(languages, gpu=options.gpu) - raw_results = reader.readtext(gray) - results = [tidy_easyocr_result(r) for r in raw_results] + sync_data = options._easyocr_struct + manager: multiprocessing.managers.SyncManager = sync_data["manager"] + queue: multiprocessing.Queue[Task] = sync_data["queue"] + output_dict = manager.dict() + event = manager.Event() + queue.put((gray, output_dict, event)) + event.wait() + raw_results = output_dict["output"] + + results = [tidy_easyocr_result(r) for r in raw_results] text = " ".join([result.text for result in results]) output_text.write_text(text)