Skip to content

Commit

Permalink
feat: pdf backend, table mode as options and artifacts path (#203)
Browse files Browse the repository at this point in the history
* feat: add more options in the CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update CLI docs

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* expose artifacts-path as argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
dolfim-ibm authored Nov 4, 2024
1 parent af323c0 commit 40ad987
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 26 deletions.
38 changes: 34 additions & 4 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional
from typing import Annotated, Dict, Iterable, List, Optional, Type

import typer
from docling_core.utils.file import resolve_file_source

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
Expand All @@ -22,6 +25,7 @@
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
Expand Down Expand Up @@ -58,9 +62,10 @@ def version_callback(value: bool):


# Define an enum for the backend options
class Backend(str, Enum):
class PdfBackend(str, Enum):
PYPDFIUM2 = "pypdfium2"
DOCLING = "docling"
DLPARSE_V1 = "dlparse_v1"
DLPARSE_V2 = "dlparse_v2"


# Define an enum for the ocr engines
Expand Down Expand Up @@ -151,6 +156,17 @@ def convert(
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V1,
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
] = TableFormerMode.FAST,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
] = None,
abort_on_error: Annotated[
bool,
typer.Option(
Expand Down Expand Up @@ -217,11 +233,25 @@ def convert(
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.mode = table_mode

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

match pdf_backend:
case PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
case PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
case PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
case _:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend
backend=backend, # pdf_backend
)
}
doc_converter = DocumentConverter(
Expand Down
6 changes: 3 additions & 3 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from enum import Enum, auto
from enum import Enum
from pathlib import Path
from typing import List, Literal, Optional, Union

from pydantic import BaseModel, ConfigDict, Field


class TableFormerMode(str, Enum):
FAST = auto()
ACCURATE = auto()
FAST = "fast"
ACCURATE = "accurate"


class TableStructureOptions(BaseModel):
Expand Down
45 changes: 26 additions & 19 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru
```console
$ docling --help

Usage: docling [OPTIONS] source

Usage: docling [OPTIONS] source
╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │
│ [required] │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │
│ Defaults to all formats. │
│ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │
│ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │
│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: no-abort-on-error] │
│ --output PATH Output directory where results are saved. │
│ [default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │
│ Defaults to all formats. │
│ [default: None] │
│ --to [md|json|text|doctags] Specify output formats. Defaults to │
│ Markdown. │
│ [default: None] │
│ --ocr --no-ocr If enabled, the bitmap content will be │
│ processed using OCR. │
│ [default: ocr] │
│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │
│ [default: easyocr] │
│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │
│ [default: dlparse_v1] │
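│ --table-mode [fast|accurate] The mode to use in the table structure │
│ model. │
│ [default: fast] │
│ --artifacts-path PATH If provided, the location of the model │
│ artifacts. │
│ [default: None] │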
│ --abort-on-error --no-abort-on-error If enabled, the processing will be │
│ aborted when the first error is encountered. │
│ [default: no-abort-on-error] │
│ --output PATH Output directory where results are │
│ saved. │
│ [default: .] │
│ --version Show version information. │
│ --help Show this message and exit. │
╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
```
</details>
Expand Down

0 comments on commit 40ad987

Please sign in to comment.