Skip to content

Commit

Permalink
feat(Docling): prefetch model artifacts (#964)
Browse files Browse the repository at this point in the history
Because

- Some EasyOCR models are needed by Docling to transform PDF to
Markdown. Without them, the first execution of the document component
fails because the output starts with a "Downloading detection model,
please wait..." print.
- This also prevented coverage for the Docling converter.
- The `use-docling` parameter in the document operator is less open to
changes than an enum `converter` selector.

This commit

- Adds the EasyOCR models to the Docker images.
- Corrects the integration test in the CI after [the latest changes in
instill-core](instill-ai/instill-core#1172).
- Replaces the `use-docling` parameter with `converter`.

The following changes are made on the `Dockerfile`:
- `nobody:nogroup` needs to have a $HOME where the EasyOCR models will
be placed (internally, this engine looks for the models in
`~/.EasyOCR/model`).
- The workdir (`/pipeline-backend`) is owned by `nobody:nogroup` in the
dev image so we can run the coverage action without the root user.
  • Loading branch information
jvallesm authored Feb 4, 2025
1 parent 8ba570a commit c9ff323
Show file tree
Hide file tree
Showing 14 changed files with 172 additions and 211 deletions.
18 changes: 0 additions & 18 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,6 @@ jobs:

- uses: actions/checkout@v3

- uses: FedericoCarboni/setup-ffmpeg@v3
id: setup-ffmpeg
with:
ffmpeg-version: 7.0.2

- name: Install onnxruntime library and headers
run: |
export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
- name: Generate coverage report
run: |
make build-dev
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ jobs:
EDITION=local-ce:test \
RAY_LATEST_TAG=latest \
RAY_RELEASE_TAG=${RAY_SERVER_VERSION} \
COMPONENT_ENV=.env.component-test \
docker compose -f docker-compose.yml -f docker-compose-latest.yml up -d --quiet-pull
COMPOSE_PROFILES=all \
EDITION=local-ce:test \
Expand Down
32 changes: 23 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,33 @@ COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_RO
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/

# Docling will need a $HOME-prefixed path to write cache files. We'll also put
# the prefetched model artifacts there.
ENV BASE_DOCLING_PATH=/home/nobody
RUN mkdir -p ${BASE_DOCLING_PATH}/.EasyOCR/model && chown -R nobody:nogroup ${BASE_DOCLING_PATH}

RUN apt update && \
apt install -y wget unzip && \
wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip && \
unzip latin_g2.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm latin_g2.zip && \
wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip && \
unzip craft_mlt_25k.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm craft_mlt_25k.zip && \
apt remove -y wget unzip && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*

ENV DOCLING_ARTIFACTS_PATH=${BASE_DOCLING_PATH}/docling-artifacts
RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
RUN /opt/venv/bin/python import_artifacts.py && rm import_artifacts.py

USER nobody:nogroup

ARG SERVICE_NAME

ENV HOME=${BASE_DOCLING_PATH}
WORKDIR /${SERVICE_NAME}

ENV GODEBUG=tlsrsakex=1
Expand All @@ -108,12 +131,3 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
# Set up ONNX model and environment variable
COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx

# Prefetch Docling models and set environment variable with the path to the
# artifacts.
ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling

RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
RUN /opt/venv/bin/python import_artifacts.py
RUN rm import_artifacts.py
36 changes: 26 additions & 10 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION XK6_SQL_VERSION XK6_SQL_POSTGRES_
# Install Python, create virtual environment, install pdfplumber and Docling
RUN apt update && \
apt install -y \
wget \
unzip \
build-essential \
xz-utils \
python3 \
Expand Down Expand Up @@ -43,16 +45,14 @@ RUN FFMPEG_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "amd64") &
# Install ONNX Runtime (latest release)
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
RUN apt update && \
apt install -y wget jq && \
apt install -y jq && \
LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
apt remove -y wget jq && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*
apt remove -y jq

# Set environment variables and create symlinks for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
Expand Down Expand Up @@ -90,15 +90,31 @@ ENV GODEBUG=tlsrsakex=1
COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx

# Prefetch Docling models and set environment variable with the path to the
# artifacts.
ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling

# Docling will need a $HOME-prefixed path to write cache files. We'll also put
# the prefetched model artifacts there.
ENV BASE_DOCLING_PATH=/home/nobody
RUN mkdir -p ${BASE_DOCLING_PATH}/.EasyOCR/model && chown -R nobody:nogroup ${BASE_DOCLING_PATH}
RUN wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip && \
unzip latin_g2.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm latin_g2.zip
RUN wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip && \
unzip craft_mlt_25k.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm craft_mlt_25k.zip

ENV DOCLING_ARTIFACTS_PATH=${BASE_DOCLING_PATH}/docling-artifacts
RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
RUN /opt/venv/bin/python import_artifacts.py
RUN rm import_artifacts.py
RUN /opt/venv/bin/python import_artifacts.py && rm import_artifacts.py

# Clean up apt
RUN apt remove -y wget unzip && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*

# USER needs write permissions on the workspace to run the coverage report.
RUN chown -R nobody:nogroup /${SERVICE_NAME}

USER nobody:nogroup
ENV HOME=${BASE_DOCLING_PATH}

ENTRYPOINT ["tail", "-f", "/dev/null"]
4 changes: 0 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,12 @@ dbtest-pre:
coverage: ## Generate coverage report
@if [ "${DBTEST}" = "true" ]; then make dbtest-pre; fi
@docker run --rm \
-v $(PWD):/${SERVICE_NAME} \
-e GOTEST_FLAGS="${GOTEST_FLAGS}" \
--user $(id -u):$(id -g) \
--entrypoint= \
instill/${SERVICE_NAME}:dev \
go test -v -race ${GOTEST_TAGS} -coverpkg=./... -coverprofile=coverage.out -covermode=atomic -timeout 30m ./...
@if [ "${HTML}" = "true" ]; then \
docker run --rm \
-v $(PWD):/${SERVICE_NAME} \
--user $(id -u):$(id -g) \
--entrypoint= \
instill/${SERVICE_NAME}:dev \
go tool cover -func=coverage.out && \
Expand Down
19 changes: 10 additions & 9 deletions pkg/component/operator/document/v0/config/tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,18 @@ TASK_CONVERT_TO_MARKDOWN:
resolution:
$ref: '#/$defs/resolution'
uiOrder: 4
use-docling-converter:
# TODO jvallesm: rather than using a boolean to switch between
# converters, use a converter selector.
default: false
converter:
title: Converter
description: |
When converting PDF to Markdown, use `docling` instead of
`pdfplumber`. This converter is more resource-intensive but usually
yields more accurate results.
The conversion engine used in the transformation. For now, it only
applies to PDF to Markdown conversions. `pdfplumber` is quicker than
Docling, but it typically produces less accurate results.
type: string
default: pdfplumber
enum:
- pdfplumber
- docling
uiOrder: 5
title: Use Docling to convert PDF to Markdown
type: boolean
required:
- document
title: Input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ func (e *execution) convertDocumentToMarkdown(ctx context.Context, job *base.Job
if err != nil {
return err
}
transformerInputStruct := transformer.ConvertDocumentToMarkdownTransformerInput{
transformerInputStruct := transformer.ConvertDocumentToMarkdownInput{
Document: dataURI.String(),
DisplayImageTag: inputStruct.DisplayImageTag,
Filename: inputStruct.Filename,
DisplayAllPageImage: inputStruct.DisplayAllPageImage,
Resolution: inputStruct.Resolution,
UseDoclingConverter: inputStruct.UseDoclingConverter,
Converter: inputStruct.Converter,
}

transformerOutputStruct, err := transformer.ConvertDocumentToMarkdown(&transformerInputStruct, e.getMarkdownTransformer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,36 @@ import (

func TestConvertDocumentToMarkdown(t *testing.T) {
c := qt.New(t)
c.Parallel()

tests := []struct {
name string
filepath string
expected ConvertDocumentToMarkdownOutput
name string
filepath string
converter string
expected ConvertDocumentToMarkdownOutput
}{
{
name: "Convert PDF file",
filepath: "testdata/test.pdf",
name: "Convert PDF file - pdfplumber",
filepath: "testdata/test.pdf",
converter: "pdfplumber",
expected: ConvertDocumentToMarkdownOutput{
Body: "# This is test file for markdown\n",
Images: []format.Image{},
AllPageImages: []format.Image{},
Markdowns: []string{"# This is test file for markdown\n"},
},
},
{
name: "Convert PDF file - Docling",
filepath: "testdata/test.pdf",
converter: "docling",
expected: ConvertDocumentToMarkdownOutput{
Body: "This is test file for markdown",
Images: []format.Image{},
AllPageImages: []format.Image{},
Markdowns: []string{"This is test file for markdown"},
},
},
{
name: "Convert DOCX file",
filepath: "testdata/test.docx",
Expand Down Expand Up @@ -89,12 +103,12 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
},
}

bc := base.Component{}
ctx := context.Background()

for _, test := range tests {
c.Run(test.name, func(c *qt.C) {
component := Init(bc)
c.Parallel()

ctx := context.Background()
component := Init(base.Component{})
c.Assert(component, qt.IsNotNil)

execution, err := component.CreateExecution(base.ComponentExecution{
Expand All @@ -120,6 +134,7 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
return doc
}(),
DisplayImageTag: false,
Converter: test.converter,
}
}
return nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/component/operator/document/v0/io.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ type ConvertDocumentToMarkdownInput struct {
Filename string `instill:"filename"`
DisplayAllPageImage bool `instill:"display-all-page-image,default=false"`
Resolution int `instill:"resolution,default=300"`
UseDoclingConverter bool `instill:"use-docling-converter,default=false"`
Converter string `instill:"converter,default=pdfplumber"`
}

type ConvertDocumentToMarkdownOutput struct {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode, PictureItem


if __name__ == "__main__":
json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
Expand Down
Loading

0 comments on commit c9ff323

Please sign in to comment.