feat(Docling): prefetch model artifacts (#964)

Because - Some EasyOCR models are needed by Docling to transform PDF to Markdown. Without them, the first execution of the document component fails because the output starts with a "Downloading detection model, please wait..." message. - This also prevented test coverage for the Docling converter. - The `use-docling-converter` boolean parameter in the document operator is less open to change than an enum `converter` selector. This commit - Adds the EasyOCR models to the Docker images. - Corrects the integration test in the CI after [the latest changes in instill-core](instill-ai/instill-core#1172). - Replaces the `use-docling-converter` parameter with `converter`. The following changes are made to the `Dockerfile`: - `nobody:nogroup` needs to have a $HOME where the EasyOCR models will be placed (internally, this engine looks for the models in `~/.EasyOCR/model`). - The workdir (`/pipeline-backend`) is owned by `nobody:nogroup` in the dev image so we can run the coverage action without the root user.
instill-ai · Feb 4, 2025 · c9ff323 · c9ff323
1 parent 8ba570a
commit c9ff323
Show file tree

Hide file tree

Showing 14 changed files with 172 additions and 211 deletions.
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -37,24 +37,6 @@ jobs:
 
       - uses: actions/checkout@v3
 
-      - uses: FedericoCarboni/setup-ffmpeg@v3
-        id: setup-ffmpeg
-        with:
-          ffmpeg-version: 7.0.2
-
-      - name: Install onnxruntime library and headers
-        run: |
-          export ONNXRUNTIME_ROOT_PATH=$GITHUB_WORKSPACE/onnxruntime
-          LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name)
-          ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64")
-          wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
-          tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
-          mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH}
-          rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz
-          echo "LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
-          echo "LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib" >> $GITHUB_ENV
-          echo "C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include" >> $GITHUB_ENV
-
       - name: Generate coverage report
         run: |
           make build-dev

diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
@@ -88,6 +88,7 @@ jobs:
           EDITION=local-ce:test \
           RAY_LATEST_TAG=latest \
           RAY_RELEASE_TAG=${RAY_SERVER_VERSION} \
+          COMPONENT_ENV=.env.component-test \
           docker compose -f docker-compose.yml -f docker-compose-latest.yml up -d --quiet-pull
           COMPOSE_PROFILES=all \
           EDITION=local-ce:test \

diff --git a/Dockerfile b/Dockerfile
@@ -88,10 +88,33 @@ COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_RO
 ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
 RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/
 
+# Docling will need a $HOME-prefixed path to write cache files. We'll also put
+# the prefetched model artifacts there.
+ENV BASE_DOCLING_PATH=/home/nobody
+RUN mkdir -p ${BASE_DOCLING_PATH}/.EasyOCR/model && chown -R nobody:nogroup ${BASE_DOCLING_PATH}
+
+RUN apt update && \
+    apt install -y wget unzip && \
+    wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip && \
+    unzip latin_g2.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
+    rm latin_g2.zip && \
+    wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip && \
+    unzip craft_mlt_25k.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
+    rm craft_mlt_25k.zip && \
+    apt remove -y wget unzip && \
+    apt autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV DOCLING_ARTIFACTS_PATH=${BASE_DOCLING_PATH}/docling-artifacts
+RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
+RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
+RUN /opt/venv/bin/python import_artifacts.py && rm import_artifacts.py
+
 USER nobody:nogroup
 
 ARG SERVICE_NAME
 
+ENV HOME=${BASE_DOCLING_PATH}
 WORKDIR /${SERVICE_NAME}
 
 ENV GODEBUG=tlsrsakex=1
@@ -108,12 +131,3 @@ COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
 # Set up ONNX model and environment variable
 COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
 ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
-
-# Prefetch Docling models and set environment variable with the path to the
-# artifacts.
-ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling
-
-RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
-RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
-RUN /opt/venv/bin/python import_artifacts.py
-RUN rm import_artifacts.py
diff --git a/Dockerfile.dev b/Dockerfile.dev
@@ -12,6 +12,8 @@ ARG TARGETOS TARGETARCH K6_VERSION XK6_VERSION XK6_SQL_VERSION XK6_SQL_POSTGRES_
 # Install Python, create virtual environment, install pdfplumber and Docling
 RUN apt update && \
     apt install -y \
+    wget \
+    unzip \
     build-essential \
     xz-utils \
     python3 \
@@ -43,16 +45,14 @@ RUN FFMPEG_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "amd64") &
 # Install ONNX Runtime (latest release)
 ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
 RUN apt update && \
-    apt install -y wget jq && \
+    apt install -y jq && \
     LATEST_VERSION=$(wget -qO- https://api.github.com/repos/microsoft/onnxruntime/releases/latest | jq -r .tag_name) && \
     ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
     wget https://github.com/microsoft/onnxruntime/releases/download/${LATEST_VERSION}/onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
     tar -xzf onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
     mv onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
     rm onnxruntime-linux-${ONNX_ARCH}-${LATEST_VERSION#v}.tgz && \
-    apt remove -y wget jq && \
-    apt autoremove -y && \
-    rm -rf /var/lib/apt/lists/*
+    apt remove -y jq
 
 # Set environment variables and create symlinks for ONNX Runtime
 ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
@@ -90,15 +90,31 @@ ENV GODEBUG=tlsrsakex=1
 COPY ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
 ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx
 
-# Prefetch Docling models and set environment variable with the path to the
-# artifacts.
-ENV DOCLING_ARTIFACTS_PATH=/${SERVICE_NAME}/pkg/component/resources/docling
-
+# Docling will need a $HOME-prefixed path to write cache files. We'll also put
+# the prefetched model artifacts there.
+ENV BASE_DOCLING_PATH=/home/nobody
+RUN mkdir -p ${BASE_DOCLING_PATH}/.EasyOCR/model && chown -R nobody:nogroup ${BASE_DOCLING_PATH}
+RUN wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip && \
+    unzip latin_g2.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
+    rm latin_g2.zip
+RUN wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip && \
+    unzip craft_mlt_25k.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
+    rm craft_mlt_25k.zip
+
+ENV DOCLING_ARTIFACTS_PATH=${BASE_DOCLING_PATH}/docling-artifacts
 RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
 RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
-RUN /opt/venv/bin/python import_artifacts.py
-RUN rm import_artifacts.py
+RUN /opt/venv/bin/python import_artifacts.py && rm import_artifacts.py
+
+# Clean up apt
+RUN apt remove -y wget unzip && \
+    apt autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
+
+# USER needs write permissions on the workspace to run the coverage report.
+RUN chown -R nobody:nogroup /${SERVICE_NAME}
 
 USER nobody:nogroup
+ENV HOME=${BASE_DOCLING_PATH}
 
 ENTRYPOINT ["tail", "-f", "/dev/null"]
diff --git a/Makefile b/Makefile
@@ -69,16 +69,12 @@ dbtest-pre:
 coverage: ## Generate coverage report
 	@if [ "${DBTEST}" = "true" ]; then  make dbtest-pre; fi
 	@docker run --rm \
-		-v $(PWD):/${SERVICE_NAME} \
 		-e GOTEST_FLAGS="${GOTEST_FLAGS}" \
-		--user $(id -u):$(id -g) \
 		--entrypoint= \
 		instill/${SERVICE_NAME}:dev \
 			go test -v -race ${GOTEST_TAGS} -coverpkg=./... -coverprofile=coverage.out -covermode=atomic -timeout 30m ./...
 	@if [ "${HTML}" = "true" ]; then  \
 		docker run --rm \
-			-v $(PWD):/${SERVICE_NAME} \
-			--user $(id -u):$(id -g) \
 			--entrypoint= \
 			instill/${SERVICE_NAME}:dev \
 				go tool cover -func=coverage.out && \

diff --git a/pkg/component/operator/document/v0/config/tasks.yaml b/pkg/component/operator/document/v0/config/tasks.yaml
@@ -37,17 +37,18 @@ TASK_CONVERT_TO_MARKDOWN:
       resolution:
         $ref: '#/$defs/resolution'
         uiOrder: 4
-      use-docling-converter:
-        # TODO jvallesm: rather than using a boolean to switch between
-        # converters, use a converter selector.
-        default: false
+      converter:
+        title: Converter
         description: |
-          When converting PDF to Markdown, use `docling` instead of
-          `pdfplumber`. This converter is more resource-intensive but usually
-          yields more accurate results.
+          The conversion engine used in the transformation. For now, it only
+          applies to PDF to Markdown conversions. `pdfplumber` is quicker than
+          Docling, but it typically produces less accurate results.
+        type: string
+        default: pdfplumber
+        enum:
+          - pdfplumber
+          - docling
         uiOrder: 5
-        title: Use Docling to convert PDF to Markdown
-        type: boolean
     required:
       - document
     title: Input

diff --git a/pkg/component/operator/document/v0/convert_document_to_markdown.go b/pkg/component/operator/document/v0/convert_document_to_markdown.go
@@ -23,13 +23,13 @@ func (e *execution) convertDocumentToMarkdown(ctx context.Context, job *base.Job
 	if err != nil {
 		return err
 	}
-	transformerInputStruct := transformer.ConvertDocumentToMarkdownTransformerInput{
+	transformerInputStruct := transformer.ConvertDocumentToMarkdownInput{
 		Document:            dataURI.String(),
 		DisplayImageTag:     inputStruct.DisplayImageTag,
 		Filename:            inputStruct.Filename,
 		DisplayAllPageImage: inputStruct.DisplayAllPageImage,
 		Resolution:          inputStruct.Resolution,
-		UseDoclingConverter: inputStruct.UseDoclingConverter,
+		Converter:           inputStruct.Converter,
 	}
 
 	transformerOutputStruct, err := transformer.ConvertDocumentToMarkdown(&transformerInputStruct, e.getMarkdownTransformer)

diff --git a/pkg/component/operator/document/v0/convert_document_to_markdown_test.go b/pkg/component/operator/document/v0/convert_document_to_markdown_test.go
@@ -15,22 +15,36 @@ import (
 
 func TestConvertDocumentToMarkdown(t *testing.T) {
 	c := qt.New(t)
+	c.Parallel()
 
 	tests := []struct {
-		name     string
-		filepath string
-		expected ConvertDocumentToMarkdownOutput
+		name      string
+		filepath  string
+		converter string
+		expected  ConvertDocumentToMarkdownOutput
 	}{
 		{
-			name:     "Convert PDF file",
-			filepath: "testdata/test.pdf",
+			name:      "Convert PDF file - pdfplumber",
+			filepath:  "testdata/test.pdf",
+			converter: "pdfplumber",
 			expected: ConvertDocumentToMarkdownOutput{
 				Body:          "# This is test file for markdown\n",
 				Images:        []format.Image{},
 				AllPageImages: []format.Image{},
 				Markdowns:     []string{"# This is test file for markdown\n"},
 			},
 		},
+		{
+			name:      "Convert PDF file - Docling",
+			filepath:  "testdata/test.pdf",
+			converter: "docling",
+			expected: ConvertDocumentToMarkdownOutput{
+				Body:          "This is test file for markdown",
+				Images:        []format.Image{},
+				AllPageImages: []format.Image{},
+				Markdowns:     []string{"This is test file for markdown"},
+			},
+		},
 		{
 			name:     "Convert DOCX file",
 			filepath: "testdata/test.docx",
@@ -89,12 +103,12 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
 		},
 	}
 
-	bc := base.Component{}
-	ctx := context.Background()
-
 	for _, test := range tests {
 		c.Run(test.name, func(c *qt.C) {
-			component := Init(bc)
+			c.Parallel()
+
+			ctx := context.Background()
+			component := Init(base.Component{})
 			c.Assert(component, qt.IsNotNil)
 
 			execution, err := component.CreateExecution(base.ComponentExecution{
@@ -120,6 +134,7 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
 							return doc
 						}(),
 						DisplayImageTag: false,
+						Converter:       test.converter,
 					}
 				}
 				return nil

diff --git a/pkg/component/operator/document/v0/io.go b/pkg/component/operator/document/v0/io.go
@@ -10,7 +10,7 @@ type ConvertDocumentToMarkdownInput struct {
 	Filename            string          `instill:"filename"`
 	DisplayAllPageImage bool            `instill:"display-all-page-image,default=false"`
 	Resolution          int             `instill:"resolution,default=300"`
-	UseDoclingConverter bool            `instill:"use-docling-converter,default=false"`
+	Converter           string          `instill:"converter,default=pdfplumber"`
 }
 
 type ConvertDocumentToMarkdownOutput struct {

diff --git a/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py b/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py
@@ -9,7 +9,6 @@
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling_core.types.doc import ImageRefMode, PictureItem
 
-
 if __name__ == "__main__":
     json_str = sys.stdin.buffer.read().decode('utf-8')
     params = json.loads(json_str)