Commit

Enhanced Vision Parser and LLM functionality and introduced new image processing capabilities with support for base64 and URL image modes

iamarunbrahma committed Dec 31, 2024
1 parent c138c1e commit 5bc5025
Showing 16 changed files with 1,438 additions and 230 deletions.
7 changes: 5 additions & 2 deletions Makefile
@@ -1,9 +1,12 @@
.PHONY: lint format test release tag
.PHONY: lint format test release tag format-nb

lint:
ruff check . --fix

format:
format-nb:
black --ipynb examples/*.ipynb

format: format-nb
black .

test:
57 changes: 37 additions & 20 deletions README.md
@@ -27,21 +27,35 @@ Vision Parse harnesses the power of Vision Language Models to revolutionize docu

### Installation

Install the package using pip (Recommended):
**Install the core package using pip (Recommended):**

```bash
pip install vision-parse
```

Install the optional dependencies for OpenAI or Gemini:
**Install the additional dependencies for OpenAI or Gemini:**

```bash
# For OpenAI support
pip install 'vision-parse[openai]'
```

```bash
# For Gemini support
pip install 'vision-parse[gemini]'
```

```bash
# To install all the additional dependencies
pip install 'vision-parse[all]'
```

**Install the package from source:**

```bash
pip install 'git+https://github.com/iamarunbrahma/vision-parse.git#egg=vision-parse[all]'
```

### Setting up Ollama (Optional)
See [examples/ollama_setup.md](examples/ollama_setup.md) on how to setup Ollama locally.

@@ -56,8 +70,10 @@ from vision_parse import VisionParser
parser = VisionParser(
model_name="llama3.2-vision:11b", # For local models, you don't need to provide the api key
temperature=0.4,
top_p=0.3,
extraction_complexity=False # Set to True for more detailed extraction
top_p=0.5,
image_mode="url", # Image mode can be "url", "base64" or None
detailed_extraction=False, # Set to True for more detailed extraction
enable_concurrency=False, # Set to True for parallel processing
)

# Convert PDF to markdown
@@ -69,26 +85,23 @@ for i, page_content in enumerate(markdown_pages):
print(f"\n--- Page {i+1} ---\n{page_content}")
```

### PDF Page Configuration
### Customized Ollama Configuration

```python
from vision_parse import VisionParser, PDFPageConfig

# Configure PDF processing settings
page_config = PDFPageConfig(
dpi=400,
color_space="RGB",
include_annotations=True,
preserve_transparency=False
)

# Initialize parser with custom page config
# Initialize parser with Ollama configuration
parser = VisionParser(
model_name="llama3.2-vision:11b",
temperature=0.7,
top_p=0.4,
extraction_complexity=False,
page_config=page_config
top_p=0.6,
image_mode="base64",
detailed_extraction=True,
ollama_config={
"OLLAMA_NUM_PARALLEL": "4",
},
enable_concurrency=True,
num_ctx=4096,
)

# Convert PDF to markdown
@@ -107,7 +120,9 @@ parser = VisionParser(
api_key="your-openai-api-key", # Get the OpenAI API key from https://platform.openai.com/api-keys
temperature=0.7,
top_p=0.4,
extraction_complexity=True # Set to True for more detailed extraction
image_mode="url",
detailed_extraction=True, # Set to True for more detailed extraction
enable_concurrency=True,
)

# Initialize parser with Google Gemini model
@@ -116,11 +131,13 @@ parser = VisionParser(
api_key="your-gemini-api-key", # Get the Gemini API key from https://aistudio.google.com/app/apikey
temperature=0.7,
top_p=0.4,
extraction_complexity=True # Set to True for more detailed extraction
image_mode="url",
detailed_extraction=True, # Set to True for more detailed extraction
enable_concurrency=True,
)
```

## Supported Models
## Supported Models

This package supports the following Vision LLM models:

3 changes: 2 additions & 1 deletion examples/gemini_demo.ipynb
@@ -100,7 +100,8 @@
" temperature=0.9,\n",
" top_p=0.4,\n",
" max_output_tokens=2048,\n",
" extraction_complexity=True\n",
" image_mode=\"url\",\n",
" detailed_extraction=True,\n",
")\n",
"\n",
"pdf_path = \"../tests/Texas-Holdem-Rules.pdf\"\n",
9 changes: 7 additions & 2 deletions examples/ollama_demo.ipynb
@@ -75,7 +75,12 @@
" top_p=0.4,\n",
" num_ctx=4096,\n",
" num_predict=4096,\n",
" extraction_complexity=True\n",
" ollama_config={\n",
" \"OLLAMA_NUM_PARALLEL\": \"10\",\n",
" },\n",
" image_mode=\"base64\",\n",
" detailed_extraction=True,\n",
" enable_concurrency=True,\n",
")\n",
"\n",
"pdf_path = \"../tests/Texas-Holdem-Rules.pdf\"\n",
@@ -102,7 +107,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
"version": "3.13.0"
}
},
"nbformat": 4,
6 changes: 4 additions & 2 deletions examples/openai_demo.ipynb
@@ -87,7 +87,9 @@
" top_p=0.4,\n",
" max_tokens=4096,\n",
" frequency_penalty=0.3,\n",
" extraction_complexity=True\n",
" image_mode=None,\n",
" detailed_extraction=True,\n",
" enable_concurrency=True,\n",
")\n",
"\n",
"pdf_path = \"../tests/Texas-Holdem-Rules.pdf\"\n",
@@ -115,7 +117,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
"version": "3.13.0"
}
},
"nbformat": 4,
11 changes: 8 additions & 3 deletions pyproject.toml
@@ -25,7 +25,10 @@ classifiers = [
]
dependencies = [
"jinja2>=3.0.0",
"nest-asyncio>=1.6.0",
"numpy>=2.0.0",
"ollama>=0.4.4",
"opencv-python>=4.10.0.84",
"pydantic>=2.0.0",
"pymupdf>=1.22.0",
"tenacity>=9.0.0",
@@ -39,15 +42,17 @@ Repository = "https://github.com/iamarunbrahma/vision-parse.git"
[project.optional-dependencies]
dev = [
"black>=24.4.1",
"black[jupyter]>=24.8.0",
"pytest>=8.3.4",
"pytest-asyncio>=0.23.5",
"ruff>=0.8.3",
]
openai = [
"openai==1.58.0",
]
gemini = [
"google-generativeai==0.8.3",
]
openai = [
"openai==1.58.0",
]
all = [
"google-generativeai==0.8.3",
"openai==1.58.0",
12 changes: 9 additions & 3 deletions src/vision_parse/__init__.py
@@ -1,17 +1,23 @@
from .parser import VisionParser, PDFPageConfig, VisionParserError, UnsupportedFileError
from .llm import LLMError, UnsupportedModelError
from importlib.metadata import version
from .utils import ImageExtractionError
from importlib.metadata import version, PackageNotFoundError
from .constants import SUPPORTED_MODELS

try:
__version__ = version("vision-parse")
except Exception:
__version__ = "0.1.0"
except PackageNotFoundError:
# Use a development version when package is not installed
__version__ = "0.0.0.dev0"

__all__ = [
"VisionParser",
"PDFPageConfig",
"ImageExtractionError",
"VisionParserError",
"UnsupportedFileError",
"UnsupportedModelError",
"LLMError",
"SUPPORTED_MODELS",
"__version__",
]
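
For orientation, a minimal sketch of using the public API that `__init__.py` now re-exports; this is illustrative only (not part of this commit) and assumes the package is importable, with `__version__` falling back to `0.0.0.dev0` when package metadata is unavailable:

```python
# Illustrative sketch -- not part of this commit.
# Assumes `vision-parse` is installed or otherwise importable.
from vision_parse import SUPPORTED_MODELS, __version__

print(f"vision-parse {__version__}")  # "0.0.0.dev0" when metadata is missing
print(f"{len(SUPPORTED_MODELS)} supported models: {sorted(SUPPORTED_MODELS)}")
```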
13 changes: 13 additions & 0 deletions src/vision_parse/constants.py
@@ -0,0 +1,13 @@
from typing import Dict

SUPPORTED_MODELS: Dict[str, str] = {
"llama3.2-vision:11b": "ollama",
"llama3.2-vision:70b": "ollama",
"llava:13b": "ollama",
"llava:34b": "ollama",
"gpt-4o": "openai",
"gpt-4o-mini": "openai",
"gemini-1.5-flash": "gemini",
"gemini-2.0-flash-exp": "gemini",
"gemini-1.5-pro": "gemini",
}
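
To show how this model-to-provider mapping might be consumed, here is a small sketch; the `resolve_provider` helper is hypothetical and not part of this commit:

```python
# Hypothetical helper -- not part of this commit -- illustrating how the
# model-name -> provider mapping in constants.py could drive routing.
from vision_parse.constants import SUPPORTED_MODELS


def resolve_provider(model_name: str) -> str:
    """Return the provider ("ollama", "openai" or "gemini") for a model name."""
    try:
        return SUPPORTED_MODELS[model_name]
    except KeyError:
        raise ValueError(
            f"Unsupported model: {model_name!r}. "
            f"Choose one of: {', '.join(sorted(SUPPORTED_MODELS))}"
        ) from None


print(resolve_provider("llama3.2-vision:11b"))  # -> "ollama"
print(resolve_provider("gemini-1.5-flash"))     # -> "gemini"
```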
14 changes: 13 additions & 1 deletion src/vision_parse/img_analysis.prompt
@@ -1,2 +1,14 @@
Analyze this image and return a detailed JSON description including any text detected, images detected, tables detected, extracted text and confidence score for the extracted text.
Confidence score for the extracted text should be a float value between 0 and 1. If you cannot determine certain details, leave those fields empty.
- Confidence score for the extracted text should be a float value between 0 and 1. If you cannot determine certain details, leave those fields empty or zero.
- Ensure markdown text formatting for extracted text is applied properly by analyzing the image.
- Please ensure that the JSON object is valid and all the fields are present in the response as below:

```json
{
"text_detected": "Yes" | "No",
"images_detected": "Yes" | "No",
"tables_detected": "Yes" | "No",
"extracted_text": "Extracted text from the image",
"confidence_score_text": "Confidence score for the extracted text"
}
```
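
Since `pydantic` is already a core dependency, a response that follows this schema could be validated with a model along these lines; the `ImageDescription` class below is an illustrative sketch, not code from this commit:

```python
# Illustrative sketch -- not part of this commit. Mirrors the JSON fields the
# prompt requests; pydantic is already a core dependency of vision-parse.
from typing import Literal

from pydantic import BaseModel, Field


class ImageDescription(BaseModel):
    text_detected: Literal["Yes", "No"]
    images_detected: Literal["Yes", "No"]
    tables_detected: Literal["Yes", "No"]
    extracted_text: str = ""
    confidence_score_text: float = Field(0.0, ge=0.0, le=1.0)


raw = """{
  "text_detected": "Yes",
  "images_detected": "No",
  "tables_detected": "No",
  "extracted_text": "# Heading\\n\\nSome body text.",
  "confidence_score_text": 0.92
}"""

description = ImageDescription.model_validate_json(raw)
print(description.extracted_text)
print(description.confidence_score_text)
```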