Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add zip handling #22

Merged
merged 8 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ It presently supports:
- Audio (EXIF metadata, and speech transcription)
- HTML (special handling of Wikipedia, etc.)
- Various other text-based formats (csv, json, xml, etc.)
- ZIP (Iterates over contents and converts each file)

# Installation

Expand Down
122 changes: 122 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import sys
import tempfile
import traceback
import zipfile
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse

Expand Down Expand Up @@ -836,6 +837,124 @@ def _get_mlm_description(self, local_path, extension, client, model, prompt=None
return response.choices[0].message.content


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.

The converter extracts the ZIP contents to a temporary directory, processes each file
using appropriate converters based on file extensions, and then combines the results
into a single markdown document. The temporary directory is cleaned up after processing.

Example output format:
```markdown
Content from the zip file `example.zip`:

## File: docs/readme.txt

This is the content of readme.txt
Multiple lines are preserved

## File: images/example.jpg

ImageSize: 1920x1080
DateTimeOriginal: 2024-02-15 14:30:00
Description: A beautiful landscape photo

## File: data/report.xlsx

## Sheet1
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| data1 | data2 | data3 |
| data4 | data5 | data6 |
```

Key features:
- Maintains original file structure in headings
- Processes nested files recursively
- Uses appropriate converters for each file type
- Preserves formatting of converted content
- Cleans up temporary files after processing
"""

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
return None

# Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
)

extracted_zip_folder_name = (
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
)
new_folder = os.path.normpath(
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
)
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"

# Safety check for path traversal
if not new_folder.startswith(os.path.dirname(local_path)):
return DocumentConverterResult(
title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
)

try:
# Extract the zip file
with zipfile.ZipFile(local_path, "r") as zipObj:
zipObj.extractall(path=new_folder)

# Process each extracted file
for root, dirs, files in os.walk(new_folder):
for name in files:
file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, new_folder)

# Get file extension
_, file_extension = os.path.splitext(name)

# Update kwargs for the file
file_kwargs = kwargs.copy()
file_kwargs["file_extension"] = file_extension
file_kwargs["_parent_converters"] = parent_converters

# Try converting the file using available converters
for converter in parent_converters:
# Skip the zip converter to avoid infinite recursion
if isinstance(converter, ZipConverter):
continue

result = converter.convert(file_path, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"
break

# Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True):
shutil.rmtree(new_folder)

return DocumentConverterResult(title=None, text_content=md_content.strip())

except zipfile.BadZipFile:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
)
except Exception as e:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
)


class FileConversionException(BaseException):
pass

Expand Down Expand Up @@ -879,6 +998,7 @@ def __init__(
self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter())
self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())

def convert(
self, source: Union[str, requests.Response], **kwargs: Any
Expand Down Expand Up @@ -1034,6 +1154,8 @@ def _convert(

if "mlm_model" not in _kwargs and self._mlm_model is not None:
_kwargs["mlm_model"] = self._mlm_model
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters

# If we hit an error log it and keep trying
try:
Expand Down
Binary file added tests/test_files/test_files.zip
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content

# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
for test_string in DOCX_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content

# Test Wikipedia processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
Expand Down
Loading