Skip to content

Commit

Permalink
Merge pull request #3 from microsoft/add_cli
Browse files Browse the repository at this point in the history
Added a simple CLI.
  • Loading branch information
afourney authored Nov 14, 2024
2 parents 8a29572 + 2ad821a commit 851c7cf
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 9 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ Source = "https://github.com/microsoft/markitdown"
[tool.hatch.version]
path = "src/markitdown/__about__.py"

[project.scripts]
markitdown = "markitdown.__main__:main"

[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
Expand Down
42 changes: 42 additions & 0 deletions src/markitdown/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import sys
from ._markitdown import MarkItDown


def main():
    """Run the markitdown command-line interface.

    Behavior is selected by the number of entries in ``sys.argv``:

    * one entry (no arguments): the document is read from the binary
      standard-input buffer and converted with ``convert_stream``;
    * two entries (one argument): the argument is passed to ``convert``;
    * anything else: a usage message is written to standard error.

    On success the converted ``text_content`` is printed to standard output.

    Returns:
        int: ``0`` after a successful conversion, ``2`` when the arguments
        are invalid. Returning a non-zero status lets console-script
        wrappers (which pass the return value to ``sys.exit``) signal the
        misuse to the calling shell instead of exiting with status 0.
    """
    argc = len(sys.argv)

    # Guard clause: reject anything other than "no argument" or "one argument".
    if argc not in (1, 2):
        sys.stderr.write(
            """
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
""".strip()
            + "\n"
        )
        # 2 is the conventional exit status for command-line usage errors.
        return 2

    # The converter is only constructed once we know the invocation is valid.
    markitdown = MarkItDown()
    if argc == 2:
        result = markitdown.convert(sys.argv[1])
    else:
        # No filename given: consume raw bytes from stdin.
        result = markitdown.convert_stream(sys.stdin.buffer)

    print(result.text_content)
    return 0


if __name__ == "__main__":
    # Hand main()'s return value to sys.exit so that `python -m markitdown`
    # reports the same exit status as the installed console script, whose
    # generated wrapper also calls sys.exit(main()). A return value of None
    # still results in exit status 0.
    sys.exit(main())
27 changes: 18 additions & 9 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import subprocess
import sys
import tempfile
import traceback
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse

Expand Down Expand Up @@ -913,7 +914,9 @@ def convert_local(
# Get extension alternatives from the path and puremagic
base, ext = os.path.splitext(path)
self._append_ext(extensions, ext)
self._append_ext(extensions, self._guess_ext_magic(path))

for g in self._guess_ext_magic(path):
self._append_ext(extensions, g)

# Convert
return self._convert(path, extensions, **kwargs)
Expand All @@ -940,7 +943,8 @@ def convert_stream(
fh.close()

# Use puremagic to check for more extension options
self._append_ext(extensions, self._guess_ext_magic(temp_path))
for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)

# Convert
result = self._convert(temp_path, extensions, **kwargs)
Expand Down Expand Up @@ -1032,10 +1036,10 @@ def _convert(
_kwargs["mlm_model"] = self._mlm_model

# If we hit an error log it and keep trying
# try:
res = converter.convert(local_path, **_kwargs)
# except Exception:
# error_trace = ("\n\n" + traceback.format_exc()).strip()
try:
res = converter.convert(local_path, **_kwargs)
except Exception:
error_trace = ("\n\n" + traceback.format_exc()).strip()

if res is not None:
# Normalize the content
Expand Down Expand Up @@ -1074,10 +1078,15 @@ def _guess_ext_magic(self, path):
# Use puremagic to guess
try:
guesses = puremagic.magic_file(path)
if len(guesses) > 0:
ext = guesses[0].extension.strip()
extensions = list()
for g in guesses:
ext = g.extension.strip()
if len(ext) > 0:
return ext
if not ext.startswith("."):
ext = "." + ext
if ext not in extensions:
extensions.append(ext)
return extensions
except FileNotFoundError:
pass
except IsADirectoryError:
Expand Down

0 comments on commit 851c7cf

Please sign in to comment.