Skip to content

Commit

Permalink
added in pdf extractor (#557)
Browse files Browse the repository at this point in the history
  • Loading branch information
benhoff authored and nmanovic committed Jul 11, 2019
1 parent 9651a19 commit ccbbf33
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Auto annotation using Faster R-CNN with Inception v2 (utils/open_model_zoo)
- Auto annotation using Pixel Link mobilenet v2 - text detection (utils/open_model_zoo)
- Ability to create custom extractors for unsupported media types
- PDF extractor (each page of a PDF file is converted to an image)

### Changed
- Outside and keyframe buttons in the side panel for all interpolation shapes (they were only for boxes before)
Expand Down
3 changes: 3 additions & 0 deletions cvat/apps/engine/media.mimetypes
Original file line number Diff line number Diff line change
Expand Up @@ -220,3 +220,6 @@ application/x-tarz tar.z
application/x-tzo tar.lzo
application/x-xz-compressed-tar txz
application/zip zip

# PDF
application/pdf pdf
60 changes: 60 additions & 0 deletions cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,56 @@ def save_image(self, k, dest_path):
image.close()
return width, height

class PDFExtractor(MediaExtractor):
    """Extractor that renders every page of a PDF file to a JPEG image.

    The pages are written to a private temporary directory when the
    extractor is constructed; the directory is removed again when the
    extractor is garbage collected. Page ``k`` of the document is stored
    as ``<pdf basename><k>.jpg`` (zero-based page index).
    """

    # Class-level default so that __del__ stays safe when __init__ raises
    # before the temporary directory has been created.
    _temp_directory = None

    def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0):
        """Convert the PDF into per-page JPEG files.

        Args:
            source_path: Non-empty sequence of paths; only the first entry
                is used as the PDF to convert.
            dest_path: Forwarded unchanged to the base class.
            image_quality: Forwarded unchanged to the base class.
            step, start, stop: Accepted for interface compatibility only;
                fixed values (1, 0, 0) are forwarded to the base class.

        Raises:
            Exception: If ``source_path`` is empty.
        """
        if not source_path:
            raise Exception('No PDF found')

        # Imported lazily so that the module can be loaded without pdf2image.
        from pdf2image import convert_from_path
        self._temp_directory = tempfile.mkdtemp(prefix='cvat-')
        super().__init__(
            source_path=source_path[0],
            dest_path=dest_path,
            image_quality=image_quality,
            step=1,
            start=0,
            stop=0,
        )

        # NOTE(review): self._source_path is presumably set by the base
        # class constructor from the source_path argument - confirm.
        self._basename = os.path.splitext(os.path.basename(self._source_path))[0]
        self._dimensions = []
        for page_num, page in enumerate(convert_from_path(self._source_path)):
            self._dimensions.append(page.size)
            page.save(self._get_imagepath(page_num), 'JPEG')

        # One dimensions entry is recorded per saved page, so there is no
        # need to list the temporary directory again.
        self._length = len(self._dimensions)

    def _get_imagepath(self, k):
        """Return the path of the JPEG file that holds page ``k``."""
        return os.path.join(self._temp_directory, f'{self._basename}{k}.jpg')

    def __iter__(self):
        """Yield the image path of every page in page order."""
        for page_num in range(self._length):
            yield self._get_imagepath(page_num)

    def __del__(self):
        """Remove the temporary directory holding the rendered pages."""
        if self._temp_directory:
            # A finalizer must not raise: ignore a directory that is already
            # gone or cannot be removed completely.
            shutil.rmtree(self._temp_directory, ignore_errors=True)
            self._temp_directory = None

    def __getitem__(self, k):
        """Return the image path of page ``k``."""
        return self._get_imagepath(k)

    def __len__(self):
        """Return the number of pages extracted from the PDF."""
        return self._length

    def save_image(self, k, dest_path):
        """Copy the image of page ``k`` to ``dest_path``.

        Returns:
            The ``size`` value recorded for the page when it was rendered.
        """
        shutil.copyfile(self[k], dest_path)
        return self._dimensions[k]

# Note: step, start and stop have no effect
class DirectoryExtractor(ImageListExtractor):
def __init__(self, source_path, dest_path, image_quality, step=1, start=0, stop=0):
Expand Down Expand Up @@ -180,6 +230,10 @@ def _is_image(path):
def _is_dir(path):
return os.path.isdir(path)

def _is_pdf(path):
mime = mimetypes.guess_type(path)
return mime[0] == 'application/pdf'

# 'has_mime_type': function receives 1 argument - path to file.
# Should return True if file has specified media type.
# 'extractor': class that extracts images from specified media.
Expand Down Expand Up @@ -213,4 +267,10 @@ def _is_dir(path):
'mode': 'annotation',
'unique': False,
},
'pdf': {
'has_mime_type': _is_pdf,
'extractor': PDFExtractor,
'mode': 'annotation',
'unique': True,
},
}
1 change: 1 addition & 0 deletions cvat/requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ djangorestframework==3.9.1
Pygments==2.3.1
drf-yasg==1.15.0
Shapely==1.6.4.post2
pdf2image==1.6.0

0 comments on commit ccbbf33

Please sign in to comment.