From 36d23e843fa2eeb446514af6c6f00fb742bd2086 Mon Sep 17 00:00:00 2001
From: Maya <maya17grd@gmail.com>
Date: Tue, 30 Nov 2021 23:05:22 +0300
Subject: [PATCH 1/3] Fix deletion of the original pdf file when using share

---
 cvat/apps/engine/media_extractors.py | 21 +++++++++++----------
 cvat/apps/engine/task.py             | 22 ++++++++++++----------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py
index 109fd7bad861..4a68bdca0659 100644
--- a/cvat/apps/engine/media_extractors.py
+++ b/cvat/apps/engine/media_extractors.py
@@ -179,14 +179,14 @@ def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionT
         )
 
 class ArchiveReader(DirectoryReader):
-    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D):
+    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D, extract_dir=None):
         self._archive_source = source_path[0]
-        extract_dir = source_path[1] if len(source_path) > 1 else os.path.dirname(source_path[0])
-        Archive(self._archive_source).extractall(extract_dir)
-        if extract_dir == os.path.dirname(source_path[0]):
+        tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
+        Archive(self._archive_source).extractall(tmp_dir)
+        if not extract_dir:
             os.remove(self._archive_source)
         super().__init__(
-            source_path=[extract_dir],
+            source_path=[tmp_dir],
             step=step,
             start=start,
             stop=stop,
@@ -194,7 +194,7 @@ def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionT
         )
 
 class PdfReader(ImageListReader):
-    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D):
+    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D, extract_dir=None):
         if not source_path:
             raise Exception('No PDF found')
 
@@ -207,7 +207,7 @@ def _make_name():
                 yield '{}{:09d}.jpeg'.format(_basename, page_num)
 
         from pdf2image import convert_from_path
-        self._tmp_dir = os.path.dirname(source_path[0])
+        self._tmp_dir = extract_dir if extract_dir else os.path.dirname(source_path[0])
         os.makedirs(self._tmp_dir, exist_ok=True)
 
         # Avoid OOM: https://github.com/openvinotoolkit/cvat/issues/940
@@ -215,7 +215,8 @@ def _make_name():
             last_page=stop, paths_only=True,
             output_folder=self._tmp_dir, fmt="jpeg", output_file=_make_name())
 
-        os.remove(source_path[0])
+        if not extract_dir:
+            os.remove(source_path[0])
 
         super().__init__(
             source_path=paths,
@@ -226,9 +227,9 @@ def _make_name():
         )
 
 class ZipReader(ImageListReader):
-    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D):
+    def __init__(self, source_path, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D, extract_dir=None):
         self._zip_source = zipfile.ZipFile(source_path[0], mode='r')
-        self.extract_dir = source_path[1] if len(source_path) > 1 else None
+        self.extract_dir = extract_dir
         file_list = [f for f in self._zip_source.namelist() if files_to_ignore(f) and get_mime(f) == 'image']
         super().__init__(file_list, step=step, start=start, stop=stop, dimension=dimension)
 
diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py
index 2003d541c559..de34b6b954aa 100644
--- a/cvat/apps/engine/task.py
+++ b/cvat/apps/engine/task.py
@@ -293,22 +293,24 @@ def _create_thread(tid, data, isImport=False):
             if extractor is not None:
                 raise Exception('Combined data types are not supported')
             source_paths=[os.path.join(upload_dir, f) for f in media_files]
-            if media_type in {'archive', 'zip'} and db_data.storage == models.StorageChoice.SHARE:
-                source_paths.append(db_data.get_upload_dirname())
-                upload_dir = db_data.get_upload_dirname()
-                db_data.storage = models.StorageChoice.LOCAL
             if isImport and media_type == 'image' and db_data.storage == models.StorageChoice.SHARE:
                 manifest_index = _get_manifest_frame_indexer(db_data.start_frame, db_data.get_frame_step())
                 db_data.start_frame = 0
                 data['stop_frame'] = None
                 db_data.frame_filter = ''
 
-            extractor = MEDIA_TYPES[media_type]['extractor'](
-                source_path=source_paths,
-                step=db_data.get_frame_step(),
-                start=db_data.start_frame,
-                stop=data['stop_frame'],
-            )
+            details = {
+                'source_path': source_paths,
+                'step': db_data.get_frame_step(),
+                'start': db_data.start_frame,
+                'stop': data['stop_frame'],
+            }
+            if media_type in {'archive', 'zip', 'pdf'} and db_data.storage == models.StorageChoice.SHARE:
+                details['extract_dir'] = db_data.get_upload_dirname()
+                upload_dir = db_data.get_upload_dirname()
+                db_data.storage = models.StorageChoice.LOCAL
+
+            extractor = MEDIA_TYPES[media_type]['extractor'](**details)
 
 
     validate_dimension = ValidateDimension()

From cdf16b2d64384f64376ccc95bd1089fbcb2ea206 Mon Sep 17 00:00:00 2001
From: Maya <maya17grd@gmail.com>
Date: Mon, 13 Dec 2021 13:16:50 +0300
Subject: [PATCH 2/3] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af4c55bc2430..df03aa425501 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Order of labels in tasks and projects (<https://github.com/openvinotoolkit/cvat/pull/3987>)
 - Fixed task creating with large files via webpage (<https://github.com/openvinotoolkit/cvat/pull/3692>)
 - Added information to export CVAT_HOST when performing local installation for accessing over network (<https://github.com/openvinotoolkit/cvat/pull/4014>)
+- Original pdf file is deleted when using share (<https://github.com/openvinotoolkit/cvat/pull/3967>)
 
 ### Security
 - TDB

From 9e320396b7287d9e89fc16383d091c9f72500faa Mon Sep 17 00:00:00 2001
From: Maya <maya17grd@gmail.com>
Date: Thu, 16 Dec 2021 11:01:44 +0300
Subject: [PATCH 3/3] Add test

---
 cvat/apps/engine/tests/test_rest_api.py | 34 ++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/cvat/apps/engine/tests/test_rest_api.py b/cvat/apps/engine/tests/test_rest_api.py
index f5499c62cb41..20b2ee06daa3 100644
--- a/cvat/apps/engine/tests/test_rest_api.py
+++ b/cvat/apps/engine/tests/test_rest_api.py
@@ -2767,6 +2767,13 @@ def setUpClass(cls):
         shutil.rmtree(root_path)
         cls._image_sizes[filename] = image_sizes
 
+        file_name = 'test_1.pdf'
+        path = os.path.join(settings.SHARE_ROOT, file_name)
+        img_sizes, data = generate_pdf_file(file_name, page_count=5)
+        with open(path, "wb") as pdf_file:
+            pdf_file.write(data.read())
+        cls._image_sizes[file_name] = img_sizes
+
         generate_manifest_file(data_type='video', manifest_path=os.path.join(settings.SHARE_ROOT, 'videos', 'manifest.jsonl'),
             sources=[os.path.join(settings.SHARE_ROOT, 'videos', 'test_video_1.mp4')])
 
@@ -2804,6 +2811,9 @@ def tearDownClass(cls):
         path = os.path.join(settings.SHARE_ROOT, "manifest.jsonl")
         os.remove(path)
 
+        path = os.path.join(settings.SHARE_ROOT, "test_1.pdf")
+        os.remove(path)
+
     def _run_api_v1_tasks_id_data_post(self, tid, user, data):
         with ForceLogin(user, self.client):
             response = self.client.post('/api/v1/tasks/{}/data'.format(tid),
@@ -2886,10 +2896,12 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
             db_data = Task.objects.get(pk=task_id).data
             self.assertEqual(expected_storage_method, db_data.storage_method)
             self.assertEqual(expected_uploaded_data_location, db_data.storage)
-            # check if used share without copying inside and files doesn`t exist in ../raw/
+            # when share is used without copying, check that the files don't exist in ../raw/ but still exist in share
             if expected_uploaded_data_location is StorageChoice.SHARE:
-                self.assertEqual(False,
-                    os.path.exists(os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))))
+                raw_file_path = os.path.join(db_data.get_upload_dirname(), next(iter(data.values())))
+                share_file_path = os.path.join(settings.SHARE_ROOT, next(iter(data.values())))
+                self.assertEqual(False, os.path.exists(raw_file_path))
+                self.assertEqual(True, os.path.exists(share_file_path))
 
         # check preview
         response = self._get_preview(task_id, user)
@@ -2956,6 +2968,10 @@ def _test_api_v1_tasks_id_data_spec(self, user, spec, data, expected_compressed_
                 for f in source_files:
                     if zipfile.is_zipfile(f):
                         source_images.extend(self._extract_zip_chunk(f, dimension=dimension))
+                    elif isinstance(f, str) and f.endswith('.pdf'):
+                        with open(f, 'rb') as pdf_file:
+                            source_images.extend(convert_from_bytes(pdf_file.read(),
+                                fmt='png'))
                     elif isinstance(f, io.BytesIO) and \
                             str(getattr(f, 'name', None)).endswith('.pdf'):
                         source_images.extend(convert_from_bytes(f.getvalue(),
@@ -3475,6 +3491,18 @@ def _test_api_v1_tasks_id_data(self, user):
         self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
             image_sizes, StorageMethodChoice.CACHE, StorageChoice.SHARE)
 
+        task_spec.update([('name', 'task pdf in the shared folder #30')])
+        task_data = {
+            "server_files[0]": "test_1.pdf",
+            "image_quality": 70,
+            "copy_data": False,
+            "use_cache": True,
+        }
+        image_sizes = self._image_sizes[task_data["server_files[0]"]]
+
+        self._test_api_v1_tasks_id_data_spec(user, task_spec, task_data, self.ChunkType.IMAGESET, self.ChunkType.IMAGESET,
+            image_sizes, StorageMethodChoice.CACHE, StorageChoice.LOCAL)
+
     def test_api_v1_tasks_id_data_admin(self):
         self._test_api_v1_tasks_id_data(self.admin)