marqo-ai · pandu-k · Mar 16, 2023 · Mar 14, 2023 · Mar 14, 2023 · Mar 14, 2023
diff --git a/src/marqo/errors.py b/src/marqo/errors.py
@@ -59,7 +59,7 @@ def __str__(self) -> str:
 
 class MarqoWebError(Exception):
 
-    status_code: int = None
+    status_code: int = 500
     error_type: str = None
     message: str = None
     code: str = None

diff --git a/src/marqo/tensor_search/tensor_search.py b/src/marqo/tensor_search/tensor_search.py
@@ -452,8 +452,12 @@ def add_documents(config: Config, index_name: str, docs: List[dict], auto_refres
                     f"images for {batch_size} docs using {image_download_thread_count} threads ")
 
     if update_mode == 'replace' and use_existing_tensors:
-        # Get existing documents
-        doc_ids = [doc["_id"] for doc in docs if "_id" in doc]
+        doc_ids = []
+
+        # Walk the docs in reverse and collect each _id only once, so duplicate ids are not fetched twice.
+        for i in range(len(docs)-1, -1, -1):
+            if ("_id" in docs[i]) and (docs[i]["_id"] not in doc_ids):
+                doc_ids.append(docs[i]["_id"])
         existing_docs = _get_documents_for_upsert(config=config, index_name=index_name, document_ids=doc_ids)
 
     for i, doc in enumerate(docs):
@@ -493,6 +497,8 @@ def add_documents(config: Config, index_name: str, docs: List[dict], auto_refres
                 # have IDs:
                 elif len(matching_doc) == 0:
                     existing_doc = {"found": False}
+                else:
+                    raise errors.InternalError(message= f"Upsert: found {len(matching_doc)} matching docs for {doc_id} when only 1 or 0 should have been found.")
         else:
             indexing_instructions["update"]["_id"] = doc_id
 
@@ -915,7 +921,7 @@ def _get_documents_for_upsert(
         if len(result_list) == 0:
             continue
         if len(result_list) not in (2, 0):
-            raise errors.MarqoWebError(f"Bad request for existing documents. "
+            raise errors.InternalError(f"Internal error fetching old documents. "
                                        f"There are {len(result_list)} results for doc id {doc_id}.")
 
         for result in result_list:

diff --git a/tests/tensor_search/test_add_documents.py b/tests/tensor_search/test_add_documents.py
@@ -66,6 +66,52 @@ def test_add_plain_id_field(self):
                 "title 1": "content 1",
                 "desc 2": "content 2. blah blah blah"
             }
+
+    def test_add_documents_dupe_ids(self):
+        """ 
+        When several docs in one batch share an _id, only the last one should
+        be indexed. Make sure it isn't the first or a middle one.
+        """
+
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "3",
+                "title": "doc 3b"
+            },
+
+        ], auto_refresh=True)
+
+        doc_3_solo = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="3", show_vectors=True)
+
+        tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "1",
+                "title": "doc 1"
+            },
+            {
+                "_id": "2",
+                "title": "doc 2",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3a",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3b"
+            },
+
+        ], auto_refresh=True)
+
+        doc_3_duped = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="3", show_vectors=True)
+
+        self.assertEqual(doc_3_solo, doc_3_duped)
+
 
     def test_update_docs_update_chunks(self):
         """Updating a doc needs to update the corresponding chunks"

diff --git a/tests/tensor_search/test_add_documents_use_existing_tensors.py b/tests/tensor_search/test_add_documents_use_existing_tensors.py
@@ -20,7 +20,7 @@ def setUp(self) -> None:
         except IndexNotFoundError as s:
             pass
 
-    def test_use_existing_tensors_relience(self):
+    def test_use_existing_tensors_resilience(self):
         """should if one doc fails validation, the rest should still be inserted
         """
         d1 = {
@@ -59,7 +59,7 @@ def test_use_existing_tensors_no_id(self):
             assert item['result'] == 'created'
 
     def test_use_existing_tensors_non_existing(self):
-        """check parity between a doc created with and without use_existing_tensors,
+        """check parity between a doc created with and without use_existing_tensors, then overwritten,
         for a newly created doc.
         """
         tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
@@ -68,6 +68,7 @@ def test_use_existing_tensors_non_existing(self):
                 "title 1": "content 1",
                 "desc 2": "content 2. blah blah blah"
             }], auto_refresh=True, use_existing_tensors=False)
+
         regular_doc = tensor_search.get_document_by_id(
             config=self.config, index_name=self.index_name_1,
             document_id="123", show_vectors=True)
@@ -85,6 +86,100 @@ def test_use_existing_tensors_non_existing(self):
             document_id="123", show_vectors=True)
         self.assertEqual(use_existing_tensors_doc, regular_doc)
 
+        tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "123",
+                "title 1": "content 1",
+                "desc 2": "content 2. blah blah blah"
+            }], auto_refresh=True, use_existing_tensors=True)
+        overwritten_doc = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="123", show_vectors=True)
+
+        self.assertEqual(use_existing_tensors_doc, overwritten_doc)
+
+    def test_use_existing_tensors_dupe_ids(self):
+        """ 
+        With use_existing_tensors=True and duplicate _ids in one batch, only the
+        last doc with that _id should be indexed, not the first or a middle one.
+        """
+
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "3",
+                "title": "doc 3b"
+            },
+
+        ], auto_refresh=True)
+
+        doc_3_solo = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="3", show_vectors=True)
+
+        tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "1",
+                "title": "doc 1"
+            },
+            {
+                "_id": "2",
+                "title": "doc 2",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3a",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3b"
+            },
+
+        ], auto_refresh=True, use_existing_tensors=True)
+
+        doc_3_duped = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="3", show_vectors=True)
+
+        self.assertEqual(doc_3_solo, doc_3_duped)
+
+        tensor_search.delete_index(config=self.config, index_name=self.index_name_1)
+        tensor_search.add_documents(config=self.config, index_name=self.index_name_1, docs=[
+            {
+                "_id": "1",
+                "title": "doc 1"
+            },
+            {
+                "_id": "2",
+                "title": "doc 2",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3a",
+            },
+            {
+                "_id": "3",
+                "title": "doc 3b"
+            },
+
+        ], auto_refresh=True, use_existing_tensors=True)
+
+        doc_3_overwritten = tensor_search.get_document_by_id(
+            config=self.config, index_name=self.index_name_1,
+            document_id="3", show_vectors=True)
+
+        # Needs to be 3b, not 3a
+        self.assertEqual(doc_3_duped, doc_3_overwritten)
+
+    def test_use_existing_tensors_untensorize_something(self):
+        """
+        TODO: implement this test.
+        During the initial index, one field is a tensor field.
+        When we insert the doc again with use_existing_tensors, we make it a non-tensor field.
+        """
+
     def test_use_existing_tensors_getting_non_tensorised(self):
         """
         During the initial index, one field is set as a non_tensor_field.