From e8fda700edd6f5bc5e6e4f57b7aa163a8adb11be Mon Sep 17 00:00:00 2001 From: Joseph McElroy Date: Fri, 28 Jul 2023 19:37:10 +0100 Subject: [PATCH] Elasticsearch datastore support (#343) * added es datastore * updated readmy and toml * updated readme * clean up code + implemented a few more methods * add tests + fix issues found * update documentation * update notebook * clean up notebook --------- Co-authored-by: Sebastian Montero --- README.md | 32 +- datastore/factory.py | 10 +- .../providers/elasticsearch_datastore.py | 351 +++++++++ docs/providers/elasticsearch/setup.md | 37 + examples/docker/elasticsearch/README.md | 7 + .../docker/elasticsearch/docker-compose.yaml | 27 + examples/providers/elasticsearch/search.ipynb | 724 ++++++++++++++++++ poetry.lock | 38 +- pyproject.toml | 1 + .../test_elasticsearch_datastore.py | 158 ++++ 10 files changed, 1376 insertions(+), 9 deletions(-) create mode 100644 datastore/providers/elasticsearch_datastore.py create mode 100644 docs/providers/elasticsearch/setup.md create mode 100644 examples/docker/elasticsearch/README.md create mode 100644 examples/docker/elasticsearch/docker-compose.yaml create mode 100644 examples/providers/elasticsearch/search.ipynb create mode 100644 tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py diff --git a/README.md b/README.md index edd6f516a..2586bc2f6 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ This README provides detailed information on how to set up, develop, and deploy - [General Environment Variables](#general-environment-variables) - [Choosing a Vector Database](#choosing-a-vector-database) - [Pinecone](#pinecone) + - [Elasticsearch](#elasticsearch) - [Weaviate](#weaviate) - [Zilliz](#zilliz) - [Milvus](#milvus) @@ -166,6 +167,18 @@ Follow these steps to quickly set up and run the ChatGPT Retrieval Plugin: export PG_USER= export PG_PASSWORD= export PG_DATABASE= + + # Elasticsearch + export ELASTICSEARCH_URL= (either specify host or cloud_id) + export ELASTICSEARCH_CLOUD_ID= + + export ELASTICSEARCH_USERNAME= + export ELASTICSEARCH_PASSWORD= + export ELASTICSEARCH_API_KEY= + + export ELASTICSEARCH_INDEX= + export ELASTICSEARCH_REPLICAS= + export ELASTICSEARCH_SHARDS= ``` 10. Run the API locally: `poetry run start` @@ -277,11 +290,11 @@ poetry install The API requires the following environment variables to work: -| Name | Required | Description | -| ---------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `DATASTORE` | Yes | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`. | -| `BEARER_TOKEN` | Yes | This is a secret token that you need to authenticate your requests to the API. You can generate one using any tool or method you prefer, such as [jwt.io](https://jwt.io/). | -| `OPENAI_API_KEY` | Yes | This is your OpenAI API key that you need to generate embeddings using the `text-embedding-ada-002` model. You can get an API key by creating an account on [OpenAI](https://openai.com/). 
| +| Name | Required | Description | +| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DATASTORE` | Yes | This specifies the vector database provider you want to use to store and query embeddings. You can choose from `elasticsearch`, `chroma`, `pinecone`, `weaviate`, `zilliz`, `milvus`, `qdrant`, `redis`, `azuresearch`, `supabase`, `postgres`, `analyticdb`. | +| `BEARER_TOKEN` | Yes | This is a secret token that you need to authenticate your requests to the API. You can generate one using any tool or method you prefer, such as [jwt.io](https://jwt.io/). | +| `OPENAI_API_KEY` | Yes | This is your OpenAI API key that you need to generate embeddings using the `text-embedding-ada-002` model. You can get an API key by creating an account on [OpenAI](https://openai.com/). | ### Using the plugin with Azure OpenAI @@ -352,6 +365,10 @@ For detailed setup instructions, refer to [`/docs/providers/llama/setup.md`](/do [AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is fully compatible with PostgreSQL syntax and managed by Alibaba Cloud. AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing features such as indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. For detailed setup instructions, refer to [`/docs/providers/analyticdb/setup.md`](/docs/providers/analyticdb/setup.md). +#### Elasticsearch + +[Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html) currently supports storing vectors through the `dense_vector` field type and uses them to calculate document scores. Elasticsearch 8.0 builds on this functionality to support fast, approximate nearest neighbor search (ANN). This represents a much more scalable approach, allowing vector search to run efficiently on large datasets. For detailed setup instructions, refer to [`/docs/providers/elasticsearch/setup.md`](/docs/providers/elasticsearch/setup.md). + ### Running the API locally To run the API locally, you first need to set the requisite environment variables with the `export` command: @@ -489,6 +506,7 @@ The scripts are: - [`process_zip`](scripts/process_zip/): This script processes a file dump of documents in a zip file and stores them in the vector database with some metadata. The format of the zip file should be a flat zip file folder of docx, pdf, txt, md, pptx or csv files. You can provide custom metadata as a JSON string and flags to screen for PII and extract metadata. ## Pull Request (PR) Checklist + If you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing! 1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`. 
@@ -533,7 +551,7 @@ feature/advanced-chunking-strategy-123 While the ChatGPT Retrieval Plugin is designed to provide a flexible solution for semantic search and retrieval, it does have some limitations: -- **Keyword search limitations**: The embeddings generated by the `text-embedding-ada-002` model may not always be effective at capturing exact keyword matches. As a result, the plugin might not return the most relevant results for queries that rely heavily on specific keywords. Some vector databases, like Pinecone, Weaviate and Azure Cognitive Search, use hybrid search and might perform better for keyword searches. +- **Keyword search limitations**: The embeddings generated by the `text-embedding-ada-002` model may not always be effective at capturing exact keyword matches. As a result, the plugin might not return the most relevant results for queries that rely heavily on specific keywords. Some vector databases, like Elasticsearch, Pinecone, Weaviate and Azure Cognitive Search, use hybrid search and might perform better for keyword searches. - **Sensitive data handling**: The plugin does not automatically detect or filter sensitive data. It is the responsibility of the developers to ensure that they have the necessary authorization to include content in the Retrieval Plugin and that the content complies with data privacy requirements. - **Scalability**: The performance of the plugin may vary depending on the chosen vector database provider and the size of the dataset. Some providers may offer better scalability and performance than others. - **Language support**: The plugin currently uses OpenAI's `text-embedding-ada-002` model, which is optimized for use in English. However, it is still robust enough to generate good results for a variety of languages. @@ -585,3 +603,5 @@ We would like to extend our gratitude to the following contributors for their co - [Postgres](https://www.postgresql.org/) - [egor-romanov](https://github.com/egor-romanov) - [mmmaia](https://github.com/mmmaia) +- [Elasticsearch](https://www.elastic.co/) + - [joemcelroy](https://github.com/joemcelroy) diff --git a/datastore/factory.py b/datastore/factory.py index adde49d76..32b968850 100644 --- a/datastore/factory.py +++ b/datastore/factory.py @@ -56,8 +56,14 @@ async def get_datastore() -> DataStore: from datastore.providers.analyticdb_datastore import AnalyticDBDataStore return AnalyticDBDataStore() + case "elasticsearch": + from datastore.providers.elasticsearch_datastore import ( + ElasticsearchDataStore, + ) + + return ElasticsearchDataStore() case _: raise ValueError( f"Unsupported vector database: {datastore}. 
" - f"Try one of the following: llama, pinecone, weaviate, milvus, zilliz, redis, or qdrant" - ) \ No newline at end of file + f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, or qdrant" + ) diff --git a/datastore/providers/elasticsearch_datastore.py b/datastore/providers/elasticsearch_datastore.py new file mode 100644 index 000000000..6380735c5 --- /dev/null +++ b/datastore/providers/elasticsearch_datastore.py @@ -0,0 +1,351 @@ +import os +from typing import Dict, List, Any, Optional + +import elasticsearch +from elasticsearch import Elasticsearch, helpers +from loguru import logger + +from datastore.datastore import DataStore +from models.models import ( + DocumentChunk, + DocumentChunkWithScore, + DocumentMetadataFilter, + QueryResult, + QueryWithEmbedding, +) +from services.date import to_unix_timestamp + +ELASTICSEARCH_URL = os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200") +ELASTICSEARCH_CLOUD_ID = os.environ.get("ELASTICSEARCH_CLOUD_ID") +ELASTICSEARCH_USERNAME = os.environ.get("ELASTICSEARCH_USERNAME") +ELASTICSEARCH_PASSWORD = os.environ.get("ELASTICSEARCH_PASSWORD") +ELASTICSEARCH_API_KEY = os.environ.get("ELASTICSEARCH_API_KEY") + +ELASTICSEARCH_INDEX = os.environ.get("ELASTICSEARCH_INDEX") +ELASTICSEARCH_REPLICAS = int(os.environ.get("ELASTICSEARCH_REPLICAS", "1")) +ELASTICSEARCH_SHARDS = int(os.environ.get("ELASTICSEARCH_SHARDS", "1")) + +VECTOR_SIZE = 1536 +UPSERT_BATCH_SIZE = 100 + + +class ElasticsearchDataStore(DataStore): + def __init__( + self, + index_name: Optional[str] = None, + vector_size: int = VECTOR_SIZE, + similarity: str = "cosine", + replicas: int = ELASTICSEARCH_REPLICAS, + shards: int = ELASTICSEARCH_SHARDS, + recreate_index: bool = True, + ): + """ + Args: + index_name: Name of the index to be used + vector_size: Size of the embedding stored in a collection + similarity: + Any of "cosine" / "l2_norm" / "dot_product". + + """ + assert similarity in [ + "cosine", + "l2_norm", + "dot_product", + ], "Similarity must be one of 'cosine' / 'l2_norm' / 'dot_product'." + assert replicas > 0, "Replicas must be greater than or equal to 0." + assert shards > 0, "Shards must be greater than or equal to 0." + + self.client = connect_to_elasticsearch( + ELASTICSEARCH_URL, + ELASTICSEARCH_CLOUD_ID, + ELASTICSEARCH_API_KEY, + ELASTICSEARCH_USERNAME, + ELASTICSEARCH_PASSWORD, + ) + assert ( + index_name != "" or ELASTICSEARCH_INDEX != "" + ), "Please provide an index name." + self.index_name = index_name or ELASTICSEARCH_INDEX or "" + + replicas = replicas or ELASTICSEARCH_REPLICAS + shards = shards or ELASTICSEARCH_SHARDS + + # Set up the collection so the documents might be inserted or queried + self._set_up_index(vector_size, similarity, replicas, shards, recreate_index) + + async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]: + """ + Takes in a list of document chunks and inserts them into the database. + Return a list of document ids. + """ + actions = [] + for _, chunkList in chunks.items(): + for chunk in chunkList: + actions = ( + actions + + self._convert_document_chunk_to_es_document_operation(chunk) + ) + + self.client.bulk(operations=actions, index=self.index_name) + return list(chunks.keys()) + + async def _query( + self, + queries: List[QueryWithEmbedding], + ) -> List[QueryResult]: + """ + Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores. 
+ """ + searches = self._convert_queries_to_msearch_query(queries) + results = self.client.msearch(searches=searches) + return [ + QueryResult( + query=query.query, + results=[ + self._convert_hit_to_document_chunk_with_score(hit) + for hit in result["hits"]["hits"] + ], + ) + for query, result in zip(queries, results["responses"]) + ] + + async def delete( + self, + ids: Optional[List[str]] = None, + filter: Optional[DocumentMetadataFilter] = None, + delete_all: Optional[bool] = None, + ) -> bool: + """ + Removes vectors by ids, filter, or everything in the datastore. + Returns whether the operation was successful. + """ + + # Delete all vectors from the index if delete_all is True + if delete_all: + try: + logger.info(f"Deleting all vectors from index") + self.client.delete_by_query( + index=self.index_name, query={"match_all": {}} + ) + logger.info(f"Deleted all vectors successfully") + return True + except Exception as e: + logger.error(f"Error deleting all vectors: {e}") + raise e + + # Convert the metadata filter object to a dict with elasticsearch filter expressions + es_filters = self._get_es_filters(filter) + # Delete vectors that match the filter from the index if the filter is not empty + if es_filters != {}: + try: + logger.info(f"Deleting vectors with filter {es_filters}") + self.client.delete_by_query(index=self.index_name, query=es_filters) + logger.info(f"Deleted vectors with filter successfully") + except Exception as e: + logger.error(f"Error deleting vectors with filter: {e}") + raise e + + if ids: + try: + documents_to_delete = [doc_id for doc_id in ids] + logger.info(f"Deleting {len(documents_to_delete)} documents") + res = self.client.delete_by_query( + index=self.index_name, + query={"terms": {"metadata.document_id": documents_to_delete}}, + ) + logger.info(f"Deleted documents successfully") + except Exception as e: + logger.error(f"Error deleting documents: {e}") + raise e + + return True + + def _get_es_filters( + self, filter: Optional[DocumentMetadataFilter] = None + ) -> Dict[str, Any]: + if filter is None: + return {} + + es_filters = { + "bool": { + "must": [], + } + } + + # For each field in the MetadataFilter, check if it has a value and add the corresponding pinecone filter expression + # For start_date and end_date, uses the range query - gte and lte operators respectively + # For other fields, uses the term query + for field, value in filter.dict().items(): + if value is not None: + if field == "start_date": + es_filters["bool"]["must"].append( + {"range": {"created_at": {"gte": to_unix_timestamp(value)}}} + ) + elif field == "end_date": + es_filters["bool"]["must"].append( + {"range": {"created_at": {"lte": to_unix_timestamp(value)}}} + ) + else: + es_filters["bool"]["must"].append( + {"term": {f"metadata.{field}": value}} + ) + + return es_filters + + def _convert_document_chunk_to_es_document_operation( + self, document_chunk: DocumentChunk + ) -> List[Dict]: + created_at = ( + to_unix_timestamp(document_chunk.metadata.created_at) + if document_chunk.metadata.created_at is not None + else None + ) + + action_and_metadata = { + "index": { + "_index": self.index_name, + "_id": document_chunk.id, + } + } + + source = { + "id": document_chunk.id, + "text": document_chunk.text, + "metadata": document_chunk.metadata.dict(), + "created_at": created_at, + "embedding": document_chunk.embedding, + } + + return [action_and_metadata, source] + + def _convert_queries_to_msearch_query(self, queries: List[QueryWithEmbedding]): + searches = [] + + for query in 
queries: + searches.append({"index": self.index_name}) + searches.append( + { + "_source": True, + "knn": { + "field": "embedding", + "query_vector": query.embedding, + "k": query.top_k, + "num_candidates": query.top_k, + }, + "size": query.top_k, + } + ) + + return searches + + def _convert_hit_to_document_chunk_with_score(self, hit) -> DocumentChunkWithScore: + return DocumentChunkWithScore( + id=hit["_id"], + text=hit["_source"]["text"], # type: ignore + metadata=hit["_source"]["metadata"], # type: ignore + embedding=hit["_source"]["embedding"], # type: ignore + score=hit["_score"], + ) + + def _set_up_index( + self, + vector_size: int, + similarity: str, + replicas: int, + shards: int, + recreate_index: bool, + ) -> None: + if recreate_index: + self._recreate_index(similarity, vector_size, replicas, shards) + + try: + index_mapping = self.client.indices.get_mapping(index=self.index_name) + current_similarity = index_mapping[self.index_name]["mappings"]["properties"]["embedding"]["similarity"] # type: ignore + current_vector_size = index_mapping[self.index_name]["mappings"]["properties"]["embedding"]["dims"] # type: ignore + + if current_similarity != similarity: + raise ValueError( + f"Collection '{self.index_name}' already exists in Elasticsearch, " + f"but it is configured with a similarity '{current_similarity}'. " + f"If you want to use that collection, but with a different " + f"similarity, please set `recreate_index=True` argument." + ) + + if current_vector_size != vector_size: + raise ValueError( + f"Collection '{self.index_name}' already exists in Elasticsearch, " + f"but it is configured with a vector size '{current_vector_size}'. " + f"If you want to use that collection, but with a different " + f"vector size, please set `recreate_index=True` argument." + ) + except elasticsearch.exceptions.NotFoundError: + self._recreate_index(similarity, vector_size, replicas, shards) + + def _recreate_index( + self, similarity: str, vector_size: int, replicas: int, shards: int + ) -> None: + settings = { + "index": { + "number_of_shards": shards, + "number_of_replicas": replicas, + "refresh_interval": "1s", + } + } + mappings = { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": vector_size, + "index": True, + "similarity": similarity, + } + } + } + + self.client.indices.delete( + index=self.index_name, ignore_unavailable=True, allow_no_indices=True + ) + self.client.indices.create( + index=self.index_name, mappings=mappings, settings=settings + ) + + +def connect_to_elasticsearch( + elasticsearch_url=None, cloud_id=None, api_key=None, username=None, password=None +): + # Check if both elasticsearch_url and cloud_id are defined + if elasticsearch_url and cloud_id: + raise ValueError( + "Both elasticsearch_url and cloud_id are defined. Please provide only one." + ) + + # Initialize connection parameters dictionary + connection_params = {} + + # Define the connection based on the provided parameters + if elasticsearch_url: + connection_params["hosts"] = [elasticsearch_url] + elif cloud_id: + connection_params["cloud_id"] = cloud_id + else: + raise ValueError("Please provide either elasticsearch_url or cloud_id.") + + # Add authentication details based on the provided parameters + if api_key: + connection_params["api_key"] = api_key + elif username and password: + connection_params["basic_auth"] = (username, password) + else: + logger.warning( + "No authentication details provided. Please consider using an api_key or username and password to secure your connection." 
+ ) + + # Establish the Elasticsearch client connection + es_client = Elasticsearch(**connection_params) + try: + es_client.info() + except Exception as e: + logger.error(f"Error connecting to Elasticsearch: {e}") + raise e + + return es_client diff --git a/docs/providers/elasticsearch/setup.md b/docs/providers/elasticsearch/setup.md new file mode 100644 index 000000000..9a3d5ea12 --- /dev/null +++ b/docs/providers/elasticsearch/setup.md @@ -0,0 +1,37 @@ +# Elasticsearch + +Elasticsearch is a search engine based on the Lucene library. It provides a distributed, full-text and vector search engine with an HTTP web interface and schema-free JSON documents. To use Elasticsearch as your vector database, start by [installing Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) or signing up for a free trial of [Elastic Cloud](https://www.elastic.co/cloud/). + +The app will create an Elasticsearch index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable. + +**Environment Variables:** + +| Name | Required | Description | +| --------------------- | -------- | -------------------------------------------------------------------------------------------------------------------- | +| `DATASTORE` | Yes | Datastore name, set this to `elasticsearch` | +| `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | +| `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings with the `text-embedding-ada-002` model | +| `ELASTICSEARCH_INDEX` | Yes | Your chosen Elasticsearch index name. **Note:** Index name must consist of lower case alphanumeric characters or '-' | + +**Connection Evironment Variables:** +Depending on your Elasticsearch setup, you may need to set one of the following environment variables to connect to your Elasticsearch instance. If you are using Elastic Cloud, you can connect via `ELASTICSEARCH_CLOUD_ID`. If you are using a local instance of Elasticsearch, you will need to set `ELASTICSEARCH_URL`. + +You can authenticate to Elasticsearch using either `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`. If you are using Elastic Cloud, you can find this in Kibana. + +| Name | Required | Description | +| ------------------------ | -------- | ------------------------------------------------------------------------------------------------ | +| `ELASTICSEARCH_URL` | Yes | Your Elasticsearch URL. If installed locally, this would be https://localhost:9200 | +| `ELASTICSEARCH_CLOUD_ID` | Yes | Your cloud id, linked to your deployment. This can be found in the deployment's console | +| `ELASTICSEARCH_USERNAME` | Yes | Your username for authenticating requests to the API. Commonly 'elastic'. | +| `ELASTICSEARCH_PASSWORD` | Yes | Your password for authenticating requests to the API | +| `ELASTICSEARCH_API_KEY` | Yes | Alternatively you can authenticate using api-key. This can be created in Kibana stack management | + +## Running Elasticsearch Integration Tests + +A suite of integration tests is available to verify the Elasticsearch integration. To run the tests, run the docker compose found in the `examples/docker/elasticsearch` folder with `docker-compose up`. This will start Elasticsearch in single node, security off mode, listening on `http://localhost:9200`. 
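+
+Before launching the tests, you can optionally confirm that the node is reachable. Below is a minimal sketch that uses the same `elasticsearch` Python client the datastore is built on, assuming the default `http://localhost:9200` endpoint from the compose file:
+
+```python
+from elasticsearch import Elasticsearch
+
+# Connect to the local single-node cluster started by docker-compose (security disabled)
+es = Elasticsearch("http://localhost:9200")
+
+# info() raises a transport error if the node is unreachable;
+# otherwise it returns cluster metadata such as the version number
+print(es.info()["version"]["number"])
+```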
+ +Then, launch the test suite with this command: + +```bash +pytest ./tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py +``` diff --git a/examples/docker/elasticsearch/README.md b/examples/docker/elasticsearch/README.md new file mode 100644 index 000000000..c3ed3533c --- /dev/null +++ b/examples/docker/elasticsearch/README.md @@ -0,0 +1,7 @@ +## Running Elasticsearch + +```bash +docker-compose up -d +``` + +should now be running at http://localhost:9200 diff --git a/examples/docker/elasticsearch/docker-compose.yaml b/examples/docker/elasticsearch/docker-compose.yaml new file mode 100644 index 000000000..23d83c85b --- /dev/null +++ b/examples/docker/elasticsearch/docker-compose.yaml @@ -0,0 +1,27 @@ +version: "3.7" + +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2 + container_name: elasticsearch + environment: + - discovery.type=single-node + - node.name=elasticsearch + - xpack.security.enabled=false + ulimits: + memlock: + soft: -1 + hard: -1 + ports: + - "9200:9200" + networks: + - esnet + volumes: + - esdata:/usr/share/elasticsearch/data + +networks: + esnet: + +volumes: + esdata: + driver: local diff --git a/examples/providers/elasticsearch/search.ipynb b/examples/providers/elasticsearch/search.ipynb new file mode 100644 index 000000000..31768262d --- /dev/null +++ b/examples/providers/elasticsearch/search.ipynb @@ -0,0 +1,724 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using Elasticsearch as a datastore\n", + "\n", + "In this walkthrough we will see how to use the retrieval API with a Elasticsearch datastore for *search / question-answering*.\n", + "\n", + "Before running this notebook you should have already initialized the retrieval API and have it running locally or elsewhere. See readme for instructions on how to do this." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## App Quickstart" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Install Python 3.10 if not already installed.\n", + "\n", + "2. Clone the `retrieval-app` repository:\n", + "\n", + "```\n", + "git clone git@github.com:openai/retrieval-app.git\n", + "```\n", + "\n", + "3. Navigate to the app directory:\n", + "\n", + "```\n", + "cd /path/to/retrieval-app\n", + "```\n", + "\n", + "4. Install `poetry`:\n", + "\n", + "```\n", + "pip install poetry\n", + "```\n", + "\n", + "5. Create a new virtual environment:\n", + "\n", + "```\n", + "poetry env use python3.10\n", + "```\n", + "\n", + "6. Install the `retrieval-app` dependencies:\n", + "\n", + "```\n", + "poetry install\n", + "```\n", + "\n", + "7. Set app environment variables:\n", + "\n", + "* `BEARER_TOKEN`: Secret token used by the app to authorize incoming requests. We will later include this in the request `headers`. The token can be generated however you prefer, such as using [jwt.io](https://jwt.io/).\n", + "\n", + "* `OPENAI_API_KEY`: The OpenAI API key used for generating embeddings with the `text-embedding-ada-002` model. [Get an API key here](https://platform.openai.com/account/api-keys)!\n", + "\n", + "8. Set Elasticsearch-specific environment variables:\n", + "\n", + "* `DATASTORE`: set to `elasticsearch`.\n", + "\n", + "9. Set the Elasticsearch connection specific environment variables. Either set `ELASTICSEARCH_CLOUD_ID` or `ELASTICSEARCH_URL`.\n", + "* `ELASTICSEARCH_CLOUD_ID`: Set to your deployment cloud id. 
You can find this in the [Elasticsearch console](https://cloud.elastic.co).\n", + "\n", + "* `ELASTICSEARCH_URL`: Set to your Elasticsearch URL, looks like `https://:@:`. You can find this in the [Elasticsearch console](https://cloud.elastic.co).\n", + "\n", + "10. Set the Elasticsearch authentication specific environment variables. Either set `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`.\n", + "\n", + "* `ELASTICSEARCH_USERNAME`: Set to your Elasticsearch username. You can find this in the [Elasticsearch console](https://cloud.elastic.co). Typically this is set to `elastic`.\n", + "\n", + "* `ELASTICSEARCH_PASSWORD`: Set to your Elasticsearch password. You can find this in the [Elasticsearch console](https://cloud.elastic.co) in security.\n", + "\n", + "* `ELASTICSEARCH_API_KEY`: Set to your Elasticsearch API key. You can set one up in Kibana Stack management page.\n", + "\n", + "11. Set the Elasticsearch index specific environment variables.\n", + "\n", + "* `ELASTICSEARCH_INDEX`: Set to the name of the Elasticsearch index you want to use.\n", + "\n", + "12. Run the app with:\n", + "\n", + "```\n", + "poetry run start\n", + "```\n", + "\n", + "If running the app locally you should see something like:\n", + "\n", + "```\n", + "INFO: Uvicorn running on http://0.0.0.0:8000\n", + "INFO: Application startup complete.\n", + "```\n", + "\n", + "In that case, the app is automatically connected to our index (specified by `ELASTICSEARCH_INDEX`), if no index with that name existed beforehand, the app creates one for us.\n", + "\n", + "Now we're ready to move on to populating our index with some data." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Required Libraries" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are a few Python libraries we must `pip install` for this notebook to run, those are:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install -qU datasets pandas tqdm" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing Data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we will use the **S**tanford **Qu**estion **A**nswering **D**ataset (SQuAD2), which we download from Hugging Face Datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "data = load_dataset(\"squad_v2\", split=\"train\")\n", + "data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform the data into a Pandas dataframe for simpler preprocessing." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtitlecontextquestionanswers
056be85543aeaaa14008c9063BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...When did Beyonce start becoming popular?{'text': ['in the late 1990s'], 'answer_start'...
156be85543aeaaa14008c9065BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...What areas did Beyonce compete in when she was...{'text': ['singing and dancing'], 'answer_star...
256be85543aeaaa14008c9066BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...When did Beyonce leave Destiny's Child and bec...{'text': ['2003'], 'answer_start': [526]}
356bf6b0f3aeaaa14008c9601BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...In what city and state did Beyonce grow up?{'text': ['Houston, Texas'], 'answer_start': [...
456bf6b0f3aeaaa14008c9602BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...In which decade did Beyonce become famous?{'text': ['late 1990s'], 'answer_start': [276]}
\n", + "
" + ], + "text/plain": [ + " id title \\\n", + "0 56be85543aeaaa14008c9063 Beyoncé \n", + "1 56be85543aeaaa14008c9065 Beyoncé \n", + "2 56be85543aeaaa14008c9066 Beyoncé \n", + "3 56bf6b0f3aeaaa14008c9601 Beyoncé \n", + "4 56bf6b0f3aeaaa14008c9602 Beyoncé \n", + "\n", + " context \\\n", + "0 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "1 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "2 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "3 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "4 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "\n", + " question \\\n", + "0 When did Beyonce start becoming popular? \n", + "1 What areas did Beyonce compete in when she was... \n", + "2 When did Beyonce leave Destiny's Child and bec... \n", + "3 In what city and state did Beyonce grow up? \n", + "4 In which decade did Beyonce become famous? \n", + "\n", + " answers \n", + "0 {'text': ['in the late 1990s'], 'answer_start'... \n", + "1 {'text': ['singing and dancing'], 'answer_star... \n", + "2 {'text': ['2003'], 'answer_start': [526]} \n", + "3 {'text': ['Houston, Texas'], 'answer_start': [... \n", + "4 {'text': ['late 1990s'], 'answer_start': [276]} " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data.to_pandas()\n", + "data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains a lot of duplicate `context` paragraphs, this is because each `context` can have many relevant questions. We don't want these duplicates so we remove like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19029\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtitlecontextquestionanswers
056be85543aeaaa14008c9063BeyoncéBeyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...When did Beyonce start becoming popular?{'text': ['in the late 1990s'], 'answer_start'...
1556be86cf3aeaaa14008c9076BeyoncéFollowing the disbandment of Destiny's Child i...After her second solo album, what other entert...{'text': ['acting'], 'answer_start': [207]}
2756be88473aeaaa14008c9080BeyoncéA self-described \"modern-day feminist\", Beyonc...In her music, what are some recurring elements...{'text': ['love, relationships, and monogamy']...
3956be892d3aeaaa14008c908bBeyoncéBeyoncé Giselle Knowles was born in Houston, T...Beyonce's younger sibling also sang with her i...{'text': ['Destiny's Child'], 'answer_start': ...
5256be8a583aeaaa14008c9094BeyoncéBeyoncé attended St. Mary's Elementary School ...What town did Beyonce go to school in?{'text': ['Fredericksburg'], 'answer_start': [...
\n", + "
" + ], + "text/plain": [ + " id title \\\n", + "0 56be85543aeaaa14008c9063 Beyoncé \n", + "15 56be86cf3aeaaa14008c9076 Beyoncé \n", + "27 56be88473aeaaa14008c9080 Beyoncé \n", + "39 56be892d3aeaaa14008c908b Beyoncé \n", + "52 56be8a583aeaaa14008c9094 Beyoncé \n", + "\n", + " context \\\n", + "0 Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b... \n", + "15 Following the disbandment of Destiny's Child i... \n", + "27 A self-described \"modern-day feminist\", Beyonc... \n", + "39 Beyoncé Giselle Knowles was born in Houston, T... \n", + "52 Beyoncé attended St. Mary's Elementary School ... \n", + "\n", + " question \\\n", + "0 When did Beyonce start becoming popular? \n", + "15 After her second solo album, what other entert... \n", + "27 In her music, what are some recurring elements... \n", + "39 Beyonce's younger sibling also sang with her i... \n", + "52 What town did Beyonce go to school in? \n", + "\n", + " answers \n", + "0 {'text': ['in the late 1990s'], 'answer_start'... \n", + "15 {'text': ['acting'], 'answer_start': [207]} \n", + "27 {'text': ['love, relationships, and monogamy']... \n", + "39 {'text': ['Destiny's Child'], 'answer_start': ... \n", + "52 {'text': ['Fredericksburg'], 'answer_start': [... " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data.drop_duplicates(subset=[\"context\"])\n", + "print(len(data))\n", + "data.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The format required by the apps `upsert` function is a list of documents like:\n", + "\n", + "```json\n", + "[\n", + " {\n", + " \"id\": \"abc\",\n", + " \"text\": \"some important document text\",\n", + " \"metadata\": {\n", + " \"field1\": \"optional metadata goes here\",\n", + " \"field2\": 54\n", + " }\n", + " },\n", + " {\n", + " \"id\": \"123\",\n", + " \"text\": \"some other important text\",\n", + " \"metadata\": {\n", + " \"field1\": \"another metadata\",\n", + " \"field2\": 71,\n", + " \"field3\": \"not all metadatas need the same structure\"\n", + " }\n", + " }\n", + " ...\n", + "]\n", + "```\n", + "\n", + "Every document *must* have a `\"text\"` field. The `\"id\"` and `\"metadata\"` fields are optional.\n", + "\n", + "To create this format for our SQuAD data we do:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents = [\n", + " {\n", + " 'id': r['id'],\n", + " 'text': r['context'],\n", + " 'metadata': {\n", + " 'title': r['title']\n", + " }\n", + " } for r in data.to_dict(orient='records')\n", + "]\n", + "documents[:3]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing the Docs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, it's time to initiate the indexing process, also known as upserting, for our documents. To perform these requests to the retrieval app API, we must provide authorization using the BEARER_TOKEN we defined earlier. 
Below is how we accomplish this:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "BEARER_TOKEN = os.environ.get(\"BEARER_TOKEN\") or \"BEARER_TOKEN_HERE\"\n", + "\n", + "headers = {\n", + " \"Authorization\": f\"Bearer {BEARER_TOKEN}\"\n", + "}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will execute bulk inserts in batches set by the `batch_size`.\n", + "\n", + "Now that all our SQuAD2 records have been successfully indexed, we can proceed with the querying phase." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:16<00:00, 16.88s/it]\n" + ] + } + ], + "source": [ + "from tqdm.auto import tqdm\n", + "import requests\n", + "from requests.adapters import HTTPAdapter, Retry\n", + "\n", + "batch_size = 100\n", + "endpoint_url = \"http://localhost:8000\"\n", + "s = requests.Session()\n", + "\n", + "# we setup a retry strategy to retry on 5xx errors\n", + "retries = Retry(\n", + " total=5, # number of retries before raising error\n", + " backoff_factor=0.1,\n", + " status_forcelist=[500, 502, 503, 504]\n", + ")\n", + "s.mount('http://', HTTPAdapter(max_retries=retries))\n", + "\n", + "for i in tqdm(range(0, 10, batch_size)):\n", + " i_end = min(len(documents), i+batch_size)\n", + " # make post request that allows up to 5 retries\n", + " res = s.post(\n", + " f\"{endpoint_url}/upsert\",\n", + " headers=headers,\n", + " json={\n", + " \"documents\": documents[i:i_end]\n", + " }\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Making Queries" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By passing one or more queries to the /query endpoint, we can easily conduct a query on the datastore. For this task, we can utilize a few questions from SQuAD2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19029" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "queries = data['question'].tolist()\n", + "# format into the structure needed by the /query endpoint\n", + "queries = [{'query': queries[i]} for i in range(len(queries))]\n", + "len(queries)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res = requests.post(\n", + " \"http://0.0.0.0:8000/query\",\n", + " headers=headers,\n", + " json={\n", + " 'queries': queries[:3]\n", + " }\n", + ")\n", + "res" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, we have the ability to iterate through the responses and observe the outcomes obtained for each query:" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------------------------------\n", + "When did Beyonce start becoming popular?\n", + "\n", + "0.93: On December 13, 2013, Beyoncé unexpectedly released her eponymous fifth studio album on the iTunes Store without any prior announcement or promotion. The album debuted atop the Billboard 200 chart, giving Beyoncé her fifth consecutive number-one album in the US. This made her the first woman in the chart's history to have her first five studio albums debut at number one. Beyoncé received critical acclaim and commercial success, selling one million digital copies worldwide in six days; The New York Times noted the album's unconventional, unexpected release as significant. Musically an electro-R&B album, it concerns darker themes previously unexplored in her work, such as \"bulimia, postnatal depression [and] the fears and insecurities of marriage and motherhood\". The single \"Drunk in Love\", featuring Jay Z, peaked at number two on the Billboard Hot 100 chart.\n", + "0.93: Beyoncé's first solo recording was a feature on Jay Z's \"'03 Bonnie & Clyde\" that was released in October 2002, peaking at number four on the U.S. Billboard Hot 100 chart. Her first solo album Dangerously in Love was released on June 24, 2003, after Michelle Williams and Kelly Rowland had released their solo efforts. The album sold 317,000 copies in its first week, debuted atop the Billboard 200, and has since sold 11 million copies worldwide. The album's lead single, \"Crazy in Love\", featuring Jay Z, became Beyoncé's first number-one single as a solo artist in the US. The single \"Baby Boy\" also reached number one, and singles, \"Me, Myself and I\" and \"Naughty Girl\", both reached the top-five.\n", + "0.93: Beyoncé is believed to have first started a relationship with Jay Z after a collaboration on \"'03 Bonnie & Clyde\", which appeared on his seventh album The Blueprint 2: The Gift & The Curse (2002). Beyoncé appeared as Jay Z's girlfriend in the music video for the song, which would further fuel speculation of their relationship. On April 4, 2008, Beyoncé and Jay Z were married without publicity. As of April 2014, the couple have sold a combined 300 million records together. 
The couple are known for their private relationship, although they have appeared to become more relaxed in recent years. Beyoncé suffered a miscarriage in 2010 or 2011, describing it as \"the saddest thing\" she had ever endured. She returned to the studio and wrote music in order to cope with the loss.\n", + "----------------------------------------------------------------------\n", + "\n", + "\n", + "----------------------------------------------------------------------\n", + "After her second solo album, what other entertainment venture did Beyonce explore?\n", + "\n", + "0.93: Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits \"Déjà Vu\", \"Irreplaceable\", and \"Beautiful Liar\". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for \"Single Ladies (Put a Ring on It)\".\n", + "0.92: Beyoncé announced a hiatus from her music career in January 2010, heeding her mother's advice, \"to live life, to be inspired by things again\". During the break she and her father parted ways as business partners. Beyoncé's musical break lasted nine months and saw her visit multiple European cities, the Great Wall of China, the Egyptian pyramids, Australia, English music festivals and various museums and ballet performances.\n", + "0.92: Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.\n", + "----------------------------------------------------------------------\n", + "\n", + "\n", + "----------------------------------------------------------------------\n", + "In her music, what are some recurring elements in them?\n", + "\n", + "0.91: Beyoncé's music is generally R&B, but she also incorporates pop, soul and funk into her songs. 4 demonstrated Beyoncé's exploration of 90s-style R&B, as well as further use of soul and hip hop than compared to previous releases. While she almost exclusively releases English songs, Beyoncé recorded several Spanish songs for Irreemplazable (re-recordings of songs from B'Day for a Spanish-language audience), and the re-release of B'Day. To record these, Beyoncé was coached phonetically by American record producer Rudy Perez.\n", + "0.9: The feminism and female empowerment themes on Beyoncé's second solo album B'Day were inspired by her role in Dreamgirls and by singer Josephine Baker. Beyoncé paid homage to Baker by performing \"Déjà Vu\" at the 2006 Fashion Rocks concert wearing Baker's trademark mini-hula skirt embellished with fake bananas. Beyoncé's third solo album I Am... Sasha Fierce was inspired by Jay Z and especially by Etta James, whose \"boldness\" inspired Beyoncé to explore other musical genres and styles. 
Her fourth solo album, 4, was inspired by Fela Kuti, 1990s R&B, Earth, Wind & Fire, DeBarge, Lionel Richie, Teena Marie with additional influences by The Jackson 5, New Edition, Adele, Florence and the Machine, and Prince.\n", + "0.9: She has received co-writing credits for most of the songs recorded with Destiny's Child and her solo efforts. Her early songs were personally driven and female-empowerment themed compositions like \"Independent Women\" and \"Survivor\", but after the start of her relationship with Jay Z she transitioned to more man-tending anthems such as \"Cater 2 U\". Beyoncé has also received co-producing credits for most of the records in which she has been involved, especially during her solo efforts. However, she does not formulate beats herself, but typically comes up with melodies and ideas during production, sharing them with producers.\n", + "----------------------------------------------------------------------\n", + "\n", + "\n" + ] + } + ], + "source": [ + "for query_result in res.json()['results']:\n", + " query = query_result['query']\n", + " answers = []\n", + " scores = []\n", + " for result in query_result['results']:\n", + " answers.append(result['text'])\n", + " scores.append(round(result['score'], 2))\n", + " print(\"-\"*70+\"\\n\"+query+\"\\n\\n\"+\"\\n\".join([f\"{s}: {a}\" for a, s in zip(answers, scores)])+\"\\n\"+\"-\"*70+\"\\n\\n\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The top results are all relevant as we would have hoped. We can see that the `score` is a measure of how relevant the document is to the query. The higher the score the more relevant the document is to the query." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "chatgpt-retrieval-plugin-S7h-2AWq-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.3" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1979a773a5778de9a5fa593a629dff0ab3c80c2563810d3e6a8dfb123dc01c7d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 2afe8df38..09edc2df7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -926,6 +926,42 @@ files = [ {file = "duckdb-0.8.0.tar.gz", hash = "sha256:c68da35bab5072a64ada2646a5b343da620ddc75a7a6e84aa4a1e0628a7ec18f"}, ] +[[package]] +name = "elastic-transport" +version = "8.4.0" +description = "Transport classes and utilities shared among Python Elastic client libraries" +optional = false +python-versions = ">=3.6" +files = [ + {file = "elastic-transport-8.4.0.tar.gz", hash = "sha256:b9ad708ceb7fcdbc6b30a96f886609a109f042c0b9d9f2e44403b3133ba7ff10"}, + {file = "elastic_transport-8.4.0-py3-none-any.whl", hash = "sha256:19db271ab79c9f70f8c43f8f5b5111408781a6176b54ab2e54d713b6d9ceb815"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<2" + +[package.extras] +develop = ["aiohttp", "mock", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.8.2" +description = "Python client for Elasticsearch" +optional = false +python-versions = ">=3.6, <4" +files = [ + {file = "elasticsearch-8.8.2-py3-none-any.whl", hash = 
"sha256:bffd6ce4faaacf90e6f617241773b3da8fb94e2e83554f5508e2fab92ca79643"}, + {file = "elasticsearch-8.8.2.tar.gz", hash = "sha256:bed8cf8fcc6c3be7c254b579de4c29afab021f373c832246f912d37aef3c6bd5"}, +] + +[package.dependencies] +elastic-transport = ">=8,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +requests = ["requests (>=2.4.0,<3.0.0)"] + [[package]] name = "environs" version = "9.5.0" @@ -4187,4 +4223,4 @@ postgresql = ["psycopg2cffi"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "8f3842f331bebb624220dc5558ab920c2526d8f22b4cd9f15748b54f5f33a0f3" +content-hash = "9140339a3000bae678247d4ec1c314ed4842957d435a7ae28cc928c7678f84c0" diff --git a/pyproject.toml b/pyproject.toml index 0a8f588ee..f28f76dd1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-de pgvector = "^0.1.7" psycopg2cffi = {version = "^2.9.0", optional = true} loguru = "^0.7.0" +elasticsearch = "8.8.2" [tool.poetry.scripts] start = "server.main:start" diff --git a/tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py b/tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py new file mode 100644 index 000000000..8ac451dad --- /dev/null +++ b/tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py @@ -0,0 +1,158 @@ +import pytest +from models.models import ( + DocumentChunkMetadata, + DocumentMetadataFilter, + DocumentChunk, + QueryWithEmbedding, + Source, +) +from datastore.providers.elasticsearch_datastore import ( + ElasticsearchDataStore, +) +import time + +DIM_SIZE = 1536 + + +@pytest.fixture +def elasticsearch_datastore(): + return ElasticsearchDataStore() + + +def sample_embedding(one_element_poz: int): + embedding = [0] * DIM_SIZE + embedding[one_element_poz % DIM_SIZE] = 1 + return embedding + + +def sample_embeddings(num: int, one_element_start: int = 0): + embeddings = [] + for x in range(num): + embedding = [0] * DIM_SIZE + embedding[(x + one_element_start) % DIM_SIZE] = 1 + embeddings.append(embedding) + return embeddings + + +@pytest.fixture +def document_chunk_one(): + doc_id = "abc" + doc_chunks = [] + + ids = ["123", "456", "789"] + texts = [ + "Aenean euismod bibendum laoreet", + "Vivamus non enim vitae tortor", + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae", + ] + sources = [Source.email, Source.file, Source.chat] + created_ats = [ + "1929-10-28T09:30:00-05:00", + "2009-01-03T16:39:57-08:00", + "2021-01-21T10:00:00-02:00", + ] + authors = ["Fred Smith", "Bob Doe", "Appleton Doe"] + + embeddings = sample_embeddings(len(texts)) + + for i in range(3): + chunk = DocumentChunk( + id=ids[i], + text=texts[i], + metadata=DocumentChunkMetadata( + document_id=doc_id, + source=sources[i], + created_at=created_ats[i], + author=authors[i], + ), + embedding=embeddings[i], # type: ignore + ) + + doc_chunks.append(chunk) + + return {doc_id: doc_chunks} + + +async def test_upsert(elasticsearch_datastore, document_chunk_one): + await elasticsearch_datastore.delete(delete_all=True) + res = await elasticsearch_datastore._upsert(document_chunk_one) + assert res == list(document_chunk_one.keys()) + time.sleep(1) + + results = elasticsearch_datastore.client.search( + index=elasticsearch_datastore.index_name, query={"match_all": {}} + ) + assert results["hits"]["total"]["value"] == 3 + elasticsearch_datastore.client.indices.delete( + index=elasticsearch_datastore.index_name + ) + + +async def 
test_upsert_query_all(elasticsearch_datastore, document_chunk_one): + await elasticsearch_datastore.delete(delete_all=True) + res = await elasticsearch_datastore._upsert(document_chunk_one) + assert res == list(document_chunk_one.keys()) + time.sleep(1) + + query = QueryWithEmbedding( + query="Aenean", + top_k=10, + embedding=sample_embedding(0), # type: ignore + ) + query_results = await elasticsearch_datastore._query(queries=[query]) + + assert 1 == len(query_results) + assert 3 == len(query_results[0].results) + + +async def test_delete_with_document_id(elasticsearch_datastore, document_chunk_one): + await elasticsearch_datastore.delete(delete_all=True) + res = await elasticsearch_datastore._upsert(document_chunk_one) + time.sleep(1) + assert res == list(document_chunk_one.keys()) + await elasticsearch_datastore.delete([res[0]]) + time.sleep(1) + + query = QueryWithEmbedding( + query="Aenean", + top_k=9, + embedding=sample_embedding(0), # type: ignore + ) + query_results = await elasticsearch_datastore._query(queries=[query]) + + assert 1 == len(query_results) + assert 0 == len(query_results[0].results) + + elasticsearch_datastore.client.indices.delete( + index=elasticsearch_datastore.index_name + ) + + +async def test_delete_with_source_filter(elasticsearch_datastore, document_chunk_one): + await elasticsearch_datastore.delete(delete_all=True) + res = await elasticsearch_datastore._upsert(document_chunk_one) + assert res == list(document_chunk_one.keys()) + time.sleep(1) + + await elasticsearch_datastore.delete( + filter=DocumentMetadataFilter( + source=Source.email, + ) + ) + + time.sleep(1) + + query = QueryWithEmbedding( + query="Aenean", + top_k=9, + embedding=sample_embedding(0), # type: ignore + ) + query_results = await elasticsearch_datastore._query(queries=[query]) + + assert 1 == len(query_results) + assert 2 == len(query_results[0].results) + assert "456" == query_results[0].results[0].id + + elasticsearch_datastore.client.indices.delete( + index=elasticsearch_datastore.index_name + )