Added a framework for end-to-end tests (#1022)
* Added a framework for end-to-end tests

* Only contains sample queries, not a full suite. All tests make use of the integration test
docker cluster.
* The tests can be run with "sbt e2etest/test"

Signed-off-by: Norman Jordan <norman.jordan@improving.com>

* Added documentation for integ-test cluster

* Documented how queries are processed in the integ-test cluster
* Documented how to use the Query Workbench with the integ-test cluster
* Removed the shading of Jackson libraries (fixes #973)

Signed-off-by: Norman Jordan <norman.jordan@improving.com>

---------

Signed-off-by: Norman Jordan <norman.jordan@improving.com>
normanj-bitquill authored Jan 29, 2025
1 parent 3832906 commit 98579e1
Showing 42 changed files with 1,229 additions and 32 deletions.
25 changes: 22 additions & 3 deletions build.sbt
@@ -55,9 +55,6 @@ lazy val testScalastyle = taskKey[Unit]("testScalastyle")
// - .inAll applies the rule to all dependencies, not just direct dependencies
val packagesToShade = Seq(
"com.amazonaws.cloudwatch.**",
"com.fasterxml.jackson.core.**",
"com.fasterxml.jackson.dataformat.**",
"com.fasterxml.jackson.databind.**",
"com.google.**",
"com.sun.jna.**",
"com.thoughtworks.paranamer.**",
@@ -325,6 +322,28 @@ lazy val integtest = (project in file("integ-test"))
lazy val integration = taskKey[Unit]("Run integration tests")
lazy val awsIntegration = taskKey[Unit]("Run AWS integration tests")

lazy val e2etest = (project in file("e2e-test"))
.dependsOn(flintCommons % "test->package", flintSparkIntegration % "test->package", pplSparkIntegration % "test->package", sparkSqlApplication % "test->package")
.settings(
commonSettings,
name := "e2e-test",
scalaVersion := scala212,
libraryDependencies ++= Seq(
"org.scalatest" %% "scalatest" % "3.2.15" % "test",
"org.apache.spark" %% "spark-connect-client-jvm" % "3.5.3" % "test",
"com.amazonaws" % "aws-java-sdk-s3" % "1.12.568" % "test",
"com.softwaremill.sttp.client3" %% "core" % "3.10.2" % "test",
"com.softwaremill.sttp.client3" %% "play2-json" % "3.10.2",
"com.typesafe.play" %% "play-json" % "2.9.2" % "test",
),
libraryDependencies ++= deps(sparkVersion),
javaOptions ++= Seq(
s"-DappJar=${(sparkSqlApplication / assembly).value.getAbsolutePath}",
s"-DextensionJar=${(flintSparkIntegration / assembly).value.getAbsolutePath}",
s"-DpplJar=${(pplSparkIntegration / assembly).value.getAbsolutePath}",
)
)

lazy val standaloneCosmetic = project
.settings(
name := "opensearch-spark-standalone",
72 changes: 46 additions & 26 deletions docker/integ-test/configuration-updater/apply-configuration.sh
@@ -20,13 +20,26 @@ curl -q \
-H 'Content-Type: application/json' \
-d '{"name": "integ-test", "versioning": {"enabled": true, "excludePrefixes": [], "excludeFolders": false}, "locking": true}' \
http://minio-S3:9001/api/v1/buckets
# Create the access key
# Create the test-resources bucket
curl -q \
-b /tmp/minio-cookies.txt \
-X POST \
-H 'Content-Type: application/json' \
-d "{\"policy\": \"\", \"accessKey\": \"${S3_ACCESS_KEY}\", \"secretKey\": \"${S3_SECRET_KEY}\", \"description\": \"\", \"comment\": \"\", \"name\": \"\", \"expiry\": null}" \
http://minio-S3:9001/api/v1/service-account-credentials
-d '{"name": "test-resources", "versioning": {"enabled": false, "excludePrefixes": [], "excludeFolders": false}, "locking": true}' \
http://minio-S3:9001/api/v1/buckets
# Create the access key
curl -q \
-b /tmp/minio-cookies.txt \
-X GET \
"http://minio-S3:9001/api/v1/service-accounts/${S3_ACCESS_KEY}"
if [ "$?" -ne "0" ]; then
curl -q \
-b /tmp/minio-cookies.txt \
-X POST \
-H 'Content-Type: application/json' \
-d "{\"policy\": \"\", \"accessKey\": \"${S3_ACCESS_KEY}\", \"secretKey\": \"${S3_SECRET_KEY}\", \"description\": \"\", \"comment\": \"\", \"name\": \"\", \"expiry\": null}" \
http://minio-S3:9001/api/v1/service-account-credentials
fi

# Login to OpenSearch Dashboards
echo ">>> Login to OpenSearch dashboards"
@@ -43,31 +56,38 @@ if [ "$?" -eq "0" ]; then
else
echo " >>> Login failed"
fi

# Create the S3/Glue datasource
echo ">>> Creating datasource"
curl -q \
-b /tmp/opensearch-cookies.txt \
-X POST \
-H 'Content-Type: application/json' \
-H 'Osd-Version: 2.18.0' \
-H 'Osd-Xsrf: fetch' \
-d "{\"name\": \"mys3\", \"allowedRoles\": [], \"connector\": \"s3glue\", \"properties\": {\"glue.auth.type\": \"iam_role\", \"glue.auth.role_arn\": \"arn:aws:iam::123456789012:role/S3Access\", \"glue.indexstore.opensearch.uri\": \"http://opensearch:9200\", \"glue.indexstore.opensearch.auth\": \"basicauth\", \"glue.indexstore.opensearch.auth.username\": \"admin\", \"glue.indexstore.opensearch.auth.password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}}" \
http://opensearch-dashboards:5601/api/directquery/dataconnections
if [ "$?" -eq "0" ]; then
echo " >>> S3 datasource created"
else
echo " >>> Failed to create S3 datasource"
fi
-X GET \
http://localhost:5601/api/directquery/dataconnections/mys3
if [ "$?" -ne "0" ]; then
echo ">>> Creating datasource"
curl -q \
-b /tmp/opensearch-cookies.txt \
-X POST \
-H 'Content-Type: application/json' \
-H 'Osd-Version: 2.18.0' \
-H 'Osd-Xsrf: fetch' \
-d "{\"name\": \"mys3\", \"allowedRoles\": [], \"connector\": \"s3glue\", \"properties\": {\"glue.auth.type\": \"iam_role\", \"glue.auth.role_arn\": \"arn:aws:iam::123456789012:role/S3Access\", \"glue.indexstore.opensearch.uri\": \"http://opensearch:9200\", \"glue.indexstore.opensearch.auth\": \"basicauth\", \"glue.indexstore.opensearch.auth.username\": \"admin\", \"glue.indexstore.opensearch.auth.password\": \"${OPENSEARCH_ADMIN_PASSWORD}\"}}" \
http://opensearch-dashboards:5601/api/directquery/dataconnections
if [ "$?" -eq "0" ]; then
echo " >>> S3 datasource created"
else
echo " >>> Failed to create S3 datasource"
fi

echo ">>> Setting cluster settings"
curl -v \
-u "admin:${OPENSEARCH_ADMIN_PASSWORD}" \
-X PUT \
-H 'Content-Type: application/json' \
-d '{"persistent": {"plugins.query.executionengine.spark.config": "{\"applicationId\":\"integ-test\",\"executionRoleARN\":\"arn:aws:iam::xxxxx:role/emr-job-execution-role\",\"region\":\"us-west-2\", \"sparkSubmitParameters\": \"--conf spark.dynamicAllocation.enabled=false\"}"}}' \
http://opensearch:9200/_cluster/settings
if [ "$?" -eq "0" ]; then
echo " >>> Successfully set cluster settings"
else
echo " >>> Failed to set cluster settings"
echo ">>> Setting cluster settings"
curl -v \
-u "admin:${OPENSEARCH_ADMIN_PASSWORD}" \
-X PUT \
-H 'Content-Type: application/json' \
-d '{"persistent": {"plugins.query.executionengine.spark.config": "{\"applicationId\":\"integ-test\",\"executionRoleARN\":\"arn:aws:iam::xxxxx:role/emr-job-execution-role\",\"region\":\"us-west-2\", \"sparkSubmitParameters\": \"--conf spark.dynamicAllocation.enabled=false\"}"}}' \
http://opensearch:9200/_cluster/settings
if [ "$?" -eq "0" ]; then
echo " >>> Successfully set cluster settings"
else
echo " >>> Failed to set cluster settings"
fi
fi
5 changes: 2 additions & 3 deletions docker/integ-test/docker-compose.yml
@@ -103,9 +103,8 @@ services:
FLINT_JAR: ${FLINT_JAR}
PPL_JAR: ${PPL_JAR}
SQL_APP_JAR: ${SQL_APP_JAR}
depends_on:
metastore:
condition: service_completed_successfully
entrypoint: /bin/bash
command: exit

opensearch:
build: ./opensearch
(6 binary files not shown)
112 changes: 112 additions & 0 deletions docs/docker/integ-test/query-execution.md
@@ -0,0 +1,112 @@
# Query Execution with the Integration Test Docker Cluster

The integration test docker cluster can be used for the following tests:
* SQL/PPL queries on Spark using local tables
* SQL/PPL queries on Spark using external tables with data stored in MinIO(S3)
* SQL/PPL queries on OpenSearch of OpenSearch indices
* SQL/PPL async queries on OpenSearch of data stored in S3

In all cases, SQL or PPL queries can be used and the processing is very similar. At most there may be a minor
difference in the query request.

## SQL/PPL Queries on Spark Using Local Tables

Connect directly to the Spark master node and execute a query. You can connect using Spark Connect, by submitting
a job, or by running `spark-shell` in the Docker container. Execute `sql()` calls on the SparkSession object.

Local tables are tables that were created in Spark that are not external tables. The metadata and data are stored
in the Spark master container.

Spark will begin query processing by assuming that the query is a PPL query. If it fails to parse in PPL, then
it will fall back to parsing it as a SQL query.

After parsing the query, Spark will look up the metadata for the table(s) and perform the query. The only other
container that may be involved in processing the request is the Spark worker container.
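
As a sketch of what such a query can look like from a test, here is a minimal Spark Connect client snippet
(the e2e-test project already depends on `spark-connect-client-jvm`). The endpoint `sc://localhost:15002` and
the table name are assumptions for illustration, not values taken from the cluster configuration.

```scala
import org.apache.spark.sql.SparkSession

object LocalTableQueryExample {
  def main(args: Array[String]): Unit = {
    // Assumes the Spark master container exposes a Spark Connect endpoint on localhost:15002.
    val spark = SparkSession.builder().remote("sc://localhost:15002").getOrCreate()

    // Create and query a local (managed) table; the table name is illustrative.
    spark.sql("CREATE TABLE IF NOT EXISTS demo_local (id INT, name STRING)")
    spark.sql("INSERT INTO demo_local VALUES (1, 'alpha'), (2, 'beta')")
    spark.sql("SELECT name FROM demo_local WHERE id = 1").show()

    spark.stop()
  }
}
```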

## SQL/PPL Queries on Spark Using External Tables with Data Stored in MinIO(S3)

Connect directly to the Spark master node and execute a query. You can connect using Spark Connect, by submitting
a job, or by running `spark-shell` in the Docker container. Execute `sql()` calls on the SparkSession object.

External tables are tables that were created in Spark that have an `s3a://` location. The metadata is stored in
Hive and the data is stored in MinIO(S3).

Spark will begin query processing by assuming that the query is a PPL query. If it fails to parse in PPL, then
it will fall back to parsing it as a SQL query.

After parsing the query, Spark will look up the metadata for the table(s) from Hive and perform the query. It
will retrieve table data from MinIO(S3).

![Queries for Spark Master](images/queries-for-spark-master.png "Queries for Spark Master")
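
A minimal sketch of creating and querying such an external table over Spark Connect follows; the connection
string, bucket name, and table name are assumptions used only for illustration.

```scala
import org.apache.spark.sql.SparkSession

object ExternalTableQueryExample {
  def main(args: Array[String]): Unit = {
    // Same assumed Spark Connect endpoint as in the local-table example.
    val spark = SparkSession.builder().remote("sc://localhost:15002").getOrCreate()

    // External table: metadata is kept in Hive, data lives in MinIO via the s3a:// scheme.
    // The bucket and table names are illustrative.
    spark.sql(
      """CREATE EXTERNAL TABLE IF NOT EXISTS demo_ext (id INT, name STRING)
        |LOCATION 's3a://integ-test/demo_ext'""".stripMargin)
    spark.sql("INSERT INTO demo_ext VALUES (1, 'alpha')")
    spark.sql("SELECT * FROM demo_ext").show()

    spark.stop()
  }
}
```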

## SQL/PPL Queries on OpenSearch of OpenSearch Indices

Connect directly to the OpenSearch container to submit queries. Use the
[SQL and PPL API](https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/).

The indices are stored in the OpenSearch container.
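
A minimal sketch of calling the SQL API with the sttp client (already a dependency of the e2e-test project);
the host, credentials, and index name are assumptions. For PPL, post to `/_plugins/_ppl` with a PPL query
string instead.

```scala
import sttp.client3._

object OpenSearchSqlApiExample {
  def main(args: Array[String]): Unit = {
    val backend = HttpURLConnectionBackend()

    // Assumes the OpenSearch container is reachable on localhost:9200 and the admin password
    // is available in the environment; the index name is illustrative.
    val response = basicRequest
      .post(uri"http://localhost:9200/_plugins/_sql")
      .auth.basic("admin", sys.env.getOrElse("OPENSEARCH_ADMIN_PASSWORD", "admin"))
      .contentType("application/json")
      .body("""{"query": "SELECT * FROM my_index LIMIT 5"}""")
      .send(backend)

    println(response.body)
    backend.close()
  }
}
```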

## SQL/PPL Async Queries on OpenSearch of Data Stored in S3

Connect directly to the OpenSearch container to submit queries. Use the
[Async Query Interface](https://github.com/opensearch-project/sql/blob/main/docs/user/interfaces/asyncqueryinterface.rst).
This type of query simulates querying an S3/Glue datasource in OpenSearch.

The table metadata is stored in Hive and the table data is stored in MinIO(S3).

There are three phases to query processing:
1. Setup
2. Processing
3. Results Retrieval

OpenSearch will use two special indices.
1. `.query_execution_request_[DATASOURCE_NAME]` - In the integration test Docker cluster, the datasource is
named `mys3`. When an Async Query request is received, an entry is added to this index. The entry contains
the query as well as its state. The state is updated as the request is processed.
2. `query_execution_result_[DATASOURCE_NAME]` - In the integration test Docker cluster, the datasource is
named `mys3`. An entry is added to this index when the results are ready. The entry contains the results of
the query.

Temporary Docker containers are used. They are Apache Spark containers and run jobs locally.

![Queries for Async Query API](images/queries-for-async-api.png "Queries for Async Query API")
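
The client-facing side of this flow can be sketched as follows with the sttp client. The host, credentials,
and table name are assumptions, and the immediate result fetch is a simplification: a real test would poll
until the results appear in the result index.

```scala
import sttp.client3._

object AsyncQueryApiExample {
  def main(args: Array[String]): Unit = {
    val backend = HttpURLConnectionBackend()
    val password = sys.env.getOrElse("OPENSEARCH_ADMIN_PASSWORD", "admin")

    // Submit an async query against the mys3 datasource; the table name is illustrative.
    val submit = basicRequest
      .post(uri"http://localhost:9200/_plugins/_async_query")
      .auth.basic("admin", password)
      .contentType("application/json")
      .body("""{"datasource": "mys3", "lang": "sql", "query": "SELECT * FROM mys3.default.http_logs LIMIT 5"}""")
      .send(backend)
    println(submit.body) // response contains the query ID and session ID

    // Once the Spark container has written the results, fetch them by query ID.
    val queryId = "REPLACE_WITH_QUERY_ID"
    val results = basicRequest
      .get(uri"http://localhost:9200/_plugins/_async_query/$queryId")
      .auth.basic("admin", password)
      .send(backend)
    println(results.body)

    backend.close()
  }
}
```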

### Setup

The setup phase starts when OpenSearch receives an Async Query API request and continues until the query
ID and session ID are returned to the client.

1. Check if the index `.query_execution_request_[DATASOURCE_NAME]` exists.
2. If `.query_execution_request_[DATASOURCE_NAME]` does not exist, then create it.
3. Insert the request into `.query_execution_request_[DATASOURCE_NAME]`
4. Return the query ID and session ID

### Processing

The processing phase starts by checking whether there is a container running for the request's session and
continues until the results are added to the `query_execution_result_[DATASOURCE_NAME]` index.

1. Check if there is a Spark container already running for the request's session
2. If a Spark container is not running for the request's session, then use Docker to start one.
1. Docker initializes and starts the Spark container for the session
3. Spark container checks if the index `query_execution_result_[DATASOURCE_NAME]` exists.
4. If the index `query_execution_result_[DATASOURCE_NAME]` does not exist, then create it.
5. Spark container searches the `.query_execution_request_[DATASOURCE_NAME]` index for the next request
in the session to process.
6. Spark container identifies the tables in the query and gets their metadata from the Hive container
7. Spark container retrieves the table data from the MinIO(S3) container
8. Spark container writes the results to the index `query_execution_result_[DATASOURCE_NAME]`

The Spark container will keep looping through steps 5-8 until it reaches its timeout (currently set to 180 seconds).
Once the timeout is reached, the Spark container will shut down.

### Results Retrieval

The results retrieval phase can happen any time after the results for the query have been added to the index
`query_execution_result_[DATASOURCE_NAME]`.

1. The client requests the results of a previously submitted query from the OpenSearch container using the query ID
   received earlier.
2. OpenSearch container searches the index `query_execution_result_[DATASOURCE_NAME]` for the results of the
query.
3. OpenSearch container returns the query results to the client.
33 changes: 33 additions & 0 deletions docs/docker/integ-test/using-query-workbench.md
@@ -0,0 +1,33 @@
# Using the Query Workbench in OpenSearch Dashboards

The integration test Docker cluster contains an OpenSearch Dashboards container. This container can be used
as a web interface for querying data in the cluster.

[Query Workbench Documentation](https://opensearch.org/docs/latest/dashboards/query-workbench/)

## Logging in to OpenSearch Dashboards

* URL: `http://localhost:5601`
* Username: `admin`
* Password: The password is in the file `docker/integ-test/.env`. It is the value of `OPENSEARCH_ADMIN_PASSWORD`.

## Querying the S3/Glue Datasource

1. Navigate to the Query Workbench
2. Choose `Data source Connections` in the top left

![Data source Connections](images/datasource-selector.png "Data source Connections")
3. In the drop-down below `Data source Connections`, select the S3/Glue datasource. It is named `mys3`.

![Data source Drop-down](images/datasource-drop-down.png "Data source Drop-down")
4. It may take some time to load the namespaces in the datasource. `mys3` only contains the namespace `default`.
5. If you like, you can browse the tables in the `default` namespace by clicking on `default`.

![Data source Browser](images/datasource-browser.png "Data source Browser")
6. Execute a Query

![Query Interface](images/query-workbench-query.png "Query Interface")
1. Choose the query language by clicking on `SQL` or `PPL`
2. Enter a query in the text box
3. Click `Run` to execute the query
4. The results are displayed in the bottom right part of the page