Commit a84ab0c
Merge branch 'georgia-tech-db:staging' into staging
kslohith authored Oct 20, 2023
2 parents 83ba8f6 + a6fdd6a
Showing 102 changed files with 1,148 additions and 584 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -234,7 +234,7 @@ jobs:
else
pip install ".[dev,pinecone,chromadb]" # ray < 2.5.0 does not work with python 3.11 ray-project/ray#33864
fi
- python -c "import yaml;f = open('evadb/evadb.yml', 'r+');config_obj = yaml.load(f, Loader=yaml.FullLoader);config_obj['experimental']['ray'] = True;f.seek(0);f.write(yaml.dump(config_obj));f.truncate();"
+ python -c "import evadb;cur=evadb.connect().cursor();cur.query('SET ray=True;')"
else
if [ $PY_VERSION != "3.11" ]; then
pip install ".[dev,ludwig,qdrant,pinecone,chromadb]"
31 changes: 31 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,37 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### [Deprecated]
### [Removed]

## [0.3.8] - 2023-10-18

* PR #1303: v0.3.8 - new release
* PR #1302: Reenable batch for release
* PR #1301: Add Documentation for UDF Unit Testing and Mocking
* PR #1232: Starting the change for XGBoost integration into EVADb.
* PR #1294: fix: improve testcase
* PR #1293: fix: make the table/function catalog insert operation atomic
* PR #1295: feat: add support for show databases
* PR #1296: feat: function_metadata supports boolean and float
* PR #1290: fix: text_summarization uses drop udf
* PR #1240: Add stable diffusion integration
* PR #1285: Update custom-ai-function.rst
* PR #1234: Added basic functionalities of REST apis
* PR #1281: Clickhouse integration
* PR #1273: Update custom-ai-function.rst
* PR #1274: Fix Notebook and Ray testcases at staging
* PR #1264: SHOW command for retrieving configurations
* PR #1270: fix: Catalog init introduces significant overhead
* PR #1267: Improve the error message when there is a typo in the column name in the query.
* PR #1261: Remove dimensions from `TEXT` and `FLOAT`
* PR #1256: Remove table names from column names for `df`
* PR #1253: Collection of fixes for the staging branch
* PR #1246: feat: insertion update index
* PR #1245: Documentation on vector stores + vector benchmark
* PR #1244: feat: create index from projection
* PR #1233: GitHub Data Source Integration
* PR #1115: Add support for Neuralforecast
* PR #1241: Bump Version to v0.3.8+dev
* PR #1239: release 0.3.7

## [0.3.7] - 2023-09-30

* PR #1239: release 0.3.7
2 changes: 1 addition & 1 deletion README.md
@@ -116,7 +116,7 @@ Our target audience is software developers who may not necessarily have a backgr

<details>
<ul>
- <li>Connect EvaDB to your SQL and vector database systems with the <a href="https://evadb.readthedocs.io/en/stable/source/reference/databases/postgres.html">`CREATE DATABASE`</a> and <a href="https://evadb.readthedocs.io/en/stable/source/reference/evaql/create.html#create-index">`CREATE INDEX`</a> statements.</li>
+ <li>Connect EvaDB to your SQL and vector database systems with the <a href="https://evadb.readthedocs.io/en/stable/source/reference/databases/postgres.html">`CREATE DATABASE`</a> and <a href="https://evadb.readthedocs.io/en/stable/source/reference/evaql/create_index.html">`CREATE INDEX`</a> statements.</li>
<li>Write SQL queries with AI functions to get inference results:</li>
<ul>
<li>Pick a pre-trained AI model from Hugging Face, Open AI, Ultralytics, PyTorch, and built-in AI frameworks for generative AI, NLP, and vision applications;</li>
4 changes: 2 additions & 2 deletions apps/pandas_qa/pandas_qa.py
@@ -53,10 +53,10 @@ def receive_user_input() -> Dict:

# get OpenAI key if needed
try:
-        api_key = os.environ["OPENAI_KEY"]
+        api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
api_key = str(input("🔑 Enter your OpenAI key: "))
-        os.environ["OPENAI_KEY"] = api_key
+        os.environ["OPENAI_API_KEY"] = api_key

return user_input

4 changes: 2 additions & 2 deletions apps/youtube_qa/youtube_qa.py
@@ -93,10 +93,10 @@ def receive_user_input() -> Dict:

# get OpenAI key if needed
try:
-        api_key = os.environ["OPENAI_KEY"]
+        api_key = os.environ["OPENAI_API_KEY"]
except KeyError:
api_key = str(input("🔑 Enter your OpenAI key: "))
-        os.environ["OPENAI_KEY"] = api_key
+        os.environ["OPENAI_API_KEY"] = api_key

return user_input

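Both app changes above follow the same pattern: read the key from the environment, otherwise prompt once and cache it back into the environment for later calls. A minimal stdlib sketch of that pattern (the helper name and the injectable `prompt` parameter are illustrative, not part of the apps):

```python
import os

def get_openai_key(prompt=input):
    """Return the OpenAI API key, prompting once if it is not set."""
    try:
        api_key = os.environ["OPENAI_API_KEY"]
    except KeyError:
        api_key = str(prompt("🔑 Enter your OpenAI key: "))
        os.environ["OPENAI_API_KEY"] = api_key  # cache for subsequent calls
    return api_key
```

Making `prompt` a parameter keeps the helper testable without patching `builtins.input`.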
4 changes: 4 additions & 0 deletions docs/_toc.yml
@@ -89,6 +89,8 @@ parts:
title: Model Training with Ludwig
- file: source/reference/ai/model-train-sklearn
title: Model Training with Sklearn
+  - file: source/reference/ai/model-train-xgboost
+    title: Model Training with XGBoost
- file: source/reference/ai/model-forecasting
title: Time Series Forecasting
- file: source/reference/ai/hf
@@ -135,6 +137,8 @@ parts:
title: Code Style
- file: source/dev-guide/contribute/troubleshoot
title: Troubleshooting
+  - file: source/dev-guide/contribute/unit-test
+    title: Unit Testing UDFs in EvaDB

- file: source/dev-guide/debugging
title: Debugging EvaDB
80 changes: 80 additions & 0 deletions docs/source/dev-guide/contribute/unit-test.rst
@@ -0,0 +1,80 @@
Unit Testing UDFs in EvaDB
===========================

Introduction
------------

Unit testing is a crucial aspect of software development. When working with User Defined Functions (UDFs) in EvaDB, it's essential to ensure that they work correctly. This guide will walk you through the process of writing unit tests for UDFs and using mocking to simulate external dependencies.

Setting Up Test Environment
---------------------------

Before writing tests, set up a test environment. This often involves creating a test database or table and populating it with sample data.

.. code-block:: python

    def setUp(self) -> None:
        self.evadb = get_evadb_for_testing()
        self.evadb.catalog().reset()
        create_table_query = """CREATE TABLE IF NOT EXISTS TestTable (
                prompt TEXT(100));
        """
        execute_query_fetch_all(self.evadb, create_table_query)
        test_prompts = ["sample prompt"]
        for prompt in test_prompts:
            insert_query = f"""INSERT INTO TestTable (prompt) VALUES ('{prompt}')"""
            execute_query_fetch_all(self.evadb, insert_query)

Mocking External Dependencies
-----------------------------

When testing UDFs that rely on external services or APIs, use mocking to simulate these dependencies.

.. code-block:: python

    @patch("requests.get")
    @patch("external_library.Method", return_value={"data": [{"url": "mocked_url"}]})
    def test_udf(self, mock_method, mock_requests_get):
        # Mock the response from the external service
        mock_response = MagicMock()
        mock_response.content = "mocked content"
        mock_requests_get.return_value = mock_response
        # Rest of the test code...

Writing the Test
----------------

After setting up the environment and mocking dependencies, write the test for the UDF.

.. code-block:: python

    function_name = "ImageDownloadUDF"
    query = f"SELECT {function_name}(prompt) FROM TestTable;"
    output = execute_query_fetch_all(self.evadb, query)
    expected_output = [...]  # Expected output
    self.assertEqual(output, expected_output)

Cleaning Up After Tests
-----------------------

Clean up any resources used during testing, such as database tables.

.. code-block:: python

    def tearDown(self) -> None:
        execute_query_fetch_all(self.evadb, "DROP TABLE IF EXISTS TestTable;")

Running the Tests
-----------------

Run the tests using a test runner like ``unittest``.

.. code-block:: bash

    python -m unittest path_to_your_test_module.py

Conclusion
----------

Unit testing UDFs in EvaDB ensures their correctness and robustness. Mocking allows for simulating external dependencies, making tests faster and more deterministic.
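The pieces above can be combined into a self-contained script. The sketch below mirrors the guide's mocking pattern with a stand-in downloader function rather than a real EvaDB UDF, so it runs with only the standard library:

```python
import urllib.request
import unittest
from unittest.mock import MagicMock, patch

def download_image(url):
    # Stand-in for a UDF body that fetches data from an external service.
    return urllib.request.urlopen(url).read()

class DownloadUDFTest(unittest.TestCase):
    @patch("urllib.request.urlopen")
    def test_download(self, mock_urlopen):
        # Simulate the external service instead of touching the network.
        mock_response = MagicMock()
        mock_response.read.return_value = b"mocked content"
        mock_urlopen.return_value = mock_response

        result = download_image("http://example.com/x.png")

        self.assertEqual(result, b"mocked content")
        mock_urlopen.assert_called_once_with("http://example.com/x.png")
```

Running `python -m unittest` against this module executes the test deterministically, with no network access.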
2 changes: 1 addition & 1 deletion docs/source/overview/concepts.rst
@@ -46,7 +46,7 @@ Here are some illustrative **AI queries** for a ChatGPT-based video question ans
--- The 'transcripts' table has a column called 'text' with the transcript text
--- Since ChatGPT is a built-in function in EvaDB, we don't have to define it
--- We can directly use ChatGPT() in any query
- --- We will only need to set the OPENAI_KEY as an environment variable
+ --- We will only need to set the OPENAI_API_KEY as an environment variable
SELECT ChatGPT('Is this video summary related to Ukraine russia war', text)
FROM TEXT_SUMMARY;
5 changes: 1 addition & 4 deletions docs/source/reference/ai/custom-ai-function.rst
@@ -44,7 +44,7 @@ The abstract method `setup` must be implemented in your function. The setup func
Any additional arguments needed for creating the function must be passed as arguments to the setup function. (Please refer to the
`ChatGPT <https://github.com/georgia-tech-db/evadb/blob/master/evadb/functions/chatgpt.py>`__ function example).

- The additional arguments are passed with the CREATE command. Please refer to `CREATE <https://evadb.readthedocs.io/en/stable/source/reference/evaql/create.html#create-function>`_ command documentation.
+ The additional arguments are passed with the CREATE command. Please refer to `CREATE <https://evadb.readthedocs.io/en/stable/source/reference/evaql/create_function.html>`_ command documentation.

The custom setup operations for the function can be written inside the function in the child class. If there is no need for any custom logic, then you can just simply write "pass" in the function definition.

@@ -258,9 +258,6 @@ The following code can be used to create an Object Detection function using Yolo
try_to_import_openai()
import openai
-        #setting up the key
-        openai.api_key = ConfigurationManager().get_value("third_party", "OPENAI_KEY")
#getting the data
content = text_df[text_df.columns[0]]
responses = []
26 changes: 26 additions & 0 deletions docs/source/reference/ai/model-train-xgboost.rst
@@ -0,0 +1,26 @@
.. _xgboost:

Model Training with XGBoost
============================

1. Installation
---------------

To use the `Flaml XGBoost AutoML framework <https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-XGBoost/>`_, we need to install the extra Flaml dependency in your EvaDB virtual environment.

.. code-block:: bash

    pip install "flaml[automl]"

2. Example Query
----------------

.. code-block:: sql

    CREATE FUNCTION IF NOT EXISTS PredictRent FROM
        ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals )
    TYPE XGBoost
    PREDICT 'rental_price';

In the above query, you are creating a new customized function by training a model from the ``HomeRentals`` table using the ``Flaml XGBoost`` framework.
The ``rental_price`` column will be the target column for prediction, while the remaining columns from the ``SELECT`` query are the inputs.
2 changes: 1 addition & 1 deletion docs/source/usecases/question-answering.rst
@@ -57,7 +57,7 @@ EvaDB has built-in support for ``ChatGPT`` function from ``OpenAI``. You will ne
# Set OpenAI key
import os
-    os.environ["OPENAI_KEY"] = "sk-..."
+    os.environ["OPENAI_API_KEY"] = "sk-..."
.. note::

32 changes: 28 additions & 4 deletions evadb/binder/statement_binder.py
@@ -32,7 +32,9 @@
from evadb.binder.statement_binder_context import StatementBinderContext
from evadb.catalog.catalog_type import ColumnType, TableType
from evadb.catalog.catalog_utils import get_metadata_properties, is_document_table
+ from evadb.catalog.sql_config import RESTRICTED_COL_NAMES
from evadb.configuration.constants import EvaDB_INSTALLATION_DIR
+ from evadb.executor.execution_context import Context
from evadb.expression.abstract_expression import AbstractExpression, ExpressionType
from evadb.expression.function_expression import FunctionExpression
from evadb.expression.tuple_value_expression import TupleValueExpression
@@ -101,7 +103,9 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement):
outputs.append(column)
else:
inputs.append(column)
-        elif string_comparison_case_insensitive(node.function_type, "sklearn"):
+        elif string_comparison_case_insensitive(
+            node.function_type, "sklearn"
+        ) or string_comparison_case_insensitive(node.function_type, "XGBoost"):
assert (
"predict" in arg_map
), f"Creating {node.function_type} functions expects 'predict' metadata."
@@ -201,6 +205,12 @@ def _bind_delete_statement(self, node: DeleteTableStatement):

@bind.register(CreateTableStatement)
def _bind_create_statement(self, node: CreateTableStatement):
+        # we don't allow certain keywords in the column_names
+        for col in node.column_list:
+            assert (
+                col.name.lower() not in RESTRICTED_COL_NAMES
+            ), f"EvaDB does not allow to create a table with column name {col.name}"

if node.query is not None:
self.bind(node.query)

@@ -264,6 +274,11 @@ def _bind_tuple_expr(self, node: TupleValueExpression):

@bind.register(FunctionExpression)
def _bind_func_expr(self, node: FunctionExpression):
+        # setup the context
+        # we read the GPUs from the catalog and populate in the context
+        gpus_ids = self._catalog().get_configuration_catalog_value("gpu_ids")
+        node._context = Context(gpus_ids)

# handle the special case of "extract_object"
if node.name.upper() == str(FunctionType.EXTRACT_OBJECT):
handle_bind_extract_object_function(node, self)
@@ -331,9 +346,18 @@
)
# certain functions take additional inputs like yolo needs the model_name
# these arguments are passed by the user as part of metadata
-            node.function = lambda: function_class(
-                **get_metadata_properties(function_obj)
-            )
+            # we also handle the special case of ChatGPT where we need to send the
+            # OpenAPI key as part of the parameter if not provided by the user
+            properties = get_metadata_properties(function_obj)
+            if string_comparison_case_insensitive(node.name, "CHATGPT"):
+                # if the user didn't provide any API_KEY, check if we have one in the catalog
+                if "OPENAI_API_KEY" not in properties.keys():
+                    openapi_key = self._catalog().get_configuration_catalog_value(
+                        "OPENAI_API_KEY"
+                    )
+                    properties["openai_api_key"] = openapi_key
+
+            node.function = lambda: function_class(**properties)
except Exception as e:
err_msg = (
f"{str(e)}. Please verify that the function class name in the "
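The ChatGPT branch in the hunk above follows a general pattern: start from the user-supplied function metadata and fall back to a catalog value only when a required key is absent. A stdlib-only sketch of that fallback (the names are illustrative, not EvaDB's API):

```python
def resolve_properties(user_props, catalog_defaults, fallback_keys):
    """Merge user-supplied metadata with catalog fallbacks.

    Keys the user already set always win; a key listed in
    fallback_keys is filled from the catalog only when missing.
    """
    resolved = dict(user_props)
    for key in fallback_keys:
        if key not in resolved and key in catalog_defaults:
            resolved[key] = catalog_defaults[key]
    return resolved

catalog = {"OPENAI_API_KEY": "sk-from-catalog"}
props = resolve_properties({"temperature": "0.7"}, catalog, ["OPENAI_API_KEY"])
```

Because the user's value takes precedence, a key passed explicitly at `CREATE FUNCTION` time is never overwritten by the catalog default.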