From 4017befa80fd862040af0ddf68c7ba31a78cb8ac Mon Sep 17 00:00:00 2001 From: Fikre Mengistu Date: Fri, 24 Nov 2023 14:40:20 -0500 Subject: [PATCH 1/5] supporting spaces in column names for csv files --- evadb/parser/lark_visitor/_common_clauses_ids.py | 5 +++++ test/integration_tests/short/test_load_executor.py | 4 ++-- test/util.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/evadb/parser/lark_visitor/_common_clauses_ids.py b/evadb/parser/lark_visitor/_common_clauses_ids.py index 4dd3080dd..6f4b78c7c 100644 --- a/evadb/parser/lark_visitor/_common_clauses_ids.py +++ b/evadb/parser/lark_visitor/_common_clauses_ids.py @@ -43,6 +43,11 @@ def full_id(self, tree): return (self.visit(tree.children[0]), self.visit(tree.children[1])) def uid(self, tree): + if (hasattr(tree.children[0],"type") and tree.children[0].type == "REVERSE_QUOTE_ID"): + # tree.children[0].value = tree.children[0].value.replace("`","") + temp = str(tree.children[0]).replace("`","") + tree.children[0].type = "simple_id" + return temp return self.visit(tree.children[0]) def full_column_name(self, tree): diff --git a/test/integration_tests/short/test_load_executor.py b/test/integration_tests/short/test_load_executor.py index afc45c592..317704300 100644 --- a/test/integration_tests/short/test_load_executor.py +++ b/test/integration_tests/short/test_load_executor.py @@ -84,7 +84,7 @@ def test_should_load_csv_in_table(self): CREATE TABLE IF NOT EXISTS MyVideoCSV ( id INTEGER UNIQUE, - frame_id INTEGER, + `frame_id` INTEGER, video_id INTEGER, dataset_name TEXT(30), label TEXT(30), @@ -100,7 +100,7 @@ def test_should_load_csv_in_table(self): execute_query_fetch_all(self.evadb, load_query) # execute a select query - select_query = """SELECT id, frame_id, video_id, + select_query = """SELECT id, `frame_id`, video_id, dataset_name, label, bbox, object_id FROM MyVideoCSV;""" diff --git a/test/util.py b/test/util.py index 3a23a6ff5..7f9c3e4ea 100644 --- a/test/util.py +++ b/test/util.py @@ 
-304,7 +304,7 @@ def create_sample_csv(num_frames=NUM_FRAMES): random_coords = 200 + 300 * np.random.random(4) sample_meta[index] = { "id": index, - "frame_id": frame_id, + "frame id": frame_id, "video_id": video_id, "dataset_name": "test_dataset", "label": sample_labels[np.random.choice(len(sample_labels))], From f9942e49dc9133e1bacc9651af7abd71bd84847a Mon Sep 17 00:00:00 2001 From: Fikre Mengistu Date: Fri, 24 Nov 2023 15:33:24 -0500 Subject: [PATCH 2/5] refactoring and adding test case --- .../lark_visitor/_common_clauses_ids.py | 5 +- .../short/test_load_executor.py | 47 ++++++++++++++++++- test/util.py | 32 ++++++++++++- 3 files changed, 78 insertions(+), 6 deletions(-) diff --git a/evadb/parser/lark_visitor/_common_clauses_ids.py b/evadb/parser/lark_visitor/_common_clauses_ids.py index 6f4b78c7c..e1c59a428 100644 --- a/evadb/parser/lark_visitor/_common_clauses_ids.py +++ b/evadb/parser/lark_visitor/_common_clauses_ids.py @@ -44,10 +44,9 @@ def full_id(self, tree): def uid(self, tree): if (hasattr(tree.children[0],"type") and tree.children[0].type == "REVERSE_QUOTE_ID"): - # tree.children[0].value = tree.children[0].value.replace("`","") - temp = str(tree.children[0]).replace("`","") tree.children[0].type = "simple_id" - return temp + non_tick_string = str(tree.children[0]).replace("`","") + return non_tick_string return self.visit(tree.children[0]) def full_column_name(self, tree): diff --git a/test/integration_tests/short/test_load_executor.py b/test/integration_tests/short/test_load_executor.py index 317704300..cd663491a 100644 --- a/test/integration_tests/short/test_load_executor.py +++ b/test/integration_tests/short/test_load_executor.py @@ -19,6 +19,7 @@ from test.util import ( create_dummy_csv_batches, create_sample_csv, + create_csv_with_comlumn_name_spaces, create_sample_video, file_remove, get_evadb_for_testing, @@ -45,6 +46,7 @@ def setUp(self): f"{EvaDB_ROOT_DIR}/test/data/uadetrac/small-data/MVI_20011/*.jpg" ) self.csv_file_path = 
create_sample_csv() + self.csv_file_with_spaces_path = create_csv_with_comlumn_name_spaces() def tearDown(self): shutdown_ray() @@ -84,7 +86,7 @@ def test_should_load_csv_in_table(self): CREATE TABLE IF NOT EXISTS MyVideoCSV ( id INTEGER UNIQUE, - `frame_id` INTEGER, + frame_id INTEGER, video_id INTEGER, dataset_name TEXT(30), label TEXT(30), @@ -100,7 +102,7 @@ def test_should_load_csv_in_table(self): execute_query_fetch_all(self.evadb, load_query) # execute a select query - select_query = """SELECT id, `frame_id`, video_id, + select_query = """SELECT id, frame_id, video_id, dataset_name, label, bbox, object_id FROM MyVideoCSV;""" @@ -117,6 +119,47 @@ def test_should_load_csv_in_table(self): drop_query = "DROP TABLE IF EXISTS MyVideoCSV;" execute_query_fetch_all(self.evadb, drop_query) + ################################### + # integration tests for csv files with spaces in column names + def test_should_load_csv_in_table_with_spaces_in_column_name(self): + # loading a csv requires a table to be created first + create_table_query = """ + + CREATE TABLE IF NOT EXISTS MyVideoCSV ( + id INTEGER UNIQUE, + `frame id` INTEGER, + `video id` INTEGER, + `dataset name` TEXT(30), + label TEXT(30), + bbox NDARRAY FLOAT32(4), + `object id` INTEGER + ); + + """ + execute_query_fetch_all(self.evadb, create_table_query) + + # load the CSV + load_query = f"LOAD CSV '{self.csv_file_with_spaces_path}' INTO MyVideoCSV;" + execute_query_fetch_all(self.evadb, load_query) + + # execute a select query + select_query = """SELECT id, `frame id`, `video id`, + `dataset name`, label, bbox, + `object id` + FROM MyVideoCSV;""" + + actual_batch = execute_query_fetch_all(self.evadb, select_query) + actual_batch.sort() + + # assert the batches are equal + expected_batch = next(create_dummy_csv_batches()) + expected_batch.modify_column_alias("myvideocsv") + self.assertEqual(actual_batch, expected_batch) + + # clean up + drop_query = "DROP TABLE IF EXISTS MyVideoCSV;" + 
execute_query_fetch_all(self.evadb, drop_query) + if __name__ == "__main__": unittest.main() diff --git a/test/util.py b/test/util.py index 7f9c3e4ea..f21830385 100644 --- a/test/util.py +++ b/test/util.py @@ -304,7 +304,7 @@ def create_sample_csv(num_frames=NUM_FRAMES): random_coords = 200 + 300 * np.random.random(4) sample_meta[index] = { "id": index, - "frame id": frame_id, + "frame_id": frame_id, "video_id": video_id, "dataset_name": "test_dataset", "label": sample_labels[np.random.choice(len(sample_labels))], @@ -318,6 +318,36 @@ def create_sample_csv(num_frames=NUM_FRAMES): df_sample_meta.to_csv(os.path.join(get_tmp_dir(), "dummy.csv"), index=False) return os.path.join(get_tmp_dir(), "dummy.csv") +def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES): + try: + os.remove(os.path.join(get_tmp_dir(), "dummy.csv")) + except FileNotFoundError: + pass + + sample_meta = {} + + index = 0 + sample_labels = ["car", "pedestrian", "bicycle"] + num_videos = 2 + for video_id in range(num_videos): + for frame_id in range(num_frames): + random_coords = 200 + 300 * np.random.random(4) + sample_meta[index] = { + "id": index, + "frame id": frame_id, + "video id": video_id, + "dataset name": "test_dataset", + "label": sample_labels[np.random.choice(len(sample_labels))], + "bbox": ",".join([str(coord) for coord in random_coords]), + "object id": np.random.choice(3), + } + + index += 1 + + df_sample_meta = pd.DataFrame.from_dict(sample_meta, "index") + df_sample_meta.to_csv(os.path.join(get_tmp_dir(), "dummy.csv"), index=False) + return os.path.join(get_tmp_dir(), "dummy.csv") + def create_dummy_csv_batches(target_columns=None): if target_columns: From 292f9e6c3cf2900fad8e6f3bc9f27f4d8423d9f5 Mon Sep 17 00:00:00 2001 From: Fikre Mengistu Date: Fri, 24 Nov 2023 18:49:01 -0500 Subject: [PATCH 3/5] moving csv creation to inside new test method --- test/integration_tests/short/test_load_executor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/test/integration_tests/short/test_load_executor.py b/test/integration_tests/short/test_load_executor.py index cd663491a..2494ca0ce 100644 --- a/test/integration_tests/short/test_load_executor.py +++ b/test/integration_tests/short/test_load_executor.py @@ -46,7 +46,6 @@ def setUp(self): f"{EvaDB_ROOT_DIR}/test/data/uadetrac/small-data/MVI_20011/*.jpg" ) self.csv_file_path = create_sample_csv() - self.csv_file_with_spaces_path = create_csv_with_comlumn_name_spaces() def tearDown(self): shutdown_ray() @@ -126,7 +125,7 @@ def test_should_load_csv_in_table_with_spaces_in_column_name(self): create_table_query = """ CREATE TABLE IF NOT EXISTS MyVideoCSV ( - id INTEGER UNIQUE, + id INTEGER UNIQUE, `frame id` INTEGER, `video id` INTEGER, `dataset name` TEXT(30), @@ -139,7 +138,7 @@ def test_should_load_csv_in_table_with_spaces_in_column_name(self): execute_query_fetch_all(self.evadb, create_table_query) # load the CSV - load_query = f"LOAD CSV '{self.csv_file_with_spaces_path}' INTO MyVideoCSV;" + load_query = f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;" execute_query_fetch_all(self.evadb, load_query) # execute a select query From 2f5a678a0c42e9605e0d19a317d39d7352110a5e Mon Sep 17 00:00:00 2001 From: americast Date: Fri, 24 Nov 2023 20:07:37 -0500 Subject: [PATCH 4/5] linted --- evadb/parser/lark_visitor/_common_clauses_ids.py | 7 +++++-- test/integration_tests/short/test_load_executor.py | 6 ++++-- test/util.py | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/evadb/parser/lark_visitor/_common_clauses_ids.py b/evadb/parser/lark_visitor/_common_clauses_ids.py index e1c59a428..5267761cf 100644 --- a/evadb/parser/lark_visitor/_common_clauses_ids.py +++ b/evadb/parser/lark_visitor/_common_clauses_ids.py @@ -43,9 +43,12 @@ def full_id(self, tree): return (self.visit(tree.children[0]), self.visit(tree.children[1])) def uid(self, tree): - if (hasattr(tree.children[0],"type") and tree.children[0].type == "REVERSE_QUOTE_ID"): + if ( 
+ hasattr(tree.children[0], "type") + and tree.children[0].type == "REVERSE_QUOTE_ID" + ): tree.children[0].type = "simple_id" - non_tick_string = str(tree.children[0]).replace("`","") + non_tick_string = str(tree.children[0]).replace("`", "") return non_tick_string return self.visit(tree.children[0]) diff --git a/test/integration_tests/short/test_load_executor.py b/test/integration_tests/short/test_load_executor.py index 2494ca0ce..542012211 100644 --- a/test/integration_tests/short/test_load_executor.py +++ b/test/integration_tests/short/test_load_executor.py @@ -17,9 +17,9 @@ import unittest from pathlib import Path from test.util import ( + create_csv_with_comlumn_name_spaces, create_dummy_csv_batches, create_sample_csv, - create_csv_with_comlumn_name_spaces, create_sample_video, file_remove, get_evadb_for_testing, @@ -138,7 +138,9 @@ def test_should_load_csv_in_table_with_spaces_in_column_name(self): execute_query_fetch_all(self.evadb, create_table_query) # load the CSV - load_query = f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;" + load_query = ( + f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;" + ) execute_query_fetch_all(self.evadb, load_query) # execute a select query diff --git a/test/util.py b/test/util.py index f21830385..7df662b4c 100644 --- a/test/util.py +++ b/test/util.py @@ -318,6 +318,7 @@ def create_sample_csv(num_frames=NUM_FRAMES): df_sample_meta.to_csv(os.path.join(get_tmp_dir(), "dummy.csv"), index=False) return os.path.join(get_tmp_dir(), "dummy.csv") + def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES): try: os.remove(os.path.join(get_tmp_dir(), "dummy.csv")) @@ -334,7 +335,7 @@ def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES): random_coords = 200 + 300 * np.random.random(4) sample_meta[index] = { "id": index, - "frame id": frame_id, + "frame id": frame_id, "video id": video_id, "dataset name": "test_dataset", "label": 
sample_labels[np.random.choice(len(sample_labels))], From fed6659d0105fac94dc3113c87d478e359ba9b32 Mon Sep 17 00:00:00 2001 From: Andy Xu Date: Sun, 3 Dec 2023 04:06:43 -0500 Subject: [PATCH 5/5] Add column with space support in faq. --- docs/source/overview/faq.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/overview/faq.rst b/docs/source/overview/faq.rst index f2d4b61ae..b0dacfa41 100644 --- a/docs/source/overview/faq.rst +++ b/docs/source/overview/faq.rst @@ -34,3 +34,16 @@ If a query runs a complex AI task (e.g., sentiment analysis) on a large table, t top pgrep evadb_server +Can column names contain spaces? +-------------------------------- + +Yes. Enclose any column name that contains spaces in backticks (reverse quotes). Below are example ``CREATE TABLE`` and ``SELECT`` queries: + +.. code-block:: sql + + CREATE TABLE IF NOT EXISTS MyVideoCSV ( + id INTEGER UNIQUE, + `frame id` INTEGER + ); + + SELECT id, `frame id` FROM MyVideoCSV;