Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: expand hash length to avoid collision #17

Merged: 1 commit merged on May 22, 2024
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sqlframe/base/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def _create_hash_from_expression(self, expression: exp.Expression) -> str:
from sqlframe.base.session import _BaseSession

value = expression.sql(dialect=_BaseSession().input_dialect).encode("utf-8")
- hash = f"t{zlib.crc32(value)}"[:6]
+ hash = f"t{zlib.crc32(value)}"[:9]
return self.session._normalize_string(hash)

def _get_select_expressions(
Expand Down
20 changes: 10 additions & 10 deletions tests/unit/standalone/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def test_hash_select_expression(standalone_employee: StandaloneDataFrame):
expression = exp.select("cola").from_("table")
- assert standalone_employee._create_hash_from_expression(expression) == "t17051"
+ assert standalone_employee._create_hash_from_expression(expression) == "t17051938"


def test_columns(standalone_employee: StandaloneDataFrame):
Expand All @@ -20,29 +20,29 @@ def test_columns(standalone_employee: StandaloneDataFrame):
def test_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").cache()
expected_statements = [
"DROP VIEW IF EXISTS t31563",
"CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
"DROP VIEW IF EXISTS t31563989",
"CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)


def test_persist_default(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").persist()
expected_statements = [
"DROP VIEW IF EXISTS t31563",
"CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
"DROP VIEW IF EXISTS t31563989",
"CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'MEMORY_AND_DISK_SER') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)


def test_persist_storagelevel(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.select("fname").persist("DISK_ONLY_2")
expected_statements = [
"DROP VIEW IF EXISTS t31563",
"CACHE LAZY TABLE t31563 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563`.`fname` AS `fname` FROM `t31563` AS `t31563`",
"DROP VIEW IF EXISTS t31563989",
"CACHE LAZY TABLE t31563989 OPTIONS('storageLevel' = 'DISK_ONLY_2') AS SELECT CAST(`a1`.`fname` AS STRING) AS `fname` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"SELECT `t31563989`.`fname` AS `fname` FROM `t31563989` AS `t31563989`",
]
compare_sql(df, expected_statements)

Expand Down
12 changes: 6 additions & 6 deletions tests/unit/standalone/test_dataframe_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ def test_insertInto_byName(standalone_employee: StandaloneDataFrame, compare_sql
def test_insertInto_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.cache().write.insertInto("table_name")
expected_statements = [
"DROP VIEW IF EXISTS t12441",
"CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"INSERT INTO table_name SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`",
"DROP VIEW IF EXISTS t12441709",
"CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"INSERT INTO table_name SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`",
]
compare_sql(df, expected_statements)

Expand Down Expand Up @@ -94,9 +94,9 @@ def test_mode_override(standalone_employee: StandaloneDataFrame, compare_sql: t.
def test_saveAsTable_cache(standalone_employee: StandaloneDataFrame, compare_sql: t.Callable):
df = standalone_employee.cache().write.saveAsTable("table_name")
expected_statements = [
"DROP VIEW IF EXISTS t12441",
"CACHE LAZY TABLE t12441 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"CREATE TABLE table_name AS SELECT `t12441`.`employee_id` AS `employee_id`, `t12441`.`fname` AS `fname`, `t12441`.`lname` AS `lname`, `t12441`.`age` AS `age`, `t12441`.`store_id` AS `store_id` FROM `t12441` AS `t12441`",
"DROP VIEW IF EXISTS t12441709",
"CACHE LAZY TABLE t12441709 OPTIONS('storageLevel' = 'MEMORY_AND_DISK') AS SELECT `a1`.`employee_id` AS `employee_id`, CAST(`a1`.`fname` AS STRING) AS `fname`, CAST(`a1`.`lname` AS STRING) AS `lname`, `a1`.`age` AS `age`, `a1`.`store_id` AS `store_id` FROM VALUES (1, 'Jack', 'Shephard', 37, 1), (2, 'John', 'Locke', 65, 1), (3, 'Kate', 'Austen', 37, 2), (4, 'Claire', 'Littleton', 27, 2), (5, 'Hugo', 'Reyes', 29, 100) AS `a1`(`employee_id`, `fname`, `lname`, `age`, `store_id`)",
"CREATE TABLE table_name AS SELECT `t12441709`.`employee_id` AS `employee_id`, `t12441709`.`fname` AS `fname`, `t12441709`.`lname` AS `lname`, `t12441709`.`age` AS `age`, `t12441709`.`store_id` AS `store_id` FROM `t12441709` AS `t12441709`",
]
compare_sql(df, expected_statements)

Expand Down
2 changes: 1 addition & 1 deletion tests/unit/standalone/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def test_sql_with_aggs(standalone_session: StandaloneSession, compare_sql: t.Cal
df = standalone_session.sql(query).groupBy(F.col("cola")).agg(F.sum("colb"))
compare_sql(
df,
"WITH t26614 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t23454 AS (SELECT cola, colb FROM t26614) SELECT cola, SUM(colb) FROM t23454 GROUP BY cola",
"WITH t26614157 AS (SELECT `table`.`cola` AS `cola`, `table`.`colb` AS `colb` FROM `table` AS `table`), t38889420 AS (SELECT cola, colb FROM t26614157) SELECT cola, SUM(colb) FROM t38889420 GROUP BY cola",
pretty=False,
optimize=False,
)
Expand Down