From c22d899160f0f54774f9c4a052bb55051e2e4341 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 10:30:37 +0100 Subject: [PATCH 1/9] update arraymancer dep, remove our `len` for `Tensor` --- datamancer.nimble | 2 +- src/datamancer/dataframe.nim | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/datamancer.nimble b/datamancer.nimble index 6867d84..d8d7176 100644 --- a/datamancer.nimble +++ b/datamancer.nimble @@ -11,7 +11,7 @@ srcDir = "src" requires "nim >= 1.2.0" requires "https://github.com/Vindaar/seqmath >= 0.1.11" -requires "arraymancer >= 0.7.1" +requires "arraymancer >= 0.7.28" task test, "Run standard tests": exec "nim c -r tests/testDf.nim" diff --git a/src/datamancer/dataframe.nim b/src/datamancer/dataframe.nim index 349c5b5..8214d30 100644 --- a/src/datamancer/dataframe.nim +++ b/src/datamancer/dataframe.nim @@ -121,12 +121,6 @@ proc shallowCopy*[C: ColumnLike](df: DataTable[C]): DataTable[C] = # ---------- General convenience helpers ---------- -func len*[T](t: Tensor[T]): int = - ## Helper proc for 1D `Tensor[T]` to return the length of the vector, which - ## corresponds to a length of a DF column. - assert t.rank == 1 - result = t.size - proc contains*[C: ColumnLike](df: DataTable[C], key: string): bool = ## Checks if the `key` names column in the `DataFrame`. result = df.data.hasKey(key) From a3ef72f39c787a6e3357f1f517ee1d4c1d5a2fed Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 11:37:03 +0100 Subject: [PATCH 2/9] [formula] check if proc to match types forbidden by {.error.}, ignore --- src/datamancer/formula.nim | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/datamancer/formula.nim b/src/datamancer/formula.nim index 282b137..22e9194 100644 --- a/src/datamancer/formula.nim +++ b/src/datamancer/formula.nim @@ -629,10 +629,29 @@ proc maybeAddSpecialTypes(possibleTypes: var PossibleTypes, n: NimNode) = if strVal in ["<", ">", ">=", "<=", "==", "!="]: for dtype in Dtypes: possibleTypes.add ProcType(inputTypes: @[ident(dtype), - ident(dtype)], + ident(dtype)], isGeneric: false, resType: some(ident("bool"))) +proc isForbiddenByErrorPragma*(n: NimNode): bool = + ## Checks whether the given procedure is actually forbidden by usage of the `{.error: "".}` pragma. + ## This happens e.g. in arraymancer for the `+` and similar operations between Tensor and Scalar + ## nowadays. + ## + ## An example of such a body: + ## + ## StmtList + ## CommentStmt "Mathematical addition of tensors and scalars is undefined. Must use a broadcasted addition instead" + ## Pragma + ## ExprColonExpr + ## Ident "error" + ## StrLit "To add a tensor to a scalar you must use the `+.` operator (instead of a plain `+` operator)" + ## + let body = n.body + if body.kind == nnkStmtList: # magic procs can have an empty body! + result = body[^1].kind == nnkPragma and body[^1][0].kind == nnkExprColonExpr and + body[^1][0][0].kind in {nnkIdent, nnkSym} and body[^1][0][0].strVal == "error" + proc findType(n: NimNode, numArgs: int): PossibleTypes = ## This procedure tries to find type information about a given NimNode. var possibleTypes = PossibleTypes() @@ -684,6 +703,8 @@ proc findType(n: NimNode, numArgs: int): PossibleTypes = let tImpl = ch.getImpl case tImpl.kind of nnkProcDef, nnkFuncDef: + if tImpl.isForbiddenByErrorPragma(): continue # Forbidden by `{.error: "".}`, skip this + let pt = determineTypeFromProc(tImpl, numArgs) if pt.isSome: possibleTypes.add pt.get From 0ff1347d44e9b786d21efba8fc935e5cd6084aa0 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 11:37:29 +0100 Subject: [PATCH 3/9] [tests] update test cases for {.error.} pragma regressions Without the previous commit of checking for the error pragma the original tests did now not pass. We were generating code `Tensor + Scalar` in the previous version for this. But that was never actually intended. Technically it was simply allowed and 'correct' given our logic and available tools in arraymancer. --- tests/testDf.nim | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/testDf.nim b/tests/testDf.nim index 1a0eee0..6e6a851 100644 --- a/tests/testDf.nim +++ b/tests/testDf.nim @@ -1987,15 +1987,32 @@ suite "Formulas": ## This test is really only to test that the `mutate` formula shown here is ## actually compiled correctly into a mapping operation, with or without ## user given `~` + let mpgDf = readCsv("data/mpg.csv") + block WithName: + let df = mpgDf + .group_by("class") + .mutate(f{float -> float: "subMeanHwy" ~ idx(`cty`) + mean(df["hwy"])}) + .arrange("class") + check df.len == 234 + check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor block: - let df = readCsv("data/mpg.csv") + ## Check that it also works correctly without `idx`! + let df = mpgDf .group_by("class") .mutate(f{float -> float: "subMeanHwy" ~ `cty` + mean(df["hwy"])}) .arrange("class") check df.len == 234 check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor block: - let df = readCsv("data/mpg.csv") + ## And same with direct `col` + let df = mpgDf + .group_by("class") + .mutate(f{float -> float: "subMeanHwy" ~ `cty` + mean(col("hwy"))}) + .arrange("class") + check df.len == 234 + check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor + block NoName: ## without an explicit name + let df = mpgDf .group_by("class") .mutate(f{float -> float: `cty` + mean(df["hwy"])}) .arrange("class") From 03a5b2a6cfa9bac709e49459c73c8d80ae0b89be Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 11:38:39 +0100 Subject: [PATCH 4/9] update changelog --- changelog.org | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/changelog.org b/changelog.org index fbaeead..f304df5 100644 --- a/changelog.org +++ b/changelog.org @@ -1,3 +1,9 @@ +* v0.4.1 +- remove our ~len~ for ~Tensor~ due to addition in arraymaner now + (update arraymancer dep to ~>= 0.7.28~) +- improvement to formula logic to detect procedures with ~{.error: + "".}~ pragma usage and ignore them for type matching (necessary due + to forbidding e.g. ~+~ in arraymancer for ~Tensor + Scalar~ now) * v0.4.0 - add ~shuffle~ to shuffle a DF. Either using stdlib global RNG or given RNG From a6ba1a7651f386252528c98553e24b2f365bb7b4 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 11:40:26 +0100 Subject: [PATCH 5/9] update github CI to run 1.6, 2.0 and devel --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c712c1e..d249aaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [version-1-4, devel] + branch: [version-1-6, version-2-0, devel] target: [linux, macos, windows] include: - target: linux From 8bb28a93822aef52a367cb35a45b51a0edf90604 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 13:04:42 +0100 Subject: [PATCH 6/9] [tests] disable `max(c"x")` using tests for 1.6, 2.0 On versions other than devel, `max` here is being bound to the new `max` that takes `varargs[Tensor]` and produces `Tensor`, breaking the formula! --- tests/testDf.nim | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tests/testDf.nim b/tests/testDf.nim index 6e6a851..f97bd54 100644 --- a/tests/testDf.nim +++ b/tests/testDf.nim @@ -6,6 +6,11 @@ from os import removeFile when not declared(AssertionDefect): type AssertionDefect = AssertionError +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + suite "Column": test "Constant columns": let c = constantColumn(12, 100) @@ -901,18 +906,35 @@ suite "DataTable tests": test "Filter - comparisons using function": let x = toSeq(0 .. 100) let df = toDf(x) - let dfFilter = df.filter(f{float: c"x" >= max(c"x") * 0.5}) - check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) - + block: + let dfFilter = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) + onlyDevel: + block: + let dfFilter = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) test "Filter - data types": let x = toSeq(0 .. 100) let df = toDf(x) - let dfFiltered = df.filter(f{float: c"x" >= max(c"x") * 0.5}) - check dfFiltered["x"].kind == colInt - let dfReduced1 = df.summarize(f{int: max(c"x")}) - check dfReduced1["(max x)"].kind == colInt - let dfReduced2 = df.summarize(f{float: max(c"x")}) - check dfReduced2["(max x)"].kind == colFloat + block: + let dfFiltered = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFiltered["x"].kind == colInt + let dfReduced1 = df.summarize(f{int: max(col("x"))}) + check dfReduced1["(max (col x))"].kind == colInt + let dfReduced2 = df.summarize(f{float: max(col("x"))}) + check dfReduced2["(max (col x))"].kind == colFloat + ## Test that `c"x"` works on devel. On older Nim (1.6, 2.0) the `max` is bound directly + ## to `max(varargs[Tensor[T]]): Tensor[T]`, which breaks the formula! + ## This `max` is a recent addition in arraymancer. + ## On devel this seems fixed. + onlyDevel: + block: + let dfFiltered = df.filter(f{float: c"x" >= max(c"x") * 0.5}) + check dfFiltered["x"].kind == colInt + let dfReduced1 = df.summarize(f{int: max(c"x")}) + check dfReduced1["(max x)"].kind == colInt + let dfReduced2 = df.summarize(f{float: max(c"x")}) + check dfReduced2["(max x)"].kind == colFloat test "Transmute - float arithmetic": let x = toSeq(0 ..< 100) From 06eaca7b351087e4c75003b1084eb65f9c9168a4 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 13:06:53 +0100 Subject: [PATCH 7/9] update changelog --- changelog.org | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/changelog.org b/changelog.org index f304df5..32ea516 100644 --- a/changelog.org +++ b/changelog.org @@ -3,7 +3,14 @@ (update arraymancer dep to ~>= 0.7.28~) - improvement to formula logic to detect procedures with ~{.error: "".}~ pragma usage and ignore them for type matching (necessary due - to forbidding e.g. ~+~ in arraymancer for ~Tensor + Scalar~ now) + to forbidding e.g. ~+~ in arraymancer for ~Tensor + Scalar~ now) + +*Note*: If you are on Nim 1.6 or 2.0 (but not after) and you are using + a formula with ~max(c"foo")~ where ~foo~ is a column name, you will + have to replace that by ~max(col("foo"))~. Due to a new addition of a + ~max(varargs[Tensor]]): Tensor~ type procedure in arraymancer, for + some reason older Nim versions bind the symbol ~max~ fully to that + ~varargs~ version. That breaks previous formulas. * v0.4.0 - add ~shuffle~ to shuffle a DF. Either using stdlib global RNG or given RNG From 161802f7eb522b68e3592e55843f7c7b85cf23ce Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 13:18:41 +0100 Subject: [PATCH 8/9] [tests] update other tests accordingly --- tests/tests.nim | 17 ++++++++++++--- tests/testsFormula.nim | 47 ++++++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/tests/tests.nim b/tests/tests.nim index cdf3256..80914e7 100644 --- a/tests/tests.nim +++ b/tests/tests.nim @@ -11,6 +11,11 @@ proc almostEq(a, b: float, epsilon = 1e-8): bool = if not result: echo "Comparison failed: a = ", a, ", b = ", b +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + suite "Value": let v1 = %~ 1 @@ -210,9 +215,15 @@ suite "Formula": check h.val == %~ false check h.name == "(== (%~ tup.a) (%~ tup.b))" - let f2 = f{float: "min" << min(c"runTimes")} - check $f2 == "min" # LHS of formula - check f2.name == "(<< min (min runTimes))" + block: + let f2 = f{float: "min" << min(col("runTimes"))} + check $f2 == "min" # LHS of formula + check f2.name == "(<< min (min (col runTimes)))" + onlyDevel: + block: + let f2 = f{float: "min" << min(c"runTimes")} + check $f2 == "min" # LHS of formula + check f2.name == "(<< min (min runTimes))" test "Evaluate raw formula (no DF column dependency)": diff --git a/tests/testsFormula.nim b/tests/testsFormula.nim index 0626cfd..cdf27de 100644 --- a/tests/testsFormula.nim +++ b/tests/testsFormula.nim @@ -11,6 +11,11 @@ template fails(body: untyped): untyped = else: check true +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + when (NimMajor, NimMinor, NimPatch) < (1, 6, 0): proc isNan(x: float): bool = result = classify(x) == fcNaN @@ -79,9 +84,9 @@ suite "Formulas": # - prefix, automatic type deduction let fn = f{ not idx("f") } check fn.evaluate(df).bCol == [true, false, true].toTensor - block: - let fn = f{ idx("x") >= max(col("x")) * 0.5 } - + onlyDevel: + block: + let fn = f{ idx("x") >= max(col("x")) * 0.5 } block: let fn = f{ parseInt(idx("a")) > 2 } @@ -117,16 +122,32 @@ suite "Formulas": # - type deduction based on `idx` in specific argument of a typically overloaded # symbol. Can be deduced due to only single overload matching the arguments proc someInt(): int = 2 - proc max(x: int, y: string, z: float, b: int): int = - result = 5 - let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } - check fn.evaluate(df).iCol == [5, 5, 5].toTensor + + ## On none devel, `max` and `min` are broken + onlyDevel: + block: + proc max(x: int, y: string, z: float, b: int): int = + result = 5 + + let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } + check fn.evaluate(df).iCol == [5, 5, 5].toTensor + block: + proc getAnInt(x: int, y: string, z: float, b: int): int = + result = 5 + + let fn = f{ getAnInt(idx("a"), "hello", 5.5, someInt()) } + check fn.evaluate(df).iCol == [5, 5, 5].toTensor block: # - automatically determines that `a` should be read as `int` # - formula is mapping - let fn = f{ max(idx("a"), 2) } - check fn.evaluate(df).iCol == [2, 2, 3].toTensor + onlyDevel: + block: + let fn = f{ max(idx("a"), 2) } + check fn.evaluate(df).iCol == [2, 2, 3].toTensor + block: # on 1.6, 2.0 requires explicit type hint. Should work though! + let fn = f{int: max(idx("a"), 2) } + check fn.evaluate(df).iCol == [2, 2, 3].toTensor test "Formula with an if expression accessing multiple columns": block: @@ -174,8 +195,12 @@ suite "Formulas": check fn.evaluate(df).fCol == [5.5, 5.5, 5.5].toTensor test "`max` overload is resolved in context of infix with float": - block: - let fn = f{ `a` >= max(`a`) * 0.5 } + onlyDevel: + block: + let fn = f{ `a` >= max(`a`) * 0.5 } + check fn.evaluate(df).bCol == [false, true, true].toTensor + block: # For 1.6, 2.0 does not work anymore! + let fn = f{float: `a` >= max(col("a")) * 0.5 } check fn.evaluate(df).bCol == [false, true, true].toTensor block: From 66c393e5556f1b8accffcffd8642b84e8af9dedb Mon Sep 17 00:00:00 2001 From: Vindaar Date: Mon, 19 Feb 2024 13:19:24 +0100 Subject: [PATCH 9/9] update changelog --- changelog.org | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/changelog.org b/changelog.org index 32ea516..701c3f3 100644 --- a/changelog.org +++ b/changelog.org @@ -11,6 +11,15 @@ ~max(varargs[Tensor]]): Tensor~ type procedure in arraymancer, for some reason older Nim versions bind the symbol ~max~ fully to that ~varargs~ version. That breaks previous formulas. + In formulas using ~max~ (or similarly ~min~) it may also be necessary + to supply type hints, even if they were not required previously for + related reasons. E.g. replace: + ~f{ max(idx("a"), 2) }~ + by + ~f{int: max(idx("a"), 2) }~ + to indicate the column ~"a"~ should be treated as integers. + Given that this is fixed on Nim devel, I won't attempt a hacky + workaround. Please just update your Nim version or your formulas. :( * v0.4.0 - add ~shuffle~ to shuffle a DF. Either using stdlib global RNG or given RNG