diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c712c1e..d249aaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [version-1-4, devel] + branch: [version-1-6, version-2-0, devel] target: [linux, macos, windows] include: - target: linux diff --git a/changelog.org b/changelog.org index fbaeead..701c3f3 100644 --- a/changelog.org +++ b/changelog.org @@ -1,3 +1,25 @@ +* v0.4.1 +- remove our ~len~ for ~Tensor~ due to addition in arraymaner now + (update arraymancer dep to ~>= 0.7.28~) +- improvement to formula logic to detect procedures with ~{.error: + "".}~ pragma usage and ignore them for type matching (necessary due + to forbidding e.g. ~+~ in arraymancer for ~Tensor + Scalar~ now) + +*Note*: If you are on Nim 1.6 or 2.0 (but not after) and you are using + a formula with ~max(c"foo")~ where ~foo~ is a column name, you will + have to replace that by ~max(col("foo"))~. Due to a new addition of a + ~max(varargs[Tensor]]): Tensor~ type procedure in arraymancer, for + some reason older Nim versions bind the symbol ~max~ fully to that + ~varargs~ version. That breaks previous formulas. + In formulas using ~max~ (or similarly ~min~) it may also be necessary + to supply type hints, even if they were not required previously for + related reasons. E.g. replace: + ~f{ max(idx("a"), 2) }~ + by + ~f{int: max(idx("a"), 2) }~ + to indicate the column ~"a"~ should be treated as integers. + Given that this is fixed on Nim devel, I won't attempt a hacky + workaround. Please just update your Nim version or your formulas. :( * v0.4.0 - add ~shuffle~ to shuffle a DF. Either using stdlib global RNG or given RNG diff --git a/datamancer.nimble b/datamancer.nimble index 6867d84..d8d7176 100644 --- a/datamancer.nimble +++ b/datamancer.nimble @@ -11,7 +11,7 @@ srcDir = "src" requires "nim >= 1.2.0" requires "https://github.com/Vindaar/seqmath >= 0.1.11" -requires "arraymancer >= 0.7.1" +requires "arraymancer >= 0.7.28" task test, "Run standard tests": exec "nim c -r tests/testDf.nim" diff --git a/src/datamancer/dataframe.nim b/src/datamancer/dataframe.nim index 349c5b5..8214d30 100644 --- a/src/datamancer/dataframe.nim +++ b/src/datamancer/dataframe.nim @@ -121,12 +121,6 @@ proc shallowCopy*[C: ColumnLike](df: DataTable[C]): DataTable[C] = # ---------- General convenience helpers ---------- -func len*[T](t: Tensor[T]): int = - ## Helper proc for 1D `Tensor[T]` to return the length of the vector, which - ## corresponds to a length of a DF column. - assert t.rank == 1 - result = t.size - proc contains*[C: ColumnLike](df: DataTable[C], key: string): bool = ## Checks if the `key` names column in the `DataFrame`. result = df.data.hasKey(key) diff --git a/src/datamancer/formula.nim b/src/datamancer/formula.nim index 282b137..22e9194 100644 --- a/src/datamancer/formula.nim +++ b/src/datamancer/formula.nim @@ -629,10 +629,29 @@ proc maybeAddSpecialTypes(possibleTypes: var PossibleTypes, n: NimNode) = if strVal in ["<", ">", ">=", "<=", "==", "!="]: for dtype in Dtypes: possibleTypes.add ProcType(inputTypes: @[ident(dtype), - ident(dtype)], + ident(dtype)], isGeneric: false, resType: some(ident("bool"))) +proc isForbiddenByErrorPragma*(n: NimNode): bool = + ## Checks whether the given procedure is actually forbidden by usage of the `{.error: "".}` pragma. + ## This happens e.g. in arraymancer for the `+` and similar operations between Tensor and Scalar + ## nowadays. + ## + ## An example of such a body: + ## + ## StmtList + ## CommentStmt "Mathematical addition of tensors and scalars is undefined. Must use a broadcasted addition instead" + ## Pragma + ## ExprColonExpr + ## Ident "error" + ## StrLit "To add a tensor to a scalar you must use the `+.` operator (instead of a plain `+` operator)" + ## + let body = n.body + if body.kind == nnkStmtList: # magic procs can have an empty body! + result = body[^1].kind == nnkPragma and body[^1][0].kind == nnkExprColonExpr and + body[^1][0][0].kind in {nnkIdent, nnkSym} and body[^1][0][0].strVal == "error" + proc findType(n: NimNode, numArgs: int): PossibleTypes = ## This procedure tries to find type information about a given NimNode. var possibleTypes = PossibleTypes() @@ -684,6 +703,8 @@ proc findType(n: NimNode, numArgs: int): PossibleTypes = let tImpl = ch.getImpl case tImpl.kind of nnkProcDef, nnkFuncDef: + if tImpl.isForbiddenByErrorPragma(): continue # Forbidden by `{.error: "".}`, skip this + let pt = determineTypeFromProc(tImpl, numArgs) if pt.isSome: possibleTypes.add pt.get diff --git a/tests/testDf.nim b/tests/testDf.nim index 1a0eee0..f97bd54 100644 --- a/tests/testDf.nim +++ b/tests/testDf.nim @@ -6,6 +6,11 @@ from os import removeFile when not declared(AssertionDefect): type AssertionDefect = AssertionError +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + suite "Column": test "Constant columns": let c = constantColumn(12, 100) @@ -901,18 +906,35 @@ suite "DataTable tests": test "Filter - comparisons using function": let x = toSeq(0 .. 100) let df = toDf(x) - let dfFilter = df.filter(f{float: c"x" >= max(c"x") * 0.5}) - check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) - + block: + let dfFilter = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) + onlyDevel: + block: + let dfFilter = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFilter["x"].toTensor(int) == toTensor toSeq(50 .. 100) test "Filter - data types": let x = toSeq(0 .. 100) let df = toDf(x) - let dfFiltered = df.filter(f{float: c"x" >= max(c"x") * 0.5}) - check dfFiltered["x"].kind == colInt - let dfReduced1 = df.summarize(f{int: max(c"x")}) - check dfReduced1["(max x)"].kind == colInt - let dfReduced2 = df.summarize(f{float: max(c"x")}) - check dfReduced2["(max x)"].kind == colFloat + block: + let dfFiltered = df.filter(f{float: c"x" >= max(col("x")) * 0.5}) + check dfFiltered["x"].kind == colInt + let dfReduced1 = df.summarize(f{int: max(col("x"))}) + check dfReduced1["(max (col x))"].kind == colInt + let dfReduced2 = df.summarize(f{float: max(col("x"))}) + check dfReduced2["(max (col x))"].kind == colFloat + ## Test that `c"x"` works on devel. On older Nim (1.6, 2.0) the `max` is bound directly + ## to `max(varargs[Tensor[T]]): Tensor[T]`, which breaks the formula! + ## This `max` is a recent addition in arraymancer. + ## On devel this seems fixed. + onlyDevel: + block: + let dfFiltered = df.filter(f{float: c"x" >= max(c"x") * 0.5}) + check dfFiltered["x"].kind == colInt + let dfReduced1 = df.summarize(f{int: max(c"x")}) + check dfReduced1["(max x)"].kind == colInt + let dfReduced2 = df.summarize(f{float: max(c"x")}) + check dfReduced2["(max x)"].kind == colFloat test "Transmute - float arithmetic": let x = toSeq(0 ..< 100) @@ -1987,15 +2009,32 @@ suite "Formulas": ## This test is really only to test that the `mutate` formula shown here is ## actually compiled correctly into a mapping operation, with or without ## user given `~` + let mpgDf = readCsv("data/mpg.csv") + block WithName: + let df = mpgDf + .group_by("class") + .mutate(f{float -> float: "subMeanHwy" ~ idx(`cty`) + mean(df["hwy"])}) + .arrange("class") + check df.len == 234 + check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor block: - let df = readCsv("data/mpg.csv") + ## Check that it also works correctly without `idx`! + let df = mpgDf .group_by("class") .mutate(f{float -> float: "subMeanHwy" ~ `cty` + mean(df["hwy"])}) .arrange("class") check df.len == 234 check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor block: - let df = readCsv("data/mpg.csv") + ## And same with direct `col` + let df = mpgDf + .group_by("class") + .mutate(f{float -> float: "subMeanHwy" ~ `cty` + mean(col("hwy"))}) + .arrange("class") + check df.len == 234 + check df["subMeanHwy", float][0 ..< 5] == [40.8, 39.8, 40.8, 39.8, 39.8].toTensor + block NoName: ## without an explicit name + let df = mpgDf .group_by("class") .mutate(f{float -> float: `cty` + mean(df["hwy"])}) .arrange("class") diff --git a/tests/tests.nim b/tests/tests.nim index cdf3256..80914e7 100644 --- a/tests/tests.nim +++ b/tests/tests.nim @@ -11,6 +11,11 @@ proc almostEq(a, b: float, epsilon = 1e-8): bool = if not result: echo "Comparison failed: a = ", a, ", b = ", b +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + suite "Value": let v1 = %~ 1 @@ -210,9 +215,15 @@ suite "Formula": check h.val == %~ false check h.name == "(== (%~ tup.a) (%~ tup.b))" - let f2 = f{float: "min" << min(c"runTimes")} - check $f2 == "min" # LHS of formula - check f2.name == "(<< min (min runTimes))" + block: + let f2 = f{float: "min" << min(col("runTimes"))} + check $f2 == "min" # LHS of formula + check f2.name == "(<< min (min (col runTimes)))" + onlyDevel: + block: + let f2 = f{float: "min" << min(c"runTimes")} + check $f2 == "min" # LHS of formula + check f2.name == "(<< min (min runTimes))" test "Evaluate raw formula (no DF column dependency)": diff --git a/tests/testsFormula.nim b/tests/testsFormula.nim index 0626cfd..cdf27de 100644 --- a/tests/testsFormula.nim +++ b/tests/testsFormula.nim @@ -11,6 +11,11 @@ template fails(body: untyped): untyped = else: check true +template onlyDevel(body: untyped): untyped = + ## Used to disable some tests on older nim versions + when (NimMajor, NimMinor, NimPatch) >= (2, 1, 0): + body + when (NimMajor, NimMinor, NimPatch) < (1, 6, 0): proc isNan(x: float): bool = result = classify(x) == fcNaN @@ -79,9 +84,9 @@ suite "Formulas": # - prefix, automatic type deduction let fn = f{ not idx("f") } check fn.evaluate(df).bCol == [true, false, true].toTensor - block: - let fn = f{ idx("x") >= max(col("x")) * 0.5 } - + onlyDevel: + block: + let fn = f{ idx("x") >= max(col("x")) * 0.5 } block: let fn = f{ parseInt(idx("a")) > 2 } @@ -117,16 +122,32 @@ suite "Formulas": # - type deduction based on `idx` in specific argument of a typically overloaded # symbol. Can be deduced due to only single overload matching the arguments proc someInt(): int = 2 - proc max(x: int, y: string, z: float, b: int): int = - result = 5 - let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } - check fn.evaluate(df).iCol == [5, 5, 5].toTensor + + ## On none devel, `max` and `min` are broken + onlyDevel: + block: + proc max(x: int, y: string, z: float, b: int): int = + result = 5 + + let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } + check fn.evaluate(df).iCol == [5, 5, 5].toTensor + block: + proc getAnInt(x: int, y: string, z: float, b: int): int = + result = 5 + + let fn = f{ getAnInt(idx("a"), "hello", 5.5, someInt()) } + check fn.evaluate(df).iCol == [5, 5, 5].toTensor block: # - automatically determines that `a` should be read as `int` # - formula is mapping - let fn = f{ max(idx("a"), 2) } - check fn.evaluate(df).iCol == [2, 2, 3].toTensor + onlyDevel: + block: + let fn = f{ max(idx("a"), 2) } + check fn.evaluate(df).iCol == [2, 2, 3].toTensor + block: # on 1.6, 2.0 requires explicit type hint. Should work though! + let fn = f{int: max(idx("a"), 2) } + check fn.evaluate(df).iCol == [2, 2, 3].toTensor test "Formula with an if expression accessing multiple columns": block: @@ -174,8 +195,12 @@ suite "Formulas": check fn.evaluate(df).fCol == [5.5, 5.5, 5.5].toTensor test "`max` overload is resolved in context of infix with float": - block: - let fn = f{ `a` >= max(`a`) * 0.5 } + onlyDevel: + block: + let fn = f{ `a` >= max(`a`) * 0.5 } + check fn.evaluate(df).bCol == [false, true, true].toTensor + block: # For 1.6, 2.0 does not work anymore! + let fn = f{float: `a` >= max(col("a")) * 0.5 } check fn.evaluate(df).bCol == [false, true, true].toTensor block: