From e6c1edb2d16da906ab2f41e8341dbe71f7e2708d Mon Sep 17 00:00:00 2001 From: Cory Grinstead Date: Wed, 9 Aug 2023 12:02:19 -0500 Subject: [PATCH] fix: properly parse js regex patterns (#92) --- __tests__/expr.test.ts | 21 +++++++++++++++++++-- polars/utils.ts | 9 ++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/__tests__/expr.test.ts b/__tests__/expr.test.ts index 1ea0d40ea..e90c6b66e 100644 --- a/__tests__/expr.test.ts +++ b/__tests__/expr.test.ts @@ -947,6 +947,23 @@ describe("expr.str", () => { expect(actual).toFrameEqual(expected); expect(seriesActual).toSeriesEqual(expected.getColumn("isLinux")); }); + + test("contains:regex", () => { + const df = pl.DataFrame({ + a: ["Foo", "foo", "FoO"], + }); + + const re = new RegExp("foo", "i"); + const expected = pl.DataFrame({ + a: ["Foo", "foo", "FoO"], + contains: [true, true, true], + }); + const seriesActual = df.getColumn("a").str.contains(re).rename("contains"); + const actual = df.withColumn(col("a").str.contains(re).as("contains")); + expect(actual).toFrameEqual(expected); + expect(seriesActual).toSeriesEqual(expected.getColumn("contains")); + }); + test("split", () => { const df = pl.DataFrame({ a: ["ab,cd", "e,fg", "h"] }); const expected = pl.DataFrame({ @@ -976,12 +993,12 @@ describe("expr.str", () => { const seriesActual = df .getColumn("a") - .str.extract(/candidate=(\w+)/g, 1) + .str.extract(/candidate=(\w+)/, 1) .rename("candidate") .toFrame(); const actual = df.select( - col("a").str.extract(/candidate=(\w+)/g, 1).as("candidate"), + col("a").str.extract(/candidate=(\w+)/, 1).as("candidate"), ); expect(actual).toFrameEqual(expected); expect(seriesActual).toFrameEqual(expected); diff --git a/polars/utils.ts b/polars/utils.ts index 9bec5536e..d17173f53 100644 --- a/polars/utils.ts +++ b/polars/utils.ts @@ -58,9 +58,16 @@ export const isExprArray = (ty: any): ty is Expr[] => Array.isArray(ty) && Expr.isExpr(ty[0]); export const isIterator = (ty: any): ty is Iterable => ty !== null && typeof ty[Symbol.iterator] === "function"; + export const regexToString = (r: string | RegExp): string => { if (isRegExp(r)) { - return r.source; + if (r.flags.includes("g")) { + throw new Error("global flag is not supported"); + } + if (r.flags.includes("y")) { + throw new Error("sticky flag is not supported"); + } + return r.flags ? `(?${r.flags})${r.source}` : r.source; } return r;