Using a function to alter text in a string column #8791
-
Hi, how can I apply this function to all records of a string column? turkish_to_latin = str.maketrans("ğĞıİöÖüÜşŞçÇ", "gGiIoOuUsScC")
def trans(text):
text = text.translate(turkish_to_latin)
return text |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments 1 reply
-
depending on the backend, you should be able to use a user-defined function (UDF) for this. here is the definition of one example and its usage:
or a simpler example in the docs: https://ibis-project.org/reference/scalar-udfs#ibis.expr.operations.udf.scalar.python so you would want something like: turkish_to_latin = str.maketrans("ğĞıİöÖüÜşŞçÇ", "gGiIoOuUsScC")
@ibis.udf.scalar.python
def trans(text):
text = text.translate(turkish_to_latin)
return text
table = con.table("some_table")
table = table.mutate(some_column_name=trans(t.some_column_name)) you could also try using a scalar PyArrow UDF instead of a scalar Python UDF if performance is an issue |
Beta Was this translation helpful? Give feedback.
-
this works well: @classmethod
def col_serializer(cls, t:Table, col:str, *args:int)->Table:
"""squeezes the string column into trimmed args and serializes the given column
t: ibis table, which is the latest modified one or a pandas df
col: str, column name to be squeezed
args: int, max length of the string, can be comma separated, if not given 5, 15 used
"""
# pattern to clean the string
pattern = r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~\s]'
def trans(col):
col = col.re_replace(pattern,'')
for a,b in tuple(zip("ğĞıİöÖüÜşŞçÇ", "gGiIoOuUsScC")):
col = col.re_replace(a,b)
return col
if not isinstance(t, Table):
t = ibis.memtable(t)
# first three chars of the col
col_serialized = f"{col}_serialized"
if not args:
args = (5, 15)
else:
args = (*args,)
t = t.mutate(trans(_[col]).name(f"{col_serialized}"))
for arg in args:
t = t.mutate(_[col_serialized].left(arg).name(f"{col_serialized}{arg}"))
return t ...and I have implemented it in a method to find Levenshtein distance: @classmethod
def levenshtein_distance(cls, t_left:Table, col_left:str, t_right:Table, col_right:str, distance:int=3, chars:int=15)->Table:
"""returns the levenshtein distance of the given columns
t_left: ibis table, which is the left table
col_left: str, column name to be used for left table
t_right: ibis table, which is the right table
col_right: str, column name to be used for right table
distance: int, max distance to be used for filtering leveinshtein distance
chars: int, number of chars to be used for levenshtein distance
usage:
modi.levenshtein_distance(
t_left = t.select("code_customer","customer").distinct(),
col_left = "customer",
t_right= df,
col_right= "firma",
distance=5,
chars=15,
)
"""
left = cls.col_serializer(t_left, col_left, chars)
col_left = f"{col_left}_serialized{chars}"
right = cls.col_serializer(t_right, col_right, chars)
col_right = f"{col_right}_serialized{chars}"
t = (left.cross_join(right)
# calculate levenshtein distance by {chars} serialized chars
.mutate(distance=_[col_left].levenshtein(_[col_right]))
# remove _left and _right cols
.select(sel.matches(r"^(?!.*(_left|_right)).*$"))
# drop col_left and col_right
.drop([col_left, col_right, col_right.replace(str(chars), "")])
# drop serialized col of right
# filter by distance distinct
.filter(_["distance"]<=distance).distinct()
# order by distance
.order_by(["distance"])
)
return t |
Beta Was this translation helpful? Give feedback.
this works well: