Skip to content

Commit

Permalink
Use drop_duplicates() instead of groupby().last() (about 1.5–2x faster) (#1617)
Browse files: browse the repository at this point in the history
* Use drop_duplicates() instead of groupby().last() (about 1.5–2x faster)

Signed-off-by: rightx2 <rightx2@gmail.com>

* Lint

Signed-off-by: rightx2 <rightx2@gmail.com>
  • Loading branch information
rightx2 authored Jun 4, 2021
1 parent 24dc3f4 commit 024737c
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions sdk/python/feast/infra/offline_stores/file.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -153,8 +153,12 @@ def evaluate_historical_retrieval():
]

df_to_join.sort_values(by=right_entity_key_sort_columns, inplace=True)
df_to_join = df_to_join.groupby(by=right_entity_key_columns).last()
df_to_join.reset_index(inplace=True)
df_to_join.drop_duplicates(
right_entity_key_sort_columns,
keep="last",
ignore_index=True,
inplace=True,
)

# Select only the columns we need to join from the feature dataframe
df_to_join = df_to_join[right_entity_key_columns + feature_names]
Expand Down Expand Up @@ -231,10 +235,9 @@ def pull_latest_from_table_or_query(
(source_df[event_timestamp_column] >= start_date)
& (source_df[event_timestamp_column] < end_date)
]
last_values_df = filtered_df.groupby(by=join_key_columns).last()

# make driver_id a normal column again
last_values_df.reset_index(inplace=True)
last_values_df = filtered_df.drop_duplicates(
join_key_columns, keep="last", ignore_index=True
)

columns_to_extract = set(join_key_columns + feature_name_columns + ts_columns)
table = pyarrow.Table.from_pandas(last_values_df[columns_to_extract])
Expand Down

0 comments on commit 024737c

Please sign in to comment.