feat: Support filtering for reviewed-only edits #881

Merged
7 changes: 7 additions & 0 deletions dataquality/metrics.py
@@ -355,6 +355,7 @@ def get_edited_dataframe(
include_token_indices: bool = False,
hf_format: bool = False,
tagging_schema: Optional[TaggingSchema] = None,
reviewed_only: Optional[bool] = False,
as_pandas: bool = True,
include_data_embs: bool = False,
) -> Union[pd.DataFrame, DataFrame]:
@@ -383,6 +384,8 @@ def get_edited_dataframe(
Whether to export the dataframe in a HuggingFace compatible format
:param tagging_schema: (NER only)
If hf_format is True, you must pass a tagging schema
:param reviewed_only: Whether to export only reviewed edits or all edits.
Default: False (all edits)
:param as_pandas: Whether to return the dataframe as a pandas df (or vaex if False)
If you are having memory issues (the data is too large), set this to False,
and vaex will memory map the data. If any columns returned are multi-dimensional
@@ -405,6 +408,10 @@
tagging_schema=tagging_schema,
)
data_df = vaex.open(file_name)

if reviewed_only:
data_df = data_df[data_df.reviewers]

return _process_exported_dataframe(
data_df,
project_name,
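A minimal usage sketch of the new parameter (the project name, run name, and split below are illustrative placeholders, not part of this PR). With reviewed_only=True, the exported edits are filtered to rows whose reviewers column is non-empty; the default (False) returns all edits.

from dataquality import metrics

# Export only edits that have at least one reviewer; with the default
# reviewed_only=False, all edits are returned.
edited_df = metrics.get_edited_dataframe(
    "my_project",   # placeholder project name
    "my_run",       # placeholder run name
    "training",     # placeholder split, normalized via conform_split
    reviewed_only=True,
)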
23 changes: 20 additions & 3 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -137,6 +137,7 @@ accelerate = ">=0.19.0"
typing-inspect = "==0.8.0"
typing-extensions = ">=4.9.0"
lightning = "^2.3.1" # Assuming you want the latest version as no version was specified
pytest-mock = "^3.14.0"



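The new pytest-mock dev dependency provides the mocker fixture used by the tests below. A minimal sketch of the pattern, using a trivial stand-in test that is not part of this PR:

from dataquality import metrics

def test_patches_api_client(mocker):
    # mocker.patch.object replaces metrics.api_client with a Mock for the
    # duration of this test and restores the original afterwards.
    api_mock = mocker.patch.object(metrics, "api_client")
    api_mock.get_task_type.return_value = "text_classification"
    assert metrics.api_client.get_task_type("proj_id", "run_id") == "text_classification"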
199 changes: 199 additions & 0 deletions tests/test_metrics.py
@@ -0,0 +1,199 @@
from unittest.mock import ANY, Mock

import vaex

from dataquality import metrics


def test_get_edited_dataframe_all_edits(mocker):
reviewed_only = False

project_id = "project_id"
project_name = "project_name"
run_id = "run_id"
run_name = "run_name"
task_type = "task_type"
file_type = Mock()
uuid = "test-uuid"
inference_name = ""
hf_format = False
tagging_schema = Mock()
as_pandas = True
include_embs = False
include_data_embs = False
include_probs = False
include_token_indices = False

test_df = vaex.from_dict(
{
"id": range(0, 10),
"confidence": [0.7] * 10,
"is_drifted": [False] * 7 + [True] * 3,
"reviewers": [[]] * 7 + [["review1"]] * 3,
}
)

api_mock = mocker.patch.object(metrics, "api_client")
split = Mock()
conform_split_mock = mocker.patch("dataquality.metrics.conform_split")
split_mock = conform_split_mock.return_value

api_mock._get_project_run_id.return_value = [project_id, run_id]
api_mock.get_task_type.return_value = task_type

mocker.patch("dataquality.metrics.uuid4", return_value=uuid)
mocker.patch("dataquality.metrics.vaex.open", return_value=test_df)

_process_exported_dataframe_mock = mocker.patch(
"dataquality.metrics._process_exported_dataframe"
)

response = metrics.get_edited_dataframe(
project_name,
run_name,
split,
inference_name,
file_type,
include_embs,
include_probs,
include_token_indices,
hf_format,
tagging_schema,
reviewed_only,
as_pandas,
include_data_embs,
)

assert response == _process_exported_dataframe_mock.return_value
conform_split_mock.assert_called_once_with(split)
api_mock._get_project_run_id.assert_called_once_with(project_name, run_name)
api_mock.get_task_type.assert_called_once_with(project_id, run_id)

api_mock.export_edits.assert_called_once_with(
project_name,
run_name,
split_mock,
inference_name=inference_name,
file_name=f"/tmp/{uuid}-data.{file_type}",
hf_format=hf_format,
tagging_schema=tagging_schema,
)

_process_exported_dataframe_mock.assert_called_once_with(
test_df,
project_name,
run_name,
split_mock,
task_type,
inference_name,
include_embs,
include_probs,
include_token_indices,
hf_format,
as_pandas,
include_data_embs,
)


def test_get_edited_dataframe_reviewed_only_edits(mocker):
reviewed_only = True

project_id = "project_id"
project_name = "project_name"
run_id = "run_id"
run_name = "run_name"
task_type = "task_type"
file_type = Mock()
uuid = "test-uuid"
inference_name = ""
hf_format = False
tagging_schema = Mock()
as_pandas = True
include_embs = False
include_data_embs = False
include_probs = False
include_token_indices = False

test_df = vaex.from_dict(
{
"id": range(0, 10),
"confidence": [0.7] * 10,
"is_drifted": [False] * 7 + [True] * 3,
"reviewers": [[]] * 7 + [["review1"]] * 3,
}
)

expected_df = vaex.from_dict(
{
"id": range(7, 10),
"confidence": [0.7] * 3,
"is_drifted": [True] * 3,
}
)
pandas_df = expected_df.to_pandas_df()
pandas_df["reviewers"] = [["review1"]] * 3
expected_df = vaex.from_pandas(pandas_df)

api_mock = mocker.patch.object(metrics, "api_client")
split = Mock()
conform_split_mock = mocker.patch("dataquality.metrics.conform_split")
split_mock = conform_split_mock.return_value

api_mock._get_project_run_id.return_value = [project_id, run_id]
api_mock.get_task_type.return_value = task_type

mocker.patch("dataquality.metrics.uuid4", return_value=uuid)
mocker.patch("dataquality.metrics.vaex.open", return_value=test_df)

_process_exported_dataframe_mock = mocker.patch(
"dataquality.metrics._process_exported_dataframe"
)

response = metrics.get_edited_dataframe(
project_name,
run_name,
split,
inference_name,
file_type,
include_embs,
include_probs,
include_token_indices,
hf_format,
tagging_schema,
reviewed_only,
as_pandas,
include_data_embs,
)

assert response == _process_exported_dataframe_mock.return_value
conform_split_mock.assert_called_once_with(split)
api_mock._get_project_run_id.assert_called_once_with(project_name, run_name)
api_mock.get_task_type.assert_called_once_with(project_id, run_id)

api_mock.export_edits.assert_called_once_with(
project_name,
run_name,
split_mock,
inference_name=inference_name,
file_name=f"/tmp/{uuid}-data.{file_type}",
hf_format=hf_format,
tagging_schema=tagging_schema,
)

_process_exported_dataframe_mock.assert_called_once_with(
ANY,
project_name,
run_name,
split_mock,
task_type,
inference_name,
include_embs,
include_probs,
include_token_indices,
hf_format,
as_pandas,
include_data_embs,
)

call_df = _process_exported_dataframe_mock.call_args_list[0][0][0]
assert call_df.reviewers.tolist() == [["review1"]] * 3