
Drop Ping Time Duplicates #1382

Merged · 12 commits · Jan 28, 2025
12 changes: 12 additions & 0 deletions echopype/convert/set_groups_ek80.py
@@ -8,6 +8,7 @@
from ..utils.coding import set_time_encodings
from ..utils.log import _init_logger
from .set_groups_base import SetGroupsBase
from .utils.ek_duplicates import check_unique_ping_time_duplicates

logger = _init_logger(__name__)

@@ -1145,6 +1146,17 @@ def set_beam(self) -> List[xr.Dataset]:

ds_data = self._attach_vars_to_ds_data(ds_data, ch, rs_size=ds_data.range_sample.size)

# Access the 'ping_time' coordinate as a NumPy array
ping_times = ds_data["ping_time"].values

# Check if ping time duplicates exist
if len(ping_times) > len(np.unique(ping_times)):
# Warn if the data stored under duplicate ping times is not identical
check_unique_ping_time_duplicates(ds_data, logger)

# Drop duplicates
ds_data = ds_data.drop_duplicates(dim="ping_time")

if ch in self.sorted_channel["complex"]:
ds_complex.append(ds_data)
else:
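For reference, a minimal sketch (illustrative only, not part of the diff) of the detect-then-drop pattern used in `set_beam` above: `np.unique` reveals whether the `ping_time` coordinate contains duplicates, and xarray's `Dataset.drop_duplicates` keeps the first slice for each duplicated value.

import numpy as np
import xarray as xr

# Toy dataset with one duplicated ping_time (values are made up)
ping_times = np.array(
    ["2021-09-13T13:06:12", "2021-09-13T13:06:12", "2021-09-13T13:06:13"],
    dtype="datetime64[ns]",
)
ds = xr.Dataset(
    {"backscatter_r": ("ping_time", [1.0, 1.0, 2.0])},
    coords={"ping_time": ping_times},
)

# Same detection check as in set_beam: total count vs. unique count
assert len(ds["ping_time"].values) > len(np.unique(ds["ping_time"].values))

# drop_duplicates keeps the first occurrence along ping_time
ds = ds.drop_duplicates(dim="ping_time")
assert ds.sizes["ping_time"] == 2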
44 changes: 44 additions & 0 deletions echopype/convert/utils/ek_duplicates.py
@@ -0,0 +1,44 @@
import logging

import xarray as xr


def check_unique_ping_time_duplicates(ds_data: xr.Dataset, logger: logging.Logger) -> None:
"""
Logs a warning if the data stored in duplicate pings is not identical.

Parameters
----------
ds_data : xr.Dataset
Single-frequency beam dataset being processed in the `SetGroupsEK80.set_beam` class function.
logger : logging.Logger
Warning logger initialized in the `set_groups_ek80` module.
"""
# Group the dataset by the "ping_time" coordinate
groups = ds_data.groupby("ping_time")

# Loop through each ping_time group
for ping_time_val, group in groups:
# Extract all data variable names to check
data_vars = list(group.data_vars)

# Use the first duplicate ping time index as a reference
ref_duplicate_ping_time_index = 0

# Iterate over each data variable in the group
for var in data_vars:
# Extract data array corresponding to the iterated variable
data_array = group[var]

# Use the slice corresponding to the reference index as the reference slice
ref_slice = data_array.isel({"ping_time": ref_duplicate_ping_time_index})

# Iterate over the remaining entries
for i in range(1, data_array.sizes["ping_time"]):
if not ref_slice.equals(data_array.isel({"ping_time": i})):
logger.warning(
f"Duplicate slices in variable '{var}' corresponding to 'ping_time' "
f"{ping_time_val} differ in data. All duplicate 'ping_time' entries "
"will be removed, which will result in data loss."
)
break
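
A minimal usage sketch (assuming `check_unique_ping_time_duplicates` is in scope; the demo logger and toy values are hypothetical): when two slices share a `ping_time` but carry different data, the helper logs the data-loss warning.

import logging

import numpy as np
import xarray as xr

logging.basicConfig(level=logging.WARNING)
demo_logger = logging.getLogger("ek_duplicates_demo")  # hypothetical logger

# Two entries share one ping_time but differ in value,
# so the helper logs a warning for "backscatter_r"
ping_times = np.array(["2021-09-13T13:06:12"] * 2, dtype="datetime64[ns]")
ds = xr.Dataset(
    {"backscatter_r": ("ping_time", [1.0, 2.0])},
    coords={"ping_time": ping_times},
)
check_unique_ping_time_duplicates(ds, demo_logger)
# -> warning: duplicate slices in 'backscatter_r' ... differ in data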
59 changes: 59 additions & 0 deletions echopype/tests/convert/test_convert_ek80.py
@@ -4,11 +4,14 @@
import numpy as np
import pandas as pd
from scipy.io import loadmat
import xarray as xr

from echopype import open_raw, open_converted
from echopype.testing import TEST_DATA_FOLDER
from echopype.convert.parse_ek80 import ParseEK80
from echopype.convert.set_groups_ek80 import WIDE_BAND_TRANS, PULSE_COMPRESS, FILTER_IMAG, FILTER_REAL, DECIMATION
from echopype.utils import log
from echopype.convert.utils.ek_duplicates import check_unique_ping_time_duplicates


@pytest.fixture
@@ -513,6 +516,62 @@ def test_parse_missing_sound_velocity_profile():
shutil.rmtree(save_path)


@pytest.mark.unit
def test_duplicate_ping_times(caplog):
"""
Tests that a RAW file with duplicate ping times can be parsed, that the duplicates are dropped, and that no data-loss warning is logged, since the duplicate pings share identical data.
"""
# Turn on logger verbosity
log.verbose(override=False)

# Open RAW
ed = open_raw("echopype/test_data/ek80_duplicate_ping_times/Hake-D20210913-T130612.raw", sonar_model="EK80")

# Check that there are no ping time duplicates in Beam group
assert ed["Sonar/Beam_group1"].equals(
ed["Sonar/Beam_group1"].drop_duplicates(dim="ping_time")
)

# Check that no warning is logged, since all duplicate pings share identical data
not_expected_warning = (
    "All duplicate 'ping_time' entries will be removed, which will result in data loss."
)
assert not any(not_expected_warning in record.message for record in caplog.records)

# Turn off logger verbosity
log.verbose(override=True)


@pytest.mark.unit
def test_check_unique_ping_time_duplicates(caplog):
"""
Checks that `check_unique_ping_time_duplicates` logs a warning when duplicate ping times do not share identical data.
"""
# Initialize logger
logger = log._init_logger(__name__)

# Turn on logger verbosity
log.verbose(override=False)

# Open duplicate ping time beam dataset
ds_data = xr.open_zarr("echopype/test_data/ek80_duplicate_ping_times/duplicate_beam_ds.zarr")

# Modify a single entry so that at least one pair of duplicate ping times
# no longer shares the same backscatter data
ds_data["backscatter_r"][0, 0, 0] = 0

# Check for ping time duplicates
check_unique_ping_time_duplicates(ds_data, logger)

# Turn off logger verbosity
log.verbose(override=True)

# Check if the expected warning is logged
expected_warning = (
"Duplicate slices in variable 'backscatter_r' corresponding to 'ping_time' "
f"{str(ds_data['ping_time'].values[0])} differ in data. All duplicate "
"'ping_time' entries will be removed, which will result in data loss."
)
assert any(expected_warning in record.message for record in caplog.records)


@pytest.mark.unit
def test_parse_ek80_with_invalid_env_datagrams():
"""
2 changes: 1 addition & 1 deletion requirements.txt
@@ -9,7 +9,7 @@ pytz
scipy
xarray
pandas
zarr
zarr>=2,<3
fsspec
s3fs
requests
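A side note on the pin: zarr-python 3.0 introduced a substantially changed API, so constraining to `zarr>=2,<3` presumably keeps the conversion code on the 2.x series it was written against. A quick sanity check of an installed environment (illustrative only):

import zarr

# The new pin allows only 2.x releases
major = int(zarr.__version__.split(".")[0])
assert major == 2, f"expected a zarr 2.x release, got {zarr.__version__}"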