From 6b7753cbc614e85c6331572a9df745a4f5a51a41 Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 18 Apr 2023 10:04:22 -0400 Subject: [PATCH 1/5] Write final parquet with pandas metadata. --- src/hipscat_import/catalog/map_reduce.py | 7 ++++- .../hipscat_import/catalog/test_map_reduce.py | 6 +++-- tests/hipscat_import/conftest.py | 26 +++++++++++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index 5e441d99..1169a098 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -212,7 +212,12 @@ def reduce_pixel_shards( "Npix", [np.full(rows_written, fill_value=destination_pixel_number, dtype=np.int32)], ) - pq.write_table(merged_table, where=destination_file) + if add_hipscat_index: + merged_table.to_pandas().set_index("_hipscat_index").sort_index().to_parquet( + destination_file + ) + else: + merged_table.to_pandas().to_parquet(destination_file) del merged_table, tables diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py index 4093494a..d2af2caf 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hipscat_import/catalog/test_map_reduce.py @@ -210,7 +210,9 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): assert_parquet_file_ids(output_file, "id", expected_ids) -def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_path): +def test_reduce_hipscat_index( + parquet_shards_dir, assert_parquet_file_ids, assert_parquet_file_index, tmp_path +): """Test reducing into one large pixel""" mr.reduce_pixel_shards( cache_path=parquet_shards_dir, @@ -268,7 +270,7 @@ def test_reduce_hipscat_index(parquet_shards_dir, assert_parquet_file_ids, tmp_p 13564690156971098112, 13557377060258709504, ] - assert_parquet_file_ids(output_file, "_hipscat_index", expected_indexes) + 
assert_parquet_file_index(output_file, expected_indexes) def test_reduce_bad_expectation(parquet_shards_dir, tmp_path): diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 864b86bd..e5a7bc05 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -171,3 +171,29 @@ def assert_parquet_file_ids(file_name, id_column, expected_ids): npt.assert_array_equal(ids, expected_ids) return assert_parquet_file_ids + + +@pytest.fixture +def assert_parquet_file_index(): + def assert_parquet_file_index(file_name, expected_values): + """ + Convenience method to read a parquet file and compare the index values to + a list of expected objects. + + Args: + file_name (str): fully-specified path of the file to read + expected_values (:obj:`int[]`): list of expected values in index + """ + assert os.path.exists(file_name), f"file not found [{file_name}]" + + data_frame = pd.read_parquet(file_name, engine="pyarrow") + values = data_frame.index.values.tolist() + expected_values.sort() + + assert len(values) == len( + expected_values + ), f"object list not the same size ({len(values)} vs {len(expected_values)})" + + npt.assert_array_equal(values, expected_values) + + return assert_parquet_file_index From 3c5dd455c854a2c88f1ff71d6a6572f140fe1242 Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Tue, 18 Apr 2023 12:16:41 -0400 Subject: [PATCH 2/5] Increase test coverage. 
--- .../hipscat_import/catalog/test_map_reduce.py | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/hipscat_import/catalog/test_map_reduce.py b/tests/hipscat_import/catalog/test_map_reduce.py index d2af2caf..fcada968 100644 --- a/tests/hipscat_import/catalog/test_map_reduce.py +++ b/tests/hipscat_import/catalog/test_map_reduce.py @@ -4,6 +4,7 @@ import hipscat.pixel_math as hist import numpy.testing as npt +import pandas as pd import pyarrow as pa import pytest @@ -198,6 +199,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): destination_pixel_number=11, destination_pixel_size=131, output_path=tmp_path, + add_hipscat_index=True, ra_column="ra", dec_column="dec", id_column="id", @@ -213,7 +215,7 @@ def test_reduce_order0(parquet_shards_dir, assert_parquet_file_ids, tmp_path): def test_reduce_hipscat_index( parquet_shards_dir, assert_parquet_file_ids, assert_parquet_file_index, tmp_path ): - """Test reducing into one large pixel""" + """Test reducing with or without a _hipscat_index field""" mr.reduce_pixel_shards( cache_path=parquet_shards_dir, origin_pixel_numbers=[47], @@ -271,6 +273,36 @@ def test_reduce_hipscat_index( 13557377060258709504, ] assert_parquet_file_index(output_file, expected_indexes) + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert data_frame.index.name == "_hipscat_index" + npt.assert_array_equal( + data_frame.columns, + ["id", "ra", "dec", "ra_error", "dec_error", "Norder", "Dir", "Npix"], + ) + + mr.reduce_pixel_shards( + cache_path=parquet_shards_dir, + origin_pixel_numbers=[47], + destination_pixel_order=0, + destination_pixel_number=11, + destination_pixel_size=18, + output_path=tmp_path, + add_hipscat_index=False, ## different from above + ra_column="ra", + dec_column="dec", + id_column="id", + delete_input_files=False, + ) + + assert_parquet_file_ids(output_file, "id", expected_ids) + data_frame = pd.read_parquet(output_file, engine="pyarrow") + 
## No index name. + assert data_frame.index.name is None + ## Data fields are the same. + npt.assert_array_equal( + data_frame.columns, + ["id", "ra", "dec", "ra_error", "dec_error", "Norder", "Dir", "Npix"], + ) def test_reduce_bad_expectation(parquet_shards_dir, tmp_path): From dfa77240e39e6e0c26f882099c3601f4429afc0b Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Wed, 19 Apr 2023 13:51:59 -0400 Subject: [PATCH 3/5] Add tests for panda-indexed parquet input. --- src/hipscat_import/catalog/map_reduce.py | 73 ++++--- .../catalog/test_file_readers.py | 9 +- .../catalog/test_resume_files.py | 26 +-- .../hipscat_import/catalog/test_run_import.py | 183 +++++++++++++++++- tests/hipscat_import/conftest.py | 13 +- .../{shard_1.parquet => shard_0_0.parquet} | Bin 4852 -> 4852 bytes .../{shard_3.parquet => shard_1_0.parquet} | Bin 4976 -> 4976 bytes .../{shard_2.parquet => shard_2_0.parquet} | Bin 4908 -> 4908 bytes .../{shard_0.parquet => shard_3_0.parquet} | Bin 4893 -> 4893 bytes .../{shard_4.parquet => shard_4_0.parquet} | Bin 4978 -> 4978 bytes .../{shard_1.parquet => shard_0_0.parquet} | Bin 4887 -> 4887 bytes .../{shard_3.parquet => shard_1_0.parquet} | Bin 4887 -> 4887 bytes .../{shard_2.parquet => shard_2_0.parquet} | Bin 4840 -> 4840 bytes .../{shard_0.parquet => shard_3_0.parquet} | Bin 4841 -> 4841 bytes .../{shard_4.parquet => shard_4_0.parquet} | Bin 4890 -> 4890 bytes .../{shard_1.parquet => shard_0_0.parquet} | Bin 4969 -> 4969 bytes .../{shard_3.parquet => shard_1_0.parquet} | Bin 4878 -> 4878 bytes .../{shard_2.parquet => shard_2_0.parquet} | Bin 4972 -> 4972 bytes .../{shard_0.parquet => shard_3_0.parquet} | Bin 4958 -> 4958 bytes .../{shard_4.parquet => shard_4_0.parquet} | Bin 4841 -> 4841 bytes .../{shard_1.parquet => shard_0_0.parquet} | Bin 4796 -> 4796 bytes .../{shard_3.parquet => shard_1_0.parquet} | Bin 4796 -> 4796 bytes .../{shard_2.parquet => shard_2_0.parquet} | Bin 4823 -> 4823 bytes .../{shard_0.parquet => shard_3_0.parquet} | Bin 4851 
-> 4851 bytes .../{shard_4.parquet => shard_4_0.parquet} | Bin 4869 -> 4869 bytes .../data/test_formats/multiindex.parquet | Bin 0 -> 4036 bytes .../data/test_formats/pandasindex.parquet | Bin 0 -> 4636 bytes 27 files changed, 253 insertions(+), 51 deletions(-) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/{shard_1.parquet => shard_0_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/{shard_3.parquet => shard_1_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/{shard_2.parquet => shard_2_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/{shard_0.parquet => shard_3_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/{shard_4.parquet => shard_4_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/{shard_1.parquet => shard_0_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/{shard_3.parquet => shard_1_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/{shard_2.parquet => shard_2_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/{shard_0.parquet => shard_3_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/{shard_4.parquet => shard_4_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/{shard_1.parquet => shard_0_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/{shard_3.parquet => shard_1_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/{shard_2.parquet => shard_2_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/{shard_0.parquet => shard_3_0.parquet} (96%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/{shard_4.parquet => shard_4_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/{shard_1.parquet => shard_0_0.parquet} (97%) rename 
tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/{shard_3.parquet => shard_1_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/{shard_2.parquet => shard_2_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/{shard_0.parquet => shard_3_0.parquet} (97%) rename tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/{shard_4.parquet => shard_4_0.parquet} (97%) create mode 100644 tests/hipscat_import/data/test_formats/multiindex.parquet create mode 100644 tests/hipscat_import/data/test_formats/pandasindex.parquet diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index 1169a098..91b3d7da 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -30,6 +30,22 @@ def _get_pixel_directory(cache_path: FilePointer, pixel: np.int64): ) +def _has_named_index(dataframe): + """Heuristic to determine if a dataframe has some meaningful index. + + This will reject dataframes with no index name for a single index, + or empty names for multi-index ([] or [None]). + """ + if dataframe.index.name is not None: + ## Single index with a given name. + return True + if len(dataframe.index.names) == 0: + return False + if dataframe.index.names[0] is not None: + return True + return False + + def map_to_pixels( input_file: FilePointer, file_reader, @@ -107,7 +123,10 @@ def map_to_pixels( output_file = file_io.append_paths_to_pointer( pixel_dir, f"shard_{shard_suffix}_{chunk_number}.parquet" ) - filtered_data.to_parquet(output_file) + if _has_named_index(filtered_data): + filtered_data.to_parquet(output_file, index=True) + else: + filtered_data.to_parquet(output_file, index=False) del filtered_data, data_indexes ## Pesky memory! 
@@ -181,45 +200,35 @@ def reduce_pixel_shards( f" Expected {destination_pixel_size}, wrote {rows_written}" ) + dataframe = merged_table.to_pandas() if id_column: - merged_table = merged_table.sort_by(id_column) + dataframe = dataframe.sort_values(id_column) if add_hipscat_index: - merged_table = merged_table.append_column( - "_hipscat_index", - [ - pixel_math.compute_hipscat_id( - merged_table[ra_column].to_pylist(), - merged_table[dec_column].to_pylist(), - ) - ], + dataframe["_hipscat_index"] = pixel_math.compute_hipscat_id( + dataframe[ra_column].values, + dataframe[dec_column].values, ) - merged_table = merged_table.sort_by("_hipscat_index") - merged_table = merged_table.append_column( - "Norder", - [np.full(rows_written, fill_value=destination_pixel_order, dtype=np.int32)], + + dataframe["Norder"] = np.full( + rows_written, fill_value=destination_pixel_order, dtype=np.int32 ) - merged_table = merged_table.append_column( - "Dir", - [ - np.full( - rows_written, - fill_value=int(destination_pixel_number / 10_000) * 10_000, - dtype=np.int32, - ) - ], + dataframe["Dir"] = np.full( + rows_written, + fill_value=int(destination_pixel_number / 10_000) * 10_000, + dtype=np.int32, ) - merged_table = merged_table.append_column( - "Npix", - [np.full(rows_written, fill_value=destination_pixel_number, dtype=np.int32)], + dataframe["Npix"] = np.full( + rows_written, fill_value=destination_pixel_number, dtype=np.int32 ) + if add_hipscat_index: - merged_table.to_pandas().set_index("_hipscat_index").sort_index().to_parquet( - destination_file - ) - else: - merged_table.to_pandas().to_parquet(destination_file) + ## If we had a meaningful index before, preserve it as a column. 
+ if _has_named_index(dataframe): + dataframe = dataframe.reset_index() + dataframe = dataframe.set_index("_hipscat_index").sort_index() + dataframe.to_parquet(destination_file) - del merged_table, tables + del dataframe, merged_table, tables if delete_input_files: for pixel in origin_pixel_numbers: diff --git a/tests/hipscat_import/catalog/test_file_readers.py b/tests/hipscat_import/catalog/test_file_readers.py index affc7c23..87ef60b6 100644 --- a/tests/hipscat_import/catalog/test_file_readers.py +++ b/tests/hipscat_import/catalog/test_file_readers.py @@ -9,9 +9,12 @@ import pytest from hipscat.catalog import CatalogParameters -from hipscat_import.catalog.file_readers import (CsvReader, FitsReader, - ParquetReader, - get_file_reader) +from hipscat_import.catalog.file_readers import ( + CsvReader, + FitsReader, + ParquetReader, + get_file_reader, +) def test_unknown_file_type(): diff --git a/tests/hipscat_import/catalog/test_resume_files.py b/tests/hipscat_import/catalog/test_resume_files.py index bb8b8000..5412772b 100644 --- a/tests/hipscat_import/catalog/test_resume_files.py +++ b/tests/hipscat_import/catalog/test_resume_files.py @@ -4,18 +4,20 @@ import numpy.testing as npt import pytest -from hipscat_import.catalog.resume_files import (clean_resume_files, - is_mapping_done, - is_reducing_done, - read_histogram, - read_mapping_keys, - read_reducing_keys, - set_mapping_done, - set_reducing_done, - write_histogram, - write_mapping_done_key, - write_mapping_start_key, - write_reducing_key) +from hipscat_import.catalog.resume_files import ( + clean_resume_files, + is_mapping_done, + is_reducing_done, + read_histogram, + read_mapping_keys, + read_reducing_keys, + set_mapping_done, + set_reducing_done, + write_histogram, + write_mapping_done_key, + write_mapping_start_key, + write_reducing_key, +) def test_mapping_done(tmp_path): diff --git a/tests/hipscat_import/catalog/test_run_import.py b/tests/hipscat_import/catalog/test_run_import.py index 8db3d6d9..619947ad 
100644 --- a/tests/hipscat_import/catalog/test_run_import.py +++ b/tests/hipscat_import/catalog/test_run_import.py @@ -4,6 +4,7 @@ import shutil import pandas as pd +import numpy.testing as npt import pytest import hipscat_import.catalog.resume_files as rf @@ -105,7 +106,7 @@ def test_resume_dask_runner( # Check that the partition info file exists expected_partition_lines = [ - "Norder,Dir,Npix,num_objects", + "Norder,Dir,Npix,num_rows", "0,0,11,131", ] partition_filename = os.path.join(args.catalog_path, "partition_info.csv") @@ -187,7 +188,7 @@ def test_dask_runner( # Check that the partition info file exists expected_lines = [ - "Norder,Dir,Npix,num_objects", + "Norder,Dir,Npix,num_rows", "0,0,11,131", ] metadata_filename = os.path.join(args.catalog_path, "partition_info.csv") @@ -245,7 +246,7 @@ def test_dask_runner_source_table( # Check that the partition info file exists expected_lines = [ - "Norder,Dir,Npix,num_objects", + "Norder,Dir,Npix,num_rows", "0,0,4,50", "1,0,47,2395", "2,0,176,385", @@ -326,3 +327,179 @@ def test_dask_runner_mixed_schema_csv( ) assert_parquet_file_ids(output_file, "id", [*range(700, 708)]) + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.timeout(10) +def test_dask_runner_preserve_index( + dask_client, + formats_pandasindex, + assert_parquet_file_ids, + assert_parquet_file_index, + tmp_path, +): + """Test basic execution, with input with pandas metadata""" + + expected_indexes = [ + "star1_1", + "star1_2", + "star1_3", + "star1_4", + "galaxy1_1", + "galaxy1_2", + "galaxy2_1", + "galaxy2_2", + ] + assert_parquet_file_index(formats_pandasindex, expected_indexes) + data_frame = pd.read_parquet(formats_pandasindex, engine="pyarrow") + assert data_frame.index.name == "obs_id" + npt.assert_array_equal( + data_frame.columns, + ["obj_id", "band", "ra", "dec", "mag"], + ) + + ## Don't generate a hipscat index. Verify that the original index remains. 
+ args = ImportArguments( + output_catalog_name="pandasindex", + input_file_list=[formats_pandasindex], + input_format="parquet", + id_column="obs_id", + add_hipscat_index=False, + output_path=tmp_path, + dask_tmp=tmp_path, + highest_healpix_order=1, + progress_bar=False, + ) + + runner.run_with_client(args, dask_client) + + # Check that the catalog parquet file exists + output_file = os.path.join( + args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet" + ) + + assert_parquet_file_index(output_file, expected_indexes) + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert data_frame.index.name == "obs_id" + npt.assert_array_equal( + data_frame.columns, + ["obj_id", "band", "ra", "dec", "mag", "Norder", "Dir", "Npix"], + ) + + ## DO generate a hipscat index. Verify that the original index is preserved in a column. + args = ImportArguments( + output_catalog_name="pandasindex_preserve", + input_file_list=[formats_pandasindex], + input_format="parquet", + id_column="obs_id", + add_hipscat_index=True, + output_path=tmp_path, + dask_tmp=tmp_path, + highest_healpix_order=1, + progress_bar=False, + ) + + runner.run_with_client(args, dask_client) + + # Check that the catalog parquet file exists + output_file = os.path.join( + args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet" + ) + + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert data_frame.index.name == "_hipscat_index" + npt.assert_array_equal( + data_frame.columns, + ["obs_id", "obj_id", "band", "ra", "dec", "mag", "Norder", "Dir", "Npix"], + ) + assert_parquet_file_ids(output_file, "obs_id", expected_indexes) + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +@pytest.mark.timeout(10) +def test_dask_runner_multiindex( + dask_client, + formats_multiindex, + assert_parquet_file_ids, + assert_parquet_file_index, + tmp_path, +): + """Test basic execution, with input with pandas metadata""" + + index_arrays = [ + [ + "star1", + "star1", + "star1", + "star1", + 
"galaxy1", + "galaxy1", + "galaxy2", + "galaxy2", + ], + ["r", "r", "i", "i", "r", "r", "r", "r"], + ] + expected_indexes = list(zip(index_arrays[0], index_arrays[1])) + assert_parquet_file_index(formats_multiindex, expected_indexes) + data_frame = pd.read_parquet(formats_multiindex, engine="pyarrow") + assert data_frame.index.names == ["obj_id", "band"] + npt.assert_array_equal( + data_frame.columns, + ["ra", "dec", "mag"], + ) + + ## Don't generate a hipscat index. Verify that the original index remains. + args = ImportArguments( + output_catalog_name="multiindex", + input_file_list=[formats_multiindex], + input_format="parquet", + id_column=["obj_id", "band"], + add_hipscat_index=False, + output_path=tmp_path, + dask_tmp=tmp_path, + highest_healpix_order=1, + progress_bar=False, + ) + + runner.run_with_client(args, dask_client) + + # Check that the catalog parquet file exists + output_file = os.path.join( + args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet" + ) + + assert_parquet_file_index(output_file, expected_indexes) + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert data_frame.index.names == ["obj_id", "band"] + npt.assert_array_equal( + data_frame.columns, + ["ra", "dec", "mag", "Norder", "Dir", "Npix"], + ) + + ## DO generate a hipscat index. Verify that the original index is preserved in a column. 
+ args = ImportArguments( + output_catalog_name="multiindex_preserve", + input_file_list=[formats_multiindex], + input_format="parquet", + id_column=["obj_id", "band"], + add_hipscat_index=True, + output_path=tmp_path, + dask_tmp=tmp_path, + highest_healpix_order=1, + progress_bar=False, + ) + + runner.run_with_client(args, dask_client) + + # Check that the catalog parquet file exists + output_file = os.path.join( + args.catalog_path, "Norder=0", "Dir=0", "Npix=11.parquet" + ) + + data_frame = pd.read_parquet(output_file, engine="pyarrow") + assert data_frame.index.name == "_hipscat_index" + npt.assert_array_equal( + data_frame.columns, + ["obj_id", "band", "ra", "dec", "mag", "Norder", "Dir", "Npix"], + ) + assert_parquet_file_ids(output_file, "obj_id", index_arrays[0]) diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index e5a7bc05..d7d7ce8c 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -69,6 +69,16 @@ def formats_fits(test_data_dir): return os.path.join(test_data_dir, "test_formats", "small_sky.fits") +@pytest.fixture +def formats_pandasindex(test_data_dir): + return os.path.join(test_data_dir, "test_formats", "pandasindex.parquet") + + +@pytest.fixture +def formats_multiindex(test_data_dir): + return os.path.join(test_data_dir, "test_formats", "multiindex.parquet") + + @pytest.fixture def small_sky_parts_dir(test_data_dir): return os.path.join(test_data_dir, "small_sky_parts") @@ -87,7 +97,7 @@ def parquet_shards_dir(test_data_dir): @pytest.fixture def parquet_shards_shard_44_0(test_data_dir): return os.path.join( - test_data_dir, "parquet_shards", "dir_0", "pixel_44", "shard_0.parquet" + test_data_dir, "parquet_shards", "dir_0", "pixel_44", "shard_3_0.parquet" ) @@ -188,6 +198,7 @@ def assert_parquet_file_index(file_name, expected_values): data_frame = pd.read_parquet(file_name, engine="pyarrow") values = data_frame.index.values.tolist() + values.sort() expected_values.sort() assert 
len(values) == len( diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_1.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_0_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_1.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_0_0.parquet index 0ecc22babdd32f4611321ba83669d833abefbb7f..89f0a5b8c69f7cfae1d068d17bd1939d4ef0e1f9 100644 GIT binary patch delta 30 lcmeyO`bBj^3^%Kho`Igh=0xryRyNO4UuTo@%{K+4nE;>B35x&# delta 30 lcmeyO`bBj^3^%Kxp0S?k=0xryRyNmC-%1n1%{K+4nE;?M34Q@+SlK*FeVt9pHwy|%GXVgVlnAB( delta 30 mcmZ3ZwnlBk6mC{SJ!3u7%`>@+SlL`leJf22Hwy|%GXVgV^az;% diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_0.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_3_0.parquet similarity index 96% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_0.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_3_0.parquet index bb3df3aa84eb8d8a79f687c3322a23061def53ff..968f28ef411efd39a90bf56df4e420e511e2f3c4 100644 GIT binary patch delta 30 lcmbQMHdk#!D>ti=o`Igh<}U6cRyNO4UuTo@%}j#QOaPCI2wMOE delta 30 lcmbQMHdk#!D>tj5p0S?k<}U6cRyNmC-%1n1%}j#QOaPDT2u=V1 diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_4.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_4_0.parquet similarity index 96% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_4.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_44/shard_4_0.parquet index ec155e0bbb13555adc24951435259bca5922472d..dfd66ff1b6406d70b07cdc23c89c457c2ca7461d 100644 GIT binary patch delta 30 mcmeyQ_DOBSEpApLJp(<1&G)&BSlK*FeVt9pH-`&KGXVgxz6t~Y delta 30 mcmeyQ_DOBSEpAprJ!3u7&G)&BSlL`leJf22H-`&KGXVgy9trmV diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_1.parquet 
b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_0_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_1.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_0_0.parquet index 09d4b0a25160954499c0308301f3b637215d514c..c375d53c549173e7032c6342aaf48e4a156ae76a 100644 GIT binary patch delta 30 lcmbQPHeGE)JvXb7o`Igh<`(WERyNO4UuTo@&3^=>nE;U<2|NG* delta 30 lcmbQPHeGE)JvXbNp0S?k<`(WERyNmC-%1n1&3^=>nE;V~2`>Nu diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_3.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_1_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_3.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_1_0.parquet index 0681496c9667828be057cea8b1863fdb2d17633e..462df74d38d9ff0092289e4acc832daa0f392aab 100644 GIT binary patch delta 30 lcmbQPHeGE)JvXb7o`Igh<`(WERyNO4UuTo@&3^=>nE;U<2|NG* delta 30 lcmbQPHeGE)JvXbNp0S?k<`(WERyNmC-%1n1&3^=>nE;V~2`>Nu diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_2.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_2_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_2.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_2_0.parquet index e9d7f7587b84ee1e11e14dd4c457f60cc2ebc80e..70cb9f45fc9939e4bfca26ab1892193be82d5105 100644 GIT binary patch delta 30 lcmaE%`a*R>05_|Vo`Igh=1}e;RyNO4UuTo@&1VIqnE;yo2~Pk3 delta 30 lcmaE%`a*R>05_|lp0S?k=1}e;RyNmC-%1n1&1VIqnE;zz2|@q> diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_0.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_3_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_0.parquet rename to 
tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_3_0.parquet index cd743ec245888bf5b2d6e90cc20631b144654fd0..8b75a91c17a2014c67e20ea525a03eff64dfd9ac 100644 GIT binary patch delta 30 lcmaE<`cid6AUCU#o`Igh<}mIeRyNO4UuTo@&F2K9nE;z)2~+?8 delta 30 lcmaE<`cid6AUCU_p0S?k<}mIeRyNmC-%1n1&F2K9nE;!_2}b|` diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_4.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_4_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_4.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_45/shard_4_0.parquet index 8178d38a3600064341ca4dc1af034c5861860cb7..90628d3bb62aaa3a36351830fa9812c04eca1028 100644 GIT binary patch delta 30 lcmbQGHcM?o6E~}oo`Igh=63EPRyNO4UuTo@&Hn|YnE;Yg2}=L~ delta 30 lcmbQGHcM?o6E~}&p0S?k=63EPRyNmC-%1n1&Hn|YnE;Zr2|fS- diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_1.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_0_0.parquet similarity index 96% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_1.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_0_0.parquet index 4fc74471138d3f72ea82a6311b9ed573b80b3ae5..e086dc79ae343b0c77726ed8a3801de98e611be9 100644 GIT binary patch delta 30 mcmaE<_EK%bMQ&CjJp(<1&DXe#SlK*FeVt9pH~R@nGXVgu9|_k0 delta 30 mcmaE<_EK%bMQ&C@J!3u7&DXe#SlL`leJf22H~R@nGXVgue+kL} diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_3.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_1_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_3.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_1_0.parquet index eed0e75dbea0d5be45759d3c8a038172d37d1954..7a57bb06cfe95bf8150b35621accdf348f6df149 100644 GIT binary patch delta 30 
lcmeBE>r>lM#?5M^XP{@WxthC(mCdu%*V&|e^Jf8RCIF3p2@e1O delta 30 lcmeBE>r>lM#?5M|XRK$sxthC(mCd!(x6;IL^Jf8RCIF4!2?78B diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_2.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_2_0.parquet similarity index 96% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_2.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_2_0.parquet index 026b29d02faf44516d0be3122d2cc068fd22d202..a56718d6193be4f3af8fe4c6624eab0d5078a48a 100644 GIT binary patch delta 30 mcmaE(_C{^P6>e4|Jp(<1%{RG=SlK*FeVt9pHwOwzGXVgvR0-w) delta 30 mcmaE(_C{^P6>e5TJ!3u7%{RG=SlL`leJf22HwOwzGXVgvvY2PJp(<1&8NAGSlK*FeVt9pH#-YTGXVgpvI(F7 delta 30 mcmcboc28}?F>Y2vJ!3u7&8NAGSlL`leJf22H#-YTGXVgq5($$4 diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_4.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_4_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_4.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_46/shard_4_0.parquet index 86829624309e509d841abb9775c04d5caffb669f..c3b514c4dfce7285f06ca9ba41ec20b6db4aecae 100644 GIT binary patch delta 30 lcmaE<`cid6AUCU#o`Igh<}mIeRyNO4UuTo@&F2K9nE;z)2~+?8 delta 30 lcmaE<`cid6AUCU_p0S?k<}mIeRyNmC-%1n1&F2K9nE;!_2}b|` diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_1.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_0_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_1.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_0_0.parquet index ead589ed97a767816ffcb41ce3c7b02baf657d6e..f23ccdee35f289a18dd856c2ba850be4f083915b 100644 GIT binary patch delta 30 lcmdm^x<_?`3OB2fo`IghW=-xQRyNO4UuTo@%?kyjnE;7N2y*}c delta 30 lcmdm^x<_?`3OB2vp0S?kW=-xQRyNmC-%1n1%?kyjnE;8Y2xb5P diff --git 
a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_3.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_1_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_3.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_1_0.parquet index 5ea6f49a0ade30111d694088f28973d752e37d74..0132fecfea9ca0a39c856748eefb4076b9cae90c 100644 GIT binary patch delta 30 lcmdm^x<_?`3OB2fo`IghW=-xQRyNO4UuTo@%?kyjnE;7N2y*}c delta 30 lcmdm^x<_?`3OB2vp0S?kW=-xQRyNmC-%1n1%?kyjnE;8Y2xb5P diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_2.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_2_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_2.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_2_0.parquet index 274baece84ce3659cfdf765a268ffd3a9bc5e22f..8802636455ca5a47951c36aca036cbde4a61893f 100644 GIT binary patch delta 30 lcmcbvdR=vcJvXb7o`IghW*6=vRyNO4UuTo@&3go-nE;d~2>Jj3 delta 30 lcmcbvdR=vcJvXbNp0S?kW*6=vRyNmC-%1n1&3go-nE;fA2<-p> diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_0.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_3_0.parquet similarity index 97% rename from tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_0.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_3_0.parquet index e20c765496c01bd511cdf4537967d3c5c76cbd8c..726c615856018b18b1c2138bb5e98ba66a282a62 100644 GIT binary patch delta 30 lcmeyY`dM{DG&ie}o`Igh<^=8{RyNO4UuTo@%{K(3nE;<^35Eaw delta 30 lcmeyY`dM{DG&ifEp0S?k<^=8{RyNmC-%1n1%{K(3nE;>433&hj diff --git a/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_4.parquet b/tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_4_0.parquet similarity index 97% rename from 
tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_4.parquet rename to tests/hipscat_import/data/parquet_shards/dir_0/pixel_47/shard_4_0.parquet index fd5da403319906f2a30ddb65252b5359e89c0ffc..72a901355147ea644802af67c2ff3e543d8caee7 100644 GIT binary patch delta 30 lcmZowYgOBj%gt(}XP{@Wxrn=nmCdu%*V&|e^D6;qCIE@x2;u+$ delta 30 lcmZowYgOBj%gt)2XRK$sxrn=nmCd!(x6;IL^D6;qCIE^+2-N@p diff --git a/tests/hipscat_import/data/test_formats/multiindex.parquet b/tests/hipscat_import/data/test_formats/multiindex.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4c5444ec770ab83b76829e48cf059b3d08fb05b3 GIT binary patch literal 4036 zcmcInZ)_7~7{7K<_D^jH*R*B{vp_SG4cl%7$_MQAM!VLoY@O@%3cB1~|J`5k4@i1@otm83xY1NLF8X{`BM*Udzq{@Gp|LO+iN)y^JZ%k^N$U6=52{2 z*FgR4^|pW8fO~V6RdU}F_iIM|8FP&D{Qc@j?`*lkdghMJ{#jr-&-~R3Z!Cm3&qeF& z=RHQwv*)d^AO9u5c^3Tni|_u-dY;%nH~CYPYti*u26KBYC&JfkZQ_2avw^$Fq#L*& z@38%~0k}73StWN%?tAc|5J(g0eRJOisGnv8rd)AYy0$vk!)w04$PQ@cCfrecRXvwiEaC;N6;94Ow=2iccueKhc@yb+*sCZNGK(w8+vDB9smx z#RA06tvaYAqZ<#PHsYpbpmefX$Ahn!zZbz! 
z*R?8c5nPzrkmoWz|eZd0pF6)HItXe~-S zmrW#5VyP(RDoYM1nB)q`E+z|1f+;W%e%P9nAe@)dXzk>qzJXoVBS-3BmZPxB3|hYo zg=SlUA|+&*!j7S46b+6B7>M4Hen6v9v%)E9uA~|7aw#~5&80NsT`pDc`SLBS+gB94 zy=FG;qN1#(hSjWhUQ5x5oMZ}FS<#Zfvoa%Bkd$&2MwYWBw6n8GInQS$WOZ-1Z?{{a z(^jo`gU_k@FmpE%iR`0xs{_=0vil))xrn_4L5#yMx*j2jA;Oy`d<2mu$h2bfdYAc2 z$OvmRzkhsKeTTb}Mezp+qC~(t_*9a>r?Lp{H2iu?H07G1W1|E?mSDYiXFwE+aes6u z<_yoWBhdmYjq>|=FB=fuJR`VsaWaq(kU<`;3kY`fJk2`G4wiDI7_h}-8G)+=b_9F{ z$mPng)FfJ8Oi<-~h$Q&9C zx|c5OBZKZ#bQ$(TD4rV3^GmqMV1=GgxQYxF&BuN2B27626c>eH1o2A8^Ko4p?y`?=0f;QCn7ClqX57P8n;nDQXiBagKyP} z^NJuU=oUcp2PPO?FkRq60bUM`I`rU;Ae%CT(0r<0Xa*9r2s&0Q!!(M|jKK}18=q9F zj3_>}AbM1)__g_j-jxxD{A3BOS4?HN6T~FCHPk~hlVB;ILBl z)Y52%5QN1a5$MveU&~LTq!0)5m4>ORQU)?LkX}Nq&%w&Z`XqYiM1?lup{`fwy*9E5 s0yA&UG4jD;vM|8qas$fuPOtXG(yM&qSPsA+){Q^ddtDed2LHeR2Y~9Ty#N3J literal 0 HcmV?d00001 diff --git a/tests/hipscat_import/data/test_formats/pandasindex.parquet b/tests/hipscat_import/data/test_formats/pandasindex.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2fdc75a61d6bd7f1dac08f57239ed21f1b73a678 GIT binary patch literal 4636 zcmcgwU2NOd6{cjzb{uDQgHn@P4Kak2Ffb>RXvwS6q7_A3GF4l#l|&S|bt*~85^a)F zsQ7W03t{y0mCeO5d^$L;58%$fYYZOAT!aRg2@Te zH2}$Bx*aO$j?P&?9AjxGWhKil%GNLEUyzH=GEDwQgkaJqP)$V-3Kd@R5H$ zap9GJZ2#x?X5{2Y=dZs0o*DVejemXn>G#dZcfR-7#UI}^BlrDk>+3&y*Npt+`lnC* z?2l&T^iTQHJMWm0Kdb3me|X!B0BRTcU=Ou>%3kT7vTa<%{0J%VIyP_d8XJ4=`v0lh z6JBzAL2m{&s|Vd)#{muD=lVuYNfG$*T{s?YeY!ecH>mZztbMUHKo| zroZdocyy0#zm|H>J2k_$|N6%G2S--e_N(U#>z};9wr@p$Q~%|CY8=|mdh)B+;1MGu1x{=VJ`>x{zBY;@N?Jhr;AENn*3q*{@=-w-(K>*82VU_ zlux|=(w{z)Bc+w*$H{;a`R`u%`CA{#5ki|j_vlA*WcS5Z`sF8;5!-HS+*7tsgl||n z#QpDI>Eo_Tg+A`@9;e=%0q(kdZKq;p>I^|zH!~@+H+FBzMr}} zi;nClSEJZ?o(U1|iHcIp>k>IbkTVZE?RNAqvF3&FScE%+Etm!yI8X$MCaAnRGe~yV z0m9X>cg=dd9__Fi-d#1Ud)pJJdwcAtdt2Gmp;ngb*%{X?HRZZ_>;Z!CEVw7iaJ5Jk z_xC4veX5bmH?vAftLa9?cXsYuKC6y>Pc6*(EO8dw9HSF6H59w)AjgnbL41OSUi0mR6FgOF?8k znBhJ}Uy6rFLDv|L)Da$XC}>N=p*q4t4%MmS5v76pAwsb);GGpJ&LXZ$GcQ$3Cazsg 
zl}*WPq0%k4B-1Q4P@H<+tf(aeNyBISXZ$`WDLZ(dj<$4!dxlA+&hclhi@<&J_&3ng zVqy%#Y{CaUPclr9i58Hoz_0}@i$(|Z4J2IY>em~~);o;Y3)XWC(_mm7d}Al@jTgZv zz$e-e_|T4!S!bBUCajNslhd`D7*7W?bh0k5rmM2CuAWn)GN*@CB@ix)Y$CVDGO9p_ zD>*(;%g~m+n6d2gtwc@W0~)toD#%8e`%N<0Sf zVwW?EsgM#!I!4X$p)&a25zliKj!l4Vppj39{rOB3`$bqJJ-wY{vA!y1Qbq9L7tV7_ z+-gM?`OvmZr^>RfV1Ma!D!z81pl)RX1K5=o*vG2C*~Z1FGM%^toOU>M6z+X`BU+U8 zR1xyh!)Byw5}&M#zyu z=+7gkCm@bxFY1~m$5UIf9;O9%M54s!=7jw&k-{KMq$eC!UzYh1| z9jvoB*vFgUH5Q$jzWU)~#zDD}ZQZDS()V#6uA6;$16nQ~gR`3|cF)?Xzk=$&uQuv} zk=&B_18ZkMKWfkJ>MVA0l!1EY^khrqV}5y&Lv;>0YnBD5BgNRkXB5r|%oFvTwU@M* zNnDora1C+ews&glY_P!@z>iJz_EJx`6k4mxDmqtOYi*kWyF?iEIIF)c2hpsK1`hSV zsDh@@?clMRBx=3z5{;&?5H&^AFYPCeCeRZuqn-fM0=Br*EJM?VAb2UBqE`!5bVuaZnLxeczrdDYhVd~?Y=rUm;bV~$ z&Ky?4eF6PlK=+3A6qW^h{Q>;&LP3Wfj8`mk$R30j(R~4aRxL7+tZvYi7+w^$@1IUA z>pnrf4b>Y6@e*uGxKEZP^HMEeJ*||>r}4cx*SQ$y@Qvxw;IE&3zeE83z}tZT3w;iI CI4f=d literal 0 HcmV?d00001 From abd99444c586aaa15de6e6cf7d1a25222e9bc8c2 Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Wed, 19 Apr 2023 14:08:14 -0400 Subject: [PATCH 4/5] Formatting and coverage. --- src/hipscat_import/catalog/map_reduce.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index 91b3d7da..4e1d715e 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -32,18 +32,18 @@ def _get_pixel_directory(cache_path: FilePointer, pixel: np.int64): def _has_named_index(dataframe): """Heuristic to determine if a dataframe has some meaningful index. - + This will reject dataframes with no index name for a single index, - or empty names for multi-index ([] or [None]). + or empty names for multi-index ([] or [None]). """ if dataframe.index.name is not None: ## Single index with a given name. 
return True - if len(dataframe.index.names) == 0: + if len(dataframe.index.names) == 0 or all( + name is None for name in dataframe.index.names + ): return False - if dataframe.index.names[0] is not None: - return True - return False + return True def map_to_pixels( From 22f8bca971029b0f57dd7186dfb11706041fb64d Mon Sep 17 00:00:00 2001 From: delucchi-cmu Date: Wed, 19 Apr 2023 15:18:56 -0400 Subject: [PATCH 5/5] Expand docstring --- src/hipscat_import/catalog/map_reduce.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/hipscat_import/catalog/map_reduce.py b/src/hipscat_import/catalog/map_reduce.py index 4e1d715e..b2856453 100644 --- a/src/hipscat_import/catalog/map_reduce.py +++ b/src/hipscat_import/catalog/map_reduce.py @@ -34,7 +34,7 @@ def _has_named_index(dataframe): """Heuristic to determine if a dataframe has some meaningful index. This will reject dataframes with no index name for a single index, - or empty names for multi-index ([] or [None]). + or empty names for multi-index (e.g. [] or [None]). """ if dataframe.index.name is not None: ## Single index with a given name. @@ -150,6 +150,24 @@ def reduce_pixel_shards( ): """Reduce sharded source pixels into destination pixels. + In addition to combining multiple shards of data into a single + parquet file, this method will add a few new columns: + + - `Norder` - the healpix order for the pixel + - `Dir` - the directory part, corresponding to the pixel + - `Npix` - the healpix pixel + - `_hipscat_index` - optional - a spatially-correlated + 64-bit index field. + + Notes on `_hipscat_index`: + + - if we generate the field, we will promote any previous + *named* pandas index field(s) to a column with + that name. + - see `hipscat.pixel_math.hipscat_id` + for more in-depth discussion of this field.
+ + Args: cache_path (str): where to read intermediate files origin_pixel_numbers (list[int]): high order pixels, with object @@ -160,8 +178,12 @@ def reduce_pixel_shards( for the catalog's final pixel output_path (str): where to write the final catalog pixel data id_column (str): column for survey identifier, or other sortable column + add_hipscat_index (bool): should we add a _hipscat_index column to + the resulting parquet file? delete_input_files (bool): should we delete the intermediate files used as input for this method. + use_schema_file (str): use the parquet schema from the indicated + parquet file. Raises: ValueError: if the number of rows written doesn't equal provided