Skip to content

Commit

Permalink
GH-36619: [Python] Parquet statistics string representation misleading (#36626)
Browse files Browse the repository at this point in the history

### Rationale for this change

If `Statistics.has_distinct_count` is false, the printout still shows a `distinct_count` value (`0`).

### What changes are included in this PR?

Update the `distinct_count` property of the `Statistics` class to check `has_distinct_count` and return `None` when it is false, instead of reporting a misleading `0`.

### Are these changes tested?

Yes. The tests for `distinct_count` already exist in `test_parquet_column_statistics_api`; they only needed to be updated to expect `None` instead of `0`.
* Closes: #36619

Authored-by: AlenkaF <frim.alenka@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
AlenkaF authored Jul 12, 2023
1 parent 95a8bfb commit 63644f4
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 21 deletions.
17 changes: 9 additions & 8 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -175,17 +175,18 @@ cdef class Statistics(_Weakrefable):
@property
def null_count(self):
"""Number of null values in chunk (int)."""
return self.statistics.get().null_count()
if self.has_null_count:
return self.statistics.get().null_count()
else:
return None

@property
def distinct_count(self):
"""
Distinct number of values in chunk (int).
If this is not set, will return 0.
"""
# This seems to be zero if not set. See: ARROW-11793
return self.statistics.get().distinct_count()
"""Distinct number of values in chunk (int)."""
if self.has_distinct_count:
return self.statistics.get().distinct_count()
else:
return None

@property
def num_values(self):
Expand Down
26 changes: 13 additions & 13 deletions python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,33 +161,33 @@ def test_parquet_metadata_lifetime(tempdir):
'distinct_count'
),
[
([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
'FLOAT', -1.1, 4.4, 1, 4, 0
'FLOAT', -1.1, 4.4, 1, 4, None
),
(
[-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
'DOUBLE', -1.1, 4.4, 1, 4, 0
'DOUBLE', -1.1, 4.4, 1, 4, None
),
(
['', 'b', chr(1000), None, 'aaa'], pa.binary(),
'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, 0
'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, None
),
(
[True, False, False, True, True], pa.bool_(),
'BOOLEAN', False, True, 0, 5, 0
'BOOLEAN', False, True, 0, 5, None
),
(
[b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
'BYTE_ARRAY', b'\x00', b'b', 1, 4, 0
'BYTE_ARRAY', b'\x00', b'b', 1, 4, None
),
]
)
Expand Down

0 comments on commit 63644f4

Please sign in to comment.