Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/json custom datetime format #638

Merged
merged 4 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -581,38 +581,91 @@ def create(self, task_logging: bool = False):
self._delete_dataset(dataset_id)


def load_and_parse_json(file_path: str, date_fields: Set[str] = None):
"""Load a JSON file for testing purposes. It parses string dates and datetimes into date and datetime instances."""
def load_and_parse_json(
file_path: str,
date_fields: Set[str] = None,
timestamp_fields: Set[str] = None,
date_formats: Set[str] = None,
timestamp_formats: str = None,
):
"""Load a JSON file for testing purposes. It parses string dates and datetimes into date and datetime instances.

:param file_path: the path to the JSON file.
:param date_fields: The fields to parse as a date.
:param timestamp_fields: The fields to parse as a timestamp.
:param date_formats: The date formats to use. If none, will use [%Y-%m-%d, %Y%m%d].
:param timestamp_formats: The timestamp formats to use. If none, will use [%Y-%m-%d %H:%M:%S.%f %Z].
"""

if date_fields is None:
date_fields = set()

if timestamp_fields is None:
timestamp_fields = set()

if date_formats is None:
date_formats = {"%Y-%m-%d", "%Y%m%d"}

if timestamp_formats is None:
timestamp_formats = {"%Y-%m-%d %H:%M:%S.%f %Z"}

def _strptime_first_match(text, formats):
    """Parse ``text`` with the first ``datetime.strptime`` format that accepts it.

    :param text: the string to parse.
    :param formats: an iterable of strptime format strings to try in iteration order.
    :return: the parsed datetime, or None when no format matches.
    """

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except (ValueError, TypeError):
            continue
    return None


def _pendulum_to_datetime(text):
    """Parse ``text`` with pendulum and rebuild the result as a standard-library datetime.

    :param text: the string to parse.
    :return: a datetime carrying the same fields and tzinfo as the pendulum result.
    """

    parsed = pendulum.parse(text)
    return datetime(
        parsed.year,
        parsed.month,
        parsed.day,
        parsed.hour,
        parsed.minute,
        parsed.second,
        parsed.microsecond,
        tzinfo=parsed.tzinfo,
    )


def _coerce_to_datetime(text, formats):
    """Try the explicit formats first and fall back to pendulum's parser.

    :param text: the string to parse.
    :param formats: an iterable of strptime format strings.
    :return: the parsed datetime, or None when neither strategy can parse ``text``.
    """

    result = _strptime_first_match(text, formats)
    if result is not None:
        return result
    try:
        return _pendulum_to_datetime(text)
    except (ValueError, TypeError):
        # Best effort: an unparseable value is left as the original string by the caller.
        return None


def parse_datetime(obj):
    """Convert configured string fields of ``obj`` into date and datetime instances, in place.

    The names date_fields, timestamp_fields, date_formats and timestamp_formats are not
    parameters; they are resolved from the enclosing load_and_parse_json scope.

    :param obj: a dict whose string values may hold dates or timestamps.
    :return: the same dict, with every successfully parsed value replaced.
    """

    for key, value in obj.items():
        # Only string values are candidates; everything else is left untouched.
        if not isinstance(value, str):
            continue

        if key in date_fields:
            parsed = _coerce_to_datetime(value, date_formats)
            if parsed is not None:
                obj[key] = parsed.date()

        # Deliberately parses the original string again: a key listed in both
        # sets ends up holding the timestamp result when that parse succeeds.
        if key in timestamp_fields:
            parsed = _coerce_to_datetime(value, timestamp_formats)
            if parsed is not None:
                obj[key] = parsed

    return obj

with open(file_path, mode="r") as f:
Expand Down
51 changes: 49 additions & 2 deletions tests/observatory/platform/test_observatory_environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@
import logging
import os
import unittest
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import List, Union
from unittest.mock import patch
from ftplib import FTP
import re
import tempfile
import json

import croniter
import httpretty
Expand All @@ -50,6 +51,7 @@
random_id,
test_fixtures_path,
find_free_port,
load_and_parse_json,
)
from observatory.platform.utils.http_download import (
DownloadInfo,
Expand Down Expand Up @@ -798,3 +800,48 @@ def test_context_manager(self):
dst_file = os.path.join(tmpdir, "testfile.txt")
download_file(url=url, filename=dst_file)
self.assert_file_integrity(dst_file, expected_hash, algorithm)


class TestLoadAndParseJson(unittest.TestCase):
    """Tests for load_and_parse_json with custom and default date and timestamp formats."""

    def test_load_and_parse_json(self):
        """Check date and timestamp parsing for explicit formats, pendulum fallback and defaults."""

        data = {
            "date1": "2022-01-01",
            "timestamp1": "2022-01-01 12:00:00.100000 UTC",
            "date2": "20230101",
            "timestamp2": "2023-01-01 12:00:00",
        }

        # The fixture is written into a temporary directory instead of reopening a
        # NamedTemporaryFile by name: opening a still-open named temporary file a
        # second time is platform dependent and fails on Windows.
        with tempfile.TemporaryDirectory() as tmp_dir:
            file_path = os.path.join(tmp_dir, "data.json")
            with open(file_path, "w") as f:
                json.dump(data, f)

            # Test case 1: Parsing date fields with a single date format. Not specifying timestamp fields
            with self.subTest("date fields with explicit date format"):
                expected_result = data.copy()
                expected_result["date1"] = datetime(2022, 1, 1).date()
                expected_result["date2"] = datetime(2023, 1, 1).date()  # Should be converted by pendulum
                result = load_and_parse_json(file_path, date_fields=["date1", "date2"], date_formats=["%Y-%m-%d"])
                self.assertEqual(result, expected_result)

            # Test case 2: Parsing timestamp fields with an explicit timestamp format, not specifying date fields
            with self.subTest("timestamp fields with explicit timestamp format"):
                expected_result = data.copy()
                expected_result["timestamp1"] = datetime(2022, 1, 1, 12, 0, 0, 100000)
                expected_result["timestamp2"] = datetime(
                    2023, 1, 1, 12, 0, 0, tzinfo=pendulum.tz.timezone("UTC")
                )  # Converted by pendulum
                result = load_and_parse_json(
                    file_path,
                    timestamp_fields=["timestamp1", "timestamp2"],
                    timestamp_formats=["%Y-%m-%d %H:%M:%S.%f %Z"],
                )
                self.assertEqual(result, expected_result)

            # Test case 3: Default date and timestamp formats; unlisted fields stay as strings
            with self.subTest("default date and timestamp formats"):
                expected_result = {
                    "date1": datetime(2022, 1, 1).date(),
                    "date2": "20230101",
                    "timestamp1": datetime(2022, 1, 1, 12, 0, 0, 100000),
                    "timestamp2": "2023-01-01 12:00:00",
                }
                result = load_and_parse_json(file_path, date_fields=["date1"], timestamp_fields=["timestamp1"])
                self.assertEqual(result, expected_result)