Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#182 Type guessing fixes #186

Merged
merged 4 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,4 @@ max-line-length=127

# List ignore rules one per line.
ignore =
E501
C901
W503
F401
F403
2 changes: 1 addition & 1 deletion ckanext/xloader/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import sqlalchemy as sa

from ckan import model
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config

from . import loader
from . import db
Expand Down
8 changes: 7 additions & 1 deletion ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from unidecode import unidecode

import ckan.plugins as p
import ckan.plugins.toolkit as tk

from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
from .parser import XloaderCSVParser
Expand Down Expand Up @@ -318,9 +317,16 @@ def row_iterator():

logger.info('Copying to database...')
count = 0
# Some column types (timestamp, numeric) cannot store an empty string, so
# empty values in those columns must be converted to None. See
# https://github.com/ckan/ckanext-xloader/issues/182
non_empty_types = ['timestamp', 'numeric']
for i, records in enumerate(chunky(result, 250)):
count += len(records)
logger.info('Saving chunk {number}'.format(number=i))
for row in records:
for column_index, column_name in enumerate(row):
if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '':
row[column_name] = None
send_resource_to_datastore(resource_id, headers_dicts, records)
logger.info('...copying done')

Expand Down
2 changes: 0 additions & 2 deletions ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
import csv
from codecs import iterencode
from decimal import Decimal, InvalidOperation
from itertools import chain

import six
from ckan.plugins.toolkit import asbool
from dateutil.parser import isoparser, parser
from dateutil.parser import ParserError
Expand Down
1 change: 0 additions & 1 deletion ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from ckan.plugins import toolkit

from . import action, auth, helpers as xloader_helpers, utils
from .loader import fulltext_function_exists, get_write_engine

try:
config_declarations = toolkit.blanket.config_declarations
Expand Down
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/ckan_setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
try:
from ckan.tests.pytest_ckan.ckan_setup import *
from ckan.tests.pytest_ckan.ckan_setup import * # noqa
except ImportError:
import pkg_resources
from paste.deploy import loadapp
Expand Down
5 changes: 2 additions & 3 deletions ckanext/xloader/tests/fixtures.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
import sqlalchemy
import sqlalchemy.orm as orm
from sqlalchemy import orm
import os

from ckanext.datastore.tests import helpers as datastore_helpers
Expand All @@ -11,7 +10,7 @@
)

try:
from ckan.tests.pytest_ckan.fixtures import *
from ckan.tests.pytest_ckan.fixtures import * # noqa
except ImportError:
import pytest

Expand Down
3 changes: 3 additions & 0 deletions ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Funding agency,Program title,Maximum (indicative) grant amount
DTIS,Accessible Tourism Infrastructure Grants,Five hundred thousand dollars
DTIS,Boosting Accessible Tourism Experiences Grants,5000
4 changes: 4 additions & 0 deletions ckanext/xloader/tests/samples/sample_with_blanks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Funding agency,Program title,Opening date,Service ID
DTIS,Visitor First Experiences Fund,23/03/2023,63039
DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040
,,,63041
24 changes: 24 additions & 0 deletions ckanext/xloader/tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,30 @@ def test_german(self, Session):
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)

def test_with_blanks(self, Session):
    """Load a CSV whose final row is blank in all but one column.

    The sample file has three data rows; the test passes when the load
    completes and all three rows can be read back.
    """
    sample_path = get_sample_filepath("sample_with_blanks.csv")
    target_id = "test1"
    factories.Resource(id=target_id)
    load_options = {
        "resource_id": target_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_csv(sample_path, **load_options)
    stored_rows = self._get_records(Session, "test1")
    assert len(stored_rows) == 3

def test_with_mixed_types(self, Session):
    """Load a CSV where one column holds both a text value and a number.

    The sample file has two data rows; the test passes when the load
    completes and both rows can be read back.
    """
    sample_path = get_sample_filepath("mixed_numeric_string_sample.csv")
    target_id = "test1"
    factories.Resource(id=target_id)
    load_options = {
        "resource_id": target_id,
        "mimetype": "text/csv",
        "logger": logger,
    }
    loader.load_csv(sample_path, **load_options)
    stored_rows = self._get_records(Session, "test1")
    assert len(stored_rows) == 2

def test_reload(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
resource_id = "test1"
Expand Down
2 changes: 1 addition & 1 deletion ckanext/xloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@ def type_guess(rows, types=TYPES, strict=False):
for ci, cell in enumerate(row):
if not cell:
continue
at_least_one_value[ci] = True
for type in list(guesses[ci].keys()):
if not isinstance(cell, type):
guesses[ci].pop(type)
at_least_one_value[ci] = True if guesses[ci] else False
# no need to set guessing weights before this
# because we only accept a type if it never fails
for i, guess in enumerate(guesses):
Expand Down