Skip to content

Commit

Permalink
Merge pull request ckan#3390 from ckan/3390-datastore-dump-formats
Browse files Browse the repository at this point in the history
datastore: dump more formats
  • Loading branch information
amercader authored Jan 13, 2017
2 parents b3b895d + 753359b commit 0aec577
Show file tree
Hide file tree
Showing 3 changed files with 216 additions and 23 deletions.
59 changes: 38 additions & 21 deletions ckanext/datastore/controller.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# encoding: utf-8

import StringIO
import unicodecsv as csv

import pylons

Expand All @@ -16,9 +15,17 @@
BaseController,
abort,
)
from ckanext.datastore.writer import (
csv_writer,
tsv_writer,
json_writer,
xml_writer,
)

int_validator = get_validator('int_validator')
boolean_validator = get_validator('boolean_validator')

DUMP_FORMATS = 'csv', 'tsv', 'json', 'xml'
PAGINATE_BY = 10000


Expand All @@ -32,14 +39,24 @@ def dump(self, resource_id):
limit = int_validator(request.GET.get('limit'), {})
except Invalid as e:
abort(400, u'limit: ' + e.error)
bom = boolean_validator(request.GET.get('bom'), {})
fmt = request.GET.get('format', 'csv')

wr = None
while True:
if limit is not None and limit <= 0:
break
def start_writer(fields):
if fmt == 'csv':
return csv_writer(response, fields, resource_id, bom)
if fmt == 'tsv':
return tsv_writer(response, fields, resource_id, bom)
if fmt == 'json':
return json_writer(response, fields, resource_id, bom)
if fmt == 'xml':
return xml_writer(response, fields, resource_id, bom)
abort(400, _(
u'format: must be one of %s') % u', '.join(DUMP_FORMATS))

def result_page(offset, limit):
try:
result = get_action('datastore_search')(None, {
return get_action('datastore_search')(None, {
'resource_id': resource_id,
'limit':
PAGINATE_BY if limit is None
Expand All @@ -49,21 +66,21 @@ def dump(self, resource_id):
except ObjectNotFound:
abort(404, _('DataStore resource not found'))

if not wr:
response.headers['Content-Type'] = 'text/csv; charset=utf-8'
response.headers['Content-disposition'] = (
'attachment; filename="{name}.csv"'.format(
name=resource_id))
wr = csv.writer(response, encoding='utf-8')
result = result_page(offset, limit)
columns = [x['id'] for x in result['fields']]

with start_writer(result['fields']) as wr:
while True:
if limit is not None and limit <= 0:
break

header = [x['id'] for x in result['fields']]
wr.writerow(header)
for record in result['records']:
wr.writerow([record[column] for column in columns])

for record in result['records']:
wr.writerow([record[column] for column in header])
if len(result['records']) < PAGINATE_BY:
break
offset += PAGINATE_BY
if limit is not None:
limit -= PAGINATE_BY

if len(result['records']) < PAGINATE_BY:
break
offset += PAGINATE_BY
if limit is not None:
limit -= PAGINATE_BY
result = result_page(offset, limit)
169 changes: 169 additions & 0 deletions ckanext/datastore/writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# encoding: utf-8

from contextlib import contextmanager
from email.utils import encode_rfc2231
import json
from xml.etree.cElementTree import Element, SubElement, ElementTree

import unicodecsv

UTF8_BOM = u'\uFEFF'.encode(u'utf-8')


@contextmanager
def csv_writer(response, fields, name=None, bom=False):
    u'''Context manager yielding a UTF-8 CSV row writer over *response*.

    :param response: file-like or response-like object for writing
        data and headers (response-like objects only)
    :param fields: list of datastore fields
    :param name: file name (for headers, response-like objects only)
    :param bom: True to include a UTF-8 BOM at the start of the file

    >>> with csv_writer(response, fields) as d:
    >>>    d.writerow(row1)
    >>>    d.writerow(row2)
    '''
    # Plain file objects have no headers attribute; only set download
    # headers on response-like objects.
    headers = getattr(response, u'headers', None)
    if headers is not None:
        headers['Content-Type'] = b'text/csv; charset=utf-8'
        if name:
            headers['Content-disposition'] = (
                b'attachment; filename="{name}.csv"'.format(
                    name=encode_rfc2231(name)))

    if bom:
        response.write(UTF8_BOM)

    writer = unicodecsv.writer(response, encoding=u'utf-8')
    # First row is the header listing the datastore column ids.
    writer.writerow(field['id'] for field in fields)
    yield writer


@contextmanager
def tsv_writer(response, fields, name=None, bom=False):
    u'''Context manager yielding a UTF-8 TSV row writer over *response*.

    :param response: file-like or response-like object for writing
        data and headers (response-like objects only)
    :param fields: list of datastore fields
    :param name: file name (for headers, response-like objects only)
    :param bom: True to include a UTF-8 BOM at the start of the file

    >>> with tsv_writer(response, fields) as d:
    >>>    d.writerow(row1)
    >>>    d.writerow(row2)
    '''
    # Plain file objects have no headers attribute; only set download
    # headers on response-like objects.
    headers = getattr(response, u'headers', None)
    if headers is not None:
        headers['Content-Type'] = (
            b'text/tab-separated-values; charset=utf-8')
        if name:
            headers['Content-disposition'] = (
                b'attachment; filename="{name}.tsv"'.format(
                    name=encode_rfc2231(name)))

    if bom:
        response.write(UTF8_BOM)

    # excel_tab dialect makes unicodecsv emit tab-separated values.
    writer = unicodecsv.writer(
        response, encoding=u'utf-8', dialect=unicodecsv.excel_tab)
    # First row is the header listing the datastore column ids.
    writer.writerow(field['id'] for field in fields)
    yield writer


@contextmanager
def json_writer(response, fields, name=None, bom=False):
    u'''Context manager yielding a streaming JSON record writer over
    *response*.

    :param response: file-like or response-like object for writing
        data and headers (response-like objects only)
    :param fields: list of datastore fields
    :param name: file name (for headers, response-like objects only)
    :param bom: True to include a UTF-8 BOM at the start of the file

    >>> with json_writer(response, fields) as d:
    >>>    d.writerow(row1)
    >>>    d.writerow(row2)
    '''
    # Plain file objects have no headers attribute; only set download
    # headers on response-like objects.
    headers = getattr(response, u'headers', None)
    if headers is not None:
        headers['Content-Type'] = (
            b'application/json; charset=utf-8')
        if name:
            headers['Content-disposition'] = (
                b'attachment; filename="{name}.json"'.format(
                    name=encode_rfc2231(name)))

    if bom:
        response.write(UTF8_BOM)

    # Open the JSON document by hand so records can be streamed one at a
    # time instead of accumulating the whole result in memory.
    response.write(
        b'{\n "fields": %s,\n "records": [' % json.dumps(
            fields, ensure_ascii=False, separators=(u',', u':')))
    column_ids = [field['id'] for field in fields]
    yield JSONWriter(response, column_ids)
    # Close the records array and the document.
    response.write(b'\n]}\n')


class JSONWriter(object):
    u'''Streams rows as elements of a JSON array onto a response object.

    Rows are written lazily, comma-separated, so that arbitrarily large
    result sets never need to be held in memory at once.
    '''

    def __init__(self, response, columns):
        # response: file-like object accepting bytes
        # columns: list of column ids (kept for interface parity with
        # the other writers; not consulted when serializing rows)
        self.response = response
        self.columns = columns
        # Tracks whether any row has been written yet, to decide
        # whether a separating comma is needed.
        self.first = True

    def writerow(self, row):
        # Every record after the first is preceded by a comma.
        if self.first:
            prefix = b'\n '
            self.first = False
        else:
            prefix = b',\n '
        self.response.write(prefix)
        encoded = json.dumps(
            row,
            ensure_ascii=False,
            separators=(u',', u':'),
            sort_keys=True).encode(u'utf-8')
        self.response.write(encoded)


@contextmanager
def xml_writer(response, fields, name=None, bom=False):
    u'''Context manager yielding a streaming XML record writer over
    *response*.

    :param response: file-like or response-like object for writing
        data and headers (response-like objects only)
    :param fields: list of datastore fields
    :param name: file name (for headers, response-like objects only)
    :param bom: True to include a UTF-8 BOM at the start of the file

    >>> with xml_writer(response, fields) as d:
    >>>    d.writerow(row1)
    >>>    d.writerow(row2)
    '''
    # Plain file objects have no headers attribute; only set download
    # headers on response-like objects.
    headers = getattr(response, u'headers', None)
    if headers is not None:
        headers['Content-Type'] = (
            b'text/xml; charset=utf-8')
        if name:
            headers['Content-disposition'] = (
                b'attachment; filename="{name}.xml"'.format(
                    name=encode_rfc2231(name)))

    if bom:
        response.write(UTF8_BOM)

    # Write the root element open/close tags by hand so each row can be
    # streamed as it arrives.
    response.write(b'<data>\n')
    column_ids = [field['id'] for field in fields]
    yield XMLWriter(response, column_ids)
    response.write(b'</data>\n')


class XMLWriter(object):
    u'''Streams rows as ``<row>`` elements onto a response object.

    When the first column is the datastore ``_id`` column, its value is
    emitted as a ``_id`` attribute on ``<row>`` instead of a child
    element; every other column becomes a child element named after the
    column id. ``None`` values are marked with ``xsi:nil="true"``.
    '''

    def __init__(self, response, columns):
        # response: file-like object accepting bytes
        # columns: list of column ids, possibly starting with u'_id'
        self.response = response
        # Guard against an empty column list: the original
        # ``columns[0]`` lookup raised IndexError when a resource had
        # no fields at all.
        self.id_col = bool(columns) and columns[0] == u'_id'
        if self.id_col:
            columns = columns[1:]
        self.columns = columns

    def writerow(self, row):
        root = Element(u'row')
        if self.id_col:
            # _id becomes an attribute; drop it from the value list so
            # the zip below lines up with self.columns.
            root.attrib[u'_id'] = unicode(row[0])
            row = row[1:]
        for k, v in zip(self.columns, row):
            if v is None:
                # NOTE(review): emits the xsi:nil attribute without
                # declaring the xsi namespace on the document — consumers
                # may need to tolerate that; confirm against downstream use.
                SubElement(root, k).attrib[u'xsi:nil'] = u'true'
                continue
            SubElement(root, k).text = unicode(v)
        ElementTree(root).write(self.response, encoding=u'utf-8')
        self.response.write(b'\n')
11 changes: 9 additions & 2 deletions doc/maintaining/datastore.rst
Original file line number Diff line number Diff line change
Expand Up @@ -275,11 +275,18 @@ API reference

.. _dump:

Download resource as CSV
------------------------
Download resource
-----------------

A DataStore resource can be downloaded in the `CSV`_ file format from ``{CKAN-URL}/datastore/dump/{RESOURCE-ID}``.

For an Excel-compatible CSV file use ``{CKAN-URL}/datastore/dump/{RESOURCE-ID}?bom=true``.

Other formats supported include tab-separated values (``?format=tsv``),
JSON (``?format=json``) and XML (``?format=xml``). E.g. to download an Excel-compatible
tab-separated file use
``{CKAN-URL}/datastore/dump/{RESOURCE-ID}?format=tsv&bom=true``.

.. _CSV: https://en.wikipedia.org/wiki/Comma-separated_values


Expand Down

0 comments on commit 0aec577

Please sign in to comment.