Skip to content

Commit

Permalink
Add documentation and improve code to apply regex on bad CSV tables
Browse files Browse the repository at this point in the history
  • Loading branch information
LeoFrachet committed Jan 15, 2018
1 parent eb2fbe4 commit f677321
Show file tree
Hide file tree
Showing 16 changed files with 87 additions and 63 deletions.
45 changes: 35 additions & 10 deletions gtfs.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,15 @@ function addItems(gtfs, tableName, items) {
*
* @param {Gtfs} gtfs GTFS object containing the table to get.
* @param {string} tableName Name of the table of the GTFS to get.
* @param {Object} [options] Configuration object passed to importTable function.
* @return {
* Object|
* Map.<string, Object>|
* Map.<string, Map.<string, Object>>
* } Indexed table returned
*/
function getIndexedTable(gtfs, tableName, options) {
function getIndexedTable(gtfs, tableName) {
if (gtfs._tables.has(tableName) === false) {
importTable(gtfs, tableName, options);
importTable(gtfs, tableName);
infoLog(`[Importation] Table ${tableName} has been imported.`);
}

Expand Down Expand Up @@ -145,14 +144,39 @@ class Gtfs {
/**
* Constructor of the GTFS
*
* @param {string} path Path to the folder that contains the GTFS text files.
* @param {Map.<
* string,
* Array.<{regex: RegExp, pattern: string}>
* >} [regexPatternObjectsByTableName] Optional ad-hoc regex to fix the tables. See importTable.
* # options.regexPatternObjectsByTableName
*
 * Optional ad-hoc list of regexes to fix the tables. The keys are table names as defined in schema.js; the values
 * are arrays of {regex, pattern} pairs applied to the raw table text before parsing. The goal is to fix
 * some bad CSV to make it readable.
*
* Example
*
 * The following row is invalid according to the CSV specification:
*
* > something,something else,a field "not" properly escaped,one last thing
*
* Assuming it is in someTable.txt, it could be fixed with the following regexPatternObjectsByTableName:
*
* regexPatternObjectsByTableName = {
* nameOfTheTable: [{
* regex: /,a field "not" properly escaped,/g,
* pattern: ',a field ""not"" properly escaped,',
* }]
* };
*
* # options.throws
*
 * Optional boolean. Default is true, which makes the parser throw on invalid rows in the tables. Set it to false
 * to skip invalid rows with a warning instead.
*
* @param {string} path Path to the folder that contains the GTFS text files.
* @param {{
* regexPatternObjectsByTableName: Map.<string, Array.<{regex: RegExp, pattern: string}>>,
* throws: boolean
* }} [options] Optional. See list above.
 * @return {Gtfs} gtfs Instantiated GTFS object.
*/
constructor(path, regexPatternObjectsByTableName) {
constructor(path, {regexPatternObjectsByTableName = new Map(), throws = true} = {}) {
if (typeof path !== 'string' || path.length === 0) {
throw new Error(`Gtfs need a valid input path as string, instead of: "${path}".`);
}
Expand All @@ -166,7 +190,8 @@ class Gtfs {
this.isGtfs = true;

this._path = path;
this._regexPatternObjectsByTableName = regexPatternObjectsByTableName || {};
this._regexPatternObjectsByTableName = regexPatternObjectsByTableName;
this._shouldThrow = throws;
this._tables = new Map();
}

Expand Down
4 changes: 2 additions & 2 deletions helpers/csv.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ function fromCsvStringToArray(string, tableName) {
return fromCsvStringToArray(string, tableName);
}
process.notices.addWarning(__filename, `Row not valid in table ${tableName}: ${string}`);
return null;
return [];
}

const a = []; // Initialize array to receive values.
Expand All @@ -85,7 +85,7 @@ function fromCsvStringToArray(string, tableName) {
/* else */
if (m2 !== undefined) a.push(m2.replace(/\\"/g, '"'));
else if (m3 !== undefined) a.push(m3);
return ''; // Return empty string.
return []; // Return empty string.
});
// Handle special case of empty last value.
if (/,\s*$/.test(string)) {
Expand Down
49 changes: 12 additions & 37 deletions helpers/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,48 +10,21 @@ const { fromCsvStringToArray } = require('./csv');
const schema = require('./schema');

/**
* Import a table in the GTFS.
*
* @param {Gtfs} gtfs The GTFS in which to import the table.
* @param {string} tableName The table of the name to import.
* @param {
* Map.<
* string,
* Array.<{regex: RegExp, pattern: string}>
* >
* } [regexPatternObjectsByTableName] Optional ad-hoc regex to fix the tables. The keys are the tableName like defined
* in schema.js, the value are arrays containing pairs of regex and pattern to be
* applied on the raw table, before parsing. The goal is to fix some bad CSV to make
* them readable.
*
* Example:
* The following raw is invalid according to the CSV specification:
*
* > something,something else,a field "not" properly escaped,one last thing
*
* It could be fixed with:
* { regex: /,a field "not" properly escaped,/g,
* pattern: ',a field ""not"" properly escaped,' }
*
* The regexPatternObjectsByTableName would be:
*
* regexPatternObjectsByTableName = {
* nameOfTheTable: [{
* regex: /,a field "not" properly escaped,/g,
* pattern: ',a field ""not"" properly escaped,',
* }]
* };
*/

exports.importTable = (gtfs, tableName, regexPatternObjectsByTableName) => {
regexPatternObjectsByTableName = regexPatternObjectsByTableName || new Map();
exports.importTable = (gtfs, tableName) => {
const indexKeys = schema.indexKeysByTableName[tableName];
const fullPath = `${gtfs.getPath() + tableName}.txt`;

if (fs.existsSync(fullPath)) {
const fileContent = fs.readFileSync(fullPath);
const rows = getRows(fileContent, regexPatternObjectsByTableName, tableName);
const rows = getRows(fileContent, gtfs._regexPatternObjectsByTableName.get(tableName), tableName);

gtfs._tables.set(tableName, processRows(gtfs, tableName, indexKeys, rows));
gtfs._tables.set(tableName, processRows(gtfs, tableName, indexKeys, rows, gtfs._shouldThrow));
return;
}

Expand All @@ -64,24 +37,22 @@ exports.importTable = (gtfs, tableName, regexPatternObjectsByTableName) => {
* Private functions
*/

function getRows(buffer, regexPatternObjectsByTableName, tableName) {
function getRows(buffer, regexPatternObjects, tableName) {
const rows = [];
let rowsSlice;
let position = 0;
const batchLength = 50000;
let merge;
const regexPatternObjects = regexPatternObjectsByTableName.get(tableName);

while (position < buffer.length) {
rowsSlice = buffer.toString('utf8', position, Math.min(buffer.length, position + batchLength));

if (regexPatternObjects) {
regexPatternObjects.forEach(({regex, pattern}) => {
const modifiedRowsSlice = rowsSlice.replace(regex, pattern || '');

if (modifiedRowsSlice !== rowsSlice) {
process.notices.addInfo(
__filename, `Applying regex replace to table: "${tableName}". regex: "${regexPatternObject.regex}".`
);
process.notices.addInfo(__filename, `Applying regex replace to table: "${tableName}". regex: "${regex}".`);
rowsSlice = modifiedRowsSlice;
}
});
Expand All @@ -102,7 +73,7 @@ function getRows(buffer, regexPatternObjectsByTableName, tableName) {
return rows;
}

function processRows(gtfs, tableName, indexKeys, rows) {
function processRows(gtfs, tableName, indexKeys, rows, shouldThrow) {
let table = new Map();

if (rows === undefined || rows === null || rows.length === 0) {
Expand All @@ -124,6 +95,10 @@ function processRows(gtfs, tableName, indexKeys, rows) {
}, {});

if (sortedKeys.length !== arrayOfValues.length) {
if (shouldThrow === true) {
throw new Error(`Invalid raw in table ${tableName}: ${JSON.stringify(item)}`);
}

process.notices.addWarning(__filename, `Row not valid in table: ${JSON.stringify(item)}`);
return;
}
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions samples/2/stops.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon
stop_0,SC0,Stop 0,Some stop,37.728631,-122.431282
stop_1,SC1,Stop 1,Some "other" stop,37.74103,-122.422482
49 changes: 35 additions & 14 deletions tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const { Gtfs } = require('./index');
describe('Tests on GTFS', () => {
// eslint-disable-next-line no-undef
it('Test on meta functions', (done) => {
const path = `${__dirname}/sample/`;
const path = `${__dirname}/samples/1/`;
const gtfs = new Gtfs(path);

expect(gtfs.isGtfs).to.equal(true);
Expand All @@ -23,7 +23,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Test on generic table functions', (done) => {
const path = `${__dirname}/sample/`;
const path = `${__dirname}/samples/1/`;
const gtfs = new Gtfs(path);

const indexedAgencies = gtfs.getIndexedTable('agency');
Expand Down Expand Up @@ -80,7 +80,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on agencies', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedAgencies())).to.deep.equal(['agency_0']);
Expand Down Expand Up @@ -121,7 +121,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on stops', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedStops())).to.deep.equal(['stop_0', 'stop_1']);
Expand Down Expand Up @@ -164,7 +164,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on routes', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedRoutes())).to.deep.equal(['route_0']);
Expand Down Expand Up @@ -209,7 +209,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on trips', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedTrips())).to.deep.equal(['trip_0']);
Expand Down Expand Up @@ -250,7 +250,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on stop times', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedStopTimes())).to.deep.equal(['trip_0']);
Expand Down Expand Up @@ -314,7 +314,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on calendars', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedCalendars())).to.deep.equal(['service_0']);
Expand Down Expand Up @@ -359,7 +359,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on calendar dates', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedCalendarDates())).to.deep.equal(['service_0']);
Expand Down Expand Up @@ -420,7 +420,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on shapes', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedShapePoints())).to.deep.equal(['shape_0']);
Expand Down Expand Up @@ -477,7 +477,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on frequencies', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedFrequencies())).to.deep.equal(['trip_0']);
Expand Down Expand Up @@ -528,7 +528,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on transfers', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(sortedKeys(gtfs.getIndexedTransfers())).to.deep.equal(['stop_0', 'stop_1']);
Expand Down Expand Up @@ -581,7 +581,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on feed info', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

expect(gtfs.getFeedInfo().feed_lang).to.equal('en');
Expand All @@ -599,7 +599,7 @@ describe('Tests on GTFS', () => {

// eslint-disable-next-line no-undef
it('Tests on exporting', (done) => {
const path = `${__dirname}/sample`;
const path = `${__dirname}/samples/1`;
const gtfs = new Gtfs(path);

gtfs.getFeedInfo().feed_lang = 'fr';
Expand Down Expand Up @@ -641,6 +641,27 @@ describe('Tests on GTFS', () => {
});
});
});

// eslint-disable-next-line no-undef
it('Tests on the regex/pattern applied to fix a bad CSV', (done) => {
const path = `${__dirname}/samples/2/`;
const gtfsWithoutFix = new Gtfs(path);

expect(() => gtfsWithoutFix.getIndexedStops()).to.throw();

const gtfsWithoutFixWithoutThrow = new Gtfs(path, { throws: false });

expect(() => gtfsWithoutFixWithoutThrow.getIndexedStops()).to.not.throw();

const regexPatternObjectsByTableName = new Map([[
'stops', [{regex: /,Some "other" stop,/g, pattern: ',"Some ""other"" stop",'}],
]]);
const gtfsWithFix = new Gtfs(path, { regexPatternObjectsByTableName });

expect(gtfsWithFix.getStopWithId('stop_1').stop_desc).to.equal('Some "other" stop');

done();
});
});

function sortedKeys(map) {
Expand Down

0 comments on commit f677321

Please sign in to comment.