Skip to content

Commit

Permalink
Merge pull request #7 from TransitApp/fix/import-utf8-characters
Browse files Browse the repository at this point in the history
fix/import-utf8-characters
  • Loading branch information
juanborre authored Jun 27, 2018
2 parents f4075ef + 355095f commit 93ba374
Showing 1 changed file with 12 additions and 1 deletion.
13 changes: 12 additions & 1 deletion helpers/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

const infoLog = require('debug')('gtfsNodeLib:i');
const fs = require('fs-extra');
const { StringDecoder } = require('string_decoder');

const eachWithLog = require('./logging_iterator_wrapper');
const { fromCsvStringToArray } = require('./csv');
Expand Down Expand Up @@ -49,9 +50,19 @@ function getRows(buffer, regexPatternObjects, tableName) {
let position = 0;
const batchLength = 50000;
let merge;
/*
Use a StringDecoder to decode UTF-8 correctly. Characters outside basic ASCII take more than
one byte.
If the end of a batch falls in the middle of one of those characters, decoding each batch on its own
yields garbled characters.
The decoder holds back any incomplete UTF-8 character at the end of a batch and emits it, completed,
with the next iteration.
*/
const decoder = new StringDecoder('utf8');

while (position < buffer.length) {
rowsSlice = buffer.toString('utf8', position, Math.min(buffer.length, position + batchLength));
rowsSlice = decoder.write(buffer.slice(position, Math.min(buffer.length, position + batchLength)));

if (regexPatternObjects) {
regexPatternObjects.forEach(({ regex, pattern }) => {
Expand Down

0 comments on commit 93ba374

Please sign in to comment.