Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabled CLI to read predefined and customized CSV files #480

Merged
merged 10 commits into from
Jan 3, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cli/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ dependencies {
implementation 'net.sf.jopt-simple:jopt-simple:[5.0,6.0)'
implementation 'org.jetbrains.kotlin:kotlin-stdlib-jdk8'

implementation 'org.apache.commons:commons-csv:1.8'

testImplementation 'org.junit.vintage:junit-vintage-engine:5.7.0'
}

Expand Down
35 changes: 30 additions & 5 deletions cli/src/org/partiql/cli/functions/ReadFile.kt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package org.partiql.cli.functions

import com.amazon.ion.*
import org.apache.commons.csv.CSVFormat
import org.partiql.lang.eval.*
import org.partiql.lang.eval.io.*
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
Expand All @@ -28,14 +29,30 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact
ConversionMode.values().find { it.name.toLowerCase() == name } ?:
throw IllegalArgumentException( "Unknown conversion: $name")

private fun delimitedReadHandler(delimiter: Char): (InputStream, IonStruct) -> ExprValue = { input, options ->
private fun fileReadHandler(csvFormat: CSVFormat): (InputStream, IonStruct) -> ExprValue = { input, options ->
val encoding = options["encoding"]?.stringValue() ?: "UTF-8"
val reader = InputStreamReader(input, encoding)
val conversion = options["conversion"]?.stringValue() ?: "none"

val hasHeader = options["header"]?.booleanValue() ?: false
val ignoreEmptyLine = options["ignore_empty_line"]?.booleanValue() ?: true
val ignoreSurroundingSpace = options["ignore_surrounding_space"]?.booleanValue() ?: true
val trim = options["trim"]?.booleanValue() ?: true
val delimiter = options["delimiter"]?.stringValue()?.first() // CSVParser library only accepts a single character as delimiter
val record = options["line_breaker"]?.stringValue()
val escape = options["escape"]?.stringValue()?.first() // CSVParser library only accepts a single character as escape
val quote = options["quote"]?.stringValue()?.first() // CSVParser library only accepts a single character as quote

val reader = InputStreamReader(input, encoding)
val csvFormat = csvFormat.let{ it.withIgnoreEmptyLines(ignoreEmptyLine) }
.let{ it.withIgnoreSurroundingSpaces(ignoreSurroundingSpace) }
.let{ it.withTrim(trim) }
.let { if (hasHeader) it.withFirstRecordAsHeader() else it }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(sorry missed this in the initial review) nit: .let calls on these lines are redundant

.let { if (delimiter != null) it.withDelimiter(delimiter) else it }
.let { if (record != null) it.withRecordSeparator(record) else it }
.let { if (escape != null) it.withEscape(escape) else it }
.let { if (quote != null) it.withQuote(quote) else it }

DelimitedValues.exprValue(valueFactory, reader, delimiter, hasHeader, conversionModeFor(conversion))
DelimitedValues.exprValue(valueFactory, reader, csvFormat, conversionModeFor(conversion))
}

private fun ionReadHandler(): (InputStream, IonStruct) -> ExprValue = { input, _ ->
Expand All @@ -44,8 +61,16 @@ internal class ReadFile(valueFactory: ExprValueFactory) : BaseFunction(valueFact

private val readHandlers = mapOf(
"ion" to ionReadHandler(),
"tsv" to delimitedReadHandler('\t'),
"csv" to delimitedReadHandler(','))
"csv" to fileReadHandler(CSVFormat.DEFAULT),
"tsv" to fileReadHandler(CSVFormat.DEFAULT.withDelimiter('\t')),
"excel_csv" to fileReadHandler(CSVFormat.EXCEL),
"mysql_csv" to fileReadHandler(CSVFormat.MYSQL),
"mongodb_csv" to fileReadHandler(CSVFormat.MONGODB_CSV),
"mongodb_tsv" to fileReadHandler(CSVFormat.MONGODB_TSV),
"postgresql_csv" to fileReadHandler(CSVFormat.POSTGRESQL_CSV),
"postgresql_text" to fileReadHandler(CSVFormat.POSTGRESQL_TEXT),
"customized" to fileReadHandler(CSVFormat.DEFAULT)
)

override fun call(env: Environment, args: List<ExprValue>): ExprValue {
val options = optionsStruct(1, args)
Expand Down
60 changes: 60 additions & 0 deletions cli/test/org/partiql/cli/functions/ReadFileTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,64 @@ class ReadFileTest {

assertEquals(ion.singleValue(expected), actual)
}

/** Reads a headered file through the `excel_csv` type and expects a single struct keyed by the header names. */
@Test
fun readExcelCsvFile() {
    val fileName = "simple_excel.csv"
    writeFile(fileName, "title,category,price\nharry potter,book,7.99")

    // First argument is the quoted file path, second is the options struct.
    val pathLiteral = "\"${dirPath(fileName)}\""
    val optionsLiteral = "{type:\"excel_csv\", header:true}"
    val callArgs = listOf(pathLiteral, optionsLiteral).map { literal -> literal.exprValue() }

    val actualIon = function.call(env, callArgs).ionValue
    val expectedIon = ion.singleValue("[{title:\"harry potter\",category:\"book\",price:\"7.99\"}]")

    assertEquals(expectedIon, actualIon)
}

@Test
fun readPostgreCsvFile() {
writeFile("simple_postgre.csv", "id,name,balance\n1,Bob,10000.00")

val args = listOf("\"${dirPath("simple_postgre.csv")}\"", "{type:\"postgresql_csv\", header:true}").map { it.exprValue() }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not all the typos were addressed here. 'simple_postgre.csv' -> 'simple_postgresql.csv'


val actual = function.call(env, args).ionValue
val expected = "[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]"

assertEquals(ion.singleValue(expected), actual)
}

/** `customized` type with the `delimiter` option: a space-separated file with a header row. */
@Test
fun readCustomizedCsvFile1() {
    val fileName = "customized.csv"
    writeFile(fileName, "id name balance\n1 Bob 10000.00")

    // The delimiter option overrides the separator of the base format used by the "customized" type.
    val pathLiteral = "\"${dirPath(fileName)}\""
    val optionsLiteral = "{type:\"customized\", header:true, delimiter:' '}"
    val callArgs = listOf(pathLiteral, optionsLiteral).map { literal -> literal.exprValue() }

    val actualIon = function.call(env, callArgs).ionValue
    val expectedIon = ion.singleValue("[{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")

    assertEquals(expectedIon, actualIon)
}

/** `customized` type with `ignore_empty_line` disabled: the blank line is expected back as a record. */
@Test
fun readCustomizedCsvFile2() {
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n\n1,Bob,10000.00")

    val pathLiteral = "\"${dirPath(fileName)}\""
    val optionsLiteral = "{type:\"customized\", header:true, ignore_empty_line: false}"
    val callArgs = listOf(pathLiteral, optionsLiteral).map { literal -> literal.exprValue() }

    val actualIon = function.call(env, callArgs).ionValue
    // The empty line is expected to surface as a struct holding only an empty `id`.
    val expectedIon = ion.singleValue("[{id:\"\"},{id:\"1\",name:\"Bob\",balance:\"10000.00\"}]")

    assertEquals(expectedIon, actualIon)
}

/** `customized` type with `trim` and `ignore_surrounding_space` both disabled: padding around values is kept. */
@Test
fun readCustomizedCsvFile3() {
    val fileName = "customized.csv"
    writeFile(fileName, "id,name,balance\n 1 , Bob , 10000.00 ")

    val pathLiteral = "\"${dirPath(fileName)}\""
    val optionsLiteral = "{type:\"customized\", header:true, ignore_surrounding_space:false, trim:false}"
    val callArgs = listOf(pathLiteral, optionsLiteral).map { literal -> literal.exprValue() }

    val actualIon = function.call(env, callArgs).ionValue
    // The expected values keep their leading and trailing spaces.
    val expectedIon = ion.singleValue("[{id:\" 1 \",name:\" Bob \",balance:\" 10000.00 \"}]")

    assertEquals(expectedIon, actualIon)
}
}
30 changes: 30 additions & 0 deletions docs/user/CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -641,3 +641,33 @@ Kumo dog
Mochi dog
Lilikoi unicorn
```

## Predefined CSV Data

The `read_file` function provides options to read other predefined CSV data formats.
For example, if a CSV file is exported from PostgreSQL, we can use the following command
to read the file:
```
read_file('simple_postgresql.csv', {'type':'postgresql_csv'})
```
Other available options for the argument `type` besides `postgresql_csv` are `excel_csv`, `mysql_csv`, `mongodb_csv`, `mongodb_tsv` and `postgresql_text`.

## Customized CSV Data
The `read_file` function also provides options to read customized CSV data formats.
For example, suppose we have a data file that uses a single space as the separator, as shown below:
```
title category price
harry_potter book 7.99
dot electronics 49.99
echo electronics 99.99
```
We can use the following command to read the file:
```
read_file('customized.csv', {'type':'customized', 'delimiter':' ', 'header':true})
```
The following command explicitly shows all the available options for a CSV file in the default format of the CSVParser library:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: what is meant by "standard" here? Is there a standard CSV file?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just means the default CSV format in CSVParser library.

```
read_file('customized.csv', {'type':'customized', 'delimiter':',', 'header':true, \
'ignore_empty_line':true, 'ignore_surrounding_space':true, 'trim':true, \
'line_breaker':'\n', 'escape':'\', 'quote':'"'})
```
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May want to note what the arguments do or link to CSVParser's reference on these.

Also would be helpful to note that delimiter, escape, and quote arguments can only be a 1 character.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Applied in the next commit.

2 changes: 2 additions & 0 deletions examples/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ dependencies {
implementation 'com.amazonaws:aws-java-sdk-s3:1.11.554'
implementation 'com.amazonaws:aws-java-sdk-s3control:1.11.554'

implementation 'org.apache.commons:commons-csv:1.8'

testImplementation 'org.junit.vintage:junit-vintage-engine:5.7.0'
}

Expand Down
9 changes: 2 additions & 7 deletions lang/src/org/partiql/lang/eval/io/DelimitedValues.kt
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,11 @@ object DelimitedValues {
@JvmStatic
fun exprValue(valueFactory: ExprValueFactory,
input: Reader,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue {
val reader = BufferedReader(input)
val csvFormat = when (hasHeader){
true -> CSVFormat.DEFAULT.withDelimiter(delimiter).withFirstRecordAsHeader()
false -> CSVFormat.DEFAULT.withDelimiter(delimiter)
}
val csvParser = CSVParser(reader, csvFormat)
val columns: List<String> = csvParser.headerNames // `columns` is an empty list when `hasHeader` is false
val columns: List<String> = csvParser.headerNames

val seq = csvParser.asSequence().map { csvRecord ->
valueFactory.newStruct(
Expand Down
21 changes: 8 additions & 13 deletions lang/test/org/partiql/lang/eval/io/DelimitedValuesTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

package org.partiql.lang.eval.io

import org.apache.commons.csv.CSVFormat
import org.partiql.lang.*
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode
import org.partiql.lang.eval.io.DelimitedValues.ConversionMode.*
Expand All @@ -32,10 +33,9 @@ class DelimitedValuesTest : TestBase() {
}

private fun read(text: String,
delimiter: Char,
hasHeader: Boolean,
csvFormat: CSVFormat,
conversionMode: ConversionMode): ExprValue =
DelimitedValues.exprValue(valueFactory, StringReader(text), delimiter, hasHeader, conversionMode)
DelimitedValues.exprValue(valueFactory, StringReader(text), csvFormat, conversionMode)

private fun assertWrite(expectedText: String,
valueText: String,
Expand Down Expand Up @@ -78,8 +78,7 @@ class DelimitedValuesTest : TestBase() {
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
Expand All @@ -89,8 +88,7 @@ class DelimitedValuesTest : TestBase() {
"""[]""",
read(
"",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
Expand All @@ -100,8 +98,7 @@ class DelimitedValuesTest : TestBase() {
"""[{_1: "1", _2: "2", _3: "3"}]""",
read(
"""1,2,3""",
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = NONE
)
)
Expand All @@ -119,8 +116,7 @@ class DelimitedValuesTest : TestBase() {
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = false,
CSVFormat.DEFAULT,
conversionMode = AUTO
)
)
Expand All @@ -139,8 +135,7 @@ class DelimitedValuesTest : TestBase() {
|1.0,2e0,2007-10-10T12:00:00Z
|hello,{,}
""".trimMargin(),
delimiter = ',',
hasHeader = true,
CSVFormat.DEFAULT.withFirstRecordAsHeader(),
conversionMode = AUTO
)
)
Expand Down