Skip to content

Commit 8746737

Browse files
authored
feat(java): add lance spark connector for read (lancedb#2429)
Add Lance Spark Connector with - getSchema - getFragments - Full table scan + fragment scan - Fragment scan with column push down - Fragment scan with filter push down (numerical filter & And/Or/Not)
1 parent d0a26e4 commit 8746737

File tree

94 files changed

+2245
-15
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+2245
-15
lines changed

java/core/lance-jni/src/blocking_dataset.rs

+2-3
Original file line numberDiff line numberDiff line change
@@ -348,9 +348,8 @@ fn inner_import_ffi_schema(
348348
Schema::from(dataset.inner.schema())
349349
};
350350

351-
let c_schema = FFI_ArrowSchema::try_from(&schema)?;
352-
let out_c_schema = unsafe { &mut *(arrow_schema_addr as *mut FFI_ArrowSchema) };
353-
let _old = std::mem::replace(out_c_schema, c_schema);
351+
let ffi_schema = FFI_ArrowSchema::try_from(&schema)?;
352+
unsafe { std::ptr::write_unaligned(arrow_schema_addr as *mut FFI_ArrowSchema, ffi_schema) }
354353
Ok(())
355354
}
356355

java/core/src/main/java/com/lancedb/lance/Dataset.java

+1-5
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,10 @@
1616

1717
import com.lancedb.lance.ipc.LanceScanner;
1818
import com.lancedb.lance.ipc.ScanOptions;
19-
import io.questdb.jar.jni.JarJniLoader;
2019
import java.io.Closeable;
2120
import java.util.List;
2221
import java.util.Optional;
23-
import java.util.concurrent.locks.ReadWriteLock;
24-
import java.util.concurrent.locks.ReentrantReadWriteLock;
2522
import java.util.stream.Collectors;
26-
import javax.annotation.concurrent.NotThreadSafe;
2723
import org.apache.arrow.c.ArrowArrayStream;
2824
import org.apache.arrow.c.ArrowSchema;
2925
import org.apache.arrow.c.Data;
@@ -91,7 +87,7 @@ public static Dataset create(BufferAllocator allocator, String path, Schema sche
9187
public static Dataset create(BufferAllocator allocator, ArrowArrayStream stream,
9288
String path, WriteParams params) {
9389
Preconditions.checkNotNull(allocator);
94-
Preconditions.checkNotNull(stream);
90+
Preconditions.checkNotNull(stream);
9591
Preconditions.checkNotNull(path);
9692
Preconditions.checkNotNull(params);
9793
Dataset dataset = createWithFfiStream(stream.memoryAddress(), path,

java/core/src/main/java/com/lancedb/lance/DatasetFragment.java

-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import com.lancedb.lance.ipc.LanceScanner;
1818
import com.lancedb.lance.ipc.ScanOptions;
1919
import java.util.Arrays;
20-
import java.util.List;
2120
import org.apache.arrow.util.Preconditions;
2221

2322
/**

java/core/src/main/java/com/lancedb/lance/FragmentMetadata.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import org.json.JSONObject;
2020

2121
/**
22-
* Metadata of a Fragment in the dataset.
22+
* Metadata of a Fragment in the dataset.
2323
* Matching to lance Fragment.
2424
* */
2525
public class FragmentMetadata implements Serializable {
@@ -39,7 +39,7 @@ private FragmentMetadata(String jsonMetadata, int id, long physicalRows) {
3939
public int getId() {
4040
return id;
4141
}
42-
42+
4343
public long getPhysicalRows() {
4444
return physicalRows;
4545
}

java/core/src/main/java/com/lancedb/lance/FragmentOperation.java

-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import java.util.stream.Collectors;
2020
import org.apache.arrow.memory.BufferAllocator;
2121
import org.apache.arrow.util.Preconditions;
22-
import org.apache.arrow.vector.types.pojo.Schema;
2322

2423
/** Fragment related operations. */
2524
public abstract class FragmentOperation {

java/core/src/main/java/com/lancedb/lance/WriteParams.java

-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414

1515
package com.lancedb.lance;
1616

17-
import java.util.HashMap;
18-
import java.util.Map;
1917
import java.util.Optional;
2018

2119
/**
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
15+
package com.lancedb.lance;
16+
17+
import java.io.IOException;
18+
import java.nio.file.Path;
19+
20+
import com.lancedb.lance.ipc.LanceScanner;
21+
import com.lancedb.lance.ipc.ScanOptions;
22+
import org.apache.arrow.memory.BufferAllocator;
23+
import org.apache.arrow.memory.RootAllocator;
24+
import org.junit.jupiter.api.AfterAll;
25+
import org.junit.jupiter.api.BeforeAll;
26+
import org.junit.jupiter.api.Test;
27+
import org.junit.jupiter.api.io.TempDir;
28+
29+
import static org.junit.jupiter.api.Assertions.assertEquals;
30+
31+
public class FilterTest {
32+
@TempDir
33+
static Path tempDir;
34+
private static BufferAllocator allocator;
35+
private static Dataset dataset;
36+
37+
@BeforeAll
38+
static void setup() throws IOException {
39+
String datasetPath = tempDir.resolve("filter_test_dataset").toString();
40+
allocator = new RootAllocator();
41+
TestUtils.SimpleTestDataset testDataset = new TestUtils.SimpleTestDataset(allocator, datasetPath);
42+
testDataset.createEmptyDataset().close();
43+
// write id with value from 0 to 39
44+
dataset = testDataset.write(1, 40);
45+
}
46+
47+
@AfterAll
48+
static void tearDown() {
49+
// Cleanup resources used by the tests
50+
if (dataset != null) {
51+
dataset.close();
52+
}
53+
if (allocator != null) {
54+
allocator.close();
55+
}
56+
}
57+
58+
@Test
59+
void testFilters() throws Exception {
60+
testFilter("id == 10", 1);
61+
testFilter("id == 10", 1);
62+
testFilter("id != 10", 39);
63+
testFilter("id > 10", 29);
64+
testFilter("id >= 10", 30);
65+
testFilter("id < 10", 10);
66+
testFilter("id <= 10", 11);
67+
testFilter("id >= 10 and id < 20", 10);
68+
testFilter("id < 10 or id > 30", 19);
69+
testFilter("id != 10 and id < 20", 19);
70+
testFilter("id < 5 or id > 35", 9);
71+
testFilter("(id >= 5 and id <= 15) or (id >= 25 and id <= 35)", 22);
72+
testFilter("id == 5 or (id >= 30 and id < 35)", 6);
73+
testFilter("id IS NOT NULL", 40);
74+
testFilter("id IS NULL", 0);
75+
testFilter("id IN (5, 15, 25, 35)", 4);
76+
77+
testFilter("name LIKE 'Person%'", 40);
78+
testFilter("name LIKE 'Person 1%'", 11);
79+
testFilter("name LIKE '%0'", 4);
80+
testFilter("name LIKE '%son 1'", 1);
81+
testFilter("name LIKE '%son 1%'", 11);
82+
testFilter("name LIKE '%son%'", 40);
83+
testFilter("name == 'Person 1'", 1);
84+
testFilter("name IS NULL", 0);
85+
testFilter("name IS NOT NULL", 40);
86+
87+
testFilter("name LIKE 'Person%' AND name LIKE '%0'", 4);
88+
testFilter("name LIKE 'Person%' AND name LIKE '%1%'", 13);
89+
testFilter("name LIKE '%son%' AND name LIKE '%0'", 4);
90+
testFilter("name LIKE '%son%' AND name LIKE '%1'", 4);
91+
testFilter("name LIKE 'Person 1%' AND name LIKE '%1'", 2);
92+
93+
testFilter("(name IS NOT NULL) AND (name == 'Person 1')", 1);
94+
testFilter("(name IS NOT NULL) AND (name == 'Person')", 0);
95+
// Not supported, bug?, LanceError(IO): Schema error: No field named person. Valid fields are id, name.
96+
// testFilter("(name IS NOT NULL) AND (name == Person)", 0);
97+
98+
// Not supported
99+
// testFilter("\"id\" == 10", 1);
100+
// testFilter("'id' == 10", 1);
101+
}
102+
103+
private void testFilter(String filter, int expectedCount) throws Exception {
104+
try (LanceScanner scanner = dataset.newScan(new ScanOptions.Builder().filter(filter).build())) {
105+
assertEquals(expectedCount, scanner.countRows());
106+
}
107+
}
108+
}

java/dev/checkstyle.xml

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
<!--
2+
~ Licensed to the Apache Software Foundation (ASF) under one or more
3+
~ contributor license agreements. See the NOTICE file distributed with
4+
~ this work for additional information regarding copyright ownership.
5+
~ The ASF licenses this file to You under the Apache License, Version 2.0
6+
~ (the "License"); you may not use this file except in compliance with
7+
~ the License. You may obtain a copy of the License at
8+
~
9+
~ http://www.apache.org/licenses/LICENSE-2.0
10+
~
11+
~ Unless required by applicable law or agreed to in writing, software
12+
~ distributed under the License is distributed on an "AS IS" BASIS,
13+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
~ See the License for the specific language governing permissions and
15+
~ limitations under the License.
16+
-->
17+
18+
19+
<!DOCTYPE module PUBLIC
20+
"-//Puppy Crawl//DTD Check Configuration 1.3//EN"
21+
"https://checkstyle.org/dtds/configuration_1_3.dtd">
22+
23+
<module name = "Checker">
24+
<property name="charset" value="UTF-8"/>
25+
26+
<property name="severity" value="error"/>
27+
28+
<property name="fileExtensions" value="java, properties, xml"/>
29+
30+
<!-- Checks for whitespace -->
31+
<!-- See http://checkstyle.sf.net/config_whitespace.html -->
32+
<module name="FileTabCharacter">
33+
<property name="eachLine" value="true"/>
34+
</module>
35+
36+
<module name="RegexpSingleline">
37+
<!-- \s matches whitespace character, $ matches end of line. -->
38+
<property name="format" value="\s+$"/>
39+
<property name="message" value="No trailing whitespace allowed."/>
40+
</module>
41+
42+
<module name="LineLength">
43+
<property name="max" value="100"/>
44+
<property name="ignorePattern" value="^package.*|^import.*|a href|href|http://|https://|ftp://"/>
45+
</module>
46+
47+
<module name="TreeWalker">
48+
<!--
49+
If you wish to turn off checking for a section of code, you can put a comment in the source
50+
before and after the section, with the following syntax:
51+
52+
// checkstyle.off: XXX (such as checkstyle.off: NoFinalizer)
53+
... // stuff that breaks the styles
54+
// checkstyle.on: XXX (such as checkstyle.on: NoFinalizer)
55+
-->
56+
<module name="SuppressionCommentFilter">
57+
<property name="offCommentFormat" value="checkstyle\.off\: ([\w\|]+)"/>
58+
<property name="onCommentFormat" value="checkstyle\.on\: ([\w\|]+)"/>
59+
<property name="checkFormat" value="$1"/>
60+
</module>
61+
<module name="OuterTypeFilename"/>
62+
<module name="IllegalTokenText">
63+
<property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
64+
<property name="format" value="\\u00(08|09|0(a|A)|0(c|C)|0(d|D)|22|27|5(C|c))|\\(0(10|11|12|14|15|42|47)|134)"/>
65+
<property name="message" value="Avoid using corresponding octal or Unicode escape."/>
66+
</module>
67+
<module name="AvoidEscapedUnicodeCharacters">
68+
<property name="allowEscapesForControlCharacters" value="true"/>
69+
<property name="allowByTailComment" value="true"/>
70+
<property name="allowNonPrintableEscapes" value="true"/>
71+
</module>
72+
<module name="NoLineWrap"/>
73+
<module name="EmptyBlock">
74+
<property name="option" value="TEXT"/>
75+
<property name="tokens" value="LITERAL_TRY, LITERAL_FINALLY, LITERAL_IF, LITERAL_ELSE, LITERAL_SWITCH"/>
76+
</module>
77+
<module name="NeedBraces">
78+
<property name="allowSingleLineStatement" value="true"/>
79+
</module>
80+
<module name="OneStatementPerLine"/>
81+
<module name="ArrayTypeStyle"/>
82+
<module name="FallThrough"/>
83+
<module name="UpperEll"/>
84+
<module name="ModifierOrder"/>
85+
<module name="SeparatorWrap">
86+
<property name="tokens" value="DOT"/>
87+
<property name="option" value="nl"/>
88+
</module>
89+
<module name="SeparatorWrap">
90+
<property name="tokens" value="COMMA"/>
91+
<property name="option" value="EOL"/>
92+
</module>
93+
<module name="PackageName">
94+
<property name="format" value="^[a-z]+(\.[a-z][a-z0-9]*)*$"/>
95+
<message key="name.invalidPattern"
96+
value="Package name ''{0}'' must match pattern ''{1}''."/>
97+
</module>
98+
<module name="ClassTypeParameterName">
99+
<property name="format" value="([A-Z][a-zA-Z0-9]*$)"/>
100+
<message key="name.invalidPattern"
101+
value="Class type name ''{0}'' must match pattern ''{1}''."/>
102+
</module>
103+
<module name="MethodTypeParameterName">
104+
<property name="format" value="([A-Z][a-zA-Z0-9]*)"/>
105+
<message key="name.invalidPattern"
106+
value="Method type name ''{0}'' must match pattern ''{1}''."/>
107+
</module>
108+
<module name="GenericWhitespace">
109+
<message key="ws.followed"
110+
value="GenericWhitespace ''{0}'' is followed by whitespace."/>
111+
<message key="ws.preceded"
112+
value="GenericWhitespace ''{0}'' is preceded with whitespace."/>
113+
<message key="ws.illegalFollow"
114+
value="GenericWhitespace ''{0}'' should followed by whitespace."/>
115+
<message key="ws.notPreceded"
116+
value="GenericWhitespace ''{0}'' is not preceded with whitespace."/>
117+
</module>
118+
<module name="MethodParamPad"/>
119+
<module name="AnnotationLocation">
120+
<property name="tokens" value="CLASS_DEF, INTERFACE_DEF, ENUM_DEF, METHOD_DEF, CTOR_DEF"/>
121+
</module>
122+
<module name="AnnotationLocation">
123+
<property name="tokens" value="VARIABLE_DEF"/>
124+
<property name="allowSamelineMultipleAnnotations" value="true"/>
125+
</module>
126+
<module name="MethodName">
127+
<property name="format" value="^[a-z][a-z0-9][a-zA-Z0-9_]*$"/>
128+
<message key="name.invalidPattern"
129+
value="Method name ''{0}'' must match pattern ''{1}''."/>
130+
</module>
131+
<module name="EmptyCatchBlock">
132+
<property name="exceptionVariableName" value="expected"/>
133+
</module>
134+
<module name="CommentsIndentation"/>
135+
<module name="UnusedImports"/>
136+
<module name="RedundantImport"/>
137+
<module name="RedundantModifier"/>
138+
<module name="RegexpSinglelineJava">
139+
<property name="format" value="throw new \w+Error\("/>
140+
<property name="message" value="Avoid throwing error in application code."/>
141+
</module>
142+
<module name="RegexpSinglelineJava">
143+
<property name="format" value="Objects\.toStringHelper"/>
144+
<property name="message" value="Avoid using Object.toStringHelper. Use ToStringBuilder instead." />
145+
</module>
146+
<module name="RegexpSinglelineJava">
147+
<property name="format" value="new (java\.lang\.)?(Byte|Integer|Long|Short)\("/>
148+
<property name="message" value="Use static factory 'valueOf' or 'parseXXX' instead of the deprecated constructors." />
149+
</module>
150+
<module name="RegexpSinglelineJava">
151+
<property name="format" value="Files\.createTempDir\("/>
152+
<property name="message"
153+
value="Avoid using com.google.common.io.Files.createTempDir() due to CVE-2020-8908.
154+
Use org.apache.spark.network.util.JavaUtils.createTempDir() instead." />
155+
</module>
156+
<module name="RegexpSinglelineJava">
157+
<property name="format" value="FileBackedOutputStream"/>
158+
<property name="message" value="Avoid using FileBackedOutputStream due to CVE-2023-2976." />
159+
</module>
160+
<module name="RegexpSinglelineJava">
161+
<property name="format" value="AtomicDoubleArray"/>
162+
<property name="message" value="Avoid using AtomicDoubleArray due to CVE-2018-10237." />
163+
</module>
164+
<module name="RegexpSinglelineJava">
165+
<property name="format" value="CompoundOrdering"/>
166+
<property name="message" value="Avoid using CompoundOrdering due to CVE-2018-10237." />
167+
</module>
168+
<module name="RegexpSinglelineJava">
169+
<property name="format" value="@Test\(expected"/>
170+
<property name="message" value="Please use the `assertThrows` method to test for exceptions." />
171+
</module>
172+
<module name="IllegalImport">
173+
<property name="illegalPkgs" value="org.apache.log4j" />
174+
<property name="illegalPkgs" value="org.apache.commons.lang" />
175+
</module>
176+
<!-- support structured logging -->
177+
<module name="RegexpSinglelineJava">
178+
<property name="format" value="org\.slf4j\.(Logger|LoggerFactory)" />
179+
<property name="message" value="Please use org.apache.spark.internal.(SparkLogger|SparkLoggerFactory) instead." />
180+
</module>
181+
</module>
182+
</module>

java/pom.xml

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
<modules>
2121
<module>core</module>
22+
<module>spark</module>
2223
</modules>
2324

2425
<dependencyManagement>
@@ -105,7 +106,7 @@
105106
<artifactId>maven-checkstyle-plugin</artifactId>
106107
<version>3.3.1</version>
107108
<configuration>
108-
<configLocation>google_checks.xml</configLocation>
109+
<configLocation>dev/checkstyle.xml</configLocation>
109110
<consoleOutput>true</consoleOutput>
110111
<failsOnError>true</failsOnError>
111112
<violationSeverity>warning</violationSeverity>

0 commit comments

Comments
 (0)