Skip to content

Commit f02046f

Browse files
NikhilCollooruarhimondr
authored andcommitted
Avoid creating expensive Path objects in split creation code
1 parent 003d86a commit f02046f

16 files changed

+118
-135
lines changed

.idea/icon.png

-13.3 KB
Binary file not shown.

presto-hdfs-core/src/main/java/com/facebook/presto/hive/HiveFileInfo.java

+14-10
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import com.facebook.drift.annotations.ThriftStruct;
1919
import com.google.common.collect.ImmutableMap;
2020
import org.apache.hadoop.fs.LocatedFileStatus;
21-
import org.apache.hadoop.fs.Path;
2221
import org.openjdk.jol.info.ClassLayout;
2322

2423
import java.io.IOException;
@@ -69,15 +68,15 @@ public static HiveFileInfo createHiveFileInfo(LocatedFileStatus locatedFileStatu
6968

7069
@ThriftConstructor
7170
public HiveFileInfo(
72-
String pathString,
71+
String path,
7372
boolean directory,
7473
List<BlockLocation> blockLocations,
7574
long length,
7675
long fileModifiedTime,
7776
Optional<byte[]> extraFileInfo,
7877
Map<String, String> customSplitInfo)
7978
{
80-
this.path = requireNonNull(pathString, "pathString is null");
79+
this.path = requireNonNull(path, "path is null");
8180
this.isDirectory = directory;
8281
this.blockLocations = requireNonNull(blockLocations, "blockLocations is null");
8382
this.length = length;
@@ -87,9 +86,9 @@ public HiveFileInfo(
8786
}
8887

8988
@ThriftField(1)
90-
public String getPathString()
89+
public String getPath()
9190
{
92-
return path.toString();
91+
return path;
9392
}
9493

9594
@ThriftField(2)
@@ -128,11 +127,6 @@ public Map<String, String> getCustomSplitInfo()
128127
return customSplitInfo;
129128
}
130129

131-
public Path getPath()
132-
{
133-
return new Path(path);
134-
}
135-
136130
public long getRetainedSizeInBytes()
137131
{
138132
long blockLocationsSizeInBytes = blockLocations.stream().map(BlockLocation::getRetainedSizeInBytes).reduce(0L, Long::sum);
@@ -141,6 +135,16 @@ public long getRetainedSizeInBytes()
141135
return INSTANCE_SIZE + path.length() + blockLocationsSizeInBytes + extraFileInfoSizeInBytes + customSplitInfoSizeInBytes;
142136
}
143137

138+
public String getParent()
139+
{
140+
return path.substring(0, path.lastIndexOf('/'));
141+
}
142+
143+
public String getFileName()
144+
{
145+
return path.substring(path.lastIndexOf('/') + 1);
146+
}
147+
144148
@Override
145149
public boolean equals(Object o)
146150
{

presto-hive/src/main/java/com/facebook/presto/hive/HiveBucketing.java

+2-3
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
import com.google.common.primitives.Shorts;
3333
import com.google.common.primitives.SignedBytes;
3434
import io.airlift.slice.Slice;
35-
import org.apache.hadoop.fs.Path;
3635
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
3736
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
3837
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
@@ -76,10 +75,10 @@ public final class HiveBucketing
7675

7776
private HiveBucketing() {}
7877

79-
public static int getVirtualBucketNumber(int bucketCount, Path path)
78+
public static int getVirtualBucketNumber(int bucketCount, String path)
8079
{
8180
// this is equivalent to bucketing the table on a VARCHAR column containing $path
82-
return (hashBytes(0, utf8Slice(path.toString())) & Integer.MAX_VALUE) % bucketCount;
81+
return (hashBytes(0, utf8Slice(path)) & Integer.MAX_VALUE) % bucketCount;
8382
}
8483

8584
public static int getBucket(int bucketCount, List<Type> types, Page page, int position)

presto-hive/src/main/java/com/facebook/presto/hive/HiveSplitPartitionInfo.java

+4-27
Original file line numberDiff line numberDiff line change
@@ -16,17 +16,13 @@
1616

1717
import com.facebook.presto.hive.metastore.Storage;
1818
import com.facebook.presto.spi.ColumnHandle;
19-
import com.facebook.presto.spi.PrestoException;
2019
import org.openjdk.jol.info.ClassLayout;
2120

22-
import java.net.URI;
23-
import java.net.URISyntaxException;
2421
import java.util.List;
2522
import java.util.Optional;
2623
import java.util.Set;
2724
import java.util.concurrent.atomic.AtomicInteger;
2825

29-
import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
3026
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
3127
import static java.util.Objects.requireNonNull;
3228

@@ -39,7 +35,7 @@ public class HiveSplitPartitionInfo
3935
private static final int INSTANCE_SIZE = ClassLayout.parseClass(HiveSplitPartitionInfo.class).instanceSize();
4036

4137
private final Storage storage;
42-
private final URI path;
38+
private final String path;
4339
private final List<HivePartitionKey> partitionKeys;
4440
private final String partitionName;
4541
private final int partitionDataColumnCount;
@@ -53,7 +49,7 @@ public class HiveSplitPartitionInfo
5349

5450
HiveSplitPartitionInfo(
5551
Storage storage,
56-
URI path,
52+
String path,
5753
List<HivePartitionKey> partitionKeys,
5854
String partitionName,
5955
int partitionDataColumnCount,
@@ -72,7 +68,7 @@ public class HiveSplitPartitionInfo
7268
requireNonNull(rowIdPartitionComponent, "rowIdPartitionComponent is null");
7369

7470
this.storage = storage;
75-
this.path = ensurePathHasTrailingSlash(path);
71+
this.path = path;
7672
this.partitionKeys = partitionKeys;
7773
this.partitionName = partitionName;
7874
this.partitionDataColumnCount = partitionDataColumnCount;
@@ -82,25 +78,6 @@ public class HiveSplitPartitionInfo
8278
this.rowIdPartitionComponent = rowIdPartitionComponent;
8379
}
8480

85-
// Hadoop path strips trailing slashes from the path string,
86-
// and Java URI has a bug where a.resolve(a.relativize(b))
87-
// doesn't equal 'b' if 'a' had any components after the last slash
88-
// https://bugs.openjdk.java.net/browse/JDK-6523089
89-
private static URI ensurePathHasTrailingSlash(URI path)
90-
{
91-
// since this is the partition path, it's always a directory.
92-
// it's safe to add a trailing slash
93-
if (!path.getPath().endsWith("/")) {
94-
try {
95-
path = new URI(path.toString() + "/");
96-
}
97-
catch (URISyntaxException e) {
98-
throw new PrestoException(GENERIC_INTERNAL_ERROR, e);
99-
}
100-
}
101-
return path;
102-
}
103-
10481
public Storage getStorage()
10582
{
10683
return storage;
@@ -164,7 +141,7 @@ public int decrementAndGetReferences()
164141
return references.decrementAndGet();
165142
}
166143

167-
public URI getPath()
144+
public String getPath()
168145
{
169146
return path;
170147
}

presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java

+5-5
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,7 @@ public static long parseHiveTimestamp(String value, DateTimeZone timeZone)
442442
return HIVE_TIMESTAMP_PARSER.withZone(timeZone).parseMillis(value);
443443
}
444444

445-
public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, Path path)
445+
public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, String path)
446446
{
447447
if (inputFormat instanceof OrcInputFormat || inputFormat instanceof RCFileInputFormat) {
448448
return true;
@@ -464,25 +464,25 @@ public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fil
464464
}
465465
try {
466466
method.setAccessible(true);
467-
return (boolean) method.invoke(inputFormat, fileSystem, path);
467+
return (boolean) method.invoke(inputFormat, fileSystem, new Path(path));
468468
}
469469
catch (InvocationTargetException | IllegalAccessException e) {
470470
throw new RuntimeException(e);
471471
}
472472
}
473473

474-
public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, Path path, boolean s3SelectPushdownEnabled)
474+
public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, String path, boolean s3SelectPushdownEnabled)
475475
{
476476
// S3 Select supports splitting for uncompressed CSV & JSON files
477477
// Previous checks for supported input formats, SerDes, column types and S3 path
478478
// are reflected by the value of s3SelectPushdownEnabled.
479479
return !s3SelectPushdownEnabled || isUncompressed(inputFormat, path);
480480
}
481481

482-
private static boolean isUncompressed(InputFormat<?, ?> inputFormat, Path path)
482+
private static boolean isUncompressed(InputFormat<?, ?> inputFormat, String path)
483483
{
484484
if (inputFormat instanceof TextInputFormat) {
485-
return !getCompressionCodec((TextInputFormat) inputFormat, path).isPresent();
485+
return !getCompressionCodec((TextInputFormat) inputFormat, new Path(path)).isPresent();
486486
}
487487
return false;
488488
}

presto-hive/src/main/java/com/facebook/presto/hive/InternalHiveSplit.java

+12-11
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import com.facebook.presto.spi.schedule.NodeSelectionStrategy;
1919
import com.google.common.collect.ImmutableList;
2020
import com.google.common.collect.ImmutableMap;
21-
import org.apache.hadoop.fs.Path;
2221
import org.openjdk.jol.info.ClassLayout;
2322

2423
import javax.annotation.concurrent.NotThreadSafe;
@@ -32,8 +31,8 @@
3231
import static com.google.common.base.Preconditions.checkArgument;
3332
import static com.google.common.base.Preconditions.checkState;
3433
import static io.airlift.slice.SizeOf.sizeOf;
34+
import static io.airlift.slice.SizeOf.sizeOfCharArray;
3535
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
36-
import static java.nio.charset.StandardCharsets.UTF_8;
3736
import static java.util.Objects.requireNonNull;
3837

3938
@NotThreadSafe
@@ -45,7 +44,8 @@ public class InternalHiveSplit
4544
private static final int HOST_ADDRESS_INSTANCE_SIZE = ClassLayout.parseClass(HostAddress.class).instanceSize() +
4645
ClassLayout.parseClass(String.class).instanceSize();
4746

48-
private final byte[] relativeUri;
47+
private final String path;
48+
private final boolean isRelative;
4949
private final long end;
5050
private final long fileSize;
5151
private final long fileModifiedTime;
@@ -72,7 +72,8 @@ public class InternalHiveSplit
7272
private int currentBlockIndex;
7373

7474
public InternalHiveSplit(
75-
String relativeUri,
75+
String path,
76+
boolean isRelative,
7677
long start,
7778
long end,
7879
long fileSize,
@@ -92,15 +93,16 @@ public InternalHiveSplit(
9293
checkArgument(end >= 0, "end must be non-negative");
9394
checkArgument(fileSize >= 0, "fileSize must be non-negative");
9495
checkArgument(fileModifiedTime >= 0, "fileModifiedTime must be non-negative");
95-
requireNonNull(relativeUri, "relativeUri is null");
96+
requireNonNull(path, "path is null");
9697
requireNonNull(readBucketNumber, "readBucketNumber is null");
9798
requireNonNull(tableBucketNumber, "tableBucketNumber is null");
9899
requireNonNull(nodeSelectionStrategy, "nodeSelectionStrategy is null");
99100
requireNonNull(partitionInfo, "partitionInfo is null");
100101
requireNonNull(extraFileInfo, "extraFileInfo is null");
101102
requireNonNull(encryptionInformation, "encryptionInformation is null");
102103

103-
this.relativeUri = relativeUri.getBytes(UTF_8);
104+
this.path = path;
105+
this.isRelative = isRelative;
104106
this.start = start;
105107
this.end = end;
106108
this.fileSize = fileSize;
@@ -113,7 +115,7 @@ public InternalHiveSplit(
113115
this.partitionInfo = partitionInfo;
114116
this.extraFileInfo = extraFileInfo;
115117
this.customSplitInfo = ImmutableMap
116-
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));
118+
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));
117119

118120
ImmutableList.Builder<List<HostAddress>> addressesBuilder = ImmutableList.builder();
119121
blockEndOffsets = new long[blocks.size()];
@@ -131,8 +133,7 @@ public InternalHiveSplit(
131133

132134
public String getPath()
133135
{
134-
String relativePathString = new String(relativeUri, UTF_8);
135-
return new Path(partitionInfo.getPath().resolve(relativePathString)).toString();
136+
return isRelative ? partitionInfo.getPath() + path : path;
136137
}
137138

138139
public long getStart()
@@ -254,7 +255,7 @@ public void reset()
254255
public int getEstimatedSizeInBytes()
255256
{
256257
int result = INSTANCE_SIZE;
257-
result += sizeOf(relativeUri);
258+
result += sizeOfCharArray(path.length());
258259
result += sizeOf(blockEndOffsets);
259260
if (!blockAddresses.isEmpty()) {
260261
result += sizeOfObjectArray(blockAddresses.size());
@@ -275,7 +276,7 @@ public int getEstimatedSizeInBytes()
275276
public String toString()
276277
{
277278
return toStringHelper(this)
278-
.add("relativeUri", new String(relativeUri, UTF_8))
279+
.add("path", path)
279280
.add("start", start)
280281
.add("end", end)
281282
.add("fileSize", fileSize)

presto-hive/src/main/java/com/facebook/presto/hive/ManifestPartitionLoader.java

+4-3
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
159159
.map(p -> p.getColumns().size())
160160
.orElseGet(table.getDataColumns()::size);
161161
List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);
162-
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
162+
String location = getPartitionLocation(table, partition.getPartition());
163+
Path path = new Path(location);
163164
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
164165
InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
165166
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, path);
@@ -173,7 +174,7 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
173174
false,
174175
new HiveSplitPartitionInfo(
175176
storage,
176-
path.toUri(),
177+
location,
177178
partitionKeys,
178179
partitionName,
179180
partitionDataColumnCount,
@@ -201,7 +202,7 @@ private void validateManifest(ConnectorSession session, HivePartitionMetadata pa
201202
int fileCount = 0;
202203
while (fileInfoIterator.hasNext()) {
203204
HiveFileInfo fileInfo = fileInfoIterator.next();
204-
String fileName = fileInfo.getPath().getName();
205+
String fileName = fileInfo.getFileName();
205206
if (!manifestFileNames.contains(fileName)) {
206207
throw new PrestoException(
207208
MALFORMED_HIVE_FILE_STATISTICS,

0 commit comments

Comments
 (0)