Skip to content

Commit

Permalink
Avoid creating expensive Path objects in split creation code
Browse files Browse the repository at this point in the history
  • Loading branch information
NikhilCollooru committed Feb 4, 2025
1 parent a8a6ffd commit 6bb9027
Show file tree
Hide file tree
Showing 16 changed files with 106 additions and 134 deletions.
Binary file removed .idea/icon.png
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import com.facebook.drift.annotations.ThriftStruct;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.openjdk.jol.info.ClassLayout;

import java.io.IOException;
Expand Down Expand Up @@ -69,15 +68,15 @@ public static HiveFileInfo createHiveFileInfo(LocatedFileStatus locatedFileStatu

@ThriftConstructor
public HiveFileInfo(
String pathString,
String path,
boolean directory,
List<BlockLocation> blockLocations,
long length,
long fileModifiedTime,
Optional<byte[]> extraFileInfo,
Map<String, String> customSplitInfo)
{
this.path = requireNonNull(pathString, "pathString is null");
this.path = requireNonNull(path, "path is null");
this.isDirectory = directory;
this.blockLocations = requireNonNull(blockLocations, "blockLocations is null");
this.length = length;
Expand All @@ -87,9 +86,9 @@ public HiveFileInfo(
}

@ThriftField(1)
public String getPathString()
public String getPath()
{
return path.toString();
return path;
}

@ThriftField(2)
Expand Down Expand Up @@ -128,11 +127,6 @@ public Map<String, String> getCustomSplitInfo()
return customSplitInfo;
}

public Path getPath()
{
return new Path(path);
}

public long getRetainedSizeInBytes()
{
long blockLocationsSizeInBytes = blockLocations.stream().map(BlockLocation::getRetainedSizeInBytes).reduce(0L, Long::sum);
Expand All @@ -141,6 +135,16 @@ public long getRetainedSizeInBytes()
return INSTANCE_SIZE + path.length() + blockLocationsSizeInBytes + extraFileInfoSizeInBytes + customSplitInfoSizeInBytes;
}

/**
 * Returns the parent directory portion of this file's path (everything before
 * the last {@code '/'}), as a String so no Hadoop {@code Path} object is built.
 *
 * @throws IllegalStateException if the path contains no {@code '/'} separator;
 *         previously this surfaced as an opaque {@link StringIndexOutOfBoundsException}
 *         from {@code substring(0, -1)}
 */
public String getParent()
{
    int lastSeparator = path.lastIndexOf('/');
    if (lastSeparator < 0) {
        // A stored file path is expected to be fully qualified (e.g. scheme://.../file),
        // so a missing separator indicates corrupt input; fail with a clear message.
        throw new IllegalStateException("path has no parent directory: " + path);
    }
    return path.substring(0, lastSeparator);
}

/**
 * Returns the final component of this file's path (everything after the last
 * {@code '/'}). If the path contains no separator the whole path is returned,
 * since {@code lastIndexOf} yields -1 and the substring starts at index 0.
 */
public String getFileName()
{
    int lastSeparator = path.lastIndexOf('/');
    return path.substring(lastSeparator + 1);
}

@Override
public boolean equals(Object o)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import com.google.common.primitives.Shorts;
import com.google.common.primitives.SignedBytes;
import io.airlift.slice.Slice;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
Expand Down Expand Up @@ -76,10 +75,10 @@ public final class HiveBucketing

private HiveBucketing() {}

public static int getVirtualBucketNumber(int bucketCount, Path path)
public static int getVirtualBucketNumber(int bucketCount, String path)
{
// this is equivalent to bucketing the table on a VARCHAR column containing $path
return (hashBytes(0, utf8Slice(path.toString())) & Integer.MAX_VALUE) % bucketCount;
return (hashBytes(0, utf8Slice(path)) & Integer.MAX_VALUE) % bucketCount;
}

public static int getBucket(int bucketCount, List<Type> types, Page page, int position)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,13 @@

import com.facebook.presto.hive.metastore.Storage;
import com.facebook.presto.spi.ColumnHandle;
import com.facebook.presto.spi.PrestoException;
import org.openjdk.jol.info.ClassLayout;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
import static java.util.Objects.requireNonNull;

Expand All @@ -39,7 +35,7 @@ public class HiveSplitPartitionInfo
private static final int INSTANCE_SIZE = ClassLayout.parseClass(HiveSplitPartitionInfo.class).instanceSize();

private final Storage storage;
private final URI path;
private final String path;
private final List<HivePartitionKey> partitionKeys;
private final String partitionName;
private final int partitionDataColumnCount;
Expand All @@ -53,7 +49,7 @@ public class HiveSplitPartitionInfo

HiveSplitPartitionInfo(
Storage storage,
URI path,
String path,
List<HivePartitionKey> partitionKeys,
String partitionName,
int partitionDataColumnCount,
Expand All @@ -72,7 +68,7 @@ public class HiveSplitPartitionInfo
requireNonNull(rowIdPartitionComponent, "rowIdPartitionComponent is null");

this.storage = storage;
this.path = ensurePathHasTrailingSlash(path);
this.path = path;
this.partitionKeys = partitionKeys;
this.partitionName = partitionName;
this.partitionDataColumnCount = partitionDataColumnCount;
Expand All @@ -82,25 +78,6 @@ public class HiveSplitPartitionInfo
this.rowIdPartitionComponent = rowIdPartitionComponent;
}

// Hadoop Path strips trailing slashes from the path string,
// and Java URI has a bug where a.resolve(a.relativize(b))
// doesn't equal 'b' if 'a' had any components after the last slash
// https://bugs.openjdk.java.net/browse/JDK-6523089
private static URI ensurePathHasTrailingSlash(URI path)
{
    // This is always a partition path, i.e. a directory, so appending
    // a trailing slash is safe and normalizes URI resolution.
    String rawPath = path.getPath();
    if (rawPath.endsWith("/")) {
        return path;
    }
    try {
        return new URI(path.toString() + "/");
    }
    catch (URISyntaxException e) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, e);
    }
}

public Storage getStorage()
{
return storage;
Expand Down Expand Up @@ -164,7 +141,7 @@ public int decrementAndGetReferences()
return references.decrementAndGet();
}

public URI getPath()
// Returns the partition location as a plain String. Kept as a String (rather
// than a Hadoop Path or java.net.URI) so that split creation avoids the cost
// of constructing Path/URI objects — see the commit this change belongs to.
public String getPath()
{
    return path;
}
Expand Down
10 changes: 5 additions & 5 deletions presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ public static long parseHiveTimestamp(String value, DateTimeZone timeZone)
return HIVE_TIMESTAMP_PARSER.withZone(timeZone).parseMillis(value);
}

public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, Path path)
public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, String path)
{
if (inputFormat instanceof OrcInputFormat || inputFormat instanceof RCFileInputFormat) {
return true;
Expand All @@ -464,25 +464,25 @@ public static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fil
}
try {
method.setAccessible(true);
return (boolean) method.invoke(inputFormat, fileSystem, path);
return (boolean) method.invoke(inputFormat, fileSystem, new Path(path));
}
catch (InvocationTargetException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}

public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, Path path, boolean s3SelectPushdownEnabled)
/**
 * Decides whether an S3 Select-read file may be split.
 *
 * S3 Select supports splitting only for uncompressed CSV & JSON files.
 * Prior checks for supported input formats, SerDes, column types and S3 path
 * are already reflected in the value of {@code s3SelectPushdownEnabled}.
 */
public static boolean isSelectSplittable(InputFormat<?, ?> inputFormat, String path, boolean s3SelectPushdownEnabled)
{
    if (!s3SelectPushdownEnabled) {
        // Not an S3 Select read: splittability is decided elsewhere.
        return true;
    }
    return isUncompressed(inputFormat, path);
}

private static boolean isUncompressed(InputFormat<?, ?> inputFormat, Path path)
private static boolean isUncompressed(InputFormat<?, ?> inputFormat, String path)
{
if (inputFormat instanceof TextInputFormat) {
return !getCompressionCodec((TextInputFormat) inputFormat, path).isPresent();
return !getCompressionCodec((TextInputFormat) inputFormat, new Path(path)).isPresent();
}
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import com.facebook.presto.spi.schedule.NodeSelectionStrategy;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.fs.Path;
import org.openjdk.jol.info.ClassLayout;

import javax.annotation.concurrent.NotThreadSafe;
Expand All @@ -32,8 +31,8 @@
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static io.airlift.slice.SizeOf.sizeOf;
import static io.airlift.slice.SizeOf.sizeOfCharArray;
import static io.airlift.slice.SizeOf.sizeOfObjectArray;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Objects.requireNonNull;

@NotThreadSafe
Expand All @@ -45,7 +44,7 @@ public class InternalHiveSplit
private static final int HOST_ADDRESS_INSTANCE_SIZE = ClassLayout.parseClass(HostAddress.class).instanceSize() +
ClassLayout.parseClass(String.class).instanceSize();

private final byte[] relativeUri;
private final String path;
private final long end;
private final long fileSize;
private final long fileModifiedTime;
Expand All @@ -72,7 +71,7 @@ public class InternalHiveSplit
private int currentBlockIndex;

public InternalHiveSplit(
String relativeUri,
String path,
long start,
long end,
long fileSize,
Expand All @@ -92,15 +91,15 @@ public InternalHiveSplit(
checkArgument(end >= 0, "end must be non-negative");
checkArgument(fileSize >= 0, "fileSize must be non-negative");
checkArgument(fileModifiedTime >= 0, "fileModifiedTime must be non-negative");
requireNonNull(relativeUri, "relativeUri is null");
requireNonNull(path, "path is null");
requireNonNull(readBucketNumber, "readBucketNumber is null");
requireNonNull(tableBucketNumber, "tableBucketNumber is null");
requireNonNull(nodeSelectionStrategy, "nodeSelectionStrategy is null");
requireNonNull(partitionInfo, "partitionInfo is null");
requireNonNull(extraFileInfo, "extraFileInfo is null");
requireNonNull(encryptionInformation, "encryptionInformation is null");

this.relativeUri = relativeUri.getBytes(UTF_8);
this.path = path;
this.start = start;
this.end = end;
this.fileSize = fileSize;
Expand All @@ -113,7 +112,7 @@ public InternalHiveSplit(
this.partitionInfo = partitionInfo;
this.extraFileInfo = extraFileInfo;
this.customSplitInfo = ImmutableMap
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));
.copyOf(requireNonNull(customSplitInfo, "customSplitInfo is null"));

ImmutableList.Builder<List<HostAddress>> addressesBuilder = ImmutableList.builder();
blockEndOffsets = new long[blocks.size()];
Expand All @@ -131,8 +130,7 @@ public InternalHiveSplit(

public String getPath()
{
String relativePathString = new String(relativeUri, UTF_8);
return new Path(partitionInfo.getPath().resolve(relativePathString)).toString();
return path;
}

public long getStart()
Expand Down Expand Up @@ -254,7 +252,7 @@ public void reset()
public int getEstimatedSizeInBytes()
{
int result = INSTANCE_SIZE;
result += sizeOf(relativeUri);
result += sizeOfCharArray(path.length());
result += sizeOf(blockEndOffsets);
if (!blockAddresses.isEmpty()) {
result += sizeOfObjectArray(blockAddresses.size());
Expand All @@ -275,7 +273,7 @@ public int getEstimatedSizeInBytes()
public String toString()
{
return toStringHelper(this)
.add("relativeUri", new String(relativeUri, UTF_8))
.add("path", path)
.add("start", start)
.add("end", end)
.add("fileSize", fileSize)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
.map(p -> p.getColumns().size())
.orElseGet(table.getDataColumns()::size);
List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition(), partitionName);
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
String location = getPartitionLocation(table, partition.getPartition());
Path path = new Path(location);
Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, inputFormatName, false);
ExtendedFileSystem fileSystem = hdfsEnvironment.getFileSystem(hdfsContext, path);
Expand All @@ -173,7 +174,7 @@ private InternalHiveSplitFactory createInternalHiveSplitFactory(
false,
new HiveSplitPartitionInfo(
storage,
path.toUri(),
location,
partitionKeys,
partitionName,
partitionDataColumnCount,
Expand Down Expand Up @@ -201,7 +202,7 @@ private void validateManifest(ConnectorSession session, HivePartitionMetadata pa
int fileCount = 0;
while (fileInfoIterator.hasNext()) {
HiveFileInfo fileInfo = fileInfoIterator.next();
String fileName = fileInfo.getPath().getName();
String fileName = fileInfo.getFileName();
if (!manifestFileNames.contains(fileName)) {
throw new PrestoException(
MALFORMED_HIVE_FILE_STATISTICS,
Expand Down
Loading

0 comments on commit 6bb9027

Please sign in to comment.