Add support for double to varchar coercion in hive tables #18930

Merged

io/trino/plugin/hive/HivePageSource.java
@@ -17,6 +17,7 @@
import io.trino.filesystem.Location;
import io.trino.plugin.hive.HivePageSourceProvider.BucketAdaptation;
import io.trino.plugin.hive.HivePageSourceProvider.ColumnMapping;
import io.trino.plugin.hive.coercions.CoercionUtils.CoercionContext;
import io.trino.plugin.hive.coercions.TypeCoercer;
import io.trino.plugin.hive.type.TypeInfo;
import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion;
@@ -78,12 +79,12 @@ public HivePageSource(
Optional<BucketValidator> bucketValidator,
Optional<ReaderProjectionsAdapter> projectionsAdapter,
TypeManager typeManager,
HiveTimestampPrecision timestampPrecision,
CoercionContext coercionContext,
ConnectorPageSource delegate)
{
requireNonNull(columnMappings, "columnMappings is null");
requireNonNull(typeManager, "typeManager is null");
requireNonNull(timestampPrecision, "timestampPrecision is null");
requireNonNull(coercionContext, "coercionContext is null");

this.delegate = requireNonNull(delegate, "delegate is null");
this.columnMappings = columnMappings;
@@ -111,7 +112,7 @@ public HivePageSource(
.orElse(ImmutableList.of());
HiveType fromType = columnMapping.getBaseTypeCoercionFrom().get().getHiveTypeForDereferences(dereferenceIndices).get();
HiveType toType = columnMapping.getHiveColumnHandle().getHiveType();
coercers.add(createCoercer(typeManager, fromType, toType, timestampPrecision));
coercers.add(createCoercer(typeManager, fromType, toType, coercionContext));
}
else {
coercers.add(Optional.empty());

io/trino/plugin/hive/HivePageSourceProvider.java
@@ -23,6 +23,7 @@
import io.trino.plugin.hive.HiveSplit.BucketConversion;
import io.trino.plugin.hive.HiveSplit.BucketValidation;
import io.trino.plugin.hive.acid.AcidTransaction;
import io.trino.plugin.hive.coercions.CoercionUtils.CoercionContext;
import io.trino.plugin.hive.type.TypeInfo;
import io.trino.plugin.hive.util.HiveBucketing.BucketingVersion;
import io.trino.spi.TrinoException;
@@ -68,6 +69,7 @@
import static io.trino.plugin.hive.coercions.CoercionUtils.createTypeFromCoercer;
import static io.trino.plugin.hive.util.HiveBucketing.HiveBucketFilter;
import static io.trino.plugin.hive.util.HiveBucketing.getHiveBucketFilter;
import static io.trino.plugin.hive.util.HiveClassNames.ORC_SERDE_CLASS;
import static io.trino.plugin.hive.util.HiveUtil.getDeserializerClassName;
import static io.trino.plugin.hive.util.HiveUtil.getInputFormatName;
import static io.trino.plugin.hive.util.HiveUtil.getPrefilledColumnValue;
@@ -190,10 +192,12 @@ public static Optional<ConnectorPageSource> createHivePageSource(
Optional<BucketAdaptation> bucketAdaptation = createBucketAdaptation(bucketConversion, tableBucketNumber, regularAndInterimColumnMappings);
Optional<BucketValidator> bucketValidator = createBucketValidator(path, bucketValidation, tableBucketNumber, regularAndInterimColumnMappings);

HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
// Apache Hive reads Double.NaN as null when coerced to varchar for the ORC file format
boolean treatNaNAsNull = ORC_SERDE_CLASS.equals(getDeserializerClassName(schema));
CoercionContext coercionContext = new CoercionContext(getTimestampPrecision(session), treatNaNAsNull);

for (HivePageSourceFactory pageSourceFactory : pageSourceFactories) {
List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, typeManager, timestampPrecision);
List<HiveColumnHandle> desiredColumns = toColumnHandles(regularAndInterimColumnMappings, typeManager, coercionContext);

Optional<ReaderPageSource> readerWithProjections = pageSourceFactory.createPageSource(
session,
@@ -224,7 +228,7 @@ public static Optional<ConnectorPageSource> createHivePageSource(
bucketValidator,
adapter,
typeManager,
timestampPrecision,
coercionContext,
pageSource));
}
}
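
As a side note, here is a self-contained sketch of the serde check above. ORC_SERDE_CLASS below is Apache Hive's real OrcSerde class name, while the Properties-based schema and the "serialization.lib" key are assumptions standing in for Trino's schema handling and its getDeserializerClassName helper, which are not shown in this diff.

import java.util.Properties;

// Sketch: derive the NaN-as-null flag from the table's deserializer class,
// since only Hive's ORC reader maps Double.NaN to NULL on double -> varchar reads.
public class TreatNaNAsNullSketch
{
    static final String ORC_SERDE_CLASS = "org.apache.hadoop.hive.ql.io.orc.OrcSerde";

    static boolean treatNaNAsNull(Properties schema)
    {
        return ORC_SERDE_CLASS.equals(schema.getProperty("serialization.lib"));
    }

    public static void main(String[] args)
    {
        Properties orcTable = new Properties();
        orcTable.setProperty("serialization.lib", ORC_SERDE_CLASS);
        System.out.println(treatNaNAsNull(orcTable));          // true
        System.out.println(treatNaNAsNull(new Properties()));  // false
    }
}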
@@ -473,7 +477,7 @@ public static List<ColumnMapping> extractRegularAndInterimColumnMappings(List<Co
.collect(toImmutableList());
}

public static List<HiveColumnHandle> toColumnHandles(List<ColumnMapping> regularColumnMappings, TypeManager typeManager, HiveTimestampPrecision timestampPrecision)
public static List<HiveColumnHandle> toColumnHandles(List<ColumnMapping> regularColumnMappings, TypeManager typeManager, CoercionContext coercionContext)
{
return regularColumnMappings.stream()
.map(columnMapping -> {
@@ -489,14 +493,14 @@ public static List<HiveColumnHandle> toColumnHandles(List<ColumnMapping> regular
projectedColumn.getDereferenceIndices(),
projectedColumn.getDereferenceNames(),
fromHiveType,
createTypeFromCoercer(typeManager, fromHiveType, columnHandle.getHiveType(), timestampPrecision));
createTypeFromCoercer(typeManager, fromHiveType, columnHandle.getHiveType(), coercionContext));
});

return new HiveColumnHandle(
columnHandle.getBaseColumnName(),
columnHandle.getBaseHiveColumnIndex(),
fromHiveTypeBase,
createTypeFromCoercer(typeManager, fromHiveTypeBase, columnHandle.getBaseHiveType(), timestampPrecision),
createTypeFromCoercer(typeManager, fromHiveTypeBase, columnHandle.getBaseHiveType(), coercionContext),
newColumnProjectionInfo,
columnHandle.getColumnType(),
columnHandle.getComment());

io/trino/plugin/hive/coercions/CoercionUtils.java
@@ -71,21 +71,21 @@ public final class CoercionUtils
{
private CoercionUtils() {}

public static Type createTypeFromCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, HiveTimestampPrecision timestampPrecision)
public static Type createTypeFromCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, CoercionContext coercionContext)
{
return createCoercer(typeManager, fromHiveType, toHiveType, timestampPrecision)
return createCoercer(typeManager, fromHiveType, toHiveType, coercionContext)
.map(TypeCoercer::getFromType)
.orElseGet(() -> fromHiveType.getType(typeManager, timestampPrecision));
.orElseGet(() -> fromHiveType.getType(typeManager, coercionContext.timestampPrecision()));
}

public static Optional<TypeCoercer<? extends Type, ? extends Type>> createCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, HiveTimestampPrecision timestampPrecision)
public static Optional<TypeCoercer<? extends Type, ? extends Type>> createCoercer(TypeManager typeManager, HiveType fromHiveType, HiveType toHiveType, CoercionContext coercionContext)
{
if (fromHiveType.equals(toHiveType)) {
return Optional.empty();
}

Type fromType = fromHiveType.getType(typeManager, timestampPrecision);
Type toType = toHiveType.getType(typeManager, timestampPrecision);
Type fromType = fromHiveType.getType(typeManager, coercionContext.timestampPrecision());
Type toType = toHiveType.getType(typeManager, coercionContext.timestampPrecision());

if (toType instanceof VarcharType toVarcharType && (fromHiveType.equals(HIVE_BYTE) || fromHiveType.equals(HIVE_SHORT) || fromHiveType.equals(HIVE_INT) || fromHiveType.equals(HIVE_LONG))) {
return Optional.of(new IntegerNumberToVarcharCoercer<>(fromType, toVarcharType));
@@ -147,19 +147,22 @@ public static Type createTypeFromCoercer(TypeManager typeManager, HiveType fromH
if (fromType instanceof TimestampType && toType instanceof VarcharType varcharType) {
return Optional.of(new TimestampCoercer.LongTimestampToVarcharCoercer(TIMESTAMP_NANOS, varcharType));
}
if (fromType == DOUBLE && toType instanceof VarcharType toVarcharType) {
return Optional.of(new DoubleToVarcharCoercer(toVarcharType, coercionContext.treatNaNAsNull()));
}
if ((fromType instanceof ArrayType) && (toType instanceof ArrayType)) {
return createCoercerForList(
typeManager,
(ListTypeInfo) fromHiveType.getTypeInfo(),
(ListTypeInfo) toHiveType.getTypeInfo(),
timestampPrecision);
coercionContext);
}
if ((fromType instanceof MapType) && (toType instanceof MapType)) {
return createCoercerForMap(
typeManager,
(MapTypeInfo) fromHiveType.getTypeInfo(),
(MapTypeInfo) toHiveType.getTypeInfo(),
timestampPrecision);
coercionContext);
}
if ((fromType instanceof RowType) && (toType instanceof RowType)) {
HiveType fromHiveTypeStruct = (fromHiveType.getCategory() == Category.UNION) ? HiveType.toHiveType(fromType) : fromHiveType;
@@ -169,7 +172,7 @@ public static Type createTypeFromCoercer(TypeManager typeManager, HiveType fromH
typeManager,
(StructTypeInfo) fromHiveTypeStruct.getTypeInfo(),
(StructTypeInfo) toHiveTypeStruct.getTypeInfo(),
timestampPrecision);
coercionContext);
}

throw new TrinoException(NOT_SUPPORTED, format("Unsupported coercion from %s to %s", fromHiveType, toHiveType));
@@ -196,35 +199,35 @@ public static boolean narrowerThan(CharType first, CharType second)
TypeManager typeManager,
ListTypeInfo fromListTypeInfo,
ListTypeInfo toListTypeInfo,
HiveTimestampPrecision timestampPrecision)
CoercionContext coercionContext)
{
HiveType fromElementHiveType = HiveType.valueOf(fromListTypeInfo.getListElementTypeInfo().getTypeName());
HiveType toElementHiveType = HiveType.valueOf(toListTypeInfo.getListElementTypeInfo().getTypeName());

return createCoercer(typeManager, fromElementHiveType, toElementHiveType, timestampPrecision)
return createCoercer(typeManager, fromElementHiveType, toElementHiveType, coercionContext)
.map(elementCoercer -> new ListCoercer(new ArrayType(elementCoercer.getFromType()), new ArrayType(elementCoercer.getToType()), elementCoercer));
}

private static Optional<TypeCoercer<? extends Type, ? extends Type>> createCoercerForMap(
TypeManager typeManager,
MapTypeInfo fromMapTypeInfo,
MapTypeInfo toMapTypeInfo,
HiveTimestampPrecision timestampPrecision)
CoercionContext coercionContext)
{
HiveType fromKeyHiveType = HiveType.valueOf(fromMapTypeInfo.getMapKeyTypeInfo().getTypeName());
HiveType fromValueHiveType = HiveType.valueOf(fromMapTypeInfo.getMapValueTypeInfo().getTypeName());
HiveType toKeyHiveType = HiveType.valueOf(toMapTypeInfo.getMapKeyTypeInfo().getTypeName());
HiveType toValueHiveType = HiveType.valueOf(toMapTypeInfo.getMapValueTypeInfo().getTypeName());
Optional<TypeCoercer<? extends Type, ? extends Type>> keyCoercer = createCoercer(typeManager, fromKeyHiveType, toKeyHiveType, timestampPrecision);
Optional<TypeCoercer<? extends Type, ? extends Type>> valueCoercer = createCoercer(typeManager, fromValueHiveType, toValueHiveType, timestampPrecision);
Optional<TypeCoercer<? extends Type, ? extends Type>> keyCoercer = createCoercer(typeManager, fromKeyHiveType, toKeyHiveType, coercionContext);
Optional<TypeCoercer<? extends Type, ? extends Type>> valueCoercer = createCoercer(typeManager, fromValueHiveType, toValueHiveType, coercionContext);
MapType fromType = new MapType(
keyCoercer.map(TypeCoercer::getFromType).orElseGet(() -> fromKeyHiveType.getType(typeManager, timestampPrecision)),
valueCoercer.map(TypeCoercer::getFromType).orElseGet(() -> fromValueHiveType.getType(typeManager, timestampPrecision)),
keyCoercer.map(TypeCoercer::getFromType).orElseGet(() -> fromKeyHiveType.getType(typeManager, coercionContext.timestampPrecision())),
valueCoercer.map(TypeCoercer::getFromType).orElseGet(() -> fromValueHiveType.getType(typeManager, coercionContext.timestampPrecision())),
typeManager.getTypeOperators());

MapType toType = new MapType(
keyCoercer.map(TypeCoercer::getToType).orElseGet(() -> toKeyHiveType.getType(typeManager, timestampPrecision)),
valueCoercer.map(TypeCoercer::getToType).orElseGet(() -> toValueHiveType.getType(typeManager, timestampPrecision)),
keyCoercer.map(TypeCoercer::getToType).orElseGet(() -> toKeyHiveType.getType(typeManager, coercionContext.timestampPrecision())),
valueCoercer.map(TypeCoercer::getToType).orElseGet(() -> toValueHiveType.getType(typeManager, coercionContext.timestampPrecision())),
typeManager.getTypeOperators());

return Optional.of(new MapCoercer(fromType, toType, keyCoercer, valueCoercer));
@@ -234,7 +237,7 @@ public static boolean narrowerThan(CharType first, CharType second)
TypeManager typeManager,
StructTypeInfo fromStructTypeInfo,
StructTypeInfo toStructTypeInfo,
HiveTimestampPrecision timestampPrecision)
CoercionContext coercionContext)
{
ImmutableList.Builder<Optional<TypeCoercer<? extends Type, ? extends Type>>> coercers = ImmutableList.builder();
ImmutableList.Builder<Field> fromField = ImmutableList.builder();
@@ -248,20 +251,20 @@ public static boolean narrowerThan(CharType first, CharType second)
if (i >= fromStructFieldName.size()) {
toField.add(new Field(
Optional.of(toStructFieldNames.get(i)),
toStructFieldType.getType(typeManager, timestampPrecision)));
toStructFieldType.getType(typeManager, coercionContext.timestampPrecision())));
coercers.add(Optional.empty());
}
else {
HiveType fromStructFieldType = HiveType.valueOf(fromStructTypeInfo.getAllStructFieldTypeInfos().get(i).getTypeName());

Optional<TypeCoercer<? extends Type, ? extends Type>> coercer = createCoercer(typeManager, fromStructFieldType, toStructFieldType, timestampPrecision);
Optional<TypeCoercer<? extends Type, ? extends Type>> coercer = createCoercer(typeManager, fromStructFieldType, toStructFieldType, coercionContext);

fromField.add(new Field(
Optional.of(fromStructFieldName.get(i)),
coercer.map(TypeCoercer::getFromType).orElseGet(() -> fromStructFieldType.getType(typeManager, timestampPrecision))));
coercer.map(TypeCoercer::getFromType).orElseGet(() -> fromStructFieldType.getType(typeManager, coercionContext.timestampPrecision()))));
toField.add(new Field(
Optional.of(toStructFieldNames.get(i)),
coercer.map(TypeCoercer::getToType).orElseGet(() -> toStructFieldType.getType(typeManager, timestampPrecision))));
coercer.map(TypeCoercer::getToType).orElseGet(() -> toStructFieldType.getType(typeManager, coercionContext.timestampPrecision()))));

coercers.add(coercer);
}
@@ -394,4 +397,12 @@ protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int pos
throw new UnsupportedOperationException("Not supported");
}
}

public record CoercionContext(HiveTimestampPrecision timestampPrecision, boolean treatNaNAsNull)
{
public CoercionContext
{
requireNonNull(timestampPrecision, "timestampPrecision is null");
}
}
}
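
For readers unfamiliar with compact canonical constructors, a standalone demo of the validation pattern the new CoercionContext record uses; the enum and class names below are illustrative, not Trino's types.

import static java.util.Objects.requireNonNull;

// Demo: a record's compact constructor runs before field assignment,
// so invalid arguments are rejected at construction time.
public class CoercionContextDemo
{
    enum HiveTimestampPrecision { MILLISECONDS, MICROSECONDS, NANOSECONDS }

    record CoercionContext(HiveTimestampPrecision timestampPrecision, boolean treatNaNAsNull)
    {
        CoercionContext
        {
            requireNonNull(timestampPrecision, "timestampPrecision is null");
        }
    }

    public static void main(String[] args)
    {
        CoercionContext orc = new CoercionContext(HiveTimestampPrecision.MILLISECONDS, true);
        System.out.println(orc.treatNaNAsNull()); // true
        try {
            new CoercionContext(null, false);
        }
        catch (NullPointerException e) {
            System.out.println(e.getMessage()); // timestampPrecision is null
        }
    }
}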

io/trino/plugin/hive/coercions/DoubleToVarcharCoercer.java
@@ -0,0 +1,56 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive.coercions;

import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.trino.spi.TrinoException;
import io.trino.spi.block.Block;
import io.trino.spi.block.BlockBuilder;
import io.trino.spi.type.DoubleType;
import io.trino.spi.type.VarcharType;

import static io.airlift.slice.SliceUtf8.countCodePoints;
import static io.trino.spi.StandardErrorCode.INVALID_ARGUMENTS;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static java.lang.String.format;

public class DoubleToVarcharCoercer
extends TypeCoercer<DoubleType, VarcharType>
{
private final boolean treatNaNAsNull;

public DoubleToVarcharCoercer(VarcharType toType, boolean treatNaNAsNull)
{
super(DOUBLE, toType);
this.treatNaNAsNull = treatNaNAsNull;
}

@Override
protected void applyCoercedValue(BlockBuilder blockBuilder, Block block, int position)
{
double doubleValue = DOUBLE.getDouble(block, position);

if (Double.isNaN(doubleValue) && treatNaNAsNull) {
blockBuilder.appendNull();
return;
}

Slice converted = Slices.utf8Slice(Double.toString(doubleValue));
if (!toType.isUnbounded() && countCodePoints(converted) > toType.getBoundedLength()) {
throw new TrinoException(INVALID_ARGUMENTS, format("Varchar representation of %s exceeds %s bounds", doubleValue, toType));
        }
        toType.writeSlice(blockBuilder, converted);
    }
}

Review thread on the bounds check:

Member:

If we call truncateToLength(converted, toType) without the countCodePoints(converted) > toType.getBoundedLength() check, does that not throw any error? It would be nicer to avoid the extra bounds check and have the error come out of truncateToLength directly.

Member Author:

No, it doesn't throw any error. The scope of truncateToLength is only to trim a varchar representation to a narrower length, so it never fails explicitly. It would also be the wrong tool here, since an over-long value must be rejected rather than silently truncated. Thanks for pointing it out.
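
Following up on that thread, a value-level sketch of the new coercer's behavior in plain Java, without Trino's block machinery: Optional.empty() stands in for a null block entry and IllegalArgumentException for TrinoException.

import java.util.Optional;

// Sketch of DoubleToVarcharCoercer's value logic: NaN maps to null when
// treatNaNAsNull is set; otherwise the value renders via Double.toString,
// and a bounded varchar rejects (never truncates) over-long representations.
public class DoubleToVarcharSketch
{
    static Optional<String> coerce(double value, boolean treatNaNAsNull, int boundedLength)
    {
        if (Double.isNaN(value) && treatNaNAsNull) {
            return Optional.empty(); // ORC path: Hive reads NaN as NULL
        }
        String converted = Double.toString(value);
        if (converted.codePointCount(0, converted.length()) > boundedLength) {
            throw new IllegalArgumentException(
                    "Varchar representation of " + value + " exceeds varchar(" + boundedLength + ") bounds");
        }
        return Optional.of(converted);
    }

    public static void main(String[] args)
    {
        System.out.println(coerce(Double.NaN, true, 10));               // Optional.empty
        System.out.println(coerce(Double.POSITIVE_INFINITY, true, 10)); // Optional[Infinity]
        System.out.println(coerce(0.1 + 0.2, true, 25));                // Optional[0.30000000000000004]
        try {
            coerce(0.1 + 0.2, true, 5);
        }
        catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // exceeds varchar(5) bounds
        }
    }
}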

io/trino/plugin/hive/orc/OrcTypeTranslator.java
@@ -14,6 +14,7 @@
package io.trino.plugin.hive.orc;

import io.trino.orc.metadata.OrcType.OrcTypeKind;
import io.trino.plugin.hive.coercions.DoubleToVarcharCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.LongTimestampToVarcharCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.VarcharToLongTimestampCoercer;
import io.trino.plugin.hive.coercions.TimestampCoercer.VarcharToShortTimestampCoercer;
@@ -24,6 +25,7 @@

import java.util.Optional;

import static io.trino.orc.metadata.OrcType.OrcTypeKind.DOUBLE;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.STRING;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.TIMESTAMP;
import static io.trino.orc.metadata.OrcType.OrcTypeKind.VARCHAR;
@@ -45,6 +47,9 @@ private OrcTypeTranslator() {}
}
return Optional.of(new VarcharToLongTimestampCoercer(createUnboundedVarcharType(), timestampType));
}
if (fromOrcType == DOUBLE && toTrinoType instanceof VarcharType varcharType) {
return Optional.of(new DoubleToVarcharCoercer(varcharType, true));
}
return Optional.empty();
}
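
Since OrcTypeTranslator only deals with ORC types (fromOrcType is an OrcTypeKind), the NaN-as-null flag can be hardcoded to true on this path; it mirrors the serde-based check added in HivePageSourceProvider above.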
