man-group · poodlewars · Feb 7, 2025 · Feb 10, 2025 · Feb 10, 2025 · Feb 10, 2025
@@ -937,7 +937,7 @@ if(${TEST})
             processing/test/test_component_manager.cpp
             processing/test/test_expression.cpp
             processing/test/test_filter_and_project_sparse.cpp
-            processing/test/test_has_valid_type_promotion.cpp
+            processing/test/test_type_promotion.cpp
             processing/test/test_operation_dispatch.cpp
             processing/test/test_parallel_processing.cpp
             processing/test/test_resample.cpp

@@ -39,32 +39,57 @@ namespace arcticdb {
         return false;
     }
 
-    std::optional<entity::TypeDescriptor> has_valid_type_promotion(
+    constexpr bool is_mixed_float_and_integer(entity::DataType left, entity::DataType right) {
+        return (is_integer_type(left) && is_floating_point_type(right)) || (is_floating_point_type(left) && is_integer_type(right));
+    }
+
+    static std::optional<entity::TypeDescriptor> common_type_float_integer(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right) {
+        util::check(left.dimension() == right.dimension(), "Dimensions should match but were left={} right={}", left.dimension(), right.dimension());
+        auto dimension = left.dimension();
+        auto left_type = left.data_type();
+        auto right_type = right.data_type();
+        auto left_size = slice_bit_size(left_type);
+        auto right_size = slice_bit_size(right_type);
+        internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_mixed_float_and_integer(left_type, right_type),
+                                                        "Expected one int and one float in common_type_floats_integer");
+
+        auto target_size = entity::SizeBits::UNKNOWN_SIZE_BITS;
+        auto floating_size = is_floating_point_type(left_type) ? left_size : right_size;
+        auto integral_size = is_integer_type(left_type) ? left_size : right_size;
+        if (floating_size == entity::SizeBits::S64 || integral_size >= entity::SizeBits::S32) {
+            // (u)int64 up to float64 will lose precision, we accept that
+            target_size = entity::SizeBits::S64;
+        } else {
+            // (u)int(8/16) can fit in float32 since float32 has 24 precision bits
+            target_size = entity::SizeBits::S32;
+        }
+
+        return std::make_optional<entity::TypeDescriptor>(combine_data_type(entity::ValueType::FLOAT, target_size), dimension);
+    }
+
+    bool is_valid_type_promotion_to_target(
         const entity::TypeDescriptor& source,
         const entity::TypeDescriptor& target
     ) {
-
         if (source.dimension() != target.dimension()) {
             // Empty of dimension 0 means lack of any given type and can be promoted to anything (even if the dimensions
             // don't match), e.g. empty type can become int or array of ints. Empty type of higher dimension is used to
             // specify an empty array or an empty matrix, thus it cannot become any other type unless the dimensionality
             // matches
-            if (is_empty_type(source.data_type()) && source.dimension() == entity::Dimension::Dim0)
-                return target;
-            return std::nullopt;
+            return is_empty_type(source.data_type()) && source.dimension() == entity::Dimension::Dim0;
         }
 
         if (source == target)
-            return target;
+            return true;
 
         // Empty type is coercible to any type
         if (is_empty_type(source.data_type())) {
-            return target;
+            return true;
         }
 
         // Nothing is coercible to the empty type.
         if (is_empty_type(target.data_type())) {
-            return std::nullopt;
+            return false;
         }
 
         auto source_type = source.data_type();
@@ -73,101 +98,104 @@ namespace arcticdb {
         auto target_size = slice_bit_size(target_type);
 
         if (is_time_type(source_type)) {
-            if (!is_time_type(target_type))
-                return std::nullopt;
+            return is_time_type(target_type);
         } else if (is_unsigned_type(source_type)) {
             if (is_unsigned_type(target_type)) {
-                // UINT->UINT, target_size must be >= source_size
-                if (source_size > target_size)
-                    return std::nullopt;
+                // UINT->UINT
+                return target_size >= source_size;
             } else if (is_signed_type(target_type)) {
-                // UINT->INT, target_size must be > source_size
-                if (source_size >= target_size)
-                    return std::nullopt;
+                // UINT->INT
+                return target_size > source_size;
             } else if (is_floating_point_type(target_type)) {
-                // UINT->FLOAT, no restrictions on relative sizes
+                // UINT->FLOAT
+                return target_size == entity::SizeBits::S64 || source_size < entity::SizeBits::S32;
             } else {
                 // Non-numeric target type
-                return std::nullopt;
+                return false;
             }
         } else if (is_signed_type(source_type)) {
             if (is_unsigned_type(target_type)) {
                 // INT->UINT never promotable
-                return std::nullopt;
+                return false;
             } else if (is_signed_type(target_type)) {
-                // INT->INT, target_size must be >= source_size
-                if (source_size > target_size)
-                    return std::nullopt;
+                // INT->INT
+                return target_size >= source_size;
             } else if (is_floating_point_type(target_type)) {
-                // INT->FLOAT, no restrictions on relative sizes
+                // INT->FLOAT
+                return target_size == entity::SizeBits::S64 || source_size < entity::SizeBits::S32;
             } else {
                 // Non-numeric target type
-                return std::nullopt;
+                return false;
             }
         } else if (is_floating_point_type(source_type)) {
-            if (is_unsigned_type(target_type) || is_signed_type(target_type)) {
+            if (is_integer_type(target_type)) {
                 // FLOAT->U/INT never promotable
-                return std::nullopt;
+                return false;
             } else if (is_floating_point_type(target_type)) {
-                // FLOAT->FLOAT, target_size must be >= source_size
-                if (source_size > target_size)
-                    return std::nullopt;
+                // FLOAT->FLOAT
+                return target_size >= source_size;
             } else {
                 // Non-numeric target type
-                return std::nullopt;
+                return false;
             }
         } else if (is_sequence_type(source_type) && is_sequence_type(target_type)) {
             // Only allow promotion with UTF strings, and only to dynamic (never to fixed width)
-            if (!is_utf_type(source_type) || !is_utf_type(target_type) || !is_dynamic_string_type(target_type))
-                return std::nullopt;
+            return is_utf_type(source_type) && is_utf_type(target_type) && is_dynamic_string_type(target_type);
         } else if (is_bool_object_type(source_type)) {
-            return std::nullopt;
+            return false;
         } else {
             // Non-numeric source type
-            return std::nullopt;
+            return false;
+        }
+    }
+
+    static std::optional<entity::TypeDescriptor> common_type_mixed_sign_ints(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right) {
+        auto dimension = left.dimension();
+        auto left_type = left.data_type();
+        auto right_type = right.data_type();
+        auto left_size = slice_bit_size(left_type);
+        auto right_size = slice_bit_size(right_type);
+        // To get here we must have one signed and one unsigned type, with the width of the signed type <= the width of the unsigned type
+        internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_signed_type(left_type) ^ is_signed_type(right_type),
+                                                        "Expected one signed and one unsigned int in has_valid_common_type");
+        if (is_signed_type(left_type)) {
+            internal::check<ErrorCode::E_ASSERTION_FAILURE>(left_size <= right_size,
+                                                            "Expected left_size <= right_size in has_valid_common_type");
+        } else {
+            // is_signed_type(right_type)
+            internal::check<ErrorCode::E_ASSERTION_FAILURE>(right_size <= left_size,
+                                                            "Expected right_size <= left_size in has_valid_common_type");
         }
 
-        return target;
+        auto target_size = entity::SizeBits(uint8_t(std::max(left_size, right_size)) + 1);
+        if (target_size < entity::SizeBits::COUNT) {
+            return std::make_optional<entity::TypeDescriptor>(combine_data_type(entity::ValueType::INT, target_size), dimension);
+        } else {
+            return std::nullopt;
+        }
     }
 
-    std::optional<entity::TypeDescriptor> has_valid_common_type(
-        const entity::TypeDescriptor& left,
-        const entity::TypeDescriptor& right
-    ) {
-        auto maybe_common_type = has_valid_type_promotion(left, right);
-        if (!maybe_common_type) {
-            maybe_common_type = has_valid_type_promotion(right, left);
+    std::optional<entity::TypeDescriptor> has_valid_common_type(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right) {
+        if (is_valid_type_promotion_to_target(left, right)) {
+            return right;
+        } else if (is_valid_type_promotion_to_target(right, left)) {
+            return left;
+        }
+
+        if (left.dimension() != right.dimension()) {
+            return std::nullopt;
         }
-        // has_valid_type_promotion checks if the second type can represent all values of the first type
-        // We also want to handle cases where there is a third type that can represent both
-        // In practice, this is only the case when there is one signed and one unsigned integer right now
-        if (!maybe_common_type &&
-            left.dimension() == right.dimension() &&
-            is_integer_type(left.data_type()) &&
-            is_integer_type(right.data_type())) {
-            auto dimension = left.dimension();
-            auto left_type = left.data_type();
-            auto right_type = right.data_type();
-            auto left_size = slice_bit_size(left_type);
-            auto right_size = slice_bit_size(right_type);
-            // To get here we must have one signed and one unsigned type, with the width of the signed type <= the width of the unsigned type
-            internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_signed_type(left_type) ^ is_signed_type(right_type),
+
+        if (is_integer_type(left.data_type()) && is_integer_type(right.data_type())) {
+            // One must be signed and the other unsigned, since if they matched is_valid_type_promotion_to_target would have handled them already
+            internal::check<ErrorCode::E_ASSERTION_FAILURE>(is_signed_type(left.data_type()) ^ is_signed_type(right.data_type()),
                                                             "Expected one signed and one unsigned int in has_valid_common_type");
-            if (is_signed_type(left_type)) {
-                internal::check<ErrorCode::E_ASSERTION_FAILURE>(left_size <= right_size,
-                                                                "Expected left_size <= right_size in has_valid_common_type");
-            } else {
-                // is_signed_type(right_type)
-                internal::check<ErrorCode::E_ASSERTION_FAILURE>(right_size <= left_size,
-                                                                "Expected right_size <= left_size in has_valid_common_type");
-            }
-            auto target_size = entity::SizeBits(uint8_t(std::max(left_size, right_size)) + 1);
-            if (target_size < entity::SizeBits::COUNT) {
-                maybe_common_type = entity::TypeDescriptor{
-                        combine_data_type(entity::ValueType::INT, target_size),
-                        dimension};
-            }
+            return common_type_mixed_sign_ints(left, right);
+        } else if (is_mixed_float_and_integer(left.data_type(), right.data_type())) {
+            return common_type_float_integer(left, right);
+        } else {
+            return std::nullopt;
         }
-        return maybe_common_type;
     }
+
 }
@@ -21,26 +21,16 @@ namespace entity {
 /// n elements of left type from one buffer to n elements of type right in another buffer and get the same result
 [[nodiscard]] bool trivially_compatible_types(const entity::TypeDescriptor& left, const entity::TypeDescriptor& right);
 
-[[nodiscard]] std::optional<entity::TypeDescriptor> has_valid_type_promotion(
+[[nodiscard]] bool is_valid_type_promotion_to_target(
     const entity::TypeDescriptor& source,
     const entity::TypeDescriptor& target
 );
 
-[[nodiscard]] std::optional<entity::TypeDescriptor> has_valid_type_promotion(
-    const proto::descriptors::TypeDescriptor& source,
-    const proto::descriptors::TypeDescriptor& target
-);
-
 [[nodiscard]] std::optional<entity::TypeDescriptor> has_valid_common_type(
     const entity::TypeDescriptor& left,
     const entity::TypeDescriptor& right
 );
 
-[[nodiscard]] std::optional<entity::TypeDescriptor> has_valid_common_type(
-    const proto::descriptors::TypeDescriptor& left,
-    const proto::descriptors::TypeDescriptor& right
-);
-
 inline std::string get_user_friendly_type_string(const entity::TypeDescriptor& type) {
     return is_sequence_type(type.data_type()) ? fmt::format("TD<type=STRING, dim={}>", type.dimension_) : fmt::format("{}", type);
 }

@@ -105,7 +105,7 @@ struct ColumnExpression {
     ColumnExpression(const IterableContainer& left, const IterableContainer& right) :
         left_(left),
         right_(right) {
-        auto promoted_type = has_valid_type_promotion(left_->type(), right_->type());
+        auto promoted_type = has_valid_common_type(left_->type(), right_->type());
         util::check(static_cast<bool>(promoted_type), "Cannot promote from type {} and type {} in column expression", left_->type(), right_->type());
         type_ = promoted_type.value();
     }

@@ -567,7 +567,7 @@ void decode_into_frame_static(
 
 void check_mapping_type_compatibility(const ColumnMapping& m) {
     util::check(
-        static_cast<bool>(has_valid_type_promotion(m.source_type_desc_, m.dest_type_desc_)),
+        is_valid_type_promotion_to_target(m.source_type_desc_, m.dest_type_desc_),
         "Can't promote type {} to type {} in field {}",
         m.source_type_desc_,
         m.dest_type_desc_,

@@ -169,7 +169,6 @@ std::vector<std::vector<EntityId>> offsets_to_entity_ids(const std::vector<std::
  * On entry to a clause, construct ProcessingUnits from the input entity IDs. These will be provided by the
  * structure_for_processing method for the first clause in the pipeline and for clauses whose expected input structure
  * differs from the previous clause's output structure. Otherwise, these come directly from the previous clause.
- * clauses.
  */
 template<class... Args>
 ProcessingUnit gather_entities(ComponentManager& component_manager, const std::vector<EntityId>& entity_ids) {

@@ -139,11 +139,10 @@ struct type_arithmetic_promoted_type {
                     double,
                     float
                 >,
-                // Otherwise, if only one type is floating point, promote to this type
-                std::conditional_t<std::is_floating_point_v<LHS>,
-                    LHS,
-                    RHS
-                >
+                // Otherwise, if only one type is floating point, always promote to double
+                // For example when combining int32 and float32 the result can only fit in float64 without loss of precision
+                // Special cases like int16 and float32 can fit in float32, but we always promote up to float64 (as does Pandas)
+                double
             >,
             // Otherwise, both types are integers
             std::conditional_t<std::is_unsigned_v<LHS> && std::is_unsigned_v<RHS>,