
[SPARK-48994][SQL][PYTHON][VARIANT] Add support for interval types in the Variant Spec #47473

Closed · wants to merge 17 commits
6 changes: 6 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -4186,6 +4186,12 @@
```json
    ],
    "sqlState" : "42846"
  },
  "UNKNOWN_PRIMITIVE_TYPE_IN_VARIANT" : {
    "message" : [
      "Unknown primitive type with id <id> was found in a variant value."
    ],
    "sqlState" : "22023"
  },
  "UNKNOWN_PROTOBUF_MESSAGE_TYPE" : {
    "message" : [
      "Attempting to treat <descriptorName> as a Message, but it was <containingType>."
```
@@ -0,0 +1,134 @@
```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util;

import org.apache.spark.SparkException;

import java.math.BigDecimal;
```
> **Review comment (Contributor):** The import order rule in the newly added Java file should be consistent with that of the Scala files.
```java
import java.util.ArrayList;

// Replicating code from SparkIntervalUtils so code in the 'common' space can work with
```
> **Review comment (Contributor):** Why replicating? I think other modules depend on common/utils, and it would be fine to move the interval utils to this module.
>
> **Reply (@harshmotw-db, author, Jul 25, 2024):** I agree. However, I believe that should be a different PR, as it would be a big change in itself. For the purposes of this PR, these functions only support the ANSI style, while the functions in SQL sometimes also expect the Hive style.

```java
// day-time intervals.
public class DayTimeIntervalUtils {
  private static byte DAY = 0;
  private static byte HOUR = 1;
  private static byte MINUTE = 2;
  private static byte SECOND = 3;
  private static long HOURS_PER_DAY = 24;
  private static long MINUTES_PER_HOUR = 60;
  private static long SECONDS_PER_MINUTE = 60;
  private static long MILLIS_PER_SECOND = 1000;
  private static long MICROS_PER_MILLIS = 1000;
  private static long MICROS_PER_SECOND = MICROS_PER_MILLIS * MILLIS_PER_SECOND;
  private static long MICROS_PER_MINUTE = SECONDS_PER_MINUTE * MICROS_PER_SECOND;
  private static long MICROS_PER_HOUR = MINUTES_PER_HOUR * MICROS_PER_MINUTE;
  private static long MICROS_PER_DAY = HOURS_PER_DAY * MICROS_PER_HOUR;
  private static long MAX_DAY = Long.MAX_VALUE / MICROS_PER_DAY;
  private static long MAX_HOUR = Long.MAX_VALUE / MICROS_PER_HOUR;
  private static long MAX_MINUTE = Long.MAX_VALUE / MICROS_PER_MINUTE;
  private static long MAX_SECOND = Long.MAX_VALUE / MICROS_PER_SECOND;

  public static String fieldToString(byte field) throws SparkException {
    if (field == DAY) {
      return "DAY";
    } else if (field == HOUR) {
      return "HOUR";
    } else if (field == MINUTE) {
      return "MINUTE";
    } else if (field == SECOND) {
      return "SECOND";
    } else {
      throw new SparkException("Invalid field in day-time interval: " + field +
        ". Supported fields are: DAY, HOUR, MINUTE, SECOND");
    }
  }

  // Used to convert microseconds representing a day-time interval with given start and end fields
  // to its ANSI SQL string representation. Throws a SparkException if startField or endField are
  // out of bounds.
  public static String toDayTimeIntervalANSIString(long micros, byte startField, byte endField)
      throws SparkException {
    String sign = "";
    long rest = micros;
    try {
      String from = fieldToString(startField).toUpperCase();
      String to = fieldToString(endField).toUpperCase();
      String prefix = "INTERVAL '";
      String postfix = startField == endField ? "' " + from : "' " + from + " TO " + to;
      if (micros < 0) {
        if (micros == Long.MIN_VALUE) {
          // Special handling of the minimum `Long` value, because the negate op overflows `Long`.
          // seconds = 106751991 * (24 * 60 * 60) + 4 * 60 * 60 + 54 = 9223372036854
          // microseconds = -9223372036854000000L - 775808 == Long.MIN_VALUE
          String baseStr = "-106751991 04:00:54.775808000";
          String firstStr = "-" + (startField == DAY ? Long.toString(MAX_DAY) :
            (startField == HOUR ? Long.toString(MAX_HOUR) :
              (startField == MINUTE ? Long.toString(MAX_MINUTE) :
                Long.toString(MAX_SECOND) + ".775808")));
```
> **Review comment (Contributor):** `MAX_SECOND + ".775808"`

```java
          if (startField == endField) {
            return prefix + firstStr + postfix;
          } else {
            int substrStart = startField == DAY ? 10 : (startField == HOUR ? 13 : 16);
            int substrEnd = endField == HOUR ? 13 : (endField == MINUTE ? 16 : 26);
            return prefix + firstStr + baseStr.substring(substrStart, substrEnd) + postfix;
          }
        } else {
          sign = "-";
          rest = -rest;
        }
      }
      StringBuilder formatBuilder = new StringBuilder(sign);
      ArrayList<Long> formatArgs = new ArrayList<>();
```
> **Review comment (Contributor):** `List<Long> formatArgs = new ArrayList<>();`

```java
      if (startField == DAY) {
        formatBuilder.append(rest / MICROS_PER_DAY);
        rest %= MICROS_PER_DAY;
      } else if (startField == HOUR) {
        formatBuilder.append("%02d");
        formatArgs.add(rest / MICROS_PER_HOUR);
        rest %= MICROS_PER_HOUR;
      } else if (startField == MINUTE) {
        formatBuilder.append("%02d");
        formatArgs.add(rest / MICROS_PER_MINUTE);
        rest %= MICROS_PER_MINUTE;
      } else if (startField == SECOND) {
        String leadZero = rest < 10 * MICROS_PER_SECOND ? "0" : "";
        formatBuilder.append(leadZero + BigDecimal.valueOf(rest, 6)
          .stripTrailingZeros().toPlainString());
      }
```

> **Review comment (Contributor):** Should use chained calls, like `formatBuilder.append(leadZero).append(BigDecimal.valueOf(rest, 6).stripTrailingZeros().toPlainString());`, otherwise `leadZero + BigDecimal.valueOf(rest, 6).stripTrailingZeros().toPlainString()` will result in another string concatenation.

```java
      if (startField < HOUR && HOUR <= endField) {
        formatBuilder.append(" %02d");
        formatArgs.add(rest / MICROS_PER_HOUR);
        rest %= MICROS_PER_HOUR;
      }
      if (startField < MINUTE && MINUTE <= endField) {
        formatBuilder.append(":%02d");
        formatArgs.add(rest / MICROS_PER_MINUTE);
        rest %= MICROS_PER_MINUTE;
      }
      if (startField < SECOND && SECOND <= endField) {
        String leadZero = rest < 10 * MICROS_PER_SECOND ? "0" : "";
        formatBuilder.append(":" + leadZero + BigDecimal.valueOf(rest, 6)
          .stripTrailingZeros().toPlainString());
      }
      return prefix + String.format(formatBuilder.toString(), formatArgs.toArray()) + postfix;
```

> **Review comment (Contributor):** Ditto (the same chained-`append` point applies here).
```java
    } catch (SparkException e) {
      throw e;
    }
  }
}
```

> **Review comment (Contributor):** The try-catch here seems a bit redundant. Why do we need to catch the SparkException only to rethrow it?
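The formatting logic above is hard to follow in diff form. As a rough standalone sketch of the common case (a non-negative microsecond count rendered as a `DAY TO SECOND` interval; the class and method names here are illustrative, not part of the PR):

```java
import java.math.BigDecimal;

public class DayTimeSketch {
    static final long MICROS_PER_SECOND = 1_000_000L;
    static final long MICROS_PER_MINUTE = 60 * MICROS_PER_SECOND;
    static final long MICROS_PER_HOUR = 60 * MICROS_PER_MINUTE;
    static final long MICROS_PER_DAY = 24 * MICROS_PER_HOUR;

    // Formats a non-negative microsecond count as an ANSI DAY TO SECOND interval,
    // mirroring the structure of toDayTimeIntervalANSIString above.
    static String dayToSecondANSI(long micros) {
        long days = micros / MICROS_PER_DAY;
        long rest = micros % MICROS_PER_DAY;
        long hours = rest / MICROS_PER_HOUR;
        rest %= MICROS_PER_HOUR;
        long minutes = rest / MICROS_PER_MINUTE;
        rest %= MICROS_PER_MINUTE;
        // Seconds keep up to 6 fractional digits, with trailing zeros stripped.
        String seconds = BigDecimal.valueOf(rest, 6).stripTrailingZeros().toPlainString();
        if (rest < 10 * MICROS_PER_SECOND) seconds = "0" + seconds;
        return String.format("INTERVAL '%d %02d:%02d:%s' DAY TO SECOND",
            days, hours, minutes, seconds);
    }

    public static void main(String[] args) {
        long micros = ((26 * 60 + 3) * 60 + 4) * MICROS_PER_SECOND; // 1 day, 02:03:04
        System.out.println(dayToSecondANSI(micros)); // INTERVAL '1 02:03:04' DAY TO SECOND
    }
}
```

The real method additionally handles negative values, the `Long.MIN_VALUE` edge case, and arbitrary start/end fields.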
@@ -0,0 +1,50 @@
```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util;

// Replicating code from SparkIntervalUtils so code in the 'common' space can work with
// year-month intervals.
public class YearMonthIntervalUtils {
  private static byte YEAR = 0;
  private static byte MONTH = 1;
  private static int MONTHS_PER_YEAR = 12;

  // Used to convert months representing a year-month interval with given start and end fields
  // to its ANSI SQL string representation.
  public static String toYearMonthIntervalANSIString(int months, byte startField, byte endField) {
    String sign = "";
    long absMonths = months;
    if (months < 0) {
      sign = "-";
      absMonths = -absMonths;
    }
    String year = sign + Long.toString(absMonths / MONTHS_PER_YEAR);
    String yearAndMonth = year + "-" + Long.toString(absMonths % MONTHS_PER_YEAR);
    StringBuilder formatBuilder = new StringBuilder("INTERVAL '");
    if (startField == endField) {
      if (startField == YEAR) {
        formatBuilder.append(year + "' YEAR");
      } else {
        formatBuilder.append(Integer.toString(months) + "' MONTH");
      }
    } else {
      formatBuilder.append(yearAndMonth + "' YEAR TO MONTH");
    }
    return formatBuilder.toString();
  }
}
```
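For intuition, a minimal standalone sketch mirroring the `YEAR TO MONTH` branch of the method above (the class and method names are illustrative, not from the PR):

```java
public class YearMonthSketch {
    // Formats a signed month count as an ANSI YEAR TO MONTH interval, mirroring
    // toYearMonthIntervalANSIString: the sign is printed once, before the year part.
    static String yearToMonthANSI(int months) {
        String sign = months < 0 ? "-" : "";
        long abs = Math.abs((long) months); // widen first so Integer.MIN_VALUE is safe
        return "INTERVAL '" + sign + (abs / 12) + "-" + (abs % 12) + "' YEAR TO MONTH";
    }

    public static void main(String[] args) {
        System.out.println(yearToMonthANSI(14)); // INTERVAL '1-2' YEAR TO MONTH
        System.out.println(yearToMonthANSI(-7)); // INTERVAL '-0-7' YEAR TO MONTH
    }
}
```

Note that, as in the PR code, a negative interval under one year renders as `-0-<months>`.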
48 changes: 27 additions & 21 deletions common/variant/README.md
@@ -335,27 +335,29 @@
| Object | `2` | A collection of (string-key, variant-value) pairs |
| Array | `3` | An ordered sequence of variant values |

| Primitive Type | Type ID | Equivalent Parquet Type | Binary format |
|-----------------------------|---------|-----------------------------|---------------------------------------------------------------------------------------------------------------------|
| null | `0` | any | none |
| boolean (True) | `1` | BOOLEAN | none |
| boolean (False) | `2` | BOOLEAN | none |
| int8 | `3` | INT(8, signed) | 1 byte |
| int16 | `4` | INT(16, signed) | 2 byte little-endian |
| int32 | `5` | INT(32, signed) | 4 byte little-endian |
| int64 | `6` | INT(64, signed) | 8 byte little-endian |
| double | `7` | DOUBLE | IEEE little-endian |
| decimal4 | `8` | DECIMAL(precision, scale) | 1 byte scale in range [0, 38], followed by little-endian unscaled value (see decimal table) |
| decimal8 | `9` | DECIMAL(precision, scale) | 1 byte scale in range [0, 38], followed by little-endian unscaled value (see decimal table) |
| decimal16 | `10` | DECIMAL(precision, scale) | 1 byte scale in range [0, 38], followed by little-endian unscaled value (see decimal table) |
| date | `11` | DATE | 4 byte little-endian |
| timestamp | `12` | TIMESTAMP(true, MICROS) | 8-byte little-endian |
| timestamp without time zone | `13` | TIMESTAMP(false, MICROS) | 8-byte little-endian |
| float | `14` | FLOAT | IEEE little-endian |
| binary | `15` | BINARY | 4 byte little-endian size, followed by bytes |
| string | `16` | STRING | 4 byte little-endian size, followed by UTF-8 encoded bytes |
| binary from metadata | `17` | BINARY | Little-endian index into the metadata dictionary. Number of bytes is equal to the metadata `offset_size`. |
| string from metadata | `18` | STRING | Little-endian index into the metadata dictionary. Number of bytes is equal to the metadata `offset_size`. |
| year-month interval         | `19`    | INT(32, signed)<sup>1</sup> | 1 byte denoting the start field (1 bit) and end field (1 bit) starting at the LSB, followed by a 4-byte little-endian value. |
| day-time interval           | `20`    | INT(64, signed)<sup>1</sup> | 1 byte denoting the start field (2 bits) and end field (2 bits) starting at the LSB, followed by an 8-byte little-endian value. |

| Decimal Precision | Decimal value type |
|-----------------------|--------------------|
@@ -364,6 +366,10 @@
| 18 <= precision <= 38 | int128 |
| > 38 | Not supported |

The year-month and day-time interval types have one byte at the beginning indicating the start and end fields. In the case of the year-month interval, the least significant bit denotes the start field and the next least significant bit denotes the end field. The remaining 6 bits are unused. A field value of 0 represents YEAR and 1 represents MONTH. In the case of the day-time interval, the least significant 2 bits denote the start field and the next least significant 2 bits denote the end field. The remaining 4 bits are unused. A field value of 0 represents DAY, 1 represents HOUR, 2 represents MINUTE, and 3 represents SECOND.
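A sketch of how a reader of the spec might decode that leading field byte for a day-time interval value (all names here are hypothetical, not part of the spec):

```java
public class IntervalHeaderSketch {
    // Day-time interval field codes, per the spec: 0 = DAY, 1 = HOUR,
    // 2 = MINUTE, 3 = SECOND.
    static final String[] DT_FIELDS = {"DAY", "HOUR", "MINUTE", "SECOND"};

    // The 2 least significant bits hold the start field and the next 2 bits
    // hold the end field; the remaining 4 bits are unused.
    static String decodeDayTimeHeader(byte b) {
        int start = b & 0x3;
        int end = (b >> 2) & 0x3;
        return DT_FIELDS[start] + " TO " + DT_FIELDS[end];
    }

    public static void main(String[] args) {
        // start = DAY (0), end = SECOND (3): header bits 0b1100
        System.out.println(decodeDayTimeHeader((byte) 0b1100)); // DAY TO SECOND
    }
}
```

A year-month header decodes the same way, except each field occupies a single bit (0 = YEAR, 1 = MONTH).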

[1] The Parquet format does not have exact equivalents for the year-month and day-time interval types. Year-month intervals are usually represented using int32 values, and day-time intervals using int64 values. However, these values do not include the start and end fields of these types, so Spark stores them in the column metadata.

# Field ID order and uniqueness

For objects, field IDs and offsets must be listed in the order of the corresponding field names, sorted lexicographically. Note that the fields themselves are not required to follow this order. As a result, offsets will not necessarily be listed in ascending order.