apache · belugabehr · Feb 2, 2025 · Dec 29, 2024 · martin-g · Dec 30, 2024
diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
@@ -68,6 +68,11 @@ public Utf8(byte[] bytes) {
     this.length = length;
   }
 
+  Utf8(String string, int length) { // package-private; used by TestUtf8 in this patch
+    this(string); // presumably encodes the string into the byte buffer -- confirm
+    this.length = length; // override the byte length; not checked against the buffer size
+  }
+
   /**
    * Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}
    * assuming the bytes have been fully copied into the underlying buffer from the
@@ -173,9 +178,15 @@ public int hashCode() {
     if (h == 0) {
       byte[] bytes = this.bytes;
       int length = this.length;
-      h = 1;
-      for (int i = 0; i < length; i++) {
-        h = h * 31 + bytes[i];
+      // When the buffer is exactly full, delegate to Arrays.hashCode, which yields the
+      // same 31-based hash as the loop below and is vectorized starting with JDK 21.
+      if (length > 7 && bytes.length == length) {
+        h = Arrays.hashCode(bytes);
+      } else {
+        h = 1;
+        for (int i = 0; i < length; i++) {
+          h = h * 31 + bytes[i];
+        }
       }
       this.hash = h;
     }

diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
@@ -99,6 +99,26 @@ void hashCodeReused() {
     assertEquals(4122302, u.hashCode());
   }
 
+  /**
+   * {@code hashCode()} follows one of two code paths depending on the state of
+   * the internal buffer. A full buffer (byte length equal to the buffer length)
+   * can be hashed by the JDK's array hash function, while a partially filled
+   * buffer (byte length less than the buffer length) falls back to a scalar
+   * loop. Per the original author, the JDK only supports the partial case from
+   * JDK 23 on (TODO confirm), at which point the two paths could be merged.
+   * Both paths must yield the same hash code for the same content.
+   */
+  @Test
+  void hashCodeBasedOnCapacity() {
+    // 8 bytes of content filling an 8-byte buffer
+    int hashWhenFull = new Utf8("abcdefgh", 8).hashCode();
+
+    // the same 8 bytes of content in a 9-byte buffer
+    int hashWithSpare = new Utf8("abcdefghX", 8).hashCode();
+
+    assertEquals(hashWhenFull, hashWithSpare);
+  }
+
   @Test
   void oversizeUtf8() {
     Utf8 u = new Utf8();