From 4bf1b9420990de1453b9b4bb145d7d37dc750f07 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Thu, 1 Jun 2023 11:37:38 +0200 Subject: [PATCH] hunspell (minor): reduce allocations when reading the dictionary's morphological data (#12323) there can be many entries with morph data, so we'd better avoid compiling and matching regexes and even stream allocation --- .../lucene/analysis/hunspell/Dictionary.java | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index b5d13271c3f2..820acddbc092 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -992,7 +992,7 @@ private int mergeDictionaries( // if we haven't seen any custom morphological data, try to parse one if (!hasCustomMorphData) { int morphStart = line.indexOf(MORPH_SEPARATOR); - if (morphStart >= 0 && morphStart < line.length()) { + if (morphStart >= 0) { String data = line.substring(morphStart + 1); hasCustomMorphData = splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:")); @@ -1321,14 +1321,22 @@ private List splitMorphData(String morphData) { if (morphData.isBlank()) { return Collections.emptyList(); } - return Arrays.stream(morphData.split("\\s+")) - .filter( - s -> - s.length() > 3 - && Character.isLetter(s.charAt(0)) - && Character.isLetter(s.charAt(1)) - && s.charAt(2) == ':') - .collect(Collectors.toList()); + + List result = null; + int start = 0; + for (int i = 0; i <= morphData.length(); i++) { + if (i == morphData.length() || Character.isWhitespace(morphData.charAt(i))) { + if (i - start > 3 + && Character.isLetter(morphData.charAt(start)) + && Character.isLetter(morphData.charAt(start + 1)) + && morphData.charAt(start + 2) == ':') { + if (result == null) result = new ArrayList<>(); + result.add(morphData.substring(start, i)); + } + start = i + 1; + } + } + return result == null ? List.of() : result; } boolean hasFlag(IntsRef forms, char flag) {