apache · gsmiller · May 10, 2023 · Jun 2, 2023
diff --git a/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java b/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
@@ -91,11 +91,47 @@ public AutomatonQuery(final Term term, Automaton automaton, boolean isBinary) {
    */
   public AutomatonQuery(
       final Term term, Automaton automaton, boolean isBinary, RewriteMethod rewriteMethod) {
+    this(term, automaton, false, true, isBinary, rewriteMethod);
+  }
+
+  /**
+   * Create a new AutomatonQuery from an {@link Automaton}.
+   *
+   * @param term Term containing field and possibly some pattern structure. The term text is
+   *     ignored.
+   * @param automaton Automaton to run, terms that are accepted are considered a match.
+   * @param isFinite if true, this automaton is already finite
+   * @param isBinary if true, this automaton is already binary and will not go through the
+   *     UTF32ToUTF8 conversion
+   */
+  public AutomatonQuery(
+      final Term term, Automaton automaton, boolean isFinite, boolean simplify, boolean isBinary) {
+    this(term, automaton, isFinite, simplify, isBinary, CONSTANT_SCORE_BLENDED_REWRITE);
+  }
+
+  /**
+   * Create a new AutomatonQuery from an {@link Automaton}.
+   *
+   * @param term Term containing field and possibly some pattern structure. The term text is
+   *     ignored.
+   * @param automaton Automaton to run, terms that are accepted are considered a match.
+   * @param isFinite if true, this automaton is already finite
+   * @param isBinary if true, this automaton is already binary and will not go through the
+   *     UTF32ToUTF8 conversion
+   * @param rewriteMethod the rewriteMethod to use to build the final query from the automaton
+   */
+  public AutomatonQuery(
+      final Term term,
+      Automaton automaton,
+      boolean isFinite,
+      boolean simplify,
+      boolean isBinary,
+      RewriteMethod rewriteMethod) {
     super(term.field(), rewriteMethod);
     this.term = term;
     this.automaton = automaton;
     this.automatonIsBinary = isBinary;
-    this.compiled = new CompiledAutomaton(automaton, false, true, isBinary);
+    this.compiled = new CompiledAutomaton(automaton, isFinite, simplify, isBinary);
 
     this.ramBytesUsed =
         BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed();

diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,26 +17,20 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
-import java.io.UncheckedIOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
+import java.util.List;
 import java.util.SortedSet;
-import org.apache.lucene.index.FilteredTermsEnum;
-import org.apache.lucene.index.PrefixCodedTerms;
-import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.FiniteStringsIterator;
+import org.apache.lucene.util.fst.Util;
 
 /**
  * Specialization for a disjunction over many terms that, by default, behaves like a {@link
@@ -69,189 +63,83 @@
  *
  * <p>NOTE: This query produces scores that are equal to its boost
  */
-public class TermInSetQuery extends MultiTermQuery implements Accountable {
-
-  private static final long BASE_RAM_BYTES_USED =
-      RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class);
-
+public class TermInSetQuery extends AutomatonQuery {
   private final String field;
-  private final PrefixCodedTerms termData;
-  private final int termDataHashCode; // cached hashcode of termData
+  private final int termCount;
 
   public TermInSetQuery(String field, Collection<BytesRef> terms) {
-    this(field, packTerms(field, terms));
+    super(new Term(field), toAutomaton(terms), true, false, true);
+    this.field = field;
+    this.termCount = terms.size();
   }
 
   public TermInSetQuery(String field, BytesRef... terms) {
-    this(field, packTerms(field, Arrays.asList(terms)));
+    this(field, Arrays.asList(terms));
   }
 
   /** Creates a new {@link TermInSetQuery} from the given collection of terms. */
   public TermInSetQuery(RewriteMethod rewriteMethod, String field, Collection<BytesRef> terms) {
-    super(field, rewriteMethod);
+    super(new Term(field), toAutomaton(terms), true, false, true, rewriteMethod);
     this.field = field;
-    this.termData = packTerms(field, terms);
-    termDataHashCode = termData.hashCode();
+    this.termCount = terms.size();
   }
 
   /** Creates a new {@link TermInSetQuery} from the given array of terms. */
   public TermInSetQuery(RewriteMethod rewriteMethod, String field, BytesRef... terms) {
     this(rewriteMethod, field, Arrays.asList(terms));
   }
 
-  private TermInSetQuery(String field, PrefixCodedTerms termData) {
-    super(field, MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE);
-    this.field = field;
-    this.termData = termData;
-    termDataHashCode = termData.hashCode();
-  }
-
-  private static PrefixCodedTerms packTerms(String field, Collection<BytesRef> terms) {
+  private static Automaton toAutomaton(Collection<BytesRef> terms) {
     BytesRef[] sortedTerms = terms.toArray(new BytesRef[0]);
     // already sorted if we are a SortedSet with natural order
     boolean sorted =
         terms instanceof SortedSet && ((SortedSet<BytesRef>) terms).comparator() == null;
     if (sorted == false) {
       ArrayUtil.timSort(sortedTerms);
     }
-    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    List<BytesRef> sortedAndDeduped = new ArrayList<>(terms.size());
     BytesRefBuilder previous = null;
     for (BytesRef term : sortedTerms) {
       if (previous == null) {
         previous = new BytesRefBuilder();
       } else if (previous.get().equals(term)) {
         continue; // deduplicate
       }
-      builder.add(field, term);
+      sortedAndDeduped.add(term);
       previous.copyBytes(term);
     }
 
-    return builder.finish();
+    return Automata.makeBinaryStringUnion(sortedAndDeduped);
   }
 
   @Override
   public long getTermsCount() throws IOException {
-    return termData.size();
-  }
-
-  @Override
-  public void visit(QueryVisitor visitor) {
-    if (visitor.acceptField(field) == false) {
-      return;
-    }
-    if (termData.size() == 1) {
-      visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
-    }
-    if (termData.size() > 1) {
-      visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
-    }
-  }
-
-  // TODO: This is pretty heavy-weight. If we have TermInSetQuery directly extend AutomatonQuery
-  // we won't have to do this (see GH#12176).
-  private ByteRunAutomaton asByteRunAutomaton() {
-    try {
-      Automaton a = Automata.makeBinaryStringUnion(termData.iterator());
-      return new ByteRunAutomaton(a, true);
-    } catch (IOException e) {
-      // Shouldn't happen since termData.iterator() provides an interator implementation that
-      // never throws:
-      throw new UncheckedIOException(e);
-    }
-  }
-
-  @Override
-  public boolean equals(Object other) {
-    return sameClassAs(other) && equalsTo(getClass().cast(other));
-  }
-
-  private boolean equalsTo(TermInSetQuery other) {
-    // no need to check 'field' explicitly since it is encoded in 'termData'
-    // termData might be heavy to compare so check the hash code first
-    return termDataHashCode == other.termDataHashCode && termData.equals(other.termData);
-  }
-
-  @Override
-  public int hashCode() {
-    return 31 * classHash() + termDataHashCode;
+    return termCount;
   }
 
+  /**
+   * This is semi-expensive as it fully traverses the automaton to create a helpful toString
+   * representation.
+   */
   @Override
   public String toString(String defaultField) {
     StringBuilder builder = new StringBuilder();
     builder.append(field);
     builder.append(":(");
 
-    TermIterator iterator = termData.iterator();
     boolean first = true;
-    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
-      if (!first) {
+    FiniteStringsIterator it = new FiniteStringsIterator(automaton);
+    BytesRefBuilder scratch = new BytesRefBuilder();
+    for (IntsRef term = it.next(); term != null; term = it.next()) {
+      if (first == false) {
         builder.append(' ');
       }
       first = false;
-      builder.append(Term.toString(term));
+      BytesRef t = Util.toBytesRef(term, scratch);
+      builder.append(Term.toString(t));
     }
     builder.append(')');
 
     return builder.toString();
   }
-
-  @Override
-  public long ramBytesUsed() {
-    return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
-  }
-
-  @Override
-  public Collection<Accountable> getChildResources() {
-    return Collections.emptyList();
-  }
-
-  @Override
-  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
-    return new SetEnum(terms.iterator());
-  }
-
-  /**
-   * Like a baby {@link org.apache.lucene.index.AutomatonTermsEnum}, ping-pong intersects the terms
-   * dict against our encoded query terms.
-   */
-  private class SetEnum extends FilteredTermsEnum {
-    private final TermIterator iterator;
-    private BytesRef seekTerm;
-
-    SetEnum(TermsEnum termsEnum) {
-      super(termsEnum);
-      iterator = termData.iterator();
-      seekTerm = iterator.next();
-    }
-
-    @Override
-    protected AcceptStatus accept(BytesRef term) throws IOException {
-      // next() our iterator until it is >= the incoming term
-      // if it matches exactly, it's a hit, otherwise it's a miss
-      int cmp = 0;
-      while (seekTerm != null && (cmp = seekTerm.compareTo(term)) < 0) {
-        seekTerm = iterator.next();
-      }
-      if (seekTerm == null) {
-        return AcceptStatus.END;
-      } else if (cmp == 0) {
-        return AcceptStatus.YES_AND_SEEK;
-      } else {
-        return AcceptStatus.NO_AND_SEEK;
-      }
-    }
-
-    @Override
-    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
-      // next() our iterator until it is > the currentTerm, must always make progress.
-      if (currentTerm == null) {
-        return seekTerm;
-      }
-      while (seekTerm != null && seekTerm.compareTo(currentTerm) <= 0) {
-        seekTerm = iterator.next();
-      }
-      return seekTerm;
-    }
-  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermInSetQuery.java
@@ -25,7 +25,6 @@
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
@@ -47,7 +46,6 @@
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.automaton.ByteRunAutomaton;
 
 public class TestTermInSetQuery extends LuceneTestCase {
 
@@ -359,48 +357,4 @@ public void testIsConsideredCostlyByQueryCache() throws IOException {
     // cached after two uses
     assertTrue(policy.shouldCache(query));
   }
-
-  public void testVisitor() {
-    // singleton reports back to consumeTerms()
-    TermInSetQuery singleton = new TermInSetQuery("field", newBytesRef("term1"));
-    singleton.visit(
-        new QueryVisitor() {
-          @Override
-          public void consumeTerms(Query query, Term... terms) {
-            assertEquals(1, terms.length);
-            assertEquals(new Term("field", newBytesRef("term1")), terms[0]);
-          }
-
-          @Override
-          public void consumeTermsMatching(
-              Query query, String field, Supplier<ByteRunAutomaton> automaton) {
-            fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
-          }
-        });
-
-    // multiple values built into automaton
-    List<BytesRef> terms = new ArrayList<>();
-    for (int i = 0; i < 100; i++) {
-      terms.add(newBytesRef("term" + i));
-    }
-    TermInSetQuery t = new TermInSetQuery("field", terms);
-    t.visit(
-        new QueryVisitor() {
-          @Override
-          public void consumeTerms(Query query, Term... terms) {
-            fail("TermInSetQuery with multiple terms should build automaton");
-          }
-
-          @Override
-          public void consumeTermsMatching(
-              Query query, String field, Supplier<ByteRunAutomaton> automaton) {
-            ByteRunAutomaton a = automaton.get();
-            BytesRef test = newBytesRef("nonmatching");
-            assertFalse(a.run(test.bytes, test.offset, test.length));
-            for (BytesRef term : terms) {
-              assertTrue(a.run(term.bytes, term.offset, term.length));
-            }
-          }
-        });
-  }
 }