Introduced the Word2VecSynonymFilter #12169

Merged · 6 commits · Apr 24, 2023
Changes from 1 commit
@@ -89,6 +89,9 @@
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.stempel.StempelStemmer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.word2vec.SynonymProvider;
import org.apache.lucene.analysis.synonym.word2vec.Word2VecModel;
import org.apache.lucene.analysis.synonym.word2vec.Word2VecSynonymProvider;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.MockTokenFilter;
@@ -99,8 +102,10 @@
import org.apache.lucene.tests.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IgnoreRandomChains;
import org.apache.lucene.util.TermAndVector;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -415,6 +420,27 @@ private String randomNonEmptyString(Random random) {
}
}
});
put(
SynonymProvider.class,
random -> {
final int numEntries = atLeast(10);
final int vectorDimension = random.nextInt(99) + 1;
Word2VecModel model = new Word2VecModel(numEntries, vectorDimension);
for (int j = 0; j < numEntries; j++) {
String s = TestUtil.randomSimpleString(random, 10, 20);
float[] vec = new float[vectorDimension];
for (int i = 0; i < vectorDimension; i++) {
vec[i] = random.nextFloat();
}
model.addTermAndVector(new TermAndVector(new BytesRef(s), vec));
}
try {
return new Word2VecSynonymProvider(model);
} catch (IOException e) {
Rethrow.rethrow(e);
return null; // unreachable code
}
});
put(
DateFormat.class,
random -> {
2 changes: 2 additions & 0 deletions lucene/analysis/common/src/java/module-info.java
@@ -79,6 +79,7 @@
exports org.apache.lucene.analysis.sr;
exports org.apache.lucene.analysis.sv;
exports org.apache.lucene.analysis.synonym;
exports org.apache.lucene.analysis.synonym.word2vec;
exports org.apache.lucene.analysis.ta;
exports org.apache.lucene.analysis.te;
exports org.apache.lucene.analysis.th;
@@ -257,6 +258,7 @@
org.apache.lucene.analysis.sv.SwedishMinimalStemFilterFactory,
org.apache.lucene.analysis.synonym.SynonymFilterFactory,
org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory,
org.apache.lucene.analysis.synonym.word2vec.Word2VecSynonymFilterFactory,
org.apache.lucene.analysis.core.FlattenGraphFilterFactory,
org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory,
org.apache.lucene.analysis.te.TeluguStemFilterFactory,
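Since the new factory is registered through module-info.java (and the analysis SPI), it can be wired into an analyzer by name. A minimal sketch, assuming the SPI name is "Word2VecSynonym" and the parameter keys are "model", "maxSynonymsPerTerm" and "minAcceptedSimilarity" — none of which are confirmed by this diff:

import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class Word2VecAnalyzerSketch {
  public static Analyzer build() throws Exception {
    // Hypothetical wiring of the new filter; SPI name and parameter keys are assumptions.
    return CustomAnalyzer.builder(Paths.get("/path/to/resources"))
        .withTokenizer("standard")
        .addTokenFilter(
            "Word2VecSynonym",
            "model", "word2vec-model.zip", // dl4j-exported zip resolved via the ResourceLoader
            "maxSynonymsPerTerm", "5",
            "minAcceptedSimilarity", "0.8")
        .build();
  }
}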
New file: org/apache/lucene/analysis/synonym/word2vec/Dl4jModelReader.java
@@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym.word2vec;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.Locale;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermAndVector;

/**
 * Dl4jModelReader is a Word2VecModelReader that reads the model file generated by the library
 * Deeplearning4j
 *
 * <p>Dl4j Word2Vec documentation:
 * https://deeplearning4j.konduit.ai/v/en-1.0.0-beta7/language-processing/word2vec Example to
 * generate a model using dl4j:
 * https://github.com/eclipse/deeplearning4j-examples/blob/master/dl4j-examples/src/main/java/org/deeplearning4j/examples/advanced/modelling/embeddingsfromcorpus/word2vec/Word2VecRawTextExample.java
 *
 * @lucene.experimental
 */
public class Dl4jModelReader implements Word2VecModelReader {
Contributor comment:
For the sake of context, I worked with @dantuzi on deciding whether to implement a custom reader (this one) or to use dl4j as an imported library.
Multiple attempts were made to include dl4j as a dependency in Lucene, but the effort and impact were not worth it, so we reverted to a simple custom reader class.
There are downsides to this of course, but it is much more lightweight.

Contributor comment:
+1 to avoid dependencies whenever practical

private static final String MODEL_FILE_NAME_PREFIX = "syn0";

private final String word2vecModelFilePath;
Contributor comment:
why pass in the path when it is only used in toString()? Can we choose between accepting a java.nio.Path that does its own open/close of the zip file and an (anonymous) InputStream?

Contributor Author comment:
Everything comes from the Word2VecSynonymFilterFactory, which implements ResourceLoaderAware. This interface provides us with an org.apache.lucene.util.ResourceLoader and the possibility to obtain an anonymous InputStream.
I decided to also pass the model file path to enrich the exception message and make the user's life easier.
BTW I don't have a strong opinion about this. I can easily remove that string.

Contributor Author comment:
Addressed the comment by removing the word2vecModelFilePath string and changing the exception message.

private final ZipInputStream word2VecModelZipFile;

public Dl4jModelReader(String word2vecModelFilePath, InputStream stream) {
this.word2vecModelFilePath = word2vecModelFilePath;
this.word2VecModelZipFile = new ZipInputStream(new BufferedInputStream(stream));
}

@Override
public Word2VecModel read() throws IOException {

ZipEntry entry;
while ((entry = word2VecModelZipFile.getNextEntry()) != null) {
String fileName = entry.getName();
if (fileName.startsWith(MODEL_FILE_NAME_PREFIX)) {
BufferedReader reader =
new BufferedReader(new InputStreamReader(word2VecModelZipFile, StandardCharsets.UTF_8));

String header = reader.readLine();
String[] headerValues = header.split(" ");
int dictionarySize = Integer.parseInt(headerValues[0]);
int vectorDimension = Integer.parseInt(headerValues[1]);

Word2VecModel model = new Word2VecModel(dictionarySize, vectorDimension);
reader
Contributor comment:
to me this would read much clearer using a traditional while loop:

while ((line = reader.readLine()) != null) ...

.lines()
.forEach(
line -> {
String[] tokens = line.split(" ");
BytesRef term = decodeTerm(tokens[0]);

float[] vector = new float[tokens.length - 1];

if (vectorDimension != vector.length) {
throw new RuntimeException(
String.format(
Locale.ROOT,
"Word2Vec model file corrupted. "
+ "Declared vectors of size %d but found vector of size %d for word %s (%s)",
vectorDimension,
vector.length,
tokens[0],
term.utf8ToString()));
}

for (int i = 1; i < tokens.length; i++) {
vector[i - 1] = Float.parseFloat(tokens[i]);
}
model.addTermAndVector(new TermAndVector(term, vector));
});
return model;
}
}
throw new UnsupportedEncodingException(
Contributor comment:
I think this exception is really intended for use with character set encodings only. Maybe IllegalArgumentException would fit better?

Contributor Author comment:
When we use the DL4J library to train a model and export it, we obtain a compressed zip file.
This zip contains multiple files, but we are only interested in the syn0 file. The exception is thrown if the passed zip does not contain any syn0 file.
I guess IllegalArgumentException would fit.

"The ZIP file '"
+ word2vecModelFilePath
+ "' does not contain any "
+ MODEL_FILE_NAME_PREFIX
+ " file");
}
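For reference, a sketch of the traditional while loop suggested in the review above, replacing the reader.lines().forEach(...) chain inside read() while keeping the same parsing logic (not the committed code). One side benefit: a plain loop can throw the checked IOException directly instead of wrapping the error in a RuntimeException inside a lambda.

// Inside read(), after parsing the header line:
String line;
while ((line = reader.readLine()) != null) {
  String[] tokens = line.split(" ");
  BytesRef term = decodeTerm(tokens[0]);
  float[] vector = new float[tokens.length - 1];
  if (vectorDimension != vector.length) {
    throw new IOException(
        "Word2Vec model file corrupted. Declared vectors of size "
            + vectorDimension + " but found vector of size " + vector.length
            + " for word " + tokens[0] + " (" + term.utf8ToString() + ")");
  }
  for (int i = 1; i < tokens.length; i++) {
    vector[i - 1] = Float.parseFloat(tokens[i]);
  }
  model.addTermAndVector(new TermAndVector(term, vector));
}
return model;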

static BytesRef decodeTerm(String term) {
if (term.toLowerCase(Locale.ROOT).startsWith("b64:")) {
Contributor comment:
It seems wasteful to lower case every term here, even when they are not b64-encoded. Also: is this something that dl4j will do consistently throughout the file? If so, we can peek at the first term and then assume the remainder will also be b64-encoded. I also wonder about the trim() - why do we need it? Does Base64.decode leave garbage at the end of the terms sometimes?

Contributor Author comment:
I like your suggestion to read the first term and assume the remaining terms are encoded in the same way.
I did some checks and the trim() was useless. Thank you for noticing it.


byte[] buffer = Base64.getDecoder().decode(term.substring(4).trim());
return new BytesRef(buffer, 0, buffer.length);
}
return new BytesRef(term);
}
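One possible shape for the change agreed above — compare only the prefix case-insensitively instead of lower-casing the whole term, and drop the unnecessary trim(). A sketch, not the final code:

static BytesRef decodeTerm(String term) {
  // Check only the 4-character "b64:" prefix, ignoring case, rather than lower-casing the term.
  if (term.regionMatches(true, 0, "b64:", 0, 4)) {
    byte[] buffer = Base64.getDecoder().decode(term.substring(4));
    return new BytesRef(buffer, 0, buffer.length);
  }
  return new BytesRef(term);
}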

@Override
public void close() throws IOException {
word2VecModelZipFile.close();
}
}
New file: org/apache/lucene/analysis/synonym/word2vec/SynonymProvider.java
@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym.word2vec;

import java.io.IOException;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermAndBoost;

/**
* Generic synonym provider
*
* @lucene.experimental
*/
public interface SynonymProvider {

/**
 * Returns the synonyms of the given term
 *
 * @param term the term for which we want to find synonyms
 * @param maxSynonymsPerTerm maximum number of results returned by the synonym search
Contributor comment:
I don't see that we need this interface if its only use is in this one Dl4jWord2VecSynonymFilter. Can't we simply refer directly to the implementing class?

* @param minAcceptedSimilarity minimal value of cosine similarity between the searched vector and
* the retrieved ones
*/
List<TermAndBoost> getSynonyms(BytesRef term, int maxSynonymsPerTerm, float minAcceptedSimilarity)
throws IOException;
}
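A minimal usage sketch of the interface, assuming a synonymProvider instance has already been built from a loaded model (the term and thresholds below are arbitrary examples):

// Hypothetical call: at most 5 synonyms with cosine similarity of at least 0.8 to "king".
List<TermAndBoost> synonyms =
    synonymProvider.getSynonyms(new BytesRef("king"), 5, 0.8f);
System.out.println("found " + synonyms.size() + " synonyms");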
New file: org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym.word2vec;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermAndVector;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;

/**
* Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each
* word in dictionary
*
* @lucene.experimental
*/
public class Word2VecModel implements RandomAccessVectorValues<float[]> {

private final int dictionarySize;
private final int vectorDimension;
private final TermAndVector[] data;
private final Map<BytesRef, TermAndVector> word2Vec;
Contributor comment:
did you consider using BytesRefHash?

Contributor Author comment:
I've never seen BytesRefHash before, thank you for your suggestion.

private int loadedCount = 0;

public Word2VecModel(int dictionarySize, int vectorDimension) {
this.dictionarySize = dictionarySize;
this.vectorDimension = vectorDimension;
this.data = new TermAndVector[dictionarySize];
this.word2Vec = new HashMap<>();
}

private Word2VecModel(
int dictionarySize,
int vectorDimension,
TermAndVector[] data,
Map<BytesRef, TermAndVector> word2Vec) {
this.dictionarySize = dictionarySize;
this.vectorDimension = vectorDimension;
this.data = data;
this.word2Vec = word2Vec;
}

public void addTermAndVector(TermAndVector modelEntry) {
modelEntry.normalizeVector();
this.data[loadedCount++] = modelEntry;
this.word2Vec.put(modelEntry.getTerm(), modelEntry);
}

@Override
public float[] vectorValue(int ord) throws IOException {
return data[ord].getVector();
}

public float[] vectorValue(BytesRef term) {
TermAndVector entry = word2Vec.get(term);
return (entry == null) ? null : entry.getVector();
}

public BytesRef binaryValue(int targetOrd) throws IOException {
return data[targetOrd].getTerm();
Contributor comment:
This is incorrect - the purpose of this method is to return some bytes representing the vector value. I think instead you ought to simply throw UnsupportedOperationException since this implementation will never be used in an indexing context where this method is required.

Contributor Author comment (@dantuzi, Apr 4, 2023):
As you can see, this method is not @Override, so it is not an implementation of the RandomAccessVectorValues interface. This is a custom method used in our implementation.

}

@Override
public int dimension() {
return vectorDimension;
}

@Override
public int size() {
return dictionarySize;
}

@Override
public RandomAccessVectorValues<float[]> copy() throws IOException {
Contributor comment:
if this copy is meant to do a deep copy, I suspect we'll need to handle it differently. I am not sure it is copying the internal elements rather than reusing them, so a copy could end up adding elements to data structures used by the original object.

Contributor comment:
It does not need to do a deep copy; the purpose of this method is to enable multiple concurrent accesses to the underlying data. Since this implementation doesn't have any temporary variable into which vectors are decoded (which could be overwritten), I think it's safe to simply return this.

Contributor Author comment:
@msokolov I tried to implement your suggestion but it looks like the method HnswGraphBuilder::build doesn't accept the same reference that was passed to HnswGraphBuilder.create [1].
To be honest I still don't understand why this check [2] is required.

[1]

Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()
java.lang.IllegalArgumentException: Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()
	at __randomizedtesting.SeedInfo.seed([994075DD4398F0A4:E100BB05917EA0E6]:0)
	at org.apache.lucene.core@10.0.0-SNAPSHOT/org.apache.lucene.util.hnsw.HnswGraphBuilder.build(HnswGraphBuilder.java:165)
	at org.apache.lucene.analysis.synonym.word2vec.Word2VecSynonymProvider.<init>(Word2VecSynonymProvider.java:64)
	at org.apache.lucene.analysis.synonym.word2vec.TestWord2VecSynonymProvider.<init>(TestWord2VecSynonymProvider.java:39)

[2]

if (vectorsToAdd == this.vectors) {
  throw new IllegalArgumentException(
      "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
}


return new Word2VecModel(this.dictionarySize, this.vectorDimension, this.data, this.word2Vec);
}
}
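A small sketch of the in-memory model API defined above (terms and vector values are arbitrary; note that addTermAndVector normalizes each vector in place):

Word2VecModel model = new Word2VecModel(2, 3);
model.addTermAndVector(new TermAndVector(new BytesRef("cat"), new float[] {1f, 0f, 0f}));
model.addTermAndVector(new TermAndVector(new BytesRef("dog"), new float[] {0.9f, 0.1f, 0f}));
float[] catVector = model.vectorValue(new BytesRef("cat")); // lookup by term, already normalized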
New file: org/apache/lucene/analysis/synonym/word2vec/Word2VecModelReader.java
@@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.synonym.word2vec;

import java.io.Closeable;
import java.io.IOException;

/**
 * Each class implementing this interface must be able to read a Word2Vec model format and provide
 * a Word2VecModel with normalized vectors
*
* @lucene.experimental
*/
public interface Word2VecModelReader extends Closeable {
Contributor comment:
Do we have more than one implementation? I don't think this interface is necessary. Later we can always add it if we have multiple implementations and need to abstract. For now it's just extra stuff to maintain.


Word2VecModel read() throws IOException;
}
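Putting the reader and the model together, a sketch of loading a dl4j-exported archive with the two-argument Dl4jModelReader constructor shown in this commit (a later commit drops the path string); the file path is a placeholder and the snippet assumes java.io.InputStream, java.nio.file.Files and java.nio.file.Paths imports:

try (InputStream stream = Files.newInputStream(Paths.get("/path/to/word2vec-model.zip"));
    Word2VecModelReader modelReader =
        new Dl4jModelReader("/path/to/word2vec-model.zip", stream)) {
  Word2VecModel model = modelReader.read();
  System.out.println("loaded " + model.size() + " vectors of dimension " + model.dimension());
}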