diff --git a/AuxIndexStructures.cpp b/AuxIndexStructures.cpp
deleted file mode 100644
index e4e573878f..0000000000
--- a/AuxIndexStructures.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include <cstring>
-
-#include "AuxIndexStructures.h"
-
-#include "FaissAssert.h"
-
-
-namespace faiss {
-
-
-/***********************************************************************
- * RangeSearchResult
- ***********************************************************************/
-
-RangeSearchResult::RangeSearchResult (idx_t nq, bool alloc_lims): nq (nq) {
-    if (alloc_lims) {
-        lims = new size_t [nq + 1];
-        memset (lims, 0, sizeof(*lims) * (nq + 1));
-    } else {
-        lims = nullptr;
-    }
-    labels = nullptr;
-    distances = nullptr;
-    buffer_size = 1024 * 256;
-}
-
-/// called when lims contains the nb of elements result entries
-/// for each query
-void RangeSearchResult::do_allocation () {
-    size_t ofs = 0;
-    for (int i = 0; i < nq; i++) {
-        size_t n = lims[i];
-        lims [i] = ofs;
-        ofs += n;
-    }
-    lims [nq] = ofs;
-    labels = new idx_t [ofs];
-    distances = new float [ofs];
-}
-
-RangeSearchResult::~RangeSearchResult () {
-    delete [] labels;
-    delete [] distances;
-    delete [] lims;
-}
-
-
-
-
-
-/***********************************************************************
- * BufferList
- ***********************************************************************/
-
-
-BufferList::BufferList (size_t buffer_size):
-    buffer_size (buffer_size)
-{
-    wp = buffer_size;
-}
-
-BufferList::~BufferList ()
-{
-    for (int i = 0; i < buffers.size(); i++) {
-        delete [] buffers[i].ids;
-        delete [] buffers[i].dis;
-    }
-}
-
-void BufferList::add (idx_t id, float dis) {
-    if (wp == buffer_size) { // need new buffer
-        append_buffer();
-    }
-    Buffer & buf = buffers.back();
-    buf.ids [wp] = id;
-    buf.dis [wp] = dis;
-    wp++;
-}
-
-
-void BufferList::append_buffer ()
-{
-    Buffer buf = {new idx_t [buffer_size], new float [buffer_size]};
-    buffers.push_back (buf);
-    wp = 0;
-}
-
-/// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
-/// tables dest_ids, dest_dis
-void BufferList::copy_range (size_t ofs, size_t n,
-                             idx_t * dest_ids, float *dest_dis)
-{
-    size_t bno = ofs / buffer_size;
-    ofs -= bno * buffer_size;
-    while (n > 0) {
-        size_t ncopy = ofs + n < buffer_size ? n : buffer_size - ofs;
-        Buffer buf = buffers [bno];
-        memcpy (dest_ids, buf.ids + ofs, ncopy * sizeof(*dest_ids));
-        memcpy (dest_dis, buf.dis + ofs, ncopy * sizeof(*dest_dis));
-        dest_ids += ncopy;
-        dest_dis += ncopy;
-        ofs = 0;
-        bno ++;
-        n -= ncopy;
-    }
-}
-
-
-/***********************************************************************
- * RangeSearchPartialResult
- ***********************************************************************/
-
-void RangeQueryResult::add (float dis, idx_t id) {
-    nres++;
-    pres->add (id, dis);
-}
-
-
-
-RangeSearchPartialResult::RangeSearchPartialResult (RangeSearchResult * res_in):
-    BufferList(res_in->buffer_size),
-    res(res_in)
-{}
-
-
-/// begin a new result
-RangeQueryResult &
-    RangeSearchPartialResult::new_result (idx_t qno)
-{
-    RangeQueryResult qres = {qno, 0, this};
-    queries.push_back (qres);
-    return queries.back();
-}
-
-
-void RangeSearchPartialResult::finalize ()
-{
-    set_lims ();
-#pragma omp barrier
-
-#pragma omp single
-    res->do_allocation ();
-
-#pragma omp barrier
-    copy_result ();
-}
-
-
-/// called by range_search before do_allocation
-void RangeSearchPartialResult::set_lims ()
-{
-    for (int i = 0; i < queries.size(); i++) {
-        RangeQueryResult & qres = queries[i];
-        res->lims[qres.qno] = qres.nres;
-    }
-}
-
-/// called by range_search after do_allocation
-void RangeSearchPartialResult::copy_result (bool incremental)
-{
-    size_t ofs = 0;
-    for (int i = 0; i < queries.size(); i++) {
-        RangeQueryResult & qres = queries[i];
-
-        copy_range (ofs, qres.nres,
-                    res->labels + res->lims[qres.qno],
-                    res->distances + res->lims[qres.qno]);
-        if (incremental) {
-            res->lims[qres.qno] += qres.nres;
-        }
-        ofs += qres.nres;
-    }
-}
-
-void RangeSearchPartialResult::merge (std::vector <RangeSearchPartialResult *> &
-                                      partial_results, bool do_delete)
-{
-
-    int npres = partial_results.size();
-    if (npres == 0) return;
-    RangeSearchResult *result = partial_results[0]->res;
-    size_t nx = result->nq;
-
-    // count
-    for (const RangeSearchPartialResult * pres : partial_results) {
-        if (!pres) continue;
-        for (const RangeQueryResult &qres : pres->queries) {
-            result->lims[qres.qno] += qres.nres;
-        }
-    }
-    result->do_allocation ();
-    for (int j = 0; j < npres; j++) {
-        if (!partial_results[j]) continue;
-        partial_results[j]->copy_result (true);
-        if (do_delete) {
-            delete partial_results[j];
-            partial_results[j] = nullptr;
-        }
-    }
-
-    // reset the limits
-    for (size_t i = nx; i > 0; i--) {
-        result->lims [i] = result->lims [i - 1];
-    }
-    result->lims [0] = 0;
-}
-
-/***********************************************************************
- * IDSelectorRange
- ***********************************************************************/
-
-IDSelectorRange::IDSelectorRange (idx_t imin, idx_t imax):
-    imin (imin), imax (imax)
-{
-}
-
-bool IDSelectorRange::is_member (idx_t id) const
-{
-    return id >= imin && id < imax;
-}
-
-
-/***********************************************************************
- * IDSelectorBatch
- ***********************************************************************/
-
-IDSelectorBatch::IDSelectorBatch (size_t n, const idx_t *indices)
-{
-    nbits = 0;
-    while (n > (1L << nbits)) nbits++;
-    nbits += 5;
-    // for n = 1M, nbits = 25 is optimal, see P56659518
-
-    mask = (1L << nbits) - 1;
-    bloom.resize (1UL << (nbits - 3), 0);
-    for (long i = 0; i < n; i++) {
-        Index::idx_t id = indices[i];
-        set.insert(id);
-        id &= mask;
-        bloom[id >> 3] |= 1 << (id & 7);
-    }
-}
-
-bool IDSelectorBatch::is_member (idx_t i) const
-{
-    long im = i & mask;
-    if(!(bloom[im>>3] & (1 << (im & 7)))) {
-        return 0;
-    }
-    return set.count(i);
-}
-
-
-/***********************************************************************
- * IO functions
- ***********************************************************************/
-
-
-int IOReader::fileno ()
-{
-    FAISS_THROW_MSG ("IOReader does not support memory mapping");
-}
-
-int IOWriter::fileno ()
-{
-    FAISS_THROW_MSG ("IOWriter does not support memory mapping");
-}
-
-
-size_t VectorIOWriter::operator()(
-                const void *ptr, size_t size, size_t nitems)
-{
-    size_t o = data.size();
-    data.resize(o + size * nitems);
-    memcpy (&data[o], ptr, size * nitems);
-    return nitems;
-}
-
-size_t VectorIOReader::operator()(
-                  void *ptr, size_t size, size_t nitems)
-{
-    if (rp >= data.size()) return 0;
-    size_t nremain = (data.size() - rp) / size;
-    if (nremain < nitems) nitems = nremain;
-    memcpy (ptr, &data[rp], size * nitems);
-    rp += size * nitems;
-    return nitems;
-}
-
-
-/***********************************************************
- * Interrupt callback
- ***********************************************************/
-
-
-std::unique_ptr<InterruptCallback> InterruptCallback::instance;
-
-std::mutex InterruptCallback::lock;
-
-void InterruptCallback::clear_instance () {
-    delete instance.release ();
-}
-
-void InterruptCallback::check () {
-    if (!instance.get()) {
-        return;
-    }
-    if (instance->want_interrupt ()) {
-        FAISS_THROW_MSG ("computation interrupted");
-    }
-}
-
-bool InterruptCallback::is_interrupted () {
-    if (!instance.get()) {
-        return false;
-    }
-    std::lock_guard<std::mutex> guard(lock);
-    return instance->want_interrupt();
-}
-
-
-size_t InterruptCallback::get_period_hint (size_t flops) {
-    if (!instance.get()) {
-        return 1L << 30; // never check
-    }
-    // for 10M flops, it is reasonable to check once every 10 iterations
-    return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1);
-}
-
-
-
-
-} // namespace faiss
diff --git a/AuxIndexStructures.h b/AuxIndexStructures.h
deleted file mode 100644
index 37056729b2..0000000000
--- a/AuxIndexStructures.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-// Auxiliary index structures, that are used in indexes but that can
-// be forward-declared
-
-#ifndef FAISS_AUX_INDEX_STRUCTURES_H
-#define FAISS_AUX_INDEX_STRUCTURES_H
-
-#include <stdint.h>
-
-#include <vector>
-#include <unordered_set>
-#include <memory>
-#include <mutex>
-
-#include "Index.h"
-
-namespace faiss {
-
-/** The objective is to have a simple result structure while
- *  minimizing the number of mem copies in the result. The method
- *  do_allocation can be overloaded to allocate the result tables in
- *  the matrix type of a scripting language like Lua or Python. */
-struct RangeSearchResult {
-    size_t nq;      ///< nb of queries
-    size_t *lims;   ///< size (nq + 1)
-
-    typedef Index::idx_t idx_t;
-
-    idx_t *labels;     ///< result for query i is labels[lims[i]:lims[i+1]]
-    float *distances;  ///< corresponding distances (not sorted)
-
-    size_t buffer_size; ///< size of the result buffers used
-
-    /// lims must be allocated on input to range_search.
-    explicit RangeSearchResult (idx_t nq, bool alloc_lims=true);
-
-    /// called when lims contains the nb of elements result entries
-    /// for each query
-    virtual void do_allocation ();
-
-    virtual ~RangeSearchResult ();
-};
-
-
-/** Encapsulates a set of ids to remove. */
-struct IDSelector {
-    typedef Index::idx_t idx_t;
-    virtual bool is_member (idx_t id) const = 0;
-    virtual ~IDSelector() {}
-};
-
-
-
-/** remove ids between [imni, imax) */
-struct IDSelectorRange: IDSelector {
-    idx_t imin, imax;
-
-    IDSelectorRange (idx_t imin, idx_t imax);
-    bool is_member(idx_t id) const override;
-    ~IDSelectorRange() override {}
-};
-
-
-/** Remove ids from a set. Repetitions of ids in the indices set
- * passed to the constructor does not hurt performance. The hash
- * function used for the bloom filter and GCC's implementation of
- * unordered_set are just the least significant bits of the id. This
- * works fine for random ids or ids in sequences but will produce many
- * hash collisions if lsb's are always the same */
-struct IDSelectorBatch: IDSelector {
-
-    std::unordered_set<idx_t> set;
-
-    typedef unsigned char uint8_t;
-    std::vector<uint8_t> bloom; // assumes low bits of id are a good hash value
-    int nbits;
-    idx_t mask;
-
-    IDSelectorBatch (size_t n, const idx_t *indices);
-    bool is_member(idx_t id) const override;
-    ~IDSelectorBatch() override {}
-};
-
-/****************************************************************
- * Result structures for range search.
- *
- * The main constraint here is that we want to support parallel
- * queries from different threads in various ways: 1 thread per query,
- * several threads per query. We store the actual results in blocks of
- * fixed size rather than exponentially increasing memory. At the end,
- * we copy the block content to a linear result array.
- *****************************************************************/
-
-/** List of temporary buffers used to store results before they are
- *  copied to the RangeSearchResult object. */
-struct BufferList {
-    typedef Index::idx_t idx_t;
-
-    // buffer sizes in # entries
-    size_t buffer_size;
-
-    struct Buffer {
-        idx_t *ids;
-        float *dis;
-    };
-
-    std::vector<Buffer> buffers;
-    size_t wp; ///< write pointer in the last buffer.
-
-    explicit BufferList (size_t buffer_size);
-
-    ~BufferList ();
-
-    /// create a new buffer
-    void append_buffer ();
-
-    /// add one result, possibly appending a new buffer if needed
-    void add (idx_t id, float dis);
-
-    /// copy elemnts ofs:ofs+n-1 seen as linear data in the buffers to
-    /// tables dest_ids, dest_dis
-    void copy_range (size_t ofs, size_t n,
-                     idx_t * dest_ids, float *dest_dis);
-
-};
-
-struct RangeSearchPartialResult;
-
-/// result structure for a single query
-struct RangeQueryResult {
-    using idx_t = Index::idx_t;
-    idx_t qno;    //< id of the query
-    size_t nres;  //< nb of results for this query
-    RangeSearchPartialResult * pres;
-
-    /// called by search function to report a new result
-    void add (float dis, idx_t id);
-};
-
-/// the entries in the buffers are split per query
-struct RangeSearchPartialResult: BufferList {
-    RangeSearchResult * res;
-
-    /// eventually the result will be stored in res_in
-    explicit RangeSearchPartialResult (RangeSearchResult * res_in);
-
-    /// query ids + nb of results per query.
-    std::vector<RangeQueryResult> queries;
-
-    /// begin a new result
-    RangeQueryResult & new_result (idx_t qno);
-
-    /*****************************************
-     * functions used at the end of the search to merge the result
-     * lists */
-    void finalize ();
-
-    /// called by range_search before do_allocation
-    void set_lims ();
-
-    /// called by range_search after do_allocation
-    void copy_result (bool incremental = false);
-
-    /// merge a set of PartialResult's into one RangeSearchResult
-    /// on ouptut the partialresults are empty!
-    static void merge (std::vector <RangeSearchPartialResult *> &
-                       partial_results, bool do_delete=true);
-
-};
-
-/***********************************************************
- * Abstract I/O objects
- ***********************************************************/
-
-struct IOReader {
-    // name that can be used in error messages
-    std::string name;
-
-    // fread
-    virtual size_t operator()(
-         void *ptr, size_t size, size_t nitems) = 0;
-
-    // return a file number that can be memory-mapped
-    virtual int fileno ();
-
-    virtual ~IOReader() {}
-};
-
-struct IOWriter {
-    // name that can be used in error messages
-    std::string name;
-
-    // fwrite
-    virtual size_t operator()(
-         const void *ptr, size_t size, size_t nitems) = 0;
-
-    // return a file number that can be memory-mapped
-    virtual int fileno ();
-
-    virtual ~IOWriter() {}
-};
-
-
-struct VectorIOReader:IOReader {
-    std::vector<uint8_t> data;
-    size_t rp = 0;
-    size_t operator()(void *ptr, size_t size, size_t nitems) override;
-};
-
-struct VectorIOWriter:IOWriter {
-    std::vector<uint8_t> data;
-    size_t operator()(const void *ptr, size_t size, size_t nitems) override;
-};
-
-/***********************************************************
- * The distance computer maintains a current query and computes
- * distances to elements in an index that supports random access.
- *
- * The DistanceComputer is not intended to be thread-safe (eg. because
- * it maintains counters) so the distance functions are not const,
- * instanciate one from each thread if needed.
- ***********************************************************/
-struct DistanceComputer {
-     using idx_t = Index::idx_t;
-
-     /// called before computing distances
-     virtual void set_query(const float *x) = 0;
-
-     /// compute distance of vector i to current query
-     virtual float operator () (idx_t i) = 0;
-
-     /// compute distance between two stored vectors
-     virtual float symmetric_dis (idx_t i, idx_t j) = 0;
-
-     virtual ~DistanceComputer() {}
-};
-
-/***********************************************************
- * Interrupt callback
- ***********************************************************/
-
-struct InterruptCallback {
-    virtual bool want_interrupt () = 0;
-    virtual ~InterruptCallback() {}
-
-    // lock that protects concurrent calls to is_interrupted
-    static std::mutex lock;
-
-    static std::unique_ptr<InterruptCallback> instance;
-
-    static void clear_instance ();
-
-    /** check if:
-     * - an interrupt callback is set
-     * - the callback retuns true
-     * if this is the case, then throw an exception. Should not be called
-     * from multiple threds.
-     */
-    static void check ();
-
-    /// same as check() but return true if is interrupted instead of
-    /// throwing. Can be called from multiple threads.
-    static bool is_interrupted ();
-
-    /** assuming each iteration takes a certain number of flops, what
-     * is a reasonable interval to check for interrupts?
-     */
-    static size_t get_period_hint (size_t flops);
-
-};
-
-
-
-}; // namespace faiss
-
-
-
-#endif
diff --git a/FaissAssert.h b/FaissAssert.h
deleted file mode 100644
index 64a0eafc9a..0000000000
--- a/FaissAssert.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#ifndef FAISS_ASSERT_INCLUDED
-#define FAISS_ASSERT_INCLUDED
-
-#include "FaissException.h"
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-
-///
-/// Assertions
-///
-
-#define FAISS_ASSERT(X)                                                 \
-  do {                                                                  \
-    if (! (X)) {                                                        \
-      fprintf(stderr, "Faiss assertion '%s' failed in %s "              \
-               "at %s:%d\n",                                            \
-               #X, __PRETTY_FUNCTION__, __FILE__, __LINE__);            \
-      abort();                                                          \
-    }                                                                   \
-  } while (false)
-
-#define FAISS_ASSERT_MSG(X, MSG)                                        \
-  do {                                                                  \
-    if (! (X)) {                                                        \
-      fprintf(stderr, "Faiss assertion '%s' failed in %s "              \
-               "at %s:%d; details: " MSG "\n",                          \
-               #X, __PRETTY_FUNCTION__, __FILE__, __LINE__);            \
-      abort();                                                          \
-    }                                                                   \
-  } while (false)
-
-#define FAISS_ASSERT_FMT(X, FMT, ...)                                   \
-  do {                                                                  \
-    if (! (X)) {                                                        \
-      fprintf(stderr, "Faiss assertion '%s' failed in %s "              \
-               "at %s:%d; details: " FMT "\n",                          \
-               #X, __PRETTY_FUNCTION__, __FILE__, __LINE__, __VA_ARGS__); \
-      abort();                                                          \
-    }                                                                   \
-  } while (false)
-
-///
-/// Exceptions for returning user errors
-///
-
-#define FAISS_THROW_MSG(MSG)                                            \
-  do {                                                                  \
-    throw faiss::FaissException(MSG, __PRETTY_FUNCTION__, __FILE__, __LINE__); \
-  } while (false)
-
-#define FAISS_THROW_FMT(FMT, ...)                                       \
-  do {                                                                  \
-    std::string __s;                                                    \
-    int __size = snprintf(nullptr, 0, FMT, __VA_ARGS__);                \
-    __s.resize(__size + 1);                                             \
-    snprintf(&__s[0], __s.size(), FMT, __VA_ARGS__);                    \
-    throw faiss::FaissException(__s, __PRETTY_FUNCTION__, __FILE__, __LINE__); \
-  } while (false)
-
-///
-/// Exceptions thrown upon a conditional failure
-///
-
-#define FAISS_THROW_IF_NOT(X)                           \
-  do {                                                  \
-    if (!(X)) {                                         \
-      FAISS_THROW_FMT("Error: '%s' failed", #X);        \
-    }                                                   \
-  } while (false)
-
-#define FAISS_THROW_IF_NOT_MSG(X, MSG)                  \
-  do {                                                  \
-    if (!(X)) {                                         \
-      FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X);  \
-    }                                                   \
-  } while (false)
-
-#define FAISS_THROW_IF_NOT_FMT(X, FMT, ...)                             \
-  do {                                                                  \
-    if (!(X)) {                                                         \
-      FAISS_THROW_FMT("Error: '%s' failed: " FMT, #X, __VA_ARGS__);     \
-    }                                                                   \
-  } while (false)
-
-#endif
diff --git a/FaissException.cpp b/FaissException.cpp
deleted file mode 100644
index ce3de0fc15..0000000000
--- a/FaissException.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "FaissException.h"
-#include <sstream>
-
-namespace faiss {
-
-FaissException::FaissException(const std::string& m)
-    : msg(m) {
-}
-
-FaissException::FaissException(const std::string& m,
-                               const char* funcName,
-                               const char* file,
-                               int line) {
-  int size = snprintf(nullptr, 0, "Error in %s at %s:%d: %s",
-                      funcName, file, line, m.c_str());
-  msg.resize(size + 1);
-  snprintf(&msg[0], msg.size(), "Error in %s at %s:%d: %s",
-           funcName, file, line, m.c_str());
-}
-
-const char*
-FaissException::what() const noexcept {
-  return msg.c_str();
-}
-
-void handleExceptions(
-  std::vector<std::pair<int, std::exception_ptr>>& exceptions) {
-  if (exceptions.size() == 1) {
-    // throw the single received exception directly
-    std::rethrow_exception(exceptions.front().second);
-
-  } else if (exceptions.size() > 1) {
-    // multiple exceptions; aggregate them and return a single exception
-    std::stringstream ss;
-
-    for (auto& p : exceptions) {
-      try {
-        std::rethrow_exception(p.second);
-      } catch (std::exception& ex) {
-        if (ex.what()) {
-          // exception message available
-          ss << "Exception thrown from index " << p.first << ": "
-             << ex.what() << "\n";
-        } else {
-          // No message available
-          ss << "Unknown exception thrown from index " << p.first << "\n";
-        }
-      } catch (...) {
-        ss << "Unknown exception thrown from index " << p.first << "\n";
-      }
-    }
-
-    throw FaissException(ss.str());
-  }
-}
-
-}
diff --git a/FaissException.h b/FaissException.h
deleted file mode 100644
index 9d54edbad5..0000000000
--- a/FaissException.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#ifndef FAISS_EXCEPTION_INCLUDED
-#define FAISS_EXCEPTION_INCLUDED
-
-#include <exception>
-#include <string>
-#include <vector>
-#include <utility>
-
-namespace faiss {
-
-/// Base class for Faiss exceptions
-class FaissException : public std::exception {
- public:
-  explicit FaissException(const std::string& msg);
-
-  FaissException(const std::string& msg,
-                 const char* funcName,
-                 const char* file,
-                 int line);
-
-  /// from std::exception
-  const char* what() const noexcept override;
-
-  std::string msg;
-};
-
-/// Handle multiple exceptions from worker threads, throwing an appropriate
-/// exception that aggregates the information
-/// The pair int is the thread that generated the exception
-void
-handleExceptions(std::vector<std::pair<int, std::exception_ptr>>& exceptions);
-
-/** bare-bones unique_ptr
- * this one deletes with delete [] */
-template<class T>
-struct ScopeDeleter {
-    const T * ptr;
-    explicit ScopeDeleter (const T* ptr = nullptr): ptr (ptr) {}
-    void release () {ptr = nullptr; }
-    void set (const T * ptr_in) { ptr = ptr_in; }
-    void swap (ScopeDeleter<T> &other) {std::swap (ptr, other.ptr); }
-    ~ScopeDeleter () {
-        delete [] ptr;
-    }
-};
-
-/** same but deletes with the simple delete (least common case) */
-template<class T>
-struct ScopeDeleter1 {
-    const T * ptr;
-    explicit ScopeDeleter1 (const T* ptr = nullptr): ptr (ptr) {}
-    void release () {ptr = nullptr; }
-    void set (const T * ptr_in) { ptr = ptr_in; }
-    void swap (ScopeDeleter1<T> &other) {std::swap (ptr, other.ptr); }
-    ~ScopeDeleter1 () {
-        delete ptr;
-    }
-};
-
-}
-
-#endif
diff --git a/HNSW.cpp b/HNSW.cpp
deleted file mode 100644
index 28ccdcbe44..0000000000
--- a/HNSW.cpp
+++ /dev/null
@@ -1,815 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "HNSW.h"
-#include "AuxIndexStructures.h"
-
-namespace faiss {
-
-using idx_t = Index::idx_t;
-
-/**************************************************************
- * HNSW structure implementation
- **************************************************************/
-
-int HNSW::nb_neighbors(int layer_no) const
-{
-  return cum_nneighbor_per_level[layer_no + 1] -
-    cum_nneighbor_per_level[layer_no];
-}
-
-void HNSW::set_nb_neighbors(int level_no, int n)
-{
-  FAISS_THROW_IF_NOT(levels.size() == 0);
-  int cur_n = nb_neighbors(level_no);
-  for (int i = level_no + 1; i < cum_nneighbor_per_level.size(); i++) {
-    cum_nneighbor_per_level[i] += n - cur_n;
-  }
-}
-
-int HNSW::cum_nb_neighbors(int layer_no) const
-{
-  return cum_nneighbor_per_level[layer_no];
-}
-
-void HNSW::neighbor_range(idx_t no, int layer_no,
-                          size_t * begin, size_t * end) const
-{
-  size_t o = offsets[no];
-  *begin = o + cum_nb_neighbors(layer_no);
-  *end = o + cum_nb_neighbors(layer_no + 1);
-}
-
-
-
-HNSW::HNSW(int M) : rng(12345) {
-  set_default_probas(M, 1.0 / log(M));
-  max_level = -1;
-  entry_point = -1;
-  efSearch = 16;
-  efConstruction = 40;
-  upper_beam = 1;
-  offsets.push_back(0);
-}
-
-
-int HNSW::random_level()
-{
-  double f = rng.rand_float();
-  // could be a bit faster with bissection
-  for (int level = 0; level < assign_probas.size(); level++) {
-    if (f < assign_probas[level]) {
-      return level;
-    }
-    f -= assign_probas[level];
-  }
-  // happens with exponentially low probability
-  return assign_probas.size() - 1;
-}
-
-void HNSW::set_default_probas(int M, float levelMult)
-{
-  int nn = 0;
-  cum_nneighbor_per_level.push_back (0);
-  for (int level = 0; ;level++) {
-    float proba = exp(-level / levelMult) * (1 - exp(-1 / levelMult));
-    if (proba < 1e-9) break;
-    assign_probas.push_back(proba);
-    nn += level == 0 ? M * 2 : M;
-    cum_nneighbor_per_level.push_back (nn);
-  }
-}
-
-void HNSW::clear_neighbor_tables(int level)
-{
-  for (int i = 0; i < levels.size(); i++) {
-    size_t begin, end;
-    neighbor_range(i, level, &begin, &end);
-    for (size_t j = begin; j < end; j++) {
-      neighbors[j] = -1;
-    }
-  }
-}
-
-
-void HNSW::reset() {
-  max_level = -1;
-  entry_point = -1;
-  offsets.clear();
-  offsets.push_back(0);
-  levels.clear();
-  neighbors.clear();
-}
-
-
-
-void HNSW::print_neighbor_stats(int level) const
-{
-  FAISS_THROW_IF_NOT (level < cum_nneighbor_per_level.size());
-  printf("stats on level %d, max %d neighbors per vertex:\n",
-         level, nb_neighbors(level));
-  size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0;
-#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \
-  reduction(+: tot_reciprocal) reduction(+: n_node)
-  for (int i = 0; i < levels.size(); i++) {
-    if (levels[i] > level) {
-      n_node++;
-      size_t begin, end;
-      neighbor_range(i, level, &begin, &end);
-      std::unordered_set<int> neighset;
-      for (size_t j = begin; j < end; j++) {
-        if (neighbors [j] < 0) break;
-        neighset.insert(neighbors[j]);
-      }
-      int n_neigh = neighset.size();
-      int n_common = 0;
-      int n_reciprocal = 0;
-      for (size_t j = begin; j < end; j++) {
-        storage_idx_t i2 = neighbors[j];
-        if (i2 < 0) break;
-        FAISS_ASSERT(i2 != i);
-        size_t begin2, end2;
-        neighbor_range(i2, level, &begin2, &end2);
-        for (size_t j2 = begin2; j2 < end2; j2++) {
-          storage_idx_t i3 = neighbors[j2];
-          if (i3 < 0) break;
-          if (i3 == i) {
-            n_reciprocal++;
-            continue;
-          }
-          if (neighset.count(i3)) {
-            neighset.erase(i3);
-            n_common++;
-          }
-        }
-      }
-      tot_neigh += n_neigh;
-      tot_common += n_common;
-      tot_reciprocal += n_reciprocal;
-    }
-  }
-  float normalizer = n_node;
-  printf("   nb of nodes at that level %ld\n", n_node);
-  printf("   neighbors per node: %.2f (%ld)\n",
-         tot_neigh / normalizer, tot_neigh);
-  printf("   nb of reciprocal neighbors: %.2f\n", tot_reciprocal / normalizer);
-  printf("   nb of neighbors that are also neighbor-of-neighbors: %.2f (%ld)\n",
-         tot_common / normalizer, tot_common);
-
-
-
-}
-
-
-void HNSW::fill_with_random_links(size_t n)
-{
-  int max_level = prepare_level_tab(n);
-  RandomGenerator rng2(456);
-
-  for (int level = max_level - 1; level >= 0; --level) {
-    std::vector<int> elts;
-    for (int i = 0; i < n; i++) {
-      if (levels[i] > level) {
-        elts.push_back(i);
-      }
-    }
-    printf ("linking %ld elements in level %d\n",
-            elts.size(), level);
-
-    if (elts.size() == 1) continue;
-
-    for (int ii = 0; ii < elts.size(); ii++) {
-      int i = elts[ii];
-      size_t begin, end;
-      neighbor_range(i, 0, &begin, &end);
-      for (size_t j = begin; j < end; j++) {
-        int other = 0;
-        do {
-          other = elts[rng2.rand_int(elts.size())];
-        } while(other == i);
-
-        neighbors[j] = other;
-      }
-    }
-  }
-}
-
-
-int HNSW::prepare_level_tab(size_t n, bool preset_levels)
-{
-  size_t n0 = offsets.size() - 1;
-
-  if (preset_levels) {
-    FAISS_ASSERT (n0 + n == levels.size());
-  } else {
-    FAISS_ASSERT (n0 == levels.size());
-    for (int i = 0; i < n; i++) {
-      int pt_level = random_level();
-      levels.push_back(pt_level + 1);
-    }
-  }
-
-  int max_level = 0;
-  for (int i = 0; i < n; i++) {
-    int pt_level = levels[i + n0] - 1;
-    if (pt_level > max_level) max_level = pt_level;
-    offsets.push_back(offsets.back() +
-                      cum_nb_neighbors(pt_level + 1));
-    neighbors.resize(offsets.back(), -1);
-  }
-
-  return max_level;
-}
-
-
-/** Enumerate vertices from farthest to nearest from query, keep a
- * neighbor only if there is no previous neighbor that is closer to
- * that vertex than the query.
- */
-void HNSW::shrink_neighbor_list(
-  DistanceComputer& qdis,
-  std::priority_queue<NodeDistFarther>& input,
-  std::vector<NodeDistFarther>& output,
-  int max_size)
-{
-  while (input.size() > 0) {
-    NodeDistFarther v1 = input.top();
-    input.pop();
-    float dist_v1_q = v1.d;
-
-    bool good = true;
-    for (NodeDistFarther v2 : output) {
-      float dist_v1_v2 = qdis.symmetric_dis(v2.id, v1.id);
-
-      if (dist_v1_v2 < dist_v1_q) {
-        good = false;
-        break;
-      }
-    }
-
-    if (good) {
-      output.push_back(v1);
-      if (output.size() >= max_size) {
-        return;
-      }
-    }
-  }
-}
-
-
-namespace {
-
-
-using storage_idx_t = HNSW::storage_idx_t;
-using NodeDistCloser = HNSW::NodeDistCloser;
-using NodeDistFarther = HNSW::NodeDistFarther;
-
-
-/**************************************************************
- * Addition subroutines
- **************************************************************/
-
-
-/// remove neighbors from the list to make it smaller than max_size
-void shrink_neighbor_list(
-  DistanceComputer& qdis,
-  std::priority_queue<NodeDistCloser>& resultSet1,
-  int max_size)
-{
-    if (resultSet1.size() < max_size) {
-        return;
-    }
-    std::priority_queue<NodeDistFarther> resultSet;
-    std::vector<NodeDistFarther> returnlist;
-
-    while (resultSet1.size() > 0) {
-        resultSet.emplace(resultSet1.top().d, resultSet1.top().id);
-        resultSet1.pop();
-    }
-
-    HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size);
-
-    for (NodeDistFarther curen2 : returnlist) {
-        resultSet1.emplace(curen2.d, curen2.id);
-    }
-
-}
-
-
-/// add a link between two elements, possibly shrinking the list
-/// of links to make room for it.
-void add_link(HNSW& hnsw,
-              DistanceComputer& qdis,
-              storage_idx_t src, storage_idx_t dest,
-              int level)
-{
-  size_t begin, end;
-  hnsw.neighbor_range(src, level, &begin, &end);
-  if (hnsw.neighbors[end - 1] == -1) {
-    // there is enough room, find a slot to add it
-    size_t i = end;
-    while(i > begin) {
-      if (hnsw.neighbors[i - 1] != -1) break;
-      i--;
-    }
-    hnsw.neighbors[i] = dest;
-    return;
-  }
-
-  // otherwise we let them fight out which to keep
-
-  // copy to resultSet...
-  std::priority_queue<NodeDistCloser> resultSet;
-  resultSet.emplace(qdis.symmetric_dis(src, dest), dest);
-  for (size_t i = begin; i < end; i++) { // HERE WAS THE BUG
-    storage_idx_t neigh = hnsw.neighbors[i];
-    resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
-  }
-
-  shrink_neighbor_list(qdis, resultSet, end - begin);
-
-  // ...and back
-  size_t i = begin;
-  while (resultSet.size()) {
-    hnsw.neighbors[i++] = resultSet.top().id;
-    resultSet.pop();
-  }
-  // they may have shrunk more than just by 1 element
-  while(i < end) {
-    hnsw.neighbors[i++] = -1;
-  }
-}
-
-/// search neighbors on a single level, starting from an entry point
-void search_neighbors_to_add(
-  HNSW& hnsw,
-  DistanceComputer& qdis,
-  std::priority_queue<NodeDistCloser>& results,
-  int entry_point,
-  float d_entry_point,
-  int level,
-  VisitedTable &vt)
-{
-  // top is nearest candidate
-  std::priority_queue<NodeDistFarther> candidates;
-
-  NodeDistFarther ev(d_entry_point, entry_point);
-  candidates.push(ev);
-  results.emplace(d_entry_point, entry_point);
-  vt.set(entry_point);
-
-  while (!candidates.empty()) {
-    // get nearest
-    const NodeDistFarther &currEv = candidates.top();
-
-    if (currEv.d > results.top().d) {
-      break;
-    }
-    int currNode = currEv.id;
-    candidates.pop();
-
-    // loop over neighbors
-    size_t begin, end;
-    hnsw.neighbor_range(currNode, level, &begin, &end);
-    for(size_t i = begin; i < end; i++) {
-      storage_idx_t nodeId = hnsw.neighbors[i];
-      if (nodeId < 0) break;
-      if (vt.get(nodeId)) continue;
-      vt.set(nodeId);
-
-      float dis = qdis(nodeId);
-      NodeDistFarther evE1(dis, nodeId);
-
-      if (results.size() < hnsw.efConstruction ||
-          results.top().d > dis) {
-
-        results.emplace(dis, nodeId);
-        candidates.emplace(dis, nodeId);
-        if (results.size() > hnsw.efConstruction) {
-          results.pop();
-        }
-      }
-    }
-  }
-  vt.advance();
-}
-
-
-/**************************************************************
- * Searching subroutines
- **************************************************************/
-
-/// greedily update a nearest vector at a given level
-void greedy_update_nearest(const HNSW& hnsw,
-                           DistanceComputer& qdis,
-                           int level,
-                           storage_idx_t& nearest,
-                           float& d_nearest)
-{
-  for(;;) {
-    storage_idx_t prev_nearest = nearest;
-
-    size_t begin, end;
-    hnsw.neighbor_range(nearest, level, &begin, &end);
-    for(size_t i = begin; i < end; i++) {
-      storage_idx_t v = hnsw.neighbors[i];
-      if (v < 0) break;
-      float dis = qdis(v);
-      if (dis < d_nearest) {
-        nearest = v;
-        d_nearest = dis;
-      }
-    }
-    if (nearest == prev_nearest) {
-      return;
-    }
-  }
-}
-
-
-}  // namespace
-
-
-/// Finds neighbors and builds links with them, starting from an entry
-/// point. The own neighbor list is assumed to be locked.
-void HNSW::add_links_starting_from(DistanceComputer& ptdis,
-                                   storage_idx_t pt_id,
-                                   storage_idx_t nearest,
-                                   float d_nearest,
-                                   int level,
-                                   omp_lock_t *locks,
-                                   VisitedTable &vt)
-{
-  std::priority_queue<NodeDistCloser> link_targets;
-
-  search_neighbors_to_add(*this, ptdis, link_targets, nearest, d_nearest,
-                          level, vt);
-
-  // but we can afford only this many neighbors
-  int M = nb_neighbors(level);
-
-  ::faiss::shrink_neighbor_list(ptdis, link_targets, M);
-
-  while (!link_targets.empty()) {
-    int other_id = link_targets.top().id;
-
-    omp_set_lock(&locks[other_id]);
-    add_link(*this, ptdis, other_id, pt_id, level);
-    omp_unset_lock(&locks[other_id]);
-
-    add_link(*this, ptdis, pt_id, other_id, level);
-
-    link_targets.pop();
-  }
-}
-
-
-/**************************************************************
- * Building, parallel
- **************************************************************/
-
-void HNSW::add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
-                          std::vector<omp_lock_t>& locks,
-                          VisitedTable& vt)
-{
-  //  greedy search on upper levels
-
-  storage_idx_t nearest;
-#pragma omp critical
-  {
-    nearest = entry_point;
-
-    if (nearest == -1) {
-      max_level = pt_level;
-      entry_point = pt_id;
-    }
-  }
-
-  if (nearest < 0) {
-    return;
-  }
-
-  omp_set_lock(&locks[pt_id]);
-
-  int level = max_level; // level at which we start adding neighbors
-  float d_nearest = ptdis(nearest);
-
-  for(; level > pt_level; level--) {
-    greedy_update_nearest(*this, ptdis, level, nearest, d_nearest);
-  }
-
-  for(; level >= 0; level--) {
-    add_links_starting_from(ptdis, pt_id, nearest, d_nearest,
-                            level, locks.data(), vt);
-  }
-
-  omp_unset_lock(&locks[pt_id]);
-
-  if (pt_level > max_level) {
-    max_level = pt_level;
-    entry_point = pt_id;
-  }
-}
-
-
-/** Do a BFS on the candidates list */
-
-int HNSW::search_from_candidates(
-  DistanceComputer& qdis, int k,
-  idx_t *I, float *D,
-  MinimaxHeap& candidates,
-  VisitedTable& vt,
-  int level, int nres_in) const
-{
-  int nres = nres_in;
-  int ndis = 0;
-  for (int i = 0; i < candidates.size(); i++) {
-    idx_t v1 = candidates.ids[i];
-    float d = candidates.dis[i];
-    FAISS_ASSERT(v1 >= 0);
-    if (nres < k) {
-      faiss::maxheap_push(++nres, D, I, d, v1);
-    } else if (d < D[0]) {
-      faiss::maxheap_pop(nres--, D, I);
-      faiss::maxheap_push(++nres, D, I, d, v1);
-    }
-    vt.set(v1);
-  }
-
-  bool do_dis_check = check_relative_distance;
-  int nstep = 0;
-
-  while (candidates.size() > 0) {
-    float d0 = 0;
-    int v0 = candidates.pop_min(&d0);
-
-    if (do_dis_check) {
-      // tricky stopping condition: there are more that ef
-      // distances that are processed already that are smaller
-      // than d0
-
-      int n_dis_below = candidates.count_below(d0);
-      if(n_dis_below >= efSearch) {
-        break;
-      }
-    }
-
-    size_t begin, end;
-    neighbor_range(v0, level, &begin, &end);
-
-    for (size_t j = begin; j < end; j++) {
-      int v1 = neighbors[j];
-      if (v1 < 0) break;
-      if (vt.get(v1)) {
-        continue;
-      }
-      vt.set(v1);
-      ndis++;
-      float d = qdis(v1);
-      if (nres < k) {
-        faiss::maxheap_push(++nres, D, I, d, v1);
-      } else if (d < D[0]) {
-        faiss::maxheap_pop(nres--, D, I);
-        faiss::maxheap_push(++nres, D, I, d, v1);
-      }
-      candidates.push(v1, d);
-    }
-
-    nstep++;
-    if (!do_dis_check && nstep > efSearch) {
-      break;
-    }
-  }
-
-  if (level == 0) {
-#pragma omp critical
-    {
-      hnsw_stats.n1 ++;
-      if (candidates.size() == 0) {
-        hnsw_stats.n2 ++;
-      }
-      hnsw_stats.n3 += ndis;
-    }
-  }
-
-  return nres;
-}
-
-
-/**************************************************************
- * Searching
- **************************************************************/
-
-std::priority_queue<HNSW::Node> HNSW::search_from_candidate_unbounded(
-  const Node& node,
-  DistanceComputer& qdis,
-  int ef,
-  VisitedTable *vt) const
-{
-  int ndis = 0;
-  std::priority_queue<Node> top_candidates;
-  std::priority_queue<Node, std::vector<Node>, std::greater<Node>> candidates;
-
-  top_candidates.push(node);
-  candidates.push(node);
-
-  vt->set(node.second);
-
-  while (!candidates.empty()) {
-    float d0;
-    storage_idx_t v0;
-    std::tie(d0, v0) = candidates.top();
-
-    if (d0 > top_candidates.top().first) {
-      break;
-    }
-
-    candidates.pop();
-
-    size_t begin, end;
-    neighbor_range(v0, 0, &begin, &end);
-
-    for (size_t j = begin; j < end; ++j) {
-      int v1 = neighbors[j];
-
-      if (v1 < 0) {
-        break;
-      }
-      if (vt->get(v1)) {
-        continue;
-      }
-
-      vt->set(v1);
-
-      float d1 = qdis(v1);
-      ++ndis;
-
-      if (top_candidates.top().first > d1 || top_candidates.size() < ef) {
-        candidates.emplace(d1, v1);
-        top_candidates.emplace(d1, v1);
-
-        if (top_candidates.size() > ef) {
-          top_candidates.pop();
-        }
-      }
-    }
-  }
-
-#pragma omp critical
-  {
-    ++hnsw_stats.n1;
-    if (candidates.size() == 0) {
-      ++hnsw_stats.n2;
-    }
-    hnsw_stats.n3 += ndis;
-  }
-
-  return top_candidates;
-}
-
-void HNSW::search(DistanceComputer& qdis, int k,
-                  idx_t *I, float *D,
-                  VisitedTable& vt) const
-{
-  if (upper_beam == 1) {
-
-    //  greedy search on upper levels
-    storage_idx_t nearest = entry_point;
-    float d_nearest = qdis(nearest);
-
-    for(int level = max_level; level >= 1; level--) {
-      greedy_update_nearest(*this, qdis, level, nearest, d_nearest);
-    }
-
-    int ef = std::max(efSearch, k);
-    if (search_bounded_queue) {
-      MinimaxHeap candidates(ef);
-
-      candidates.push(nearest, d_nearest);
-
-      search_from_candidates(qdis, k, I, D, candidates, vt, 0);
-    } else {
-      std::priority_queue<Node> top_candidates =
-        search_from_candidate_unbounded(Node(d_nearest, nearest),
-                                        qdis, ef, &vt);
-
-      while (top_candidates.size() > k) {
-        top_candidates.pop();
-      }
-
-      int nres = 0;
-      while (!top_candidates.empty()) {
-        float d;
-        storage_idx_t label;
-        std::tie(d, label) = top_candidates.top();
-        faiss::maxheap_push(++nres, D, I, d, label);
-        top_candidates.pop();
-      }
-    }
-
-    vt.advance();
-
-  } else {
-    int candidates_size = upper_beam;
-    MinimaxHeap candidates(candidates_size);
-
-    std::vector<idx_t> I_to_next(candidates_size);
-    std::vector<float> D_to_next(candidates_size);
-
-    int nres = 1;
-    I_to_next[0] = entry_point;
-    D_to_next[0] = qdis(entry_point);
-
-    for(int level = max_level; level >= 0; level--) {
-
-      // copy I, D -> candidates
-
-      candidates.clear();
-
-      for (int i = 0; i < nres; i++) {
-        candidates.push(I_to_next[i], D_to_next[i]);
-      }
-
-      if (level == 0) {
-        nres = search_from_candidates(qdis, k, I, D, candidates, vt, 0);
-      } else  {
-        nres = search_from_candidates(
-          qdis, candidates_size,
-          I_to_next.data(), D_to_next.data(),
-          candidates, vt, level
-        );
-      }
-      vt.advance();
-    }
-  }
-}
-
-
-void HNSW::MinimaxHeap::push(storage_idx_t i, float v) {
-  if (k == n) {
-    if (v >= dis[0]) return;
-    faiss::heap_pop<HC> (k--, dis.data(), ids.data());
-    --nvalid;
-  }
-  faiss::heap_push<HC> (++k, dis.data(), ids.data(), v, i);
-  ++nvalid;
-}
-
-float HNSW::MinimaxHeap::max() const {
-  return dis[0];
-}
-
-int HNSW::MinimaxHeap::size() const {
-  return nvalid;
-}
-
-void HNSW::MinimaxHeap::clear() {
-  nvalid = k = 0;
-}
-
-int HNSW::MinimaxHeap::pop_min(float *vmin_out) {
-  assert(k > 0);
-  // returns min. This is an O(n) operation
-  int i = k - 1;
-  while (i >= 0) {
-    if (ids[i] != -1) break;
-    i--;
-  }
-  if (i == -1) return -1;
-  int imin = i;
-  float vmin = dis[i];
-  i--;
-  while(i >= 0) {
-    if (ids[i] != -1 && dis[i] < vmin) {
-      vmin = dis[i];
-      imin = i;
-    }
-    i--;
-  }
-  if (vmin_out) *vmin_out = vmin;
-  int ret = ids[imin];
-  ids[imin] = -1;
-  --nvalid;
-
-  return ret;
-}
-
-int HNSW::MinimaxHeap::count_below(float thresh) {
-  int n_below = 0;
-  for(int i = 0; i < k; i++) {
-    if (dis[i] < thresh) {
-      n_below++;
-    }
-  }
-
-  return n_below;
-}
-
-
-}  // namespace faiss
diff --git a/HNSW.h b/HNSW.h
deleted file mode 100644
index bb25006efd..0000000000
--- a/HNSW.h
+++ /dev/null
@@ -1,274 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#pragma once
-
-#include <vector>
-#include <unordered_set>
-#include <queue>
-
-#include <omp.h>
-
-#include "Index.h"
-#include "FaissAssert.h"
-#include "utils.h"
-
-
-namespace faiss {
-
-
-/** Implementation of the Hierarchical Navigable Small World
- * datastructure.
- *
- * Efficient and robust approximate nearest neighbor search using
- * Hierarchical Navigable Small World graphs
- *
- *  Yu. A. Malkov, D. A. Yashunin, arXiv 2017
- *
- * This implmentation is heavily influenced by the NMSlib
- * implementation by Yury Malkov and Leonid Boystov
- * (https://github.com/searchivarius/nmslib)
- *
- * The HNSW object stores only the neighbor link structure, see
- * IndexHNSW.h for the full index object.
- */
-
-
-struct VisitedTable;
-struct DistanceComputer; // from AuxIndexStructures
-
-struct HNSW {
-  /// internal storage of vectors (32 bits: this is expensive)
-  typedef int storage_idx_t;
-
-  /// Faiss results are 64-bit
-  typedef Index::idx_t idx_t;
-
-  typedef std::pair<float, storage_idx_t> Node;
-
-  /** Heap structure that allows fast
-   */
-  struct MinimaxHeap {
-    int n;
-    int k;
-    int nvalid;
-
-    std::vector<storage_idx_t> ids;
-    std::vector<float> dis;
-    typedef faiss::CMax<float, storage_idx_t> HC;
-
-    explicit MinimaxHeap(int n): n(n), k(0), nvalid(0), ids(n), dis(n) {}
-
-    void push(storage_idx_t i, float v);
-
-    float max() const;
-
-    int size() const;
-
-    void clear();
-
-    int pop_min(float *vmin_out = nullptr);
-
-    int count_below(float thresh);
-  };
-
-
-  /// to sort pairs of (id, distance) from nearest to fathest or the reverse
-  struct NodeDistCloser {
-    float d;
-    int id;
-    NodeDistCloser(float d, int id): d(d), id(id) {}
-    bool operator < (const NodeDistCloser &obj1) const { return d < obj1.d; }
-  };
-
-  struct NodeDistFarther {
-    float d;
-    int id;
-    NodeDistFarther(float d, int id): d(d), id(id) {}
-    bool operator < (const NodeDistFarther &obj1) const { return d > obj1.d; }
-  };
-
-
-  /// assignment probability to each layer (sum=1)
-  std::vector<double> assign_probas;
-
-  /// number of neighbors stored per layer (cumulative), should not
-  /// be changed after first add
-  std::vector<int> cum_nneighbor_per_level;
-
-  /// level of each vector (base level = 1), size = ntotal
-  std::vector<int> levels;
-
-  /// offsets[i] is the offset in the neighbors array where vector i is stored
-  /// size ntotal + 1
-  std::vector<size_t> offsets;
-
-  /// neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i
-  /// for all levels. this is where all storage goes.
-  std::vector<storage_idx_t> neighbors;
-
-  /// entry point in the search structure (one of the points with maximum level
-  storage_idx_t entry_point;
-
-  faiss::RandomGenerator rng;
-
-  /// maximum level
-  int max_level;
-
-  /// expansion factor at construction time
-  int efConstruction;
-
-  /// expansion factor at search time
-  int efSearch;
-
-  /// during search: do we check whether the next best distance is good enough?
-  bool check_relative_distance = true;
-
-  /// number of entry points in levels > 0.
-  int upper_beam;
-
-  /// use bounded queue during exploration
-  bool search_bounded_queue = true;
-
-  // methods that initialize the tree sizes
-
-  /// initialize the assign_probas and cum_nneighbor_per_level to
-  /// have 2*M links on level 0 and M links on levels > 0
-  void set_default_probas(int M, float levelMult);
-
-  /// set nb of neighbors for this level (before adding anything)
-  void set_nb_neighbors(int level_no, int n);
-
-  // methods that access the tree sizes
-
-  /// nb of neighbors for this level
-  int nb_neighbors(int layer_no) const;
-
-  /// cumumlative nb up to (and excluding) this level
-  int cum_nb_neighbors(int layer_no) const;
-
-  /// range of entries in the neighbors table of vertex no at layer_no
-  void neighbor_range(idx_t no, int layer_no,
-                      size_t * begin, size_t * end) const;
-
-  /// only mandatory parameter: nb of neighbors
-  explicit HNSW(int M = 32);
-
-  /// pick a random level for a new point
-  int random_level();
-
-  /// add n random levels to table (for debugging...)
-  void fill_with_random_links(size_t n);
-
-  void add_links_starting_from(DistanceComputer& ptdis,
-                               storage_idx_t pt_id,
-                               storage_idx_t nearest,
-                               float d_nearest,
-                               int level,
-                               omp_lock_t *locks,
-                               VisitedTable &vt);
-
-
-  /** add point pt_id on all levels <= pt_level and build the link
-   * structure for them. */
-  void add_with_locks(DistanceComputer& ptdis, int pt_level, int pt_id,
-                      std::vector<omp_lock_t>& locks,
-                      VisitedTable& vt);
-
-  int search_from_candidates(DistanceComputer& qdis, int k,
-                             idx_t *I, float *D,
-                             MinimaxHeap& candidates,
-                             VisitedTable &vt,
-                             int level, int nres_in = 0) const;
-
-  std::priority_queue<Node> search_from_candidate_unbounded(
-    const Node& node,
-    DistanceComputer& qdis,
-    int ef,
-    VisitedTable *vt
-  ) const;
-
-  /// search interface
-  void search(DistanceComputer& qdis, int k,
-              idx_t *I, float *D,
-              VisitedTable& vt) const;
-
-  void reset();
-
-  void clear_neighbor_tables(int level);
-  void print_neighbor_stats(int level) const;
-
-  int prepare_level_tab(size_t n, bool preset_levels = false);
-
-  static void shrink_neighbor_list(
-    DistanceComputer& qdis,
-    std::priority_queue<NodeDistFarther>& input,
-    std::vector<NodeDistFarther>& output,
-    int max_size);
-
-};
-
-
-/**************************************************************
- * Auxiliary structures
- **************************************************************/
-
-/// set implementation optimized for fast access.
-struct VisitedTable {
-  std::vector<uint8_t> visited;
-  int visno;
-
-  explicit VisitedTable(int size)
-    : visited(size), visno(1) {}
-
-  /// set flog #no to true
-  void set(int no) {
-    visited[no] = visno;
-  }
-
-  /// get flag #no
-  bool get(int no) const {
-    return visited[no] == visno;
-  }
-
-  /// reset all flags to false
-  void advance() {
-    visno++;
-    if (visno == 250) {
-      // 250 rather than 255 because sometimes we use visno and visno+1
-      memset(visited.data(), 0, sizeof(visited[0]) * visited.size());
-      visno = 1;
-    }
-  }
-};
-
-
-struct HNSWStats {
-  size_t n1, n2, n3;
-  size_t ndis;
-  size_t nreorder;
-  bool view;
-
-  HNSWStats() {
-    reset();
-  }
-
-  void reset() {
-    n1 = n2 = n3 = 0;
-    ndis = 0;
-    nreorder = 0;
-    view = false;
-  }
-};
-
-// global var that collects them all
-extern HNSWStats hnsw_stats;
-
-
-}  // namespace faiss
diff --git a/Heap.cpp b/Heap.cpp
deleted file mode 100644
index 0621828adf..0000000000
--- a/Heap.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-/* Function for soft heap */
-
-#include "Heap.h"
-
-
-namespace faiss {
-
-
-template <typename C>
-void HeapArray<C>::heapify ()
-{
-#pragma omp parallel for
-    for (size_t j = 0; j < nh; j++)
-        heap_heapify<C> (k, val + j * k, ids + j * k);
-}
-
-template <typename C>
-void HeapArray<C>::reorder ()
-{
-#pragma omp parallel for
-    for (size_t j = 0; j < nh; j++)
-        heap_reorder<C> (k, val + j * k, ids + j * k);
-}
-
-template <typename C>
-void HeapArray<C>::addn (size_t nj, const T *vin, TI j0,
-                         size_t i0, int64_t ni)
-{
-    if (ni == -1) ni = nh;
-    assert (i0 >= 0 && i0 + ni <= nh);
-#pragma omp parallel for
-    for (size_t i = i0; i < i0 + ni; i++) {
-        T * __restrict simi = get_val(i);
-        TI * __restrict idxi = get_ids (i);
-        const T *ip_line = vin + (i - i0) * nj;
-
-        for (size_t j = 0; j < nj; j++) {
-            T ip = ip_line [j];
-            if (C::cmp(simi[0], ip)) {
-                heap_pop<C> (k, simi, idxi);
-                heap_push<C> (k, simi, idxi, ip, j + j0);
-            }
-        }
-    }
-}
-
-template <typename C>
-void HeapArray<C>::addn_with_ids (
-     size_t nj, const T *vin, const TI *id_in,
-     int64_t id_stride, size_t i0, int64_t ni)
-{
-    if (id_in == nullptr) {
-        addn (nj, vin, 0, i0, ni);
-        return;
-    }
-    if (ni == -1) ni = nh;
-    assert (i0 >= 0 && i0 + ni <= nh);
-#pragma omp parallel for
-    for (size_t i = i0; i < i0 + ni; i++) {
-        T * __restrict simi = get_val(i);
-        TI * __restrict idxi = get_ids (i);
-        const T *ip_line = vin + (i - i0) * nj;
-        const TI *id_line = id_in + (i - i0) * id_stride;
-
-        for (size_t j = 0; j < nj; j++) {
-            T ip = ip_line [j];
-            if (C::cmp(simi[0], ip)) {
-                heap_pop<C> (k, simi, idxi);
-                heap_push<C> (k, simi, idxi, ip, id_line [j]);
-            }
-        }
-    }
-}
-
-template <typename C>
-void HeapArray<C>::per_line_extrema (
-                   T * out_val,
-                   TI * out_ids) const
-{
-#pragma omp parallel for
-    for (size_t j = 0; j < nh; j++) {
-        int64_t imin = -1;
-        typename C::T xval = C::Crev::neutral ();
-        const typename C::T * x_ = val + j * k;
-        for (size_t i = 0; i < k; i++)
-            if (C::cmp (x_[i], xval)) {
-                xval = x_[i];
-                imin = i;
-            }
-        if (out_val)
-            out_val[j] = xval;
-
-        if (out_ids) {
-            if (ids && imin != -1)
-                out_ids[j] = ids [j * k + imin];
-            else
-                out_ids[j] = imin;
-        }
-    }
-}
-
-
-
-
-// explicit instanciations
-
-template struct HeapArray<CMin <float, int64_t> >;
-template struct HeapArray<CMax <float, int64_t> >;
-template struct HeapArray<CMin <int, int64_t> >;
-template struct HeapArray<CMax <int, int64_t> >;
-
-
-}  // END namespace fasis
diff --git a/Heap.h b/Heap.h
deleted file mode 100644
index e691c36c7f..0000000000
--- a/Heap.h
+++ /dev/null
@@ -1,495 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-/*
- * C++ support for heaps. The set of functions is tailored for
- * efficient similarity search.
- *
- * There is no specific object for a heap, and the functions that
- * operate on a signle heap are inlined, because heaps are often
- * small. More complex functions are implemented in Heaps.cpp
- *
- */
-
-
-#ifndef FAISS_Heap_h
-#define FAISS_Heap_h
-
-#include <climits>
-#include <cstring>
-#include <cmath>
-
-#include <cassert>
-#include <cstdio>
-#include <stdint.h>
-
-#include <limits>
-
-
-namespace faiss {
-
-/*******************************************************************
- * C object: uniform handling of min and max heap
- *******************************************************************/
-
-/** The C object gives the type T of the values in the heap, the type
- *  of the keys, TI and the comparison that is done: > for the minheap
- *  and < for the maxheap. The neutral value will always be dropped in
- *  favor of any other value in the heap.
- */
-
-template <typename T_, typename TI_>
-struct CMax;
-
-// traits of minheaps = heaps where the minimum value is stored on top
-// useful to find the *max* values of an array
-template <typename T_, typename TI_>
-struct CMin {
-    typedef T_ T;
-    typedef TI_ TI;
-    typedef CMax<T_, TI_> Crev;
-    inline static bool cmp (T a, T b) {
-        return a < b;
-    }
-    // value that will be popped first -> must be smaller than all others
-    // for int types this is not strictly the smallest val (-max - 1)
-    inline static T neutral () {
-        return -std::numeric_limits<T>::max();
-    }
-};
-
-
-template <typename T_, typename TI_>
-struct CMax {
-    typedef T_ T;
-    typedef TI_ TI;
-    typedef CMin<T_, TI_> Crev;
-    inline static bool cmp (T a, T b) {
-        return a > b;
-    }
-    inline static T neutral () {
-        return std::numeric_limits<T>::max();
-    }
-};
-
-
-/*******************************************************************
- * Basic heap ops: push and pop
- *******************************************************************/
-
-/** Pops the top element from the heap defined by bh_val[0..k-1] and
- * bh_ids[0..k-1].  on output the element at k-1 is undefined.
- */
-template <class C> inline
-void heap_pop (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
-{
-    bh_val--; /* Use 1-based indexing for easier node->child translation */
-    bh_ids--;
-    typename C::T val = bh_val[k];
-    size_t i = 1, i1, i2;
-    while (1) {
-        i1 = i << 1;
-        i2 = i1 + 1;
-        if (i1 > k)
-            break;
-        if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) {
-            if (C::cmp(val, bh_val[i1]))
-                break;
-            bh_val[i] = bh_val[i1];
-            bh_ids[i] = bh_ids[i1];
-            i = i1;
-        }
-        else {
-            if (C::cmp(val, bh_val[i2]))
-                break;
-            bh_val[i] = bh_val[i2];
-            bh_ids[i] = bh_ids[i2];
-            i = i2;
-        }
-    }
-    bh_val[i] = bh_val[k];
-    bh_ids[i] = bh_ids[k];
-}
-
-
-
-/** Pushes the element (val, ids) into the heap bh_val[0..k-2] and
- * bh_ids[0..k-2].  on output the element at k-1 is defined.
- */
-template <class C> inline
-void heap_push (size_t k,
-                typename C::T * bh_val, typename C::TI * bh_ids,
-                typename C::T val, typename C::TI ids)
-{
-    bh_val--; /* Use 1-based indexing for easier node->child translation */
-    bh_ids--;
-    size_t i = k, i_father;
-    while (i > 1) {
-        i_father = i >> 1;
-        if (!C::cmp (val, bh_val[i_father]))  /* the heap structure is ok */
-            break;
-        bh_val[i] = bh_val[i_father];
-        bh_ids[i] = bh_ids[i_father];
-        i = i_father;
-    }
-    bh_val[i] = val;
-    bh_ids[i] = ids;
-}
-
-
-
-/* Partial instanciation for heaps with TI = int64_t */
-
-template <typename T> inline
-void minheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
-{
-    heap_pop<CMin<T, int64_t> > (k, bh_val, bh_ids);
-}
-
-
-template <typename T> inline
-void minheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
-{
-    heap_push<CMin<T, int64_t> > (k, bh_val, bh_ids, val, ids);
-}
-
-
-template <typename T> inline
-void maxheap_pop (size_t k, T * bh_val, int64_t * bh_ids)
-{
-    heap_pop<CMax<T, int64_t> > (k, bh_val, bh_ids);
-}
-
-
-template <typename T> inline
-void maxheap_push (size_t k, T * bh_val, int64_t * bh_ids, T val, int64_t ids)
-{
-    heap_push<CMax<T, int64_t> > (k, bh_val, bh_ids, val, ids);
-}
-
-
-
-/*******************************************************************
- * Heap initialization
- *******************************************************************/
-
-/* Initialization phase for the heap (with unconditionnal pushes).
- * Store k0 elements in a heap containing up to k values. Note that
- * (bh_val, bh_ids) can be the same as (x, ids) */
-template <class C> inline
-void heap_heapify (
-        size_t k,
-        typename C::T *  bh_val,
-        typename C::TI *  bh_ids,
-        const typename C::T * x = nullptr,
-        const typename C::TI * ids = nullptr,
-        size_t k0 = 0)
-{
-   if (k0 > 0) assert (x);
-
-   if (ids) {
-       for (size_t i = 0; i < k0; i++)
-           heap_push<C> (i+1, bh_val, bh_ids, x[i], ids[i]);
-   } else {
-       for (size_t i = 0; i < k0; i++)
-           heap_push<C> (i+1, bh_val, bh_ids, x[i], i);
-   }
-
-   for (size_t i = k0; i < k; i++) {
-       bh_val[i] = C::neutral();
-       bh_ids[i] = -1;
-   }
-
-}
-
-template <typename T> inline
-void minheap_heapify (
-        size_t k, T *  bh_val,
-        int64_t * bh_ids,
-        const T * x = nullptr,
-        const int64_t * ids = nullptr,
-        size_t k0 = 0)
-{
-    heap_heapify< CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
-}
-
-
-template <typename T> inline
-void maxheap_heapify (
-        size_t k,
-        T * bh_val,
-        int64_t * bh_ids,
-         const T * x = nullptr,
-         const int64_t * ids = nullptr,
-         size_t k0 = 0)
-{
-    heap_heapify< CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, k0);
-}
-
-
-
-/*******************************************************************
- * Add n elements to the heap
- *******************************************************************/
-
-
-/* Add some elements to the heap  */
-template <class C> inline
-void heap_addn (size_t k,
-                typename C::T * bh_val, typename C::TI * bh_ids,
-                const typename C::T * x,
-                const typename C::TI * ids,
-                size_t n)
-{
-    size_t i;
-    if (ids)
-        for (i = 0; i < n; i++) {
-            if (C::cmp (bh_val[0], x[i])) {
-                heap_pop<C> (k, bh_val, bh_ids);
-                heap_push<C> (k, bh_val, bh_ids, x[i], ids[i]);
-            }
-        }
-    else
-        for (i = 0; i < n; i++) {
-            if (C::cmp (bh_val[0], x[i])) {
-                heap_pop<C> (k, bh_val, bh_ids);
-                heap_push<C> (k, bh_val, bh_ids, x[i], i);
-            }
-        }
-}
-
-
-/* Partial instanciation for heaps with TI = int64_t */
-
-template <typename T> inline
-void minheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
-                   const T * x, const int64_t * ids, size_t n)
-{
-    heap_addn<CMin<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
-}
-
-template <typename T> inline
-void maxheap_addn (size_t k, T * bh_val, int64_t * bh_ids,
-                   const T * x, const int64_t * ids, size_t n)
-{
-    heap_addn<CMax<T, int64_t> > (k, bh_val, bh_ids, x, ids, n);
-}
-
-
-
-
-
-
-/*******************************************************************
- * Heap finalization (reorder elements)
- *******************************************************************/
-
-
-/* This function maps a binary heap into an sorted structure.
-   It returns the number  */
-template <typename C> inline
-size_t heap_reorder (size_t k, typename C::T * bh_val, typename C::TI * bh_ids)
-{
-    size_t i, ii;
-
-    for (i = 0, ii = 0; i < k; i++) {
-        /* top element should be put at the end of the list */
-        typename C::T val = bh_val[0];
-        typename C::TI id = bh_ids[0];
-
-        /* boundary case: we will over-ride this value if not a true element */
-        heap_pop<C> (k-i, bh_val, bh_ids);
-        bh_val[k-ii-1] = val;
-        bh_ids[k-ii-1] = id;
-        if (id != -1) ii++;
-    }
-    /* Count the number of elements which are effectively returned */
-    size_t nel = ii;
-
-    memmove (bh_val, bh_val+k-ii, ii * sizeof(*bh_val));
-    memmove (bh_ids, bh_ids+k-ii, ii * sizeof(*bh_ids));
-
-    for (; ii < k; ii++) {
-        bh_val[ii] = C::neutral();
-        bh_ids[ii] = -1;
-    }
-    return nel;
-}
-
-template <typename T> inline
-size_t minheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
-{
-    return heap_reorder< CMin<T, int64_t> > (k, bh_val, bh_ids);
-}
-
-template <typename T> inline
-size_t maxheap_reorder (size_t k, T * bh_val, int64_t * bh_ids)
-{
-    return heap_reorder< CMax<T, int64_t> > (k, bh_val, bh_ids);
-}
-
-
-
-
-
-/*******************************************************************
- * Operations on heap arrays
- *******************************************************************/
-
-/** a template structure for a set of [min|max]-heaps it is tailored
- * so that the actual data of the heaps can just live in compact
- * arrays.
- */
-template <typename C>
-struct HeapArray {
-    typedef typename C::TI TI;
-    typedef typename C::T T;
-
-    size_t nh;    ///< number of heaps
-    size_t k;     ///< allocated size per heap
-    TI * ids;     ///< identifiers (size nh * k)
-    T * val;      ///< values (distances or similarities), size nh * k
-
-    /// Return the list of values for a heap
-    T * get_val (size_t key) { return val + key * k; }
-
-    /// Correspponding identifiers
-    TI * get_ids (size_t key) { return ids + key * k; }
-
-    /// prepare all the heaps before adding
-    void heapify ();
-
-    /** add nj elements to heaps i0:i0+ni, with sequential ids
-     *
-     * @param nj    nb of elements to add to each heap
-     * @param vin   elements to add, size ni * nj
-     * @param j0    add this to the ids that are added
-     * @param i0    first heap to update
-     * @param ni    nb of elements to update (-1 = use nh)
-     */
-    void addn (size_t nj, const T *vin, TI j0 = 0,
-               size_t i0 = 0, int64_t ni = -1);
-
-    /** same as addn
-     *
-     * @param id_in     ids of the elements to add, size ni * nj
-     * @param id_stride stride for id_in
-     */
-    void addn_with_ids (
-        size_t nj, const T *vin, const TI *id_in = nullptr,
-        int64_t id_stride = 0, size_t i0 = 0, int64_t ni = -1);
-
-    /// reorder all the heaps
-    void reorder ();
-
-    /** this is not really a heap function. It just finds the per-line
-     *   extrema of each line of array D
-     * @param vals_out    extreme value of each line (size nh, or NULL)
-     * @param idx_out     index of extreme value (size nh or NULL)
-     */
-    void per_line_extrema (T *vals_out, TI *idx_out) const;
-
-};
-
-
-/* Define useful heaps */
-typedef HeapArray<CMin<float, int64_t> > float_minheap_array_t;
-typedef HeapArray<CMin<int, int64_t> > int_minheap_array_t;
-
-typedef HeapArray<CMax<float, int64_t> > float_maxheap_array_t;
-typedef HeapArray<CMax<int, int64_t> > int_maxheap_array_t;
-
-// The heap templates are instanciated explicitly in Heap.cpp
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-/*********************************************************************
- * Indirect heaps: instead of having
- *
- *          node i = (bh_ids[i], bh_val[i]),
- *
- * in indirect heaps,
- *
- *          node i = (bh_ids[i], bh_val[bh_ids[i]]),
- *
- *********************************************************************/
-
-
-template <class C>
-inline
-void indirect_heap_pop (
-    size_t k,
-    const typename C::T * bh_val,
-    typename C::TI * bh_ids)
-{
-    bh_ids--; /* Use 1-based indexing for easier node->child translation */
-    typename C::T val = bh_val[bh_ids[k]];
-    size_t i = 1;
-    while (1) {
-        size_t i1 = i << 1;
-        size_t i2 = i1 + 1;
-        if (i1 > k)
-            break;
-        typename C::TI id1 = bh_ids[i1], id2 = bh_ids[i2];
-        if (i2 == k + 1 || C::cmp(bh_val[id1], bh_val[id2])) {
-            if (C::cmp(val, bh_val[id1]))
-                break;
-            bh_ids[i] = id1;
-            i = i1;
-        } else {
-            if (C::cmp(val, bh_val[id2]))
-                break;
-            bh_ids[i] = id2;
-            i = i2;
-        }
-    }
-    bh_ids[i] = bh_ids[k];
-}
-
-
-
-template <class C>
-inline
-void indirect_heap_push (size_t k,
-                         const typename C::T * bh_val, typename C::TI * bh_ids,
-                         typename C::TI id)
-{
-    bh_ids--; /* Use 1-based indexing for easier node->child translation */
-    typename C::T val = bh_val[id];
-    size_t i = k;
-    while (i > 1) {
-        size_t i_father = i >> 1;
-        if (!C::cmp (val, bh_val[bh_ids[i_father]]))
-            break;
-        bh_ids[i] = bh_ids[i_father];
-        i = i_father;
-    }
-    bh_ids[i] = id;
-}
-
-
-} // namespace faiss
-
-#endif  /* FAISS_Heap_h */
diff --git a/PolysemousTraining.cpp b/PolysemousTraining.cpp
deleted file mode 100644
index ebfc5c217b..0000000000
--- a/PolysemousTraining.cpp
+++ /dev/null
@@ -1,951 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "PolysemousTraining.h"
-
-#include <cstdlib>
-#include <cmath>
-#include <cstring>
-#include <stdint.h>
-
-#include <algorithm>
-
-#include "utils.h"
-#include "hamming.h"
-
-#include "FaissAssert.h"
-
-/*****************************************
- * Mixed PQ / Hamming
- ******************************************/
-
-namespace faiss {
-
-
-/****************************************************
- * Optimization code
- ****************************************************/
-
-SimulatedAnnealingParameters::SimulatedAnnealingParameters ()
-{
-    // set some reasonable defaults for the optimization
-    init_temperature = 0.7;
-    temperature_decay = pow (0.9, 1/500.);
-    // reduce by a factor 0.9 every 500 it
-    n_iter = 500000;
-    n_redo = 2;
-    seed = 123;
-    verbose = 0;
-    only_bit_flips = false;
-    init_random = false;
-}
-
-// what would the cost update be if iw and jw were swapped?
-// default implementation just computes both and computes the difference
-double PermutationObjective::cost_update (
-        const int *perm, int iw, int jw) const
-{
-    double orig_cost = compute_cost (perm);
-
-    std::vector<int> perm2 (n);
-    for (int i = 0; i < n; i++)
-        perm2[i] = perm[i];
-    perm2[iw] = perm[jw];
-    perm2[jw] = perm[iw];
-
-    double new_cost = compute_cost (perm2.data());
-    return new_cost - orig_cost;
-}
-
-
-
-
-SimulatedAnnealingOptimizer::SimulatedAnnealingOptimizer (
-        PermutationObjective *obj,
-        const SimulatedAnnealingParameters &p):
-    SimulatedAnnealingParameters (p),
-    obj (obj),
-    n(obj->n),
-    logfile (nullptr)
-{
-    rnd = new RandomGenerator (p.seed);
-    FAISS_THROW_IF_NOT (n < 100000 && n >=0 );
-}
-
-SimulatedAnnealingOptimizer::~SimulatedAnnealingOptimizer ()
-{
-    delete rnd;
-}
-
-// run the optimization and return the best result in best_perm
-double SimulatedAnnealingOptimizer::run_optimization (int * best_perm)
-{
-    double min_cost = 1e30;
-
-    // just do a few runs of the annealing and keep the lowest output cost
-    for (int it = 0; it < n_redo; it++) {
-        std::vector<int> perm(n);
-        for (int i = 0; i < n; i++)
-            perm[i] = i;
-         if (init_random) {
-            for (int i = 0; i < n; i++) {
-                int j = i + rnd->rand_int (n - i);
-                std::swap (perm[i], perm[j]);
-            }
-        }
-         float cost = optimize (perm.data());
-        if (logfile) fprintf (logfile, "\n");
-        if(verbose > 1) {
-            printf ("    optimization run %d: cost=%g %s\n",
-                    it, cost, cost < min_cost ? "keep" : "");
-        }
-        if (cost < min_cost) {
-            memcpy (best_perm, perm.data(), sizeof(perm[0]) * n);
-            min_cost = cost;
-        }
-    }
-     return min_cost;
-}
-
-// perform the optimization loop, starting from and modifying
-// permutation in-place
-double SimulatedAnnealingOptimizer::optimize (int *perm)
-{
-    double cost = init_cost = obj->compute_cost (perm);
-    int log2n = 0;
-    while (!(n <= (1 << log2n))) log2n++;
-    double temperature = init_temperature;
-     int n_swap = 0, n_hot = 0;
-    for (int it = 0; it < n_iter; it++) {
-        temperature = temperature * temperature_decay;
-        int iw, jw;
-        if (only_bit_flips) {
-            iw = rnd->rand_int (n);
-            jw = iw ^ (1 << rnd->rand_int (log2n));
-        } else {
-            iw = rnd->rand_int (n);
-            jw = rnd->rand_int (n - 1);
-            if (jw == iw) jw++;
-        }
-         double delta_cost = obj->cost_update (perm, iw, jw);
-         if (delta_cost < 0 || rnd->rand_float () < temperature) {
-            std::swap (perm[iw], perm[jw]);
-            cost += delta_cost;
-            n_swap++;
-            if (delta_cost >= 0) n_hot++;
-        }
-         if (verbose > 2 || (verbose > 1 && it % 10000 == 0)) {
-            printf ("      iteration %d cost %g temp %g n_swap %d "
-                    "(%d hot)     \r",
-                    it, cost, temperature, n_swap, n_hot);
-            fflush(stdout);
-        }
-        if (logfile) {
-            fprintf (logfile, "%d %g %g %d %d\n",
-                    it, cost, temperature, n_swap, n_hot);
-        }
-     }
-    if (verbose > 1) printf("\n");
-    return cost;
-}
-
-
-
-
-
-/****************************************************
- * Cost functions: ReproduceDistanceTable
- ****************************************************/
-
-
-
-
-
-
-static inline int hamming_dis (uint64_t a, uint64_t b)
-{
-    return __builtin_popcountl (a ^ b);
-}
-
-namespace {
-
-/// optimize permutation to reproduce a distance table with Hamming distances
-struct ReproduceWithHammingObjective : PermutationObjective {
-    int nbits;
-    double dis_weight_factor;
-
-    static double sqr (double x) { return x * x; }
-
-
-    // weihgting of distances: it is more important to reproduce small
-    // distances well
-    double dis_weight (double x) const
-    {
-        return exp (-dis_weight_factor * x);
-    }
-
-    std::vector<double> target_dis; // wanted distances (size n^2)
-    std::vector<double> weights;    // weights for each distance (size n^2)
-
-    // cost = quadratic difference between actual distance and Hamming distance
-    double compute_cost(const int* perm) const override {
-      double cost = 0;
-      for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) {
-          double wanted = target_dis[i * n + j];
-          double w = weights[i * n + j];
-          double actual = hamming_dis(perm[i], perm[j]);
-          cost += w * sqr(wanted - actual);
-        }
-      }
-      return cost;
-    }
-
-
-    // what would the cost update be if iw and jw were swapped?
-    // computed in O(n) instead of O(n^2) for the full re-computation
-    double cost_update(const int* perm, int iw, int jw) const override {
-      double delta_cost = 0;
-
-      for (int i = 0; i < n; i++) {
-        if (i == iw) {
-          for (int j = 0; j < n; j++) {
-            double wanted = target_dis[i * n + j], w = weights[i * n + j];
-            double actual = hamming_dis(perm[i], perm[j]);
-            delta_cost -= w * sqr(wanted - actual);
-            double new_actual =
-                hamming_dis(perm[jw], perm[j == iw ? jw : j == jw ? iw : j]);
-            delta_cost += w * sqr(wanted - new_actual);
-          }
-        } else if (i == jw) {
-          for (int j = 0; j < n; j++) {
-            double wanted = target_dis[i * n + j], w = weights[i * n + j];
-            double actual = hamming_dis(perm[i], perm[j]);
-            delta_cost -= w * sqr(wanted - actual);
-            double new_actual =
-                hamming_dis(perm[iw], perm[j == iw ? jw : j == jw ? iw : j]);
-            delta_cost += w * sqr(wanted - new_actual);
-          }
-        } else {
-          int j = iw;
-          {
-            double wanted = target_dis[i * n + j], w = weights[i * n + j];
-            double actual = hamming_dis(perm[i], perm[j]);
-            delta_cost -= w * sqr(wanted - actual);
-            double new_actual = hamming_dis(perm[i], perm[jw]);
-            delta_cost += w * sqr(wanted - new_actual);
-          }
-          j = jw;
-          {
-            double wanted = target_dis[i * n + j], w = weights[i * n + j];
-            double actual = hamming_dis(perm[i], perm[j]);
-            delta_cost -= w * sqr(wanted - actual);
-            double new_actual = hamming_dis(perm[i], perm[iw]);
-            delta_cost += w * sqr(wanted - new_actual);
-          }
-        }
-      }
-
-      return delta_cost;
-    }
-
-
-
-    ReproduceWithHammingObjective (
-           int nbits,
-           const std::vector<double> & dis_table,
-           double dis_weight_factor):
-        nbits (nbits), dis_weight_factor (dis_weight_factor)
-    {
-        n = 1 << nbits;
-        FAISS_THROW_IF_NOT (dis_table.size() == n * n);
-        set_affine_target_dis (dis_table);
-    }
-
-    void set_affine_target_dis (const std::vector<double> & dis_table)
-    {
-        double sum = 0, sum2 = 0;
-        int n2 = n * n;
-        for (int i = 0; i < n2; i++) {
-            sum += dis_table [i];
-            sum2 += dis_table [i] * dis_table [i];
-        }
-        double mean = sum / n2;
-        double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2));
-
-        target_dis.resize (n2);
-
-        for (int i = 0; i < n2; i++) {
-            // the mapping function
-            double td = (dis_table [i] - mean) / stddev * sqrt(nbits / 4) +
-                nbits / 2;
-            target_dis[i] = td;
-            // compute a weight
-            weights.push_back (dis_weight (td));
-        }
-
-    }
-
-    ~ReproduceWithHammingObjective() override {}
-};
-
-} // anonymous namespace
-
-// weihgting of distances: it is more important to reproduce small
-// distances well
-double ReproduceDistancesObjective::dis_weight (double x) const
-{
-    return exp (-dis_weight_factor * x);
-}
-
-
-double ReproduceDistancesObjective::get_source_dis (int i, int j) const
-{
-    return source_dis [i * n + j];
-}
-
-// cost = quadratic difference between actual distance and Hamming distance
-double ReproduceDistancesObjective::compute_cost (const int *perm) const
-{
-    double cost = 0;
-    for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) {
-            double wanted = target_dis [i * n + j];
-            double w = weights [i * n + j];
-            double actual = get_source_dis (perm[i], perm[j]);
-            cost += w * sqr (wanted - actual);
-        }
-    }
-    return cost;
-}
-
-// what would the cost update be if iw and jw were swapped?
-// computed in O(n) instead of O(n^2) for the full re-computation
-double ReproduceDistancesObjective::cost_update(
-        const int *perm, int iw, int jw) const
-{
-    double delta_cost = 0;
-     for (int i = 0; i < n; i++) {
-        if (i == iw) {
-            for (int j = 0; j < n; j++) {
-                double wanted = target_dis [i * n + j],
-                    w = weights [i * n + j];
-                double actual = get_source_dis (perm[i], perm[j]);
-                delta_cost -= w * sqr (wanted - actual);
-                double new_actual = get_source_dis (
-                       perm[jw],
-                       perm[j == iw ? jw : j == jw ? iw : j]);
-                delta_cost += w * sqr (wanted - new_actual);
-            }
-        } else if (i == jw) {
-            for (int j = 0; j < n; j++) {
-                double wanted = target_dis [i * n + j],
-                    w = weights [i * n + j];
-                double actual = get_source_dis (perm[i], perm[j]);
-                delta_cost -= w * sqr (wanted - actual);
-                double new_actual = get_source_dis (
-                       perm[iw],
-                       perm[j == iw ? jw : j == jw ? iw : j]);
-                delta_cost += w * sqr (wanted - new_actual);
-            }
-        } else  {
-            int j = iw;
-            {
-                double wanted = target_dis [i * n + j],
-                    w = weights [i * n + j];
-                double actual = get_source_dis (perm[i], perm[j]);
-                delta_cost -= w * sqr (wanted - actual);
-                double new_actual = get_source_dis (perm[i], perm[jw]);
-                delta_cost += w * sqr (wanted - new_actual);
-            }
-            j = jw;
-            {
-                double wanted = target_dis [i * n + j],
-                    w = weights [i * n + j];
-                double actual = get_source_dis (perm[i], perm[j]);
-                delta_cost -= w * sqr (wanted - actual);
-                double new_actual = get_source_dis (perm[i], perm[iw]);
-                delta_cost += w * sqr (wanted - new_actual);
-            }
-        }
-    }
-     return delta_cost;
-}
-
-
-
-ReproduceDistancesObjective::ReproduceDistancesObjective (
-       int n,
-       const double *source_dis_in,
-       const double *target_dis_in,
-       double dis_weight_factor):
-    dis_weight_factor (dis_weight_factor),
-    target_dis (target_dis_in)
-{
-    this->n = n;
-    set_affine_target_dis (source_dis_in);
-}
-
-void ReproduceDistancesObjective::compute_mean_stdev (
-          const double *tab, size_t n2,
-          double *mean_out, double *stddev_out)
-{
-    double sum = 0, sum2 = 0;
-    for (int i = 0; i < n2; i++) {
-        sum += tab [i];
-        sum2 += tab [i] * tab [i];
-    }
-    double mean = sum / n2;
-    double stddev = sqrt(sum2 / n2 - (sum / n2) * (sum / n2));
-    *mean_out = mean;
-    *stddev_out = stddev;
-}
-
-void ReproduceDistancesObjective::set_affine_target_dis (
-          const double *source_dis_in)
-{
-    int n2 = n * n;
-
-    double mean_src, stddev_src;
-    compute_mean_stdev (source_dis_in, n2, &mean_src, &stddev_src);
-
-    double mean_target, stddev_target;
-    compute_mean_stdev (target_dis, n2, &mean_target, &stddev_target);
-
-    printf ("map mean %g std %g -> mean %g std %g\n",
-            mean_src, stddev_src, mean_target, stddev_target);
-
-    source_dis.resize (n2);
-    weights.resize (n2);
-
-    for (int i = 0; i < n2; i++) {
-        // the mapping function
-        source_dis[i] = (source_dis_in[i] - mean_src) / stddev_src
-            * stddev_target + mean_target;
-
-        // compute a weight
-        weights [i] = dis_weight (target_dis[i]);
-    }
-
-}
-
-/****************************************************
- * Cost functions: RankingScore
- ****************************************************/
-
-/// Maintains a 3D table of elementary costs.
-/// Accumulates elements based on Hamming distance comparisons
-template <typename Ttab, typename Taccu>
-struct Score3Computer: PermutationObjective {
-
-    int nc;
-
-    // cost matrix of size nc * nc *nc
-    // n_gt (i,j,k) = count of d_gt(x, y-) < d_gt(x, y+)
-    // where x has PQ code i, y- PQ code j and y+ PQ code k
-    std::vector<Ttab> n_gt;
-
-
-    /// the cost is a triple loop on the nc * nc * nc matrix of entries.
-    ///
-    Taccu compute (const int * perm) const
-    {
-        Taccu accu = 0;
-        const Ttab *p = n_gt.data();
-        for (int i = 0; i < nc; i++) {
-            int ip = perm [i];
-            for (int j = 0; j < nc; j++) {
-                int jp = perm [j];
-                for (int k = 0; k < nc; k++) {
-                    int kp = perm [k];
-                    if (hamming_dis (ip, jp) <
-                        hamming_dis (ip, kp)) {
-                        accu += *p; // n_gt [ ( i * nc + j) * nc + k];
-                    }
-                    p++;
-                }
-            }
-        }
-        return accu;
-    }
-
-
-    /** cost update if entries iw and jw of the permutation would be
-     * swapped.
-     *
-     * The computation is optimized by avoiding elements in the
-     * nc*nc*nc cube that are known not to change. For nc=256, this
-     * reduces the nb of cells to visit to about 6/256 th of the
-     * cells. Practical speedup is about 8x, and the code is quite
-     * complex :-/
-     */
-    Taccu compute_update (const int *perm, int iw, int jw) const
-    {
-        assert (iw != jw);
-        if (iw > jw) std::swap (iw, jw);
-
-        Taccu accu = 0;
-        const Ttab * n_gt_i = n_gt.data();
-        for (int i = 0; i < nc; i++) {
-            int ip0 = perm [i];
-            int ip = perm [i == iw ? jw : i == jw ? iw : i];
-
-            //accu += update_i (perm, iw, jw, ip0, ip, n_gt_i);
-
-            accu += update_i_cross (perm, iw, jw,
-                                    ip0, ip, n_gt_i);
-
-            if (ip != ip0)
-                accu += update_i_plane (perm, iw, jw,
-                                       ip0, ip, n_gt_i);
-
-            n_gt_i += nc * nc;
-        }
-
-        return accu;
-    }
-
-
-    Taccu update_i (const int *perm, int iw, int jw,
-                   int ip0, int ip, const Ttab * n_gt_i) const
-    {
-        Taccu accu = 0;
-        const Ttab *n_gt_ij = n_gt_i;
-        for (int j = 0; j < nc; j++) {
-            int jp0 = perm[j];
-            int jp = perm [j == iw ? jw : j == jw ? iw : j];
-            for (int k = 0; k < nc; k++) {
-                int kp0 = perm [k];
-                int kp = perm [k == iw ? jw : k == jw ? iw : k];
-                int ng = n_gt_ij [k];
-                if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
-                    accu += ng;
-                }
-                if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) {
-                    accu -= ng;
-                }
-            }
-            n_gt_ij += nc;
-        }
-        return accu;
-    }
-
-    // 2 inner loops for the case ip0 != ip
-    Taccu update_i_plane (const int *perm, int iw, int jw,
-                         int ip0, int ip, const Ttab * n_gt_i) const
-    {
-        Taccu accu = 0;
-        const Ttab *n_gt_ij = n_gt_i;
-
-        for (int j = 0; j < nc; j++) {
-            if (j != iw && j != jw) {
-                int jp = perm[j];
-                for (int k = 0; k < nc; k++) {
-                    if (k != iw && k != jw) {
-                        int kp = perm [k];
-                        Ttab ng = n_gt_ij [k];
-                        if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
-                            accu += ng;
-                        }
-                        if (hamming_dis (ip0, jp) < hamming_dis (ip0, kp)) {
-                            accu -= ng;
-                        }
-                    }
-                }
-            }
-            n_gt_ij += nc;
-        }
-        return accu;
-    }
-
-    /// used for the 8 cells were the 3 indices are swapped
-    inline Taccu update_k (const int *perm, int iw, int jw,
-                          int ip0, int ip, int jp0, int jp,
-                          int k,
-                          const Ttab * n_gt_ij) const
-    {
-        Taccu accu = 0;
-        int kp0 = perm [k];
-        int kp = perm [k == iw ? jw : k == jw ? iw : k];
-        Ttab ng = n_gt_ij [k];
-        if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
-            accu += ng;
-        }
-        if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp0)) {
-            accu -= ng;
-        }
-        return accu;
-    }
-
-    /// compute update on a line of k's, where i and j are swapped
-    Taccu update_j_line (const int *perm, int iw, int jw,
-                        int ip0, int ip, int jp0, int jp,
-                        const Ttab * n_gt_ij) const
-    {
-        Taccu accu = 0;
-        for (int k = 0; k < nc; k++) {
-            if (k == iw || k == jw) continue;
-            int kp = perm [k];
-            Ttab ng = n_gt_ij [k];
-            if (hamming_dis (ip, jp) < hamming_dis (ip, kp)) {
-                accu += ng;
-            }
-            if (hamming_dis (ip0, jp0) < hamming_dis (ip0, kp)) {
-                accu -= ng;
-            }
-        }
-        return accu;
-    }
-
-
-    /// considers the 2 pairs of crossing lines j=iw or jw and k = iw or kw
-    Taccu update_i_cross (const int *perm, int iw, int jw,
-                        int ip0, int ip, const Ttab * n_gt_i) const
-    {
-        Taccu accu = 0;
-        const Ttab *n_gt_ij = n_gt_i;
-
-        for (int j = 0; j < nc; j++) {
-            int jp0 = perm[j];
-            int jp = perm [j == iw ? jw : j == jw ? iw : j];
-
-            accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, iw, n_gt_ij);
-            accu += update_k (perm, iw, jw, ip0, ip, jp0, jp, jw, n_gt_ij);
-
-            if (jp != jp0)
-                accu += update_j_line (perm, iw, jw, ip0, ip, jp0, jp, n_gt_ij);
-
-            n_gt_ij += nc;
-        }
-        return accu;
-    }
-
-
-    /// PermutationObjective implementeation (just negates the scores
-    /// for minimization)
-
-    double compute_cost(const int* perm) const override {
-      return -compute(perm);
-    }
-
-    double cost_update(const int* perm, int iw, int jw) const override {
-      double ret = -compute_update(perm, iw, jw);
-      return ret;
-    }
-
-    ~Score3Computer() override {}
-};
-
-
-
-
-
-struct IndirectSort {
-    const float *tab;
-    bool operator () (int a, int b) {return tab[a] < tab[b]; }
-};
-
-
-
-struct RankingScore2: Score3Computer<float, double> {
-    int nbits;
-    int nq, nb;
-    const uint32_t *qcodes, *bcodes;
-    const float *gt_distances;
-
-    RankingScore2 (int nbits, int nq, int nb,
-                  const uint32_t *qcodes, const uint32_t *bcodes,
-                  const float *gt_distances):
-        nbits(nbits), nq(nq), nb(nb), qcodes(qcodes),
-        bcodes(bcodes), gt_distances(gt_distances)
-    {
-        n = nc = 1 << nbits;
-        n_gt.resize (nc * nc * nc);
-        init_n_gt ();
-    }
-
-
-    double rank_weight (int r)
-    {
-        return 1.0 / (r + 1);
-    }
-
-    /// count nb of i, j in a x b st. i < j
-    /// a and b should be sorted on input
-    /// they are the ranks of j and k respectively.
-    /// specific version for diff-of-rank weighting, cannot optimized
-    /// with a cumulative table
-    double accum_gt_weight_diff (const std::vector<int> & a,
-                                 const std::vector<int> & b)
-    {
-        int nb = b.size(), na = a.size();
-
-        double accu = 0;
-        int j = 0;
-        for (int i = 0; i < na; i++) {
-            int ai = a[i];
-            while (j < nb && ai >= b[j]) j++;
-
-            double accu_i = 0;
-            for (int k = j; k < b.size(); k++)
-                accu_i += rank_weight (b[k] - ai);
-
-            accu += rank_weight (ai) * accu_i;
-
-        }
-        return accu;
-    }
-
-    void init_n_gt ()
-    {
-        for (int q = 0; q < nq; q++) {
-            const float *gtd = gt_distances + q * nb;
-            const uint32_t *cb = bcodes;// all same codes
-            float * n_gt_q = & n_gt [qcodes[q] * nc * nc];
-
-            printf("init gt for q=%d/%d    \r", q, nq); fflush(stdout);
-
-            std::vector<int> rankv (nb);
-            int * ranks = rankv.data();
-
-            // elements in each code bin, ordered by rank within each bin
-            std::vector<std::vector<int> > tab (nc);
-
-            { // build rank table
-                IndirectSort s = {gtd};
-                for (int j = 0; j < nb; j++) ranks[j] = j;
-                std::sort (ranks, ranks + nb, s);
-            }
-
-            for (int rank = 0; rank < nb; rank++) {
-                int i = ranks [rank];
-                tab [cb[i]].push_back (rank);
-            }
-
-
-            // this is very expensive. Any suggestion for improvement
-            // welcome.
-            for (int i = 0; i < nc; i++) {
-                std::vector<int> & di = tab[i];
-                for (int j = 0; j < nc; j++) {
-                    std::vector<int> & dj = tab[j];
-                    n_gt_q [i * nc + j] += accum_gt_weight_diff (di, dj);
-
-                }
-            }
-
-        }
-
-    }
-
-};
-
-
-/*****************************************
- * PolysemousTraining
- ******************************************/
-
-
-
-PolysemousTraining::PolysemousTraining ()
-{
-    optimization_type = OT_ReproduceDistances_affine;
-    ntrain_permutation = 0;
-    dis_weight_factor = log(2);
-}
-
-
-
-void PolysemousTraining::optimize_reproduce_distances (
-       ProductQuantizer &pq) const
-{
-
-    int dsub = pq.dsub;
-
-    int n = pq.ksub;
-    int nbits = pq.nbits;
-
-#pragma omp parallel for
-    for (int m = 0; m < pq.M; m++) {
-        std::vector<double> dis_table;
-
-        // printf ("Optimizing quantizer %d\n", m);
-
-        float * centroids = pq.get_centroids (m, 0);
-
-        for (int i = 0; i < n; i++) {
-            for (int j = 0; j < n; j++) {
-                dis_table.push_back (fvec_L2sqr (centroids + i * dsub,
-                                                 centroids + j * dsub,
-                                                 dsub));
-            }
-        }
-
-        std::vector<int> perm (n);
-        ReproduceWithHammingObjective obj (
-               nbits, dis_table,
-               dis_weight_factor);
-
-
-        SimulatedAnnealingOptimizer optim (&obj, *this);
-
-        if (log_pattern.size()) {
-            char fname[256];
-            snprintf (fname, 256, log_pattern.c_str(), m);
-            printf ("opening log file %s\n", fname);
-            optim.logfile = fopen (fname, "w");
-            FAISS_THROW_IF_NOT_MSG (optim.logfile, "could not open logfile");
-        }
-        double final_cost = optim.run_optimization (perm.data());
-
-        if (verbose > 0) {
-            printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n",
-                    m, optim.init_cost, final_cost);
-        }
-
-        if (log_pattern.size()) fclose (optim.logfile);
-
-        std::vector<float> centroids_copy;
-        for (int i = 0; i < dsub * n; i++)
-            centroids_copy.push_back (centroids[i]);
-
-        for (int i = 0; i < n; i++)
-            memcpy (centroids + perm[i] * dsub,
-                    centroids_copy.data() + i * dsub,
-                    dsub * sizeof(centroids[0]));
-
-    }
-
-}
-
-
-void PolysemousTraining::optimize_ranking (
-      ProductQuantizer &pq, size_t n, const float *x) const
-{
-
-    int dsub = pq.dsub;
-
-    int nbits = pq.nbits;
-
-    std::vector<uint8_t> all_codes (pq.code_size * n);
-
-    pq.compute_codes (x, all_codes.data(), n);
-
-    FAISS_THROW_IF_NOT (pq.nbits == 8);
-
-    if (n == 0)
-        pq.compute_sdc_table ();
-
-#pragma omp parallel for
-    for (int m = 0; m < pq.M; m++) {
-        size_t nq, nb;
-        std::vector <uint32_t> codes; // query codes, then db codes
-        std::vector <float> gt_distances; // nq * nb matrix of distances
-
-        if (n > 0) {
-            std::vector<float> xtrain (n * dsub);
-            for (int i = 0; i < n; i++)
-                memcpy (xtrain.data() + i * dsub,
-                        x + i * pq.d + m * dsub,
-                        sizeof(float) * dsub);
-
-            codes.resize (n);
-            for (int i = 0; i < n; i++)
-                codes [i] = all_codes [i * pq.code_size + m];
-
-            nq = n / 4; nb = n - nq;
-            const float *xq = xtrain.data();
-            const float *xb = xq + nq * dsub;
-
-            gt_distances.resize (nq * nb);
-
-            pairwise_L2sqr (dsub,
-                            nq, xq,
-                            nb, xb,
-                            gt_distances.data());
-        } else {
-            nq = nb = pq.ksub;
-            codes.resize (2 * nq);
-            for (int i = 0; i < nq; i++)
-                codes[i] = codes [i + nq] = i;
-
-            gt_distances.resize (nq * nb);
-
-            memcpy (gt_distances.data (),
-                    pq.sdc_table.data () + m * nq * nb,
-                    sizeof (float) * nq * nb);
-        }
-
-        double t0 = getmillisecs ();
-
-        PermutationObjective *obj = new RankingScore2 (
-                  nbits, nq, nb,
-                  codes.data(), codes.data() + nq,
-                  gt_distances.data ());
-        ScopeDeleter1<PermutationObjective> del (obj);
-
-        if (verbose > 0) {
-            printf("   m=%d, nq=%ld, nb=%ld, intialize RankingScore "
-                   "in %.3f ms\n",
-                   m, nq, nb, getmillisecs () - t0);
-        }
-
-        SimulatedAnnealingOptimizer optim (obj, *this);
-
-        if (log_pattern.size()) {
-            char fname[256];
-            snprintf (fname, 256, log_pattern.c_str(), m);
-            printf ("opening log file %s\n", fname);
-            optim.logfile = fopen (fname, "w");
-            FAISS_THROW_IF_NOT_FMT (optim.logfile,
-                                    "could not open logfile %s", fname);
-        }
-
-        std::vector<int> perm (pq.ksub);
-
-        double final_cost = optim.run_optimization (perm.data());
-        printf ("SimulatedAnnealingOptimizer for m=%d: %g -> %g\n",
-                m, optim.init_cost, final_cost);
-
-        if (log_pattern.size()) fclose (optim.logfile);
-
-        float * centroids = pq.get_centroids (m, 0);
-
-        std::vector<float> centroids_copy;
-        for (int i = 0; i < dsub * pq.ksub; i++)
-            centroids_copy.push_back (centroids[i]);
-
-        for (int i = 0; i < pq.ksub; i++)
-            memcpy (centroids + perm[i] * dsub,
-                    centroids_copy.data() + i * dsub,
-                    dsub * sizeof(centroids[0]));
-
-    }
-
-}
-
-
-
-void PolysemousTraining::optimize_pq_for_hamming (ProductQuantizer &pq,
-                                                size_t n, const float *x) const
-{
-    if (optimization_type == OT_None) {
-
-    } else if (optimization_type == OT_ReproduceDistances_affine) {
-        optimize_reproduce_distances (pq);
-    } else {
-        optimize_ranking (pq, n, x);
-    }
-
-    pq.compute_sdc_table ();
-
-}
-
-
-} // namespace faiss
diff --git a/PolysemousTraining.h b/PolysemousTraining.h
deleted file mode 100644
index ada8512941..0000000000
--- a/PolysemousTraining.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#ifndef FAISS_POLYSEMOUS_TRAINING_INCLUDED
-#define FAISS_POLYSEMOUS_TRAINING_INCLUDED
-
-
-#include "ProductQuantizer.h"
-
-
-namespace faiss {
-
-
-/// parameters used for the simulated annealing method
-struct SimulatedAnnealingParameters {
-
-    // optimization parameters
-    double init_temperature;   // init probaility of accepting a bad swap
-    double temperature_decay;  // at each iteration the temp is multiplied by this
-    int n_iter; // nb of iterations
-    int n_redo; // nb of runs of the simulation
-    int seed;   // random seed
-    int verbose;
-    bool only_bit_flips; // restrict permutation changes to bit flips
-    bool init_random; // intialize with a random permutation (not identity)
-
-    // set reasonable defaults
-    SimulatedAnnealingParameters ();
-
-};
-
-
-/// abstract class for the loss function
-struct PermutationObjective {
-
-    int n;
-
-    virtual double compute_cost (const int *perm) const = 0;
-
-    // what would the cost update be if iw and jw were swapped?
-    // default implementation just computes both and computes the difference
-    virtual double cost_update (const int *perm, int iw, int jw) const;
-
-    virtual ~PermutationObjective () {}
-};
-
-
-struct ReproduceDistancesObjective : PermutationObjective {
-
-    double dis_weight_factor;
-
-    static double sqr (double x) { return x * x; }
-
-    // weihgting of distances: it is more important to reproduce small
-    // distances well
-    double dis_weight (double x) const;
-
-    std::vector<double> source_dis; ///< "real" corrected distances (size n^2)
-    const double *      target_dis; ///< wanted distances (size n^2)
-    std::vector<double> weights;    ///< weights for each distance (size n^2)
-
-    double get_source_dis (int i, int j) const;
-
-    // cost = quadratic difference between actual distance and Hamming distance
-    double compute_cost(const int* perm) const override;
-
-    // what would the cost update be if iw and jw were swapped?
-    // computed in O(n) instead of O(n^2) for the full re-computation
-    double cost_update(const int* perm, int iw, int jw) const override;
-
-    ReproduceDistancesObjective (
-           int n,
-           const double *source_dis_in,
-           const double *target_dis_in,
-           double dis_weight_factor);
-
-    static void compute_mean_stdev (const double *tab, size_t n2,
-                                    double *mean_out, double *stddev_out);
-
-    void set_affine_target_dis (const double *source_dis_in);
-
-    ~ReproduceDistancesObjective() override {}
-};
-
-struct RandomGenerator;
-
-/// Simulated annealing optimization algorithm for permutations.
- struct SimulatedAnnealingOptimizer: SimulatedAnnealingParameters {
-
-    PermutationObjective *obj;
-    int n;         ///< size of the permutation
-    FILE *logfile; /// logs values of the cost function
-
-    SimulatedAnnealingOptimizer (PermutationObjective *obj,
-                                 const SimulatedAnnealingParameters &p);
-    RandomGenerator *rnd;
-
-    /// remember intial cost of optimization
-    double init_cost;
-
-    // main entry point. Perform the optimization loop, starting from
-    // and modifying permutation in-place
-    double optimize (int *perm);
-
-    // run the optimization and return the best result in best_perm
-    double run_optimization (int * best_perm);
-
-    virtual ~SimulatedAnnealingOptimizer ();
-};
-
-
-
-
-/// optimizes the order of indices in a ProductQuantizer
-struct PolysemousTraining: SimulatedAnnealingParameters {
-
-    enum Optimization_type_t {
-        OT_None,
-        OT_ReproduceDistances_affine,  ///< default
-        OT_Ranking_weighted_diff  /// same as _2, but use rank of y+ - rank of y-
-    };
-    Optimization_type_t optimization_type;
-
-    // use 1/4 of the training points for the optimization, with
-    // max. ntrain_permutation. If ntrain_permutation == 0: train on
-    // centroids
-    int ntrain_permutation;
-    double dis_weight_factor; // decay of exp that weights distance loss
-
-    // filename pattern for the logging of iterations
-    std::string log_pattern;
-
-    // sets default values
-    PolysemousTraining ();
-
-    /// reorder the centroids so that the Hamming distace becomes a
-    /// good approximation of the SDC distance (called by train)
-    void optimize_pq_for_hamming (ProductQuantizer & pq,
-                                  size_t n, const float *x) const;
-
-    /// called by optimize_pq_for_hamming
-    void optimize_ranking (ProductQuantizer &pq, size_t n, const float *x) const;
-    /// called by optimize_pq_for_hamming
-    void optimize_reproduce_distances (ProductQuantizer &pq) const;
-
-};
-
-
-} // namespace faiss
-
-
-#endif
diff --git a/ProductQuantizer.cpp b/ProductQuantizer.cpp
deleted file mode 100644
index 2b709fe3d8..0000000000
--- a/ProductQuantizer.cpp
+++ /dev/null
@@ -1,876 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "ProductQuantizer.h"
-
-
-#include <cstddef>
-#include <cstring>
-#include <cstdio>
-#include <memory>
-
-#include <algorithm>
-
-#include "FaissAssert.h"
-#include "VectorTransform.h"
-#include "IndexFlat.h"
-#include "utils.h"
-
-
-extern "C" {
-
-/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
-
-int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
-            n, FINTEGER *k, const float *alpha, const float *a,
-            FINTEGER *lda, const float *b, FINTEGER *
-            ldb, float *beta, float *c, FINTEGER *ldc);
-
-}
-
-
-namespace faiss {
-
-
-/* compute an estimator using look-up tables for typical values of M */
-template <typename CT, class C>
-void pq_estimators_from_tables_Mmul4 (int M, const CT * codes,
-                                      size_t ncodes,
-                                      const float * __restrict dis_table,
-                                      size_t ksub,
-                                      size_t k,
-                                      float * heap_dis,
-                                      int64_t * heap_ids)
-{
-
-    for (size_t j = 0; j < ncodes; j++) {
-        float dis = 0;
-        const float *dt = dis_table;
-
-        for (size_t m = 0; m < M; m+=4) {
-            float dism = 0;
-            dism  = dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
-            dism += dt[*codes++]; dt += ksub;
-            dis += dism;
-        }
-
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_pop<C> (k, heap_dis, heap_ids);
-            heap_push<C> (k, heap_dis, heap_ids, dis, j);
-        }
-    }
-}
-
-
-template <typename CT, class C>
-void pq_estimators_from_tables_M4 (const CT * codes,
-                                   size_t ncodes,
-                                   const float * __restrict dis_table,
-                                   size_t ksub,
-                                   size_t k,
-                                   float * heap_dis,
-                                   int64_t * heap_ids)
-{
-
-    for (size_t j = 0; j < ncodes; j++) {
-        float dis = 0;
-        const float *dt = dis_table;
-        dis  = dt[*codes++]; dt += ksub;
-        dis += dt[*codes++]; dt += ksub;
-        dis += dt[*codes++]; dt += ksub;
-        dis += dt[*codes++];
-
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_pop<C> (k, heap_dis, heap_ids);
-            heap_push<C> (k, heap_dis, heap_ids, dis, j);
-        }
-    }
-}
-
-
-template <typename CT, class C>
-static inline void pq_estimators_from_tables (const ProductQuantizer& pq,
-                                              const CT * codes,
-                                              size_t ncodes,
-                                              const float * dis_table,
-                                              size_t k,
-                                              float * heap_dis,
-                                              int64_t * heap_ids)
-{
-
-    if (pq.M == 4)  {
-
-        pq_estimators_from_tables_M4<CT, C> (codes, ncodes,
-                                             dis_table, pq.ksub, k,
-                                             heap_dis, heap_ids);
-        return;
-    }
-
-    if (pq.M % 4 == 0) {
-        pq_estimators_from_tables_Mmul4<CT, C> (pq.M, codes, ncodes,
-                                                dis_table, pq.ksub, k,
-                                                heap_dis, heap_ids);
-        return;
-    }
-
-    /* Default is relatively slow */
-    const size_t M = pq.M;
-    const size_t ksub = pq.ksub;
-    for (size_t j = 0; j < ncodes; j++) {
-        float dis = 0;
-        const float * __restrict dt = dis_table;
-        for (int m = 0; m < M; m++) {
-            dis += dt[*codes++];
-            dt += ksub;
-        }
-        if (C::cmp (heap_dis[0], dis)) {
-            heap_pop<C> (k, heap_dis, heap_ids);
-            heap_push<C> (k, heap_dis, heap_ids, dis, j);
-        }
-    }
-}
-
-template <class C>
-static inline void pq_estimators_from_tables_generic(const ProductQuantizer& pq,
-                                                     size_t nbits,
-                                                     const uint8_t *codes,
-                                                     size_t ncodes,
-                                                     const float *dis_table,
-                                                     size_t k,
-                                                     float *heap_dis,
-                                                     int64_t *heap_ids)
-{
-  const size_t M = pq.M;
-  const size_t ksub = pq.ksub;
-  for (size_t j = 0; j < ncodes; ++j) {
-    faiss::ProductQuantizer::PQDecoderGeneric decoder(
-      codes + j * pq.code_size, nbits
-    );
-    float dis = 0;
-    const float * __restrict dt = dis_table;
-    for (size_t m = 0; m < M; m++) {
-      uint64_t c = decoder.decode();
-      dis += dt[c];
-      dt += ksub;
-    }
-
-    if (C::cmp(heap_dis[0], dis)) {
-      heap_pop<C>(k, heap_dis, heap_ids);
-      heap_push<C>(k, heap_dis, heap_ids, dis, j);
-    }
-  }
-}
-
-/*********************************************
- * PQ implementation
- *********************************************/
-
-
-
-ProductQuantizer::ProductQuantizer (size_t d, size_t M, size_t nbits):
-    d(d), M(M), nbits(nbits), assign_index(nullptr)
-{
-    set_derived_values ();
-}
-
-ProductQuantizer::ProductQuantizer ()
-    : ProductQuantizer(0, 1, 0) {}
-
-void ProductQuantizer::set_derived_values () {
-    // quite a few derived values
-    FAISS_THROW_IF_NOT (d % M == 0);
-    dsub = d / M;
-    code_size = (nbits * M + 7) / 8;
-    ksub = 1 << nbits;
-    centroids.resize (d * ksub);
-    verbose = false;
-    train_type = Train_default;
-}
-
-void ProductQuantizer::set_params (const float * centroids_, int m)
-{
-  memcpy (get_centroids(m, 0), centroids_,
-            ksub * dsub * sizeof (centroids_[0]));
-}
-
-
-static void init_hypercube (int d, int nbits,
-                            int n, const float * x,
-                            float *centroids)
-{
-
-    std::vector<float> mean (d);
-    for (int i = 0; i < n; i++)
-        for (int j = 0; j < d; j++)
-            mean [j] += x[i * d + j];
-
-    float maxm = 0;
-    for (int j = 0; j < d; j++) {
-        mean [j] /= n;
-        if (fabs(mean[j]) > maxm) maxm = fabs(mean[j]);
-    }
-
-    for (int i = 0; i < (1 << nbits); i++) {
-        float * cent = centroids + i * d;
-        for (int j = 0; j < nbits; j++)
-            cent[j] = mean [j] + (((i >> j) & 1) ? 1 : -1) * maxm;
-        for (int j = nbits; j < d; j++)
-            cent[j] = mean [j];
-    }
-
-
-}
-
-static void init_hypercube_pca (int d, int nbits,
-                                int n, const float * x,
-                                float *centroids)
-{
-    PCAMatrix pca (d, nbits);
-    pca.train (n, x);
-
-
-    for (int i = 0; i < (1 << nbits); i++) {
-        float * cent = centroids + i * d;
-        for (int j = 0; j < d; j++) {
-            cent[j] = pca.mean[j];
-            float f = 1.0;
-            for (int k = 0; k < nbits; k++)
-                cent[j] += f *
-                    sqrt (pca.eigenvalues [k]) *
-                    (((i >> k) & 1) ? 1 : -1) *
-                    pca.PCAMat [j + k * d];
-        }
-    }
-
-}
-
-void ProductQuantizer::train (int n, const float * x)
-{
-    if (train_type != Train_shared) {
-        train_type_t final_train_type;
-        final_train_type = train_type;
-        if (train_type == Train_hypercube ||
-            train_type == Train_hypercube_pca) {
-            if (dsub < nbits) {
-                final_train_type = Train_default;
-                printf ("cannot train hypercube: nbits=%ld > log2(d=%ld)\n",
-                        nbits, dsub);
-            }
-        }
-
-        float * xslice = new float[n * dsub];
-        ScopeDeleter<float> del (xslice);
-        for (int m = 0; m < M; m++) {
-            for (int j = 0; j < n; j++)
-                memcpy (xslice + j * dsub,
-                        x + j * d + m * dsub,
-                        dsub * sizeof(float));
-
-            Clustering clus (dsub, ksub, cp);
-
-            // we have some initialization for the centroids
-            if (final_train_type != Train_default) {
-                clus.centroids.resize (dsub * ksub);
-            }
-
-            switch (final_train_type) {
-            case Train_hypercube:
-                init_hypercube (dsub, nbits, n, xslice,
-                                clus.centroids.data ());
-                break;
-            case  Train_hypercube_pca:
-                init_hypercube_pca (dsub, nbits, n, xslice,
-                                    clus.centroids.data ());
-                break;
-            case  Train_hot_start:
-                memcpy (clus.centroids.data(),
-                        get_centroids (m, 0),
-                        dsub * ksub * sizeof (float));
-                break;
-            default: ;
-            }
-
-            if(verbose) {
-                clus.verbose = true;
-                printf ("Training PQ slice %d/%zd\n", m, M);
-            }
-            IndexFlatL2 index (dsub);
-            clus.train (n, xslice, assign_index ? *assign_index : index);
-            set_params (clus.centroids.data(), m);
-        }
-
-
-    } else {
-
-        Clustering clus (dsub, ksub, cp);
-
-        if(verbose) {
-            clus.verbose = true;
-            printf ("Training all PQ slices at once\n");
-        }
-
-        IndexFlatL2 index (dsub);
-
-        clus.train (n * M, x, assign_index ? *assign_index : index);
-        for (int m = 0; m < M; m++) {
-            set_params (clus.centroids.data(), m);
-        }
-
-    }
-}
-
-template<class PQEncoder>
-void compute_code(const ProductQuantizer& pq, const float *x, uint8_t *code) {
-  float distances [pq.ksub];
-  PQEncoder encoder(code, pq.nbits);
-  for (size_t m = 0; m < pq.M; m++) {
-    float mindis = 1e20;
-    uint64_t idxm = 0;
-    const float * xsub = x + m * pq.dsub;
-
-    fvec_L2sqr_ny(distances, xsub, pq.get_centroids(m, 0), pq.dsub, pq.ksub);
-
-    /* Find best centroid */
-    for (size_t i = 0; i < pq.ksub; i++) {
-      float dis = distances[i];
-      if (dis < mindis) {
-        mindis = dis;
-        idxm = i;
-      }
-    }
-
-    encoder.encode(idxm);
-  }
-}
-
-void ProductQuantizer::compute_code(const float * x, uint8_t * code) const {
-  switch (nbits) {
-    case 8:
-      faiss::compute_code<PQEncoder8>(*this, x, code);
-      break;
-
-    case 16:
-      faiss::compute_code<PQEncoder16>(*this, x, code);
-      break;
-
-    default:
-      faiss::compute_code<PQEncoderGeneric>(*this, x, code);
-      break;
-  }
-}
-
-template<class PQDecoder>
-void decode(const ProductQuantizer& pq, const uint8_t *code, float *x)
-{
-  PQDecoder decoder(code, pq.nbits);
-  for (size_t m = 0; m < pq.M; m++) {
-    uint64_t c = decoder.decode();
-    memcpy(x + m * pq.dsub, pq.get_centroids(m, c), sizeof(float) * pq.dsub);
-  }
-}
-
-void ProductQuantizer::decode (const uint8_t *code, float *x) const
-{
-  switch (nbits) {
-    case 8:
-      faiss::decode<PQDecoder8>(*this, code, x);
-      break;
-
-    case 16:
-      faiss::decode<PQDecoder16>(*this, code, x);
-      break;
-
-    default:
-      faiss::decode<PQDecoderGeneric>(*this, code, x);
-      break;
-  }
-}
-
-
-void ProductQuantizer::decode (const uint8_t *code, float *x, size_t n) const
-{
-    for (size_t i = 0; i < n; i++) {
-        this->decode (code + code_size * i, x + d * i);
-    }
-}
-
-
-void ProductQuantizer::compute_code_from_distance_table (const float *tab,
-                                                         uint8_t *code) const
-{
-  PQEncoderGeneric encoder(code, nbits);
-  for (size_t m = 0; m < M; m++) {
-    float mindis = 1e20;
-    uint64_t idxm = 0;
-
-    /* Find best centroid */
-    for (size_t j = 0; j < ksub; j++) {
-      float dis = *tab++;
-      if (dis < mindis) {
-        mindis = dis;
-        idxm = j;
-      }
-    }
-
-    encoder.encode(idxm);
-  }
-}
-
-void ProductQuantizer::compute_codes_with_assign_index (
-                const float * x,
-                uint8_t * codes,
-                size_t n)
-{
-    FAISS_THROW_IF_NOT (assign_index && assign_index->d == dsub);
-
-    for (size_t m = 0; m < M; m++) {
-        assign_index->reset ();
-        assign_index->add (ksub, get_centroids (m, 0));
-        size_t bs = 65536;
-        float * xslice = new float[bs * dsub];
-        ScopeDeleter<float> del (xslice);
-        idx_t *assign = new idx_t[bs];
-        ScopeDeleter<idx_t> del2 (assign);
-
-        for (size_t i0 = 0; i0 < n; i0 += bs) {
-            size_t i1 = std::min(i0 + bs, n);
-
-            for (size_t i = i0; i < i1; i++) {
-                memcpy (xslice + (i - i0) * dsub,
-                        x + i * d + m * dsub,
-                        dsub * sizeof(float));
-            }
-
-            assign_index->assign (i1 - i0, xslice, assign);
-
-            if (nbits == 8) {
-              uint8_t *c = codes + code_size * i0 + m;
-              for (size_t i = i0; i < i1; i++) {
-                *c = assign[i - i0];
-                c += M;
-              }
-            } else if (nbits == 16) {
-              uint16_t *c = (uint16_t*)(codes + code_size * i0 + m * 2);
-              for (size_t i = i0; i < i1; i++) {
-                *c = assign[i - i0];
-                c += M;
-              }
-            } else {
-              for (size_t i = i0; i < i1; ++i) {
-                uint8_t *c = codes + code_size * i + ((m * nbits) / 8);
-                uint8_t offset = (m * nbits) % 8;
-                uint64_t ass = assign[i - i0];
-
-                PQEncoderGeneric encoder(c, nbits, offset);
-                encoder.encode(ass);
-              }
-            }
-
-        }
-    }
-
-}
-
-void ProductQuantizer::compute_codes (const float * x,
-                                      uint8_t * codes,
-                                      size_t n)  const
-{
-  // process by blocks to avoid using too much RAM
-    size_t bs = 256 * 1024;
-    if (n > bs) {
-        for (size_t i0 = 0; i0 < n; i0 += bs) {
-            size_t i1 = std::min(i0 + bs, n);
-            compute_codes (x + d * i0, codes + code_size * i0, i1 - i0);
-        }
-        return;
-    }
-
-    if (dsub < 16) { // simple direct computation
-
-#pragma omp parallel for
-        for (size_t i = 0; i < n; i++)
-            compute_code (x + i * d, codes + i * code_size);
-
-    } else { // worthwile to use BLAS
-        float *dis_tables = new float [n * ksub * M];
-        ScopeDeleter<float> del (dis_tables);
-        compute_distance_tables (n, x, dis_tables);
-
-#pragma omp parallel for
-        for (size_t i = 0; i < n; i++) {
-            uint8_t * code = codes + i * code_size;
-            const float * tab = dis_tables + i * ksub * M;
-            compute_code_from_distance_table (tab, code);
-        }
-    }
-}
-
-
-void ProductQuantizer::compute_distance_table (const float * x,
-                                               float * dis_table) const
-{
-    size_t m;
-
-    for (m = 0; m < M; m++) {
-        fvec_L2sqr_ny (dis_table + m * ksub,
-                       x + m * dsub,
-                       get_centroids(m, 0),
-                       dsub,
-                       ksub);
-    }
-}
-
-void ProductQuantizer::compute_inner_prod_table (const float * x,
-                                                 float * dis_table) const
-{
-    size_t m;
-
-    for (m = 0; m < M; m++) {
-        fvec_inner_products_ny (dis_table + m * ksub,
-                                x + m * dsub,
-                                get_centroids(m, 0),
-                                dsub,
-                                ksub);
-    }
-}
-
-
-void ProductQuantizer::compute_distance_tables (
-           size_t nx,
-           const float * x,
-           float * dis_tables) const
-{
-
-    if (dsub < 16) {
-
-#pragma omp parallel for
-        for (size_t i = 0; i < nx; i++) {
-            compute_distance_table (x + i * d, dis_tables + i * ksub * M);
-        }
-
-    } else { // use BLAS
-
-        for (int m = 0; m < M; m++) {
-            pairwise_L2sqr (dsub,
-                            nx, x + dsub * m,
-                            ksub, centroids.data() + m * dsub * ksub,
-                            dis_tables + ksub * m,
-                            d, dsub, ksub * M);
-        }
-    }
-}
-
-void ProductQuantizer::compute_inner_prod_tables (
-           size_t nx,
-           const float * x,
-           float * dis_tables) const
-{
-
-    if (dsub < 16) {
-
-#pragma omp parallel for
-        for (size_t i = 0; i < nx; i++) {
-            compute_inner_prod_table (x + i * d, dis_tables + i * ksub * M);
-        }
-
-    } else { // use BLAS
-
-        // compute distance tables
-        for (int m = 0; m < M; m++) {
-            FINTEGER ldc = ksub * M, nxi = nx, ksubi = ksub,
-                dsubi = dsub, di = d;
-            float one = 1.0, zero = 0;
-
-            sgemm_ ("Transposed", "Not transposed",
-                    &ksubi, &nxi, &dsubi,
-                    &one, &centroids [m * dsub * ksub], &dsubi,
-                    x + dsub * m, &di,
-                    &zero, dis_tables + ksub * m, &ldc);
-        }
-
-    }
-}
-
-template <class C>
-static void pq_knn_search_with_tables (
-      const ProductQuantizer& pq,
-      size_t nbits,
-      const float *dis_tables,
-      const uint8_t * codes,
-      const size_t ncodes,
-      HeapArray<C> * res,
-      bool init_finalize_heap)
-{
-    size_t k = res->k, nx = res->nh;
-    size_t ksub = pq.ksub, M = pq.M;
-
-
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++) {
-        /* query preparation for asymmetric search: compute look-up tables */
-        const float* dis_table = dis_tables + i * ksub * M;
-
-        /* Compute distances and keep smallest values */
-        int64_t * __restrict heap_ids = res->ids + i * k;
-        float * __restrict heap_dis = res->val + i * k;
-
-        if (init_finalize_heap) {
-            heap_heapify<C> (k, heap_dis, heap_ids);
-        }
-
-        switch (nbits) {
-          case 8:
-              pq_estimators_from_tables<uint8_t, C> (pq,
-                                                     codes, ncodes,
-                                                     dis_table,
-                                                     k, heap_dis, heap_ids);
-              break;
-
-          case 16:
-              pq_estimators_from_tables<uint16_t, C> (pq,
-                                                      (uint16_t*)codes, ncodes,
-                                                      dis_table,
-                                                      k, heap_dis, heap_ids);
-              break;
-
-          default:
-              pq_estimators_from_tables_generic<C> (pq,
-                                                    nbits,
-                                                    codes, ncodes,
-                                                    dis_table,
-                                                    k, heap_dis, heap_ids);
-              break;
-        }
-
-        if (init_finalize_heap) {
-            heap_reorder<C> (k, heap_dis, heap_ids);
-        }
-    }
-}
-
-void ProductQuantizer::search (const float * __restrict x,
-                               size_t nx,
-                               const uint8_t * codes,
-                               const size_t ncodes,
-                               float_maxheap_array_t * res,
-                               bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (nx == res->nh);
-    std::unique_ptr<float[]> dis_tables(new float [nx * ksub * M]);
-    compute_distance_tables (nx, x, dis_tables.get());
-
-    pq_knn_search_with_tables<CMax<float, int64_t>> (
-      *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap);
-}
-
-void ProductQuantizer::search_ip (const float * __restrict x,
-                               size_t nx,
-                               const uint8_t * codes,
-                               const size_t ncodes,
-                               float_minheap_array_t * res,
-                               bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (nx == res->nh);
-    std::unique_ptr<float[]> dis_tables(new float [nx * ksub * M]);
-    compute_inner_prod_tables (nx, x, dis_tables.get());
-
-    pq_knn_search_with_tables<CMin<float, int64_t> > (
-      *this, nbits, dis_tables.get(), codes, ncodes, res, init_finalize_heap);
-}
-
-
-
-static float sqr (float x) {
-    return x * x;
-}
-
-void ProductQuantizer::compute_sdc_table ()
-{
-    sdc_table.resize (M * ksub * ksub);
-
-    for (int m = 0; m < M; m++) {
-
-        const float *cents = centroids.data() + m * ksub * dsub;
-        float * dis_tab = sdc_table.data() + m * ksub * ksub;
-
-        // TODO optimize with BLAS
-        for (int i = 0; i < ksub; i++) {
-            const float *centi = cents + i * dsub;
-            for (int j = 0; j < ksub; j++) {
-                float accu = 0;
-                const float *centj = cents + j * dsub;
-                for (int k = 0; k < dsub; k++)
-                    accu += sqr (centi[k] - centj[k]);
-                dis_tab [i + j * ksub] = accu;
-            }
-        }
-    }
-}
-
-void ProductQuantizer::search_sdc (const uint8_t * qcodes,
-                     size_t nq,
-                     const uint8_t * bcodes,
-                     const size_t nb,
-                     float_maxheap_array_t * res,
-                     bool init_finalize_heap) const
-{
-    FAISS_THROW_IF_NOT (sdc_table.size() == M * ksub * ksub);
-    FAISS_THROW_IF_NOT (nbits == 8);
-    size_t k = res->k;
-
-
-#pragma omp parallel for
-    for (size_t i = 0; i < nq; i++) {
-
-        /* Compute distances and keep smallest values */
-        idx_t * heap_ids = res->ids + i * k;
-        float *  heap_dis = res->val + i * k;
-        const uint8_t * qcode = qcodes + i * code_size;
-
-        if (init_finalize_heap)
-            maxheap_heapify (k, heap_dis, heap_ids);
-
-        const uint8_t * bcode = bcodes;
-        for (size_t j = 0; j < nb; j++) {
-            float dis = 0;
-            const float * tab = sdc_table.data();
-            for (int m = 0; m < M; m++) {
-                dis += tab[bcode[m] + qcode[m] * ksub];
-                tab += ksub * ksub;
-            }
-            if (dis < heap_dis[0]) {
-                maxheap_pop (k, heap_dis, heap_ids);
-                maxheap_push (k, heap_dis, heap_ids, dis, j);
-            }
-            bcode += code_size;
-        }
-
-        if (init_finalize_heap)
-            maxheap_reorder (k, heap_dis, heap_ids);
-    }
-
-}
-
-
-ProductQuantizer::PQEncoderGeneric::PQEncoderGeneric(uint8_t *code, int nbits,
-                                                     uint8_t offset)
-    : code(code), offset(offset), nbits(nbits), reg(0) {
-  assert(nbits <= 64);
-  if (offset > 0) {
-    reg = (*code & ((1 << offset) - 1));
-  }
-}
-
-void ProductQuantizer::PQEncoderGeneric::encode(uint64_t x) {
-  reg |= (uint8_t)(x << offset);
-  x >>= (8 - offset);
-  if (offset + nbits >= 8) {
-    *code++ = reg;
-
-    for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) {
-      *code++ = (uint8_t)x;
-      x >>= 8;
-    }
-
-    offset += nbits;
-    offset &= 7;
-    reg = (uint8_t)x;
-  } else {
-    offset += nbits;
-  }
-}
-
-ProductQuantizer::PQEncoderGeneric::~PQEncoderGeneric() {
-  if (offset > 0) {
-    *code = reg;
-  }
-}
-
-
-ProductQuantizer::PQEncoder8::PQEncoder8(uint8_t *code, int nbits)
-    : code(code) {
-  assert(8 == nbits);
-}
-
-void ProductQuantizer::PQEncoder8::encode(uint64_t x) {
-  *code++ = (uint8_t)x;
-}
-
-
-ProductQuantizer::PQEncoder16::PQEncoder16(uint8_t *code, int nbits)
-    : code((uint16_t *)code) {
-  assert(16 == nbits);
-}
-
-void ProductQuantizer::PQEncoder16::encode(uint64_t x) {
-  *code++ = (uint16_t)x;
-}
-
-
-ProductQuantizer::PQDecoderGeneric::PQDecoderGeneric(const uint8_t *code,
-                                                     int nbits)
-    : code(code),
-      offset(0),
-      nbits(nbits),
-      mask((1ull << nbits) - 1),
-      reg(0) {
-  assert(nbits <= 64);
-}
-
-uint64_t ProductQuantizer::PQDecoderGeneric::decode() {
-  if (offset == 0) {
-    reg = *code;
-  }
-  uint64_t c = (reg >> offset);
-
-  if (offset + nbits >= 8) {
-    uint64_t e = 8 - offset;
-    ++code;
-    for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) {
-      c |= ((uint64_t)(*code++) << e);
-      e += 8;
-    }
-
-    offset += nbits;
-    offset &= 7;
-    if (offset > 0) {
-      reg = *code;
-      c |= ((uint64_t)reg << e);
-    }
-  } else {
-    offset += nbits;
-  }
-
-  return c & mask;
-}
-
-
-ProductQuantizer::PQDecoder8::PQDecoder8(const uint8_t *code, int nbits)
-    : code(code) {
-  assert(8 == nbits);
-}
-
-uint64_t ProductQuantizer::PQDecoder8::decode() {
-  return (uint64_t)(*code++);
-}
-
-
-ProductQuantizer::PQDecoder16::PQDecoder16(const uint8_t *code, int nbits)
-    : code((uint16_t *)code) {
-  assert(16 == nbits);
-}
-
-uint64_t ProductQuantizer::PQDecoder16::decode() {
-  return (uint64_t)(*code++);
-}
-
-
-}  // namespace faiss
diff --git a/ProductQuantizer.h b/ProductQuantizer.h
deleted file mode 100644
index 0c3cc9eb5e..0000000000
--- a/ProductQuantizer.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#ifndef FAISS_PRODUCT_QUANTIZER_H
-#define FAISS_PRODUCT_QUANTIZER_H
-
-#include <stdint.h>
-
-#include <vector>
-
-#include "Clustering.h"
-#include "Heap.h"
-
-namespace faiss {
-
-/** Product Quantizer. Implemented only for METRIC_L2 */
-struct ProductQuantizer {
-
-    using idx_t = Index::idx_t;
-
-    size_t d;              ///< size of the input vectors
-    size_t M;              ///< number of subquantizers
-    size_t nbits;          ///< number of bits per quantization index
-
-    // values derived from the above
-    size_t dsub;           ///< dimensionality of each subvector
-    size_t code_size;      ///< byte per indexed vector
-    size_t ksub;           ///< number of centroids for each subquantizer
-    bool verbose;          ///< verbose during training?
-
-    /// initialization
-    enum train_type_t {
-        Train_default,
-        Train_hot_start,   ///< the centroids are already initialized
-        Train_shared,      ///< share dictionary accross PQ segments
-        Train_hypercube,   ///< intialize centroids with nbits-D hypercube
-        Train_hypercube_pca,   ///< intialize centroids with nbits-D hypercube
-    };
-    train_type_t train_type;
-
-    ClusteringParameters cp; ///< parameters used during clustering
-
-    /// if non-NULL, use this index for assignment (should be of size
-    /// d / M)
-    Index *assign_index;
-
-    /// Centroid table, size M * ksub * dsub
-    std::vector<float> centroids;
-
-    /// return the centroids associated with subvector m
-    float * get_centroids (size_t m, size_t i) {
-        return &centroids [(m * ksub + i) * dsub];
-    }
-    const float * get_centroids (size_t m, size_t i) const {
-        return &centroids [(m * ksub + i) * dsub];
-    }
-
-    // Train the product quantizer on a set of points. A clustering
-    // can be set on input to define non-default clustering parameters
-    void train (int n, const float *x);
-
-    ProductQuantizer(size_t d, /* dimensionality of the input vectors */
-            size_t M,          /* number of subquantizers */
-            size_t nbits);     /* number of bit per subvector index */
-
-    ProductQuantizer ();
-
-    /// compute derived values when d, M and nbits have been set
-    void set_derived_values ();
-
-    /// Define the centroids for subquantizer m
-    void set_params (const float * centroids, int m);
-
-    /// Quantize one vector with the product quantizer
-    void compute_code (const float * x, uint8_t * code) const ;
-
-    /// same as compute_code for several vectors
-    void compute_codes (const float * x,
-                        uint8_t * codes,
-                        size_t n) const ;
-
-    /// speed up code assignment using assign_index
-    /// (non-const because the index is changed)
-    void compute_codes_with_assign_index (
-                const float * x,
-                uint8_t * codes,
-                size_t n);
-
-    /// decode a vector from a given code (or n vectors if third argument)
-    void decode (const uint8_t *code, float *x) const;
-    void decode (const uint8_t *code, float *x, size_t n) const;
-
-    /// If we happen to have the distance tables precomputed, this is
-    /// more efficient to compute the codes.
-    void compute_code_from_distance_table (const float *tab,
-                                           uint8_t *code) const;
-
-
-    /** Compute distance table for one vector.
-     *
-     * The distance table for x = [x_0 x_1 .. x_(M-1)] is a M * ksub
-     * matrix that contains
-     *
-     *   dis_table (m, j) = || x_m - c_(m, j)||^2
-     *   for m = 0..M-1 and j = 0 .. ksub - 1
-     *
-     * where c_(m, j) is the centroid no j of sub-quantizer m.
-     *
-     * @param x         input vector size d
-     * @param dis_table output table, size M * ksub
-     */
-    void compute_distance_table (const float * x,
-                                 float * dis_table) const;
-
-    void compute_inner_prod_table (const float * x,
-                                   float * dis_table) const;
-
-
-    /** compute distance table for several vectors
-     * @param nx        nb of input vectors
-     * @param x         input vector size nx * d
-     * @param dis_table output table, size nx * M * ksub
-     */
-    void compute_distance_tables (size_t nx,
-                                  const float * x,
-                                  float * dis_tables) const;
-
-    void compute_inner_prod_tables (size_t nx,
-                                    const float * x,
-                                    float * dis_tables) const;
-
-
-    /** perform a search (L2 distance)
-     * @param x        query vectors, size nx * d
-     * @param nx       nb of queries
-     * @param codes    database codes, size ncodes * code_size
-     * @param ncodes   nb of nb vectors
-     * @param res      heap array to store results (nh == nx)
-     * @param init_finalize_heap  initialize heap (input) and sort (output)?
-     */
-    void search (const float * x,
-                 size_t nx,
-                 const uint8_t * codes,
-                 const size_t ncodes,
-                 float_maxheap_array_t *res,
-                 bool init_finalize_heap = true) const;
-
-    /** same as search, but with inner product similarity */
-    void search_ip (const float * x,
-                 size_t nx,
-                 const uint8_t * codes,
-                 const size_t ncodes,
-                 float_minheap_array_t *res,
-                 bool init_finalize_heap = true) const;
-
-
-    /// Symmetric Distance Table
-    std::vector<float> sdc_table;
-
-    // intitialize the SDC table from the centroids
-    void compute_sdc_table ();
-
-    void search_sdc (const uint8_t * qcodes,
-                     size_t nq,
-                     const uint8_t * bcodes,
-                     const size_t ncodes,
-                     float_maxheap_array_t * res,
-                     bool init_finalize_heap = true) const;
-
-    struct PQEncoderGeneric {
-        uint8_t *code;   ///< code for this vector
-        uint8_t offset;
-        const int nbits; ///< number of bits per subquantizer index
-
-        uint8_t reg;
-
-        PQEncoderGeneric(uint8_t *code, int nbits, uint8_t offset = 0);
-
-        void encode(uint64_t x);
-
-        ~PQEncoderGeneric();
-    };
-
-
-    struct PQEncoder8 {
-        uint8_t *code;
-
-        PQEncoder8(uint8_t *code, int nbits);
-
-        void encode(uint64_t x);
-    };
-
-    struct PQEncoder16 {
-        uint16_t *code;
-
-        PQEncoder16(uint8_t *code, int nbits);
-
-        void encode(uint64_t x);
-    };
-
-
-    struct PQDecoderGeneric {
-        const uint8_t *code;
-        uint8_t offset;
-        const int nbits;
-        const uint64_t mask;
-        uint8_t reg;
-
-        PQDecoderGeneric(const uint8_t *code, int nbits);
-
-        uint64_t decode();
-    };
-
-    struct PQDecoder8 {
-        const uint8_t *code;
-
-        PQDecoder8(const uint8_t *code, int nbits);
-
-        uint64_t decode();
-    };
-
-    struct PQDecoder16 {
-        const uint16_t *code;
-
-        PQDecoder16(const uint8_t *code, int nbits);
-
-        uint64_t decode();
-    };
-
-};
-
-
-}  // namespace faiss
-
-
-#endif
diff --git a/ThreadedIndex-inl.h b/ThreadedIndex-inl.h
deleted file mode 100644
index 7416fe2c1d..0000000000
--- a/ThreadedIndex-inl.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "FaissAssert.h"
-#include <exception>
-#include <iostream>
-
-namespace faiss {
-
-template <typename IndexT>
-ThreadedIndex<IndexT>::ThreadedIndex(bool threaded)
-    // 0 is default dimension
-    : ThreadedIndex(0, threaded) {
-}
-
-template <typename IndexT>
-ThreadedIndex<IndexT>::ThreadedIndex(int d, bool threaded)
-    : IndexT(d),
-      own_fields(false),
-      isThreaded_(threaded) {
-  }
-
-template <typename IndexT>
-ThreadedIndex<IndexT>::~ThreadedIndex() {
-  for (auto& p : indices_) {
-    if (isThreaded_) {
-      // should have worker thread
-      FAISS_ASSERT((bool) p.second);
-
-      // This will also flush all pending work
-      p.second->stop();
-      p.second->waitForThreadExit();
-    } else {
-      // should not have worker thread
-      FAISS_ASSERT(!(bool) p.second);
-    }
-
-    if (own_fields) {
-      delete p.first;
-    }
-  }
-}
-
-template <typename IndexT>
-void ThreadedIndex<IndexT>::addIndex(IndexT* index) {
-  // We inherit the dimension from the first index added to us if we don't have
-  // a set dimension
-  if (indices_.empty() && this->d == 0) {
-    this->d = index->d;
-  }
-
-  // The new index must match our set dimension
-  FAISS_THROW_IF_NOT_FMT(this->d == index->d,
-                         "addIndex: dimension mismatch for "
-                         "newly added index; expecting dim %d, "
-                         "new index has dim %d",
-                         this->d, index->d);
-
-  if (!indices_.empty()) {
-    auto& existing = indices_.front().first;
-
-    FAISS_THROW_IF_NOT_MSG(index->metric_type == existing->metric_type,
-                           "addIndex: newly added index is "
-                           "of different metric type than old index");
-
-    // Make sure this index is not duplicated
-    for (auto& p : indices_) {
-      FAISS_THROW_IF_NOT_MSG(p.first != index,
-                             "addIndex: attempting to add index "
-                             "that is already in the collection");
-    }
-  }
-
-  indices_.emplace_back(
-    std::make_pair(
-      index,
-      std::unique_ptr<WorkerThread>(isThreaded_ ?
-                                    new WorkerThread : nullptr)));
-
-  onAfterAddIndex(index);
-}
-
-template <typename IndexT>
-void ThreadedIndex<IndexT>::removeIndex(IndexT* index) {
-  for (auto it = indices_.begin(); it != indices_.end(); ++it) {
-    if (it->first == index) {
-      // This is our index; stop the worker thread before removing it,
-      // to ensure that it has finished before function exit
-      if (isThreaded_) {
-        // should have worker thread
-        FAISS_ASSERT((bool) it->second);
-        it->second->stop();
-        it->second->waitForThreadExit();
-      } else {
-        // should not have worker thread
-        FAISS_ASSERT(!(bool) it->second);
-      }
-
-      indices_.erase(it);
-      onAfterRemoveIndex(index);
-
-      if (own_fields) {
-        delete index;
-      }
-
-      return;
-    }
-  }
-
-  // could not find our index
-  FAISS_THROW_MSG("IndexReplicas::removeIndex: index not found");
-}
-
-template <typename IndexT>
-void ThreadedIndex<IndexT>::runOnIndex(std::function<void(int, IndexT*)> f) {
-  if (isThreaded_) {
-    std::vector<std::future<bool>> v;
-
-    for (int i = 0; i < this->indices_.size(); ++i) {
-      auto& p = this->indices_[i];
-      auto indexPtr = p.first;
-      v.emplace_back(p.second->add([f, i, indexPtr](){ f(i, indexPtr); }));
-    }
-
-    waitAndHandleFutures(v);
-  } else {
-    // Multiple exceptions may be thrown; gather them as we encounter them,
-    // while letting everything else run to completion
-    std::vector<std::pair<int, std::exception_ptr>> exceptions;
-
-    for (int i = 0; i < this->indices_.size(); ++i) {
-      auto& p = this->indices_[i];
-      try {
-        f(i, p.first);
-      } catch (...) {
-        exceptions.emplace_back(std::make_pair(i, std::current_exception()));
-      }
-    }
-
-    handleExceptions(exceptions);
-  }
-}
-
-template <typename IndexT>
-void ThreadedIndex<IndexT>::runOnIndex(
-  std::function<void(int, const IndexT*)> f) const {
-  const_cast<ThreadedIndex<IndexT>*>(this)->runOnIndex(
-    [f](int i, IndexT* idx){ f(i, idx); });
-}
-
-template <typename IndexT>
-void ThreadedIndex<IndexT>::reset() {
-  runOnIndex([](int, IndexT* index){ index->reset(); });
-  this->ntotal = 0;
-  this->is_trained = false;
-}
-
-template <typename IndexT>
-void
-ThreadedIndex<IndexT>::onAfterAddIndex(IndexT* index) {
-}
-
-template <typename IndexT>
-void
-ThreadedIndex<IndexT>::onAfterRemoveIndex(IndexT* index) {
-}
-
-template <typename IndexT>
-void
-ThreadedIndex<IndexT>::waitAndHandleFutures(std::vector<std::future<bool>>& v) {
-  // Blocking wait for completion for all of the indices, capturing any
-  // exceptions that are generated
-  std::vector<std::pair<int, std::exception_ptr>> exceptions;
-
-  for (int i = 0; i < v.size(); ++i) {
-    auto& fut = v[i];
-
-    try {
-      fut.get();
-    } catch (...) {
-      exceptions.emplace_back(std::make_pair(i, std::current_exception()));
-    }
-  }
-
-  handleExceptions(exceptions);
-}
-
-} // namespace
diff --git a/ThreadedIndex.h b/ThreadedIndex.h
deleted file mode 100644
index 2e6632a72f..0000000000
--- a/ThreadedIndex.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include "Index.h"
-#include "IndexBinary.h"
-#include "WorkerThread.h"
-#include <memory>
-#include <vector>
-
-namespace faiss {
-
-/// A holder of indices in a collection of threads
-/// The interface to this class itself is not thread safe
-template <typename IndexT>
-class ThreadedIndex : public IndexT {
- public:
-  explicit ThreadedIndex(bool threaded);
-  explicit ThreadedIndex(int d, bool threaded);
-
-  ~ThreadedIndex() override;
-
-  /// override an index that is managed by ourselves.
-  /// WARNING: once an index is added, it becomes unsafe to touch it from any
-  /// other thread than that on which is managing it, until we are shut
-  /// down. Use runOnIndex to perform work on it instead.
-  void addIndex(IndexT* index);
-
-  /// Remove an index that is managed by ourselves.
-  /// This will flush all pending work on that index, and then shut
-  /// down its managing thread, and will remove the index.
-  void removeIndex(IndexT* index);
-
-  /// Run a function on all indices, in the thread that the index is
-  /// managed in.
-  /// Function arguments are (index in collection, index pointer)
-  void runOnIndex(std::function<void(int, IndexT*)> f);
-  void runOnIndex(std::function<void(int, const IndexT*)> f) const;
-
-  /// faiss::Index API
-  /// All indices receive the same call
-  void reset() override;
-
-  /// Returns the number of sub-indices
-  int count() const { return indices_.size(); }
-
-  /// Returns the i-th sub-index
-  IndexT* at(int i) { return indices_[i].first; }
-
-  /// Returns the i-th sub-index (const version)
-  const IndexT* at(int i) const { return indices_[i].first; }
-
-  /// Whether or not we are responsible for deleting our contained indices
-  bool own_fields;
-
- protected:
-  /// Called just after an index is added
-  virtual void onAfterAddIndex(IndexT* index);
-
-  /// Called just after an index is removed
-  virtual void onAfterRemoveIndex(IndexT* index);
-
-protected:
-  static void waitAndHandleFutures(std::vector<std::future<bool>>& v);
-
-  /// Collection of Index instances, with their managing worker thread if any
-  std::vector<std::pair<IndexT*, std::unique_ptr<WorkerThread>>> indices_;
-
-  /// Is this index multi-threaded?
-  bool isThreaded_;
-};
-
-} // namespace
-
-#include "ThreadedIndex-inl.h"
diff --git a/WorkerThread.cpp b/WorkerThread.cpp
deleted file mode 100644
index 6e9c5a5dc5..0000000000
--- a/WorkerThread.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-
-#include "WorkerThread.h"
-#include "FaissAssert.h"
-#include <exception>
-
-namespace faiss {
-
-namespace {
-
-// Captures any exceptions thrown by the lambda and returns them via the promise
-void runCallback(std::function<void()>& fn,
-                 std::promise<bool>& promise) {
-  try {
-    fn();
-    promise.set_value(true);
-  } catch (...) {
-    promise.set_exception(std::current_exception());
-  }
-}
-
-} // namespace
-
-WorkerThread::WorkerThread() :
-    wantStop_(false) {
-  startThread();
-
-  // Make sure that the thread has started before continuing
-  add([](){}).get();
-}
-
-WorkerThread::~WorkerThread() {
-  stop();
-  waitForThreadExit();
-}
-
-void
-WorkerThread::startThread() {
-  thread_ = std::thread([this](){ threadMain(); });
-}
-
-void
-WorkerThread::stop() {
-  std::lock_guard<std::mutex> guard(mutex_);
-
-  wantStop_ = true;
-  monitor_.notify_one();
-}
-
-std::future<bool>
-WorkerThread::add(std::function<void()> f) {
-  std::lock_guard<std::mutex> guard(mutex_);
-
-  if (wantStop_) {
-    // The timer thread has been stopped, or we want to stop; we can't
-    // schedule anything else
-    std::promise<bool> p;
-    auto fut = p.get_future();
-
-    // did not execute
-    p.set_value(false);
-    return fut;
-  }
-
-  auto pr = std::promise<bool>();
-  auto fut = pr.get_future();
-
-  queue_.emplace_back(std::make_pair(std::move(f), std::move(pr)));
-
-  // Wake up our thread
-  monitor_.notify_one();
-  return fut;
-}
-
-void
-WorkerThread::threadMain() {
-  threadLoop();
-
-  // Call all pending tasks
-  FAISS_ASSERT(wantStop_);
-
-  // flush all pending operations
-  for (auto& f : queue_) {
-    runCallback(f.first, f.second);
-  }
-}
-
-void
-WorkerThread::threadLoop() {
-  while (true) {
-    std::pair<std::function<void()>, std::promise<bool>> data;
-
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-
-      while (!wantStop_ && queue_.empty()) {
-        monitor_.wait(lock);
-      }
-
-      if (wantStop_) {
-        return;
-      }
-
-      data = std::move(queue_.front());
-      queue_.pop_front();
-    }
-
-    runCallback(data.first, data.second);
-  }
-}
-
-void
-WorkerThread::waitForThreadExit() {
-  try {
-    thread_.join();
-  } catch (...) {
-  }
-}
-
-} // namespace
diff --git a/WorkerThread.h b/WorkerThread.h
deleted file mode 100644
index 7ab21e9f90..0000000000
--- a/WorkerThread.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-
-#pragma once
-
-#include <condition_variable>
-#include <future>
-#include <deque>
-#include <thread>
-
-namespace faiss {
-
-class WorkerThread {
- public:
-  WorkerThread();
-
-  /// Stops and waits for the worker thread to exit, flushing all
-  /// pending lambdas
-  ~WorkerThread();
-
-  /// Request that the worker thread stop itself
-  void stop();
-
-  /// Blocking waits in the current thread for the worker thread to
-  /// stop
-  void waitForThreadExit();
-
-  /// Adds a lambda to run on the worker thread; returns a future that
-  /// can be used to block on its completion.
-  /// Future status is `true` if the lambda was run in the worker
-  /// thread; `false` if it was not run, because the worker thread is
-  /// exiting or has exited.
-  std::future<bool> add(std::function<void()> f);
-
- private:
-  void startThread();
-  void threadMain();
-  void threadLoop();
-
-  /// Thread that all queued lambdas are run on
-  std::thread thread_;
-
-  /// Mutex for the queue and exit status
-  std::mutex mutex_;
-
-  /// Monitor for the exit status and the queue
-  std::condition_variable monitor_;
-
-  /// Whether or not we want the thread to exit
-  bool wantStop_;
-
-  /// Queue of pending lambdas to call
-  std::deque<std::pair<std::function<void()>, std::promise<bool>>> queue_;
-};
-
-} // namespace
diff --git a/depend b/depend
index 3d59c92978..36d44cc072 100644
--- a/depend
+++ b/depend
@@ -1,19 +1,15 @@
-utils.o: utils.cpp utils.h Heap.h AuxIndexStructures.h Index.h \
- FaissAssert.h FaissException.h
-IndexIVFPQR.o: IndexIVFPQR.cpp faiss/IndexIVFPQR.h faiss/IndexIVFPQ.h \
- faiss/IndexIVF.h faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
- faiss/utils/Heap.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
- faiss/impl/PolysemousTraining.h faiss/utils/utils.h \
- faiss/utils/distances.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h
-OnDiskInvertedLists.o: OnDiskInvertedLists.cpp \
- faiss/OnDiskInvertedLists.h faiss/IndexIVF.h faiss/Index.h \
- faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/utils/utils.h
+IndexPreTransform.o: IndexPreTransform.cpp faiss/IndexPreTransform.h \
+ faiss/Index.h faiss/VectorTransform.h faiss/utils/utils.h \
+ faiss/utils/Heap.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h
 IndexFlat.o: IndexFlat.cpp faiss/IndexFlat.h faiss/Index.h \
  faiss/utils/distances.h faiss/utils/Heap.h faiss/utils/extra_distances.h \
  faiss/utils/utils.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
  faiss/impl/AuxIndexStructures.h
+IndexBinaryFlat.o: IndexBinaryFlat.cpp faiss/IndexBinaryFlat.h \
+ faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/Index.h faiss/utils/hamming.h faiss/utils/Heap.h \
+ faiss/utils/hamming-inl.h faiss/utils/utils.h \
+ faiss/impl/AuxIndexStructures.h
 IndexIVFSpectralHash.o: IndexIVFSpectralHash.cpp \
  faiss/IndexIVFSpectralHash.h faiss/IndexIVF.h faiss/Index.h \
  faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
@@ -23,6 +19,53 @@ IndexIVFSpectralHash.o: IndexIVFSpectralHash.cpp \
 InvertedLists.o: InvertedLists.cpp faiss/InvertedLists.h faiss/Index.h \
  faiss/utils/utils.h faiss/utils/Heap.h faiss/impl/FaissAssert.h \
  faiss/impl/FaissException.h
+IndexLSH.o: IndexLSH.cpp faiss/IndexLSH.h faiss/Index.h \
+ faiss/VectorTransform.h faiss/utils/utils.h faiss/utils/Heap.h \
+ faiss/utils/hamming.h faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h
+IndexShards.o: IndexShards.cpp faiss/IndexShards.h faiss/Index.h \
+ faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/impl/ThreadedIndex.h faiss/utils/WorkerThread.h \
+ faiss/impl/ThreadedIndex-inl.h faiss/utils/Heap.h
+IndexBinaryIVF.o: IndexBinaryIVF.cpp faiss/IndexBinaryIVF.h \
+ faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/Index.h faiss/IndexIVF.h faiss/InvertedLists.h faiss/Clustering.h \
+ faiss/utils/Heap.h faiss/utils/hamming.h faiss/utils/hamming-inl.h \
+ faiss/utils/utils.h faiss/impl/AuxIndexStructures.h faiss/IndexFlat.h
+IndexHNSW.o: IndexHNSW.cpp faiss/IndexHNSW.h faiss/impl/HNSW.h \
+ faiss/Index.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/utils/random.h faiss/utils/Heap.h faiss/IndexFlat.h \
+ faiss/IndexPQ.h faiss/impl/ProductQuantizer.h faiss/Clustering.h \
+ faiss/impl/PolysemousTraining.h faiss/IndexScalarQuantizer.h \
+ faiss/IndexIVF.h faiss/InvertedLists.h faiss/impl/ScalarQuantizer.h \
+ faiss/impl/AuxIndexStructures.h faiss/utils/utils.h \
+ faiss/utils/distances.h faiss/IndexIVFPQ.h faiss/Index2Layer.h
+IndexBinaryFromFloat.o: IndexBinaryFromFloat.cpp \
+ faiss/IndexBinaryFromFloat.h faiss/IndexBinary.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/Index.h \
+ faiss/utils/utils.h faiss/utils/Heap.h
+AutoTune.o: AutoTune.cpp faiss/AutoTune.h faiss/Index.h \
+ faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/utils/utils.h faiss/utils/Heap.h faiss/utils/random.h \
+ faiss/IndexFlat.h faiss/VectorTransform.h faiss/IndexPreTransform.h \
+ faiss/IndexLSH.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
+ faiss/Clustering.h faiss/impl/PolysemousTraining.h faiss/IndexIVF.h \
+ faiss/InvertedLists.h faiss/IndexIVFPQ.h faiss/IndexIVFPQR.h \
+ faiss/IndexIVFFlat.h faiss/MetaIndexes.h faiss/IndexShards.h \
+ faiss/impl/ThreadedIndex.h faiss/utils/WorkerThread.h \
+ faiss/impl/ThreadedIndex-inl.h faiss/IndexReplicas.h \
+ faiss/IndexScalarQuantizer.h faiss/impl/ScalarQuantizer.h \
+ faiss/impl/AuxIndexStructures.h faiss/IndexHNSW.h faiss/impl/HNSW.h \
+ faiss/IndexBinaryFlat.h faiss/IndexBinaryHNSW.h faiss/IndexBinaryIVF.h
+Clustering.o: Clustering.cpp faiss/Clustering.h faiss/Index.h \
+ faiss/impl/AuxIndexStructures.h faiss/utils/utils.h faiss/utils/Heap.h \
+ faiss/utils/random.h faiss/utils/distances.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h faiss/IndexFlat.h
+MetaIndexes.o: MetaIndexes.cpp faiss/MetaIndexes.h faiss/Index.h \
+ faiss/IndexShards.h faiss/IndexBinary.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h faiss/impl/ThreadedIndex.h \
+ faiss/utils/WorkerThread.h faiss/impl/ThreadedIndex-inl.h \
+ faiss/IndexReplicas.h faiss/utils/Heap.h faiss/impl/AuxIndexStructures.h
 index_factory.o: index_factory.cpp faiss/AutoTune.h faiss/Index.h \
  faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
  faiss/utils/utils.h faiss/utils/Heap.h faiss/utils/random.h \
@@ -38,21 +81,43 @@ index_factory.o: index_factory.cpp faiss/AutoTune.h faiss/Index.h \
  faiss/IndexHNSW.h faiss/impl/HNSW.h faiss/IndexLattice.h \
  faiss/impl/lattice_Zn.h faiss/IndexBinaryFlat.h faiss/IndexBinaryHNSW.h \
  faiss/IndexBinaryIVF.h
-IndexBinaryIVF.o: IndexBinaryIVF.cpp faiss/IndexBinaryIVF.h \
- faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/Index.h faiss/IndexIVF.h faiss/InvertedLists.h faiss/Clustering.h \
- faiss/utils/Heap.h faiss/utils/hamming.h faiss/utils/hamming-inl.h \
- faiss/utils/utils.h faiss/impl/AuxIndexStructures.h faiss/IndexFlat.h
-ProductQuantizer.o: ProductQuantizer.cpp ProductQuantizer.h Clustering.h \
- faiss/Index.h Heap.h FaissAssert.h FaissException.h VectorTransform.h \
- IndexFlat.h utils.h
-Heap.o: Heap.cpp Heap.h
 VectorTransform.o: VectorTransform.cpp faiss/VectorTransform.h \
  faiss/Index.h faiss/utils/distances.h faiss/utils/Heap.h \
  faiss/utils/random.h faiss/utils/utils.h faiss/impl/FaissAssert.h \
  faiss/impl/FaissException.h faiss/IndexPQ.h \
  faiss/impl/ProductQuantizer.h faiss/Clustering.h \
  faiss/impl/PolysemousTraining.h
+IndexIVF.o: IndexIVF.cpp faiss/IndexIVF.h faiss/Index.h \
+ faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
+ faiss/utils/utils.h faiss/utils/hamming.h faiss/utils/hamming-inl.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/IndexFlat.h \
+ faiss/impl/AuxIndexStructures.h
+IndexIVFPQ.o: IndexIVFPQ.cpp faiss/IndexIVFPQ.h faiss/IndexIVF.h \
+ faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
+ faiss/utils/Heap.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
+ faiss/impl/PolysemousTraining.h faiss/utils/utils.h \
+ faiss/utils/distances.h faiss/IndexFlat.h faiss/utils/hamming.h \
+ faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h faiss/impl/AuxIndexStructures.h
+OnDiskInvertedLists.o: OnDiskInvertedLists.cpp \
+ faiss/OnDiskInvertedLists.h faiss/IndexIVF.h faiss/Index.h \
+ faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/utils/utils.h
+IndexIVFPQR.o: IndexIVFPQR.cpp faiss/IndexIVFPQR.h faiss/IndexIVFPQ.h \
+ faiss/IndexIVF.h faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
+ faiss/utils/Heap.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
+ faiss/impl/PolysemousTraining.h faiss/utils/utils.h \
+ faiss/utils/distances.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h
+MatrixStats.o: MatrixStats.cpp faiss/MatrixStats.h faiss/utils/utils.h \
+ faiss/utils/Heap.h
+IndexBinary.o: IndexBinary.cpp faiss/IndexBinary.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/Index.h
+IndexPQ.o: IndexPQ.cpp faiss/IndexPQ.h faiss/Index.h \
+ faiss/impl/ProductQuantizer.h faiss/Clustering.h faiss/utils/Heap.h \
+ faiss/impl/PolysemousTraining.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h faiss/impl/AuxIndexStructures.h \
+ faiss/utils/hamming.h faiss/utils/hamming-inl.h
 clone_index.o: clone_index.cpp faiss/clone_index.h \
  faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/IndexFlat.h \
  faiss/Index.h faiss/VectorTransform.h faiss/IndexPreTransform.h \
@@ -67,80 +132,33 @@ clone_index.o: clone_index.cpp faiss/clone_index.h \
  faiss/impl/ScalarQuantizer.h faiss/impl/AuxIndexStructures.h \
  faiss/IndexHNSW.h faiss/impl/HNSW.h faiss/utils/random.h \
  faiss/utils/utils.h faiss/IndexLattice.h faiss/impl/lattice_Zn.h
-Index.o: Index.cpp faiss/Index.h faiss/impl/AuxIndexStructures.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/utils/distances.h faiss/utils/Heap.h
-AuxIndexStructures.o: AuxIndexStructures.cpp AuxIndexStructures.h Index.h \
- FaissAssert.h FaissException.h
-IndexHNSW.o: IndexHNSW.cpp faiss/IndexHNSW.h faiss/impl/HNSW.h \
- faiss/Index.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/utils/random.h faiss/utils/Heap.h faiss/IndexFlat.h \
- faiss/IndexPQ.h faiss/impl/ProductQuantizer.h faiss/Clustering.h \
- faiss/impl/PolysemousTraining.h faiss/IndexScalarQuantizer.h \
- faiss/IndexIVF.h faiss/InvertedLists.h faiss/impl/ScalarQuantizer.h \
- faiss/impl/AuxIndexStructures.h faiss/utils/utils.h \
- faiss/utils/distances.h faiss/IndexIVFPQ.h faiss/Index2Layer.h
-IndexIVF.o: IndexIVF.cpp faiss/IndexIVF.h faiss/Index.h \
- faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/utils/utils.h faiss/utils/hamming.h faiss/utils/hamming-inl.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/IndexFlat.h \
+IndexIVFFlat.o: IndexIVFFlat.cpp faiss/IndexIVFFlat.h faiss/IndexIVF.h \
+ faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
+ faiss/utils/Heap.h faiss/IndexFlat.h faiss/utils/distances.h \
+ faiss/utils/utils.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
  faiss/impl/AuxIndexStructures.h
-FaissException.o: FaissException.cpp FaissException.h
-MatrixStats.o: MatrixStats.cpp faiss/MatrixStats.h faiss/utils/utils.h \
- faiss/utils/Heap.h
 IndexReplicas.o: IndexReplicas.cpp faiss/IndexReplicas.h faiss/Index.h \
  faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
  faiss/impl/ThreadedIndex.h faiss/utils/WorkerThread.h \
  faiss/impl/ThreadedIndex-inl.h
-HNSW.o: HNSW.cpp HNSW.h Index.h FaissAssert.h FaissException.h utils.h \
- Heap.h AuxIndexStructures.h
-IndexLattice.o: IndexLattice.cpp faiss/IndexLattice.h faiss/IndexIVF.h \
- faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
- faiss/utils/Heap.h faiss/impl/lattice_Zn.h faiss/utils/hamming.h \
- faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/utils/distances.h
-hamming.o: hamming.cpp hamming.h Heap.h FaissAssert.h FaissException.h
-IndexBinaryFlat.o: IndexBinaryFlat.cpp faiss/IndexBinaryFlat.h \
- faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/Index.h faiss/utils/hamming.h faiss/utils/Heap.h \
- faiss/utils/hamming-inl.h faiss/utils/utils.h \
- faiss/impl/AuxIndexStructures.h
-IndexLSH.o: IndexLSH.cpp faiss/IndexLSH.h faiss/Index.h \
- faiss/VectorTransform.h faiss/utils/utils.h faiss/utils/Heap.h \
- faiss/utils/hamming.h faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h
-IndexShards.o: IndexShards.cpp faiss/IndexShards.h faiss/Index.h \
- faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/impl/ThreadedIndex.h faiss/utils/WorkerThread.h \
- faiss/impl/ThreadedIndex-inl.h faiss/utils/Heap.h
-IndexPreTransform.o: IndexPreTransform.cpp faiss/IndexPreTransform.h \
- faiss/Index.h faiss/VectorTransform.h faiss/utils/utils.h \
- faiss/utils/Heap.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h
-PolysemousTraining.o: PolysemousTraining.cpp PolysemousTraining.h \
- ProductQuantizer.h Clustering.h faiss/Index.h Heap.h utils.h hamming.h \
- FaissAssert.h FaissException.h
-MetaIndexes.o: MetaIndexes.cpp faiss/MetaIndexes.h faiss/Index.h \
- faiss/IndexShards.h faiss/IndexBinary.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/impl/ThreadedIndex.h \
- faiss/utils/WorkerThread.h faiss/impl/ThreadedIndex-inl.h \
- faiss/IndexReplicas.h faiss/utils/Heap.h faiss/impl/AuxIndexStructures.h
-IndexIVFPQ.o: IndexIVFPQ.cpp faiss/IndexIVFPQ.h faiss/IndexIVF.h \
- faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
- faiss/utils/Heap.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
- faiss/impl/PolysemousTraining.h faiss/utils/utils.h \
- faiss/utils/distances.h faiss/IndexFlat.h faiss/utils/hamming.h \
- faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/impl/AuxIndexStructures.h
+IVFlib.o: IVFlib.cpp faiss/IVFlib.h faiss/IndexIVF.h faiss/Index.h \
+ faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
+ faiss/IndexPreTransform.h faiss/VectorTransform.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h
+Index.o: Index.cpp faiss/Index.h faiss/impl/AuxIndexStructures.h \
+ faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
+ faiss/utils/distances.h faiss/utils/Heap.h
+IndexScalarQuantizer.o: IndexScalarQuantizer.cpp \
+ faiss/IndexScalarQuantizer.h faiss/IndexIVF.h faiss/Index.h \
+ faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
+ faiss/impl/ScalarQuantizer.h faiss/impl/AuxIndexStructures.h \
+ faiss/utils/utils.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h
 IndexBinaryHNSW.o: IndexBinaryHNSW.cpp faiss/IndexBinaryHNSW.h \
  faiss/impl/HNSW.h faiss/Index.h faiss/impl/FaissAssert.h \
  faiss/impl/FaissException.h faiss/utils/random.h faiss/utils/Heap.h \
  faiss/IndexBinaryFlat.h faiss/IndexBinary.h faiss/utils/utils.h \
  faiss/utils/hamming.h faiss/utils/hamming-inl.h \
  faiss/impl/AuxIndexStructures.h
-IndexBinaryFromFloat.o: IndexBinaryFromFloat.cpp \
- faiss/IndexBinaryFromFloat.h faiss/IndexBinary.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/Index.h \
- faiss/utils/utils.h faiss/utils/Heap.h
 Index2Layer.o: Index2Layer.cpp faiss/Index2Layer.h faiss/IndexPQ.h \
  faiss/Index.h faiss/impl/ProductQuantizer.h faiss/Clustering.h \
  faiss/utils/Heap.h faiss/impl/PolysemousTraining.h faiss/IndexIVF.h \
@@ -148,64 +166,11 @@ Index2Layer.o: Index2Layer.cpp faiss/Index2Layer.h faiss/IndexPQ.h \
  faiss/impl/FaissException.h faiss/utils/utils.h \
  faiss/impl/AuxIndexStructures.h faiss/IndexFlat.h \
  faiss/utils/distances.h
-WorkerThread.o: WorkerThread.cpp WorkerThread.h FaissAssert.h \
- FaissException.h
-IndexPQ.o: IndexPQ.cpp faiss/IndexPQ.h faiss/Index.h \
- faiss/impl/ProductQuantizer.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/impl/PolysemousTraining.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/impl/AuxIndexStructures.h \
- faiss/utils/hamming.h faiss/utils/hamming-inl.h
-IndexIVFFlat.o: IndexIVFFlat.cpp faiss/IndexIVFFlat.h faiss/IndexIVF.h \
+IndexLattice.o: IndexLattice.cpp faiss/IndexLattice.h faiss/IndexIVF.h \
  faiss/Index.h faiss/InvertedLists.h faiss/Clustering.h \
- faiss/utils/Heap.h faiss/IndexFlat.h faiss/utils/distances.h \
- faiss/utils/utils.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/impl/AuxIndexStructures.h
-IndexBinary.o: IndexBinary.cpp faiss/IndexBinary.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h faiss/Index.h
-IndexScalarQuantizer.o: IndexScalarQuantizer.cpp \
- faiss/IndexScalarQuantizer.h faiss/IndexIVF.h faiss/Index.h \
- faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/impl/ScalarQuantizer.h faiss/impl/AuxIndexStructures.h \
- faiss/utils/utils.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h
-utils_simd.o: utils_simd.cpp utils.h Heap.h
-AutoTune.o: AutoTune.cpp faiss/AutoTune.h faiss/Index.h \
- faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/FaissException.h \
- faiss/utils/utils.h faiss/utils/Heap.h faiss/utils/random.h \
- faiss/IndexFlat.h faiss/VectorTransform.h faiss/IndexPreTransform.h \
- faiss/IndexLSH.h faiss/IndexPQ.h faiss/impl/ProductQuantizer.h \
- faiss/Clustering.h faiss/impl/PolysemousTraining.h faiss/IndexIVF.h \
- faiss/InvertedLists.h faiss/IndexIVFPQ.h faiss/IndexIVFPQR.h \
- faiss/IndexIVFFlat.h faiss/MetaIndexes.h faiss/IndexShards.h \
- faiss/impl/ThreadedIndex.h faiss/utils/WorkerThread.h \
- faiss/impl/ThreadedIndex-inl.h faiss/IndexReplicas.h \
- faiss/IndexScalarQuantizer.h faiss/impl/ScalarQuantizer.h \
- faiss/impl/AuxIndexStructures.h faiss/IndexHNSW.h faiss/impl/HNSW.h \
- faiss/IndexBinaryFlat.h faiss/IndexBinaryHNSW.h faiss/IndexBinaryIVF.h
-Clustering.o: Clustering.cpp faiss/Clustering.h faiss/Index.h \
- faiss/impl/AuxIndexStructures.h faiss/utils/utils.h faiss/utils/Heap.h \
- faiss/utils/random.h faiss/utils/distances.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/IndexFlat.h
-IVFlib.o: IVFlib.cpp faiss/IVFlib.h faiss/IndexIVF.h faiss/Index.h \
- faiss/InvertedLists.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/IndexPreTransform.h faiss/VectorTransform.h \
- faiss/impl/FaissAssert.h faiss/impl/FaissException.h
-index_io.o: index_io.cpp index_io.h FaissAssert.h FaissException.h \
- AuxIndexStructures.h Index.h IndexFlat.h faiss/Index.h VectorTransform.h \
- IndexLSH.h faiss/VectorTransform.h IndexPQ.h \
- faiss/impl/ProductQuantizer.h faiss/Clustering.h faiss/utils/Heap.h \
- faiss/impl/PolysemousTraining.h IndexIVF.h faiss/InvertedLists.h \
- IndexIVFPQ.h faiss/IndexIVF.h faiss/IndexPQ.h IndexIVFFlat.h \
- IndexIVFSpectralHash.h MetaIndexes.h faiss/IndexShards.h \
- faiss/IndexBinary.h faiss/impl/FaissAssert.h faiss/impl/ThreadedIndex.h \
- faiss/utils/WorkerThread.h faiss/impl/ThreadedIndex-inl.h \
- faiss/IndexReplicas.h IndexScalarQuantizer.h \
- faiss/impl/ScalarQuantizer.h faiss/impl/AuxIndexStructures.h IndexHNSW.h \
- faiss/impl/HNSW.h faiss/utils/random.h faiss/IndexFlat.h \
- faiss/IndexScalarQuantizer.h faiss/utils/utils.h OnDiskInvertedLists.h \
- IndexBinaryFlat.h IndexBinaryFromFloat.h IndexBinaryHNSW.h \
- faiss/IndexBinaryFlat.h IndexBinaryIVF.h
-distances.o: distances.cpp distances.h Index.h Heap.h utils.h \
- FaissAssert.h FaissException.h AuxIndexStructures.h
+ faiss/utils/Heap.h faiss/impl/lattice_Zn.h faiss/utils/hamming.h \
+ faiss/utils/hamming-inl.h faiss/impl/FaissAssert.h \
+ faiss/impl/FaissException.h faiss/utils/distances.h
 GpuCloner.o: gpu/GpuCloner.cpp faiss/gpu/GpuCloner.h faiss/Index.h \
  faiss/clone_index.h faiss/gpu/GpuClonerOptions.h \
  faiss/gpu/GpuIndicesOptions.h faiss/gpu/GpuIndex.h \
@@ -364,15 +329,6 @@ GpuDistance.o: gpu/GpuDistance.cu faiss/gpu/GpuDistance.h faiss/Index.h \
  faiss/gpu/utils/Float16.cuh faiss/gpu/utils/ConversionOperators.cuh \
  faiss/gpu/utils/CopyUtils.cuh faiss/gpu/utils/HostTensor.cuh \
  faiss/gpu/utils/HostTensor-inl.cuh
-InvertedListAppend.o: gpu/impl/InvertedListAppend.cu \
- gpu/impl/InvertedListAppend.cuh gpu/impl/../GpuIndicesOptions.h \
- gpu/impl/../utils/Tensor.cuh faiss/gpu/utils/Tensor-inl.cuh \
- faiss/gpu/GpuFaissAssert.h faiss/impl/FaissAssert.h \
- faiss/impl/FaissException.h faiss/gpu/utils/DeviceUtils.h \
- gpu/impl/../../FaissAssert.h gpu/impl/../utils/Float16.cuh \
- faiss/gpu/GpuResources.h faiss/gpu/utils/DeviceMemory.h \
- faiss/gpu/utils/DeviceTensor.cuh faiss/gpu/utils/MemorySpace.h \
- faiss/gpu/utils/DeviceTensor-inl.cuh gpu/impl/../utils/StaticUtils.h
 Distance.o: gpu/impl/Distance.cu faiss/gpu/impl/Distance.cuh \
  faiss/gpu/utils/DeviceTensor.cuh faiss/gpu/utils/Tensor.cuh \
  faiss/gpu/utils/Tensor-inl.cuh faiss/gpu/GpuFaissAssert.h \
diff --git a/distances.cpp b/distances.cpp
deleted file mode 100644
index adf23e0e88..0000000000
--- a/distances.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "distances.h"
-
-#include <cmath>
-#include <omp.h>
-
-
-#include "utils.h"
-#include "FaissAssert.h"
-#include "AuxIndexStructures.h"
-
-namespace faiss {
-
-/***************************************************************************
- * Distance functions (other than L2 and IP)
- ***************************************************************************/
-
-struct VectorDistanceL2 {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        return fvec_L2sqr (x, y, d);
-    }
-};
-
-struct VectorDistanceL1 {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        return fvec_L1 (x, y, d);
-    }
-};
-
-struct VectorDistanceLinf {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        return fvec_Linf (x, y, d);
-        /*
-        float vmax = 0;
-        for (size_t i = 0; i < d; i++) {
-            float diff = fabs (x[i] - y[i]);
-            if (diff > vmax) vmax = diff;
-        }
-        return vmax;*/
-    }
-};
-
-struct VectorDistanceLp {
-    size_t d;
-    const float p;
-
-    float operator () (const float *x, const float *y) const {
-        float accu = 0;
-        for (size_t i = 0; i < d; i++) {
-            float diff = fabs (x[i] - y[i]);
-            accu += powf (diff, p);
-        }
-        return accu;
-    }
-};
-
-struct VectorDistanceCanberra {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        float accu = 0;
-        for (size_t i = 0; i < d; i++) {
-            float xi = x[i], yi = y[i];
-            accu += fabs (xi - yi) / (fabs(xi) + fabs(yi));
-        }
-        return accu;
-    }
-};
-
-struct VectorDistanceBrayCurtis {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        float accu_num = 0, accu_den = 0;
-        for (size_t i = 0; i < d; i++) {
-            float xi = x[i], yi = y[i];
-            accu_num += fabs (xi - yi);
-            accu_den += fabs (xi + yi);
-        }
-        return accu_num / accu_den;
-    }
-};
-
-struct VectorDistanceJensenShannon {
-    size_t d;
-
-    float operator () (const float *x, const float *y) const {
-        float accu = 0;
-
-        for (size_t i = 0; i < d; i++) {
-            float xi = x[i], yi = y[i];
-            float mi = 0.5 * (xi + yi);
-            float kl1 = - xi * log(mi / xi);
-            float kl2 = - yi * log(mi / yi);
-            accu += kl1 + kl2;
-        }
-        return 0.5 * accu;
-    }
-};
-
-
-
-
-
-
-
-
-
-
-namespace {
-
-template<class VD>
-void pairwise_extra_distances_template (
-                     VD vd,
-                     int64_t nq, const float *xq,
-                     int64_t nb, const float *xb,
-                     float *dis,
-                     int64_t ldq, int64_t ldb, int64_t ldd)
-{
-
-#pragma omp parallel for if(nq > 10)
-    for (int64_t i = 0; i < nq; i++) {
-        const float *xqi = xq + i * ldq;
-        const float *xbj = xb;
-        float *disi = dis + ldd * i;
-
-        for (int64_t j = 0; j < nb; j++) {
-            disi[j] = vd (xqi, xbj);
-            xbj += ldb;
-        }
-    }
-}
-
-
-template<class VD>
-void knn_extra_metrics_template (
-        VD vd,
-        const float * x,
-        const float * y,
-        size_t nx, size_t ny,
-        float_maxheap_array_t * res)
-{
-    size_t k = res->k;
-    size_t d = vd.d;
-    size_t check_period = InterruptCallback::get_period_hint (ny * d);
-    check_period *= omp_get_max_threads();
-
-    for (size_t i0 = 0; i0 < nx; i0 += check_period) {
-        size_t i1 = std::min(i0 + check_period, nx);
-
-#pragma omp parallel for
-        for (size_t i = i0; i < i1; i++) {
-            const float * x_i = x + i * d;
-            const float * y_j = y;
-            size_t j;
-            float * simi = res->get_val(i);
-            int64_t * idxi = res->get_ids (i);
-
-            maxheap_heapify (k, simi, idxi);
-            for (j = 0; j < ny; j++) {
-                float disij = vd (x_i, y_j);
-
-                if (disij < simi[0]) {
-                    maxheap_pop (k, simi, idxi);
-                    maxheap_push (k, simi, idxi, disij, j);
-                }
-                y_j += d;
-            }
-            maxheap_reorder (k, simi, idxi);
-        }
-        InterruptCallback::check ();
-    }
-
-}
-
-
-template<class VD>
-struct ExtraDistanceComputer : DistanceComputer {
-    VD vd;
-    Index::idx_t nb;
-    const float *q;
-    const float *b;
-
-    float operator () (idx_t i) override {
-        return vd (q, b + i * vd.d);
-    }
-
-    float symmetric_dis(idx_t i, idx_t j) override {
-        return vd (b + j * vd.d, b + i * vd.d);
-    }
-
-    ExtraDistanceComputer(const VD & vd, const float *xb,
-                          size_t nb, const float *q = nullptr)
-        : vd(vd), nb(nb), q(q), b(xb) {}
-
-    void set_query(const float *x) override {
-        q = x;
-    }
-};
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-} // anonymous namespace
-
-void pairwise_extra_distances (
-                     int64_t d,
-                     int64_t nq, const float *xq,
-                     int64_t nb, const float *xb,
-                     MetricType mt, float metric_arg,
-                     float *dis,
-                     int64_t ldq, int64_t ldb, int64_t ldd)
-{
-    if (nq == 0 || nb == 0) return;
-    if (ldq == -1) ldq = d;
-    if (ldb == -1) ldb = d;
-    if (ldd == -1) ldd = nb;
-
-    switch(mt) {
-#define HANDLE_VAR(kw)                                          \
-     case METRIC_ ## kw: {                                      \
-        VectorDistance ## kw vd({(size_t)d});                   \
-        pairwise_extra_distances_template (vd, nq, xq, nb, xb,  \
-                                           dis, ldq, ldb, ldd); \
-        break;                                                  \
-    }
-        HANDLE_VAR(L2);
-        HANDLE_VAR(L1);
-        HANDLE_VAR(Linf);
-        HANDLE_VAR(Canberra);
-        HANDLE_VAR(BrayCurtis);
-        HANDLE_VAR(JensenShannon);
-#undef HANDLE_VAR
-    case METRIC_Lp: {
-        VectorDistanceLp vd({(size_t)d, metric_arg});
-        pairwise_extra_distances_template (vd, nq, xq, nb, xb,
-                                           dis, ldq, ldb, ldd);
-        break;
-    }
-    default:
-        FAISS_THROW_MSG ("metric type not implemented");
-    }
-
-}
-
-void knn_extra_metrics (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        MetricType mt, float metric_arg,
-        float_maxheap_array_t * res)
-{
-
-    switch(mt) {
-#define HANDLE_VAR(kw)                                          \
-     case METRIC_ ## kw: {                                      \
-        VectorDistance ## kw vd({(size_t)d});                   \
-        knn_extra_metrics_template (vd, x, y, nx, ny, res);     \
-        break;                                                  \
-    }
-        HANDLE_VAR(L2);
-        HANDLE_VAR(L1);
-        HANDLE_VAR(Linf);
-        HANDLE_VAR(Canberra);
-        HANDLE_VAR(BrayCurtis);
-        HANDLE_VAR(JensenShannon);
-#undef HANDLE_VAR
-    case METRIC_Lp: {
-        VectorDistanceLp vd({(size_t)d, metric_arg});
-        knn_extra_metrics_template (vd, x, y, nx, ny, res);
-        break;
-    }
-    default:
-        FAISS_THROW_MSG ("metric type not implemented");
-    }
-
-}
-
-DistanceComputer *get_extra_distance_computer (
-        size_t d,
-        MetricType mt, float metric_arg,
-        size_t nb, const float *xb)
-{
-
-    switch(mt) {
-#define HANDLE_VAR(kw)                                                  \
-     case METRIC_ ## kw: {                                              \
-        VectorDistance ## kw vd({(size_t)d});                           \
-        return new ExtraDistanceComputer<VectorDistance ## kw>(vd, xb, nb); \
-    }
-        HANDLE_VAR(L2);
-        HANDLE_VAR(L1);
-        HANDLE_VAR(Linf);
-        HANDLE_VAR(Canberra);
-        HANDLE_VAR(BrayCurtis);
-        HANDLE_VAR(JensenShannon);
-#undef HANDLE_VAR
-    case METRIC_Lp: {
-        VectorDistanceLp vd({(size_t)d, metric_arg});
-        return new ExtraDistanceComputer<VectorDistanceLp> (vd, xb, nb);
-        break;
-    }
-    default:
-        FAISS_THROW_MSG ("metric type not implemented");
-    }
-
-}
-
-
-} // namespace faiss
diff --git a/distances.h b/distances.h
deleted file mode 100644
index 9432b3e78d..0000000000
--- a/distances.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#ifndef FAISS_distances_h
-#define FAISS_distances_h
-
-/** In this file are the implementations of extra metrics beyond L2
- *  and inner product */
-
-#include <stdint.h>
-
-#include "Index.h"
-
-#include "Heap.h"
-
-
-
-namespace faiss {
-
-
-void pairwise_extra_distances (
-                     int64_t d,
-                     int64_t nq, const float *xq,
-                     int64_t nb, const float *xb,
-                     MetricType mt, float metric_arg,
-                     float *dis,
-                     int64_t ldq = -1, int64_t ldb = -1, int64_t ldd = -1);
-
-
-void knn_extra_metrics (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        MetricType mt, float metric_arg,
-        float_maxheap_array_t * res);
-
-
-/** get a DistanceComputer that refers to this type of distance and
- *  indexes a flat array of size nb */
-DistanceComputer *get_extra_distance_computer (
-        size_t d,
-        MetricType mt, float metric_arg,
-        size_t nb, const float *xb);
-
-}
-
-
-#endif
diff --git a/gpu/impl/InvertedListAppend.cu b/gpu/impl/InvertedListAppend.cu
deleted file mode 100644
index 36d6ecb137..0000000000
--- a/gpu/impl/InvertedListAppend.cu
+++ /dev/null
@@ -1,271 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-
-#include "InvertedListAppend.cuh"
-#include "../../FaissAssert.h"
-#include "../utils/Float16.cuh"
-#include "../utils/DeviceUtils.h"
-#include "../utils/Tensor.cuh"
-#include "../utils/StaticUtils.h"
-
-namespace faiss { namespace gpu {
-
-__global__ void
-runUpdateListPointers(Tensor<int, 1, true> listIds,
-                      Tensor<int, 1, true> newListLength,
-                      Tensor<void*, 1, true> newCodePointers,
-                      Tensor<void*, 1, true> newIndexPointers,
-                      int* listLengths,
-                      void** listCodes,
-                      void** listIndices) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (index >= listIds.getSize(0)) {
-    return;
-  }
-
-  int listId = listIds[index];
-  listLengths[listId] = newListLength[index];
-  listCodes[listId] = newCodePointers[index];
-  listIndices[listId] = newIndexPointers[index];
-}
-
-void
-runUpdateListPointers(Tensor<int, 1, true>& listIds,
-                      Tensor<int, 1, true>& newListLength,
-                      Tensor<void*, 1, true>& newCodePointers,
-                      Tensor<void*, 1, true>& newIndexPointers,
-                      thrust::device_vector<int>& listLengths,
-                      thrust::device_vector<void*>& listCodes,
-                      thrust::device_vector<void*>& listIndices,
-                      cudaStream_t stream) {
-  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
-  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);
-
-  dim3 grid(numBlocks);
-  dim3 block(numThreads);
-
-  runUpdateListPointers<<<grid, block, 0, stream>>>(
-    listIds, newListLength, newCodePointers, newIndexPointers,
-    listLengths.data().get(),
-    listCodes.data().get(),
-    listIndices.data().get());
-
-  CUDA_TEST_ERROR();
-}
-
-template <IndicesOptions Opt>
-__global__ void
-ivfpqInvertedListAppend(Tensor<int, 1, true> listIds,
-                        Tensor<int, 1, true> listOffset,
-                        Tensor<int, 2, true> encodings,
-                        Tensor<long, 1, true> indices,
-                        void** listCodes,
-                        void** listIndices) {
-  int encodingToAdd = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (encodingToAdd >= listIds.getSize(0)) {
-    return;
-  }
-
-  int listId = listIds[encodingToAdd];
-  int offset = listOffset[encodingToAdd];
-
-  // Add vector could be invalid (contains NaNs etc)
-  if (listId == -1 || offset == -1) {
-    return;
-  }
-
-  auto encoding = encodings[encodingToAdd];
-  long index = indices[encodingToAdd];
-
-  if (Opt == INDICES_32_BIT) {
-    // FIXME: there could be overflow here, but where should we check this?
-    ((int*) listIndices[listId])[offset] = (int) index;
-  } else if (Opt == INDICES_64_BIT) {
-    ((long*) listIndices[listId])[offset] = (long) index;
-  } else {
-    // INDICES_CPU or INDICES_IVF; no indices are being stored
-  }
-
-  unsigned char* codeStart =
-    ((unsigned char*) listCodes[listId]) + offset * encodings.getSize(1);
-
-  // FIXME: slow
-  for (int i = 0; i < encodings.getSize(1); ++i) {
-    codeStart[i] = (unsigned char) encoding[i];
-  }
-}
-
-void
-runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
-                           Tensor<int, 1, true>& listOffset,
-                           Tensor<int, 2, true>& encodings,
-                           Tensor<long, 1, true>& indices,
-                           thrust::device_vector<void*>& listCodes,
-                           thrust::device_vector<void*>& listIndices,
-                           IndicesOptions indicesOptions,
-                           cudaStream_t stream) {
-  int numThreads = std::min(listIds.getSize(0), getMaxThreadsCurrentDevice());
-  int numBlocks = utils::divUp(listIds.getSize(0), numThreads);
-
-  dim3 grid(numBlocks);
-  dim3 block(numThreads);
-
-#define RUN_APPEND(IND)                                         \
-  do {                                                          \
-    ivfpqInvertedListAppend<IND><<<grid, block, 0, stream>>>(   \
-      listIds, listOffset, encodings, indices,                  \
-      listCodes.data().get(),                                   \
-      listIndices.data().get());                                \
-  } while (0)
-
-  if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) {
-    // no need to maintain indices on the GPU
-    RUN_APPEND(INDICES_IVF);
-  } else if (indicesOptions == INDICES_32_BIT) {
-    RUN_APPEND(INDICES_32_BIT);
-  } else if (indicesOptions == INDICES_64_BIT) {
-    RUN_APPEND(INDICES_64_BIT);
-  } else {
-    // unknown index storage type
-    FAISS_ASSERT(false);
-  }
-
-  CUDA_TEST_ERROR();
-
-#undef RUN_APPEND
-}
-
-template <IndicesOptions Opt, bool Exact, bool Float16>
-__global__ void
-ivfFlatInvertedListAppend(Tensor<int, 1, true> listIds,
-                          Tensor<int, 1, true> listOffset,
-                          Tensor<float, 2, true> vecs,
-                          Tensor<long, 1, true> indices,
-                          void** listData,
-                          void** listIndices) {
-  int vec = blockIdx.x;
-
-  int listId = listIds[vec];
-  int offset = listOffset[vec];
-
-  // Add vector could be invalid (contains NaNs etc)
-  if (listId == -1 || offset == -1) {
-    return;
-  }
-
-  if (threadIdx.x == 0) {
-    long index = indices[vec];
-
-    if (Opt == INDICES_32_BIT) {
-      // FIXME: there could be overflow here, but where should we check this?
-      ((int*) listIndices[listId])[offset] = (int) index;
-    } else if (Opt == INDICES_64_BIT) {
-      ((long*) listIndices[listId])[offset] = (long) index;
-    } else {
-      // INDICES_CPU or INDICES_IVF; no indices are being stored
-    }
-  }
-
-#ifdef FAISS_USE_FLOAT16
-  // FIXME: should use half2 for better memory b/w
-  if (Float16) {
-    half* vecStart = ((half*) listData[listId]) + offset * vecs.getSize(1);
-
-    if (Exact) {
-      vecStart[threadIdx.x] = __float2half(vecs[vec][threadIdx.x]);
-    } else {
-      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
-        vecStart[i] = __float2half(vecs[vec][i]);
-      }
-    }
-  }
-#else
-  static_assert(!Float16, "float16 unsupported");
-#endif
-
-  if (!Float16) {
-    float* vecStart = ((float*) listData[listId]) + offset * vecs.getSize(1);
-
-    if (Exact) {
-      vecStart[threadIdx.x] = vecs[vec][threadIdx.x];
-    } else {
-      for (int i = threadIdx.x; i < vecs.getSize(1); i += blockDim.x) {
-        vecStart[i] = vecs[vec][i];
-      }
-    }
-  }
-}
-
-void
-runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
-                             Tensor<int, 1, true>& listOffset,
-                             Tensor<float, 2, true>& vecs,
-                             Tensor<long, 1, true>& indices,
-                             bool useFloat16,
-                             thrust::device_vector<void*>& listData,
-                             thrust::device_vector<void*>& listIndices,
-                             IndicesOptions indicesOptions,
-                             cudaStream_t stream) {
-  int maxThreads = getMaxThreadsCurrentDevice();
-  bool exact = vecs.getSize(1) <= maxThreads;
-
-  // Each block will handle appending a single vector
-  dim3 grid(vecs.getSize(0));
-  dim3 block(std::min(vecs.getSize(1), maxThreads));
-
-#define RUN_APPEND_OPT(OPT, EXACT, FLOAT16)                             \
-  do {                                                                  \
-    ivfFlatInvertedListAppend<OPT, EXACT, FLOAT16>                      \
-      <<<grid, block, 0, stream>>>(                                     \
-        listIds, listOffset, vecs, indices,                             \
-        listData.data().get(),                                          \
-        listIndices.data().get());                                      \
-  } while (0)                                                           \
-
-#define RUN_APPEND(EXACT, FLOAT16)                                      \
-  do {                                                                  \
-    if ((indicesOptions == INDICES_CPU) || (indicesOptions == INDICES_IVF)) { \
-      /* no indices are maintained on the GPU */                        \
-      RUN_APPEND_OPT(INDICES_IVF, EXACT, FLOAT16);                      \
-    } else if (indicesOptions == INDICES_32_BIT) {                      \
-      RUN_APPEND_OPT(INDICES_32_BIT, EXACT, FLOAT16);                   \
-    } else if (indicesOptions == INDICES_64_BIT) {                      \
-      RUN_APPEND_OPT(INDICES_64_BIT, EXACT, FLOAT16);                   \
-    } else {                                                            \
-      FAISS_ASSERT(false);                                              \
-    }                                                                   \
-  } while (0);
-
-  if (useFloat16) {
-#ifdef FAISS_USE_FLOAT16
-    if (exact) {
-      RUN_APPEND(true, true);
-    } else {
-      RUN_APPEND(false, true);
-    }
-#else
-    // no float16 support
-    FAISS_ASSERT(false);
-#endif
-  } else {
-    if (exact) {
-      RUN_APPEND(true, false);
-    } else {
-      RUN_APPEND(false, false);
-    }
-  }
-
-  CUDA_TEST_ERROR();
-
-#undef RUN_APPEND
-#undef RUN_APPEND_OPT
-}
-
-} } // namespace
diff --git a/gpu/impl/InvertedListAppend.cuh b/gpu/impl/InvertedListAppend.cuh
deleted file mode 100644
index e26ed70ef8..0000000000
--- a/gpu/impl/InvertedListAppend.cuh
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-
-#pragma once
-
-#include "../GpuIndicesOptions.h"
-#include "../utils/Tensor.cuh"
-#include <thrust/device_vector.h>
-
-namespace faiss { namespace gpu {
-
-/// Update device-side list pointers in a batch
-void runUpdateListPointers(Tensor<int, 1, true>& listIds,
-                           Tensor<int, 1, true>& newListLength,
-                           Tensor<void*, 1, true>& newCodePointers,
-                           Tensor<void*, 1, true>& newIndexPointers,
-                           thrust::device_vector<int>& listLengths,
-                           thrust::device_vector<void*>& listCodes,
-                           thrust::device_vector<void*>& listIndices,
-                           cudaStream_t stream);
-
-/// Actually append the new codes / vector indices to the individual lists
-
-/// IVFPQ
-void runIVFPQInvertedListAppend(Tensor<int, 1, true>& listIds,
-                                Tensor<int, 1, true>& listOffset,
-                                Tensor<int, 2, true>& encodings,
-                                Tensor<long, 1, true>& indices,
-                                thrust::device_vector<void*>& listCodes,
-                                thrust::device_vector<void*>& listIndices,
-                                IndicesOptions indicesOptions,
-                                cudaStream_t stream);
-
-/// IVF flat storage
-void runIVFFlatInvertedListAppend(Tensor<int, 1, true>& listIds,
-                                  Tensor<int, 1, true>& listOffset,
-                                  Tensor<float, 2, true>& vecs,
-                                  Tensor<long, 1, true>& indices,
-                                  bool useFloat16,
-                                  thrust::device_vector<void*>& listData,
-                                  thrust::device_vector<void*>& listIndices,
-                                  IndicesOptions indicesOptions,
-                                  cudaStream_t stream);
-
-} } // namespace
diff --git a/hamming.cpp b/hamming.cpp
deleted file mode 100644
index fca9ef5cc7..0000000000
--- a/hamming.cpp
+++ /dev/null
@@ -1,776 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-/*
- * Implementation of Hamming related functions (distances, smallest distance
- * selection with regular heap|radix and probabilistic heap|radix.
- *
- * IMPLEMENTATION NOTES
- * Bitvectors are generally assumed to be multiples of 64 bits.
- *
- * hamdis_t is used for distances because at this time
- * it is not clear how we will need to balance
- * - flexibility in vector size (unclear more than 2^16 or even 2^8 bitvectors)
- * - memory usage
- * - cache-misses when dealing with large volumes of data (lower bits is better)
- *
- * The hamdis_t should optimally be compatibe with one of the Torch Storage
- * (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes
-*/
-
-#include "hamming.h"
-
-#include <vector>
-#include <memory>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <assert.h>
-#include <limits.h>
-
-#include "Heap.h"
-#include "FaissAssert.h"
-
-static const size_t BLOCKSIZE_QUERY = 8192;
-
-
-namespace faiss {
-
-size_t hamming_batch_size = 65536;
-
-static const uint8_t hamdis_tab_ham_bytes[256] = {
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
-
-/* Elementary Hamming distance computation: unoptimized  */
-template <size_t nbits, typename T>
-T hamming (const uint8_t *bs1,
-           const uint8_t *bs2)
-{
-    const size_t nbytes = nbits / 8;
-    size_t i;
-    T h = 0;
-    for (i = 0; i < nbytes; i++)
-        h += (T) hamdis_tab_ham_bytes[bs1[i]^bs2[i]];
-    return h;
-}
-
-
-/* Hamming distances for multiples of 64 bits */
-template <size_t nbits>
-hamdis_t hamming (const uint64_t * bs1, const uint64_t * bs2)
-{
-    const size_t nwords = nbits / 64;
-    size_t i;
-    hamdis_t h = 0;
-    for (i = 0; i < nwords; i++)
-        h += popcount64 (bs1[i] ^ bs2[i]);
-    return h;
-}
-
-
-
-/* specialized (optimized) functions */
-template <>
-hamdis_t hamming<64> (const uint64_t * pa, const uint64_t * pb)
-{
-    return popcount64 (pa[0] ^ pb[0]);
-}
-
-
-template <>
-hamdis_t hamming<128> (const uint64_t *pa, const uint64_t *pb)
-{
-    return popcount64 (pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
-}
-
-
-template <>
-hamdis_t hamming<256> (const uint64_t * pa, const uint64_t * pb)
-{
-    return  popcount64 (pa[0] ^ pb[0])
-          + popcount64 (pa[1] ^ pb[1])
-          + popcount64 (pa[2] ^ pb[2])
-          + popcount64 (pa[3] ^ pb[3]);
-}
-
-
-/* Hamming distances for multiple of 64 bits */
-hamdis_t hamming (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t nwords)
-{
-    size_t i;
-    hamdis_t h = 0;
-    for (i = 0; i < nwords; i++)
-        h += popcount64 (bs1[i] ^ bs2[i]);
-    return h;
-}
-
-
-
-template <size_t nbits>
-void hammings (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t n1, size_t n2,
-        hamdis_t * dis)
-
-{
-    size_t i, j;
-    const size_t nwords = nbits / 64;
-    for (i = 0; i < n1; i++) {
-        const uint64_t * __restrict bs1_ = bs1 + i * nwords;
-        hamdis_t * __restrict dis_ = dis + i * n2;
-        for (j = 0; j < n2; j++)
-            dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
-    }
-}
-
-
-
-void hammings (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t n1,
-        size_t n2,
-        size_t nwords,
-        hamdis_t * __restrict dis)
-{
-    size_t i, j;
-    n1 *= nwords;
-    n2 *= nwords;
-    for (i = 0; i < n1; i+=nwords) {
-        const uint64_t * bs1_ = bs1+i;
-        for (j = 0; j < n2; j+=nwords)
-            dis[j] = hamming (bs1_, bs2+j, nwords);
-    }
-}
-
-
-
-
-/* Count number of matches given a max threshold */
-template <size_t nbits>
-void hamming_count_thres (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t * nptr)
-{
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    const uint64_t * bs2_ = bs2;
-
-    for (i = 0; i < n1; i++) {
-        bs2 = bs2_;
-        for (j = 0; j < n2; j++) {
-            /* collect the match only if this satisfies the threshold */
-            if (hamming <nbits> (bs1, bs2) <= ht)
-                posm++;
-            bs2 += nwords;
-        }
-        bs1 += nwords;  /* next signature */
-    }
-    *nptr = posm;
-}
-
-
-template <size_t nbits>
-void crosshamming_count_thres (
-        const uint64_t * dbs,
-        size_t n,
-        int ht,
-        size_t * nptr)
-{
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    const uint64_t * bs1 = dbs;
-    for (i = 0; i < n; i++) {
-        const uint64_t * bs2 = bs1 + 2;
-        for (j = i + 1; j < n; j++) {
-            /* collect the match only if this satisfies the threshold */
-            if (hamming <nbits> (bs1, bs2) <= ht)
-                posm++;
-            bs2 += nwords;
-        }
-        bs1 += nwords;
-    }
-    *nptr = posm;
-}
-
-
-template <size_t nbits>
-size_t match_hamming_thres (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t n1,
-        size_t n2,
-        int ht,
-        int64_t * idx,
-        hamdis_t * hams)
-{
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    hamdis_t h;
-    const uint64_t * bs2_ = bs2;
-    for (i = 0; i < n1; i++) {
-        bs2 = bs2_;
-        for (j = 0; j < n2; j++) {
-            /* Here perform the real work of computing the distance */
-            h = hamming <nbits> (bs1, bs2);
-
-            /* collect the match only if this satisfies the threshold */
-            if (h <= ht) {
-                /* Enough space to store another match ? */
-                *idx = i; idx++;
-                *idx = j; idx++;
-                *hams = h;
-                hams++;
-                posm++;
-            }
-            bs2+=nwords;  /* next signature */
-        }
-        bs1+=nwords;
-    }
-    return posm;
-}
-
-
-/* Return closest neighbors w.r.t Hamming distance, using a heap. */
-template <class HammingComputer>
-static
-void hammings_knn_hc (
-        int bytes_per_code,
-        int_maxheap_array_t * ha,
-        const uint8_t * bs1,
-        const uint8_t * bs2,
-        size_t n2,
-        bool order = true,
-        bool init_heap = true)
-{
-    size_t k = ha->k;
-    if (init_heap) ha->heapify ();
-
-    const size_t block_size = hamming_batch_size;
-    for (size_t j0 = 0; j0 < n2; j0 += block_size) {
-      const size_t j1 = std::min(j0 + block_size, n2);
-#pragma omp parallel for
-      for (size_t i = 0; i < ha->nh; i++) {
-        HammingComputer hc (bs1 + i * bytes_per_code, bytes_per_code);
-
-        const uint8_t * bs2_ = bs2 + j0 * bytes_per_code;
-        hamdis_t dis;
-        hamdis_t * __restrict bh_val_ = ha->val + i * k;
-        int64_t * __restrict bh_ids_ = ha->ids + i * k;
-        size_t j;
-        for (j = j0; j < j1; j++, bs2_+= bytes_per_code) {
-          dis = hc.hamming (bs2_);
-          if (dis < bh_val_[0]) {
-            faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
-            faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
-          }
-        }
-      }
-    }
-    if (order) ha->reorder ();
- }
-
-/* Return closest neighbors w.r.t Hamming distance, using max count. */
-template <class HammingComputer>
-static
-void hammings_knn_mc (
-        int bytes_per_code,
-        const uint8_t *a,
-        const uint8_t *b,
-        size_t na,
-        size_t nb,
-        size_t k,
-        int32_t *distances,
-        int64_t *labels)
-{
-  const int nBuckets = bytes_per_code * 8 + 1;
-  std::vector<int> all_counters(na * nBuckets, 0);
-  std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
-
-  std::vector<HCounterState<HammingComputer>> cs;
-  for (size_t i = 0; i < na; ++i) {
-    cs.push_back(HCounterState<HammingComputer>(
-                   all_counters.data() + i * nBuckets,
-                   all_ids_per_dis.get() + i * nBuckets * k,
-                   a + i * bytes_per_code,
-                   8 * bytes_per_code,
-                   k
-                 ));
-  }
-
-  const size_t block_size = hamming_batch_size;
-  for (size_t j0 = 0; j0 < nb; j0 += block_size) {
-    const size_t j1 = std::min(j0 + block_size, nb);
-#pragma omp parallel for
-    for (size_t i = 0; i < na; ++i) {
-      for (size_t j = j0; j < j1; ++j) {
-        cs[i].update_counter(b + j * bytes_per_code, j);
-      }
-    }
-  }
-
-  for (size_t i = 0; i < na; ++i) {
-    HCounterState<HammingComputer>& csi = cs[i];
-
-    int nres = 0;
-    for (int b = 0; b < nBuckets && nres < k; b++) {
-      for (int l = 0; l < csi.counters[b] && nres < k; l++) {
-        labels[i * k + nres] = csi.ids_per_dis[b * k + l];
-        distances[i * k + nres] = b;
-        nres++;
-      }
-    }
-    while (nres < k) {
-      labels[i * k + nres] = -1;
-      distances[i * k + nres] = std::numeric_limits<int32_t>::max();
-      ++nres;
-    }
-  }
-}
-
-
-
-// works faster than the template version
-static
-void hammings_knn_hc_1 (
-        int_maxheap_array_t * ha,
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t n2,
-        bool order = true,
-        bool init_heap = true)
-{
-    const size_t nwords = 1;
-    size_t k = ha->k;
-
-
-    if (init_heap) {
-        ha->heapify ();
-    }
-
-#pragma omp parallel for
-    for (size_t i = 0; i < ha->nh; i++) {
-        const uint64_t bs1_ = bs1 [i];
-        const uint64_t * bs2_ = bs2;
-        hamdis_t dis;
-        hamdis_t * bh_val_ = ha->val + i * k;
-        hamdis_t bh_val_0 = bh_val_[0];
-        int64_t * bh_ids_ = ha->ids + i * k;
-        size_t j;
-        for (j = 0; j < n2; j++, bs2_+= nwords) {
-            dis = popcount64 (bs1_ ^ *bs2_);
-            if (dis < bh_val_0) {
-                faiss::maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
-                faiss::maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, dis, j);
-                bh_val_0 = bh_val_[0];
-            }
-        }
-    }
-    if (order) {
-        ha->reorder ();
-    }
-}
-
-
-
-
-/* Functions to maps vectors to bits. Assume proper allocation done beforehand,
-   meaning that b should be be able to receive as many bits as x may produce. */
-
-/*
- * dimension 0 corresponds to the least significant bit of b[0], or
- * equivalently to the lsb of the first byte that is stored.
- */
-void fvec2bitvec (const float * x, uint8_t * b, size_t d)
-{
-    for (int i = 0; i < d; i += 8) {
-        uint8_t w = 0;
-        uint8_t mask = 1;
-        int nj = i + 8 <= d ? 8 : d - i;
-        for (int j = 0; j < nj; j++) {
-            if (x[i + j] >= 0)
-                w |= mask;
-            mask <<= 1;
-        }
-        *b = w;
-        b++;
-    }
-}
-
-
-
-/* Same but for n vectors.
-   Ensure that the ouptut b is byte-aligned (pad with 0s). */
-void fvecs2bitvecs (const float * x, uint8_t * b, size_t d, size_t n)
-{
-    const int64_t ncodes = ((d + 7) / 8);
-#pragma omp parallel for
-    for (size_t i = 0; i < n; i++)
-        fvec2bitvec (x + i * d, b + i * ncodes, d);
-}
-
-
-/* Reverse bit (NOT a optimized function, only used for print purpose) */
-static uint64_t uint64_reverse_bits (uint64_t b)
-{
-    int i;
-    uint64_t revb = 0;
-    for (i = 0; i < 64; i++) {
-        revb <<= 1;
-        revb |= b & 1;
-        b >>= 1;
-    }
-    return revb;
-}
-
-
-/* print the bit vector */
-void bitvec_print (const uint8_t * b, size_t d)
-{
-    size_t i, j;
-    for (i = 0; i < d; ) {
-        uint64_t brev = uint64_reverse_bits (* (uint64_t *) b);
-        for (j = 0; j < 64 && i < d; j++, i++) {
-            printf ("%d", (int) (brev & 1));
-            brev >>= 1;
-        }
-        b += 8;
-        printf (" ");
-    }
-}
-
-
-
-
-
-/*----------------------------------------*/
-/* Hamming distance computation and k-nn  */
-
-
-#define C64(x) ((uint64_t *)x)
-
-
-/* Compute a set of Hamming distances */
-void hammings (
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t na, size_t nb,
-        size_t ncodes,
-        hamdis_t * __restrict dis)
-{
-    FAISS_THROW_IF_NOT (ncodes % 8 == 0);
-    switch (ncodes) {
-        case 8:
-            faiss::hammings <64>  (C64(a), C64(b), na, nb, dis); return;
-        case 16:
-            faiss::hammings <128> (C64(a), C64(b), na, nb, dis); return;
-        case 32:
-            faiss::hammings <256> (C64(a), C64(b), na, nb, dis); return;
-        case 64:
-            faiss::hammings <512> (C64(a), C64(b), na, nb, dis); return;
-        default:
-            faiss::hammings (C64(a), C64(b), na, nb, ncodes * 8, dis); return;
-    }
-}
-
-void hammings_knn(
-    int_maxheap_array_t *ha,
-    const uint8_t *a,
-    const uint8_t *b,
-    size_t nb,
-    size_t ncodes,
-    int order)
-{
-    hammings_knn_hc(ha, a, b, nb, ncodes, order);
-}
-void hammings_knn_hc (
-        int_maxheap_array_t * ha,
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t nb,
-        size_t ncodes,
-        int order)
-{
-    switch (ncodes) {
-    case 4:
-        hammings_knn_hc<faiss::HammingComputer4>
-            (4, ha, a, b, nb, order, true);
-        break;
-    case 8:
-        hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
-        // hammings_knn_hc<faiss::HammingComputer8>
-        //      (8, ha, a, b, nb, order, true);
-        break;
-    case 16:
-        hammings_knn_hc<faiss::HammingComputer16>
-            (16, ha, a, b, nb, order, true);
-        break;
-    case 32:
-        hammings_knn_hc<faiss::HammingComputer32>
-            (32, ha, a, b, nb, order, true);
-        break;
-    default:
-        if(ncodes % 8 == 0) {
-            hammings_knn_hc<faiss::HammingComputerM8>
-                (ncodes, ha, a, b, nb, order, true);
-        } else {
-            hammings_knn_hc<faiss::HammingComputerDefault>
-                (ncodes, ha, a, b, nb, order, true);
-
-        }
-    }
-}
-
-void hammings_knn_mc(
-    const uint8_t * a,
-    const uint8_t * b,
-    size_t na,
-    size_t nb,
-    size_t k,
-    size_t ncodes,
-    int32_t *distances,
-    int64_t *labels)
-{
-    switch (ncodes) {
-    case 4:
-        hammings_knn_mc<faiss::HammingComputer4>(
-          4, a, b, na, nb, k, distances, labels
-        );
-        break;
-    case 8:
-        // TODO(hoss): Write analog to hammings_knn_hc_1
-        // hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true);
-        hammings_knn_mc<faiss::HammingComputer8>(
-          8, a, b, na, nb, k, distances, labels
-        );
-        break;
-    case 16:
-        hammings_knn_mc<faiss::HammingComputer16>(
-          16, a, b, na, nb, k, distances, labels
-        );
-        break;
-    case 32:
-        hammings_knn_mc<faiss::HammingComputer32>(
-          32, a, b, na, nb, k, distances, labels
-        );
-        break;
-    default:
-        if(ncodes % 8 == 0) {
-            hammings_knn_mc<faiss::HammingComputerM8>(
-              ncodes, a, b, na, nb, k, distances, labels
-            );
-        } else {
-            hammings_knn_mc<faiss::HammingComputerDefault>(
-              ncodes, a, b, na, nb, k, distances, labels
-            );
-        }
-    }
-}
-
-
-
-
-/* Count number of matches given a max threshold            */
-void hamming_count_thres (
-        const uint8_t * bs1,
-        const uint8_t * bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t ncodes,
-        size_t * nptr)
-{
-    switch (ncodes) {
-        case 8:
-            faiss::hamming_count_thres <64> (C64(bs1), C64(bs2),
-                                             n1, n2, ht, nptr);
-            return;
-        case 16:
-            faiss::hamming_count_thres <128> (C64(bs1), C64(bs2),
-                                              n1, n2, ht, nptr);
-            return;
-        case 32:
-            faiss::hamming_count_thres <256> (C64(bs1), C64(bs2),
-                                              n1, n2, ht, nptr);
-            return;
-        case 64:
-            faiss::hamming_count_thres <512> (C64(bs1), C64(bs2),
-                                              n1, n2, ht, nptr);
-            return;
-        default:
-          FAISS_THROW_FMT ("not implemented for %zu bits", ncodes);
-    }
-}
-
-
-/* Count number of cross-matches given a threshold */
-void crosshamming_count_thres (
-        const uint8_t * dbs,
-        size_t n,
-        hamdis_t ht,
-        size_t ncodes,
-        size_t * nptr)
-{
-    switch (ncodes) {
-        case 8:
-            faiss::crosshamming_count_thres <64>  (C64(dbs), n, ht, nptr);
-            return;
-        case 16:
-            faiss::crosshamming_count_thres <128> (C64(dbs), n, ht, nptr);
-            return;
-        case 32:
-            faiss::crosshamming_count_thres <256> (C64(dbs), n, ht, nptr);
-            return;
-        case 64:
-            faiss::crosshamming_count_thres <512> (C64(dbs), n, ht, nptr);
-            return;
-        default:
-            FAISS_THROW_FMT ("not implemented for %zu bits", ncodes);
-    }
-}
-
-
-/* Returns all matches given a threshold */
-size_t match_hamming_thres (
-        const uint8_t * bs1,
-        const uint8_t * bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t ncodes,
-        int64_t * idx,
-        hamdis_t * dis)
-{
-    switch (ncodes) {
-        case 8:
-          return faiss::match_hamming_thres <64> (C64(bs1), C64(bs2),
-                                                  n1, n2, ht, idx, dis);
-        case 16:
-          return faiss::match_hamming_thres <128> (C64(bs1), C64(bs2),
-                                                   n1, n2, ht, idx, dis);
-        case 32:
-          return faiss::match_hamming_thres <256> (C64(bs1), C64(bs2),
-                                                   n1, n2, ht, idx, dis);
-        case 64:
-          return faiss::match_hamming_thres <512> (C64(bs1), C64(bs2),
-                                                   n1, n2, ht, idx, dis);
-        default:
-            FAISS_THROW_FMT ("not implemented for %zu bits", ncodes);
-            return 0;
-    }
-}
-
-
-#undef C64
-
-
-
-/*************************************
- * generalized Hamming distances
- ************************************/
-
-
-
-template <class HammingComputer>
-static void hamming_dis_inner_loop (
-        const uint8_t *ca,
-        const uint8_t *cb,
-        size_t nb,
-        size_t code_size,
-        int k,
-        hamdis_t * bh_val_,
-        int64_t *     bh_ids_)
-{
-
-    HammingComputer hc (ca, code_size);
-
-    for (size_t j = 0; j < nb; j++) {
-        int ndiff = hc.hamming (cb);
-        cb += code_size;
-        if (ndiff < bh_val_[0]) {
-            maxheap_pop<hamdis_t> (k, bh_val_, bh_ids_);
-            maxheap_push<hamdis_t> (k, bh_val_, bh_ids_, ndiff, j);
-        }
-    }
-}
-
-void generalized_hammings_knn_hc (
-        int_maxheap_array_t * ha,
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t nb,
-        size_t code_size,
-        int ordered)
-{
-    int na = ha->nh;
-    int k = ha->k;
-
-    if (ordered)
-        ha->heapify ();
-
-#pragma omp parallel for
-    for (int i = 0; i < na; i++) {
-        const uint8_t *ca = a + i * code_size;
-        const uint8_t *cb = b;
-
-        hamdis_t * bh_val_ = ha->val + i * k;
-        int64_t *     bh_ids_ = ha->ids + i * k;
-
-        switch (code_size) {
-        case 8:
-            hamming_dis_inner_loop<GenHammingComputer8>
-                (ca, cb, nb, 8, k, bh_val_, bh_ids_);
-            break;
-        case 16:
-            hamming_dis_inner_loop<GenHammingComputer16>
-                (ca, cb, nb, 16, k, bh_val_, bh_ids_);
-            break;
-        case 32:
-            hamming_dis_inner_loop<GenHammingComputer32>
-                (ca, cb, nb, 32, k, bh_val_, bh_ids_);
-            break;
-        default:
-            hamming_dis_inner_loop<GenHammingComputerM8>
-                (ca, cb, nb, code_size, k, bh_val_, bh_ids_);
-            break;
-        }
-    }
-
-    if (ordered)
-        ha->reorder ();
-
-}
-
-
-} // namespace faiss
diff --git a/hamming.h b/hamming.h
deleted file mode 100644
index e5ef13c9b5..0000000000
--- a/hamming.h
+++ /dev/null
@@ -1,572 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-/*
- * Hamming distances. The binary vector dimensionality should be a
- * multiple of 8, as the elementary operations operate on bytes. If
- * you need other sizes, just pad with 0s (this is done by function
- * fvecs2bitvecs).
- *
- * User-defined type hamdis_t is used for distances because at this time
- * it is still uncler clear how we will need to balance
- * - flexibility in vector size (may need 16- or even 8-bit vectors)
- * - memory usage
- * - cache-misses when dealing with large volumes of data (fewer bits is better)
- *
- */
-
-#ifndef FAISS_hamming_h
-#define FAISS_hamming_h
-
-
-#include <stdint.h>
-
-#include "Heap.h"
-
-
-/* The Hamming distance type */
-typedef int32_t hamdis_t;
-
-namespace faiss {
-
-
-extern size_t hamming_batch_size;
-
-inline int popcount64(uint64_t x) {
-    return __builtin_popcountl(x);
-}
-
-
-/** Compute a set of Hamming distances between na and nb binary vectors
- *
- * @param  a             size na * nbytespercode
- * @param  b             size nb * nbytespercode
- * @param  nbytespercode should be multiple of 8
- * @param  dis           output distances, size na * nb
- */
-void hammings (
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t na, size_t nb,
-        size_t nbytespercode,
-        hamdis_t * dis);
-
-void bitvec_print (const uint8_t * b, size_t d);
-
-
-/* Functions for casting vectors of regular types to compact bits.
-   They assume proper allocation done beforehand, meaning that b
-   should be be able to receive as many bits as x may produce.  */
-
-/* Makes an array of bits from the signs of a float array. The length
-   of the output array b is rounded up to byte size (allocate
-   accordingly) */
-void fvecs2bitvecs (
-        const float * x,
-        uint8_t * b,
-        size_t d,
-        size_t n);
-
-
-void fvec2bitvec (const float * x, uint8_t * b, size_t d);
-
-
-
-/** Return the k smallest Hamming distances for a set of binary query vectors,
- * using a max heap.
- * @param a       queries, size ha->nh * ncodes
- * @param b       database, size nb * ncodes
- * @param nb      number of database vectors
- * @param ncodes  size of the binary codes (bytes)
- * @param ordered if != 0: order the results by decreasing distance
- *                (may be bottleneck for k/n > 0.01) */
-void hammings_knn_hc (
-        int_maxheap_array_t * ha,
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t nb,
-        size_t ncodes,
-        int ordered);
-
-/* Legacy alias to hammings_knn_hc. */
-void hammings_knn (
-  int_maxheap_array_t * ha,
-  const uint8_t * a,
-  const uint8_t * b,
-  size_t nb,
-  size_t ncodes,
-  int ordered);
-
-/** Return the k smallest Hamming distances for a set of binary query vectors,
- * using counting max.
- * @param a       queries, size na * ncodes
- * @param b       database, size nb * ncodes
- * @param na      number of query vectors
- * @param nb      number of database vectors
- * @param k       number of vectors/distances to return
- * @param ncodes  size of the binary codes (bytes)
- * @param distances output distances from each query vector to its k nearest
- *                neighbors
- * @param labels  output ids of the k nearest neighbors to each query vector
- */
-void hammings_knn_mc (
-  const uint8_t * a,
-  const uint8_t * b,
-  size_t na,
-  size_t nb,
-  size_t k,
-  size_t ncodes,
-  int32_t *distances,
-  int64_t *labels);
-
-/* Counting the number of matches or of cross-matches (without returning them)
-   For use with function that assume pre-allocated memory */
-void hamming_count_thres (
-        const uint8_t * bs1,
-        const uint8_t * bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t ncodes,
-        size_t * nptr);
-
-/* Return all Hamming distances/index passing a thres. Pre-allocation of output
-   is required. Use hamming_count_thres to determine the proper size. */
-size_t match_hamming_thres (
-        const uint8_t * bs1,
-        const uint8_t * bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t ncodes,
-        int64_t * idx,
-        hamdis_t * dis);
-
-/* Cross-matching in a set of vectors */
-void crosshamming_count_thres (
-        const uint8_t * dbs,
-        size_t n,
-        hamdis_t ht,
-        size_t ncodes,
-        size_t * nptr);
-
-
-/* compute the Hamming distances between two codewords of nwords*64 bits */
-hamdis_t hamming (
-        const uint64_t * bs1,
-        const uint64_t * bs2,
-        size_t nwords);
-
-
-
-
-/******************************************************************
- * The HammingComputer series of classes compares a single code of
- * size 4 to 32 to incoming codes. They are intended for use as a
- * template class where it would be inefficient to switch on the code
- * size in the inner loop. Hopefully the compiler will inline the
- * hamming() functions and put the a0, a1, ... in registers.
- ******************************************************************/
-
-
-struct HammingComputer4 {
-    uint32_t a0;
-
-    HammingComputer4 () {}
-
-    HammingComputer4 (const uint8_t *a, int code_size) {
-        set (a, code_size);
-    }
-
-    void set (const uint8_t *a, int code_size) {
-        assert (code_size == 4);
-        a0 = *(uint32_t *)a;
-    }
-
-    inline int hamming (const uint8_t *b) const {
-        return popcount64 (*(uint32_t *)b ^ a0);
-    }
-
-};
-
-struct HammingComputer8 {
-    uint64_t a0;
-
-    HammingComputer8 () {}
-
-    HammingComputer8 (const uint8_t *a, int code_size) {
-        set (a, code_size);
-    }
-
-    void set (const uint8_t *a, int code_size) {
-        assert (code_size == 8);
-        a0 = *(uint64_t *)a;
-    }
-
-    inline int hamming (const uint8_t *b) const {
-        return popcount64 (*(uint64_t *)b ^ a0);
-    }
-
-};
-
-
-struct HammingComputer16 {
-    uint64_t a0, a1;
-
-    HammingComputer16 () {}
-
-    HammingComputer16 (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        assert (code_size == 16);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1);
-    }
-
-};
-
-// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
-// This incurs a penalty of ~10% wrt. fully aligned accesses.
-struct HammingComputer20 {
-    uint64_t a0, a1;
-    uint32_t a2;
-
-    HammingComputer20 () {}
-
-    HammingComputer20 (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        assert (code_size == 20);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1]; a2 = a[2];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
-            popcount64 (*(uint32_t*)(b + 2) ^ a2);
-    }
-};
-
-struct HammingComputer32 {
-    uint64_t a0, a1, a2, a3;
-
-    HammingComputer32 () {}
-
-    HammingComputer32 (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        assert (code_size == 32);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
-            popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3);
-    }
-
-};
-
-struct HammingComputer64 {
-    uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
-
-    HammingComputer64 () {}
-
-    HammingComputer64 (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        assert (code_size == 64);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
-        a4 = a[4]; a5 = a[5]; a6 = a[6]; a7 = a[7];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return popcount64 (b[0] ^ a0) + popcount64 (b[1] ^ a1) +
-            popcount64 (b[2] ^ a2) + popcount64 (b[3] ^ a3) +
-            popcount64 (b[4] ^ a4) + popcount64 (b[5] ^ a5) +
-            popcount64 (b[6] ^ a6) + popcount64 (b[7] ^ a7);
-    }
-
-};
-
-// very inefficient...
-struct HammingComputerDefault {
-    const uint8_t *a;
-    int n;
-
-    HammingComputerDefault () {}
-
-    HammingComputerDefault (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        a =  a8;
-        n = code_size;
-    }
-
-    int hamming (const uint8_t *b8) const {
-        int accu = 0;
-        for (int i = 0; i < n; i++)
-            accu += popcount64 (a[i] ^ b8[i]);
-        return accu;
-    }
-
-};
-
-
-struct HammingComputerM8 {
-    const uint64_t *a;
-    int n;
-
-    HammingComputerM8 () {}
-
-    HammingComputerM8 (const uint8_t *a8, int code_size) {
-        set (a8, code_size);
-    }
-
-    void set (const uint8_t *a8, int code_size) {
-        assert (code_size % 8 == 0);
-        a =  (uint64_t *)a8;
-        n = code_size / 8;
-    }
-
-    int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        int accu = 0;
-        for (int i = 0; i < n; i++)
-            accu += popcount64 (a[i] ^ b[i]);
-        return accu;
-    }
-
-};
-
-// even more inefficient!
-struct HammingComputerM4 {
-    const uint32_t *a;
-    int n;
-
-    HammingComputerM4 () {}
-
-    HammingComputerM4 (const uint8_t *a4, int code_size) {
-        set (a4, code_size);
-    }
-
-    void set (const uint8_t *a4, int code_size) {
-        assert (code_size % 4 == 0);
-        a =  (uint32_t *)a4;
-        n = code_size / 4;
-    }
-
-    int hamming (const uint8_t *b8) const {
-        const uint32_t *b = (uint32_t *)b8;
-        int accu = 0;
-        for (int i = 0; i < n; i++)
-             accu += popcount64 (a[i] ^ b[i]);
-        return accu;
-    }
-
-};
-
-/***************************************************************************
- * Equivalence with a template class when code size is known at compile time
- **************************************************************************/
-
-// default template
-template<int CODE_SIZE>
-struct HammingComputer: HammingComputerM8 {
-    HammingComputer (const uint8_t *a, int code_size):
-    HammingComputerM8(a, code_size) {}
-};
-
-#define SPECIALIZED_HC(CODE_SIZE)                     \
-    template<> struct HammingComputer<CODE_SIZE>:     \
-            HammingComputer ## CODE_SIZE {            \
-        HammingComputer (const uint8_t *a):           \
-        HammingComputer ## CODE_SIZE(a, CODE_SIZE) {} \
-    }
-
-SPECIALIZED_HC(4);
-SPECIALIZED_HC(8);
-SPECIALIZED_HC(16);
-SPECIALIZED_HC(20);
-SPECIALIZED_HC(32);
-SPECIALIZED_HC(64);
-
-#undef SPECIALIZED_HC
-
-
-/***************************************************************************
- * generalized Hamming = number of bytes that are different between
- * two codes.
- ***************************************************************************/
-
-
-inline int generalized_hamming_64 (uint64_t a) {
-    a |= a >> 1;
-    a |= a >> 2;
-    a |= a >> 4;
-    a &= 0x0101010101010101UL;
-    return popcount64 (a);
-}
-
-
-struct GenHammingComputer8 {
-    uint64_t a0;
-
-    GenHammingComputer8 (const uint8_t *a, int code_size) {
-        assert (code_size == 8);
-        a0 = *(uint64_t *)a;
-    }
-
-    inline int hamming (const uint8_t *b) const {
-        return generalized_hamming_64 (*(uint64_t *)b ^ a0);
-    }
-
-};
-
-
-struct GenHammingComputer16 {
-    uint64_t a0, a1;
-    GenHammingComputer16 (const uint8_t *a8, int code_size) {
-        assert (code_size == 16);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return generalized_hamming_64 (b[0] ^ a0) +
-            generalized_hamming_64 (b[1] ^ a1);
-    }
-
-};
-
-struct GenHammingComputer32 {
-    uint64_t a0, a1, a2, a3;
-
-    GenHammingComputer32 (const uint8_t *a8, int code_size) {
-        assert (code_size == 32);
-        const uint64_t *a = (uint64_t *)a8;
-        a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3];
-    }
-
-    inline int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        return generalized_hamming_64 (b[0] ^ a0) +
-            generalized_hamming_64 (b[1] ^ a1) +
-            generalized_hamming_64 (b[2] ^ a2) +
-            generalized_hamming_64 (b[3] ^ a3);
-    }
-
-};
-
-struct GenHammingComputerM8 {
-    const uint64_t *a;
-    int n;
-
-    GenHammingComputerM8 (const uint8_t *a8, int code_size) {
-        assert (code_size % 8 == 0);
-        a =  (uint64_t *)a8;
-        n = code_size / 8;
-    }
-
-    int hamming (const uint8_t *b8) const {
-        const uint64_t *b = (uint64_t *)b8;
-        int accu = 0;
-        for (int i = 0; i < n; i++)
-            accu += generalized_hamming_64 (a[i] ^ b[i]);
-        return accu;
-    }
-
-};
-
-
-/** generalized Hamming distances (= count number of code bytes that
-    are the same) */
-void generalized_hammings_knn_hc (
-        int_maxheap_array_t * ha,
-        const uint8_t * a,
-        const uint8_t * b,
-        size_t nb,
-        size_t code_size,
-        int ordered = true);
-
-
-
-/** This class maintains a list of best distances seen so far.
- *
- * Since the distances are in a limited range (0 to nbit), the
- * object maintains one list per possible distance, and fills
- * in only the n-first lists, such that the sum of sizes of the
- * n lists is below k.
- */
-template<class HammingComputer>
-struct HCounterState {
-  int *counters;
-  int64_t *ids_per_dis;
-
-  HammingComputer hc;
-  int thres;
-  int count_lt;
-  int count_eq;
-  int k;
-
- HCounterState(int *counters, int64_t *ids_per_dis,
-               const uint8_t *x, int d, int k)
- : counters(counters),
-        ids_per_dis(ids_per_dis),
-        hc(x, d / 8),
-        thres(d + 1),
-        count_lt(0),
-        count_eq(0),
-        k(k) {}
-
-  void update_counter(const uint8_t *y, size_t j) {
-    int32_t dis = hc.hamming(y);
-
-    if (dis <= thres) {
-      if (dis < thres) {
-        ids_per_dis[dis * k + counters[dis]++] = j;
-        ++count_lt;
-        while (count_lt == k && thres > 0) {
-          --thres;
-          count_eq = counters[thres];
-          count_lt -= count_eq;
-        }
-      } else if (count_eq < k) {
-        ids_per_dis[dis * k + count_eq++] = j;
-        counters[dis] = count_eq;
-      }
-    }
-  }
-};
-
-
-} // namespace faiss
-
-
-#endif /* FAISS_hamming_h */
diff --git a/index_io.cpp b/index_io.cpp
deleted file mode 100644
index 7bd55aa8c7..0000000000
--- a/index_io.cpp
+++ /dev/null
@@ -1,1389 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "index_io.h"
-
-#include <cstdio>
-#include <cstdlib>
-
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "FaissAssert.h"
-#include "AuxIndexStructures.h"
-
-#include "IndexFlat.h"
-#include "VectorTransform.h"
-#include "IndexLSH.h"
-#include "IndexPQ.h"
-#include "IndexIVF.h"
-#include "IndexIVFPQ.h"
-#include "IndexIVFFlat.h"
-#include "IndexIVFSpectralHash.h"
-#include "MetaIndexes.h"
-#include "IndexScalarQuantizer.h"
-#include "IndexHNSW.h"
-#include "OnDiskInvertedLists.h"
-#include "IndexBinaryFlat.h"
-#include "IndexBinaryFromFloat.h"
-#include "IndexBinaryHNSW.h"
-#include "IndexBinaryIVF.h"
-
-
-
-/*************************************************************
- * The I/O format is the content of the class. For objects that are
- * inherited, like Index, a 4-character-code (fourcc) indicates which
- * child class this is an instance of.
- *
- * In this case, the fields of the parent class are written first,
- * then the ones for the child classes. Note that this requires
- * classes to be serialized to have a constructor without parameters,
- * so that the fields can be filled in later. The default constructor
- * should set reasonable defaults for all fields.
- *
- * The fourccs are assigned arbitrarily. When the class changed (added
- * or deprecated fields), the fourcc can be replaced. New code should
- * be able to read the old fourcc and fill in new classes.
- *
- * TODO: serialization to strings for use in Python pickle or Torch
- * serialization.
- *
- * TODO: in this file, the read functions that encouter errors may
- * leak memory.
- **************************************************************/
-
-
-
-namespace faiss {
-
-static uint32_t fourcc (const char sx[4]) {
-    assert(4 == strlen(sx));
-    const unsigned char *x = (unsigned char*)sx;
-    return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24;
-}
-
-/*************************************************************
- * I/O macros
- *
- * we use macros so that we have a line number to report in abort
- * (). This makes debugging a lot easier. The IOReader or IOWriter is
- * always called f and thus is not passed in as a macro parameter.
- **************************************************************/
-
-
-#define WRITEANDCHECK(ptr, n) {                                 \
-        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);              \
-        FAISS_THROW_IF_NOT_FMT(ret == (n),                      \
-            "write error in %s: %ld != %ld (%s)",               \
-            f->name.c_str(), ret, size_t(n), strerror(errno));  \
-    }
-
-#define READANDCHECK(ptr, n) {                                  \
-        size_t ret = (*f)(ptr, sizeof(*(ptr)), n);              \
-        FAISS_THROW_IF_NOT_FMT(ret == (n),                      \
-            "read error in %s: %ld != %ld (%s)",                \
-            f->name.c_str(), ret, size_t(n), strerror(errno));  \
-    }
-
-#define WRITE1(x) WRITEANDCHECK(&(x), 1)
-#define READ1(x)  READANDCHECK(&(x), 1)
-
-#define WRITEVECTOR(vec) {                      \
-        size_t size = (vec).size ();            \
-        WRITEANDCHECK (&size, 1);               \
-        WRITEANDCHECK ((vec).data (), size);    \
-    }
-
-// will fail if we write 256G of data at once...
-#define READVECTOR(vec) {                       \
-        long size;                            \
-        READANDCHECK (&size, 1);                \
-        FAISS_THROW_IF_NOT (size >= 0 && size < (1L << 40));  \
-        (vec).resize (size);                    \
-        READANDCHECK ((vec).data (), size);     \
-    }
-
-struct ScopeFileCloser {
-    FILE *f;
-    ScopeFileCloser (FILE *f): f (f) {}
-    ~ScopeFileCloser () {fclose (f); }
-};
-
-
-namespace {
-
-struct FileIOReader: IOReader {
-    FILE *f = nullptr;
-    bool need_close = false;
-
-    FileIOReader(FILE *rf): f(rf) {}
-
-    FileIOReader(const char * fname)
-    {
-        name = fname;
-        f = fopen(fname, "rb");
-        FAISS_THROW_IF_NOT_FMT (
-             f, "could not open %s for reading: %s",
-             fname, strerror(errno));
-        need_close = true;
-    }
-
-    ~FileIOReader() override {
-        if (need_close) {
-            int ret = fclose(f);
-            if (ret != 0) {// we cannot raise and exception in the destructor
-                fprintf(stderr, "file %s close error: %s",
-                        name.c_str(), strerror(errno));
-            }
-        }
-    }
-
-    size_t operator()(
-            void *ptr, size_t size, size_t nitems) override {
-        return fread(ptr, size, nitems, f);
-    }
-
-    int fileno() override {
-        return ::fileno (f);
-    }
-
-};
-
-struct FileIOWriter: IOWriter {
-    FILE *f = nullptr;
-    bool need_close = false;
-
-    FileIOWriter(FILE *wf): f(wf) {}
-
-    FileIOWriter(const char * fname)
-    {
-        name = fname;
-        f = fopen(fname, "wb");
-        FAISS_THROW_IF_NOT_FMT (
-             f, "could not open %s for writing: %s",
-             fname, strerror(errno));
-        need_close = true;
-    }
-
-    ~FileIOWriter() override {
-        if (need_close) {
-            int ret = fclose(f);
-            if (ret != 0) {
-                // we cannot raise and exception in the destructor
-                fprintf(stderr, "file %s close error: %s",
-                        name.c_str(), strerror(errno));
-            }
-        }
-    }
-
-    size_t operator()(
-            const void *ptr, size_t size, size_t nitems) override {
-        return fwrite(ptr, size, nitems, f);
-    }
-    int fileno() override {
-        return ::fileno (f);
-    }
-
-};
-
-
-} // namespace
-
-
-/*************************************************************
- * Write
- **************************************************************/
-static void write_index_header (const Index *idx, IOWriter *f) {
-    WRITE1 (idx->d);
-    WRITE1 (idx->ntotal);
-    Index::idx_t dummy = 1 << 20;
-    WRITE1 (dummy);
-    WRITE1 (dummy);
-    WRITE1 (idx->is_trained);
-    WRITE1 (idx->metric_type);
-    if (idx->metric_type > 1) {
-        WRITE1 (idx->metric_arg);
-    }
-}
-
-void write_VectorTransform (const VectorTransform *vt, IOWriter *f) {
-    if (const LinearTransform * lt =
-           dynamic_cast < const LinearTransform *> (vt)) {
-        if (dynamic_cast<const RandomRotationMatrix *>(lt)) {
-            uint32_t h = fourcc ("rrot");
-            WRITE1 (h);
-        } else if (const PCAMatrix * pca =
-                   dynamic_cast<const PCAMatrix *>(lt)) {
-            uint32_t h = fourcc ("PcAm");
-            WRITE1 (h);
-            WRITE1 (pca->eigen_power);
-            WRITE1 (pca->random_rotation);
-            WRITE1 (pca->balanced_bins);
-            WRITEVECTOR (pca->mean);
-            WRITEVECTOR (pca->eigenvalues);
-            WRITEVECTOR (pca->PCAMat);
-        } else {
-            // generic LinearTransform (includes OPQ)
-            uint32_t h = fourcc ("LTra");
-            WRITE1 (h);
-        }
-        WRITE1 (lt->have_bias);
-        WRITEVECTOR (lt->A);
-        WRITEVECTOR (lt->b);
-    } else if (const RemapDimensionsTransform *rdt =
-               dynamic_cast<const RemapDimensionsTransform *>(vt)) {
-        uint32_t h = fourcc ("RmDT");
-        WRITE1 (h);
-        WRITEVECTOR (rdt->map);
-    } else if (const NormalizationTransform *nt =
-               dynamic_cast<const NormalizationTransform *>(vt)) {
-        uint32_t h = fourcc ("VNrm");
-        WRITE1 (h);
-        WRITE1 (nt->norm);
-    } else if (const CenteringTransform *ct =
-               dynamic_cast<const CenteringTransform *>(vt)) {
-        uint32_t h = fourcc ("VCnt");
-        WRITE1 (h);
-        WRITEVECTOR (ct->mean);
-    } else {
-        FAISS_THROW_MSG ("cannot serialize this");
-    }
-    // common fields
-    WRITE1 (vt->d_in);
-    WRITE1 (vt->d_out);
-    WRITE1 (vt->is_trained);
-}
-
-void write_ProductQuantizer (const ProductQuantizer *pq, IOWriter *f) {
-    WRITE1 (pq->d);
-    WRITE1 (pq->M);
-    WRITE1 (pq->nbits);
-    WRITEVECTOR (pq->centroids);
-}
-
-static void write_ScalarQuantizer (
-        const ScalarQuantizer *ivsc, IOWriter *f) {
-    WRITE1 (ivsc->qtype);
-    WRITE1 (ivsc->rangestat);
-    WRITE1 (ivsc->rangestat_arg);
-    WRITE1 (ivsc->d);
-    WRITE1 (ivsc->code_size);
-    WRITEVECTOR (ivsc->trained);
-}
-
-void write_InvertedLists (const InvertedLists *ils, IOWriter *f) {
-    if (ils == nullptr) {
-        uint32_t h = fourcc ("il00");
-        WRITE1 (h);
-    } else if (const auto & ails =
-               dynamic_cast<const ArrayInvertedLists *>(ils)) {
-        uint32_t h = fourcc ("ilar");
-        WRITE1 (h);
-        WRITE1 (ails->nlist);
-        WRITE1 (ails->code_size);
-        // here we store either as a full or a sparse data buffer
-        size_t n_non0 = 0;
-        for (size_t i = 0; i < ails->nlist; i++) {
-            if (ails->ids[i].size() > 0)
-                n_non0++;
-        }
-        if (n_non0 > ails->nlist / 2) {
-            uint32_t list_type = fourcc("full");
-            WRITE1 (list_type);
-            std::vector<size_t> sizes;
-            for (size_t i = 0; i < ails->nlist; i++) {
-                sizes.push_back (ails->ids[i].size());
-            }
-            WRITEVECTOR (sizes);
-        } else {
-            int list_type = fourcc("sprs"); // sparse
-            WRITE1 (list_type);
-            std::vector<size_t> sizes;
-            for (size_t i = 0; i < ails->nlist; i++) {
-                size_t n = ails->ids[i].size();
-                if (n > 0) {
-                    sizes.push_back (i);
-                    sizes.push_back (n);
-                }
-            }
-            WRITEVECTOR (sizes);
-        }
-        // make a single contiguous data buffer (useful for mmapping)
-        for (size_t i = 0; i < ails->nlist; i++) {
-            size_t n = ails->ids[i].size();
-            if (n > 0) {
-                WRITEANDCHECK (ails->codes[i].data(), n * ails->code_size);
-                WRITEANDCHECK (ails->ids[i].data(), n);
-            }
-        }
-    } else if (const auto & od =
-               dynamic_cast<const OnDiskInvertedLists *>(ils)) {
-        uint32_t h = fourcc ("ilod");
-        WRITE1 (h);
-        WRITE1 (ils->nlist);
-        WRITE1 (ils->code_size);
-        // this is a POD object
-        WRITEVECTOR (od->lists);
-
-        {
-            std::vector<OnDiskInvertedLists::Slot> v(
-                      od->slots.begin(), od->slots.end());
-            WRITEVECTOR(v);
-        }
-        {
-            std::vector<char> x(od->filename.begin(), od->filename.end());
-            WRITEVECTOR(x);
-        }
-        WRITE1(od->totsize);
-
-    } else {
-        fprintf(stderr, "WARN! write_InvertedLists: unsupported invlist type, "
-                "saving null invlist\n");
-        uint32_t h = fourcc ("il00");
-        WRITE1 (h);
-    }
-}
-
-
-void write_ProductQuantizer (const ProductQuantizer*pq, const char *fname) {
-    FileIOWriter writer(fname);
-    write_ProductQuantizer (pq, &writer);
-}
-
-static void write_HNSW (const HNSW *hnsw, IOWriter *f) {
-
-    WRITEVECTOR (hnsw->assign_probas);
-    WRITEVECTOR (hnsw->cum_nneighbor_per_level);
-    WRITEVECTOR (hnsw->levels);
-    WRITEVECTOR (hnsw->offsets);
-    WRITEVECTOR (hnsw->neighbors);
-
-    WRITE1 (hnsw->entry_point);
-    WRITE1 (hnsw->max_level);
-    WRITE1 (hnsw->efConstruction);
-    WRITE1 (hnsw->efSearch);
-    WRITE1 (hnsw->upper_beam);
-}
-
-static void write_ivf_header (const IndexIVF *ivf, IOWriter *f) {
-    write_index_header (ivf, f);
-    WRITE1 (ivf->nlist);
-    WRITE1 (ivf->nprobe);
-    write_index (ivf->quantizer, f);
-    WRITE1 (ivf->maintain_direct_map);
-    WRITEVECTOR (ivf->direct_map);
-}
-
-void write_index (const Index *idx, IOWriter *f) {
-    if (const IndexFlat * idxf = dynamic_cast<const IndexFlat *> (idx)) {
-        uint32_t h = fourcc (
-              idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" :
-              idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr);
-        WRITE1 (h);
-        write_index_header (idx, f);
-        WRITEVECTOR (idxf->xb);
-    } else if(const IndexLSH * idxl = dynamic_cast<const IndexLSH *> (idx)) {
-        uint32_t h = fourcc ("IxHe");
-        WRITE1 (h);
-        write_index_header (idx, f);
-        WRITE1 (idxl->nbits);
-        WRITE1 (idxl->rotate_data);
-        WRITE1 (idxl->train_thresholds);
-        WRITEVECTOR (idxl->thresholds);
-        WRITE1 (idxl->bytes_per_vec);
-        write_VectorTransform (&idxl->rrot, f);
-        WRITEVECTOR (idxl->codes);
-    } else if(const IndexPQ * idxp = dynamic_cast<const IndexPQ *> (idx)) {
-        uint32_t h = fourcc ("IxPq");
-        WRITE1 (h);
-        write_index_header (idx, f);
-        write_ProductQuantizer (&idxp->pq, f);
-        WRITEVECTOR (idxp->codes);
-        // search params -- maybe not useful to store?
-        WRITE1 (idxp->search_type);
-        WRITE1 (idxp->encode_signs);
-        WRITE1 (idxp->polysemous_ht);
-    } else if(const Index2Layer * idxp =
-              dynamic_cast<const Index2Layer *> (idx)) {
-        uint32_t h = fourcc ("Ix2L");
-        WRITE1 (h);
-        write_index_header (idx, f);
-        write_index (idxp->q1.quantizer, f);
-        WRITE1 (idxp->q1.nlist);
-        WRITE1 (idxp->q1.quantizer_trains_alone);
-        write_ProductQuantizer (&idxp->pq, f);
-        WRITE1 (idxp->code_size_1);
-        WRITE1 (idxp->code_size_2);
-        WRITE1 (idxp->code_size);
-        WRITEVECTOR (idxp->codes);
-    } else if(const IndexScalarQuantizer * idxs =
-              dynamic_cast<const IndexScalarQuantizer *> (idx)) {
-        uint32_t h = fourcc ("IxSQ");
-        WRITE1 (h);
-        write_index_header (idx, f);
-        write_ScalarQuantizer (&idxs->sq, f);
-        WRITEVECTOR (idxs->codes);
-    } else if(const IndexIVFFlatDedup * ivfl =
-              dynamic_cast<const IndexIVFFlatDedup *> (idx)) {
-        uint32_t h = fourcc ("IwFd");
-        WRITE1 (h);
-        write_ivf_header (ivfl, f);
-        {
-            std::vector<Index::idx_t> tab (2 * ivfl->instances.size());
-            long i = 0;
-            for (auto it = ivfl->instances.begin();
-                 it != ivfl->instances.end(); ++it) {
-                tab[i++] = it->first;
-                tab[i++] = it->second;
-            }
-            WRITEVECTOR (tab);
-        }
-        write_InvertedLists (ivfl->invlists, f);
-    } else if(const IndexIVFFlat * ivfl =
-              dynamic_cast<const IndexIVFFlat *> (idx)) {
-        uint32_t h = fourcc ("IwFl");
-        WRITE1 (h);
-        write_ivf_header (ivfl, f);
-        write_InvertedLists (ivfl->invlists, f);
-    } else if(const IndexIVFScalarQuantizer * ivsc =
-              dynamic_cast<const IndexIVFScalarQuantizer *> (idx)) {
-        uint32_t h = fourcc ("IwSq");
-        WRITE1 (h);
-        write_ivf_header (ivsc, f);
-        write_ScalarQuantizer (&ivsc->sq, f);
-        WRITE1 (ivsc->code_size);
-        WRITE1 (ivsc->by_residual);
-        write_InvertedLists (ivsc->invlists, f);
-    } else if(const IndexIVFSpectralHash *ivsp =
-              dynamic_cast<const IndexIVFSpectralHash *>(idx)) {
-        uint32_t h = fourcc ("IwSh");
-        WRITE1 (h);
-        write_ivf_header (ivsp, f);
-        write_VectorTransform (ivsp->vt, f);
-        WRITE1 (ivsp->nbit);
-        WRITE1 (ivsp->period);
-        WRITE1 (ivsp->threshold_type);
-        WRITEVECTOR (ivsp->trained);
-        write_InvertedLists (ivsp->invlists, f);
-    } else if(const IndexIVFPQ * ivpq =
-              dynamic_cast<const IndexIVFPQ *> (idx)) {
-        const IndexIVFPQR * ivfpqr = dynamic_cast<const IndexIVFPQR *> (idx);
-
-        uint32_t h = fourcc (ivfpqr ? "IwQR" : "IwPQ");
-        WRITE1 (h);
-        write_ivf_header (ivpq, f);
-        WRITE1 (ivpq->by_residual);
-        WRITE1 (ivpq->code_size);
-        write_ProductQuantizer (&ivpq->pq, f);
-        write_InvertedLists (ivpq->invlists, f);
-        if (ivfpqr) {
-            write_ProductQuantizer (&ivfpqr->refine_pq, f);
-            WRITEVECTOR (ivfpqr->refine_codes);
-            WRITE1 (ivfpqr->k_factor);
-        }
-
-    } else if(const IndexPreTransform * ixpt =
-              dynamic_cast<const IndexPreTransform *> (idx)) {
-        uint32_t h = fourcc ("IxPT");
-        WRITE1 (h);
-        write_index_header (ixpt, f);
-        int nt = ixpt->chain.size();
-        WRITE1 (nt);
-        for (int i = 0; i < nt; i++)
-            write_VectorTransform (ixpt->chain[i], f);
-        write_index (ixpt->index, f);
-    } else if(const MultiIndexQuantizer * imiq =
-              dynamic_cast<const MultiIndexQuantizer *> (idx)) {
-        uint32_t h = fourcc ("Imiq");
-        WRITE1 (h);
-        write_index_header (imiq, f);
-        write_ProductQuantizer (&imiq->pq, f);
-    } else if(const IndexRefineFlat * idxrf =
-              dynamic_cast<const IndexRefineFlat *> (idx)) {
-        uint32_t h = fourcc ("IxRF");
-        WRITE1 (h);
-        write_index_header (idxrf, f);
-        write_index (idxrf->base_index, f);
-        write_index (&idxrf->refine_index, f);
-        WRITE1 (idxrf->k_factor);
-    } else if(const IndexIDMap * idxmap =
-              dynamic_cast<const IndexIDMap *> (idx)) {
-        uint32_t h =
-            dynamic_cast<const IndexIDMap2 *> (idx) ? fourcc ("IxM2") :
-            fourcc ("IxMp");
-        // no need to store additional info for IndexIDMap2
-        WRITE1 (h);
-        write_index_header (idxmap, f);
-        write_index (idxmap->index, f);
-        WRITEVECTOR (idxmap->id_map);
-    } else if(const IndexHNSW * idxhnsw =
-              dynamic_cast<const IndexHNSW *> (idx)) {
-        uint32_t h =
-            dynamic_cast<const IndexHNSWFlat*>(idx)   ? fourcc("IHNf") :
-            dynamic_cast<const IndexHNSWPQ*>(idx)     ? fourcc("IHNp") :
-            dynamic_cast<const IndexHNSWSQ*>(idx)     ? fourcc("IHNs") :
-            dynamic_cast<const IndexHNSW2Level*>(idx) ? fourcc("IHN2") :
-            0;
-        FAISS_THROW_IF_NOT (h != 0);
-        WRITE1 (h);
-        write_index_header (idxhnsw, f);
-        write_HNSW (&idxhnsw->hnsw, f);
-        write_index (idxhnsw->storage, f);
-    } else {
-      FAISS_THROW_MSG ("don't know how to serialize this type of index");
-    }
-}
-
-void write_index (const Index *idx, FILE *f) {
-    FileIOWriter writer(f);
-    write_index (idx, &writer);
-}
-
-void write_index (const Index *idx, const char *fname) {
-    FileIOWriter writer(fname);
-    write_index (idx, &writer);
-}
-
-void write_VectorTransform (const VectorTransform *vt, const char *fname) {
-    FileIOWriter writer(fname);
-    write_VectorTransform (vt, &writer);
-}
-
-/*************************************************************
- * Read
- **************************************************************/
-
-static void read_index_header (Index *idx, IOReader *f) {
-    READ1 (idx->d);
-    READ1 (idx->ntotal);
-    Index::idx_t dummy;
-    READ1 (dummy);
-    READ1 (dummy);
-    READ1 (idx->is_trained);
-    READ1 (idx->metric_type);
-    if (idx->metric_type > 1) {
-        READ1 (idx->metric_arg);
-    }
-    idx->verbose = false;
-}
-
-VectorTransform* read_VectorTransform (IOReader *f) {
-    uint32_t h;
-    READ1 (h);
-    VectorTransform *vt = nullptr;
-
-    if (h == fourcc ("rrot") || h == fourcc ("PCAm") ||
-        h == fourcc ("LTra") || h == fourcc ("PcAm")) {
-        LinearTransform *lt = nullptr;
-        if (h == fourcc ("rrot")) {
-            lt = new RandomRotationMatrix ();
-        } else if (h == fourcc ("PCAm") ||
-                   h == fourcc ("PcAm")) {
-            PCAMatrix * pca = new PCAMatrix ();
-            READ1 (pca->eigen_power);
-            READ1 (pca->random_rotation);
-            if (h == fourcc ("PcAm"))
-                READ1 (pca->balanced_bins);
-            READVECTOR (pca->mean);
-            READVECTOR (pca->eigenvalues);
-            READVECTOR (pca->PCAMat);
-            lt = pca;
-        } else if (h == fourcc ("LTra")) {
-            lt = new LinearTransform ();
-        }
-        READ1 (lt->have_bias);
-        READVECTOR (lt->A);
-        READVECTOR (lt->b);
-        FAISS_THROW_IF_NOT (lt->A.size() >= lt->d_in * lt->d_out);
-        FAISS_THROW_IF_NOT (!lt->have_bias || lt->b.size() >= lt->d_out);
-        lt->set_is_orthonormal();
-        vt = lt;
-    } else if (h == fourcc ("RmDT")) {
-        RemapDimensionsTransform *rdt = new RemapDimensionsTransform ();
-        READVECTOR (rdt->map);
-        vt = rdt;
-    } else if (h == fourcc ("VNrm")) {
-        NormalizationTransform *nt = new NormalizationTransform ();
-        READ1 (nt->norm);
-        vt = nt;
-    } else if (h == fourcc ("VCnt")) {
-        CenteringTransform *ct = new CenteringTransform ();
-        READVECTOR (ct->mean);
-        vt = ct;
-    } else {
-        FAISS_THROW_MSG("fourcc not recognized");
-    }
-    READ1 (vt->d_in);
-    READ1 (vt->d_out);
-    READ1 (vt->is_trained);
-    return vt;
-}
-
-
-static void read_ArrayInvertedLists_sizes (
-         IOReader *f, std::vector<size_t> & sizes)
-{
-    uint32_t list_type;
-    READ1(list_type);
-    if (list_type == fourcc("full")) {
-        size_t os = sizes.size();
-        READVECTOR (sizes);
-        FAISS_THROW_IF_NOT (os == sizes.size());
-    } else if (list_type == fourcc("sprs")) {
-        std::vector<size_t> idsizes;
-        READVECTOR (idsizes);
-        for (size_t j = 0; j < idsizes.size(); j += 2) {
-            FAISS_THROW_IF_NOT (idsizes[j] < sizes.size());
-            sizes[idsizes[j]] = idsizes[j + 1];
-        }
-    } else {
-        FAISS_THROW_MSG ("invalid list_type");
-    }
-}
-
-InvertedLists *read_InvertedLists (IOReader *f, int io_flags) {
-    uint32_t h;
-    READ1 (h);
-    if (h == fourcc ("il00")) {
-        fprintf(stderr, "read_InvertedLists:"
-                " WARN! inverted lists not stored with IVF object\n");
-        return nullptr;
-    } else if (h == fourcc ("ilar") && !(io_flags & IO_FLAG_MMAP)) {
-        auto ails = new ArrayInvertedLists (0, 0);
-        READ1 (ails->nlist);
-        READ1 (ails->code_size);
-        ails->ids.resize (ails->nlist);
-        ails->codes.resize (ails->nlist);
-        std::vector<size_t> sizes (ails->nlist);
-        read_ArrayInvertedLists_sizes (f, sizes);
-        for (size_t i = 0; i < ails->nlist; i++) {
-            ails->ids[i].resize (sizes[i]);
-            ails->codes[i].resize (sizes[i] * ails->code_size);
-        }
-        for (size_t i = 0; i < ails->nlist; i++) {
-            size_t n = ails->ids[i].size();
-            if (n > 0) {
-                READANDCHECK (ails->codes[i].data(), n * ails->code_size);
-                READANDCHECK (ails->ids[i].data(), n);
-            }
-        }
-        return ails;
-    } else if (h == fourcc ("ilar") && (io_flags & IO_FLAG_MMAP)) {
-        // then we load it as an OnDiskInvertedLists
-
-        FileIOReader *reader = dynamic_cast<FileIOReader*>(f);
-        FAISS_THROW_IF_NOT_MSG(reader, "mmap only supported for File objects");
-        FILE *fdesc = reader->f;
-
-        auto ails = new OnDiskInvertedLists ();
-        READ1 (ails->nlist);
-        READ1 (ails->code_size);
-        ails->read_only = true;
-        ails->lists.resize (ails->nlist);
-        std::vector<size_t> sizes (ails->nlist);
-        read_ArrayInvertedLists_sizes (f, sizes);
-        size_t o0 = ftell(fdesc), o = o0;
-        { // do the mmap
-            struct stat buf;
-            int ret = fstat (fileno(fdesc), &buf);
-            FAISS_THROW_IF_NOT_FMT (ret == 0,
-                                    "fstat failed: %s", strerror(errno));
-            ails->totsize = buf.st_size;
-            ails->ptr = (uint8_t*)mmap (nullptr, ails->totsize,
-                                        PROT_READ, MAP_SHARED,
-                                        fileno(fdesc), 0);
-            FAISS_THROW_IF_NOT_FMT (ails->ptr != MAP_FAILED,
-                            "could not mmap: %s",
-                            strerror(errno));
-        }
-
-        for (size_t i = 0; i < ails->nlist; i++) {
-            OnDiskInvertedLists::List & l = ails->lists[i];
-            l.size = l.capacity = sizes[i];
-            l.offset = o;
-            o += l.size * (sizeof(OnDiskInvertedLists::idx_t) +
-                           ails->code_size);
-        }
-        FAISS_THROW_IF_NOT(o <= ails->totsize);
-        // resume normal reading of file
-        fseek (fdesc, o, SEEK_SET);
-        return ails;
-    } else if (h == fourcc ("ilod")) {
-        OnDiskInvertedLists *od = new OnDiskInvertedLists();
-        od->read_only = io_flags & IO_FLAG_READ_ONLY;
-        READ1 (od->nlist);
-        READ1 (od->code_size);
-        // this is a POD object
-        READVECTOR (od->lists);
-        {
-            std::vector<OnDiskInvertedLists::Slot> v;
-            READVECTOR(v);
-            od->slots.assign(v.begin(), v.end());
-        }
-        {
-            std::vector<char> x;
-            READVECTOR(x);
-            od->filename.assign(x.begin(), x.end());
-
-            if (io_flags & IO_FLAG_ONDISK_SAME_DIR) {
-                FileIOReader *reader = dynamic_cast<FileIOReader*>(f);
-                FAISS_THROW_IF_NOT_MSG (
-                    reader, "IO_FLAG_ONDISK_SAME_DIR only supported "
-                    "when reading from file");
-                std::string indexname = reader->name;
-                std::string dirname = "./";
-                size_t slash = indexname.find_last_of('/');
-                if (slash != std::string::npos) {
-                    dirname = indexname.substr(0, slash + 1);
-                }
-                std::string filename = od->filename;
-                slash = filename.find_last_of('/');
-                if (slash != std::string::npos) {
-                    filename = filename.substr(slash + 1);
-                }
-                filename = dirname + filename;
-                printf("IO_FLAG_ONDISK_SAME_DIR: "
-                       "updating ondisk filename from %s to %s\n",
-                       od->filename.c_str(), filename.c_str());
-                od->filename = filename;
-            }
-
-        }
-        READ1(od->totsize);
-        od->do_mmap();
-        return od;
-    } else {
-        FAISS_THROW_MSG ("read_InvertedLists: unsupported invlist type");
-    }
-}
-
-static void read_InvertedLists (
-        IndexIVF *ivf, IOReader *f, int io_flags) {
-    InvertedLists *ils = read_InvertedLists (f, io_flags);
-    FAISS_THROW_IF_NOT (!ils || (ils->nlist == ivf->nlist &&
-                                 ils->code_size == ivf->code_size));
-    ivf->invlists = ils;
-    ivf->own_invlists = true;
-}
-
-static void read_InvertedLists (
-    IndexBinaryIVF *ivf, IOReader *f, int io_flags) {
-    InvertedLists *ils = read_InvertedLists (f, io_flags);
-    FAISS_THROW_IF_NOT (!ils || (ils->nlist == ivf->nlist &&
-                                 ils->code_size == ivf->code_size));
-    ivf->invlists = ils;
-    ivf->own_invlists = true;
-}
-
-static void read_ProductQuantizer (ProductQuantizer *pq, IOReader *f) {
-    READ1 (pq->d);
-    READ1 (pq->M);
-    READ1 (pq->nbits);
-    pq->set_derived_values ();
-    READVECTOR (pq->centroids);
-}
-
-static void read_ScalarQuantizer (ScalarQuantizer *ivsc, IOReader *f) {
-    READ1 (ivsc->qtype);
-    READ1 (ivsc->rangestat);
-    READ1 (ivsc->rangestat_arg);
-    READ1 (ivsc->d);
-    READ1 (ivsc->code_size);
-    READVECTOR (ivsc->trained);
-}
-
-
-static void read_HNSW (HNSW *hnsw, IOReader *f) {
-    READVECTOR (hnsw->assign_probas);
-    READVECTOR (hnsw->cum_nneighbor_per_level);
-    READVECTOR (hnsw->levels);
-    READVECTOR (hnsw->offsets);
-    READVECTOR (hnsw->neighbors);
-
-    READ1 (hnsw->entry_point);
-    READ1 (hnsw->max_level);
-    READ1 (hnsw->efConstruction);
-    READ1 (hnsw->efSearch);
-    READ1 (hnsw->upper_beam);
-}
-
-ProductQuantizer * read_ProductQuantizer (const char*fname) {
-    FileIOReader reader(fname);
-    return read_ProductQuantizer(&reader);
-}
-
-ProductQuantizer * read_ProductQuantizer (IOReader *reader) {
-  ProductQuantizer *pq = new ProductQuantizer();
-  ScopeDeleter1<ProductQuantizer> del (pq);
-
-  read_ProductQuantizer(pq, reader);
-  del.release ();
-  return pq;
-}
-
-static void read_ivf_header (
-    IndexIVF *ivf, IOReader *f,
-    std::vector<std::vector<Index::idx_t> > *ids = nullptr)
-{
-    read_index_header (ivf, f);
-    READ1 (ivf->nlist);
-    READ1 (ivf->nprobe);
-    ivf->quantizer = read_index (f);
-    ivf->own_fields = true;
-    if (ids) { // used in legacy "Iv" formats
-        ids->resize (ivf->nlist);
-        for (size_t i = 0; i < ivf->nlist; i++)
-            READVECTOR ((*ids)[i]);
-    }
-    READ1 (ivf->maintain_direct_map);
-    READVECTOR (ivf->direct_map);
-}
-
-// used for legacy formats
-static ArrayInvertedLists *set_array_invlist(
-    IndexIVF *ivf, std::vector<std::vector<Index::idx_t> > &ids)
-{
-    ArrayInvertedLists *ail = new ArrayInvertedLists (
-             ivf->nlist, ivf->code_size);
-    std::swap (ail->ids, ids);
-    ivf->invlists = ail;
-    ivf->own_invlists = true;
-    return ail;
-}
-
-static IndexIVFPQ *read_ivfpq (IOReader *f, uint32_t h, int io_flags)
-{
-    bool legacy = h == fourcc ("IvQR") || h == fourcc ("IvPQ");
-
-    IndexIVFPQR *ivfpqr =
-        h == fourcc ("IvQR") || h == fourcc ("IwQR") ?
-        new IndexIVFPQR () : nullptr;
-    IndexIVFPQ * ivpq = ivfpqr ? ivfpqr : new IndexIVFPQ ();
-
-    std::vector<std::vector<Index::idx_t> > ids;
-    read_ivf_header (ivpq, f, legacy ? &ids : nullptr);
-    READ1 (ivpq->by_residual);
-    READ1 (ivpq->code_size);
-    read_ProductQuantizer (&ivpq->pq, f);
-
-    if (legacy) {
-        ArrayInvertedLists *ail = set_array_invlist (ivpq, ids);
-        for (size_t i = 0; i < ail->nlist; i++)
-            READVECTOR (ail->codes[i]);
-    } else {
-        read_InvertedLists (ivpq, f, io_flags);
-    }
-
-    if (ivpq->is_trained) {
-        // precomputed table not stored. It is cheaper to recompute it
-        ivpq->use_precomputed_table = 0;
-        if (ivpq->by_residual)
-            ivpq->precompute_table ();
-        if (ivfpqr) {
-            read_ProductQuantizer (&ivfpqr->refine_pq, f);
-            READVECTOR (ivfpqr->refine_codes);
-            READ1 (ivfpqr->k_factor);
-        }
-    }
-    return ivpq;
-}
-
-int read_old_fmt_hack = 0;
-
-Index *read_index (IOReader *f, int io_flags) {
-    Index * idx = nullptr;
-    uint32_t h;
-    READ1 (h);
-    if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) {
-        IndexFlat *idxf;
-        if (h == fourcc ("IxFI")) idxf = new IndexFlatIP ();
-        else                      idxf = new IndexFlatL2 ();
-        read_index_header (idxf, f);
-        READVECTOR (idxf->xb);
-        FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->d);
-        // leak!
-        idx = idxf;
-    } else if (h == fourcc("IxHE") || h == fourcc("IxHe")) {
-        IndexLSH * idxl = new IndexLSH ();
-        read_index_header (idxl, f);
-        READ1 (idxl->nbits);
-        READ1 (idxl->rotate_data);
-        READ1 (idxl->train_thresholds);
-        READVECTOR (idxl->thresholds);
-        READ1 (idxl->bytes_per_vec);
-        if (h == fourcc("IxHE")) {
-            FAISS_THROW_IF_NOT_FMT (idxl->nbits % 64 == 0,
-                            "can only read old format IndexLSH with "
-                            "nbits multiple of 64 (got %d)",
-                            (int) idxl->nbits);
-            // leak
-            idxl->bytes_per_vec *= 8;
-        }
-        {
-            RandomRotationMatrix *rrot = dynamic_cast<RandomRotationMatrix *>
-                (read_VectorTransform (f));
-            FAISS_THROW_IF_NOT_MSG(rrot, "expected a random rotation");
-            idxl->rrot = *rrot;
-            delete rrot;
-        }
-        READVECTOR (idxl->codes);
-        FAISS_THROW_IF_NOT (idxl->rrot.d_in == idxl->d &&
-                      idxl->rrot.d_out == idxl->nbits);
-        FAISS_THROW_IF_NOT (
-               idxl->codes.size() == idxl->ntotal * idxl->bytes_per_vec);
-        idx = idxl;
-    } else if (h == fourcc ("IxPQ") || h == fourcc ("IxPo") ||
-               h == fourcc ("IxPq")) {
-        // IxPQ and IxPo were merged into the same IndexPQ object
-        IndexPQ * idxp =new IndexPQ ();
-        read_index_header (idxp, f);
-        read_ProductQuantizer (&idxp->pq, f);
-        READVECTOR (idxp->codes);
-        if (h == fourcc ("IxPo") || h == fourcc ("IxPq")) {
-            READ1 (idxp->search_type);
-            READ1 (idxp->encode_signs);
-            READ1 (idxp->polysemous_ht);
-        }
-        // Old versoins of PQ all had metric_type set to INNER_PRODUCT
-        // when they were in fact using L2. Therefore, we force metric type
-        // to L2 when the old format is detected
-        if (h == fourcc ("IxPQ") || h == fourcc ("IxPo")) {
-            idxp->metric_type = METRIC_L2;
-        }
-        idx = idxp;
-    } else if (h == fourcc ("IvFl") || h == fourcc("IvFL")) { // legacy
-        IndexIVFFlat * ivfl = new IndexIVFFlat ();
-        std::vector<std::vector<Index::idx_t> > ids;
-        read_ivf_header (ivfl, f, &ids);
-        ivfl->code_size = ivfl->d * sizeof(float);
-        ArrayInvertedLists *ail = set_array_invlist (ivfl, ids);
-
-        if (h == fourcc ("IvFL")) {
-            for (size_t i = 0; i < ivfl->nlist; i++) {
-                READVECTOR (ail->codes[i]);
-            }
-        } else { // old format
-            for (size_t i = 0; i < ivfl->nlist; i++) {
-                std::vector<float> vec;
-                READVECTOR (vec);
-                ail->codes[i].resize(vec.size() * sizeof(float));
-                memcpy(ail->codes[i].data(), vec.data(),
-                       ail->codes[i].size());
-            }
-        }
-        idx = ivfl;
-    } else if (h == fourcc ("IwFd")) {
-        IndexIVFFlatDedup * ivfl = new IndexIVFFlatDedup ();
-        read_ivf_header (ivfl, f);
-        ivfl->code_size = ivfl->d * sizeof(float);
-        {
-            std::vector<Index::idx_t> tab;
-            READVECTOR (tab);
-            for (long i = 0; i < tab.size(); i += 2) {
-                std::pair<Index::idx_t, Index::idx_t>
-                    pair (tab[i], tab[i + 1]);
-                ivfl->instances.insert (pair);
-            }
-        }
-        read_InvertedLists (ivfl, f, io_flags);
-        idx = ivfl;
-    } else if (h == fourcc ("IwFl")) {
-        IndexIVFFlat * ivfl = new IndexIVFFlat ();
-        read_ivf_header (ivfl, f);
-        ivfl->code_size = ivfl->d * sizeof(float);
-        read_InvertedLists (ivfl, f, io_flags);
-        idx = ivfl;
-    } else if (h == fourcc ("IxSQ")) {
-        IndexScalarQuantizer * idxs = new IndexScalarQuantizer ();
-        read_index_header (idxs, f);
-        read_ScalarQuantizer (&idxs->sq, f);
-        READVECTOR (idxs->codes);
-        idxs->code_size = idxs->sq.code_size;
-        idx = idxs;
-    } else if(h == fourcc ("IvSQ")) { // legacy
-        IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer();
-        std::vector<std::vector<Index::idx_t> > ids;
-        read_ivf_header (ivsc, f, &ids);
-        read_ScalarQuantizer (&ivsc->sq, f);
-        READ1 (ivsc->code_size);
-        ArrayInvertedLists *ail = set_array_invlist (ivsc, ids);
-        for(int i = 0; i < ivsc->nlist; i++)
-            READVECTOR (ail->codes[i]);
-        idx = ivsc;
-    } else if(h == fourcc ("IwSQ") || h == fourcc ("IwSq")) {
-        IndexIVFScalarQuantizer * ivsc = new IndexIVFScalarQuantizer();
-        read_ivf_header (ivsc, f);
-        read_ScalarQuantizer (&ivsc->sq, f);
-        READ1 (ivsc->code_size);
-        if (h == fourcc ("IwSQ")) {
-            ivsc->by_residual = true;
-        } else {
-            READ1 (ivsc->by_residual);
-        }
-        read_InvertedLists (ivsc, f, io_flags);
-        idx = ivsc;
-    } else if(h == fourcc ("IwSh")) {
-        IndexIVFSpectralHash *ivsp = new IndexIVFSpectralHash ();
-        read_ivf_header (ivsp, f);
-        ivsp->vt = read_VectorTransform (f);
-        ivsp->own_fields = true;
-        READ1 (ivsp->nbit);
-        // not stored by write_ivf_header
-        ivsp->code_size = (ivsp->nbit + 7) / 8;
-        READ1 (ivsp->period);
-        READ1 (ivsp->threshold_type);
-        READVECTOR (ivsp->trained);
-        read_InvertedLists (ivsp, f, io_flags);
-        idx = ivsp;
-    } else if(h == fourcc ("IvPQ") || h == fourcc ("IvQR") ||
-              h == fourcc ("IwPQ") || h == fourcc ("IwQR")) {
-
-        idx = read_ivfpq (f, h, io_flags);
-
-    } else if(h == fourcc ("IxPT")) {
-        IndexPreTransform * ixpt = new IndexPreTransform();
-        ixpt->own_fields = true;
-        read_index_header (ixpt, f);
-        int nt;
-        if (read_old_fmt_hack == 2) {
-            nt = 1;
-        } else {
-            READ1 (nt);
-        }
-        for (int i = 0; i < nt; i++) {
-            ixpt->chain.push_back (read_VectorTransform (f));
-        }
-        ixpt->index = read_index (f, io_flags);
-        idx = ixpt;
-    } else if(h == fourcc ("Imiq")) {
-        MultiIndexQuantizer * imiq = new MultiIndexQuantizer ();
-        read_index_header (imiq, f);
-        read_ProductQuantizer (&imiq->pq, f);
-        idx = imiq;
-    } else if(h == fourcc ("IxRF")) {
-        IndexRefineFlat *idxrf = new IndexRefineFlat ();
-        read_index_header (idxrf, f);
-        idxrf->base_index = read_index(f, io_flags);
-        idxrf->own_fields = true;
-        IndexFlat *rf = dynamic_cast<IndexFlat*> (read_index (f, io_flags));
-        std::swap (*rf, idxrf->refine_index);
-        delete rf;
-        READ1 (idxrf->k_factor);
-        idx = idxrf;
-    } else if(h == fourcc ("IxMp") || h == fourcc ("IxM2")) {
-        bool is_map2 = h == fourcc ("IxM2");
-        IndexIDMap * idxmap = is_map2 ? new IndexIDMap2 () : new IndexIDMap ();
-        read_index_header (idxmap, f);
-        idxmap->index = read_index (f, io_flags);
-        idxmap->own_fields = true;
-        READVECTOR (idxmap->id_map);
-        if (is_map2) {
-            static_cast<IndexIDMap2*>(idxmap)->construct_rev_map ();
-        }
-        idx = idxmap;
-    } else if (h == fourcc ("Ix2L")) {
-        Index2Layer * idxp = new Index2Layer ();
-        read_index_header (idxp, f);
-        idxp->q1.quantizer = read_index (f, io_flags);
-        READ1 (idxp->q1.nlist);
-        READ1 (idxp->q1.quantizer_trains_alone);
-        read_ProductQuantizer (&idxp->pq, f);
-        READ1 (idxp->code_size_1);
-        READ1 (idxp->code_size_2);
-        READ1 (idxp->code_size);
-        READVECTOR (idxp->codes);
-        idx = idxp;
-    } else if(h == fourcc("IHNf") || h == fourcc("IHNp") ||
-              h == fourcc("IHNs") || h == fourcc("IHN2")) {
-        IndexHNSW *idxhnsw = nullptr;
-        if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat ();
-        if (h == fourcc("IHNp")) idxhnsw = new IndexHNSWPQ ();
-        if (h == fourcc("IHNs")) idxhnsw = new IndexHNSWSQ ();
-        if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level ();
-        read_index_header (idxhnsw, f);
-        read_HNSW (&idxhnsw->hnsw, f);
-        idxhnsw->storage = read_index (f, io_flags);
-        idxhnsw->own_fields = true;
-        if (h == fourcc("IHNp")) {
-            dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table ();
-        }
-        idx = idxhnsw;
-    } else {
-        FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
-        idx = nullptr;
-    }
-    return idx;
-}
-
-
-Index *read_index (FILE * f, int io_flags) {
-    FileIOReader reader(f);
-    return read_index(&reader, io_flags);
-}
-
-Index *read_index (const char *fname, int io_flags) {
-    FileIOReader reader(fname);
-    Index *idx = read_index (&reader, io_flags);
-    return idx;
-}
-
-VectorTransform *read_VectorTransform (const char *fname) {
-    FileIOReader reader(fname);
-    VectorTransform *vt = read_VectorTransform (&reader);
-    return vt;
-}
-
-/*************************************************************
- * cloning functions
- **************************************************************/
-
-
-
-Index * clone_index (const Index *index)
-{
-    Cloner cl;
-    return cl.clone_Index (index);
-}
-
-// assumes there is a copy constructor ready. Always try from most
-// specific to most general
-#define TRYCLONE(classname, obj) \
-    if (const classname *clo = dynamic_cast<const classname *>(obj)) { \
-        return new classname(*clo); \
-    } else
-
-VectorTransform *Cloner::clone_VectorTransform (const VectorTransform *vt)
-{
-    TRYCLONE (RemapDimensionsTransform, vt)
-    TRYCLONE (OPQMatrix, vt)
-    TRYCLONE (PCAMatrix, vt)
-    TRYCLONE (RandomRotationMatrix, vt)
-    TRYCLONE (LinearTransform, vt)
-    {
-      FAISS_THROW_MSG("clone not supported for this type of VectorTransform");
-    }
-    return nullptr;
-}
-
-IndexIVF * Cloner::clone_IndexIVF (const IndexIVF *ivf)
-{
-    TRYCLONE (IndexIVFPQR, ivf)
-    TRYCLONE (IndexIVFPQ, ivf)
-    TRYCLONE (IndexIVFFlat, ivf)
-    TRYCLONE (IndexIVFScalarQuantizer, ivf)
-    {
-      FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
-    }
-    return nullptr;
-}
-
-Index *Cloner::clone_Index (const Index *index)
-{
-    TRYCLONE (IndexPQ, index)
-    TRYCLONE (IndexLSH, index)
-    TRYCLONE (IndexFlatL2, index)
-    TRYCLONE (IndexFlatIP, index)
-    TRYCLONE (IndexFlat, index)
-    TRYCLONE (IndexScalarQuantizer, index)
-    TRYCLONE (MultiIndexQuantizer, index)
-    if (const IndexIVF * ivf = dynamic_cast<const IndexIVF*>(index)) {
-        IndexIVF *res = clone_IndexIVF (ivf);
-        if (ivf->invlists == nullptr) {
-            res->invlists = nullptr;
-        } else if (auto *ails = dynamic_cast<const ArrayInvertedLists*>
-                   (ivf->invlists)) {
-            res->invlists = new ArrayInvertedLists(*ails);
-            res->own_invlists = true;
-        } else {
-            FAISS_THROW_MSG( "clone not supported for this type of inverted lists");
-        }
-        res->own_fields = true;
-        res->quantizer = clone_Index (ivf->quantizer);
-        return res;
-    } else if (const IndexPreTransform * ipt =
-               dynamic_cast<const IndexPreTransform*> (index)) {
-        IndexPreTransform *res = new IndexPreTransform ();
-        res->d = ipt->d;
-        res->index = clone_Index (ipt->index);
-        for (int i = 0; i < ipt->chain.size(); i++)
-            res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
-        res->own_fields = true;
-        return res;
-    } else if (const IndexIDMap *idmap =
-               dynamic_cast<const IndexIDMap*> (index)) {
-        IndexIDMap *res = new IndexIDMap (*idmap);
-        res->own_fields = true;
-        res->index = clone_Index (idmap->index);
-        return res;
-    } else {
-        FAISS_THROW_MSG( "clone not supported for this type of Index");
-    }
-    return nullptr;
-}
-
-
-static void write_index_binary_header (const IndexBinary *idx, IOWriter *f) {
-    WRITE1 (idx->d);
-    WRITE1 (idx->code_size);
-    WRITE1 (idx->ntotal);
-    WRITE1 (idx->is_trained);
-    WRITE1 (idx->metric_type);
-}
-
-static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) {
-    write_index_binary_header (ivf, f);
-    WRITE1 (ivf->nlist);
-    WRITE1 (ivf->nprobe);
-    write_index_binary (ivf->quantizer, f);
-    WRITE1 (ivf->maintain_direct_map);
-    WRITEVECTOR (ivf->direct_map);
-}
-
-void write_index_binary (const IndexBinary *idx, IOWriter *f) {
-    if (const IndexBinaryFlat *idxf =
-        dynamic_cast<const IndexBinaryFlat *> (idx)) {
-        uint32_t h = fourcc ("IBxF");
-        WRITE1 (h);
-        write_index_binary_header (idx, f);
-        WRITEVECTOR (idxf->xb);
-    } else if (const IndexBinaryIVF *ivf =
-               dynamic_cast<const IndexBinaryIVF *> (idx)) {
-        uint32_t h = fourcc ("IBwF");
-        WRITE1 (h);
-        write_binary_ivf_header (ivf, f);
-        write_InvertedLists (ivf->invlists, f);
-    } else if(const IndexBinaryFromFloat * idxff =
-              dynamic_cast<const IndexBinaryFromFloat *> (idx)) {
-        uint32_t h = fourcc ("IBFf");
-        WRITE1 (h);
-        write_index_binary_header (idxff, f);
-        write_index (idxff->index, f);
-    } else if (const IndexBinaryHNSW *idxhnsw =
-               dynamic_cast<const IndexBinaryHNSW *> (idx)) {
-        uint32_t h = fourcc ("IBHf");
-        WRITE1 (h);
-        write_index_binary_header (idxhnsw, f);
-        write_HNSW (&idxhnsw->hnsw, f);
-        write_index_binary (idxhnsw->storage, f);
-    } else if(const IndexBinaryIDMap * idxmap =
-              dynamic_cast<const IndexBinaryIDMap *> (idx)) {
-        uint32_t h =
-            dynamic_cast<const IndexBinaryIDMap2 *> (idx) ? fourcc ("IBM2") :
-            fourcc ("IBMp");
-        // no need to store additional info for IndexIDMap2
-        WRITE1 (h);
-        write_index_binary_header (idxmap, f);
-        write_index_binary (idxmap->index, f);
-        WRITEVECTOR (idxmap->id_map);
-    } else {
-        FAISS_THROW_MSG ("don't know how to serialize this type of index");
-    }
-}
-
-void write_index_binary (const IndexBinary *idx, FILE *f) {
-    FileIOWriter writer(f);
-    write_index_binary(idx, &writer);
-}
-
-void write_index_binary (const IndexBinary *idx, const char *fname) {
-    FileIOWriter writer(fname);
-    write_index_binary (idx, &writer);
-}
-
-static void read_index_binary_header (IndexBinary *idx, IOReader *f) {
-    READ1 (idx->d);
-    READ1 (idx->code_size);
-    READ1 (idx->ntotal);
-    READ1 (idx->is_trained);
-    READ1 (idx->metric_type);
-    idx->verbose = false;
-}
-
-static void read_binary_ivf_header (
-    IndexBinaryIVF *ivf, IOReader *f,
-    std::vector<std::vector<Index::idx_t> > *ids = nullptr)
-{
-    read_index_binary_header (ivf, f);
-    READ1 (ivf->nlist);
-    READ1 (ivf->nprobe);
-    ivf->quantizer = read_index_binary (f);
-    ivf->own_fields = true;
-    if (ids) { // used in legacy "Iv" formats
-        ids->resize (ivf->nlist);
-        for (size_t i = 0; i < ivf->nlist; i++)
-            READVECTOR ((*ids)[i]);
-    }
-    READ1 (ivf->maintain_direct_map);
-    READVECTOR (ivf->direct_map);
-}
-
-IndexBinary *read_index_binary (IOReader *f, int io_flags) {
-    IndexBinary * idx = nullptr;
-    uint32_t h;
-    READ1 (h);
-    if (h == fourcc ("IBxF")) {
-        IndexBinaryFlat *idxf = new IndexBinaryFlat ();
-        read_index_binary_header (idxf, f);
-        READVECTOR (idxf->xb);
-        FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * idxf->code_size);
-        // leak!
-        idx = idxf;
-    } else if (h == fourcc ("IBwF")) {
-        IndexBinaryIVF *ivf = new IndexBinaryIVF ();
-        read_binary_ivf_header (ivf, f);
-        read_InvertedLists (ivf, f, io_flags);
-        idx = ivf;
-    } else if (h == fourcc ("IBFf")) {
-        IndexBinaryFromFloat *idxff = new IndexBinaryFromFloat ();
-        read_index_binary_header (idxff, f);
-        idxff->own_fields = true;
-        idxff->index = read_index (f, io_flags);
-        idx = idxff;
-    } else if (h == fourcc ("IBHf")) {
-        IndexBinaryHNSW *idxhnsw = new IndexBinaryHNSW ();
-        read_index_binary_header (idxhnsw, f);
-        read_HNSW (&idxhnsw->hnsw, f);
-        idxhnsw->storage = read_index_binary (f, io_flags);
-        idxhnsw->own_fields = true;
-        idx = idxhnsw;
-    } else if(h == fourcc ("IBMp") || h == fourcc ("IBM2")) {
-        bool is_map2 = h == fourcc ("IBM2");
-        IndexBinaryIDMap * idxmap = is_map2 ?
-            new IndexBinaryIDMap2 () : new IndexBinaryIDMap ();
-        read_index_binary_header (idxmap, f);
-        idxmap->index = read_index_binary (f, io_flags);
-        idxmap->own_fields = true;
-        READVECTOR (idxmap->id_map);
-        if (is_map2) {
-            static_cast<IndexBinaryIDMap2*>(idxmap)->construct_rev_map ();
-        }
-        idx = idxmap;
-    } else {
-        FAISS_THROW_FMT("Index type 0x%08x not supported\n", h);
-        idx = nullptr;
-    }
-    return idx;
-}
-
-IndexBinary *read_index_binary (FILE * f, int io_flags) {
-    FileIOReader reader(f);
-    return read_index_binary(&reader, io_flags);
-}
-
-IndexBinary *read_index_binary (const char *fname, int io_flags) {
-    FileIOReader reader(fname);
-    IndexBinary *idx = read_index_binary (&reader, io_flags);
-    return idx;
-}
-
-
-} // namespace faiss
diff --git a/utils.cpp b/utils.cpp
deleted file mode 100644
index a96e7d5087..0000000000
--- a/utils.cpp
+++ /dev/null
@@ -1,1612 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "utils.h"
-
-#include <cstdio>
-#include <cassert>
-#include <cstring>
-#include <cmath>
-
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <omp.h>
-
-#include <algorithm>
-#include <vector>
-
-#include "AuxIndexStructures.h"
-#include "FaissAssert.h"
-
-
-
-#ifndef FINTEGER
-#define FINTEGER long
-#endif
-
-
-extern "C" {
-
-/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
-
-int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
-            n, FINTEGER *k, const float *alpha, const float *a,
-            FINTEGER *lda, const float *b, FINTEGER *
-            ldb, float *beta, float *c, FINTEGER *ldc);
-
-/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
-
-int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
-                 float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
-
-int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
-            FINTEGER *lda, float *tau, float *work,
-            FINTEGER *lwork, FINTEGER *info);
-
-int sgemv_(const char *trans, FINTEGER *m, FINTEGER *n, float *alpha,
-           const float *a, FINTEGER *lda, const float *x, FINTEGER *incx,
-           float *beta, float *y, FINTEGER *incy);
-
-}
-
-
-/**************************************************
- * Get some stats about the system
- **************************************************/
-
-namespace faiss {
-
-double getmillisecs () {
-    struct timeval tv;
-    gettimeofday (&tv, nullptr);
-    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
-}
-
-
-#ifdef __linux__
-
-size_t get_mem_usage_kb ()
-{
-    int pid = getpid ();
-    char fname[256];
-    snprintf (fname, 256, "/proc/%d/status", pid);
-    FILE * f = fopen (fname, "r");
-    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
-    size_t sz = 0;
-    for (;;) {
-        char buf [256];
-        if (!fgets (buf, 256, f)) break;
-        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
-    }
-    fclose (f);
-    return sz;
-}
-
-#elif __APPLE__
-
-size_t get_mem_usage_kb ()
-{
-    fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
-    return 0;
-}
-
-#endif
-
-
-
-/**************************************************
- * Random data generation functions
- **************************************************/
-
-RandomGenerator::RandomGenerator (int64_t seed)
-    : mt((unsigned int)seed) {}
-
-int RandomGenerator::rand_int ()
-{
-    return mt() & 0x7fffffff;
-}
-
-int64_t RandomGenerator::rand_int64 ()
-{
-    return int64_t(rand_int()) | int64_t(rand_int()) << 31;
-}
-
-int RandomGenerator::rand_int (int max)
-{
-    return mt() % max;
-}
-
-float RandomGenerator::rand_float ()
-{
-    return mt() / float(mt.max());
-}
-
-double RandomGenerator::rand_double ()
-{
-    return mt() / double(mt.max());
-}
-
-
-/***********************************************************************
- * Random functions in this C file only exist because Torch
- *  counterparts are slow and not multi-threaded.  Typical use is for
- *  more than 1-100 billion values. */
-
-
-/* Generate a set of random floating point values such that x[i] in [0,1]
-   multi-threading. For this reason, we rely on re-entreant functions.  */
-void float_rand (float * x, size_t n, int64_t seed)
-{
-    // only try to parallelize on large enough arrays
-    const size_t nblock = n < 1024 ? 1 : 1024;
-
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
-
-#pragma omp parallel for
-    for (size_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
-
-        const size_t istart = j * n / nblock;
-        const size_t iend = (j + 1) * n / nblock;
-
-        for (size_t i = istart; i < iend; i++)
-            x[i] = rng.rand_float ();
-    }
-}
-
-
-void float_randn (float * x, size_t n, int64_t seed)
-{
-    // only try to parallelize on large enough arrays
-    const size_t nblock = n < 1024 ? 1 : 1024;
-
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
-
-#pragma omp parallel for
-    for (size_t j = 0; j < nblock; j++) {
-        RandomGenerator rng (a0 + j * b0);
-
-        double a = 0, b = 0, s = 0;
-        int state = 0;  /* generate two number per "do-while" loop */
-
-        const size_t istart = j * n / nblock;
-        const size_t iend = (j + 1) * n / nblock;
-
-        for (size_t i = istart; i < iend; i++) {
-            /* Marsaglia's method (see Knuth) */
-            if (state == 0) {
-                do {
-                    a = 2.0 * rng.rand_double () - 1;
-                    b = 2.0 * rng.rand_double () - 1;
-                    s = a * a + b * b;
-                } while (s >= 1.0);
-                x[i] = a * sqrt(-2.0 * log(s) / s);
-            }
-            else
-                x[i] = b * sqrt(-2.0 * log(s) / s);
-            state = 1 - state;
-        }
-    }
-}
-
-
-/* Integer versions */
-void int64_rand (int64_t * x, size_t n, int64_t seed)
-{
-    // only try to parallelize on large enough arrays
-    const size_t nblock = n < 1024 ? 1 : 1024;
-
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
-
-#pragma omp parallel for
-    for (size_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
-
-        const size_t istart = j * n / nblock;
-        const size_t iend = (j + 1) * n / nblock;
-        for (size_t i = istart; i < iend; i++)
-            x[i] = rng.rand_int64 ();
-    }
-}
-
-
-
-void rand_perm (int *perm, size_t n, int64_t seed)
-{
-    for (size_t i = 0; i < n; i++) perm[i] = i;
-
-    RandomGenerator rng (seed);
-
-    for (size_t i = 0; i + 1 < n; i++) {
-        int i2 = i + rng.rand_int (n - i);
-        std::swap(perm[i], perm[i2]);
-    }
-}
-
-
-
-
-void byte_rand (uint8_t * x, size_t n, int64_t seed)
-{
-    // only try to parallelize on large enough arrays
-    const size_t nblock = n < 1024 ? 1 : 1024;
-
-    RandomGenerator rng0 (seed);
-    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
-
-#pragma omp parallel for
-    for (size_t j = 0; j < nblock; j++) {
-
-        RandomGenerator rng (a0 + j * b0);
-
-        const size_t istart = j * n / nblock;
-        const size_t iend = (j + 1) * n / nblock;
-
-        size_t i;
-        for (i = istart; i < iend; i++)
-            x[i] = rng.rand_int64 ();
-    }
-}
-
-
-
-void reflection (const float * __restrict u,
-                 float * __restrict x,
-                 size_t n, size_t d, size_t nu)
-{
-    size_t i, j, l;
-    for (i = 0; i < n; i++) {
-        const float * up = u;
-        for (l = 0; l < nu; l++) {
-            float ip1 = 0, ip2 = 0;
-
-            for (j = 0; j < d; j+=2) {
-                ip1 += up[j] * x[j];
-                ip2 += up[j+1] * x[j+1];
-            }
-            float ip = 2 * (ip1 + ip2);
-
-            for (j = 0; j < d; j++)
-                x[j] -= ip * up[j];
-            up += d;
-        }
-        x += d;
-    }
-}
-
-
-/* Reference implementation (slower) */
-void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
-{
-    size_t i, j, l;
-    for (i = 0; i < n; i++) {
-        const float * up = u;
-        for (l = 0; l < nu; l++) {
-            double ip = 0;
-
-            for (j = 0; j < d; j++)
-                ip += up[j] * x[j];
-            ip *= 2;
-
-            for (j = 0; j < d; j++)
-                x[j] -= ip * up[j];
-
-            up += d;
-        }
-        x += d;
-    }
-}
-
-
-
-
-
-/***************************************************************************
- * Matrix/vector ops
- ***************************************************************************/
-
-
-
-/* Compute the inner product between a vector x and
-   a set of ny vectors y.
-   These functions are not intended to replace BLAS matrix-matrix, as they
-   would be significantly less efficient in this case. */
-void fvec_inner_products_ny (float * ip,
-                             const float * x,
-                             const float * y,
-                             size_t d, size_t ny)
-{
-    // Not sure which one is fastest
-#if 0
-    {
-        FINTEGER di = d;
-        FINTEGER nyi = ny;
-        float one = 1.0, zero = 0.0;
-        FINTEGER onei = 1;
-        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
-    }
-#endif
-    for (size_t i = 0; i < ny; i++) {
-        ip[i] = fvec_inner_product (x, y, d);
-        y += d;
-    }
-}
-
-
-
-
-
-/* Compute the L2 norm of a set of nx vectors */
-void fvec_norms_L2 (float * __restrict nr,
-                    const float * __restrict x,
-                    size_t d, size_t nx)
-{
-
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++) {
-        nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
-    }
-}
-
-void fvec_norms_L2sqr (float * __restrict nr,
-                       const float * __restrict x,
-                       size_t d, size_t nx)
-{
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++)
-        nr[i] = fvec_norm_L2sqr (x + i * d, d);
-}
-
-
-
-void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
-{
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++) {
-        float * __restrict xi = x + i * d;
-
-        float nr = fvec_norm_L2sqr (xi, d);
-
-        if (nr > 0) {
-            size_t j;
-            const float inv_nr = 1.0 / sqrtf (nr);
-            for (j = 0; j < d; j++)
-                xi[j] *= inv_nr;
-        }
-    }
-}
-
-
-
-
-
-
-
-
-
-
-
-
-/***************************************************************************
- * KNN functions
- ***************************************************************************/
-
-
-
-/* Find the nearest neighbors for nx queries in a set of ny vectors */
-static void knn_inner_product_sse (const float * x,
-                        const float * y,
-                        size_t d, size_t nx, size_t ny,
-                        float_minheap_array_t * res)
-{
-    size_t k = res->k;
-    size_t check_period = InterruptCallback::get_period_hint (ny * d);
-
-    check_period *= omp_get_max_threads();
-
-    for (size_t i0 = 0; i0 < nx; i0 += check_period) {
-        size_t i1 = std::min(i0 + check_period, nx);
-
-#pragma omp parallel for
-        for (size_t i = i0; i < i1; i++) {
-            const float * x_i = x + i * d;
-            const float * y_j = y;
-
-            float * __restrict simi = res->get_val(i);
-            int64_t * __restrict idxi = res->get_ids (i);
-
-            minheap_heapify (k, simi, idxi);
-
-            for (size_t j = 0; j < ny; j++) {
-                float ip = fvec_inner_product (x_i, y_j, d);
-
-                if (ip > simi[0]) {
-                    minheap_pop (k, simi, idxi);
-                    minheap_push (k, simi, idxi, ip, j);
-                }
-                y_j += d;
-            }
-            minheap_reorder (k, simi, idxi);
-        }
-        InterruptCallback::check ();
-    }
-
-}
-
-static void knn_L2sqr_sse (
-                const float * x,
-                const float * y,
-                size_t d, size_t nx, size_t ny,
-                float_maxheap_array_t * res)
-{
-    size_t k = res->k;
-
-    size_t check_period = InterruptCallback::get_period_hint (ny * d);
-    check_period *= omp_get_max_threads();
-
-    for (size_t i0 = 0; i0 < nx; i0 += check_period) {
-        size_t i1 = std::min(i0 + check_period, nx);
-
-#pragma omp parallel for
-        for (size_t i = i0; i < i1; i++) {
-            const float * x_i = x + i * d;
-            const float * y_j = y;
-            size_t j;
-            float * simi = res->get_val(i);
-            int64_t * idxi = res->get_ids (i);
-
-            maxheap_heapify (k, simi, idxi);
-            for (j = 0; j < ny; j++) {
-                float disij = fvec_L2sqr (x_i, y_j, d);
-
-                if (disij < simi[0]) {
-                    maxheap_pop (k, simi, idxi);
-                    maxheap_push (k, simi, idxi, disij, j);
-                }
-                y_j += d;
-            }
-            maxheap_reorder (k, simi, idxi);
-        }
-        InterruptCallback::check ();
-    }
-
-}
-
-
-/** Find the nearest neighbors for nx queries in a set of ny vectors */
-static void knn_inner_product_blas (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float_minheap_array_t * res)
-{
-    res->heapify ();
-
-    // BLAS does not like empty matrices
-    if (nx == 0 || ny == 0) return;
-
-    /* block sizes */
-    const size_t bs_x = 4096, bs_y = 1024;
-    // const size_t bs_x = 16, bs_y = 16;
-    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);
-
-    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
-        size_t i1 = i0 + bs_x;
-        if(i1 > nx) i1 = nx;
-
-        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
-            size_t j1 = j0 + bs_y;
-            if (j1 > ny) j1 = ny;
-            /* compute the actual dot products */
-            {
-                float one = 1, zero = 0;
-                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
-                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
-                        y + j0 * d, &di,
-                        x + i0 * d, &di, &zero,
-                        ip_block.get(), &nyi);
-            }
-
-            /* collect maxima */
-            res->addn (j1 - j0, ip_block.get(), j0, i0, i1 - i0);
-        }
-        InterruptCallback::check ();
-    }
-    res->reorder ();
-}
-
-// distance correction is an operator that can be applied to transform
-// the distances
-template<class DistanceCorrection>
-static void knn_L2sqr_blas (const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float_maxheap_array_t * res,
-        const DistanceCorrection &corr)
-{
-    res->heapify ();
-
-    // BLAS does not like empty matrices
-    if (nx == 0 || ny == 0) return;
-
-    size_t k = res->k;
-
-    /* block sizes */
-    const size_t bs_x = 4096, bs_y = 1024;
-    // const size_t bs_x = 16, bs_y = 16;
-    float *ip_block = new float[bs_x * bs_y];
-    float *x_norms = new float[nx];
-    float *y_norms = new float[ny];
-    ScopeDeleter<float> del1(ip_block), del3(x_norms), del2(y_norms);
-
-    fvec_norms_L2sqr (x_norms, x, d, nx);
-    fvec_norms_L2sqr (y_norms, y, d, ny);
-
-
-    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
-        size_t i1 = i0 + bs_x;
-        if(i1 > nx) i1 = nx;
-
-        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
-            size_t j1 = j0 + bs_y;
-            if (j1 > ny) j1 = ny;
-            /* compute the actual dot products */
-            {
-                float one = 1, zero = 0;
-                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
-                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
-                        y + j0 * d, &di,
-                        x + i0 * d, &di, &zero,
-                        ip_block, &nyi);
-            }
-
-            /* collect minima */
-#pragma omp parallel for
-            for (size_t i = i0; i < i1; i++) {
-                float * __restrict simi = res->get_val(i);
-                int64_t * __restrict idxi = res->get_ids (i);
-                const float *ip_line = ip_block + (i - i0) * (j1 - j0);
-
-                for (size_t j = j0; j < j1; j++) {
-                    float ip = *ip_line++;
-                    float dis = x_norms[i] + y_norms[j] - 2 * ip;
-
-                    // negative values can occur for identical vectors
-                    // due to roundoff errors
-                    if (dis < 0) dis = 0;
-
-                    dis = corr (dis, i, j);
-
-                    if (dis < simi[0]) {
-                        maxheap_pop (k, simi, idxi);
-                        maxheap_push (k, simi, idxi, dis, j);
-                    }
-                }
-            }
-        }
-        InterruptCallback::check ();
-    }
-    res->reorder ();
-
-}
-
-
-
-
-
-
-
-
-
-/*******************************************************
- * KNN driver functions
- *******************************************************/
-
-int distance_compute_blas_threshold = 20;
-
-void knn_inner_product (const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float_minheap_array_t * res)
-{
-    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
-        knn_inner_product_sse (x, y, d, nx, ny, res);
-    } else {
-        knn_inner_product_blas (x, y, d, nx, ny, res);
-    }
-}
-
-
-
-struct NopDistanceCorrection {
-  float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
-    return dis;
-    }
-};
-
-void knn_L2sqr (const float * x,
-                const float * y,
-                size_t d, size_t nx, size_t ny,
-                float_maxheap_array_t * res)
-{
-    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
-        knn_L2sqr_sse (x, y, d, nx, ny, res);
-    } else {
-        NopDistanceCorrection nop;
-        knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
-    }
-}
-
-struct BaseShiftDistanceCorrection {
-    const float *base_shift;
-    float operator()(float dis, size_t /*qno*/, size_t bno) const {
-      return dis - base_shift[bno];
-    }
-};
-
-void knn_L2sqr_base_shift (
-         const float * x,
-         const float * y,
-         size_t d, size_t nx, size_t ny,
-         float_maxheap_array_t * res,
-         const float *base_shift)
-{
-    BaseShiftDistanceCorrection corr = {base_shift};
-    knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
-}
-
-
-
-/***************************************************************************
- * compute a subset of  distances
- ***************************************************************************/
-
-/* compute the inner product between x and a subset y of ny vectors,
-   whose indices are given by idy.  */
-void fvec_inner_products_by_idx (float * __restrict ip,
-                                 const float * x,
-                                 const float * y,
-                                 const int64_t * __restrict ids, /* for y vecs */
-                                 size_t d, size_t nx, size_t ny)
-{
-#pragma omp parallel for
-    for (size_t j = 0; j < nx; j++) {
-        const int64_t * __restrict idsj = ids + j * ny;
-        const float * xj = x + j * d;
-        float * __restrict ipj = ip + j * ny;
-        for (size_t i = 0; i < ny; i++) {
-            if (idsj[i] < 0)
-                continue;
-            ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
-        }
-    }
-}
-
-/* compute the inner product between x and a subset y of ny vectors,
-   whose indices are given by idy.  */
-void fvec_L2sqr_by_idx (float * __restrict dis,
-                        const float * x,
-                        const float * y,
-                        const int64_t * __restrict ids, /* ids of y vecs */
-                        size_t d, size_t nx, size_t ny)
-{
-#pragma omp parallel for
-    for (size_t j = 0; j < nx; j++) {
-        const int64_t * __restrict idsj = ids + j * ny;
-        const float * xj = x + j * d;
-        float * __restrict disj = dis + j * ny;
-        for (size_t i = 0; i < ny; i++) {
-            if (idsj[i] < 0)
-                continue;
-            disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
-        }
-    }
-}
-
-
-
-
-
-/* Find the nearest neighbors for nx queries in a set of ny vectors
-   indexed by ids. May be useful for re-ranking a pre-selected vector list */
-void knn_inner_products_by_idx (const float * x,
-                                const float * y,
-                                const int64_t * ids,
-                                size_t d, size_t nx, size_t ny,
-                                float_minheap_array_t * res)
-{
-    size_t k = res->k;
-
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++) {
-        const float * x_ = x + i * d;
-        const int64_t * idsi = ids + i * ny;
-        size_t j;
-        float * __restrict simi = res->get_val(i);
-        int64_t * __restrict idxi = res->get_ids (i);
-        minheap_heapify (k, simi, idxi);
-
-        for (j = 0; j < ny; j++) {
-            if (idsi[j] < 0) break;
-            float ip = fvec_inner_product (x_, y + d * idsi[j], d);
-
-            if (ip > simi[0]) {
-                minheap_pop (k, simi, idxi);
-                minheap_push (k, simi, idxi, ip, idsi[j]);
-            }
-        }
-        minheap_reorder (k, simi, idxi);
-    }
-
-}
-
-void knn_L2sqr_by_idx (const float * x,
-                       const float * y,
-                       const int64_t * __restrict ids,
-                       size_t d, size_t nx, size_t ny,
-                       float_maxheap_array_t * res)
-{
-    size_t k = res->k;
-
-#pragma omp parallel for
-    for (size_t i = 0; i < nx; i++) {
-        const float * x_ = x + i * d;
-        const int64_t * __restrict idsi = ids + i * ny;
-        float * __restrict simi = res->get_val(i);
-        int64_t * __restrict idxi = res->get_ids (i);
-        maxheap_heapify (res->k, simi, idxi);
-        for (size_t j = 0; j < ny; j++) {
-            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
-
-            if (disij < simi[0]) {
-                maxheap_pop (k, simi, idxi);
-                maxheap_push (k, simi, idxi, disij, idsi[j]);
-            }
-        }
-        maxheap_reorder (res->k, simi, idxi);
-    }
-
-}
-
-
-
-
-
-/***************************************************************************
- * Range search
- ***************************************************************************/
-
-/** Find the nearest neighbors for nx queries in a set of ny vectors
- * compute_l2 = compute pairwise squared L2 distance rather than inner prod
- */
-template <bool compute_l2>
-static void range_search_blas (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float radius,
-        RangeSearchResult *result)
-{
-
-    // BLAS does not like empty matrices
-    if (nx == 0 || ny == 0) return;
-
-    /* block sizes */
-    const size_t bs_x = 4096, bs_y = 1024;
-    // const size_t bs_x = 16, bs_y = 16;
-    float *ip_block = new float[bs_x * bs_y];
-    ScopeDeleter<float> del0(ip_block);
-
-    float *x_norms = nullptr, *y_norms = nullptr;
-    ScopeDeleter<float> del1, del2;
-    if (compute_l2) {
-        x_norms = new float[nx];
-        del1.set (x_norms);
-        fvec_norms_L2sqr (x_norms, x, d, nx);
-
-        y_norms = new float[ny];
-        del2.set (y_norms);
-        fvec_norms_L2sqr (y_norms, y, d, ny);
-    }
-
-    std::vector <RangeSearchPartialResult *> partial_results;
-
-    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
-        size_t j1 = j0 + bs_y;
-        if (j1 > ny) j1 = ny;
-        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
-        partial_results.push_back (pres);
-
-        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
-            size_t i1 = i0 + bs_x;
-            if(i1 > nx) i1 = nx;
-
-            /* compute the actual dot products */
-            {
-                float one = 1, zero = 0;
-                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
-                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
-                        y + j0 * d, &di,
-                        x + i0 * d, &di, &zero,
-                        ip_block, &nyi);
-            }
-
-
-            for (size_t i = i0; i < i1; i++) {
-                const float *ip_line = ip_block + (i - i0) * (j1 - j0);
-
-                RangeQueryResult & qres = pres->new_result (i);
-
-                for (size_t j = j0; j < j1; j++) {
-                    float ip = *ip_line++;
-                    if (compute_l2) {
-                        float dis =  x_norms[i] + y_norms[j] - 2 * ip;
-                        if (dis < radius) {
-                            qres.add (dis, j);
-                        }
-                    } else {
-                        if (ip > radius) {
-                            qres.add (ip, j);
-                        }
-                    }
-                }
-            }
-        }
-        InterruptCallback::check ();
-    }
-
-    RangeSearchPartialResult::merge (partial_results);
-}
-
-
-template <bool compute_l2>
-static void range_search_sse (const float * x,
-                const float * y,
-                size_t d, size_t nx, size_t ny,
-                float radius,
-                RangeSearchResult *res)
-{
-    FAISS_THROW_IF_NOT (d % 4 == 0);
-
-#pragma omp parallel
-    {
-        RangeSearchPartialResult pres (res);
-
-#pragma omp for
-        for (size_t i = 0; i < nx; i++) {
-            const float * x_ = x + i * d;
-            const float * y_ = y;
-            size_t j;
-
-            RangeQueryResult & qres = pres.new_result (i);
-
-            for (j = 0; j < ny; j++) {
-                if (compute_l2) {
-                    float disij = fvec_L2sqr (x_, y_, d);
-                    if (disij < radius) {
-                        qres.add (disij, j);
-                    }
-                } else {
-                    float ip = fvec_inner_product (x_, y_, d);
-                    if (ip > radius) {
-                        qres.add (ip, j);
-                    }
-                }
-                y_ += d;
-            }
-
-        }
-        pres.finalize ();
-    }
-
-    // check just at the end because the use case is typically just
-    // when the nb of queries is low.
-    InterruptCallback::check();
-}
-
-
-
-
-
-void range_search_L2sqr (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float radius,
-        RangeSearchResult *res)
-{
-
-    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
-        range_search_sse<true> (x, y, d, nx, ny, radius, res);
-    } else {
-        range_search_blas<true> (x, y, d, nx, ny, radius, res);
-    }
-}
-
-void range_search_inner_product (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float radius,
-        RangeSearchResult *res)
-{
-
-    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
-        range_search_sse<false> (x, y, d, nx, ny, radius, res);
-    } else {
-        range_search_blas<false> (x, y, d, nx, ny, radius, res);
-    }
-}
-
-
-
-/***************************************************************************
- * Some matrix manipulation functions
- ***************************************************************************/
-
-
-/* This function exists because the Torch counterpart is extremly slow
-   (not multi-threaded + unexpected overhead even in single thread).
-   It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y>  */
-void inner_product_to_L2sqr (float * __restrict dis,
-                             const float * nr1,
-                             const float * nr2,
-                             size_t n1, size_t n2)
-{
-
-#pragma omp parallel for
-    for (size_t j = 0 ; j < n1 ; j++) {
-        float * disj = dis + j * n2;
-        for (size_t i = 0 ; i < n2 ; i++)
-            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
-    }
-}
-
-
-void matrix_qr (int m, int n, float *a)
-{
-    FAISS_THROW_IF_NOT (m >= n);
-    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
-    std::vector<float> tau (ki);
-    FINTEGER lwork = -1, info;
-    float work_size;
-
-    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
-             &work_size, &lwork, &info);
-    lwork = size_t(work_size);
-    std::vector<float> work (lwork);
-
-    sgeqrf_ (&mi, &ni, a, &mi,
-             tau.data(), work.data(), &lwork, &info);
-
-    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
-             work.data(), &lwork, &info);
-
-}
-
-
-void pairwise_L2sqr (int64_t d,
-                     int64_t nq, const float *xq,
-                     int64_t nb, const float *xb,
-                     float *dis,
-                     int64_t ldq, int64_t ldb, int64_t ldd)
-{
-    if (nq == 0 || nb == 0) return;
-    if (ldq == -1) ldq = d;
-    if (ldb == -1) ldb = d;
-    if (ldd == -1) ldd = nb;
-
-    // store in beginning of distance matrix to avoid malloc
-    float *b_norms = dis;
-
-#pragma omp parallel for
-    for (int64_t i = 0; i < nb; i++)
-        b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
-
-#pragma omp parallel for
-    for (int64_t i = 1; i < nq; i++) {
-        float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
-        for (int64_t j = 0; j < nb; j++)
-            dis[i * ldd + j] = q_norm + b_norms [j];
-    }
-
-    {
-        float q_norm = fvec_norm_L2sqr (xq, d);
-        for (int64_t j = 0; j < nb; j++)
-            dis[j] += q_norm;
-    }
-
-    {
-        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
-        float one = 1.0, minus_2 = -2.0;
-
-        sgemm_ ("Transposed", "Not transposed",
-                &nbi, &nqi, &di,
-                &minus_2,
-                xb, &ldbi,
-                xq, &ldqi,
-                &one, dis, &lddi);
-    }
-
-}
-
-/***************************************************************************
- * Kmeans subroutine
- ***************************************************************************/
-
-// a bit above machine epsilon for float16
-
-#define EPS (1 / 1024.)
-
-/* For k-means, compute centroids given assignment of vectors to centroids */
-int km_update_centroids (const float * x,
-                         float * centroids,
-                         int64_t * assign,
-                         size_t d, size_t k, size_t n,
-                         size_t k_frozen)
-{
-    k -= k_frozen;
-    centroids += k_frozen * d;
-
-    std::vector<size_t> hassign(k);
-    memset (centroids, 0, sizeof(*centroids) * d * k);
-
-#pragma omp parallel
-    {
-        int nt = omp_get_num_threads();
-        int rank = omp_get_thread_num();
-        // this thread is taking care of centroids c0:c1
-        size_t c0 = (k * rank) / nt;
-        size_t c1 = (k * (rank + 1)) / nt;
-        const float *xi = x;
-        size_t nacc = 0;
-
-        for (size_t i = 0; i < n; i++) {
-            int64_t ci = assign[i];
-            assert (ci >= 0 && ci < k + k_frozen);
-            ci -= k_frozen;
-            if (ci >= c0 && ci < c1)  {
-                float * c = centroids + ci * d;
-                hassign[ci]++;
-                for (size_t j = 0; j < d; j++)
-                    c[j] += xi[j];
-                nacc++;
-            }
-            xi += d;
-        }
-
-    }
-
-#pragma omp parallel for
-    for (size_t ci = 0; ci < k; ci++) {
-        float * c = centroids + ci * d;
-        float ni = (float) hassign[ci];
-        if (ni != 0) {
-            for (size_t j = 0; j < d; j++)
-                c[j] /= ni;
-        }
-    }
-
-    /* Take care of void clusters */
-    size_t nsplit = 0;
-    RandomGenerator rng (1234);
-    for (size_t ci = 0; ci < k; ci++) {
-        if (hassign[ci] == 0) { /* need to redefine a centroid */
-            size_t cj;
-            for (cj = 0; 1; cj = (cj + 1) % k) {
-                /* probability to pick this cluster for split */
-                float p = (hassign[cj] - 1.0) / (float) (n - k);
-                float r = rng.rand_float ();
-                if (r < p) {
-                    break; /* found our cluster to be split */
-                }
-            }
-            memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
-
-            /* small symmetric pertubation. Much better than  */
-            for (size_t j = 0; j < d; j++) {
-                if (j % 2 == 0) {
-                    centroids[ci * d + j] *= 1 + EPS;
-                    centroids[cj * d + j] *= 1 - EPS;
-                } else {
-                    centroids[ci * d + j] *= 1 - EPS;
-                    centroids[cj * d + j] *= 1 + EPS;
-                }
-            }
-
-            /* assume even split of the cluster */
-            hassign[ci] = hassign[cj] / 2;
-            hassign[cj] -= hassign[ci];
-            nsplit++;
-        }
-    }
-
-    return nsplit;
-}
-
-#undef EPS
-
-
-
-/***************************************************************************
- * Result list routines
- ***************************************************************************/
-
-
-void ranklist_handle_ties (int k, int64_t *idx, const float *dis)
-{
-    float prev_dis = -1e38;
-    int prev_i = -1;
-    for (int i = 0; i < k; i++) {
-        if (dis[i] != prev_dis) {
-            if (i > prev_i + 1) {
-                // sort between prev_i and i - 1
-                std::sort (idx + prev_i, idx + i);
-            }
-            prev_i = i;
-            prev_dis = dis[i];
-        }
-    }
-}
-
-size_t merge_result_table_with (size_t n, size_t k,
-                                int64_t *I0, float *D0,
-                                const int64_t *I1, const float *D1,
-                                bool keep_min,
-                                int64_t translation)
-{
-    size_t n1 = 0;
-
-#pragma omp parallel reduction(+:n1)
-    {
-        std::vector<int64_t> tmpI (k);
-        std::vector<float> tmpD (k);
-
-#pragma omp for
-        for (size_t i = 0; i < n; i++) {
-            int64_t *lI0 = I0 + i * k;
-            float *lD0 = D0 + i * k;
-            const int64_t *lI1 = I1 + i * k;
-            const float *lD1 = D1 + i * k;
-            size_t r0 = 0;
-            size_t r1 = 0;
-
-            if (keep_min) {
-                for (size_t j = 0; j < k; j++) {
-
-                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
-                        tmpD[j] = lD0[r0];
-                        tmpI[j] = lI0[r0];
-                        r0++;
-                    } else if (lD1[r1] >= 0) {
-                        tmpD[j] = lD1[r1];
-                        tmpI[j] = lI1[r1] + translation;
-                        r1++;
-                    } else { // both are NaNs
-                        tmpD[j] = NAN;
-                        tmpI[j] = -1;
-                    }
-                }
-            } else {
-                for (size_t j = 0; j < k; j++) {
-                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
-                        tmpD[j] = lD0[r0];
-                        tmpI[j] = lI0[r0];
-                        r0++;
-                    } else if (lD1[r1] >= 0) {
-                        tmpD[j] = lD1[r1];
-                        tmpI[j] = lI1[r1] + translation;
-                        r1++;
-                    } else { // both are NaNs
-                        tmpD[j] = NAN;
-                        tmpI[j] = -1;
-                    }
-                }
-            }
-            n1 += r1;
-            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
-            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
-        }
-    }
-
-    return n1;
-}
-
-
-
-size_t ranklist_intersection_size (size_t k1, const int64_t *v1,
-                                   size_t k2, const int64_t *v2_in)
-{
-    if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
-    int64_t *v2 = new int64_t [k2];
-    memcpy (v2, v2_in, sizeof (int64_t) * k2);
-    std::sort (v2, v2 + k2);
-    { // de-dup v2
-        int64_t prev = -1;
-        size_t wp = 0;
-        for (size_t i = 0; i < k2; i++) {
-            if (v2 [i] != prev) {
-                v2[wp++] = prev = v2 [i];
-            }
-        }
-        k2 = wp;
-    }
-    const int64_t seen_flag = 1L << 60;
-    size_t count = 0;
-    for (size_t i = 0; i < k1; i++) {
-        int64_t q = v1 [i];
-        size_t i0 = 0, i1 = k2;
-        while (i0 + 1 < i1) {
-            size_t imed = (i1 + i0) / 2;
-            int64_t piv = v2 [imed] & ~seen_flag;
-            if (piv <= q) i0 = imed;
-            else          i1 = imed;
-        }
-        if (v2 [i0] == q) {
-            count++;
-            v2 [i0] |= seen_flag;
-        }
-    }
-    delete [] v2;
-
-    return count;
-}
-
-double imbalance_factor (int k, const int *hist) {
-    double tot = 0, uf = 0;
-
-    for (int i = 0 ; i < k ; i++) {
-        tot += hist[i];
-        uf += hist[i] * (double) hist[i];
-    }
-    uf = uf * k / (tot * tot);
-
-    return uf;
-}
-
-
-double imbalance_factor (int n, int k, const int64_t *assign) {
-    std::vector<int> hist(k, 0);
-    for (int i = 0; i < n; i++) {
-        hist[assign[i]]++;
-    }
-
-    return imbalance_factor (k, hist.data());
-}
-
-
-
-int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
-    memset (hist, 0, sizeof(hist[0]) * vmax);
-    int nout = 0;
-    while (n--) {
-        if (v[n] < 0 || v[n] >= vmax) nout++;
-        else hist[v[n]]++;
-    }
-    return nout;
-}
-
-
-void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
-{
-    FAISS_THROW_IF_NOT (nbits % 8 == 0);
-    size_t d = nbits / 8;
-    std::vector<int> accu(d * 256);
-    const uint8_t *c = codes;
-    for (size_t i = 0; i < n; i++)
-        for(int j = 0; j < d; j++)
-            accu[j * 256 + *c++]++;
-    memset (hist, 0, sizeof(*hist) * nbits);
-    for (int i = 0; i < d; i++) {
-        const int *ai = accu.data() + i * 256;
-        int * hi = hist + i * 8;
-        for (int j = 0; j < 256; j++)
-            for (int k = 0; k < 8; k++)
-                if ((j >> k) & 1)
-                    hi[k] += ai[j];
-    }
-
-}
-
-
-
-size_t ivec_checksum (size_t n, const int *a)
-{
-    size_t cs = 112909;
-    while (n--) cs = cs * 65713 + a[n] * 1686049;
-    return cs;
-}
-
-
-namespace {
-    struct ArgsortComparator {
-        const float *vals;
-        bool operator() (const size_t a, const size_t b) const {
-            return vals[a] < vals[b];
-        }
-    };
-
-    struct SegmentS {
-        size_t i0; // begin pointer in the permutation array
-        size_t i1; // end
-        size_t len() const {
-            return i1 - i0;
-        }
-    };
-
-    // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
-    // extended to > 1 merge thread
-
-    // merges 2 ranges that should be consecutive on the source into
-    // the union of the two on the destination
-    template<typename T>
-    void parallel_merge (const T *src, T *dst,
-                         SegmentS &s1, SegmentS & s2, int nt,
-                         const ArgsortComparator & comp) {
-        if (s2.len() > s1.len()) { // make sure that s1 larger than s2
-            std::swap(s1, s2);
-        }
-
-        // compute sub-ranges for each thread
-        SegmentS s1s[nt], s2s[nt], sws[nt];
-        s2s[0].i0 = s2.i0;
-        s2s[nt - 1].i1 = s2.i1;
-
-        // not sure parallel actually helps here
-#pragma omp parallel for num_threads(nt)
-        for (int t = 0; t < nt; t++) {
-            s1s[t].i0 = s1.i0 + s1.len() * t / nt;
-            s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
-
-            if (t + 1 < nt) {
-                T pivot = src[s1s[t].i1];
-                size_t i0 = s2.i0, i1 = s2.i1;
-                while (i0 + 1 < i1) {
-                    size_t imed = (i1 + i0) / 2;
-                    if (comp (pivot, src[imed])) {i1 = imed; }
-                    else                         {i0 = imed; }
-                }
-                s2s[t].i1 = s2s[t + 1].i0 = i1;
-            }
-        }
-        s1.i0 = std::min(s1.i0, s2.i0);
-        s1.i1 = std::max(s1.i1, s2.i1);
-        s2 = s1;
-        sws[0].i0 = s1.i0;
-        for (int t = 0; t < nt; t++) {
-            sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
-            if (t + 1 < nt) {
-                sws[t + 1].i0 = sws[t].i1;
-            }
-        }
-        assert(sws[nt - 1].i1 == s1.i1);
-
-        // do the actual merging
-#pragma omp parallel for num_threads(nt)
-        for (int t = 0; t < nt; t++) {
-            SegmentS sw = sws[t];
-            SegmentS s1t = s1s[t];
-            SegmentS s2t = s2s[t];
-            if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
-                for (;;) {
-                    // assert (sw.len() == s1t.len() + s2t.len());
-                    if (comp(src[s1t.i0], src[s2t.i0])) {
-                        dst[sw.i0++] = src[s1t.i0++];
-                        if (s1t.i0 == s1t.i1) break;
-                    } else {
-                        dst[sw.i0++] = src[s2t.i0++];
-                        if (s2t.i0 == s2t.i1) break;
-                    }
-                }
-            }
-            if (s1t.len() > 0) {
-                assert(s1t.len() == sw.len());
-                memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
-            } else if (s2t.len() > 0) {
-                assert(s2t.len() == sw.len());
-                memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
-            }
-        }
-    }
-
-};
-
-void fvec_argsort (size_t n, const float *vals,
-                    size_t *perm)
-{
-    for (size_t i = 0; i < n; i++) perm[i] = i;
-    ArgsortComparator comp = {vals};
-    std::sort (perm, perm + n, comp);
-}
-
-void fvec_argsort_parallel (size_t n, const float *vals,
-                            size_t *perm)
-{
-    size_t * perm2 = new size_t[n];
-    // 2 result tables, during merging, flip between them
-    size_t *permB = perm2, *permA = perm;
-
-    int nt = omp_get_max_threads();
-    { // prepare correct permutation so that the result ends in perm
-      // at final iteration
-        int nseg = nt;
-        while (nseg > 1) {
-            nseg = (nseg + 1) / 2;
-            std::swap (permA, permB);
-        }
-    }
-
-#pragma omp parallel
-    for (size_t i = 0; i < n; i++) permA[i] = i;
-
-    ArgsortComparator comp = {vals};
-
-    SegmentS segs[nt];
-
-    // independent sorts
-#pragma omp parallel for
-    for (int t = 0; t < nt; t++) {
-        size_t i0 = t * n / nt;
-        size_t i1 = (t + 1) * n / nt;
-        SegmentS seg = {i0, i1};
-        std::sort (permA + seg.i0, permA + seg.i1, comp);
-        segs[t] = seg;
-    }
-    int prev_nested = omp_get_nested();
-    omp_set_nested(1);
-
-    int nseg = nt;
-    while (nseg > 1) {
-        int nseg1 = (nseg + 1) / 2;
-        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
-        int sub_nseg1 = nseg / 2;
-
-#pragma omp parallel for num_threads(nseg1)
-        for (int s = 0; s < nseg; s += 2) {
-            if (s + 1 == nseg) { // otherwise isolated segment
-                memcpy(permB + segs[s].i0, permA + segs[s].i0,
-                       segs[s].len() * sizeof(size_t));
-            } else {
-                int t0 = s * sub_nt / sub_nseg1;
-                int t1 = (s + 1) * sub_nt / sub_nseg1;
-                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
-                parallel_merge(permA, permB, segs[s], segs[s + 1],
-                               t1 - t0, comp);
-            }
-        }
-        for (int s = 0; s < nseg; s += 2)
-            segs[s / 2] = segs[s];
-        nseg = nseg1;
-        std::swap (permA, permB);
-    }
-    assert (permA == perm);
-    omp_set_nested(prev_nested);
-    delete [] perm2;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-const float *fvecs_maybe_subsample (
-          size_t d, size_t *n, size_t nmax, const float *x,
-          bool verbose, int64_t seed)
-{
-
-    if (*n <= nmax) return x; // nothing to do
-
-    size_t n2 = nmax;
-    if (verbose) {
-        printf ("  Input training set too big (max size is %ld), sampling "
-                "%ld / %ld vectors\n", nmax, n2, *n);
-    }
-    std::vector<int> subset (*n);
-    rand_perm (subset.data (), *n, seed);
-    float *x_subset = new float[n2 * d];
-    for (int64_t i = 0; i < n2; i++)
-        memcpy (&x_subset[i * d],
-                &x[subset[i] * size_t(d)],
-                sizeof (x[0]) * d);
-    *n = n2;
-    return x_subset;
-}
-
-
-void binary_to_real(size_t d, const uint8_t *x_in, float *x_out) {
-    for (size_t i = 0; i < d; ++i) {
-        x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
-    }
-}
-
-void real_to_binary(size_t d, const float *x_in, uint8_t *x_out) {
-  for (size_t i = 0; i < d / 8; ++i) {
-    uint8_t b = 0;
-    for (int j = 0; j < 8; ++j) {
-      if (x_in[8 * i + j] > 0) {
-        b |= (1 << j);
-      }
-    }
-    x_out[i] = b;
-  }
-}
-
-
-// from Python's stringobject.c
-uint64_t hash_bytes (const uint8_t *bytes, int64_t n) {
-    const uint8_t *p = bytes;
-    uint64_t x = (uint64_t)(*p) << 7;
-    int64_t len = n;
-    while (--len >= 0) {
-        x = (1000003*x) ^ *p++;
-    }
-    x ^= n;
-    return x;
-}
-
-
-bool check_openmp() {
-    omp_set_num_threads(10);
-
-    if (omp_get_max_threads() != 10) {
-        return false;
-    }
-
-    std::vector<int> nt_per_thread(10);
-    size_t sum = 0;
-    bool in_parallel = true;
-#pragma omp parallel reduction(+: sum)
-    {
-        if (!omp_in_parallel()) {
-            in_parallel = false;
-        }
-
-        int nt = omp_get_num_threads();
-        int rank = omp_get_thread_num();
-
-        nt_per_thread[rank] = nt;
-#pragma omp for
-        for(int i = 0; i < 1000 * 1000 * 10; i++) {
-            sum += i;
-        }
-    }
-
-    if (!in_parallel) {
-        return false;
-    }
-    if (nt_per_thread[0] != 10) {
-        return false;
-    }
-    if (sum == 0) {
-        return false;
-    }
-
-    return true;
-}
-
-} // namespace faiss
diff --git a/utils.h b/utils.h
deleted file mode 100644
index 6d802a5533..0000000000
--- a/utils.h
+++ /dev/null
@@ -1,418 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-/*
- *  A few utilitary functions for similarity search:
- * - random generators
- * - optimized exhaustive distance and knn search functions
- * - some functions reimplemented from torch for speed
- */
-
-#ifndef FAISS_utils_h
-#define FAISS_utils_h
-
-#include <random>
-#include <stdint.h>
-
-#include "Heap.h"
-
-
-namespace faiss {
-
-
-/**************************************************
- * Get some stats about the system
-**************************************************/
-
-
-/// ms elapsed since some arbitrary epoch
-double getmillisecs ();
-
-/// get current RSS usage in kB
-size_t get_mem_usage_kb ();
-
-
-/**************************************************
- * Random data generation functions
- **************************************************/
-
-/// random generator that can be used in multithreaded contexts
-struct RandomGenerator {
-
-    std::mt19937 mt;
-
-    /// random positive integer
-    int rand_int ();
-
-    /// random int64_t
-    int64_t rand_int64 ();
-
-    /// generate random integer between 0 and max-1
-    int rand_int (int max);
-
-    /// between 0 and 1
-    float rand_float ();
-
-    double rand_double ();
-
-    explicit RandomGenerator (int64_t seed = 1234);
-};
-
-/* Generate an array of uniform random floats / multi-threaded implementation */
-void float_rand (float * x, size_t n, int64_t seed);
-void float_randn (float * x, size_t n, int64_t seed);
-void int64_rand (int64_t * x, size_t n, int64_t seed);
-void byte_rand (uint8_t * x, size_t n, int64_t seed);
-
-/* random permutation */
-void rand_perm (int * perm, size_t n, int64_t seed);
-
-
-
- /*********************************************************
- * Optimized distance/norm/inner prod computations
- *********************************************************/
-
-
-/// Squared L2 distance between two vectors
-float fvec_L2sqr (
-        const float * x,
-        const float * y,
-        size_t d);
-
-/// inner product
-float  fvec_inner_product (
-        const float * x,
-        const float * y,
-        size_t d);
-
-/// L1 distance
-float fvec_L1 (
-        const float * x,
-        const float * y,
-        size_t d);
-
-float fvec_Linf (
-        const float * x,
-        const float * y,
-        size_t d);
-
-
-/// a balanced assignment has a IF of 1
-double imbalance_factor (int n, int k, const int64_t *assign);
-
-/// same, takes a histogram as input
-double imbalance_factor (int k, const int *hist);
-
-/** Compute pairwise distances between sets of vectors
- *
- * @param d     dimension of the vectors
- * @param nq    nb of query vectors
- * @param nb    nb of database vectors
- * @param xq    query vectors (size nq * d)
- * @param xb    database vectros (size nb * d)
- * @param dis   output distances (size nq * nb)
- * @param ldq,ldb, ldd strides for the matrices
- */
-void pairwise_L2sqr (int64_t d,
-                     int64_t nq, const float *xq,
-                     int64_t nb, const float *xb,
-                     float *dis,
-                     int64_t ldq = -1, int64_t ldb = -1, int64_t ldd = -1);
-
-/* compute the inner product between nx vectors x and one y */
-void fvec_inner_products_ny (
-        float * ip,         /* output inner product */
-        const float * x,
-        const float * y,
-        size_t d, size_t ny);
-
-/* compute ny square L2 distance bewteen x and a set of contiguous y vectors */
-void fvec_L2sqr_ny (
-        float * dis,
-        const float * x,
-        const float * y,
-        size_t d, size_t ny);
-
-
-/** squared norm of a vector */
-float fvec_norm_L2sqr (const float * x,
-                       size_t d);
-
-/** compute the L2 norms for a set of vectors
- *
- * @param  ip       output norms, size nx
- * @param  x        set of vectors, size nx * d
- */
-void fvec_norms_L2 (float * ip, const float * x, size_t d, size_t nx);
-
-/// same as fvec_norms_L2, but computes square norms
-void fvec_norms_L2sqr (float * ip, const float * x, size_t d, size_t nx);
-
-/* L2-renormalize a set of vector. Nothing done if the vector is 0-normed */
-void fvec_renorm_L2 (size_t d, size_t nx, float * x);
-
-
-/* This function exists because the Torch counterpart is extremly slow
-   (not multi-threaded + unexpected overhead even in single thread).
-   It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y>  */
-void inner_product_to_L2sqr (float * dis,
-                             const float * nr1,
-                             const float * nr2,
-                             size_t n1, size_t n2);
-
-/***************************************************************************
- * Compute a subset of  distances
- ***************************************************************************/
-
- /* compute the inner product between x and a subset y of ny vectors,
-   whose indices are given by idy.  */
-void fvec_inner_products_by_idx (
-        float * ip,
-        const float * x,
-        const float * y,
-        const int64_t *ids,
-        size_t d, size_t nx, size_t ny);
-
-/* same but for a subset in y indexed by idsy (ny vectors in total) */
-void fvec_L2sqr_by_idx (
-        float * dis,
-        const float * x,
-        const float * y,
-        const int64_t *ids, /* ids of y vecs */
-        size_t d, size_t nx, size_t ny);
-
-/***************************************************************************
- * KNN functions
- ***************************************************************************/
-
-// threshold on nx above which we switch to BLAS to compute distances
-extern int distance_compute_blas_threshold;
-
-/** Return the k nearest neighors of each of the nx vectors x among the ny
- *  vector y, w.r.t to max inner product
- *
- * @param x    query vectors, size nx * d
- * @param y    database vectors, size ny * d
- * @param res  result array, which also provides k. Sorted on output
- */
-void knn_inner_product (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float_minheap_array_t * res);
-
-/** Same as knn_inner_product, for the L2 distance */
-void knn_L2sqr (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float_maxheap_array_t * res);
-
-
-
-/** same as knn_L2sqr, but base_shift[bno] is subtracted to all
- * computed distances.
- *
- * @param base_shift   size ny
- */
-void knn_L2sqr_base_shift (
-         const float * x,
-         const float * y,
-         size_t d, size_t nx, size_t ny,
-         float_maxheap_array_t * res,
-         const float *base_shift);
-
-/* Find the nearest neighbors for nx queries in a set of ny vectors
- * indexed by ids. May be useful for re-ranking a pre-selected vector list
- */
-void knn_inner_products_by_idx (
-        const float * x,
-        const float * y,
-        const int64_t *  ids,
-        size_t d, size_t nx, size_t ny,
-        float_minheap_array_t * res);
-
-void knn_L2sqr_by_idx (const float * x,
-                       const float * y,
-                       const int64_t * ids,
-                       size_t d, size_t nx, size_t ny,
-                       float_maxheap_array_t * res);
-
-/***************************************************************************
- * Range search
- ***************************************************************************/
-
-
-
-/// Forward declaration, see AuxIndexStructures.h
-struct RangeSearchResult;
-
-/** Return the k nearest neighors of each of the nx vectors x among the ny
- *  vector y, w.r.t to max inner product
- *
- * @param x      query vectors, size nx * d
- * @param y      database vectors, size ny * d
- * @param radius search radius around the x vectors
- * @param result result structure
- */
-void range_search_L2sqr (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float radius,
-        RangeSearchResult *result);
-
-/// same as range_search_L2sqr for the inner product similarity
-void range_search_inner_product (
-        const float * x,
-        const float * y,
-        size_t d, size_t nx, size_t ny,
-        float radius,
-        RangeSearchResult *result);
-
-
-
-
-
-/***************************************************************************
- * Misc  matrix and vector manipulation functions
- ***************************************************************************/
-
-
-/** compute c := a + bf * b for a, b and c tables
- *
- * @param n   size of the tables
- * @param a   size n
- * @param b   size n
- * @param c   restult table, size n
- */
-void fvec_madd (size_t n, const float *a,
-                float bf, const float *b, float *c);
-
-
-/** same as fvec_madd, also return index of the min of the result table
- * @return    index of the min of table c
- */
-int fvec_madd_and_argmin (size_t n, const float *a,
-                           float bf, const float *b, float *c);
-
-
-/* perform a reflection (not an efficient implementation, just for test ) */
-void reflection (const float * u, float * x, size_t n, size_t d, size_t nu);
-
-
-/** For k-means: update stage.
- *
- * @param x          training vectors, size n * d
- * @param centroids  centroid vectors, size k * d
- * @param assign     nearest centroid for each training vector, size n
- * @param k_frozen   do not update the k_frozen first centroids
- * @return           nb of spliting operations to fight empty clusters
- */
-int km_update_centroids (
-        const float * x,
-        float * centroids,
-        int64_t * assign,
-        size_t d, size_t k, size_t n,
-        size_t k_frozen);
-
-/** compute the Q of the QR decomposition for m > n
- * @param a   size n * m: input matrix and output Q
- */
-void matrix_qr (int m, int n, float *a);
-
-/** distances are supposed to be sorted. Sorts indices with same distance*/
-void ranklist_handle_ties (int k, int64_t *idx, const float *dis);
-
-/** count the number of comon elements between v1 and v2
- * algorithm = sorting + bissection to avoid double-counting duplicates
- */
-size_t ranklist_intersection_size (size_t k1, const int64_t *v1,
-                                   size_t k2, const int64_t *v2);
-
-/** merge a result table into another one
- *
- * @param I0, D0       first result table, size (n, k)
- * @param I1, D1       second result table, size (n, k)
- * @param keep_min     if true, keep min values, otherwise keep max
- * @param translation  add this value to all I1's indexes
- * @return             nb of values that were taken from the second table
- */
-size_t merge_result_table_with (size_t n, size_t k,
-                                int64_t *I0, float *D0,
-                                const int64_t *I1, const float *D1,
-                                bool keep_min = true,
-                                int64_t translation = 0);
-
-
-
-void fvec_argsort (size_t n, const float *vals,
-                    size_t *perm);
-
-void fvec_argsort_parallel (size_t n, const float *vals,
-                    size_t *perm);
-
-
-/// compute histogram on v
-int ivec_hist (size_t n, const int * v, int vmax, int *hist);
-
-/** Compute histogram of bits on a code array
- *
- * @param codes   size(n, nbits / 8)
- * @param hist    size(nbits): nb of 1s in the array of codes
- */
-void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist);
-
-
-/// compute a checksum on a table.
-size_t ivec_checksum (size_t n, const int *a);
-
-
-/** random subsamples a set of vectors if there are too many of them
- *
- * @param d      dimension of the vectors
- * @param n      on input: nb of input vectors, output: nb of output vectors
- * @param nmax   max nb of vectors to keep
- * @param x      input array, size *n-by-d
- * @param seed   random seed to use for sampling
- * @return       x or an array allocated with new [] with *n vectors
- */
-const float *fvecs_maybe_subsample (
-       size_t d, size_t *n, size_t nmax, const float *x,
-       bool verbose = false, int64_t seed = 1234);
-
-/** Convert binary vector to +1/-1 valued float vector.
- *
- * @param d      dimension of the vector (multiple of 8)
- * @param x_in   input binary vector (uint8_t table of size d / 8)
- * @param x_out  output float vector (float table of size d)
- */
-void binary_to_real(size_t d, const uint8_t *x_in, float *x_out);
-
-/** Convert float vector to binary vector. Components > 0 are converted to 1,
- * others to 0.
- *
- * @param d      dimension of the vector (multiple of 8)
- * @param x_in   input float vector (float table of size d)
- * @param x_out  output binary vector (uint8_t table of size d / 8)
- */
-void real_to_binary(size_t d, const float *x_in, uint8_t *x_out);
-
-
-/** A reasonable hashing function */
-uint64_t hash_bytes (const uint8_t *bytes, int64_t n);
-
-/** Whether OpenMP annotations were respected. */
-bool check_openmp();
-
-} // namspace faiss
-
-
-#endif /* FAISS_utils_h */
diff --git a/utils_simd.cpp b/utils_simd.cpp
deleted file mode 100644
index bb954a4310..0000000000
--- a/utils_simd.cpp
+++ /dev/null
@@ -1,815 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// -*- c++ -*-
-
-#include "utils.h"
-
-#include <cstdio>
-#include <cassert>
-#include <cstring>
-#include <cmath>
-
-#ifdef __SSE__
-#include <immintrin.h>
-#endif
-
-#ifdef __aarch64__
-#include  <arm_neon.h>
-#endif
-
-#include <omp.h>
-
-
-
-/**************************************************
- * Get some stats about the system
- **************************************************/
-
-namespace faiss {
-
-#ifdef __AVX__
-#define USE_AVX
-#endif
-
-
-/*********************************************************
- * Optimized distance computations
- *********************************************************/
-
-
-/* Functions to compute:
-   - L2 distance between 2 vectors
-   - inner product between 2 vectors
-   - L2 norm of a vector
-
-   The functions should probably not be invoked when a large number of
-   vectors are be processed in batch (in which case Matrix multiply
-   is faster), but may be useful for comparing vectors isolated in
-   memory.
-
-   Works with any vectors of any dimension, even unaligned (in which
-   case they are slower).
-
-*/
-
-
-/*********************************************************
- * Reference implementations
- */
-
-
-float fvec_L2sqr_ref (const float * x,
-                     const float * y,
-                     size_t d)
-{
-    size_t i;
-    float res = 0;
-    for (i = 0; i < d; i++) {
-        const float tmp = x[i] - y[i];
-       res += tmp * tmp;
-    }
-    return res;
-}
-
-float fvec_L1_ref (const float * x,
-                   const float * y,
-                   size_t d)
-{
-    size_t i;
-    float res = 0;
-    for (i = 0; i < d; i++) {
-        const float tmp = x[i] - y[i];
-        res += fabs(tmp);
-    }
-    return res;
-}
-
-float fvec_Linf_ref (const float * x,
-                     const float * y,
-                     size_t d)
-{
-  size_t i;
-  float res = 0;
-  for (i = 0; i < d; i++) {
-    res = fmax(res, fabs(x[i] - y[i]));
-  }
-  return res;
-}
-
-float fvec_inner_product_ref (const float * x,
-                             const float * y,
-                             size_t d)
-{
-    size_t i;
-    float res = 0;
-    for (i = 0; i < d; i++)
-       res += x[i] * y[i];
-    return res;
-}
-
-float fvec_norm_L2sqr_ref (const float *x, size_t d)
-{
-    size_t i;
-    double res = 0;
-    for (i = 0; i < d; i++)
-       res += x[i] * x[i];
-    return res;
-}
-
-
-void fvec_L2sqr_ny_ref (float * dis,
-                    const float * x,
-                    const float * y,
-                    size_t d, size_t ny)
-{
-    for (size_t i = 0; i < ny; i++) {
-        dis[i] = fvec_L2sqr (x, y, d);
-        y += d;
-    }
-}
-
-
-
-
-/*********************************************************
- * SSE and AVX implementations
- */
-
-#ifdef __SSE__
-
-// reads 0 <= d < 4 floats as __m128
-static inline __m128 masked_read (int d, const float *x)
-{
-    assert (0 <= d && d < 4);
-    __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
-    switch (d) {
-      case 3:
-        buf[2] = x[2];
-      case 2:
-        buf[1] = x[1];
-      case 1:
-        buf[0] = x[0];
-    }
-    return _mm_load_ps (buf);
-    // cannot use AVX2 _mm_mask_set1_epi32
-}
-
-float fvec_norm_L2sqr (const float *  x,
-                      size_t d)
-{
-    __m128 mx;
-    __m128 msum1 = _mm_setzero_ps();
-
-    while (d >= 4) {
-        mx = _mm_loadu_ps (x); x += 4;
-        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
-        d -= 4;
-    }
-
-    mx = masked_read (d, x);
-    msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
-
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    return  _mm_cvtss_f32 (msum1);
-}
-
-namespace {
-
-float sqr (float x) {
-    return x * x;
-}
-
-
-void fvec_L2sqr_ny_D1 (float * dis, const float * x,
-                       const float * y, size_t ny)
-{
-    float x0s = x[0];
-    __m128 x0 = _mm_set_ps (x0s, x0s, x0s, x0s);
-
-    size_t i;
-    for (i = 0; i + 3 < ny; i += 4) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        dis[i] = _mm_cvtss_f32 (accu);
-        tmp = _mm_shuffle_ps (accu, accu, 1);
-        dis[i + 1] = _mm_cvtss_f32 (tmp);
-        tmp = _mm_shuffle_ps (accu, accu, 2);
-        dis[i + 2] = _mm_cvtss_f32 (tmp);
-        tmp = _mm_shuffle_ps (accu, accu, 3);
-        dis[i + 3] = _mm_cvtss_f32 (tmp);
-    }
-    while (i < ny) { // handle non-multiple-of-4 case
-        dis[i++] = sqr(x0s - *y++);
-    }
-}
-
-
-void fvec_L2sqr_ny_D2 (float * dis, const float * x,
-                       const float * y, size_t ny)
-{
-    __m128 x0 = _mm_set_ps (x[1], x[0], x[1], x[0]);
-
-    size_t i;
-    for (i = 0; i + 1 < ny; i += 2) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        accu = _mm_hadd_ps (accu, accu);
-        dis[i] = _mm_cvtss_f32 (accu);
-        accu = _mm_shuffle_ps (accu, accu, 3);
-        dis[i + 1] = _mm_cvtss_f32 (accu);
-    }
-    if (i < ny) { // handle odd case
-        dis[i] = sqr(x[0] - y[0]) + sqr(x[1] - y[1]);
-    }
-}
-
-
-
-void fvec_L2sqr_ny_D4 (float * dis, const float * x,
-                        const float * y, size_t ny)
-{
-    __m128 x0 = _mm_loadu_ps(x);
-
-    for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        accu = _mm_hadd_ps (accu, accu);
-        accu = _mm_hadd_ps (accu, accu);
-        dis[i] = _mm_cvtss_f32 (accu);
-    }
-}
-
-
-void fvec_L2sqr_ny_D8 (float * dis, const float * x,
-                        const float * y, size_t ny)
-{
-    __m128 x0 = _mm_loadu_ps(x);
-    __m128 x1 = _mm_loadu_ps(x + 4);
-
-    for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
-        accu = _mm_hadd_ps (accu, accu);
-        accu = _mm_hadd_ps (accu, accu);
-        dis[i] = _mm_cvtss_f32 (accu);
-    }
-}
-
-
-void fvec_L2sqr_ny_D12 (float * dis, const float * x,
-                        const float * y, size_t ny)
-{
-    __m128 x0 = _mm_loadu_ps(x);
-    __m128 x1 = _mm_loadu_ps(x + 4);
-    __m128 x2 = _mm_loadu_ps(x + 8);
-
-    for (size_t i = 0; i < ny; i++) {
-        __m128 tmp, accu;
-        tmp = x0 - _mm_loadu_ps (y); y += 4;
-        accu = tmp * tmp;
-        tmp = x1 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
-        tmp = x2 - _mm_loadu_ps (y); y += 4;
-        accu += tmp * tmp;
-        accu = _mm_hadd_ps (accu, accu);
-        accu = _mm_hadd_ps (accu, accu);
-        dis[i] = _mm_cvtss_f32 (accu);
-    }
-}
-
-
-} // anonymous namespace
-
-void fvec_L2sqr_ny (float * dis, const float * x,
-                        const float * y, size_t d, size_t ny) {
-    // optimized for a few special cases
-    switch(d) {
-    case 1:
-        fvec_L2sqr_ny_D1 (dis, x, y, ny);
-        return;
-    case 2:
-        fvec_L2sqr_ny_D2 (dis, x, y, ny);
-        return;
-    case 4:
-        fvec_L2sqr_ny_D4 (dis, x, y, ny);
-        return;
-    case 8:
-        fvec_L2sqr_ny_D8 (dis, x, y, ny);
-        return;
-    case 12:
-        fvec_L2sqr_ny_D12 (dis, x, y, ny);
-        return;
-    default:
-        fvec_L2sqr_ny_ref (dis, x, y, d, ny);
-        return;
-    }
-}
-
-
-
-#endif
-
-#ifdef USE_AVX
-
-// reads 0 <= d < 8 floats as __m256
-static inline __m256 masked_read_8 (int d, const float *x)
-{
-    assert (0 <= d && d < 8);
-    if (d < 4) {
-        __m256 res = _mm256_setzero_ps ();
-        res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
-        return res;
-    } else {
-        __m256 res = _mm256_setzero_ps ();
-        res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
-        res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
-        return res;
-    }
-}
-
-float fvec_inner_product (const float * x,
-                          const float * y,
-                          size_t d)
-{
-    __m256 msum1 = _mm256_setzero_ps();
-
-    while (d >= 8) {
-        __m256 mx = _mm256_loadu_ps (x); x += 8;
-        __m256 my = _mm256_loadu_ps (y); y += 8;
-        msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
-        d -= 8;
-    }
-
-    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
-    msum2 +=       _mm256_extractf128_ps(msum1, 0);
-
-    if (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
-        d -= 4;
-    }
-
-    if (d > 0) {
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
-    }
-
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    return  _mm_cvtss_f32 (msum2);
-}
-
-float fvec_L2sqr (const float * x,
-                 const float * y,
-                 size_t d)
-{
-    __m256 msum1 = _mm256_setzero_ps();
-
-    while (d >= 8) {
-        __m256 mx = _mm256_loadu_ps (x); x += 8;
-        __m256 my = _mm256_loadu_ps (y); y += 8;
-        const __m256 a_m_b1 = mx - my;
-        msum1 += a_m_b1 * a_m_b1;
-        d -= 8;
-    }
-
-    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
-    msum2 +=       _mm256_extractf128_ps(msum1, 0);
-
-    if (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        const __m128 a_m_b1 = mx - my;
-        msum2 += a_m_b1 * a_m_b1;
-        d -= 4;
-    }
-
-    if (d > 0) {
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        __m128 a_m_b1 = mx - my;
-        msum2 += a_m_b1 * a_m_b1;
-    }
-
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    return  _mm_cvtss_f32 (msum2);
-}
-
-float fvec_L1 (const float * x, const float * y, size_t d)
-{
-    __m256 msum1 = _mm256_setzero_ps();
-    __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
-
-    while (d >= 8) {
-        __m256 mx = _mm256_loadu_ps (x); x += 8;
-        __m256 my = _mm256_loadu_ps (y); y += 8;
-        const __m256 a_m_b = mx - my;
-        msum1 += _mm256_and_ps(signmask, a_m_b);
-        d -= 8;
-    }
-
-    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
-    msum2 +=       _mm256_extractf128_ps(msum1, 0);
-    __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
-
-    if (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        const __m128 a_m_b = mx - my;
-        msum2 += _mm_and_ps(signmask2, a_m_b);
-        d -= 4;
-    }
-
-    if (d > 0) {
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        __m128 a_m_b = mx - my;
-        msum2 += _mm_and_ps(signmask2, a_m_b);
-    }
-
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    msum2 = _mm_hadd_ps (msum2, msum2);
-    return  _mm_cvtss_f32 (msum2);
-}
-
-float fvec_Linf (const float * x, const float * y, size_t d)
-{
-    __m256 msum1 = _mm256_setzero_ps();
-    __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL));
-
-    while (d >= 8) {
-        __m256 mx = _mm256_loadu_ps (x); x += 8;
-        __m256 my = _mm256_loadu_ps (y); y += 8;
-        const __m256 a_m_b = mx - my;
-        msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b));
-        d -= 8;
-    }
-
-    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
-    msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0));
-    __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL));
-
-    if (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        const __m128 a_m_b = mx - my;
-        msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
-        d -= 4;
-    }
-
-    if (d > 0) {
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        __m128 a_m_b = mx - my;
-        msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b));
-    }
-
-    msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2);
-    msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1));
-    return  _mm_cvtss_f32 (msum2);
-}
-
-#elif defined(__SSE__) // But not AVX
-
-float fvec_L1 (const float * x, const float * y, size_t d)
-{
-    return fvec_L1_ref (x, y, d);
-}
-
-float fvec_Linf (const float * x, const float * y, size_t d)
-{
-    return fvec_Linf_ref (x, y, d);
-}
-
-
-float fvec_L2sqr (const float * x,
-                 const float * y,
-                 size_t d)
-{
-    __m128 msum1 = _mm_setzero_ps();
-
-    while (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        const __m128 a_m_b1 = mx - my;
-        msum1 += a_m_b1 * a_m_b1;
-        d -= 4;
-    }
-
-    if (d > 0) {
-        // add the last 1, 2 or 3 values
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        __m128 a_m_b1 = mx - my;
-        msum1 += a_m_b1 * a_m_b1;
-    }
-
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    return  _mm_cvtss_f32 (msum1);
-}
-
-
-float fvec_inner_product (const float * x,
-                         const float * y,
-                         size_t d)
-{
-    __m128 mx, my;
-    __m128 msum1 = _mm_setzero_ps();
-
-    while (d >= 4) {
-        mx = _mm_loadu_ps (x); x += 4;
-        my = _mm_loadu_ps (y); y += 4;
-        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
-        d -= 4;
-    }
-
-    // add the last 1, 2, or 3 values
-    mx = masked_read (d, x);
-    my = masked_read (d, y);
-    __m128 prod = _mm_mul_ps (mx, my);
-
-    msum1 = _mm_add_ps (msum1, prod);
-
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    msum1 = _mm_hadd_ps (msum1, msum1);
-    return  _mm_cvtss_f32 (msum1);
-}
-
-#elif defined(__aarch64__)
-
-
-float fvec_L2sqr (const float * x,
-                  const float * y,
-                  size_t d)
-{
-    if (d & 3) return fvec_L2sqr_ref (x, y, d);
-    float32x4_t accu = vdupq_n_f32 (0);
-    for (size_t i = 0; i < d; i += 4) {
-        float32x4_t xi = vld1q_f32 (x + i);
-        float32x4_t yi = vld1q_f32 (y + i);
-        float32x4_t sq = vsubq_f32 (xi, yi);
-        accu = vfmaq_f32 (accu, sq, sq);
-    }
-    float32x4_t a2 = vpaddq_f32 (accu, accu);
-    return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1);
-}
-
-float fvec_inner_product (const float * x,
-                          const float * y,
-                          size_t d)
-{
-    if (d & 3) return fvec_inner_product_ref (x, y, d);
-    float32x4_t accu = vdupq_n_f32 (0);
-    for (size_t i = 0; i < d; i += 4) {
-        float32x4_t xi = vld1q_f32 (x + i);
-        float32x4_t yi = vld1q_f32 (y + i);
-        accu = vfmaq_f32 (accu, xi, yi);
-    }
-    float32x4_t a2 = vpaddq_f32 (accu, accu);
-    return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1);
-}
-
-float fvec_norm_L2sqr (const float *x, size_t d)
-{
-    if (d & 3) return fvec_norm_L2sqr_ref (x, d);
-    float32x4_t accu = vdupq_n_f32 (0);
-    for (size_t i = 0; i < d; i += 4) {
-        float32x4_t xi = vld1q_f32 (x + i);
-        accu = vfmaq_f32 (accu, xi, xi);
-    }
-    float32x4_t a2 = vpaddq_f32 (accu, accu);
-    return vdups_laneq_f32 (a2, 0) + vdups_laneq_f32 (a2, 1);
-}
-
-// not optimized for ARM
-void fvec_L2sqr_ny (float * dis, const float * x,
-                        const float * y, size_t d, size_t ny) {
-    fvec_L2sqr_ny_ref (dis, x, y, d, ny);
-}
-
-float fvec_L1 (const float * x, const float * y, size_t d)
-{
-    return fvec_L1_ref (x, y, d);
-}
-
-float fvec_Linf (const float * x, const float * y, size_t d)
-{
-    return fvec_Linf_ref (x, y, d);
-}
-
-
-#else
-// scalar implementation
-
-float fvec_L2sqr (const float * x,
-                  const float * y,
-                  size_t d)
-{
-    return fvec_L2sqr_ref (x, y, d);
-}
-
-float fvec_L1 (const float * x, const float * y, size_t d)
-{
-    return fvec_L1_ref (x, y, d);
-}
-
-float fvec_Linf (const float * x, const float * y, size_t d)
-{
-    return fvec_Linf_ref (x, y, d);
-}
-
-float fvec_inner_product (const float * x,
-                             const float * y,
-                             size_t d)
-{
-    return fvec_inner_product_ref (x, y, d);
-}
-
-float fvec_norm_L2sqr (const float *x, size_t d)
-{
-    return fvec_norm_L2sqr_ref (x, d);
-}
-
-void fvec_L2sqr_ny (float * dis, const float * x,
-                        const float * y, size_t d, size_t ny) {
-    fvec_L2sqr_ny_ref (dis, x, y, d, ny);
-}
-
-
-#endif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-/***************************************************************************
- * heavily optimized table computations
- ***************************************************************************/
-
-
-static inline void fvec_madd_ref (size_t n, const float *a,
-                           float bf, const float *b, float *c) {
-    for (size_t i = 0; i < n; i++)
-        c[i] = a[i] + bf * b[i];
-}
-
-#ifdef __SSE__
-
-static inline void fvec_madd_sse (size_t n, const float *a,
-                                  float bf, const float *b, float *c) {
-    n >>= 2;
-    __m128 bf4 = _mm_set_ps1 (bf);
-    __m128 * a4 = (__m128*)a;
-    __m128 * b4 = (__m128*)b;
-    __m128 * c4 = (__m128*)c;
-
-    while (n--) {
-        *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
-        b4++;
-        a4++;
-        c4++;
-    }
-}
-
-void fvec_madd (size_t n, const float *a,
-                float bf, const float *b, float *c)
-{
-    if ((n & 3) == 0 &&
-        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
-        fvec_madd_sse (n, a, bf, b, c);
-    else
-        fvec_madd_ref (n, a, bf, b, c);
-}
-
-#else
-
-void fvec_madd (size_t n, const float *a,
-                float bf, const float *b, float *c)
-{
-    fvec_madd_ref (n, a, bf, b, c);
-}
-
-#endif
-
-static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
-                                         float bf, const float *b, float *c) {
-    float vmin = 1e20;
-    int imin = -1;
-
-    for (size_t i = 0; i < n; i++) {
-        c[i] = a[i] + bf * b[i];
-        if (c[i] < vmin) {
-            vmin = c[i];
-            imin = i;
-        }
-    }
-    return imin;
-}
-
-#ifdef __SSE__
-
-static inline int fvec_madd_and_argmin_sse (
-        size_t n, const float *a,
-        float bf, const float *b, float *c) {
-    n >>= 2;
-    __m128 bf4 = _mm_set_ps1 (bf);
-    __m128 vmin4 = _mm_set_ps1 (1e20);
-    __m128i imin4 = _mm_set1_epi32 (-1);
-    __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
-    __m128i inc4 = _mm_set1_epi32 (4);
-    __m128 * a4 = (__m128*)a;
-    __m128 * b4 = (__m128*)b;
-    __m128 * c4 = (__m128*)c;
-
-    while (n--) {
-        __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
-        *c4 = vc4;
-        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
-        // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
-
-        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
-                              _mm_andnot_si128 (mask, imin4));
-        vmin4 = _mm_min_ps (vmin4, vc4);
-        b4++;
-        a4++;
-        c4++;
-        idx4 = _mm_add_epi32 (idx4, inc4);
-    }
-
-    // 4 values -> 2
-    {
-        idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
-        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
-        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
-        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
-                              _mm_andnot_si128 (mask, imin4));
-        vmin4 = _mm_min_ps (vmin4, vc4);
-    }
-    // 2 values -> 1
-    {
-        idx4 = _mm_shuffle_epi32 (imin4, 1);
-        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
-        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
-        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
-                              _mm_andnot_si128 (mask, imin4));
-        // vmin4 = _mm_min_ps (vmin4, vc4);
-    }
-    return _mm_cvtsi128_si32 (imin4);
-}
-
-
-int fvec_madd_and_argmin (size_t n, const float *a,
-                          float bf, const float *b, float *c)
-{
-    if ((n & 3) == 0 &&
-        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
-        return fvec_madd_and_argmin_sse (n, a, bf, b, c);
-    else
-        return fvec_madd_and_argmin_ref (n, a, bf, b, c);
-}
-
-#else
-
-int fvec_madd_and_argmin (size_t n, const float *a,
-                          float bf, const float *b, float *c)
-{
-  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
-}
-
-#endif
-
-
-
-
-} // namespace faiss