Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce options for reducing the overhead for a clustering procedure #3731

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions faiss/Clustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <faiss/VectorTransform.h>
#include <faiss/impl/AuxIndexStructures.h>

#include <chrono>
#include <cinttypes>
#include <cmath>
#include <cstdio>
Expand Down Expand Up @@ -74,6 +75,14 @@ void Clustering::train(

namespace {

// Translate the user-supplied seed into the value handed to the rng.
// A non-negative seed is passed through unchanged. A negative seed is
// replaced by the current tick count of
// std::chrono::high_resolution_clock, so the result is time-dependent.
uint64_t get_actual_rng_seed(const int seed) {
    if (seed < 0) {
        const auto since_epoch =
                std::chrono::high_resolution_clock::now().time_since_epoch();
        return static_cast<uint64_t>(since_epoch.count());
    }
    return static_cast<uint64_t>(seed);
}

idx_t subsample_training_set(
const Clustering& clus,
idx_t nx,
Expand All @@ -87,11 +96,30 @@ idx_t subsample_training_set(
clus.k * clus.max_points_per_centroid,
nx);
}
std::vector<int> perm(nx);
rand_perm(perm.data(), nx, clus.seed);

const uint64_t actual_seed = get_actual_rng_seed(clus.seed);

std::vector<int> perm;
if (clus.use_faster_subsampling) {
// use subsampling with splitmix64 rng
SplitMix64RandomGenerator rng(actual_seed);

const idx_t new_nx = clus.k * clus.max_points_per_centroid;
perm.resize(new_nx);
for (idx_t i = 0; i < new_nx; i++) {
perm[i] = rng.rand_int(nx);
}
} else {
// use subsampling with a default std rng
perm.resize(nx);
rand_perm(perm.data(), nx, actual_seed);
}

nx = clus.k * clus.max_points_per_centroid;
uint8_t* x_new = new uint8_t[nx * line_size];
*x_out = x_new;

// might be worth omp-ing as well
for (idx_t i = 0; i < nx; i++) {
memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size);
}
Expand Down Expand Up @@ -280,7 +308,7 @@ void Clustering::train_encoded(

double t0 = getmillisecs();

if (!codec) {
if (!codec && check_input_data_for_NaNs) {
// Check for NaNs in input data. Normally it is the user's
// responsibility, but it may spare us some hard-to-debug
// reports.
Expand Down Expand Up @@ -383,6 +411,9 @@ void Clustering::train_encoded(
}
t0 = getmillisecs();

// initialize seed
const uint64_t actual_seed = get_actual_rng_seed(seed);

// temporary buffer to decode vectors during the optimization
std::vector<float> decode_buffer(codec ? d * decode_block_size : 0);

Expand All @@ -395,7 +426,7 @@ void Clustering::train_encoded(
centroids.resize(d * k);
std::vector<int> perm(nx);

rand_perm(perm.data(), nx, seed + 1 + redo * 15486557L);
rand_perm(perm.data(), nx, actual_seed + 1 + redo * 15486557L);

if (!codec) {
for (int i = n_input_centroids; i < k; i++) {
Expand Down
11 changes: 10 additions & 1 deletion faiss/Clustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,20 @@ struct ClusteringParameters {
int min_points_per_centroid = 39;
/// to limit size of dataset, otherwise the training set is subsampled
int max_points_per_centroid = 256;
/// seed for the random number generator
/// seed for the random number generator.
/// Negative values seed the internal rng from
/// std::chrono::high_resolution_clock.
int seed = 1234;

/// when the training set is encoded, batch size of the codec decoder
size_t decode_block_size = 32768;

/// whether to check the input data for NaNs
bool check_input_data_for_NaNs = true;

/// Whether to use splitmix64-based random number generator for subsampling,
/// which is faster, but may pick duplicate points.
bool use_faster_subsampling = false;
};

struct ClusteringIterationStats {
Expand Down
43 changes: 43 additions & 0 deletions faiss/utils/random.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,37 @@ double RandomGenerator::rand_double() {
return mt() / double(mt.max());
}

// Initialize the generator state from the seed, reinterpreted as unsigned.
SplitMix64RandomGenerator::SplitMix64RandomGenerator(int64_t seed)
        : state(static_cast<uint64_t>(seed)) {}

// Return the low 31 bits of the next generator output, i.e. a
// non-negative int.
int SplitMix64RandomGenerator::rand_int() {
    const uint64_t low_31_bits = next() & 0x7fffffffULL;
    return static_cast<int>(low_31_bits);
}

// Return the next generator output with its top bit cleared, so the
// result is a non-negative int64_t.
int64_t SplitMix64RandomGenerator::rand_int64() {
    const uint64_t sign_bit_cleared = 0x7fffffffffffffffULL;
    return static_cast<int64_t>(next() & sign_bit_cleared);
}

// Return the next generator output reduced modulo max.
// NOTE(review): max is assumed to be strictly positive; max == 0 would
// divide by zero -- confirm that callers guarantee this.
int SplitMix64RandomGenerator::rand_int(int max) {
    const uint64_t bound = static_cast<uint64_t>(max);
    const uint64_t reduced = next() % bound;
    return static_cast<int>(reduced);
}

// Scale the next 64-bit output into a float between 0 and 1 by dividing
// it by the largest uint64_t value (both converted to float).
float SplitMix64RandomGenerator::rand_float() {
    const float largest =
            static_cast<float>(std::numeric_limits<uint64_t>::max());
    const float raw = static_cast<float>(next());
    return raw / largest;
}

// Scale the next 64-bit output into a double between 0 and 1 by dividing
// it by the largest uint64_t value (both converted to double).
double SplitMix64RandomGenerator::rand_double() {
    const double largest =
            static_cast<double>(std::numeric_limits<uint64_t>::max());
    const double raw = static_cast<double>(next());
    return raw / largest;
}

// Advance the state by a fixed odd increment and return a mixed copy of
// the new state (two xor-shift/multiply rounds and a final xor-shift).
uint64_t SplitMix64RandomGenerator::next() {
    state += 0x9e3779b97f4a7c15ULL;

    uint64_t mixed = state;
    mixed ^= mixed >> 30;
    mixed *= 0xbf58476d1ce4e5b9ULL;
    mixed ^= mixed >> 27;
    mixed *= 0x94d049bb133111ebULL;
    mixed ^= mixed >> 31;
    return mixed;
}

/***********************************************************************
* Random functions in this C file only exist because Torch
* counterparts are slow and not multi-threaded. Typical use is for
Expand Down Expand Up @@ -162,6 +193,18 @@ void rand_perm(int* perm, size_t n, int64_t seed) {
}
}

// Fill perm[0..n) with a random permutation of 0..n-1, drawing the
// randomness from a SplitMix64RandomGenerator seeded with `seed`.
void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) {
    // start from the identity permutation
    size_t idx = 0;
    while (idx < n) {
        perm[idx] = idx;
        idx++;
    }

    SplitMix64RandomGenerator rng(seed);

    // for each position but the last, swap in an element picked from
    // the not-yet-fixed tail (the position itself included)
    for (size_t pos = 0; pos + 1 < n; pos++) {
        const size_t remaining = n - pos;
        const int pick = pos + rng.rand_int(remaining);
        const int held = perm[pos];
        perm[pos] = perm[pick];
        perm[pick] = held;
    }
}

void byte_rand(uint8_t* x, size_t n, int64_t seed) {
// only try to parallelize on large enough arrays
const size_t nblock = n < 1024 ? 1 : 1024;
Expand Down
25 changes: 25 additions & 0 deletions faiss/utils/random.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,30 @@ struct RandomGenerator {
explicit RandomGenerator(int64_t seed = 1234);
};

/// SplitMix64-based random generator, see https://prng.di.unimi.it/.
/// It is fast, but cannot be used in multithreaded contexts.
struct SplitMix64RandomGenerator {
    /// current generator state
    uint64_t state;

    explicit SplitMix64RandomGenerator(int64_t seed = 1234);

    /// raw 64-bit output of the generator
    uint64_t next();

    /// random positive integer
    int rand_int();

    /// generate random integer between 0 and max-1
    int rand_int(int max);

    /// random int64_t
    int64_t rand_int64();

    /// between 0 and 1
    float rand_float();

    /// between 0 and 1, as a double
    double rand_double();
};

/* Generate an array of uniform random floats / multi-threaded implementation */
void float_rand(float* x, size_t n, int64_t seed);
void float_randn(float* x, size_t n, int64_t seed);
Expand All @@ -53,6 +77,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);

/* random permutation */
void rand_perm(int* perm, size_t n, int64_t seed);
void rand_perm_splitmix64(int* perm, size_t n, int64_t seed);

/* Random set of vectors with intrinsic dimensionality 10 that is harder to
* index than a subspace of dim 10 but easier than uniform data in dimension d
Expand Down
Loading