From 1108973783151aa3b844da05efabedcf56a96127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elias=20J=C3=A4=C3=A4saari?= Date: Fri, 28 Feb 2025 18:04:16 +0200 Subject: [PATCH] Python formatting --- .clangd | 4 +- cpp/mrptmodule.cpp | 43 ++++++------- mrpt.py | 96 +++++++++++++++++++++------- pyproject.toml | 29 +++++++++ requirements.txt | 1 - utils/binary_converter.py | 131 -------------------------------------- 6 files changed, 124 insertions(+), 180 deletions(-) create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 utils/binary_converter.py diff --git a/.clangd b/.clangd index 082a321..98e1f64 100644 --- a/.clangd +++ b/.clangd @@ -1,3 +1,3 @@ CompileFlags: - # Treat code as C++, use C++17 standard, enable more warnings. - Add: [-xc++, -std=c++17, -Wall, -Wno-missing-prototypes] + # Treat code as C++, use C++14 standard, enable more warnings. + Add: [-xc++, -std=c++14, -Wall, -Wno-missing-prototypes] diff --git a/cpp/mrptmodule.cpp b/cpp/mrptmodule.cpp index 8ac5475..5418a40 100644 --- a/cpp/mrptmodule.cpp +++ b/cpp/mrptmodule.cpp @@ -1,7 +1,4 @@ -/* - * This file wraps the C++11 Mrpt code to an extension module compatible with - * Python 3. - */ +#define PY_SSIZE_T_CLEAN #include #include @@ -514,25 +511,25 @@ static PyMethodDef MrptMethods[] = { }; static PyTypeObject MrptIndexType = { - PyVarObject_HEAD_INIT(NULL, 0) "mrpt.MrptIndex", /*tp_name*/ - sizeof(mrptIndex), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - (destructor)mrpt_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ + PyVarObject_HEAD_INIT(NULL, 0) "mrpt.MrptIndex", /* tp_name*/ + sizeof(mrptIndex), /* tp_basicsize*/ + 0, /* tp_itemsize*/ + (destructor)mrpt_dealloc, /* tp_dealloc*/ + 0, /* tp_print*/ + 0, /* tp_getattr*/ + 0, /* tp_setattr*/ + 0, /* tp_compare*/ + 0, /* tp_repr*/ + 0, /* tp_as_number*/ + 0, /* tp_as_sequence*/ + 0, /* tp_as_mapping*/ + 0, /* tp_hash */ + 0, /* tp_call*/ + 0, /* tp_str*/ + 0, /* tp_getattro*/ + 0, /* tp_setattro*/ + 0, /* tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ "Mrpt index object", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ diff --git a/mrpt.py b/mrpt.py index 1241d09..c4ce223 100644 --- a/mrpt.py +++ b/mrpt.py @@ -8,6 +8,7 @@ class MRPTIndex(object): """ An MRPT index object """ + def __init__(self, data, shape=None, mmap=False): """ Initializes an MRPT index object. @@ -21,18 +22,20 @@ def __init__(self, data, shape=None, mmap=False): raise ValueError("The data matrix should be non-empty and two-dimensional") if data.dtype != np.float32: raise ValueError("The data matrix should have type float32") - if not data.flags['C_CONTIGUOUS'] or not data.flags['ALIGNED']: + if not data.flags["C_CONTIGUOUS"] or not data.flags["ALIGNED"]: raise ValueError("The data matrix has to be C_CONTIGUOUS and ALIGNED") n_samples, dim = data.shape elif isinstance(data, str): if not isinstance(shape, tuple) or len(shape) != 2: - raise ValueError("You must specify the shape of the data as a tuple (N, dim) " - "when loading data from a binary file") + raise ValueError( + "You must specify the shape of the data as a tuple (N, dim) " + "when loading data from a binary file" + ) n_samples, dim = shape elif data is not None: raise ValueError("Data must be either an ndarray or a filepath") - if mmap and os_name == 'nt': + if mmap and os_name == "nt": raise ValueError("Memory mapping is not available on Windows") if data is not None: @@ -43,14 +46,15 @@ def __init__(self, data, shape=None, mmap=False): self.autotuned = False def _compute_sparsity(self, projection_sparsity): - if projection_sparsity == 'auto': - return 1. / np.sqrt(self.dim) - elif projection_sparsity is None: + if projection_sparsity == "auto": + return 1.0 / np.sqrt(self.dim) + if projection_sparsity is None: return 1 - elif not 0 < projection_sparsity <= 1: + if not (0 < projection_sparsity <= 1): raise ValueError("Sparsity should be in (0, 1]") + return projection_sparsity - def build(self, depth, n_trees, projection_sparsity='auto'): + def build(self, depth, n_trees, projection_sparsity="auto"): """ Builds a normal MRPT index. :param depth: The depth of the trees; should be in the set {1, 2, ..., floor(log2(n))}. @@ -65,8 +69,18 @@ def build(self, depth, n_trees, projection_sparsity='auto'): self.index.build(n_trees, depth, projection_sparsity) self.built = True - def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_max=-1, - votes_max=-1, projection_sparsity='auto', shape=None): + def build_autotune( + self, + target_recall, + Q, + k, + trees_max=-1, + depth_min=-1, + depth_max=-1, + votes_max=-1, + projection_sparsity="auto", + shape=None, + ): """ Builds an autotuned MRPT index. :param target_recall: The target recall level (float) or None if the target recall level @@ -94,19 +108,23 @@ def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_ raise ValueError("The test query matrix should be non-empty and two-dimensional") if Q.dtype != np.float32: raise ValueError("The test query matrix should have type float32") - if not Q.flags['C_CONTIGUOUS'] or not Q.flags['ALIGNED']: + if not Q.flags["C_CONTIGUOUS"] or not Q.flags["ALIGNED"]: raise ValueError("The test query matrix has to be C_CONTIGUOUS and ALIGNED") n_test, dim = Q.shape elif isinstance(Q, str): if not isinstance(shape, tuple) or len(shape) != 2: - raise ValueError("You must specify the shape of the data as a tuple (n_test, dim) " - "when loading the test query matrix from a binary file") + raise ValueError( + "You must specify the shape of the data as a tuple (n_test, dim) " + "when loading the test query matrix from a binary file" + ) n_test, dim = shape else: raise ValueError("The test query matrix must be either an ndarray or a filepath") if dim != self.dim: - raise ValueError("The test query matrix should have the same number of columns as the data matrix") + raise ValueError( + "The test query matrix should have the same number of columns as the data matrix" + ) self.built = target_recall is not None self.autotuned = True @@ -116,10 +134,28 @@ def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_ projection_sparsity = self._compute_sparsity(projection_sparsity) self.index.build_autotune( - target_recall, Q, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity) - - def build_autotune_sample(self, target_recall, k, n_test=100, trees_max=-1, - depth_min=-1, depth_max=-1, votes_max=-1, projection_sparsity='auto'): + target_recall, + Q, + n_test, + k, + trees_max, + depth_min, + depth_max, + votes_max, + projection_sparsity, + ) + + def build_autotune_sample( + self, + target_recall, + k, + n_test=100, + trees_max=-1, + depth_min=-1, + depth_max=-1, + votes_max=-1, + projection_sparsity="auto", + ): """ Builds an autotuned MRPT index. :param target_recall: The target recall level (float) or None if the target recall level @@ -148,7 +184,15 @@ def build_autotune_sample(self, target_recall, k, n_test=100, trees_max=-1, projection_sparsity = self._compute_sparsity(projection_sparsity) self.index.build_autotune_sample( - target_recall, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity) + target_recall, + n_test, + k, + trees_max, + depth_min, + depth_max, + votes_max, + projection_sparsity, + ) def subset(self, target_recall): """ @@ -175,9 +219,15 @@ def parameters(self): n_trees, depth, votes, k, qtime, recall = self.index.parameters() if self.index.is_autotuned(): - return {'n_trees': n_trees, 'depth': depth, 'k': k, 'votes': votes, - 'estimated_qtime': qtime, 'estimated_recall': recall} - return {'n_trees': n_trees, 'depth': depth} + return { + "n_trees": n_trees, + "depth": depth, + "k": k, + "votes": votes, + "estimated_qtime": qtime, + "estimated_recall": recall, + } + return {"n_trees": n_trees, "depth": depth} def save(self, path): """ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4ef8e59 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,29 @@ +[build-system] +requires = ["setuptools>=42", "wheel", "numpy"] +build-backend = "setuptools.build_meta" + +[tool.isort] +profile = "black" +src_paths = ["python"] + +[tool.black] +line-length = 100 +target-version = ['py312'] +include = '(\.pyi?$)' +exclude = ''' + +( + /( + \.github + | \.vscode + | \.venv + | docs\/ + | licenses\/ + | src\/ + )/ +) +''' + +[tool.ruff] +line-length = 100 +indent-width = 4 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e4bc291..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -numpy>=1.10.0 diff --git a/utils/binary_converter.py b/utils/binary_converter.py deleted file mode 100644 index 0e0641f..0000000 --- a/utils/binary_converter.py +++ /dev/null @@ -1,131 +0,0 @@ -import os -import array -import struct -import numpy as np - - -def ndarray_to_binary(data, out, n=-1): - """ - Convert a numpy ndarray to binary format. - :param data: the data as an ndarray - :param out: path to the output file - :param n: write only the first n rows, or -1 for all rows - """ - with open(out, 'wb') as outfile: - for i, row in enumerate(data): - if i == n: break - _write_floats(row.astype(np.float32), outfile) - - -def csv_to_binary(fname, out, delim=',', n=-1, skip_cols=0): - """ - Convert a csv file to binary format. - :param fname: path to the csv file - :param out: path to the output file - :param delim: the delimiter used in the csv file - :param n: write only the first n rows, or -1 for all rows - :param skip_cols: skip this amount of columns for each row - """ - import csv - - with open(fname, 'rb') as csvfile: - datareader = csv.reader(csvfile, delimiter=delim) - with open(out, 'wb') as outfile: - for i, row in enumerate(datareader): - if i == n: break - floats = [float(x) for x in row[skip_cols:]] - _write_floats(floats, outfile) - - -def mat_to_binary(fname, out, dataset, n=-1): - """ - Convert a MAT-file (or any HDF5 file) to binary format. - Note that the resulting binary file can be much larger than the - input as the input file could be in compressed format. - :param fname: path to the MAT-file - :param out: path to the output file - :param dataset: the HDF5 dataset to use - :param n: write only the first n rows, or -1 for all rows - """ - from tables import open_file # PyTables - - fileh = open_file(fname, "r") - data = getattr(fileh.root, dataset) - - with open(out, 'wb') as outfile: - for i, row in enumerate(data.iterrows()): - if i == n: break - _write_floats(row, outfile) - - fileh.close() - - -def rdata_to_binary(fname, out, matrix, n=-1): - """ - Convert a RData file to binary format. - :param fname: path to the RData file - :param out: path to the output file - :param matrix: name of the matrix being converted in the RData - :param n: write only the first n rows, or -1 for all rows - """ - import rpy2.robjects as robjects - import rpy2.robjects.numpy2ri - - robjects.r['load'](fname) - data = robjects.numpy2ri.ri2py(robjects.r[matrix][0]) - - with open(out, 'wb') as outfile: - for i, row in enumerate(data): - if i == n: break - _write_floats(row.astype(np.float32), outfile) - - -def fvecs_to_binary(fname, out, n=-1): - """ - Convert a fvecs file to binary format. - The fvecs format is used in e.g. http://corpus-texmex.irisa.fr/ - :param fname: path to the fvecs file - :param out: path to the output file - :param matrix: name of the matrix being converted in the RData - :param n: write only the first n rows, or -1 for all rows - """ - sz = os.path.getsize(fname) - - with open(fname, 'rb') as inp: - dim = struct.unpack('i', inp.read(4))[0] - - with open(fname, 'rb') as inp: - rows = sz / (dim + 1) / 4 - with open(out, 'wb') as outfile: - for i in xrange(rows): - if i == n: break - tmp = struct.unpack('