From 1108973783151aa3b844da05efabedcf56a96127 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Elias=20J=C3=A4=C3=A4saari?= <elias.jaasaari@gmail.com>
Date: Fri, 28 Feb 2025 18:04:16 +0200
Subject: [PATCH] Python formatting, pyproject.toml, and _compute_sparsity fix

---
 .clangd                   |   4 +-
 cpp/mrptmodule.cpp        |  43 ++++++-------
 mrpt.py                   |  96 +++++++++++++++++++++-------
 pyproject.toml            |  29 +++++++++
 requirements.txt          |   1 -
 utils/binary_converter.py | 131 --------------------------------------
 6 files changed, 124 insertions(+), 180 deletions(-)
 create mode 100644 pyproject.toml
 delete mode 100644 requirements.txt
 delete mode 100644 utils/binary_converter.py

diff --git a/.clangd b/.clangd
index 082a321..98e1f64 100644
--- a/.clangd
+++ b/.clangd
@@ -1,3 +1,3 @@
 CompileFlags:
-  # Treat code as C++, use C++17 standard, enable more warnings.
-  Add: [-xc++, -std=c++17, -Wall, -Wno-missing-prototypes]
+  # Treat code as C++, use C++14 standard, enable more warnings.
+  Add: [-xc++, -std=c++14, -Wall, -Wno-missing-prototypes]
diff --git a/cpp/mrptmodule.cpp b/cpp/mrptmodule.cpp
index 8ac5475..5418a40 100644
--- a/cpp/mrptmodule.cpp
+++ b/cpp/mrptmodule.cpp
@@ -1,7 +1,4 @@
-/*
- * This file wraps the C++11 Mrpt code to an extension module compatible with
- * Python 3.
- */
+#define PY_SSIZE_T_CLEAN
 
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -514,25 +511,25 @@ static PyMethodDef MrptMethods[] = {
 };
 
 static PyTypeObject MrptIndexType = {
-    PyVarObject_HEAD_INIT(NULL, 0) "mrpt.MrptIndex", /*tp_name*/
-    sizeof(mrptIndex),                               /*tp_basicsize*/
-    0,                                               /*tp_itemsize*/
-    (destructor)mrpt_dealloc,                        /*tp_dealloc*/
-    0,                                               /*tp_print*/
-    0,                                               /*tp_getattr*/
-    0,                                               /*tp_setattr*/
-    0,                                               /*tp_compare*/
-    0,                                               /*tp_repr*/
-    0,                                               /*tp_as_number*/
-    0,                                               /*tp_as_sequence*/
-    0,                                               /*tp_as_mapping*/
-    0,                                               /*tp_hash */
-    0,                                               /*tp_call*/
-    0,                                               /*tp_str*/
-    0,                                               /*tp_getattro*/
-    0,                                               /*tp_setattro*/
-    0,                                               /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT,                              /*tp_flags*/
+    PyVarObject_HEAD_INIT(NULL, 0) "mrpt.MrptIndex", /* tp_name */
+    sizeof(mrptIndex),                               /* tp_basicsize */
+    0,                                               /* tp_itemsize */
+    (destructor)mrpt_dealloc,                        /* tp_dealloc */
+    0,                                               /* tp_vectorcall_offset (tp_print before Python 3.8) */
+    0,                                               /* tp_getattr */
+    0,                                               /* tp_setattr */
+    0,                                               /* tp_as_async (tp_compare in Python 2) */
+    0,                                               /* tp_repr */
+    0,                                               /* tp_as_number */
+    0,                                               /* tp_as_sequence */
+    0,                                               /* tp_as_mapping */
+    0,                                               /* tp_hash */
+    0,                                               /* tp_call */
+    0,                                               /* tp_str */
+    0,                                               /* tp_getattro */
+    0,                                               /* tp_setattro */
+    0,                                               /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,        /* tp_flags: BASETYPE allows subclassing */
     "Mrpt index object",                             /* tp_doc */
     0,                                               /* tp_traverse */
     0,                                               /* tp_clear */
diff --git a/mrpt.py b/mrpt.py
index 1241d09..c4ce223 100644
--- a/mrpt.py
+++ b/mrpt.py
@@ -8,6 +8,7 @@ class MRPTIndex(object):
     """
     An MRPT index object
     """
+
     def __init__(self, data, shape=None, mmap=False):
         """
         Initializes an MRPT index object.
@@ -21,18 +22,20 @@ def __init__(self, data, shape=None, mmap=False):
                 raise ValueError("The data matrix should be non-empty and two-dimensional")
             if data.dtype != np.float32:
                 raise ValueError("The data matrix should have type float32")
-            if not data.flags['C_CONTIGUOUS'] or not data.flags['ALIGNED']:
+            if not data.flags["C_CONTIGUOUS"] or not data.flags["ALIGNED"]:
                 raise ValueError("The data matrix has to be C_CONTIGUOUS and ALIGNED")
             n_samples, dim = data.shape
         elif isinstance(data, str):
             if not isinstance(shape, tuple) or len(shape) != 2:
-                raise ValueError("You must specify the shape of the data as a tuple (N, dim) "
-                                 "when loading data from a binary file")
+                raise ValueError(
+                    "You must specify the shape of the data as a tuple (N, dim) "
+                    "when loading data from a binary file"
+                )
             n_samples, dim = shape
         elif data is not None:
             raise ValueError("Data must be either an ndarray or a filepath")
 
-        if mmap and os_name == 'nt':
+        if mmap and os_name == "nt":
             raise ValueError("Memory mapping is not available on Windows")
 
         if data is not None:
@@ -43,14 +46,15 @@ def __init__(self, data, shape=None, mmap=False):
         self.autotuned = False
 
     def _compute_sparsity(self, projection_sparsity):
-        if projection_sparsity == 'auto':
-            return 1. / np.sqrt(self.dim)
-        elif projection_sparsity is None:
+        if projection_sparsity == "auto":
+            return 1.0 / np.sqrt(self.dim)
+        if projection_sparsity is None:
             return 1
-        elif not 0 < projection_sparsity <= 1:
+        if not (0 < projection_sparsity <= 1):
             raise ValueError("Sparsity should be in (0, 1]")
+        return projection_sparsity
 
-    def build(self, depth, n_trees, projection_sparsity='auto'):
+    def build(self, depth, n_trees, projection_sparsity="auto"):
         """
         Builds a normal MRPT index.
         :param depth: The depth of the trees; should be in the set {1, 2, ..., floor(log2(n))}.
@@ -65,8 +69,18 @@ def build(self, depth, n_trees, projection_sparsity='auto'):
         self.index.build(n_trees, depth, projection_sparsity)
         self.built = True
 
-    def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_max=-1,
-                       votes_max=-1, projection_sparsity='auto', shape=None):
+    def build_autotune(
+        self,
+        target_recall,
+        Q,
+        k,
+        trees_max=-1,
+        depth_min=-1,
+        depth_max=-1,
+        votes_max=-1,
+        projection_sparsity="auto",
+        shape=None,
+    ):
         """
         Builds an autotuned MRPT index.
         :param target_recall: The target recall level (float) or None if the target recall level
@@ -94,19 +108,23 @@ def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_
                 raise ValueError("The test query matrix should be non-empty and two-dimensional")
             if Q.dtype != np.float32:
                 raise ValueError("The test query matrix should have type float32")
-            if not Q.flags['C_CONTIGUOUS'] or not Q.flags['ALIGNED']:
+            if not Q.flags["C_CONTIGUOUS"] or not Q.flags["ALIGNED"]:
                 raise ValueError("The test query matrix has to be C_CONTIGUOUS and ALIGNED")
             n_test, dim = Q.shape
         elif isinstance(Q, str):
             if not isinstance(shape, tuple) or len(shape) != 2:
-                raise ValueError("You must specify the shape of the data as a tuple (n_test, dim) "
-                                 "when loading the test query matrix from a binary file")
+                raise ValueError(
+                    "You must specify the shape of the data as a tuple (n_test, dim) "
+                    "when loading the test query matrix from a binary file"
+                )
             n_test, dim = shape
         else:
             raise ValueError("The test query matrix must be either an ndarray or a filepath")
 
         if dim != self.dim:
-            raise ValueError("The test query matrix should have the same number of columns as the data matrix")
+            raise ValueError(
+                "The test query matrix should have the same number of columns as the data matrix"
+            )
 
         self.built = target_recall is not None
         self.autotuned = True
@@ -116,10 +134,28 @@ def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_
 
         projection_sparsity = self._compute_sparsity(projection_sparsity)
         self.index.build_autotune(
-                target_recall, Q, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity)
-
-    def build_autotune_sample(self, target_recall, k, n_test=100, trees_max=-1,
-                              depth_min=-1, depth_max=-1, votes_max=-1, projection_sparsity='auto'):
+            target_recall,
+            Q,
+            n_test,
+            k,
+            trees_max,
+            depth_min,
+            depth_max,
+            votes_max,
+            projection_sparsity,
+        )
+
+    def build_autotune_sample(
+        self,
+        target_recall,
+        k,
+        n_test=100,
+        trees_max=-1,
+        depth_min=-1,
+        depth_max=-1,
+        votes_max=-1,
+        projection_sparsity="auto",
+    ):
         """
         Builds an autotuned MRPT index.
         :param target_recall: The target recall level (float) or None if the target recall level
@@ -148,7 +184,15 @@ def build_autotune_sample(self, target_recall, k, n_test=100, trees_max=-1,
 
         projection_sparsity = self._compute_sparsity(projection_sparsity)
         self.index.build_autotune_sample(
-                target_recall, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity)
+            target_recall,
+            n_test,
+            k,
+            trees_max,
+            depth_min,
+            depth_max,
+            votes_max,
+            projection_sparsity,
+        )
 
     def subset(self, target_recall):
         """
@@ -175,9 +219,15 @@ def parameters(self):
         n_trees, depth, votes, k, qtime, recall = self.index.parameters()
 
         if self.index.is_autotuned():
-            return {'n_trees': n_trees, 'depth': depth, 'k': k, 'votes': votes,
-                    'estimated_qtime': qtime, 'estimated_recall': recall}
-        return {'n_trees': n_trees, 'depth': depth}
+            return {
+                "n_trees": n_trees,
+                "depth": depth,
+                "k": k,
+                "votes": votes,
+                "estimated_qtime": qtime,
+                "estimated_recall": recall,
+            }
+        return {"n_trees": n_trees, "depth": depth}
 
     def save(self, path):
         """
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4ef8e59
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,29 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "numpy"]
+build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
+src_paths = ["python"]
+
+[tool.black]  # exclude: bare directory names only; the group is already wrapped in /( ... )/
+line-length = 100
+target-version = ['py312']
+include = '(\.pyi?$)'
+exclude = '''
+
+(
+  /(
+      \.github
+    | \.vscode
+    | \.venv
+    | docs
+    | licenses
+    | src
+  )/
+)
+'''
+
+[tool.ruff]
+line-length = 100
+indent-width = 4
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index e4bc291..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-numpy>=1.10.0
diff --git a/utils/binary_converter.py b/utils/binary_converter.py
deleted file mode 100644
index 0e0641f..0000000
--- a/utils/binary_converter.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import os
-import array
-import struct
-import numpy as np
-
-
-def ndarray_to_binary(data, out, n=-1):
-    """
-    Convert a numpy ndarray to binary format.
-    :param data: the data as an ndarray
-    :param out: path to the output file
-    :param n: write only the first n rows, or -1 for all rows
-    """
-    with open(out, 'wb') as outfile:
-        for i, row in enumerate(data):
-            if i == n: break
-            _write_floats(row.astype(np.float32), outfile)
-
-
-def csv_to_binary(fname, out, delim=',', n=-1, skip_cols=0):
-    """
-    Convert a csv file to binary format.
-    :param fname: path to the csv file
-    :param out: path to the output file
-    :param delim: the delimiter used in the csv file
-    :param n: write only the first n rows, or -1 for all rows
-    :param skip_cols: skip this amount of columns for each row
-    """
-    import csv
-
-    with open(fname, 'rb') as csvfile:
-        datareader = csv.reader(csvfile, delimiter=delim)
-        with open(out, 'wb') as outfile:
-            for i, row in enumerate(datareader):
-                if i == n: break
-                floats = [float(x) for x in row[skip_cols:]]
-                _write_floats(floats, outfile)
-
-
-def mat_to_binary(fname, out, dataset, n=-1):
-    """
-    Convert a MAT-file (or any HDF5 file) to binary format.
-    Note that the resulting binary file can be much larger than the
-    input as the input file could be in compressed format.
-    :param fname: path to the MAT-file
-    :param out: path to the output file
-    :param dataset: the HDF5 dataset to use
-    :param n: write only the first n rows, or -1 for all rows
-    """
-    from tables import open_file # PyTables
-
-    fileh = open_file(fname, "r")
-    data = getattr(fileh.root, dataset)
-
-    with open(out, 'wb') as outfile:
-        for i, row in enumerate(data.iterrows()):
-            if i == n: break
-            _write_floats(row, outfile)
-
-    fileh.close()
-
-
-def rdata_to_binary(fname, out, matrix, n=-1):
-    """
-    Convert a RData file to binary format.
-    :param fname: path to the RData file
-    :param out: path to the output file
-    :param matrix: name of the matrix being converted in the RData
-    :param n: write only the first n rows, or -1 for all rows
-    """
-    import rpy2.robjects as robjects
-    import rpy2.robjects.numpy2ri
-
-    robjects.r['load'](fname)
-    data = robjects.numpy2ri.ri2py(robjects.r[matrix][0])
-
-    with open(out, 'wb') as outfile:
-        for i, row in enumerate(data):
-            if i == n: break
-            _write_floats(row.astype(np.float32), outfile)
-
-
-def fvecs_to_binary(fname, out, n=-1):
-    """
-    Convert a fvecs file to binary format.
-    The fvecs format is used in e.g. http://corpus-texmex.irisa.fr/
-    :param fname: path to the fvecs file
-    :param out: path to the output file
-    :param matrix: name of the matrix being converted in the RData
-    :param n: write only the first n rows, or -1 for all rows
-    """
-    sz = os.path.getsize(fname)
-
-    with open(fname, 'rb') as inp:
-        dim = struct.unpack('i', inp.read(4))[0]
-
-    with open(fname, 'rb') as inp:
-        rows = sz / (dim + 1) / 4
-        with open(out, 'wb') as outfile:
-            for i in xrange(rows):
-                if i == n: break
-                tmp = struct.unpack('<i', inp.read(4))[0]
-                vec = array.array('f')
-                vec.read(inp, dim)
-                _write_floats(vec, outfile)
-
-
-def stdin_to_binary(out, delimiter=',', n=-1):
-    """
-    Write input from stdin to binary format.
-    :param out: path to the output file
-    :param delimiter: the delimiter used to split the values
-    :param n: write only the first n rows, or -1 for all rows
-    """
-    import sys
-
-    with open(out, 'wb') as outfile:
-        for i, row in enumerate(sys.stdin):
-            if i == n: break
-            sample = [float(x) for x in row.strip().split(delimiter)]
-            _write_floats(sample, outfile)
-
-
-def _write_floats(floats, outfile):
-    float_arr = array.array('d', floats)
-    s = struct.pack('f' * len(float_arr), *float_arr)
-    outfile.write(s)
-
-
-if __name__ == '__main__':
-    fvecs_to_binary('sift_base.fvecs', 'sift_base.fvecs.bin')