Commit 27b919f

chore: adds crate-ci/typos to check repository's spelling (lancedb#3022)
This PR introduces the spelling-check workflow from [typos](https://github.com/crate-ci/typos) to ensure we have correct spelling in our repository. To exempt specific words and files from the check, add them to `lance_repo/.typos.toml` like this:

```
[default.extend-words]
DNE = "DNE"
arange = "arange"
nd = "nd"
terrestial = "terrestial"
abd = "abd"
afe = "afe"

[files]
extend-exclude = ["notebooks/*.ipynb"]
```
1 parent 7413344 commit 27b919f


81 files changed: +179, -156 lines

.github/workflows/typos.yml (+13)

@@ -0,0 +1,13 @@
+name: Typo checker
+on: [pull_request]
+
+jobs:
+  run:
+    name: Spell Check with Typos
+    runs-on: "ubuntu-24.04"
+    steps:
+      - name: Checkout Actions Repository
+        uses: actions/checkout@v4
+
+      - name: Check spelling of the entire repository
+        uses: crate-ci/typos@v1.26.0

.typos.toml (+10)

@@ -0,0 +1,10 @@
+[default.extend-words]
+DNE = "DNE"
+arange = "arange"
+nd = "nd"
+terrestial = "terrestial"
+abd = "abd"
+afe = "afe"
+
+[files]
+extend-exclude = ["notebooks/*.ipynb"]

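The workflow above runs the same check that can be reproduced locally before pushing. A minimal sketch, assuming the CLI is installed from crates.io as `typos-cli` and invoked from the repository root (where `.typos.toml` is picked up automatically):

```sh
# Install the checker (assumes a Rust toolchain is available)
cargo install typos-cli

# Scan the repository; extend-words and extend-exclude from .typos.toml are applied
typos

# Optionally let the tool rewrite the flagged files in place
typos --write-changes
```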
benchmarks/flat/benchmark.py (+3 -3)

@@ -30,17 +30,17 @@ def benchmark(
     dim: int,
     metric: str,
 ):
-    querys = [np.random.random((dim,)).reshape(-1) for _ in range(32)]
+    queries = [np.random.random((dim,)).reshape(-1) for _ in range(32)]
     # warmup
-    for query in querys:
+    for query in queries:
         ds.to_table(
             nearest={"column": "vector", "k": 10, "q": query, "use_index": False}
         )

     latency = []

     for _ in range(10):
-        for query in querys:
+        for query in queries:
             start = time.perf_counter()
             ds.to_table(
                 nearest={

benchmarks/full_report/report.ipynb (+1 -1)

@@ -2435,7 +2435,7 @@
    }
   ],
   "source": [
-   "# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normlized L2\n",
+   "# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normalized L2\n",
    "data = _get_nyt_vectors()\n",
    "data = data[np.linalg.norm(data, axis=1) != 0]\n",
    "data = np.unique(data, axis=0)\n",

benchmarks/sift/README.md (+1 -1)

@@ -5,7 +5,7 @@ Dataset URI: http://corpus-texmex.irisa.fr/
 The SIFT/GIST-1M benchmarks make use of the [LanceDB](https://github.com/lancedb/lancedb) API to index, manage and query the datasets. Ensure the dependencies are installed. LanceDB is built on top of Lance and stores everything as Lance datasets.

 ```sh
-# Pin the lancedb version to the latest one availale on your own benchmark
+# Pin the lancedb version to the latest one available on your own benchmark
 pip lancedb==0.3.6
 pip install pandas~=2.1.0
 pip duckdb~=0.9.0

benchmarks/sift/gt.py (+1 -1)

@@ -34,7 +34,7 @@ def generate_gt(args):
     col = args.col or infer_vector_column(ds)
     if col is None:
         raise ValueError(
-            "Can not infer vector column, please specifiy the column explicitly"
+            "Can not infer vector column, please specify the column explicitly"
         )

     samples = ds.sample(args.samples, columns=[col])[col]

benchmarks/sift/perf.py (+3 -3)

@@ -77,9 +77,9 @@ def summary(self):
         series = []
         for k, v in self._configs.items():
             timer = self._timers[k]
-            config_ser = pd.Series(v)
-            time_ser = timer.summary()
-            series.append(pd.concat([config_ser, time_ser]))
+            config_series = pd.Series(v)
+            time_series = timer.summary()
+            series.append(pd.concat([config_series, time_series]))
         return pd.DataFrame(series)


docs/examples/llm_dataset_creation.rst (+3 -3)

@@ -3,7 +3,7 @@ Creating text dataset for LLM training using Lance

 Lance can be used for creating and caching a text (or code) dataset for pre-training / fine-tuning of Large Language Models.
 The need for this arises when one needs to train a model on a subset of data or process the data in chunks without downloading
-all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terrabyte or Petabyte-scale dataset.
+all of it on the disk at once. This becomes a considerable problem when you just want a subset of a Terabyte or Petabyte-scale dataset.

 In this example, we will be bypassing this problem by downloading a text dataset in parts, tokenizing it and saving it as a Lance dataset.
 This can be done for as many or as few data samples as you wish with average memory consumption approximately 3-4 GBs!

@@ -41,7 +41,7 @@ Now we will define a function to help us with tokenizing our samples, one-by-one
     def tokenize(sample, field='text'):
         return tokenizer(sample[field])['input_ids']

-This function will recieve a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want
+This function will receive a sample from a huggingface dataset and tokenize the values in the `field` column. This is the main text you want
 to tokenize.

 Creating a Lance dataset

@@ -70,7 +70,7 @@ let's define the main function that takes in the dataset, number of samples and
     )

 This function will be iterating over the huggingface dataset, one sample at a time, tokenizing the sample and yielding a pyarrow `RecordBatch`
-with all the tokens. We will do this untill we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first.
+with all the tokens. We will do this until we have reached the `num_samples` number of samples or the end of the dataset, whichever comes first.

 Please note that by 'sample', we mean one example (row) in the original dataset. What one example exactly means will depend on the dataset itself as it could
 be one line or an entire file of text. In this example, it's varies in length between a line and a paragraph of text.

docs/examples/llm_training.rst (+1 -1)

@@ -9,7 +9,7 @@ In this example, we will be training an LLM using 🤗 transformers on the token

 Imports and Setup
 ~~~~~~~~~~~~~~~~~
-Let's setup our enviornment by doing all the necessary imports and defining a few basic things.
+Let's setup our environment by doing all the necessary imports and defining a few basic things.

 .. code-block:: python

docs/format.rst (+1 -1)

@@ -108,7 +108,7 @@ The following values are supported:
      - 0.16.0
      - Any
      - Rework of the Lance file format that removed row groups and introduced null
-       support for lists, fixed size lists, and primtives
+       support for lists, fixed size lists, and primitives
    * - 2.1 (unstable)
      - None
      - Any

docs/performance.rst (+2 -2)

@@ -7,7 +7,7 @@ Threading Model
 ---------------

 Lance is designed to be thread-safe and performant. Lance APIs can be called concurrently unless
-explicity stated otherwise. Users may create multiple tables and share tables between threads.
+explicitly stated otherwise. Users may create multiple tables and share tables between threads.
 Operations may run in parallel on the same table, but some operations may lead to conflicts. For
 details see :ref:`conflict_resolution`.

@@ -80,4 +80,4 @@ with 1024 rows per batch is more appropriate.

 In summary, scans could use up to ``(2 * io_buffer_size) + (batch_size * num_compute_threads)`` bytes of memory.
 Keep in mind that ``io_buffer_size`` is a soft limit (e.g. we cannot read less than one page at a time right now)
-and so it is not neccesarily a bug if you see memory usage exceed this limit by a small margin.
+and so it is not necessarily a bug if you see memory usage exceed this limit by a small margin.

java/core/lance-jni/src/blocking_dataset.rs (+2 -2)

@@ -302,8 +302,8 @@ fn attach_native_dataset<'local>(
 }

 fn create_java_dataset_object<'a>(env: &mut JNIEnv<'a>) -> Result<JObject<'a>> {
-    let objet = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?;
-    Ok(objet)
+    let object = env.new_object("com/lancedb/lance/Dataset", "()V", &[])?;
+    Ok(object)
 }

 #[no_mangle]

java/core/lance-jni/src/blocking_scanner.rs (+1 -1)

@@ -121,7 +121,7 @@ fn inner_create_scanner<'local>(

     let mut scanner = dataset_guard.inner.scan();

-    // handle frament_ids
+    // handle fragment_ids
     if let Some(fragment_ids) = fragment_ids_opt {
         let mut fragments = Vec::with_capacity(fragment_ids.len());
         for fragment_id in fragment_ids {

protos/file2.proto (+2 -2)

@@ -49,7 +49,7 @@ import "google/protobuf/empty.proto";
 //
 // If direct I/O is required then most (but not all) fields described
 // below must be sector aligned. We have marked these fields with an
-// asterick for clarity. Readers should assume there will be optional
+// asterisk for clarity. Readers should assume there will be optional
 // padding inserted before these fields.
 //
 // All footer fields are unsigned integers written with little endian

@@ -96,7 +96,7 @@ import "google/protobuf/empty.proto";
 //
 // ## Data Pages
 //
-// A lot of flexiblity is provided in how data is stored. Note that the file
+// A lot of flexibility is provided in how data is stored. Note that the file
 // layout has no explicit notion of a page (however, it is a part of the column
 // metadata). A page's buffers do not strictly need to be contiguous on the
 // disk. However, it is recommended that buffers within a page be grouped

python/python/benchmarks/test_index.py (+1 -1)

@@ -163,7 +163,7 @@ def test_train_ivf(test_large_dataset, benchmark, num_partitions):
     )


-# Pre-computing partition assigment only makes sense on CUDA and so this benchmark runs
+# Pre-computing partition assignment only makes sense on CUDA and so this benchmark runs
 # only on CUDA.
 @pytest.mark.benchmark(group="assign_partitions")
 @pytest.mark.parametrize("num_partitions", [100, 300])

python/python/lance/_arrow/bf16.py (+2 -2)

@@ -105,12 +105,12 @@ def __init__(self):
         pa.ExtensionType.__init__(self, pa.binary(2), "lance.bfloat16")

     def __arrow_ext_serialize__(self):
-        # TODO: encode endianess
+        # TODO: encode endianness
         return b""

     @classmethod
     def __arrow_ext_deserialize__(self, storage_type, serialized):
-        # TODO: decode endianess
+        # TODO: decode endianness
         return BFloat16Type()

     def __arrow_ext_class__(self):

python/python/lance/_dataset/sharded_batch_iterator.py (+1 -1)

@@ -21,7 +21,7 @@
 class ShardedBatchIterator:
     """An iterator of RecordBatches, over the sharded dataset.

-    Parmeters
+    Parameters
     ---------
     uri: str or Path
         Dataset base URI

python/python/lance/dataset.py (+1 -1)

@@ -938,7 +938,7 @@ def add_columns(
             The names of the columns that the UDF will read. If None, then the
             UDF will read all columns. This is only used when transforms is a
             UDF. Otherwise, the read columns are inferred from the SQL expressions.
-        reader_scheam: pa.Schema, optional
+        reader_schema: pa.Schema, optional
             Only valid if transforms is a `ReaderLike` object. This will be used to
             determine the schema of the reader.
         batch_size: int, optional

python/python/lance/ray/sink.py (+2 -2)

@@ -351,7 +351,7 @@ def __call__(self, batch: Union[pa.Table, "pd.DataFrame"]) -> Dict[str, Any]:


 class LanceCommitter(_BaseLanceDatasink):
-    """Lance Commiter as Ray Datasink.
+    """Lance Committer as Ray Datasink.

     This is used with `LanceFragmentWriter` to write large-than-memory data to
     lance file.

@@ -362,7 +362,7 @@ def num_rows_per_write(self) -> int:
         return 1

     def get_name(self) -> str:
-        return f"LanceCommiter({self.mode})"
+        return f"LanceCommitter({self.mode})"

     def write(
         self,

python/python/tests/test_optimize.py (+1 -1)

@@ -130,7 +130,7 @@ def test_compact_with_write(tmp_path: Path):
     # This test creates a dataset with a manifest containing fragments
     # that are not in sorted order (by id)
     #
-    # We do this by runnign compaction concurrently with append
+    # We do this by running compaction concurrently with append
     #
     # This is because compaction first reserves a fragment id. Then the
     # concurrent writes grab later ids and commit them. Then the compaction

python/python/tests/test_ray.py (+1 -1)

@@ -17,7 +17,7 @@
     _register_hooks,
 )

-# Use this hook until we have offical DataSink in Ray.
+# Use this hook until we have official DataSink in Ray.
 _register_hooks()

 ray.init()

python/src/dataset.rs (+1 -1)

@@ -1702,7 +1702,7 @@ fn prepare_vector_index_params(
     };

     if let Some(f) = kwargs.get_item("precomputed_partitions_file")? {
-        ivf_params.precomputed_partitons_file = Some(f.to_string());
+        ivf_params.precomputed_partitions_file = Some(f.to_string());
     };

     if let Some(storage_options) = storage_options {

rust/lance-core/src/datatypes.rs (+2 -2)

@@ -262,7 +262,7 @@ impl TryFrom<&LogicalType> for DataType {
             "dict" => {
                 if splits.len() != 4 {
                     Err(Error::Schema {
-                        message: format!("Unsupport dictionary type: {}", lt),
+                        message: format!("Unsupported dictionary type: {}", lt),
                         location: location!(),
                     })
                 } else {

@@ -274,7 +274,7 @@ impl TryFrom<&LogicalType> for DataType {
             "decimal" => {
                 if splits.len() != 4 {
                     Err(Error::Schema {
-                        message: format!("Unsupport decimal type: {}", lt),
+                        message: format!("Unsupported decimal type: {}", lt),
                         location: location!(),
                     })
                 } else {

rust/lance-core/src/utils/testing.rs (+1 -1)

@@ -57,7 +57,7 @@ pub struct ProxyObjectStorePolicy {
     /// be returned instead.
     before_policies: HashMap<String, PolicyFn>,
     /// Policies which run after calls that return ObjectMeta. The policy can
-    /// tranform the returned ObjectMeta to mock out file listing results.
+    /// transform the returned ObjectMeta to mock out file listing results.
     object_meta_policies: HashMap<String, ObjectMetaPolicyFn>,
 }

rust/lance-datafusion/src/exec.rs (+3 -3)

@@ -35,14 +35,14 @@ use log::{debug, info, warn};
 /// An source execution node created from an existing stream
 ///
 /// It can only be used once, and will return the stream. After that the node
-/// is exhuasted.
+/// is exhausted.
 ///
 /// Note: the stream should be finite, otherwise we will report datafusion properties
 /// incorrectly.
 pub struct OneShotExec {
     stream: Mutex<Option<SendableRecordBatchStream>>,
     // We save off a copy of the schema to speed up formatting and so ExecutionPlan::schema & display_as
-    // can still function after exhuasted
+    // can still function after exhausted
     schema: Arc<ArrowSchema>,
     properties: PlanProperties,
 }

@@ -91,7 +91,7 @@ impl DisplayAs for OneShotExec {
         let stream = self.stream.lock().unwrap();
         match t {
             DisplayFormatType::Default | DisplayFormatType::Verbose => {
-                let exhausted = if stream.is_some() { "" } else { "EXHUASTED " };
+                let exhausted = if stream.is_some() { "" } else { "EXHAUSTED" };
                 let columns = self
                     .schema
                     .field_names()

rust/lance-datafusion/src/logical_expr.rs (+1 -1)

@@ -287,7 +287,7 @@ pub mod tests {

     #[test]
     fn test_resolve_in_expr() {
-        // Type coersion should apply for `A IN (0)` or `A NOT IN (0)`
+        // Type coercion should apply for `A IN (0)` or `A NOT IN (0)`
         let arrow_schema = ArrowSchema::new(vec![Field::new("a", DataType::Float32, false)]);
         let expr = Expr::in_list(
             Expr::Column("a".to_string().into()),

rust/lance-datafusion/src/substrait.rs (+1 -1)

@@ -350,7 +350,7 @@ pub async fn parse_substrait(expr: &[u8], input_schema: Arc<Schema>) -> Result<E

     // When DF parses the above plan it turns column references into qualified references
     // into `dummy` (e.g. we get `WHERE dummy.x < 0` instead of `WHERE x < 0`) We want
-    // these to be unqualified references instead and so we need a quick trasnformation pass
+    // these to be unqualified references instead and so we need a quick transformation pass

     let expr = expr.transform(&|node| match node {
         Expr::Column(column) => {

rust/lance-datagen/src/generator.rs (+8 -8)

@@ -470,9 +470,9 @@ impl ArrayGenerator for CycleVectorGenerator {
 }

 #[derive(Default)]
-pub struct PseduoUuidGenerator {}
+pub struct PseudoUuidGenerator {}

-impl ArrayGenerator for PseduoUuidGenerator {
+impl ArrayGenerator for PseudoUuidGenerator {
     fn generate(
         &mut self,
         length: RowCount,

@@ -497,9 +497,9 @@ impl ArrayGenerator for PseduoUuidGenerator {
 }

 #[derive(Default)]
-pub struct PseduoUuidHexGenerator {}
+pub struct PseudoUuidHexGenerator {}

-impl ArrayGenerator for PseduoUuidHexGenerator {
+impl ArrayGenerator for PseudoUuidHexGenerator {
     fn generate(
         &mut self,
         length: RowCount,

@@ -1581,8 +1581,8 @@ pub mod array {
     /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they
     /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness
     /// for speed.
-    pub fn rand_pseduo_uuid() -> Box<dyn ArrayGenerator> {
-        Box::<PseduoUuidGenerator>::default()
+    pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
+        Box::<PseudoUuidGenerator>::default()
     }

     /// Create a generator of random UUIDs, stored as 32-character strings (hex encoding

@@ -1591,8 +1591,8 @@ pub mod array {
     /// Note, these are "pseudo UUIDs". They are 16-byte randomish values but they
     /// are not guaranteed to be unique. We use a simplistic RNG that trades uniqueness
     /// for speed.
-    pub fn rand_pseduo_uuid_hex() -> Box<dyn ArrayGenerator> {
-        Box::<PseduoUuidHexGenerator>::default()
+    pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
+        Box::<PseudoUuidHexGenerator>::default()
     }

     pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(
