Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional batch_size to BatchAnalyzer.analyze_iterator #883

Closed
The author wants to merge 11 commits into the base branch from a feature branch.
8 changes: 4 additions & 4 deletions presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def analyze_iterator(
texts = self._validate_types(texts)

# Process the texts as batch for improved performance
nlp_artifacts_batch: Iterator[Tuple[str, NlpArtifacts]] = (
self.analyzer_engine.nlp_engine.process_batch(
texts=texts, language=language, batch_size=batch_size
)
nlp_artifacts_batch: Iterator[
Tuple[str, NlpArtifacts]
] = self.analyzer_engine.nlp_engine.process_batch(
texts=texts, language=language, batch_size=batch_size
)

list_results = []
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod

from typing import Iterable, Iterator, List, Optional, Tuple

from presidio_analyzer.nlp_engine import NlpArtifacts
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import logging

from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple, Union

import spacy
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine
from spacy.language import Language
from spacy.tokens import Doc, Span

Expand Down
2 changes: 1 addition & 1 deletion presidio-analyzer/tests/test_batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_analyze_iterator_returns_list_of_recognizer_results(
texts, expected_output, batch_analyzer_engine_simple
):

results = batch_analyzer_engine_simple.analyze_iterator(texts=texts, language="en")
results = batch_analyzer_engine_simple.analyze_iterator(texts=texts, language="en", batch_size=2)
omri374 marked this conversation as resolved.
Show resolved Hide resolved

assert len(results) == len(expected_output)
for result, expected_result in zip(results, expected_output):
Expand Down
1 change: 1 addition & 0 deletions presidio-analyzer/tests/test_spacy_nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def test_simple_process_text(spacy_nlp_engine):
assert nlp_artifacts.lemmas[1] == "text"



def test_process_batch_strings(spacy_nlp_engine):
nlp_artifacts_batch = spacy_nlp_engine.process_batch(
["simple text", "simple text"], language="en"
Expand Down
Loading