From 98ca5afb7c32aa8f34dc83dd524bd13bebe1b559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yunus=20Emre=20Deligo=CC=88z?= Date: Tue, 2 Jan 2024 05:54:23 +0300 Subject: [PATCH] Refactor Tokenizer class and update method signatures Refactor the Tokenizer class: replace the explicit property declarations and constructor assignments with promoted constructor parameters that default to the previous initial values, and declare the $minWordLength parameter of 'tokenize' and 'tokenizeBySentences' as explicitly nullable (?int) instead of relying on the implicit nullability of a null default. Also remove the Attributes region block and the redundant docblock tags that only restated native types. --- src/Tokenizer.php | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 3251cd7..f475169 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -8,29 +8,22 @@ class Tokenizer { - // region Attributes - - /** @var array */ - protected array $wordFilters; - - /** @var array */ - protected array $wordSeparationPatterns; - - /** @var array */ - public array $sentenceSeparationPatterns; - - protected bool $toLowercase; - - // endregion - // region Public Methods - public function __construct() - { - $this->wordFilters = []; - $this->wordSeparationPatterns = []; - $this->sentenceSeparationPatterns = []; - $this->toLowercase = false; + /** + * Constructor for initializing the object. + * + * @param array<\Phonyland\NGram\TokenizerFilter> $wordFilters Array of word filters. + * @param array $wordSeparationPatterns Array of word separation patterns. + * @param array $sentenceSeparationPatterns Array of sentence separation patterns. + * @param bool $toLowercase Determines if the text will be converted to lowercase. 
+ */ + public function __construct( + protected array $wordFilters = [], + protected array $wordSeparationPatterns = [], + public array $sentenceSeparationPatterns = [], + protected bool $toLowercase = false, + ) { } /** @@ -39,7 +32,7 @@ public function __construct() * * @return array */ - public function tokenize(string $text, int $minWordLength = null): array + public function tokenize(string $text, ?int $minWordLength = null): array { if ($this->wordSeparationPatterns === []) { throw new RuntimeException('No word separation pattern given!'); @@ -91,7 +84,7 @@ public function sentences(string $text): array * * @return array> */ - public function tokenizeBySentences(string $text, int $minWordLength = null): array + public function tokenizeBySentences(string $text, ?int $minWordLength = null): array { $sentences = $this->sentences($text); @@ -172,10 +165,6 @@ public function addWordFilterRule(string|TokenizerFilterType $searchRegex, strin /** * Adds a separator pattern for the splitting the given text. - * - * - * @param string|\Phonyland\NGram\TokenizerFilterType $wordSeparationPattern - * @return \Phonyland\NGram\Tokenizer */ public function addWordSeparatorPattern(string|TokenizerFilterType $wordSeparationPattern): self { @@ -192,7 +181,6 @@ public function addWordSeparatorPattern(string|TokenizerFilterType $wordSeparati * Adds a separator pattern for the splitting into sentences. * * @param string|array $sentenceSeparationPattern - * @return \Phonyland\NGram\Tokenizer */ public function addSentenceSeparatorPattern(string|array $sentenceSeparationPattern): self { @@ -210,9 +198,6 @@ public function addSentenceSeparatorPattern(string|array $sentenceSeparationPatt /** * Converts all tokens to lowercase. - * - * - * @return \Phonyland\NGram\Tokenizer */ public function toLowercase(bool $toLowercase = true): self {