This repository contains the N-Gram Tools for the 🙃 Phony Language, including features such as sanitizing, tokenization, n-gram extraction, and frequency mapping.
Requires PHP >= 8.0
.
You can install the package via Composer:
composer require phonyland/ngram
$tokenizer->tokenize($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;
$tokenizer = new Tokenizer();
$tokenizer
->addWordSeparatorPattern(';')
->addWordSeparatorPattern('\s')
->addWordFilterRule(TokenizerFilter::NO_SYMBOLS);
$text = 'sample text;sample;text';
$tokenizer->tokenize($text);
🖥 Output
[
"sample",
"text",
"sample",
"text",
];
$tokenizer->sentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?');
$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';
$tokenizer->sentences($text);
🖥 Output
[
"Sample Sentence.",
"Sample Sentence!",
"Sample Sentence?",
"Sample Sentence no.",
"4?!",
"Sample sample sentence...",
"End",
];
$tokenizer->tokenizeBySentences($text);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?')
->addWordFilterRule(TokenizerFilter::NO_SYMBOLS)
->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';
$tokenizer->tokenizeBySentences($text);
🖥 Output
[
["Sample", "Sentence"],
["Sample", "Sentence"],
["Sample", "Sentence"],
["Sample", "Sentence", "no"],
["Sample", "sample", "sentence"],
["End"],
];
NGramSequence::multigram($n, $tokens, $isUnique);
NGramSequence::trigram($tokens, $isUnique);
NGramSequence::bigram($tokens, $isUnique);
NGramSequence::unigram($tokens, $isUnique);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramSequence;
use Phonyland\NGram\TokenizerFilter;
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');
NGramSequence::multigram(4, $tokens);
// ['samp', 'ampl', 'mple', 'text'];
// Generate Unique N-Grams
NGramSequence::unigram($tokens, true);
// ['s', 'a', 'm', 'p', 'l', 'e', 't', 'x'];
NGramCount::multigram($n, $tokens);
NGramCount::trigram($tokens);
NGramCount::bigram($tokens);
NGramCount::unigram($tokens);
NGramCount::incrementElementCount($element, $elements);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramCount;
use Phonyland\NGram\TokenizerFilter;
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');
NGramCount::multigram(4, $tokens);
// [
// 'samp' => 1,
// 'ampl' => 1,
// 'mple' => 1,
// 'text' => 1,
// ];
NGramFrequency::multigram($n, $tokens);
NGramFrequency::trigram($tokens);
NGramFrequency::bigram($tokens);
NGramFrequency::unigram($tokens);
NGramFrequency::frequencyFromCount($countArray);
⌨️ Usage
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramFrequency;
use Phonyland\NGram\TokenizerFilter;
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokenizer->addWordFilterRule(TokenizerFilter::ALPHABETICAL);
$tokens = $tokenizer->tokenize('bombadil! bombadillo!');
NGramFrequency::multigram(4, $tokens);
//[
// 'bomb' => 0.16666666666666666,
// 'omba' => 0.16666666666666666,
// 'mbad' => 0.16666666666666666,
// 'badi' => 0.16666666666666666,
// 'adil' => 0.16666666666666666,
// 'dill' => 0.08333333333333333,
// 'illo' => 0.08333333333333333,
//]
To start generating fake data with the 🙃 Phony Framework,
visit the main Phony Repository.
Explore the docs » https://phony.land
Follow us on Twitter » @phony_land
🙃 Phony
Fake Data Generation Framework
was created by
Yunus Emre Deligöz
under
MIT license.