<?php

declare(strict_types=1);

namespace Phpml\Tokenization;

use Phpml\Exception\InvalidArgumentException;

/**
 * Tokenizer that splits a text into character n-grams.
 *
 * The text is first cut into words (runs of at least two `\w` characters, matched in
 * Unicode mode); every word is then expanded into all of its substrings whose length
 * lies between minGram and maxGram characters, inclusive.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Shortest n-gram length (in characters) that is emitted.
     *
     * @var int
     */
    private $minGram;

    /**
     * Longest n-gram length (in characters) that is emitted.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram Minimum n-gram length; must be >= 1 and <= $maxGram
     * @param int $maxGram Maximum n-gram length; must be >= 1
     *
     * @throws InvalidArgumentException When either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Returns the n-grams of every word in $text.
     *
     * Words are processed in the order they appear. Within one word, all n-grams of the
     * shortest size come first (left to right), followed by the next size, and so on.
     * Words made of a single character are not matched by the pattern and yield nothing.
     *
     * @return string[]
     */
    public function tokenize(string $text): array
    {
        $words = [];
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        // preg_match_all() matches nothing when it fails (e.g. malformed UTF-8 under the
        // /u modifier); fall back to an empty list should the full-match group be absent.
        foreach ($words[0] ?? [] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends to $nGrams every substring of $word whose length is within the configured range.
     *
     * @param string   $word   Word to expand
     * @param string[] $nGrams Accumulator the n-grams are appended to (modified in place)
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        // The words come from a /u (UTF-8) pattern, so measure and slice them as UTF-8
        // regardless of the configured mbstring internal encoding.
        $length = mb_strlen($word, 'UTF-8');

        // No n-gram can be longer than the word itself, and sizes below minGram are never
        // emitted, so iterate only over the sizes that can actually produce output.
        $longest = min($this->maxGram, $length);

        for ($size = $this->minGram; $size <= $longest; $size++) {
            $lastOffset = $length - $size;
            for ($offset = 0; $offset <= $lastOffset; $offset++) {
                $nGrams[] = mb_substr($word, $offset, $size, 'UTF-8');
            }
        }
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body