<?php

declare(strict_types=1);

namespace Phpml\Tokenization;

use Phpml\Exception\InvalidArgumentException;

/**
 * Tokenizer that emits word-level n-grams.
 *
 * The text is first split into words (runs of at least two word characters),
 * then every sequence of N consecutive words is emitted as one space-separated
 * token, for each N between the configured minimum and maximum (inclusive).
 */
class NGramWordTokenizer extends WordTokenizer
{
    /**
     * Smallest number of words per n-gram (always >= 1).
     *
     * @var int
     */
    private $minGram;

    /**
     * Largest number of words per n-gram (always >= $minGram).
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram smallest n-gram size to produce, must be >= 1
     * @param int $maxGram largest n-gram size to produce, must be >= $minGram
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Splits the text into words and returns all word n-grams of the configured sizes.
     *
     * Words are matched with `/\w\w+/u`, so single-character words are skipped.
     * The result lists every n-gram of size $minGram first (in text order),
     * then every n-gram of the next size, and so on up to $maxGram.
     *
     * @param string $text text to tokenize
     *
     * @return string[] n-grams, each made of words joined by a single space
     */
    public function tokenize(string $text): array
    {
        preg_match_all('/\w\w+/u', $text, $matches);

        $words = $matches[0];

        // Collect one batch per n-gram size and merge once at the end, rather
        // than re-copying the growing result on every iteration.
        $batches = [];
        for ($size = $this->minGram; $size <= $this->maxGram; ++$size) {
            $batches[] = $this->getNgrams($words, $size);
        }

        // The constructor guarantees minGram <= maxGram, so there is always
        // at least one batch to merge.
        return array_merge(...$batches);
    }

    /**
     * Builds every run of $n consecutive words, joined by single spaces.
     *
     * @param string[] $match sequentially indexed list of words
     * @param int      $n     number of words per n-gram; callers in this class always pass >= 1
     *
     * @return string[] n-grams in order of their first word; empty when fewer than $n words are given
     */
    private function getNgrams(array $match, int $n = 2): array
    {
        $ngrams = [];

        // Index of the last position where a full window of $n words still fits.
        $lastStart = count($match) - $n;
        for ($start = 0; $start <= $lastStart; ++$start) {
            $ngrams[] = implode(' ', array_slice($match, $start, $n));
        }

        return $ngrams;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body