Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.0.x will end 8 May 2023 (12 months).
  • Bug fixes for security issues in 4.0.x will end 13 November 2023 (18 months).
  • PHP version: minimum PHP 7.3.0. Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.
   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace Phpml\Tokenization;
   6  
   7  use Phpml\Exception\InvalidArgumentException;
   8  
   9  class NGramWordTokenizer extends WordTokenizer
  10  {
  11      /**
  12       * @var int
  13       */
  14      private $minGram;
  15  
  16      /**
  17       * @var int
  18       */
  19      private $maxGram;
  20  
  21      public function __construct(int $minGram = 1, int $maxGram = 2)
  22      {
  23          if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
  24              throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
  25          }
  26  
  27          $this->minGram = $minGram;
  28          $this->maxGram = $maxGram;
  29      }
  30  
  31      /**
  32       * {@inheritdoc}
  33       */
  34      public function tokenize(string $text): array
  35      {
  36          preg_match_all('/\w\w+/u', $text, $words);
  37  
  38          $words = $words[0];
  39  
  40          $nGrams = [];
  41          for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
  42              $nGrams = array_merge($nGrams, $this->getNgrams($words, $j));
  43          }
  44  
  45          return $nGrams;
  46      }
  47  
  48      private function getNgrams(array $match, int $n = 2): array
  49      {
  50          $ngrams = [];
  51          $len = count($match);
  52          for ($i = 0; $i < $len; $i++) {
  53              if ($i > ($n - 2)) {
  54                  $ng = '';
  55                  for ($j = $n - 1; $j >= 0; $j--) {
  56                      $ng .= ' '.$match[$i - $j];
  57                  }
  58                  $ngrams[] = trim($ng);
  59              }
  60          }
  61  
  62          return $ngrams;
  63      }
  64  }