Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.0.x will end 8 May 2023 (12 months).
  • Bug fixes for security issues in 4.0.x will end 13 November 2023 (18 months).
  • PHP version: minimum PHP 7.3.0. Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.
   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace Phpml\Tokenization;
   6  
   7  use Phpml\Exception\InvalidArgumentException;
   8  
   9  class NGramTokenizer extends WordTokenizer
  10  {
  11      /**
  12       * @var int
  13       */
  14      private $minGram;
  15  
  16      /**
  17       * @var int
  18       */
  19      private $maxGram;
  20  
  21      public function __construct(int $minGram = 1, int $maxGram = 2)
  22      {
  23          if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
  24              throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
  25          }
  26  
  27          $this->minGram = $minGram;
  28          $this->maxGram = $maxGram;
  29      }
  30  
  31      /**
  32       * {@inheritdoc}
  33       */
  34      public function tokenize(string $text): array
  35      {
  36          $words = [];
  37          preg_match_all('/\w\w+/u', $text, $words);
  38  
  39          $nGrams = [];
  40          foreach ($words[0] as $word) {
  41              $this->generateNGrams($word, $nGrams);
  42          }
  43  
  44          return $nGrams;
  45      }
  46  
  47      private function generateNGrams(string $word, array &$nGrams): void
  48      {
  49          $length = mb_strlen($word);
  50  
  51          for ($j = 1; $j <= $this->maxGram; $j++) {
  52              for ($k = 0; $k < $length - $j + 1; $k++) {
  53                  if ($j >= $this->minGram) {
  54                      $nGrams[] = mb_substr($word, $k, $j);
  55                  }
  56              }
  57          }
  58      }
  59  }