Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.0.x will end 8 May 2023 (12 months).
  • Bug fixes for security issues in 4.0.x will end 13 November 2023 (18 months).
  • PHP version: minimum PHP 7.3.0. Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.
<?php

declare(strict_types=1);

namespace Phpml\FeatureExtraction;

use Phpml\Tokenization\Tokenizer;
use Phpml\Transformer;

/**
 * Turns text samples into token-count vectors.
 *
 * fit() tokenizes the samples and gives every distinct token that is not a
 * stop word the next free integer index. transform() then replaces each sample
 * string, in place, with an array keyed by those indexes whose values are the
 * number of times the corresponding token occurs in that sample.
 */
class TokenCountVectorizer implements Transformer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * @var StopWords|null
     */
    private $stopWords;

    /**
     * Threshold compared against (token frequency / number of samples) in
     * transform(). Values <= 0 disable the check.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => integer index, assigned in order of first appearance
     * while building the vocabulary.
     *
     * @var array
     */
    private $vocabulary = [];

    /**
     * Map of token => number of occurrences counted by transform().
     *
     * Every occurrence is counted (not once per sample) and the map is never
     * reset, so the counts accumulate across successive transform() calls.
     *
     * @var array
     */
    private $frequencies = [];

    /**
     * @param Tokenizer      $tokenizer splits a sample string into tokens
     * @param StopWords|null $stopWords tokens to ignore; null keeps every token
     * @param float          $minDF     minimum frequency ratio; tokens below it are zeroed by transform()
     */
    public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
    {
        $this->tokenizer = $tokenizer;
        $this->stopWords = $stopWords;
        $this->minDF = $minDF;
    }

    /**
     * Builds the vocabulary from the given samples.
     *
     * @param array      $samples list of sample strings
     * @param array|null $targets not used by this transformer
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->buildVocabulary($samples);
    }

    /**
     * Replaces every sample string, in place, with its token-count vector.
     *
     * When minDF is greater than zero, the counts of tokens whose frequency
     * ratio is below minDF are reset to 0 in every sample.
     *
     * @param array      $samples list of sample strings; each entry is overwritten with an array of counts
     * @param array|null $targets optional second argument; it is neither read nor modified here
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        array_walk($samples, function (string &$sample): void {
            $this->transformSample($sample);
        });

        $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as a map of integer index => token.
     */
    public function getVocabulary(): array
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Tokenizes every sample and registers each token in the vocabulary.
     */
    private function buildVocabulary(array &$samples): void
    {
        foreach ($samples as $sample) {
            $tokens = $this->tokenizer->tokenize($sample);

            foreach ($tokens as $token) {
                $this->addTokenToVocabulary($token);
            }
        }
    }

    /**
     * Overwrites $sample with an array of index => occurrence count.
     *
     * Tokens that are stop words or missing from the vocabulary are ignored.
     * Every vocabulary index is present in the result, with 0 for tokens that
     * do not occur in the sample, and the result is sorted by index.
     */
    private function transformSample(string &$sample): void
    {
        $counts = [];
        $tokens = $this->tokenizer->tokenize($sample);

        foreach ($tokens as $token) {
            $index = $this->getTokenIndex($token);
            if ($index === false) {
                continue;
            }

            $this->updateFrequency($token);
            $counts[$index] = ($counts[$index] ?? 0) + 1;
        }

        // Make sure every known token has an entry, even when it is absent from this sample.
        foreach ($this->vocabulary as $index) {
            if (!isset($counts[$index])) {
                $counts[$index] = 0;
            }
        }

        ksort($counts);

        $sample = $counts;
    }

    /**
     * Looks up the vocabulary index of a token.
     *
     * @return int|bool the index, or false for stop words and unknown tokens
     */
    private function getTokenIndex(string $token)
    {
        if ($this->isStopWord($token)) {
            return false;
        }

        return $this->vocabulary[$token] ?? false;
    }

    /**
     * Assigns the next free index to a token unless it is a stop word or already known.
     */
    private function addTokenToVocabulary(string $token): void
    {
        if ($this->isStopWord($token)) {
            return;
        }

        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }
    }

    /**
     * Tells whether the configured stop-word list, if any, contains the token.
     */
    private function isStopWord(string $token): bool
    {
        return $this->stopWords !== null && $this->stopWords->isStopWord($token);
    }

    /**
     * Adds one occurrence of the token to the running frequency map.
     */
    private function updateFrequency(string $token): void
    {
        $this->frequencies[$token] = ($this->frequencies[$token] ?? 0) + 1;
    }

    /**
     * Zeroes, in every sample, the counts of tokens whose frequency ratio is below minDF.
     */
    private function checkDocumentFrequency(array &$samples): void
    {
        // Nothing to reset for an empty batch; returning early also keeps the
        // ratio computation below from dividing by zero.
        if ($this->minDF <= 0 || $samples === []) {
            return;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));

        foreach ($samples as &$sample) {
            $this->resetBeyondMinimum($sample, $beyondMinimum);
        }
        // Break the reference so later writes to $sample cannot touch the last element.
        unset($sample);
    }

    /**
     * Sets the count at each of the given indexes to 0 in one sample.
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
    {
        foreach ($beyondMinimum as $index) {
            $sample[$index] = 0;
        }
    }

    /**
     * Collects the vocabulary indexes of tokens whose accumulated frequency
     * divided by $samplesCount is below minDF.
     *
     * @return int[]
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        $indexes = [];

        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) >= $this->minDF) {
                continue;
            }

            // Numeric tokens come back from the array as integer keys; cast
            // them to string for the typed parameter.
            $index = $this->getTokenIndex((string) $token);

            // Skip a failed lookup: using false as an array key would silently address index 0.
            if ($index !== false) {
                $indexes[] = $index;
            }
        }

        return $indexes;
    }
}