See Release Notes
Long Term Support Release
<?php

declare(strict_types=1);

namespace Phpml\FeatureExtraction;

use Phpml\Tokenization\Tokenizer;
use Phpml\Transformer;

/**
 * Turns text samples into token-count vectors.
 *
 * fit() assigns every non-stop-word token a column index (in order of first
 * appearance); transform() replaces each sample string with an array of
 * occurrence counts keyed by those column indexes.
 */
class TokenCountVectorizer implements Transformer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * @var StopWords|null
     */
    private $stopWords;

    /**
     * Threshold used by checkDocumentFrequency(); a value <= 0 disables the check.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => column index.
     *
     * @var array
     */
    private $vocabulary = [];

    /**
     * Map of token => number of occurrences seen by transform().
     *
     * NOTE(review): this counts every occurrence of a token (not the number of
     * distinct samples containing it) and is never reset, so it accumulates
     * across transform() calls. Kept as-is because callers may rely on it.
     *
     * @var array
     */
    private $frequencies = [];

    /**
     * @param Tokenizer      $tokenizer splits a sample into tokens
     * @param StopWords|null $stopWords tokens to ignore, or null to keep every token
     * @param float          $minDF     minimum occurrences-per-sample ratio a token needs
     *                                  to keep its counts; 0.0 disables the filter
     */
    public function __construct(Tokenizer $tokenizer, ?StopWords $stopWords = null, float $minDF = 0.0)
    {
        $this->tokenizer = $tokenizer;
        $this->stopWords = $stopWords;
        $this->minDF = $minDF;
    }

    /**
     * Builds (or extends) the vocabulary from the given samples.
     *
     * @param array      $samples text samples
     * @param array|null $targets not used by this transformer
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->buildVocabulary($samples);
    }

    /**
     * Replaces every sample string, in place, with its token-count vector.
     *
     * When minDF is greater than zero, columns of tokens below that threshold
     * are zeroed in every vector afterwards.
     *
     * @param array      $samples text samples; overwritten with arrays of counts
     * @param array|null $targets not used by this transformer
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        array_walk($samples, function (string &$sample): void {
            $this->transformSample($sample);
        });

        $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as a map of column index => token.
     */
    public function getVocabulary(): array
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Tokenizes every sample and registers each token in the vocabulary.
     */
    private function buildVocabulary(array &$samples): void
    {
        foreach ($samples as $sample) {
            foreach ($this->tokenizer->tokenize($sample) as $token) {
                $this->addTokenToVocabulary($token);
            }
        }
    }

    /**
     * Overwrites $sample with its count vector: one entry per vocabulary
     * column, sorted by column index. Also records each counted occurrence
     * in the running frequency table.
     */
    private function transformSample(string &$sample): void
    {
        $counts = [];

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);
            if ($index === false) {
                // Stop word, or a token that is not in the vocabulary.
                continue;
            }

            $this->updateFrequency($token);
            $counts[$index] = ($counts[$index] ?? 0) + 1;
        }

        // Give every vocabulary column an entry so all vectors share the same keys.
        foreach ($this->vocabulary as $index) {
            if (!isset($counts[$index])) {
                $counts[$index] = 0;
            }
        }

        ksort($counts);

        $sample = $counts;
    }

    /**
     * Looks up the column index of a token.
     *
     * @return int|bool the column index, or false for stop words and unknown tokens
     */
    private function getTokenIndex(string $token)
    {
        if ($this->isStopWord($token)) {
            return false;
        }

        return $this->vocabulary[$token] ?? false;
    }

    /**
     * Assigns the next free column index to a token unless it is a stop word
     * or already known.
     */
    private function addTokenToVocabulary(string $token): void
    {
        if ($this->isStopWord($token)) {
            return;
        }

        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }
    }

    /**
     * Tells whether the token is rejected by the configured stop-word list;
     * always false when no list was given.
     */
    private function isStopWord(string $token): bool
    {
        return $this->stopWords !== null && $this->stopWords->isStopWord($token);
    }

    /**
     * Increments the running occurrence counter of a token.
     */
    private function updateFrequency(string $token): void
    {
        $this->frequencies[$token] = ($this->frequencies[$token] ?? 0) + 1;
    }

    /**
     * Zeroes, in every vector, the columns of tokens that fall below minDF.
     * Does nothing when minDF is not positive or when there are no samples.
     */
    private function checkDocumentFrequency(array &$samples): void
    {
        $samplesCount = count($samples);

        // An empty batch has nothing to reset, and would otherwise be used as a
        // divisor in getBeyondMinimumIndexes().
        if ($this->minDF > 0 && $samplesCount > 0) {
            $beyondMinimum = $this->getBeyondMinimumIndexes($samplesCount);

            foreach ($samples as &$sample) {
                $this->resetBeyondMinimum($sample, $beyondMinimum);
            }

            // Break the reference left behind by the by-reference loop.
            unset($sample);
        }
    }

    /**
     * Sets the listed columns of a single vector to zero.
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum): void
    {
        foreach ($beyondMinimum as $index) {
            $sample[$index] = 0;
        }
    }

    /**
     * Collects the column indexes of tokens whose recorded occurrences divided
     * by $samplesCount is lower than minDF.
     */
    private function getBeyondMinimumIndexes(int $samplesCount): array
    {
        $indexes = [];

        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP turns numeric-string array keys into ints; cast back so the
                // string-typed lookup is valid under strict_types.
                $indexes[] = $this->getTokenIndex((string) $token);
            }
        }

        return $indexes;
    }
}