Search moodle.org's
Developer Documentation

See Release Notes
Long Term Support Release

  • Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
  • Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
  • PHP version: minimum PHP 7.4.0. Note: the minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is also supported.
   1  <?php
   2  // This file is part of Moodle - http://moodle.org/
   3  //
   4  // Moodle is free software: you can redistribute it and/or modify
   5  // it under the terms of the GNU General Public License as published by
   6  // the Free Software Foundation, either version 3 of the License, or
   7  // (at your option) any later version.
   8  //
   9  // Moodle is distributed in the hope that it will be useful,
  10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  // GNU General Public License for more details.
  13  //
  14  // You should have received a copy of the GNU General Public License
  15  // along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
  16  
  17  /**
  18   * Php predictions processor
  19   *
  20   * @package   mlbackend_php
  21   * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
  22   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  23   */
  24  
  25  namespace mlbackend_php;
  26  
  27  defined('MOODLE_INTERNAL') || die();
  28  
  29  use Phpml\Preprocessing\Normalizer;
  30  use Phpml\CrossValidation\RandomSplit;
  31  use Phpml\Dataset\ArrayDataset;
  32  use Phpml\ModelManager;
  33  use Phpml\Classification\Linear\LogisticRegression;
  34  use Phpml\Metric\ClassificationReport;
  35  
  36  /**
  37   * PHP predictions processor.
  38   *
  39   * @package   mlbackend_php
  40   * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
  41   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  42   */
  43  class processor implements \core_analytics\classifier, \core_analytics\regressor, \core_analytics\packable {
  44  
  45      /**
  46       * Size of training / prediction batches.
  47       */
  48      const BATCH_SIZE = 5000;
  49  
  50      /**
  51       * Number of train iterations.
  52       */
  53      const TRAIN_ITERATIONS = 500;
  54  
  55      /**
  56       * File name of the serialised model.
  57       */
  58      const MODEL_FILENAME = 'model.ser';
  59  
  60      /**
  61       * @var bool
  62       */
  63      protected $limitedsize = false;
  64  
  65      /**
  66       * Checks if the processor is ready to use.
  67       *
  68       * @return bool
  69       */
  70      public function is_ready() {
  71          if (version_compare(phpversion(), '7.0.0') < 0) {
  72              return get_string('errorphp7required', 'mlbackend_php');
  73          }
  74          return true;
  75      }
  76  
  77      /**
  78       * Delete the stored models.
  79       *
  80       * @param string $uniqueid
  81       * @param string $modelversionoutputdir
  82       * @return null
  83       */
  84      public function clear_model($uniqueid, $modelversionoutputdir) {
  85          remove_dir($modelversionoutputdir);
  86      }
  87  
  88      /**
  89       * Delete the output directory.
  90       *
  91       * @param string $modeloutputdir
  92       * @param string $uniqueid
  93       * @return null
  94       */
  95      public function delete_output_dir($modeloutputdir, $uniqueid) {
  96          remove_dir($modeloutputdir);
  97      }
  98  
  99      /**
 100       * Train this processor classification model using the provided supervised learning dataset.
 101       *
 102       * @param string $uniqueid
 103       * @param \stored_file $dataset
 104       * @param string $outputdir
 105       * @return \stdClass
 106       */
 107      public function train_classification($uniqueid, \stored_file $dataset, $outputdir) {
 108  
 109          $modelfilepath = $this->get_model_filepath($outputdir);
 110  
 111          $modelmanager = new ModelManager();
 112  
 113          if (file_exists($modelfilepath)) {
 114              $classifier = $modelmanager->restoreFromFile($modelfilepath);
 115          } else {
 116              $classifier = $this->instantiate_algorithm();
 117          }
 118  
 119          $fh = $dataset->get_content_file_handle();
 120  
 121          // The first lines are var names and the second one values.
 122          $metadata = $this->extract_metadata($fh);
 123  
 124          // Skip headers.
 125          fgets($fh);
 126  
 127          $samples = array();
 128          $targets = array();
 129          while (($data = fgetcsv($fh)) !== false) {
 130              $sampledata = array_map('floatval', $data);
 131              $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
 132              $targets[] = intval($data[$metadata['nfeatures']]);
 133  
 134              $nsamples = count($samples);
 135              if ($nsamples === self::BATCH_SIZE) {
 136                  // Training it batches to avoid running out of memory.
 137                  $classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));
 138                  $samples = array();
 139                  $targets = array();
 140              }
 141              if (empty($morethan1sample) && $nsamples > 1) {
 142                  $morethan1sample = true;
 143              }
 144          }
 145          fclose($fh);
 146  
 147          if (empty($morethan1sample)) {
 148              $resultobj = new \stdClass();
 149              $resultobj->status = \core_analytics\model::NO_DATASET;
 150              $resultobj->info = array();
 151              return $resultobj;
 152          }
 153  
 154          // Train the remaining samples.
 155          if ($samples) {
 156              $classifier->partialTrain($samples, $targets, json_decode($metadata['targetclasses']));
 157          }
 158  
 159          $resultobj = new \stdClass();
 160          $resultobj->status = \core_analytics\model::OK;
 161          $resultobj->info = array();
 162  
 163          // Store the trained model.
 164          $modelmanager->saveToFile($classifier, $modelfilepath);
 165  
 166          return $resultobj;
 167      }
 168  
 169      /**
 170       * Classifies the provided dataset samples.
 171       *
 172       * @param string $uniqueid
 173       * @param \stored_file $dataset
 174       * @param string $outputdir
 175       * @return \stdClass
 176       */
 177      public function classify($uniqueid, \stored_file $dataset, $outputdir) {
 178  
 179          $classifier = $this->load_classifier($outputdir);
 180  
 181          $fh = $dataset->get_content_file_handle();
 182  
 183          // The first lines are var names and the second one values.
 184          $metadata = $this->extract_metadata($fh);
 185  
 186          // Skip headers.
 187          fgets($fh);
 188  
 189          $sampleids = array();
 190          $samples = array();
 191          $predictions = array();
 192          while (($data = fgetcsv($fh)) !== false) {
 193              $sampledata = array_map('floatval', $data);
 194              $sampleids[] = $data[0];
 195              $samples[] = array_slice($sampledata, 1, $metadata['nfeatures']);
 196  
 197              if (count($samples) === self::BATCH_SIZE) {
 198                  // Prediction it batches to avoid running out of memory.
 199  
 200                  // Append predictions incrementally, we want $sampleids keys in sync with $predictions keys.
 201                  $newpredictions = $classifier->predict($samples);
 202                  foreach ($newpredictions as $prediction) {
 203                      array_push($predictions, $prediction);
 204                  }
 205                  $samples = array();
 206              }
 207          }
 208          fclose($fh);
 209  
 210          // Finish the remaining predictions.
 211          if ($samples) {
 212              $predictions = $predictions + $classifier->predict($samples);
 213          }
 214  
 215          $resultobj = new \stdClass();
 216          $resultobj->status = \core_analytics\model::OK;
 217          $resultobj->info = array();
 218  
 219          foreach ($predictions as $index => $prediction) {
 220              $resultobj->predictions[$index] = array($sampleids[$index], $prediction);
 221          }
 222  
 223          return $resultobj;
 224      }
 225  
 226      /**
 227       * Evaluates this processor classification model using the provided supervised learning dataset.
 228       *
 229       * During evaluation we need to shuffle the evaluation dataset samples to detect deviated results,
 230       * if the dataset is massive we can not load everything into memory. We know that 2GB is the
 231       * minimum memory limit we should have (\core_analytics\model::heavy_duty_mode), if we substract the memory
 232       * that we already consumed and the memory that Phpml algorithms will need we should still have at
 233       * least 500MB of memory, which should be enough to evaluate a model. In any case this is a robust
 234       * solution that will work for all sites but it should minimize memory limit problems. Site admins
 235       * can still set $CFG->mlbackend_php_no_evaluation_limits to true to skip this 500MB limit.
 236       *
 237       * @param string $uniqueid
 238       * @param float $maxdeviation
 239       * @param int $niterations
 240       * @param \stored_file $dataset
 241       * @param string $outputdir
 242       * @param  string $trainedmodeldir
 243       * @return \stdClass
 244       */
 245      public function evaluate_classification($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
 246              $outputdir, $trainedmodeldir) {
 247          $fh = $dataset->get_content_file_handle();
 248  
 249          if ($trainedmodeldir) {
 250              // We overwrite the number of iterations as the results will always be the same.
 251              $niterations = 1;
 252              $classifier = $this->load_classifier($trainedmodeldir);
 253          }
 254  
 255          // The first lines are var names and the second one values.
 256          $metadata = $this->extract_metadata($fh);
 257  
 258          // Skip headers.
 259          fgets($fh);
 260  
 261          if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
 262              $samplessize = 0;
 263              $limit = get_real_size('500MB');
 264  
 265              // Just an approximation, will depend on PHP version, compile options...
 266              // Double size + zval struct (6 bytes + 8 bytes + 16 bytes) + array bucket (96 bytes)
 267              // https://nikic.github.io/2011/12/12/How-big-are-PHP-arrays-really-Hint-BIG.html.
 268              $floatsize = (PHP_INT_SIZE * 2) + 6 + 8 + 16 + 96;
 269          }
 270  
 271          $samples = array();
 272          $targets = array();
 273          while (($data = fgetcsv($fh)) !== false) {
 274              $sampledata = array_map('floatval', $data);
 275  
 276              $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
 277              $targets[] = intval($data[$metadata['nfeatures']]);
 278  
 279              if (empty($CFG->mlbackend_php_no_evaluation_limits)) {
 280                  // We allow admins to disable evaluation memory usage limits by modifying config.php.
 281  
 282                  // We will have plenty of missing values in the dataset so it should be a conservative approximation.
 283                  $samplessize = $samplessize + (count($sampledata) * $floatsize);
 284  
 285                  // Stop fetching more samples.
 286                  if ($samplessize >= $limit) {
 287                      $this->limitedsize = true;
 288                      break;
 289                  }
 290              }
 291          }
 292          fclose($fh);
 293  
 294          // We need at least 2 samples belonging to each target.
 295          $counts = array_count_values($targets);
 296          $ntargets = count(explode(',', $metadata['targetclasses']));
 297          foreach ($counts as $count) {
 298              if ($count < 2) {
 299                  $notenoughdata = true;
 300              }
 301          }
 302          if ($ntargets > count($counts)) {
 303              $notenoughdata = true;
 304          }
 305          if (!empty($notenoughdata)) {
 306              $resultobj = new \stdClass();
 307              $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
 308              $resultobj->score = 0;
 309              $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
 310              return $resultobj;
 311          }
 312  
 313          $scores = array();
 314  
 315          // Evaluate the model multiple times to confirm the results are not significantly random due to a short amount of data.
 316          for ($i = 0; $i < $niterations; $i++) {
 317  
 318              if (!$trainedmodeldir) {
 319                  $classifier = $this->instantiate_algorithm();
 320  
 321                  // Split up the dataset in classifier and testing.
 322                  $data = new RandomSplit(new ArrayDataset($samples, $targets), 0.2);
 323  
 324                  $classifier->train($data->getTrainSamples(), $data->getTrainLabels());
 325                  $predictedlabels = $classifier->predict($data->getTestSamples());
 326                  $report = new ClassificationReport($data->getTestLabels(), $predictedlabels,
 327                      ClassificationReport::WEIGHTED_AVERAGE);
 328              } else {
 329                  $predictedlabels = $classifier->predict($samples);
 330                  $report = new ClassificationReport($targets, $predictedlabels,
 331                      ClassificationReport::WEIGHTED_AVERAGE);
 332              }
 333              $averages = $report->getAverage();
 334              $scores[] = $averages['f1score'];
 335          }
 336  
 337          // Let's fill the results changing the returned status code depending on the phi-related calculated metrics.
 338          return $this->get_evaluation_result_object($dataset, $scores, $maxdeviation);
 339      }
 340  
 341      /**
 342       * Returns the results objects from all evaluations.
 343       *
 344       * @param \stored_file $dataset
 345       * @param array $scores
 346       * @param float $maxdeviation
 347       * @return \stdClass
 348       */
 349      protected function get_evaluation_result_object(\stored_file $dataset, $scores, $maxdeviation) {
 350  
 351          // Average f1 score of all evaluations as final score.
 352          if (count($scores) === 1) {
 353              $avgscore = reset($scores);
 354          } else {
 355              $avgscore = \Phpml\Math\Statistic\Mean::arithmetic($scores);
 356          }
 357  
 358          // Standard deviation should ideally be calculated against the area under the curve.
 359          if (count($scores) === 1) {
 360              $modeldev = 0;
 361          } else {
 362              $modeldev = \Phpml\Math\Statistic\StandardDeviation::population($scores);
 363          }
 364  
 365          // Let's fill the results object.
 366          $resultobj = new \stdClass();
 367  
 368          // Zero is ok, now we add other bits if something is not right.
 369          $resultobj->status = \core_analytics\model::OK;
 370          $resultobj->info = array();
 371          $resultobj->score = $avgscore;
 372  
 373          // If each iteration results varied too much we need more data to confirm that this is a valid model.
 374          if ($modeldev > $maxdeviation) {
 375              $resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;
 376              $a = new \stdClass();
 377              $a->deviation = $modeldev;
 378              $a->accepteddeviation = $maxdeviation;
 379              $resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);
 380          }
 381  
 382          if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
 383              $resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;
 384              $a = new \stdClass();
 385              $a->score = $resultobj->score;
 386              $a->minscore = \core_analytics\model::MIN_SCORE;
 387              $resultobj->info[] = get_string('errorlowscore', 'mlbackend_php', $a);
 388          }
 389  
 390          if ($this->limitedsize === true) {
 391              $resultobj->info[] = get_string('datasetsizelimited', 'mlbackend_php', display_size($dataset->get_filesize()));
 392          }
 393  
 394          return $resultobj;
 395      }
 396  
 397      /**
 398       * Loads the pre-trained classifier.
 399       *
 400       * @throws \moodle_exception
 401       * @param string $outputdir
 402       * @return \Phpml\Classification\Linear\LogisticRegression
 403       */
 404      protected function load_classifier($outputdir) {
 405          $modelfilepath = $this->get_model_filepath($outputdir);
 406  
 407          if (!file_exists($modelfilepath)) {
 408              throw new \moodle_exception('errorcantloadmodel', 'mlbackend_php', '', $modelfilepath);
 409          }
 410  
 411          $modelmanager = new ModelManager();
 412          return $modelmanager->restoreFromFile($modelfilepath);
 413      }
 414  
 415      /**
 416       * Train this processor regression model using the provided supervised learning dataset.
 417       *
 418       * @throws new \coding_exception
 419       * @param string $uniqueid
 420       * @param \stored_file $dataset
 421       * @param string $outputdir
 422       * @return \stdClass
 423       */
 424      public function train_regression($uniqueid, \stored_file $dataset, $outputdir) {
 425          throw new \coding_exception('This predictor does not support regression yet.');
 426      }
 427  
 428      /**
 429       * Estimates linear values for the provided dataset samples.
 430       *
 431       * @throws new \coding_exception
 432       * @param string $uniqueid
 433       * @param \stored_file $dataset
 434       * @param mixed $outputdir
 435       * @return void
 436       */
 437      public function estimate($uniqueid, \stored_file $dataset, $outputdir) {
 438          throw new \coding_exception('This predictor does not support regression yet.');
 439      }
 440  
 441      /**
 442       * Evaluates this processor regression model using the provided supervised learning dataset.
 443       *
 444       * @throws new \coding_exception
 445       * @param string $uniqueid
 446       * @param float $maxdeviation
 447       * @param int $niterations
 448       * @param \stored_file $dataset
 449       * @param string $outputdir
 450       * @param  string $trainedmodeldir
 451       * @return \stdClass
 452       */
 453      public function evaluate_regression($uniqueid, $maxdeviation, $niterations, \stored_file $dataset,
 454              $outputdir, $trainedmodeldir) {
 455          throw new \coding_exception('This predictor does not support regression yet.');
 456      }
 457  
 458      /**
 459       * Exports the machine learning model.
 460       *
 461       * @throws \moodle_exception
 462       * @param  string $uniqueid  The model unique id
 463       * @param  string $modeldir  The directory that contains the trained model.
 464       * @return string            The path to the directory that contains the exported model.
 465       */
 466      public function export(string $uniqueid, string $modeldir) : string {
 467  
 468          $modelfilepath = $this->get_model_filepath($modeldir);
 469  
 470          if (!file_exists($modelfilepath)) {
 471              throw new \moodle_exception('errorexportmodelresult', 'analytics');
 472          }
 473  
 474          // We can use the actual $modeldir as the directory is not modified during export, just copied into a zip.
 475          return $modeldir;
 476      }
 477  
 478      /**
 479       * Imports the provided machine learning model.
 480       *
 481       * @param  string $uniqueid The model unique id
 482       * @param  string $modeldir  The directory that will contain the trained model.
 483       * @param  string $importdir The directory that contains the files to import.
 484       * @return bool Success
 485       */
 486      public function import(string $uniqueid, string $modeldir, string $importdir) : bool {
 487  
 488          $importmodelfilepath = $this->get_model_filepath($importdir);
 489          $modelfilepath = $this->get_model_filepath($modeldir);
 490  
 491          $modelmanager = new ModelManager();
 492  
 493          // Copied from ModelManager::restoreFromFile to validate the serialised contents
 494          // before restoring them.
 495          $importconfig = file_get_contents($importmodelfilepath);
 496  
 497          // Clean stuff like function calls.
 498          $importconfig = preg_replace('/[^a-zA-Z0-9\{\}%\.\*\;\,\:\"\-\0\\\]/', '', $importconfig);
 499  
 500          $object = unserialize($importconfig,
 501              ['allowed_classes' => ['Phpml\\Classification\\Linear\\LogisticRegression']]);
 502          if (!$object) {
 503              return false;
 504          }
 505  
 506          if (get_class($object) == '__PHP_Incomplete_Class') {
 507              return false;
 508          }
 509  
 510          $classifier = $modelmanager->restoreFromFile($importmodelfilepath);
 511  
 512          // This would override any previous classifier.
 513          $modelmanager->saveToFile($classifier, $modelfilepath);
 514  
 515          return true;
 516      }
 517  
 518      /**
 519       * Returns the path to the serialised model file in the provided directory.
 520       *
 521       * @param  string $modeldir The model directory
 522       * @return string           The model file
 523       */
 524      protected function get_model_filepath(string $modeldir) : string {
 525          // Output directory is already unique to the model.
 526          return $modeldir . DIRECTORY_SEPARATOR . self::MODEL_FILENAME;
 527      }
 528  
 529      /**
 530       * Extracts metadata from the dataset file.
 531       *
 532       * The file poiter should be located at the top of the file.
 533       *
 534       * @param resource $fh
 535       * @return array
 536       */
 537      protected function extract_metadata($fh) {
 538          $metadata = fgetcsv($fh);
 539          return array_combine($metadata, fgetcsv($fh));
 540      }
 541  
 542      /**
 543       * Instantiates the ML algorithm.
 544       *
 545       * @return \Phpml\Classification\Linear\LogisticRegression
 546       */
 547      protected function instantiate_algorithm(): \Phpml\Classification\Linear\LogisticRegression {
 548          return new LogisticRegression(self::TRAIN_ITERATIONS, true,
 549              LogisticRegression::CONJUGATE_GRAD_TRAINING, 'log');
 550      }
 551  }