Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 3.11.x will end 14 Nov 2022 (12 months plus 6 months extension).
  • Bug fixes for security issues in 3.11.x will end 13 Nov 2023 (18 months plus 12 months extension).
  • PHP version: minimum PHP 7.3.0. Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.
   1  <?php
   2  // This file is part of Moodle - http://moodle.org/
   3  //
   4  // Moodle is free software: you can redistribute it and/or modify
   5  // it under the terms of the GNU General Public License as published by
   6  // the Free Software Foundation, either version 3 of the License, or
   7  // (at your option) any later version.
   8  //
   9  // Moodle is distributed in the hope that it will be useful,
  10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  // GNU General Public License for more details.
  13  //
  14  // You should have received a copy of the GNU General Public License
  15  // along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
  16  
  17  /**
  18   * Datasets manager.
  19   *
  20   * @package   core_analytics
  21   * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
  22   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  23   */
  24  
  25  namespace core_analytics;
  26  
  27  defined('MOODLE_INTERNAL') || die();
  28  
  29  /**
  30   * Datasets manager.
  31   *
  32   * @package   core_analytics
  33   * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
  34   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  35   */
  36  class dataset_manager {
  37  
  38      /**
  39       * File area for labelled datasets.
  40       */
  41      const LABELLED_FILEAREA = 'labelled';
  42  
  43      /**
  44       * File area for unlabelled datasets.
  45       */
  46      const UNLABELLED_FILEAREA = 'unlabelled';
  47  
  48      /**
  49       * File area for exported datasets.
  50       */
  51      const EXPORT_FILEAREA = 'export';
  52  
  53      /**
  54       * Evaluation file file name.
  55       */
  56      const EVALUATION_FILENAME = 'evaluation.csv';
  57  
  58      /**
  59       * The model id.
  60       *
  61       * @var int
  62       */
  63      protected $modelid;
  64  
  65      /**
  66       * Range processor in use.
  67       *
  68       * @var string
  69       */
  70      protected $timesplittingid;
  71  
  72      /**
  73       * @var int
  74       */
  75      protected $analysableid;
  76  
  77      /**
  78       * Whether this is a dataset for evaluation or not.
  79       *
  80       * @var bool
  81       */
  82      protected $evaluation;
  83  
  84      /**
  85       * The dataset filearea. Must be one of the self::*_FILEAREA options.
  86       *
  87       * @var string
  88       */
  89      protected $filearea;
  90  
  91      /**
  92       * Constructor method.
  93       *
  94       * @throws \coding_exception
  95       * @param int $modelid
  96       * @param int $analysableid
  97       * @param string $timesplittingid
  98       * @param string $filearea
  99       * @param bool $evaluation
 100       * @return void
 101       */
 102      public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {
 103  
 104          if ($filearea !== self::EXPORT_FILEAREA && $filearea !== self::LABELLED_FILEAREA &&
 105                  $filearea !== self::UNLABELLED_FILEAREA) {
 106              throw new \coding_exception('Invalid provided filearea');
 107          }
 108  
 109          $this->modelid = $modelid;
 110          $this->analysableid = $analysableid;
 111          $this->timesplittingid = $timesplittingid;
 112          $this->filearea = $filearea;
 113          $this->evaluation = $evaluation;
 114      }
 115  
 116      /**
 117       * Store the dataset in the internal file system.
 118       *
 119       * @param array $data
 120       * @return \stored_file
 121       */
 122      public function store($data) {
 123  
 124          // Delete previous file if it exists.
 125          $fs = get_file_storage();
 126  
 127          $filerecord = [
 128              'component' => 'analytics',
 129              'filearea' => $this->filearea,
 130              'itemid' => $this->modelid,
 131              'contextid' => \context_system::instance()->id,
 132              'filepath' => '/analysable/' . $this->analysableid . '/' .
 133                  \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
 134              'filename' => self::get_filename($this->evaluation)
 135          ];
 136  
 137          // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
 138          if ($this->evaluation) {
 139              $select = " = {$filerecord['itemid']} AND filepath = :filepath";
 140              $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
 141                  $select, array('filepath' => $filerecord['filepath']));
 142          }
 143  
 144          // Write all this stuff to a tmp file.
 145          $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
 146          $fh = fopen($filepath, 'w+');
 147          if (!$fh) {
 148              return false;
 149          }
 150          foreach ($data as $line) {
 151              fputcsv($fh, $line);
 152          }
 153          fclose($fh);
 154  
 155          return $fs->create_file_from_pathname($filerecord, $filepath);
 156      }
 157  
 158      /**
 159       * Returns the previous evaluation file.
 160       *
 161       * Important to note that this is per modelid + timesplittingid, when dealing with multiple
 162       * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
 163       *
 164       * @param int $modelid
 165       * @param string $timesplittingid
 166       * @return \stored_file
 167       */
 168      public static function get_previous_evaluation_file($modelid, $timesplittingid) {
 169          $fs = get_file_storage();
 170          // Evaluation data is always labelled.
 171          $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
 172          return $fs->get_file(\context_system::instance()->id, 'analytics', self::LABELLED_FILEAREA, $modelid,
 173              $filepath, self::EVALUATION_FILENAME);
 174      }
 175  
 176      /**
 177       * Gets the list of files that couldn't be previously used for training and prediction.
 178       *
 179       * @param int $modelid
 180       * @param bool $includetarget
 181       * @param string[] $timesplittingids
 182       * @return null
 183       */
 184      public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
 185          global $DB;
 186  
 187          $fs = get_file_storage();
 188  
 189          if ($includetarget) {
 190              $filearea = self::LABELLED_FILEAREA;
 191              $usedfileaction = 'trained';
 192          } else {
 193              $filearea = self::UNLABELLED_FILEAREA;
 194              $usedfileaction = 'predicted';
 195          }
 196  
 197          $select = 'modelid = :modelid AND action = :action';
 198          $params = array('modelid' => $modelid, 'action' => $usedfileaction);
 199          $usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);
 200  
 201          // Very likely that we will only have 1 time splitting method here.
 202          $filesbytimesplitting = array();
 203          foreach ($timesplittingids as $timesplittingid) {
 204  
 205              $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
 206              $files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
 207              foreach ($files as $file) {
 208  
 209                  // Discard evaluation files.
 210                  if ($file->get_filename() === self::EVALUATION_FILENAME) {
 211                      continue;
 212                  }
 213  
 214                  // No dirs.
 215                  if ($file->is_directory()) {
 216                      continue;
 217                  }
 218  
 219                  // Already used for training.
 220                  if (in_array($file->get_id(), $usedfileids)) {
 221                      continue;
 222                  }
 223  
 224                  $filesbytimesplitting[$timesplittingid][] = $file;
 225              }
 226          }
 227  
 228          return $filesbytimesplitting;
 229      }
 230  
 231      /**
 232       * Deletes previous evaluation files of this model.
 233       *
 234       * @param int $modelid
 235       * @param string $timesplittingid
 236       * @return bool
 237       */
 238      public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
 239          if ($file = self::get_previous_evaluation_file($modelid, $timesplittingid)) {
 240              $file->delete();
 241              return true;
 242          }
 243  
 244          return false;
 245      }
 246  
 247      /**
 248       * Returns this (model + analysable + time splitting) file.
 249       *
 250       * @param int $modelid
 251       * @param int $analysableid
 252       * @param string $timesplittingid
 253       * @return \stored_file
 254       */
 255      public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {
 256  
 257          // Delete previous file if it exists.
 258          $fs = get_file_storage();
 259  
 260          // Always evaluation.csv and labelled as it is an evaluation file.
 261          $filearea = self::LABELLED_FILEAREA;
 262          $filename = self::get_filename(true);
 263          $filepath = '/analysable/' . $analysableid . '/' .
 264              \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
 265          return $fs->get_file(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath, $filename);
 266      }
 267  
 268      /**
 269       * Merge multiple files into one.
 270       *
 271       * Important! It is the caller responsability to ensure that the datasets are compatible.
 272       *
 273       * @param array  $files
 274       * @param int    $modelid
 275       * @param string $timesplittingid
 276       * @param string $filearea
 277       * @param bool   $evaluation
 278       * @return \stored_file
 279       */
 280      public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {
 281  
 282          $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';
 283  
 284          // Add headers.
 285          // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
 286          // once all file contents are merged.
 287          $varnames = '';
 288          $analysablesvalues = array();
 289          foreach ($files as $file) {
 290              $rh = $file->get_content_file_handle();
 291  
 292              // Copy the var names as they are, all files should have the same var names.
 293              $varnames = fgetcsv($rh);
 294  
 295              $analysablesvalues[] = fgetcsv($rh);
 296  
 297              // Copy the columns as they are, all files should have the same columns.
 298              $columns = fgetcsv($rh);
 299          }
 300  
 301          // Merge analysable values skipping the ones that are the same in all analysables.
 302          $values = array();
 303          foreach ($analysablesvalues as $analysablevalues) {
 304              foreach ($analysablevalues as $varkey => $value) {
 305                  // Sha1 to make it unique.
 306                  $values[$varkey][sha1($value)] = $value;
 307              }
 308          }
 309          foreach ($values as $varkey => $varvalues) {
 310              $values[$varkey] = implode('|', $varvalues);
 311          }
 312  
 313          // Start writing to the merge file.
 314          $wh = fopen($tmpfilepath, 'w');
 315          if (!$wh) {
 316              throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
 317          }
 318  
 319          fputcsv($wh, $varnames);
 320          fputcsv($wh, $values);
 321          fputcsv($wh, $columns);
 322  
 323          // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
 324          foreach ($files as $file) {
 325              $rh = $file->get_content_file_handle();
 326  
 327              // Skip headers.
 328              fgets($rh);
 329              fgets($rh);
 330              fgets($rh);
 331  
 332              // Copy all the following lines.
 333              while ($line = fgets($rh)) {
 334                  fwrite($wh, $line);
 335              }
 336              fclose($rh);
 337          }
 338          fclose($wh);
 339  
 340          $filerecord = [
 341              'component' => 'analytics',
 342              'filearea' => $filearea,
 343              'itemid' => $modelid,
 344              'contextid' => \context_system::instance()->id,
 345              'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
 346              'filename' => self::get_filename($evaluation)
 347          ];
 348  
 349          $fs = get_file_storage();
 350  
 351          return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
 352      }
 353  
 354      /**
 355       * Exports the model training data.
 356       *
 357       * @param int $modelid
 358       * @param string $timesplittingid
 359       * @return \stored_file|false
 360       */
 361      public static function export_training_data($modelid, $timesplittingid) {
 362  
 363          $fs = get_file_storage();
 364  
 365          $contextid = \context_system::instance()->id;
 366          $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
 367  
 368          $files = $fs->get_directory_files($contextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
 369              $filepath, true, false);
 370  
 371          // Discard evaluation files.
 372          foreach ($files as $key => $file) {
 373              if ($file->get_filename() === self::EVALUATION_FILENAME) {
 374                  unset($files[$key]);
 375              }
 376          }
 377  
 378          if (empty($files)) {
 379              return false;
 380          }
 381  
 382          return self::merge_datasets($files, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
 383      }
 384  
 385      /**
 386       * Returns the dataset file data structured by sampleids using the indicators and target column names.
 387       *
 388       * @param \stored_file $dataset
 389       * @return array
 390       */
 391      public static function get_structured_data(\stored_file $dataset) {
 392  
 393          if ($dataset->get_filearea() !== 'unlabelled') {
 394              throw new \coding_exception('Sorry, only support for unlabelled data');
 395          }
 396  
 397          $rh = $dataset->get_content_file_handle();
 398  
 399          // Skip dataset info.
 400          fgets($rh);
 401          fgets($rh);
 402  
 403          $calculations = array();
 404  
 405          $headers = fgetcsv($rh);
 406          // Get rid of the sampleid column name.
 407          array_shift($headers);
 408  
 409          while ($columns = fgetcsv($rh)) {
 410              $uniquesampleid = array_shift($columns);
 411  
 412              // Unfortunately fgetcsv does not respect line's var types.
 413              $calculations[$uniquesampleid] = array_map(function($value) {
 414  
 415                  if ($value === '') {
 416                      // We really want them as null because converted to float become 0
 417                      // and we need to treat the values separately.
 418                      return null;
 419                  } else if (is_numeric($value)) {
 420                      return floatval($value);
 421                  }
 422                  return $value;
 423              }, array_combine($headers, $columns));
 424          }
 425  
 426          return $calculations;
 427      }
 428  
 429      /**
 430       * Delete all files of a model.
 431       *
 432       * @param int $modelid
 433       * @return bool
 434       */
 435      public static function clear_model_files($modelid) {
 436          $fs = get_file_storage();
 437          return $fs->delete_area_files(\context_system::instance()->id, 'analytics', false, $modelid);
 438      }
 439  
 440      /**
 441       * Returns the file name to be used.
 442       *
 443       * @param strinbool $evaluation
 444       * @return string
 445       */
 446      protected static function get_filename($evaluation) {
 447  
 448          if ($evaluation === true) {
 449              $filename = self::EVALUATION_FILENAME;
 450          } else {
 451              // Incremental time, the lock will make sure we don't have concurrency problems.
 452              $filename = microtime(true) . '.csv';
 453          }
 454  
 455          return $filename;
 456      }
 457  }