Search moodle.org's
Developer Documentation

See Release Notes
Long Term Support Release

  • Bug fixes for general core bugs in 3.9.x will end* 10 May 2021 (12 months).
  • Bug fixes for security issues in 3.9.x will end* 8 May 2023 (36 months).
  • PHP version: minimum PHP 7.2.0. Note: the minimum PHP version has increased since Moodle 3.8. PHP 7.3.x and 7.4.x are also supported.
   1  <?php
   2  // This file is part of Moodle - http://moodle.org/
   3  //
   4  // Moodle is free software: you can redistribute it and/or modify
   5  // it under the terms of the GNU General Public License as published by
   6  // the Free Software Foundation, either version 3 of the License, or
   7  // (at your option) any later version.
   8  //
   9  // Moodle is distributed in the hope that it will be useful,
  10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  // GNU General Public License for more details.
  13  //
  14  // You should have received a copy of the GNU General Public License
  15  // along with Moodle.  If not, see <http://www.gnu.org/licenses/>.
  16  
  17  /**
  18   * Runs an analysis of the site.
  19   *
  20   * @package   core_analytics
  21   * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
  22   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  23   */
  24  
  25  namespace core_analytics;
  26  
  27  defined('MOODLE_INTERNAL') || die();
  28  
  29  /**
  30   * Runs an analysis of the site.
  31   *
  32   * @package   core_analytics
  33   * @copyright 2019 David Monllao {@link http://www.davidmonllao.com}
  34   * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  35   */
  36  class analysis {
  37  
    /**
     * @var \core_analytics\local\analyser\base Analyser that supplies the options, analysables, samples,
     *      time splitting methods, indicators and target used during the analysis.
     */
    private $analyser;

    /**
     * @var bool Whether to calculate the target or not in this run. When true the run is recorded
     *      with the 'training' action, otherwise with the 'prediction' action.
     */
    private $includetarget;

    /**
     * @var \core_analytics\local\analysis\result Object that receives the per-analysable results,
     *      formats the calculated datasets and provides previously cached results.
     */
    private $result;

    /**
     * @var \core\lock\lock NOTE(review): not referenced by the methods visible in this part of the file;
     *      presumably the lock held while an analysable is being analysed - TODO confirm.
     */
    private $lock;
  57  
  58      /**
  59       * Constructor.
  60       *
  61       * @param \core_analytics\local\analyser\base   $analyser
  62       * @param bool                                  $includetarget Whether to calculate the target or not.
  63       * @param \core_analytics\local\analysis\result $result
  64       */
  65      public function __construct(\core_analytics\local\analyser\base $analyser, bool $includetarget,
  66              \core_analytics\local\analysis\result $result) {
  67          $this->analyser = $analyser;
  68          $this->includetarget = $includetarget;
  69          $this->result = $result;
  70  
  71          // We cache the first time analysables were analysed because time-splitting methods can depend on these info.
  72          self::fill_firstanalyses_cache($this->analyser->get_modelid());
  73      }
  74  
  75      /**
  76       * Runs the analysis.
  77       *
  78       * @param \context[] $contexts Restrict the analysis to these contexts. No context restrictions if null.
  79       * @return null
  80       */
  81      public function run(array $contexts = []) {
  82  
  83          $options = $this->analyser->get_options();
  84  
  85          // Time limit control.
  86          $modeltimelimit = intval(get_config('analytics', 'modeltimelimit'));
  87  
  88          if ($this->includetarget) {
  89              $action = 'training';
  90          } else {
  91              $action = 'prediction';
  92          }
  93          $analysables = $this->analyser->get_analysables_iterator($action, $contexts);
  94  
  95          $processedanalysables = $this->get_processed_analysables();
  96  
  97          $inittime = microtime(true);
  98          foreach ($analysables as $analysable) {
  99              $processed = false;
 100  
 101              if (!$analysable) {
 102                  continue;
 103              }
 104  
 105              $analysableresults = $this->process_analysable($analysable);
 106              if ($analysableresults) {
 107                  $processed = $this->result->add_analysable_results($analysableresults);
 108                  if (!$processed) {
 109                      $errors = array();
 110                      foreach ($analysableresults as $timesplittingid => $result) {
 111                          $str = '';
 112                          if (count($analysableresults) > 1) {
 113                              $str .= $timesplittingid . ': ';
 114                          }
 115                          $str .= $result->message;
 116                          $errors[] = $str;
 117                      }
 118  
 119                      $a = new \stdClass();
 120                      $a->analysableid = $analysable->get_name();
 121                      $a->errors = implode(', ', $errors);
 122                      $this->analyser->add_log(get_string('analysablenotused', 'analytics', $a));
 123                  }
 124              }
 125  
 126              if (!$options['evaluation']) {
 127  
 128                  if (empty($processedanalysables[$analysable->get_id()]) ||
 129                          $this->analyser->get_target()->always_update_analysis_time() || $processed) {
 130                      // We store the list of processed analysables even if the target does not always_update_analysis_time(),
 131                      // what always_update_analysis_time controls is the update of the data.
 132                      $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id());
 133                  }
 134  
 135                  // Apply time limit.
 136                  $timespent = microtime(true) - $inittime;
 137                  if ($modeltimelimit <= $timespent) {
 138                      break;
 139                  }
 140              }
 141          }
 142  
 143          // Force GC to clean up the indicator instances used during the last iteration.
 144          $this->analyser->instantiate_indicators();
 145      }
 146  
 147      /**
 148       * Get analysables that have been already processed.
 149       *
 150       * @return \stdClass[]
 151       */
 152      protected function get_processed_analysables(): array {
 153          global $DB;
 154  
 155          $params = array('modelid' => $this->analyser->get_modelid());
 156          $params['action'] = ($this->includetarget) ? 'training' : 'prediction';
 157          $select = 'modelid = :modelid and action = :action';
 158  
 159          // Weird select fields ordering for performance (analysableid key matching, analysableid is also unique by modelid).
 160          return $DB->get_records_select('analytics_used_analysables', $select,
 161              $params, 'timeanalysed DESC', 'analysableid, modelid, action, firstanalysis, timeanalysed, id AS primarykey');
 162      }
 163  
 164      /**
 165       * Processes an analysable
 166       *
 167       * This method returns the general analysable status, an array of files by time splitting method and
 168       * an error message if there is any problem.
 169       *
 170       * @param \core_analytics\analysable $analysable
 171       * @return \stdClass[] Results objects by time splitting method
 172       */
 173      public function process_analysable(\core_analytics\analysable $analysable): array {
 174  
 175          // Target instances scope is per-analysable (it can't be lower as calculations run once per
 176          // analysable, not time splitting method nor time range).
 177          $target = call_user_func(array($this->analyser->get_target(), 'instance'));
 178  
 179          // We need to check that the analysable is valid for the target even if we don't include targets
 180          // as we still need to discard invalid analysables for the target.
 181          $isvalidresult = $target->is_valid_analysable($analysable, $this->includetarget);
 182          if ($isvalidresult !== true) {
 183              $a = new \stdClass();
 184              $a->analysableid = $analysable->get_name();
 185              $a->result = $isvalidresult;
 186              $this->analyser->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
 187              return array();
 188          }
 189  
 190          // Process all provided time splitting methods.
 191          $results = array();
 192          foreach ($this->analyser->get_timesplittings() as $timesplitting) {
 193  
 194              $cachedresult = $this->result->retrieve_cached_result($timesplitting, $analysable);
 195              if ($cachedresult) {
 196                  $result = new \stdClass();
 197                  $result->result = $cachedresult;
 198                  $results[$timesplitting->get_id()] = $result;
 199                  continue;
 200              }
 201  
 202              $results[$timesplitting->get_id()] = $this->process_time_splitting($timesplitting, $analysable, $target);
 203          }
 204  
 205          return $results;
 206      }
 207  
    /**
     * Processes the analysable samples using the provided time splitting method.
     *
     * The returned object always contains a 'status' (one of the \core_analytics\model constants used
     * below) and a 'message'; the 'result' attribute is only set when the analysis finished successfully.
     *
     * Once init_analysable_analysis() has flagged the analysis as started, every exit path (return or
     * exception) calls finish_analysable_analysis() before leaving the method.
     *
     * @throws \moodle_exception If the result object could not format the calculated data.
     * @throws \Throwable Anything thrown by the target, the indicators or the result object is re-thrown.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\analysable $analysable
     * @param \core_analytics\local\target\base $target
     * @return \stdClass Results object.
     */
    protected function process_time_splitting(\core_analytics\local\time_splitting\base $timesplitting,
            \core_analytics\analysable $analysable, \core_analytics\local\target\base $target): \stdClass {

        $options = $this->analyser->get_options();

        $result = new \stdClass();

        // The time splitting method needs to know the model before it can validate the analysable.
        $timesplitting->set_modelid($this->analyser->get_modelid());
        if (!$timesplitting->is_valid_analysable($analysable)) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
                $timesplitting->get_name());
            return $result;
        }
        $timesplitting->set_analysable($analysable);

        // Progress output, only for command line runs that are not unit tests.
        if (CLI_SCRIPT && !PHPUNIT_TEST) {
            mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
                '" time splitting method...');
        }

        // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
        // attempt... it is on what we will base indicators calculations.
        list($sampleids, $samplesdata) = $this->analyser->get_all_samples($analysable);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nodata', 'analytics');
            return $result;
        }

        if ($this->includetarget) {
            // All ranges are used when we are calculating data for training.
            $ranges = $timesplitting->get_training_ranges();
        } else {
            // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
            $ranges = $timesplitting->get_most_recent_prediction_range();
        }

        // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
        if ($options['evaluation'] === false) {

            if (empty($ranges)) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('noranges', 'analytics');
                return $result;
            }

            // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
            if (!$target::based_on_assumptions()) {
                // Targets based on assumptions can not be trained.
                $this->filter_out_train_samples($sampleids, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            // Only when processing data for predictions.
            if (!$this->includetarget) {
                // We also filter out samples and ranges that have already been used for predictions.
                // The returned record is passed to save_prediction_samples() further below.
                $predictsamplesrecord = $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
            }

            // Checked again because the prediction filter above may have emptied the list of samples.
            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            if (count($ranges) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewranges', 'analytics');
                return $result;
            }
        }

        // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
        // From this point on finish_analysable_analysis() has to be called before leaving the method.
        if (!$this->init_analysable_analysis($timesplitting->get_id(), $analysable->get_id())) {
            // If this model + analysable + timesplitting combination is being analysed we skip this process.
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('analysisinprogress', 'analytics');
            return $result;
        }

        // Remove samples the target consider invalid.
        try {
            $target->add_sample_data($samplesdata);
            $target->filter_out_invalid_samples($sampleids, $analysable, $this->includetarget);
        } catch (\Throwable $e) {
            // Release the analysis flag before propagating the problem.
            $this->finish_analysable_analysis();
            throw $e;
        }

        if (!$sampleids) {
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('novalidsamples', 'analytics');
            $this->finish_analysable_analysis();
            return $result;
        }

        try {
            // Instantiate empty indicators to ensure that no garbage is dragged from previous analyses.
            $indicators = $this->analyser->instantiate_indicators();
            foreach ($indicators as $key => $indicator) {
                // The analyser attaches the main entities the sample depends on and are provided to the
                // indicator to calculate the sample.
                $indicators[$key]->add_sample_data($samplesdata);
            }

            // Here we start the memory intensive process that will last until $data var is
            // unset (until the method is finished basically).
            $data = $this->calculate($timesplitting, $sampleids, $ranges, $target);
        } catch (\Throwable $e) {
            // Release the analysis flag before propagating the problem.
            $this->finish_analysable_analysis();
            throw $e;
        }

        // calculate() returns null when there is nothing to build a dataset from.
        if (!$data) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('novaliddata', 'analytics');
            $this->finish_analysable_analysis();
            return $result;
        }

        try {
            // No need to keep track of analysed stuff when evaluating.
            if ($options['evaluation'] === false) {
                // Save the samples that have been already analysed so they are not analysed again in future.

                if ($this->includetarget) {
                    $this->save_train_samples($sampleids, $timesplitting);
                } else {
                    // The variable $predictsamplesrecord will always be set as filter_out_prediction_samples_and_ranges
                    // will always be called before it (no evaluation mode and no includetarget).
                    $this->save_prediction_samples($sampleids, $ranges, $timesplitting, $predictsamplesrecord);
                }
            }

            // We need to pass all the analysis data.
            $formattedresult = $this->result->format_result($data, $target, $timesplitting, $analysable);

        } catch (\Throwable $e) {
            // Release the analysis flag before propagating the problem.
            $this->finish_analysable_analysis();
            throw $e;
        }

        if (!$formattedresult) {
            $this->finish_analysable_analysis();
            throw new \moodle_exception('errorcannotwritedataset', 'analytics');
        }

        $result->status = \core_analytics\model::OK;
        $result->message = get_string('successfullyanalysed', 'analytics');
        $result->result = $formattedresult;

        // Flag the model + analysable + timesplitting as analysed.
        $this->finish_analysable_analysis();

        return $result;
    }
 379  
 380      /**
 381       * Calculates indicators and targets.
 382       *
 383       * @param \core_analytics\local\time_splitting\base $timesplitting
 384       * @param array $sampleids
 385       * @param array $ranges
 386       * @param \core_analytics\local\target\base $target
 387       * @return array|null
 388       */
 389      public function calculate(\core_analytics\local\time_splitting\base $timesplitting, array &$sampleids,
 390              array $ranges, \core_analytics\local\target\base $target): ?array {
 391  
 392          $calculatedtarget = null;
 393          if ($this->includetarget) {
 394              // We first calculate the target because analysable data may still be invalid or none
 395              // of the analysable samples may be valid.
 396              $calculatedtarget = $target->calculate($sampleids, $timesplitting->get_analysable());
 397  
 398              // We remove samples we can not calculate their target.
 399              $sampleids = array_filter($sampleids, function($sampleid) use ($calculatedtarget) {
 400                  if (is_null($calculatedtarget[$sampleid])) {
 401                      return false;
 402                  }
 403                  return true;
 404              });
 405          }
 406  
 407          // No need to continue calculating if the target couldn't be calculated for any sample.
 408          if (empty($sampleids)) {
 409              return null;
 410          }
 411  
 412          $dataset = $this->calculate_indicators($timesplitting, $sampleids, $ranges);
 413  
 414          if (empty($dataset)) {
 415              return null;
 416          }
 417  
 418          // Now that we have the indicators in place we can add the time range indicators (and target if provided) to each of them.
 419          $this->fill_dataset($timesplitting, $dataset, $calculatedtarget);
 420  
 421          $this->add_context_metadata($timesplitting, $dataset, $target);
 422  
 423          if (!PHPUNIT_TEST && CLI_SCRIPT) {
 424              echo PHP_EOL;
 425          }
 426  
 427          return $dataset;
 428      }
 429  
    /**
     * Calculates indicators.
     *
     * Every indicator of the analyser is calculated for each of the provided ranges. The returned
     * dataset is indexed by the unique sample id built by the time splitting method from the sample id
     * and the range index, each value being the list of features of all indicators in calculation order.
     * Samples that no indicator reported as not null are removed from the returned dataset.
     *
     * Outside of evaluation mode, and only if the time splitting method caches indicator calculations,
     * the new calculations are stored in the analytics_indicator_calc table in batches.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $sampleids
     * @param array $ranges
     * @return array
     */
    protected function calculate_indicators(\core_analytics\local\time_splitting\base $timesplitting, array $sampleids,
            array $ranges): array {
        global $DB;

        $options = $this->analyser->get_options();

        $dataset = array();

        // Faster to run 1 db query per range.
        $existingcalculations = array();
        if ($timesplitting->cache_indicator_calculations()) {
            foreach ($ranges as $rangeindex => $range) {
                // Load existing calculations.
                $existingcalculations[$rangeindex] = \core_analytics\manager::get_indicator_calculations(
                    $timesplitting->get_analysable(), $range['start'], $range['end'], $this->analyser->get_samples_origin());
            }
        }

        // Here we store samples which calculations are not all null.
        $notnulls = array();

        // Fill the dataset samples with indicators data.
        $newcalculations = array();
        foreach ($this->analyser->get_indicators() as $indicator) {

            // Hook to allow indicators to store analysable-dependant data.
            $indicator->fill_per_analysable_caches($timesplitting->get_analysable());

            // Per-range calculations.
            foreach ($ranges as $rangeindex => $range) {

                // Indicator instances are per-range.
                $rangeindicator = clone $indicator;

                // Previous calculations of this indicator for this range, if any were loaded above.
                $prevcalculations = array();
                if (!empty($existingcalculations[$rangeindex][$rangeindicator->get_id()])) {
                    $prevcalculations = $existingcalculations[$rangeindex][$rangeindicator->get_id()];
                }

                // Calculate the indicator for each sample in this time range.
                list($samplesfeatures, $newindicatorcalculations, $indicatornotnulls) = $rangeindicator->calculate($sampleids,
                    $this->analyser->get_samples_origin(), $range['start'], $range['end'], $prevcalculations);

                // Associate the extra data generated by the indicator to this range index.
                $rangeindicator->save_calculation_info($timesplitting, $rangeindex);

                // Free memory ASAP.
                unset($rangeindicator);
                gc_collect_cycles();
                gc_mem_caches();

                // Copy the features data to the dataset.
                foreach ($samplesfeatures as $analysersampleid => $features) {

                    $uniquesampleid = $timesplitting->append_rangeindex($analysersampleid, $rangeindex);

                    // Remember the samples that have, at least, one indicator flagged as not null.
                    if (!isset($notnulls[$uniquesampleid]) && !empty($indicatornotnulls[$analysersampleid])) {
                        $notnulls[$uniquesampleid] = $uniquesampleid;
                    }

                    // Init the sample if it is still empty.
                    if (!isset($dataset[$uniquesampleid])) {
                        $dataset[$uniquesampleid] = array();
                    }

                    // Append the features indicator features at the end of the sample.
                    $dataset[$uniquesampleid] = array_merge($dataset[$uniquesampleid], $features);
                }

                if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                    $timecreated = time();
                    foreach ($newindicatorcalculations as $sampleid => $calculatedvalue) {
                        // Prepare the new calculations to be stored into DB.

                        $indcalc = new \stdClass();
                        $indcalc->contextid = $timesplitting->get_analysable()->get_context()->id;
                        $indcalc->starttime = $range['start'];
                        $indcalc->endtime = $range['end'];
                        $indcalc->sampleid = $sampleid;
                        $indcalc->sampleorigin = $this->analyser->get_samples_origin();
                        $indcalc->indicator = $indicator->get_id();
                        $indcalc->value = $calculatedvalue;
                        $indcalc->timecreated = $timecreated;
                        $newcalculations[] = $indcalc;
                    }
                }
            }

            // Flush a batch of calculations once all the ranges of this indicator have been calculated.
            if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                $batchsize = self::get_insert_batch_size();
                if (count($newcalculations) > $batchsize) {
                    // We don't want newcalculations array to grow too much as we already keep the
                    // system memory busy storing $dataset contents.

                    // Insert from the beginning: array_splice() leaves the first $batchsize elements in
                    // $newcalculations and returns the rest, which are kept for the next insert.
                    $remaining = array_splice($newcalculations, $batchsize);

                    // Sorry mssql and oracle, this will be slow.
                    $DB->insert_records('analytics_indicator_calc', $newcalculations);
                    $newcalculations = $remaining;
                }
            }
        }

        if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations() && $newcalculations) {
            // Insert the remaining records.
            $DB->insert_records('analytics_indicator_calc', $newcalculations);
        }

        // Delete rows where all calculations are null.
        // We still store the indicator calculation and we still store the sample id as
        // processed so we don't have to process this sample again, but we exclude it
        // from the dataset because it is not useful.
        $nulls = array_diff_key($dataset, $notnulls);
        foreach ($nulls as $uniqueid => $ignoredvalues) {
            unset($dataset[$uniqueid]);
        }

        return $dataset;
    }
 558  
 559      /**
 560       * Adds time range indicators and the target to each sample.
 561       *
 562       * This will identify the sample as belonging to a specific range.
 563       *
 564       * @param \core_analytics\local\time_splitting\base $timesplitting
 565       * @param array $dataset
 566       * @param array|null $calculatedtarget
 567       * @return null
 568       */
 569      protected function fill_dataset(\core_analytics\local\time_splitting\base $timesplitting,
 570              array &$dataset, ?array $calculatedtarget = null) {
 571  
 572          $nranges = count($timesplitting->get_distinct_ranges());
 573  
 574          foreach ($dataset as $uniquesampleid => $unmodified) {
 575  
 576              list($analysersampleid, $rangeindex) = $timesplitting->infer_sample_info($uniquesampleid);
 577  
 578              // No need to add range features if this time splitting method only defines one time range.
 579              if ($nranges > 1) {
 580  
 581                  // 1 column for each range.
 582                  $timeindicators = array_fill(0, $nranges, 0);
 583  
 584                  $timeindicators[$rangeindex] = 1;
 585  
 586                  $dataset[$uniquesampleid] = array_merge($timeindicators, $dataset[$uniquesampleid]);
 587              }
 588  
 589              if ($calculatedtarget) {
 590                  // Add this sampleid's calculated target and the end.
 591                  $dataset[$uniquesampleid][] = $calculatedtarget[$analysersampleid];
 592  
 593              } else {
 594                  // Add this sampleid, it will be used to identify the prediction that comes back from
 595                  // the predictions processor.
 596                  array_unshift($dataset[$uniquesampleid], $uniquesampleid);
 597              }
 598          }
 599      }
 600  
 601      /**
 602       * Updates the analysable analysis time.
 603       *
 604       * @param array $processedanalysables
 605       * @param int $analysableid
 606       * @return null
 607       */
 608      protected function update_analysable_analysed_time(array $processedanalysables, int $analysableid) {
 609          global $DB;
 610  
 611          $now = time();
 612  
 613          if (!empty($processedanalysables[$analysableid])) {
 614              $obj = $processedanalysables[$analysableid];
 615  
 616              $obj->id = $obj->primarykey;
 617              unset($obj->primarykey);
 618  
 619              $obj->timeanalysed = $now;
 620  
 621              $DB->update_record('analytics_used_analysables', $obj);
 622  
 623          } else {
 624  
 625              $obj = new \stdClass();
 626              $obj->modelid = $this->analyser->get_modelid();
 627              $obj->action = ($this->includetarget) ? 'training' : 'prediction';
 628              $obj->analysableid = $analysableid;
 629              $obj->firstanalysis = $now;
 630              $obj->timeanalysed = $now;
 631  
 632              $obj->primarykey = $DB->insert_record('analytics_used_analysables', $obj);
 633  
 634              // Update the cache just in case it is used in the same request.
 635              $key = $this->analyser->get_modelid() . '_' . $analysableid;
 636              $cache = \cache::make('core', 'modelfirstanalyses');
 637              $cache->set($key, $now);
 638          }
 639      }
 640  
 641      /**
 642       * Fills a cache containing the first time each analysable in the provided model was analysed.
 643       *
 644       * @param int $modelid
 645       * @param int|null $analysableid
 646       * @return null
 647       */
 648      public static function fill_firstanalyses_cache(int $modelid, ?int $analysableid = null) {
 649          global $DB;
 650  
 651          // Using composed keys instead of cache $identifiers because of MDL-65358.
 652          $primarykey = $DB->sql_concat($modelid, "'_'", 'analysableid');
 653          $sql = "SELECT $primarykey AS id, MIN(firstanalysis) AS firstanalysis
 654                    FROM {analytics_used_analysables} aua
 655                   WHERE modelid = :modelid";
 656          $params = ['modelid' => $modelid];
 657  
 658          if ($analysableid) {
 659              $sql .= " AND analysableid = :analysableid";
 660              $params['analysableid'] = $analysableid;
 661          }
 662  
 663          $sql .= " GROUP BY modelid, analysableid ORDER BY analysableid";
 664  
 665          $firstanalyses = $DB->get_records_sql($sql, $params);
 666          if ($firstanalyses) {
 667              $cache = \cache::make('core', 'modelfirstanalyses');
 668  
 669              $firstanalyses = array_map(function($record) {
 670                  return $record->firstanalysis;
 671              }, $firstanalyses);
 672  
 673              $cache->set_many($firstanalyses);
 674          }
 675  
 676          return $firstanalyses;
 677      }
 678  
 679      /**
 680       * Adds dataset context info.
 681       *
 682       * The final dataset document will look like this:
 683       * ----------------------------------------------------
 684       * metadata1,metadata2,metadata3,.....
 685       * value1, value2, value3,.....
 686       *
 687       * header1,header2,header3,header4,.....
 688       * stud1value1,stud1value2,stud1value3,stud1value4,.....
 689       * stud2value1,stud2value2,stud2value3,stud2value4,.....
 690       * .....
 691       * ----------------------------------------------------
 692       *
 693       * @param \core_analytics\local\time_splitting\base $timesplitting
 694       * @param array $dataset
 695       * @param \core_analytics\local\target\base $target
 696       * @return null
 697       */
 698      protected function add_context_metadata(\core_analytics\local\time_splitting\base $timesplitting, array &$dataset,
 699              \core_analytics\local\target\base $target) {
 700          $headers = $this->get_headers($timesplitting, $target);
 701  
 702          // This will also reset samples' dataset keys.
 703          array_unshift($dataset, $headers);
 704      }
 705  
 706      /**
 707       * Returns the headers for the csv file based on the indicators and the target.
 708       *
 709       * @param \core_analytics\local\time_splitting\base $timesplitting
 710       * @param \core_analytics\local\target\base $target
 711       * @return string[]
 712       */
 713      public function get_headers(\core_analytics\local\time_splitting\base $timesplitting,
 714              \core_analytics\local\target\base $target): array {
 715          // 3rd column will contain the indicator ids.
 716          $headers = array();
 717  
 718          if (!$this->includetarget) {
 719              // The first column is the sampleid.
 720              $headers[] = 'sampleid';
 721          }
 722  
 723          // We always have 1 column for each time splitting method range, it does not depend on how
 724          // many ranges we calculated.
 725          $ranges = $timesplitting->get_distinct_ranges();
 726          if (count($ranges) > 1) {
 727              foreach ($ranges as $rangeindex) {
 728                  $headers[] = 'range/' . $rangeindex;
 729              }
 730          }
 731  
 732          // Model indicators.
 733          foreach ($this->analyser->get_indicators() as $indicator) {
 734              $headers = array_merge($headers, $indicator::get_feature_headers());
 735          }
 736  
 737          // The target as well.
 738          if ($this->includetarget) {
 739              $headers[] = $target->get_id();
 740          }
 741  
 742          return $headers;
 743      }
 744  
 745      /**
 746       * Filters out samples that have already been used for training.
 747       *
 748       * @param int[] $sampleids
 749       * @param \core_analytics\local\time_splitting\base $timesplitting
 750       * @return  null
 751       */
 752      protected function filter_out_train_samples(array &$sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
 753          global $DB;
 754  
 755          $params = array('modelid' => $this->analyser->get_modelid(), 'analysableid' => $timesplitting->get_analysable()->get_id(),
 756              'timesplitting' => $timesplitting->get_id());
 757  
 758          $trainingsamples = $DB->get_records('analytics_train_samples', $params);
 759  
 760          // Skip each file trained samples.
 761          foreach ($trainingsamples as $trainingfile) {
 762  
 763              $usedsamples = json_decode($trainingfile->sampleids, true);
 764  
 765              if (!empty($usedsamples)) {
 766                  // Reset $sampleids to $sampleids minus this file's $usedsamples.
 767                  $sampleids = array_diff_key($sampleids, $usedsamples);
 768              }
 769          }
 770      }
 771  
 772      /**
 773       * Filters out samples that have already been used for prediction.
 774       *
 775       * @param int[] $sampleids
 776       * @param array $ranges
 777       * @param \core_analytics\local\time_splitting\base $timesplitting
 778       * @return  \stdClass|null The analytics_predict_samples record or null
 779       */
 780      protected function filter_out_prediction_samples_and_ranges(array &$sampleids, array &$ranges,
 781              \core_analytics\local\time_splitting\base $timesplitting) {
 782  
 783          if (count($ranges) > 1) {
 784              throw new \coding_exception('$ranges argument should only contain one range');
 785          }
 786  
 787          $rangeindex = key($ranges);
 788          $predictedrange = $this->get_predict_samples_record($timesplitting, $rangeindex);
 789  
 790          if (!$predictedrange) {
 791              // Nothing to filter out.
 792              return null;
 793          }
 794  
 795          $predictedrange->sampleids = json_decode($predictedrange->sampleids, true);
 796          $missingsamples = array_diff_key($sampleids, $predictedrange->sampleids);
 797          if (count($missingsamples) === 0) {
 798              // All samples already calculated.
 799              unset($ranges[$rangeindex]);
 800              return null;
 801          }
 802  
 803          // Replace the list of samples by the one excluding samples that already got predictions at this range.
 804          $sampleids = $missingsamples;
 805  
 806          return $predictedrange;
 807      }
 808  
 809      /**
 810       * Returns a predict samples record.
 811       *
 812       * @param  \core_analytics\local\time_splitting\base $timesplitting
 813       * @param  int                                       $rangeindex
 814       * @return \stdClass|false
 815       */
 816      private function get_predict_samples_record(\core_analytics\local\time_splitting\base $timesplitting, int $rangeindex) {
 817          global $DB;
 818  
 819          $params = array('modelid' => $this->analyser->get_modelid(), 'analysableid' => $timesplitting->get_analysable()->get_id(),
 820              'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex);
 821          $predictedrange = $DB->get_record('analytics_predict_samples', $params);
 822  
 823          return $predictedrange;
 824      }
 825  
 826      /**
 827       * Saves samples that have just been used for training.
 828       *
 829       * @param int[] $sampleids
 830       * @param \core_analytics\local\time_splitting\base $timesplitting
 831       * @return null
 832       */
 833      protected function save_train_samples(array $sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
 834          global $DB;
 835  
 836          $trainingsamples = new \stdClass();
 837          $trainingsamples->modelid = $this->analyser->get_modelid();
 838          $trainingsamples->analysableid = $timesplitting->get_analysable()->get_id();
 839          $trainingsamples->timesplitting = $timesplitting->get_id();
 840  
 841          $trainingsamples->sampleids = json_encode($sampleids);
 842          $trainingsamples->timecreated = time();
 843  
 844          $DB->insert_record('analytics_train_samples', $trainingsamples);
 845      }
 846  
 847      /**
 848       * Saves samples that have just been used for prediction.
 849       *
 850       * @param int[] $sampleids
 851       * @param array $ranges
 852       * @param \core_analytics\local\time_splitting\base $timesplitting
 853       * @param \stdClass|null $predictsamplesrecord The existing record or null if there is no record yet.
 854       * @return null
 855       */
 856      protected function save_prediction_samples(array $sampleids, array $ranges,
 857              \core_analytics\local\time_splitting\base $timesplitting, ?\stdClass $predictsamplesrecord = null) {
 858          global $DB;
 859  
 860          if (count($ranges) > 1) {
 861              throw new \coding_exception('$ranges argument should only contain one range');
 862          }
 863  
 864          $rangeindex = key($ranges);
 865  
 866          if ($predictsamplesrecord) {
 867              // Append the new samples used for prediction.
 868              $predictsamplesrecord->sampleids = json_encode($predictsamplesrecord->sampleids + $sampleids);
 869              $predictsamplesrecord->timemodified = time();
 870              $DB->update_record('analytics_predict_samples', $predictsamplesrecord);
 871          } else {
 872              $predictsamplesrecord = (object)[
 873                  'modelid' => $this->analyser->get_modelid(),
 874                  'analysableid' => $timesplitting->get_analysable()->get_id(),
 875                  'timesplitting' => $timesplitting->get_id(), 'rangeindex' => $rangeindex
 876              ];
 877              $predictsamplesrecord->sampleids = json_encode($sampleids);
 878              $predictsamplesrecord->timecreated = time();
 879              $predictsamplesrecord->timemodified = $predictsamplesrecord->timecreated;
 880              $DB->insert_record('analytics_predict_samples', $predictsamplesrecord);
 881          }
 882      }
 883  
 884      /**
 885       * Flags the analysable element as in-analysis and stores a lock for it.
 886       *
 887       * @param  string $timesplittingid
 888       * @param  int    $analysableid
 889       * @return bool Success or not
 890       */
 891      private function init_analysable_analysis(string $timesplittingid, int $analysableid) {
 892  
 893          // Do not include $this->includetarget as we don't want the same analysable to be analysed for training
 894          // and prediction at the same time.
 895          $lockkey = 'modelid:' . $this->analyser->get_modelid() . '-analysableid:' . $analysableid .
 896              '-timesplitting:' . self::clean_time_splitting_id($timesplittingid);
 897  
 898          // Large timeout as processes may be quite long.
 899          $lockfactory = \core\lock\lock_config::get_lock_factory('core_analytics');
 900  
 901          // If it is not ready in 10 secs skip this model + analysable + timesplittingmethod combination
 902          // it will attempt it again during next cron run.
 903          if (!$this->lock = $lockfactory->get_lock($lockkey, 10)) {
 904              return false;
 905          }
 906          return true;
 907      }
 908  
 909  
 910      /**
 911       * Remove all possibly problematic chars from the time splitting method id (id = its full class name).
 912       *
 913       * @param string $timesplittingid
 914       * @return string
 915       */
 916      public static function clean_time_splitting_id($timesplittingid) {
 917          $timesplittingid = str_replace('\\', '-', $timesplittingid);
 918          return clean_param($timesplittingid, PARAM_ALPHANUMEXT);
 919      }
 920  
 921      /**
 922       * Mark the currently analysed analysable+timesplitting as analysed.
 923       *
 924       * @return null
 925       */
 926      private function finish_analysable_analysis() {
 927          $this->lock->release();
 928      }
 929  
 930      /**
 931       * Returns the batch size used for insert_records.
 932       *
 933       * This method tries to find the best batch size without getting
 934       * into dml internals. Maximum 1000 records to save memory.
 935       *
 936       * @return int
 937       */
 938      private static function get_insert_batch_size(): int {
 939          global $DB;
 940  
 941          $dbconfig = $DB->export_dbconfig();
 942  
 943          // 500 is pgsql default so using 1000 is fine, no other db driver uses a hardcoded value.
 944          if (empty($dbconfig) || empty($dbconfig->dboptions) || empty($dbconfig->dboptions['bulkinsertsize'])) {
 945              return 1000;
 946          }
 947  
 948          $bulkinsert = $dbconfig->dboptions['bulkinsertsize'];
 949          if ($bulkinsert < 1000) {
 950              return $bulkinsert;
 951          }
 952  
 953          while ($bulkinsert > 1000) {
 954              $bulkinsert = round($bulkinsert / 2, 0);
 955          }
 956  
 957          return (int)$bulkinsert;
 958      }
 959  }