<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle.  If not, see <http://www.gnu.org/licenses/>.

/**
 * Runs an analysis of the site.
 *
 * @package    core_analytics
 * @copyright  2019 David Monllao {@link http://www.davidmonllao.com}
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace core_analytics;

defined('MOODLE_INTERNAL') || die();

/**
 * Runs an analysis of the site.
 *
 * For each analysable provided by the analyser, and for each of the analyser's time splitting
 * methods, this class calculates the indicators (and optionally the target) and hands the
 * resulting dataset to the provided result object.
 *
 * @package    core_analytics
 * @copyright  2019 David Monllao {@link http://www.davidmonllao.com}
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class analysis {

    /**
     * @var \core_analytics\local\analyser\base
     */
    private $analyser;

    /**
     * @var bool Whether to calculate the target or not in this run.
     */
    private $includetarget;

    /**
     * @var \core_analytics\local\analysis\result
     */
    private $result;

    /**
     * @var \core\lock\lock|null The lock held while a model + analysable + time splitting combination is analysed.
     */
    private $lock;

    /**
     * Constructor.
     *
     * @param \core_analytics\local\analyser\base $analyser
     * @param bool $includetarget Whether to calculate the target or not.
     * @param \core_analytics\local\analysis\result $result
     */
    public function __construct(\core_analytics\local\analyser\base $analyser, bool $includetarget,
            \core_analytics\local\analysis\result $result) {
        $this->analyser = $analyser;
        $this->includetarget = $includetarget;
        $this->result = $result;

        // We cache the first time analysables were analysed because time-splitting methods can depend on these info.
        self::fill_firstanalyses_cache($this->analyser->get_modelid());
    }

    /**
     * Runs the analysis.
     *
     * Iterates over the analysables returned by the analyser, processes each of them and, unless
     * running in evaluation mode, records the time they were analysed and stops once the
     * 'modeltimelimit' setting (in seconds) has been reached.
     *
     * @param \context[] $contexts Restrict the analysis to these contexts. The list is passed as-is to the
     *                             analyser; the default is an empty array (no restriction requested).
     * @return null
     */
    public function run(array $contexts = []) {

        $options = $this->analyser->get_options();

        // Time limit control.
        $modeltimelimit = intval(get_config('analytics', 'modeltimelimit'));

        $action = $this->includetarget ? 'training' : 'prediction';
        $analysables = $this->analyser->get_analysables_iterator($action, $contexts);

        $processedanalysables = $this->get_processed_analysables();

        $inittime = microtime(true);
        foreach ($analysables as $analysable) {
            $processed = false;

            if (!$analysable) {
                continue;
            }

            $analysableresults = $this->process_analysable($analysable);
            if ($analysableresults) {
                $processed = $this->result->add_analysable_results($analysableresults);
                if (!$processed) {
                    $errors = [];
                    foreach ($analysableresults as $timesplittingid => $result) {
                        $str = '';
                        if (count($analysableresults) > 1) {
                            $str .= $timesplittingid . ': ';
                        }
                        // Results retrieved from the cache do not carry a message.
                        $str .= $result->message ?? '';
                        $errors[] = $str;
                    }

                    $a = new \stdClass();
                    $a->analysableid = $analysable->get_name();
                    $a->errors = implode(', ', $errors);
                    $this->analyser->add_log(get_string('analysablenotused', 'analytics', $a));
                }
            }

            if (!$options['evaluation']) {

                if (empty($processedanalysables[$analysable->get_id()]) ||
                        $this->analyser->get_target()->always_update_analysis_time() || $processed) {
                    // We store the list of processed analysables even if the target does not always_update_analysis_time(),
                    // what always_update_analysis_time controls is the update of the data.
                    $this->update_analysable_analysed_time($processedanalysables, $analysable->get_id());
                }

                // Apply time limit.
                $timespent = microtime(true) - $inittime;
                if ($modeltimelimit <= $timespent) {
                    break;
                }
            }
        }

        // Force GC to clean up the indicator instances used during the last iteration.
        $this->analyser->instantiate_indicators();
    }

    /**
     * Get analysables that have been already processed.
     *
     * Only the records matching this analysis' model and action (training or prediction) are returned.
     *
     * @return \stdClass[] Records indexed by analysable id; the record id is exposed as 'primarykey'.
     */
    protected function get_processed_analysables(): array {
        global $DB;

        $params = [
            'modelid' => $this->analyser->get_modelid(),
            'action' => ($this->includetarget) ? 'training' : 'prediction',
        ];
        $select = 'modelid = :modelid and action = :action';

        // Weird select fields ordering for performance (analysableid key matching, analysableid is also unique by modelid).
        return $DB->get_records_select('analytics_used_analysables', $select,
            $params, 'timeanalysed DESC', 'analysableid, modelid, action, firstanalysis, timeanalysed, id AS primarykey');
    }

    /**
     * Processes an analysable
     *
     * This method returns the general analysable status, an array of files by time splitting method and
     * an error message if there is any problem.
     *
     * An empty array is returned (and a log entry is added to the analyser) when the analysable
     * is not valid for the target.
     *
     * @param \core_analytics\analysable $analysable
     * @return \stdClass[] Results objects by time splitting method
     */
    public function process_analysable(\core_analytics\analysable $analysable): array {

        // Target instances scope is per-analysable (it can't be lower as calculations run once per
        // analysable, not time splitting method nor time range).
        $analysertarget = $this->analyser->get_target();
        $target = $analysertarget::instance();

        // We need to check that the analysable is valid for the target even if we don't include targets
        // as we still need to discard invalid analysables for the target.
        $isvalidresult = $target->is_valid_analysable($analysable, $this->includetarget);
        if ($isvalidresult !== true) {
            $a = new \stdClass();
            $a->analysableid = $analysable->get_name();
            $a->result = $isvalidresult;
            $this->analyser->add_log(get_string('analysablenotvalidfortarget', 'analytics', $a));
            return [];
        }

        // Process all provided time splitting methods.
        $results = [];
        foreach ($this->analyser->get_timesplittings() as $timesplitting) {

            $cachedresult = $this->result->retrieve_cached_result($timesplitting, $analysable);
            if ($cachedresult) {
                // Cached results only carry the result itself (no status nor message).
                $result = new \stdClass();
                $result->result = $cachedresult;
                $results[$timesplitting->get_id()] = $result;
                continue;
            }

            $results[$timesplitting->get_id()] = $this->process_time_splitting($timesplitting, $analysable, $target);
        }

        return $results;
    }

    /**
     * Processes the analysable samples using the provided time splitting method.
     *
     * The returned object always contains 'status' and 'message'; 'result' is only set when the
     * analysis succeeded. A lock is held from the moment the analysis is flagged as in progress
     * until this method returns or throws.
     *
     * @throws \moodle_exception If the result object could not format (write) the dataset.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\analysable $analysable
     * @param \core_analytics\local\target\base $target
     * @return \stdClass Results object.
     */
    protected function process_time_splitting(\core_analytics\local\time_splitting\base $timesplitting,
            \core_analytics\analysable $analysable, \core_analytics\local\target\base $target): \stdClass {

        $options = $this->analyser->get_options();

        $result = new \stdClass();

        $timesplitting->set_modelid($this->analyser->get_modelid());
        if (!$timesplitting->is_valid_analysable($analysable)) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('invalidanalysablefortimesplitting', 'analytics',
                $timesplitting->get_name());
            return $result;
        }
        $timesplitting->set_analysable($analysable);

        if (CLI_SCRIPT && !PHPUNIT_TEST) {
            mtrace('Analysing id "' . $analysable->get_id() . '" with "' . $timesplitting->get_name() .
                '" time splitting method...');
        }

        // What is a sample is defined by the analyser, it can be an enrolment, a course, a user, a question
        // attempt... it is on what we will base indicators calculations.
        [$sampleids, $samplesdata] = $this->analyser->get_all_samples($analysable);

        if (count($sampleids) === 0) {
            $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
            $result->message = get_string('nodata', 'analytics');
            return $result;
        }

        if ($this->includetarget) {
            // All ranges are used when we are calculating data for training.
            $ranges = $timesplitting->get_training_ranges();
        } else {
            // The latest range that has not yet been used for prediction (it depends on the time range where we are right now).
            $ranges = $timesplitting->get_most_recent_prediction_range();
        }

        // The existing analytics_predict_samples record, only available when predicting outside of evaluation mode.
        $predictsamplesrecord = null;

        // There is no need to keep track of the evaluated samples and ranges as we always evaluate the whole dataset.
        if ($options['evaluation'] === false) {

            if (empty($ranges)) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('noranges', 'analytics');
                return $result;
            }

            // We skip all samples that are already part of a training dataset, even if they have not been used for prediction.
            if (!$target::based_on_assumptions()) {
                // Targets based on assumptions can not be trained.
                $this->filter_out_train_samples($sampleids, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            // Only when processing data for predictions.
            if (!$this->includetarget) {
                // We also filter out samples and ranges that have already been used for predictions.
                $predictsamplesrecord = $this->filter_out_prediction_samples_and_ranges($sampleids, $ranges, $timesplitting);
            }

            if (count($sampleids) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewdata', 'analytics');
                return $result;
            }

            if (count($ranges) === 0) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('nonewranges', 'analytics');
                return $result;
            }
        }

        // Flag the model + analysable + timesplitting as being analysed (prevent concurrent executions).
        if (!$this->init_analysable_analysis($timesplitting->get_id(), $analysable->get_id())) {
            // If this model + analysable + timesplitting combination is being analysed we skip this process.
            $result->status = \core_analytics\model::NO_DATASET;
            $result->message = get_string('analysisinprogress', 'analytics');
            return $result;
        }

        // From here on we hold the lock: the finally block guarantees it is released on every
        // exit path, no matter whether we return a result or an exception is thrown.
        try {
            // Remove samples the target consider invalid.
            $target->add_sample_data($samplesdata);
            $target->filter_out_invalid_samples($sampleids, $analysable, $this->includetarget);

            if (!$sampleids) {
                $result->status = \core_analytics\model::NO_DATASET;
                $result->message = get_string('novalidsamples', 'analytics');
                return $result;
            }

            // Instantiate empty indicators to ensure that no garbage is dragged from previous analyses.
            $indicators = $this->analyser->instantiate_indicators();
            foreach ($indicators as $indicator) {
                // The analyser attaches the main entities the sample depends on and are provided to the
                // indicator to calculate the sample.
                $indicator->add_sample_data($samplesdata);
            }

            // Here we start the memory intensive process that will last until $data var is
            // unset (until the method is finished basically).
            $data = $this->calculate($timesplitting, $sampleids, $ranges, $target);

            if (!$data) {
                $result->status = \core_analytics\model::ANALYSABLE_REJECTED_TIME_SPLITTING_METHOD;
                $result->message = get_string('novaliddata', 'analytics');
                return $result;
            }

            // No need to keep track of analysed stuff when evaluating.
            if ($options['evaluation'] === false) {
                // Save the samples that have been already analysed so they are not analysed again in future.

                if ($this->includetarget) {
                    $this->save_train_samples($sampleids, $timesplitting);
                } else {
                    // The variable $predictsamplesrecord holds whatever filter_out_prediction_samples_and_ranges
                    // returned above (no evaluation mode and no includetarget), it can be null.
                    $this->save_prediction_samples($sampleids, $ranges, $timesplitting, $predictsamplesrecord);
                }
            }

            // We need to pass all the analysis data.
            $formattedresult = $this->result->format_result($data, $target, $timesplitting, $analysable);

            if (!$formattedresult) {
                throw new \moodle_exception('errorcannotwritedataset', 'analytics');
            }

            $result->status = \core_analytics\model::OK;
            $result->message = get_string('successfullyanalysed', 'analytics');
            $result->result = $formattedresult;

            return $result;

        } finally {
            // Flag the model + analysable + timesplitting as analysed.
            $this->finish_analysable_analysis();
        }
    }

    /**
     * Calculates indicators and targets.
     *
     * When the target is included, $sampleids is reduced (by reference) to the samples whose
     * target could be calculated.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $sampleids
     * @param array $ranges
     * @param \core_analytics\local\target\base $target
     * @return array|null The dataset (headers in the first row) or null if there is nothing to return.
     */
    public function calculate(\core_analytics\local\time_splitting\base $timesplitting, array &$sampleids,
            array $ranges, \core_analytics\local\target\base $target): ?array {

        $calculatedtarget = null;
        if ($this->includetarget) {
            // We first calculate the target because analysable data may still be invalid or none
            // of the analysable samples may be valid.
            $calculatedtarget = $target->calculate($sampleids, $timesplitting->get_analysable());

            // We remove samples we can not calculate their target. Using isset() a sample with a null
            // target and a sample without any target entry are both discarded, without raising warnings.
            $sampleids = array_filter($sampleids, function($sampleid) use ($calculatedtarget) {
                return isset($calculatedtarget[$sampleid]);
            });
        }

        // No need to continue calculating if the target couldn't be calculated for any sample.
        if (empty($sampleids)) {
            return null;
        }

        $dataset = $this->calculate_indicators($timesplitting, $sampleids, $ranges);

        if (empty($dataset)) {
            return null;
        }

        // Now that we have the indicators in place we can add the time range indicators (and target if provided) to each of them.
        $this->fill_dataset($timesplitting, $dataset, $calculatedtarget);

        $this->add_context_metadata($timesplitting, $dataset, $target);

        if (!PHPUNIT_TEST && CLI_SCRIPT) {
            echo PHP_EOL;
        }

        return $dataset;
    }

    /**
     * Calculates indicators.
     *
     * New indicator calculations are stored in the analytics_indicator_calc table unless we are
     * in evaluation mode or the time splitting method does not cache indicator calculations.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $sampleids
     * @param array $ranges
     * @return array The samples features indexed by unique sample id (sample id + range index).
     */
    protected function calculate_indicators(\core_analytics\local\time_splitting\base $timesplitting, array $sampleids,
            array $ranges): array {
        global $DB;

        $options = $this->analyser->get_options();

        $dataset = [];

        // Faster to run 1 db query per range.
        $existingcalculations = [];
        if ($timesplitting->cache_indicator_calculations()) {
            foreach ($ranges as $rangeindex => $range) {
                // Load existing calculations.
                $existingcalculations[$rangeindex] = \core_analytics\manager::get_indicator_calculations(
                    $timesplitting->get_analysable(), $range['start'], $range['end'], $this->analyser->get_samples_origin());
            }
        }

        // Here we store samples which calculations are not all null.
        $notnulls = [];

        // Fill the dataset samples with indicators data.
        $newcalculations = [];
        foreach ($this->analyser->get_indicators() as $indicator) {

            // Hook to allow indicators to store analysable-dependant data.
            $indicator->fill_per_analysable_caches($timesplitting->get_analysable());

            // Per-range calculations.
            foreach ($ranges as $rangeindex => $range) {

                // Indicator instances are per-range.
                $rangeindicator = clone $indicator;

                $prevcalculations = [];
                if (!empty($existingcalculations[$rangeindex][$rangeindicator->get_id()])) {
                    $prevcalculations = $existingcalculations[$rangeindex][$rangeindicator->get_id()];
                }

                // Calculate the indicator for each sample in this time range.
                [$samplesfeatures, $newindicatorcalculations, $indicatornotnulls] = $rangeindicator->calculate($sampleids,
                    $this->analyser->get_samples_origin(), $range['start'], $range['end'], $prevcalculations);

                // Associate the extra data generated by the indicator to this range index.
                $rangeindicator->save_calculation_info($timesplitting, $rangeindex);

                // Free memory ASAP.
                unset($rangeindicator);
                gc_collect_cycles();
                gc_mem_caches();

                // Copy the features data to the dataset.
                foreach ($samplesfeatures as $analysersampleid => $features) {

                    $uniquesampleid = $timesplitting->append_rangeindex($analysersampleid, $rangeindex);

                    if (!isset($notnulls[$uniquesampleid]) && !empty($indicatornotnulls[$analysersampleid])) {
                        $notnulls[$uniquesampleid] = $uniquesampleid;
                    }

                    // Init the sample if it is still empty.
                    if (!isset($dataset[$uniquesampleid])) {
                        $dataset[$uniquesampleid] = [];
                    }

                    // Append the features indicator features at the end of the sample.
                    $dataset[$uniquesampleid] = array_merge($dataset[$uniquesampleid], $features);
                }

                if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                    $timecreated = time();
                    foreach ($newindicatorcalculations as $sampleid => $calculatedvalue) {
                        // Prepare the new calculations to be stored into DB.

                        $indcalc = new \stdClass();
                        $indcalc->contextid = $timesplitting->get_analysable()->get_context()->id;
                        $indcalc->starttime = $range['start'];
                        $indcalc->endtime = $range['end'];
                        $indcalc->sampleid = $sampleid;
                        $indcalc->sampleorigin = $this->analyser->get_samples_origin();
                        $indcalc->indicator = $indicator->get_id();
                        $indcalc->value = $calculatedvalue;
                        $indcalc->timecreated = $timecreated;
                        $newcalculations[] = $indcalc;
                    }
                }
            }

            if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations()) {
                $batchsize = self::get_insert_batch_size();
                if (count($newcalculations) > $batchsize) {
                    // We don't want newcalculations array to grow too much as we already keep the
                    // system memory busy storing $dataset contents.

                    // Insert from the beginning: array_splice leaves the first $batchsize records in
                    // $newcalculations and returns the rest.
                    $remaining = array_splice($newcalculations, $batchsize);

                    // Sorry mssql and oracle, this will be slow.
                    $DB->insert_records('analytics_indicator_calc', $newcalculations);
                    $newcalculations = $remaining;
                }
            }
        }

        if (!$options['evaluation'] && $timesplitting->cache_indicator_calculations() && $newcalculations) {
            // Insert the remaining records.
            $DB->insert_records('analytics_indicator_calc', $newcalculations);
        }

        // Delete rows where all calculations are null.
        // We still store the indicator calculation and we still store the sample id as
        // processed so we don't have to process this sample again, but we exclude it
        // from the dataset because it is not useful.
        $nulls = array_diff_key($dataset, $notnulls);
        foreach ($nulls as $uniqueid => $ignoredvalues) {
            unset($dataset[$uniqueid]);
        }

        return $dataset;
    }

    /**
     * Adds time range indicators and the target to each sample.
     *
     * This will identify the sample as belonging to a specific range.
     *
     * When a calculated target is provided it is appended at the end of each sample, otherwise the
     * unique sample id is prepended to each sample.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $dataset
     * @param array|null $calculatedtarget
     * @return null
     */
    protected function fill_dataset(\core_analytics\local\time_splitting\base $timesplitting,
            array &$dataset, ?array $calculatedtarget = null) {

        $nranges = count($timesplitting->get_distinct_ranges());

        foreach ($dataset as $uniquesampleid => $unmodified) {

            [$analysersampleid, $rangeindex] = $timesplitting->infer_sample_info($uniquesampleid);

            // No need to add range features if this time splitting method only defines one time range.
            if ($nranges > 1) {

                // 1 column for each range.
                $timeindicators = array_fill(0, $nranges, 0);

                $timeindicators[$rangeindex] = 1;

                $dataset[$uniquesampleid] = array_merge($timeindicators, $dataset[$uniquesampleid]);
            }

            if ($calculatedtarget) {
                // Add this sampleid's calculated target and the end.
                $dataset[$uniquesampleid][] = $calculatedtarget[$analysersampleid];

            } else {
                // Add this sampleid, it will be used to identify the prediction that comes back from
                // the predictions processor.
                array_unshift($dataset[$uniquesampleid], $uniquesampleid);
            }
        }
    }

    /**
     * Updates the analysable analysis time.
     *
     * Updates the existing analytics_used_analysables record if the analysable is part of
     * $processedanalysables, otherwise a new record is inserted and the first analysis cache is updated.
     *
     * @param array $processedanalysables Records returned by get_processed_analysables().
     * @param int $analysableid
     * @return null
     */
    protected function update_analysable_analysed_time(array $processedanalysables, int $analysableid) {
        global $DB;

        $now = time();

        if (!empty($processedanalysables[$analysableid])) {
            // Work on a copy so the caller's record keeps its 'primarykey' attribute.
            $obj = clone $processedanalysables[$analysableid];

            $obj->id = $obj->primarykey;
            unset($obj->primarykey);

            $obj->timeanalysed = $now;

            $DB->update_record('analytics_used_analysables', $obj);

        } else {

            $obj = new \stdClass();
            $obj->modelid = $this->analyser->get_modelid();
            $obj->action = ($this->includetarget) ? 'training' : 'prediction';
            $obj->analysableid = $analysableid;
            $obj->firstanalysis = $now;
            $obj->timeanalysed = $now;

            $obj->primarykey = $DB->insert_record('analytics_used_analysables', $obj);

            // Update the cache just in case it is used in the same request.
            $key = $this->analyser->get_modelid() . '_' . $analysableid;
            $cache = \cache::make('core', 'modelfirstanalyses');
            $cache->set($key, $now);
        }
    }

    /**
     * Fills a cache containing the first time each analysable in the provided model was analysed.
     *
     * @param int $modelid
     * @param int|null $analysableid Restrict the query to this analysable.
     * @return array The first analysis timestamps indexed by "{modelid}_{analysableid}", empty if there are no records.
     */
    public static function fill_firstanalyses_cache(int $modelid, ?int $analysableid = null) {
        global $DB;

        // Using composed keys instead of cache $identifiers because of MDL-65358.
        $primarykey = $DB->sql_concat($modelid, "'_'", 'analysableid');
        $sql = "SELECT $primarykey AS id, MIN(firstanalysis) AS firstanalysis
                  FROM {analytics_used_analysables} aua
                 WHERE modelid = :modelid";
        $params = ['modelid' => $modelid];

        if ($analysableid) {
            $sql .= " AND analysableid = :analysableid";
            $params['analysableid'] = $analysableid;
        }

        $sql .= " GROUP BY modelid, analysableid ORDER BY analysableid";

        $firstanalyses = $DB->get_records_sql($sql, $params);
        if ($firstanalyses) {
            $cache = \cache::make('core', 'modelfirstanalyses');

            // Keys are preserved, we only replace each record by its firstanalysis value.
            $firstanalyses = array_map(function($record) {
                return $record->firstanalysis;
            }, $firstanalyses);

            $cache->set_many($firstanalyses);
        }

        return $firstanalyses;
    }

    /**
     * Adds dataset context info.
     *
     * The final dataset document will look like this:
     * ----------------------------------------------------
     * metadata1,metadata2,metadata3,.....
     * value1, value2, value3,.....
     *
     * header1,header2,header3,header4,.....
     * stud1value1,stud1value2,stud1value3,stud1value4,.....
     * stud2value1,stud2value2,stud2value3,stud2value4,.....
     * .....
     * ----------------------------------------------------
     *
     * This method only prepends the headers row to the dataset.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param array $dataset
     * @param \core_analytics\local\target\base $target
     * @return null
     */
    protected function add_context_metadata(\core_analytics\local\time_splitting\base $timesplitting, array &$dataset,
            \core_analytics\local\target\base $target) {
        $headers = $this->get_headers($timesplitting, $target);

        // This will also reset samples' dataset keys.
        array_unshift($dataset, $headers);
    }

    /**
     * Returns the headers for the csv file based on the indicators and the target.
     *
     * Columns order: the sample id (only when the target is not included), one column per range
     * (only if the time splitting method has more than one range), the indicators feature
     * headers and finally the target id (only when the target is included).
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \core_analytics\local\target\base $target
     * @return string[]
     */
    public function get_headers(\core_analytics\local\time_splitting\base $timesplitting,
            \core_analytics\local\target\base $target): array {
        $headers = [];

        if (!$this->includetarget) {
            // The first column is the sampleid.
            $headers[] = 'sampleid';
        }

        // We always have 1 column for each time splitting method range, it does not depend on how
        // many ranges we calculated.
        $ranges = $timesplitting->get_distinct_ranges();
        if (count($ranges) > 1) {
            foreach ($ranges as $rangeindex) {
                $headers[] = 'range/' . $rangeindex;
            }
        }

        // Model indicators.
        foreach ($this->analyser->get_indicators() as $indicator) {
            $headers = array_merge($headers, $indicator::get_feature_headers());
        }

        // The target as well.
        if ($this->includetarget) {
            $headers[] = $target->get_id();
        }

        return $headers;
    }

    /**
     * Filters out samples that have already been used for training.
     *
     * @param int[] $sampleids Sample ids (as array keys), updated by reference.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return null
     */
    protected function filter_out_train_samples(array &$sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
        global $DB;

        $params = [
            'modelid' => $this->analyser->get_modelid(),
            'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(),
        ];

        $trainingsamples = $DB->get_records('analytics_train_samples', $params);

        // Skip each file trained samples.
        foreach ($trainingsamples as $trainingfile) {

            $usedsamples = json_decode($trainingfile->sampleids, true);

            // Records whose sampleids can not be decoded to a non-empty array are ignored.
            if (!empty($usedsamples) && is_array($usedsamples)) {
                // Reset $sampleids to $sampleids minus this file's $usedsamples.
                $sampleids = array_diff_key($sampleids, $usedsamples);
            }
        }
    }

    /**
     * Filters out samples that have already been used for prediction.
     *
     * If all the samples already got a prediction for the provided range, the range is removed
     * from $ranges and null is returned.
     *
     * @throws \coding_exception If $ranges contains more than one range.
     * @param int[] $sampleids Sample ids (as array keys), updated by reference.
     * @param array $ranges The range to predict, updated by reference.
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return \stdClass|null The analytics_predict_samples record (with sampleids decoded to an array) or null
     */
    protected function filter_out_prediction_samples_and_ranges(array &$sampleids, array &$ranges,
            \core_analytics\local\time_splitting\base $timesplitting) {

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);
        $predictedrange = $this->get_predict_samples_record($timesplitting, $rangeindex);

        if (!$predictedrange) {
            // Nothing to filter out.
            return null;
        }

        $predictedsampleids = json_decode($predictedrange->sampleids, true);
        if (!is_array($predictedsampleids)) {
            // The stored list can not be decoded, we consider that no samples got predictions at this range.
            $predictedsampleids = [];
        }
        $predictedrange->sampleids = $predictedsampleids;

        $missingsamples = array_diff_key($sampleids, $predictedrange->sampleids);
        if (count($missingsamples) === 0) {
            // All samples already calculated.
            unset($ranges[$rangeindex]);
            return null;
        }

        // Replace the list of samples by the one excluding samples that already got predictions at this range.
        $sampleids = $missingsamples;

        return $predictedrange;
    }

    /**
     * Returns a predict samples record.
     *
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param int $rangeindex
     * @return \stdClass|false
     */
    private function get_predict_samples_record(\core_analytics\local\time_splitting\base $timesplitting, int $rangeindex) {
        global $DB;

        $params = [
            'modelid' => $this->analyser->get_modelid(),
            'analysableid' => $timesplitting->get_analysable()->get_id(),
            'timesplitting' => $timesplitting->get_id(),
            'rangeindex' => $rangeindex,
        ];

        return $DB->get_record('analytics_predict_samples', $params);
    }

    /**
     * Saves samples that have just been used for training.
     *
     * @param int[] $sampleids
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @return null
     */
    protected function save_train_samples(array $sampleids, \core_analytics\local\time_splitting\base $timesplitting) {
        global $DB;

        $trainingsamples = new \stdClass();
        $trainingsamples->modelid = $this->analyser->get_modelid();
        $trainingsamples->analysableid = $timesplitting->get_analysable()->get_id();
        $trainingsamples->timesplitting = $timesplitting->get_id();

        $trainingsamples->sampleids = json_encode($sampleids);
        $trainingsamples->timecreated = time();

        $DB->insert_record('analytics_train_samples', $trainingsamples);
    }

    /**
     * Saves samples that have just been used for prediction.
     *
     * @throws \coding_exception If $ranges contains more than one range.
     * @param int[] $sampleids
     * @param array $ranges
     * @param \core_analytics\local\time_splitting\base $timesplitting
     * @param \stdClass|null $predictsamplesrecord The existing record or null if there is no record yet.
     * @return null
     */
    protected function save_prediction_samples(array $sampleids, array $ranges,
            \core_analytics\local\time_splitting\base $timesplitting, ?\stdClass $predictsamplesrecord = null) {
        global $DB;

        if (count($ranges) > 1) {
            throw new \coding_exception('$ranges argument should only contain one range');
        }

        $rangeindex = key($ranges);

        if ($predictsamplesrecord) {
            // Append the new samples used for prediction.
            $predictsamplesrecord->sampleids = json_encode($predictsamplesrecord->sampleids + $sampleids);
            $predictsamplesrecord->timemodified = time();
            $DB->update_record('analytics_predict_samples', $predictsamplesrecord);
        } else {
            $predictsamplesrecord = (object)[
                'modelid' => $this->analyser->get_modelid(),
                'analysableid' => $timesplitting->get_analysable()->get_id(),
                'timesplitting' => $timesplitting->get_id(),
                'rangeindex' => $rangeindex,
            ];
            $predictsamplesrecord->sampleids = json_encode($sampleids);
            $predictsamplesrecord->timecreated = time();
            $predictsamplesrecord->timemodified = $predictsamplesrecord->timecreated;
            $DB->insert_record('analytics_predict_samples', $predictsamplesrecord);
        }
    }

    /**
     * Flags the analysable element as in-analysis and stores a lock for it.
     *
     * @param string $timesplittingid
     * @param int $analysableid
     * @return bool Success or not
     */
    private function init_analysable_analysis(string $timesplittingid, int $analysableid) {

        // Do not include $this->includetarget as we don't want the same analysable to be analysed for training
        // and prediction at the same time.
        $lockkey = 'modelid:' . $this->analyser->get_modelid() . '-analysableid:' . $analysableid .
            '-timesplitting:' . self::clean_time_splitting_id($timesplittingid);

        $lockfactory = \core\lock\lock_config::get_lock_factory('core_analytics');

        // If it is not ready in 10 secs skip this model + analysable + timesplittingmethod combination
        // it will attempt it again during next cron run.
        $lock = $lockfactory->get_lock($lockkey, 10);
        if (!$lock) {
            $this->lock = null;
            return false;
        }

        $this->lock = $lock;
        return true;
    }


    /**
     * Remove all possibly problematic chars from the time splitting method id (id = its full class name).
     *
     * @param string $timesplittingid
     * @return string
     */
    public static function clean_time_splitting_id($timesplittingid) {
        $timesplittingid = str_replace('\\', '-', $timesplittingid);
        return clean_param($timesplittingid, PARAM_ALPHANUMEXT);
    }

    /**
     * Mark the currently analysed analysable+timesplitting as analysed.
     *
     * Releases the lock obtained by init_analysable_analysis(), it does nothing if no lock is held.
     *
     * @return null
     */
    private function finish_analysable_analysis() {
        if ($this->lock) {
            $this->lock->release();
            // Forget the released lock so it can not be released twice.
            $this->lock = null;
        }
    }

    /**
     * Returns the batch size used for insert_records.
     *
     * This method tries to find the best batch size without getting
     * into dml internals. Maximum 1000 records to save memory.
     *
     * @return int
     */
    private static function get_insert_batch_size(): int {
        global $DB;

        $dbconfig = $DB->export_dbconfig();

        // 500 is pgsql default so using 1000 is fine, no other db driver uses a hardcoded value.
        if (empty($dbconfig) || empty($dbconfig->dboptions) || empty($dbconfig->dboptions['bulkinsertsize'])) {
            return 1000;
        }

        $bulkinsert = (int)$dbconfig->dboptions['bulkinsertsize'];
        if ($bulkinsert < 1) {
            // Not a usable value (non-numeric or negative), use the default.
            return 1000;
        }

        // Halve the configured value until it fits into the 1000 records limit.
        while ($bulkinsert > 1000) {
            $bulkinsert = (int)round($bulkinsert / 2, 0);
        }

        return $bulkinsert;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body