<?php
// This file is part of Moodle - http://moodle.org/
//
// Moodle is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Moodle is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * Datasets manager.
 *
 * @package   core_analytics
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

namespace core_analytics;

defined('MOODLE_INTERNAL') || die();

/**
 * Datasets manager.
 *
 * Stores, retrieves, merges and deletes the CSV dataset files of a model. All files live in the
 * system context under the 'analytics' component, using the model id as item id. Two kinds of
 * file paths are used:
 *  - /analysable/{analysableid}/{timesplittingid}/ for per-analysable files (see self::store).
 *  - /timesplitting/{timesplittingid}/ for merged files (see self::merge_datasets).
 *
 * Every dataset file starts with 3 header lines: variable names, variable values and column names.
 *
 * @package   core_analytics
 * @copyright 2016 David Monllao {@link http://www.davidmonllao.com}
 * @license   http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
class dataset_manager {

    /**
     * File area for labelled datasets.
     */
    const LABELLED_FILEAREA = 'labelled';

    /**
     * File area for unlabelled datasets.
     */
    const UNLABELLED_FILEAREA = 'unlabelled';

    /**
     * File area for exported datasets.
     */
    const EXPORT_FILEAREA = 'export';

    /**
     * Evaluation file file name.
     */
    const EVALUATION_FILENAME = 'evaluation.csv';

    /**
     * The model id.
     *
     * @var int
     */
    protected $modelid;

    /**
     * Range processor in use.
     *
     * @var string
     */
    protected $timesplittingid;

    /**
     * The analysable id, used as part of the stored file path.
     *
     * @var int
     */
    protected $analysableid;

    /**
     * Whether this is a dataset for evaluation or not.
     *
     * @var bool
     */
    protected $evaluation;

    /**
     * The dataset filearea. Must be one of the self::*_FILEAREA options.
     *
     * @var string
     */
    protected $filearea;

    /**
     * Constructor method.
     *
     * @throws \coding_exception If $filearea is not one of the self::*_FILEAREA constants.
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @param string $filearea One of the self::*_FILEAREA constants.
     * @param bool $evaluation
     * @return void
     */
    public function __construct($modelid, $analysableid, $timesplittingid, $filearea, $evaluation = false) {

        $validfileareas = [self::EXPORT_FILEAREA, self::LABELLED_FILEAREA, self::UNLABELLED_FILEAREA];
        if (!in_array($filearea, $validfileareas, true)) {
            throw new \coding_exception('Invalid provided filearea');
        }

        $this->modelid = $modelid;
        $this->analysableid = $analysableid;
        $this->timesplittingid = $timesplittingid;
        $this->filearea = $filearea;
        $this->evaluation = $evaluation;
    }

    /**
     * Store the dataset in the internal file system.
     *
     * The data is written as CSV to a temporary file in the request directory and then added
     * to the file storage under /analysable/{analysableid}/{timesplittingid}/.
     *
     * @param array $data List of rows, each row being the list of fields of a CSV line.
     * @return \stored_file|false False if the temporary file could not be opened for writing.
     */
    public function store($data) {

        $fs = get_file_storage();

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $this->filearea,
            'itemid' => $this->modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/analysable/' . $this->analysableid . '/' .
                \core_analytics\analysis::clean_time_splitting_id($this->timesplittingid) . '/',
            'filename' => self::get_filename($this->evaluation)
        ];

        // Delete previous and old (we already checked that previous copies are not recent) evaluation files for this analysable.
        if ($this->evaluation) {
            // The select is an SQL fragment that file_storage appends to its itemid test, which is why it starts with
            // the operator. The item id is embedded in the SQL so we force it to be an integer.
            $select = " = " . (int)$filerecord['itemid'] . " AND filepath = :filepath";
            $fs->delete_area_files_select($filerecord['contextid'], $filerecord['component'], $filerecord['filearea'],
                $select, array('filepath' => $filerecord['filepath']));
        }

        // Write all this stuff to a tmp file.
        $filepath = make_request_directory() . DIRECTORY_SEPARATOR . $filerecord['filename'];
        $fh = fopen($filepath, 'w+');
        if (!$fh) {
            return false;
        }
        foreach ($data as $line) {
            fputcsv($fh, $line);
        }
        fclose($fh);

        return $fs->create_file_from_pathname($filerecord, $filepath);
    }

    /**
     * Returns the previous evaluation file.
     *
     * Important to note that this is per modelid + timesplittingid, when dealing with multiple
     * analysables this is the merged file. Do not confuse with self::get_evaluation_analysable_file
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false Falsy value if there is no such file (see self::delete_previous_evaluation_file).
     */
    public static function get_previous_evaluation_file($modelid, $timesplittingid) {
        $fs = get_file_storage();
        // Evaluation data is always labelled.
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
        return $fs->get_file(\context_system::instance()->id, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, self::EVALUATION_FILENAME);
    }

    /**
     * Gets the list of files that couldn't be previously used for training and prediction.
     *
     * Files listed in analytics_used_files for this model with the matching action ('trained' for
     * labelled data, 'predicted' for unlabelled data) are excluded, as well as evaluation files
     * and directories.
     *
     * @param int $modelid
     * @param bool $includetarget True to look for labelled files, false to look for unlabelled ones.
     * @param string[] $timesplittingids
     * @return array Lists of \stored_file indexed by time splitting id. Time splitting methods without
     *               pending files are not included.
     */
    public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
        global $DB;

        $fs = get_file_storage();

        if ($includetarget) {
            $filearea = self::LABELLED_FILEAREA;
            $usedfileaction = 'trained';
        } else {
            $filearea = self::UNLABELLED_FILEAREA;
            $usedfileaction = 'predicted';
        }

        $select = 'modelid = :modelid AND action = :action';
        $params = array('modelid' => $modelid, 'action' => $usedfileaction);
        // Flipped so we can check if a file was used through the array keys instead of scanning the whole
        // list for each file. Numeric string and integer ids map to the same array key.
        $usedfileids = array_flip($DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params));

        // Very likely that we will only have 1 time splitting method here.
        $filesbytimesplitting = array();
        foreach ($timesplittingids as $timesplittingid) {

            $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
            $files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
            foreach ($files as $file) {

                // Discard evaluation files.
                if ($file->get_filename() === self::EVALUATION_FILENAME) {
                    continue;
                }

                // No dirs.
                if ($file->is_directory()) {
                    continue;
                }

                // Already used for training or prediction.
                if (isset($usedfileids[$file->get_id()])) {
                    continue;
                }

                $filesbytimesplitting[$timesplittingid][] = $file;
            }
        }

        return $filesbytimesplitting;
    }

    /**
     * Deletes previous evaluation files of this model.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return bool True if there was a file to delete, false otherwise.
     */
    public static function delete_previous_evaluation_file($modelid, $timesplittingid) {
        $file = self::get_previous_evaluation_file($modelid, $timesplittingid);
        if (!$file) {
            return false;
        }

        $file->delete();
        return true;
    }

    /**
     * Returns this (model + analysable + time splitting) file.
     *
     * @param int $modelid
     * @param int $analysableid
     * @param string $timesplittingid
     * @return \stored_file|false Whatever file_storage::get_file returns for that location.
     */
    public static function get_evaluation_analysable_file($modelid, $analysableid, $timesplittingid) {

        $fs = get_file_storage();

        // Always evaluation.csv and labelled as it is an evaluation file.
        $filearea = self::LABELLED_FILEAREA;
        $filename = self::get_filename(true);
        $filepath = '/analysable/' . $analysableid . '/' .
            \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';
        return $fs->get_file(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath, $filename);
    }

    /**
     * Merge multiple files into one.
     *
     * The merged file keeps the var names and the column names of the last provided file, its
     * var values line contains the distinct values found in all files joined by '|' and it is
     * followed by the data rows of all the files.
     *
     * Important! It is the caller responsability to ensure that the datasets are compatible.
     *
     * @throws \coding_exception If no files are provided.
     * @throws \moodle_exception If the temporary merge file can not be opened for writing.
     * @param array $files List of \stored_file to merge.
     * @param int $modelid
     * @param string $timesplittingid
     * @param string $filearea
     * @param bool $evaluation
     * @return \stored_file
     */
    public static function merge_datasets(array $files, $modelid, $timesplittingid, $filearea, $evaluation = false) {

        if (empty($files)) {
            // Without files there are no headers to write, we would be creating a corrupt dataset.
            throw new \coding_exception('No files provided to merge');
        }

        $tmpfilepath = make_request_directory() . DIRECTORY_SEPARATOR . 'tmpfile.csv';

        // Add headers.
        // We could also do this with a single iteration gathering all files headers and appending them to the beginning of the file
        // once all file contents are merged.
        $varnames = array();
        $columns = array();
        $analysablesvalues = array();
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Copy the var names as they are, all files should have the same var names.
            $varnames = fgetcsv($rh);

            $analysablesvalues[] = fgetcsv($rh);

            // Copy the columns as they are, all files should have the same columns.
            $columns = fgetcsv($rh);

            // We only needed the headers, the contents are read later using a new handle.
            fclose($rh);
        }

        // Merge analysable values skipping the ones that are the same in all analysables.
        $values = array();
        foreach ($analysablesvalues as $analysablevalues) {
            foreach ($analysablevalues as $varkey => $value) {
                // Sha1 to make it unique.
                $values[$varkey][sha1($value)] = $value;
            }
        }
        foreach ($values as $varkey => $varvalues) {
            $values[$varkey] = implode('|', $varvalues);
        }

        // Start writing to the merge file.
        $wh = fopen($tmpfilepath, 'w');
        if (!$wh) {
            throw new \moodle_exception('errorcannotwritedataset', 'analytics', '', $tmpfilepath);
        }

        fputcsv($wh, $varnames);
        fputcsv($wh, $values);
        fputcsv($wh, $columns);

        // Iterate through all files and add them to the tmp one. We don't want file contents in memory.
        foreach ($files as $file) {
            $rh = $file->get_content_file_handle();

            // Skip headers.
            fgets($rh);
            fgets($rh);
            fgets($rh);

            // Copy all the following lines. Compare against false so a falsy line (e.g. a final '0'
            // without line break) does not stop the copy.
            while (($line = fgets($rh)) !== false) {
                fwrite($wh, $line);
            }
            fclose($rh);
        }
        fclose($wh);

        $filerecord = [
            'component' => 'analytics',
            'filearea' => $filearea,
            'itemid' => $modelid,
            'contextid' => \context_system::instance()->id,
            'filepath' => '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/',
            'filename' => self::get_filename($evaluation)
        ];

        $fs = get_file_storage();

        return $fs->create_file_from_pathname($filerecord, $tmpfilepath);
    }

    /**
     * Exports the model training data.
     *
     * Merges all the labelled files of this model and time splitting method, evaluation files
     * excluded, into a single file stored in the export file area.
     *
     * @param int $modelid
     * @param string $timesplittingid
     * @return \stored_file|false False if there is no training data to export.
     */
    public static function export_training_data($modelid, $timesplittingid) {

        $fs = get_file_storage();

        $contextid = \context_system::instance()->id;
        $filepath = '/timesplitting/' . \core_analytics\analysis::clean_time_splitting_id($timesplittingid) . '/';

        $files = $fs->get_directory_files($contextid, 'analytics', self::LABELLED_FILEAREA, $modelid,
            $filepath, true, false);

        // Discard evaluation files.
        foreach ($files as $key => $file) {
            if ($file->get_filename() === self::EVALUATION_FILENAME) {
                unset($files[$key]);
            }
        }

        if (empty($files)) {
            return false;
        }

        return self::merge_datasets($files, $modelid, $timesplittingid, self::EXPORT_FILEAREA);
    }

    /**
     * Returns the dataset file data structured by sampleids using the indicators and target column names.
     *
     * Empty values are returned as null and numeric values as floats, any other value is returned
     * as it is in the file.
     *
     * @throws \coding_exception If the dataset does not belong to the unlabelled file area.
     * @param \stored_file $dataset
     * @return array Arrays of values indexed by column name, indexed by the row's first column (the sample id).
     */
    public static function get_structured_data(\stored_file $dataset) {

        if ($dataset->get_filearea() !== self::UNLABELLED_FILEAREA) {
            throw new \coding_exception('Sorry, only support for unlabelled data');
        }

        $rh = $dataset->get_content_file_handle();

        // Skip dataset info.
        fgets($rh);
        fgets($rh);

        $calculations = array();

        $headers = fgetcsv($rh);
        // Get rid of the sampleid column name.
        array_shift($headers);

        while ($columns = fgetcsv($rh)) {

            // Blank lines are returned by fgetcsv as an array with a single null field, they are not samples.
            if ($columns === array(null)) {
                continue;
            }

            $uniquesampleid = array_shift($columns);

            // Unfortunately fgetcsv does not respect line's var types.
            $calculations[$uniquesampleid] = array_map(function($value) {

                if ($value === '') {
                    // We really want them as null because converted to float become 0
                    // and we need to treat the values separately.
                    return null;
                } else if (is_numeric($value)) {
                    return floatval($value);
                }
                return $value;
            }, array_combine($headers, $columns));
        }

        fclose($rh);

        return $calculations;
    }

    /**
     * Delete all files of a model.
     *
     * No file area is specified, so the files are deleted from all the analytics file areas.
     *
     * @param int $modelid
     * @return bool
     */
    public static function clear_model_files($modelid) {
        $fs = get_file_storage();
        return $fs->delete_area_files(\context_system::instance()->id, 'analytics', false, $modelid);
    }

    /**
     * Returns the file name to be used.
     *
     * @param bool $evaluation Only a strict true value is considered an evaluation.
     * @return string self::EVALUATION_FILENAME for evaluations, a time-based file name otherwise.
     */
    protected static function get_filename($evaluation) {

        if ($evaluation === true) {
            return self::EVALUATION_FILENAME;
        }

        // Incremental time, the lock will make sure we don't have concurrency problems.
        return microtime(true) . '.csv';
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body