|
| 1 | + |
| 2 | +#include "AuxFunc.h" |
| 3 | +#include "DateTime.h" |
| 4 | + |
| 5 | +//--------------------------------------------------------------- |
| 6 | +// Common code for Simplex and Smap that embeds, extracts |
| 7 | +// the target vector and computes neighbors. |
| 8 | +// |
| 9 | +// NOTE: time column is not returned in the embedding dataBlock. |
| 10 | +// |
| 11 | +// NOTE: If dataIn is embedded by Embed(), the returned dataBlock |
| 12 | +// has tau * (E-1) fewer rows than dataIn. Since dataIn is |
| 13 | +// included in the returned DataEmbedNN struct, the first |
| 14 | +// tau * (E-1) dataIn rows are deleted from dataIn. The |
| 15 | +// target vector is also reduced. |
| 16 | +// |
| 17 | +// NOTE: If rows are deleted, then the library and prediction |
| 18 | +// vectors in Parameters are updated to reflect this. |
| 19 | +//--------------------------------------------------------------- |
| 20 | +DataEmbedNN EmbedNN( DataFrame<double> dataIn, |
| 21 | + Parameters ¶m, |
| 22 | + bool checkDataRows ) |
| 23 | +{ |
| 24 | + if ( checkDataRows ) { |
| 25 | + CheckDataRows( param, dataIn, "EmbedNN" ); |
| 26 | + } |
| 27 | + |
| 28 | + //---------------------------------------------------------- |
| 29 | + // Extract or embedd data block |
| 30 | + //---------------------------------------------------------- |
| 31 | + DataFrame<double> dataBlock; // Multivariate or embedded DataFrame |
| 32 | + |
| 33 | + if ( param.embedded ) { |
| 34 | + // dataIn is multivariable block, no embedding needed |
| 35 | + // Select the specified columns |
| 36 | + if ( param.columnNames.size() ) { |
| 37 | + dataBlock = dataIn.DataFrameFromColumnNames(param.columnNames); |
| 38 | + } |
| 39 | + else if ( param.columnIndex.size() ) { |
| 40 | + dataBlock = dataIn.DataFrameFromColumnIndex(param.columnIndex); |
| 41 | + } |
| 42 | + else { |
| 43 | + throw std::runtime_error( "EmbedNN(): colNames and " |
| 44 | + " colIndex are empty.\n" ); |
| 45 | + } |
| 46 | + } |
| 47 | + else { |
| 48 | + // embedded = false: Create the embedding block via Embed() |
| 49 | + // dataBlock will have tau * (E-1) fewer rows than dataIn |
| 50 | + dataBlock = Embed( dataIn, param.E, param.tau, |
| 51 | + param.columns_str, param.verbose ); |
| 52 | + } |
| 53 | + |
| 54 | + //---------------------------------------------------------- |
| 55 | + // Get target (library) vector |
| 56 | + //---------------------------------------------------------- |
| 57 | + std::valarray<double> target_vec; |
| 58 | + if ( param.targetIndex ) { |
| 59 | + target_vec = dataIn.Column( param.targetIndex ); |
| 60 | + } |
| 61 | + else if ( param.targetName.size() ) { |
| 62 | + target_vec = dataIn.VectorColumnName( param.targetName ); |
| 63 | + } |
| 64 | + else { |
| 65 | + // Default to first column |
| 66 | + target_vec = dataIn.Column( 0 ); |
| 67 | + } |
| 68 | + |
| 69 | + //------------------------------------------------------------ |
| 70 | + // embedded = false: Embed() was called on dataIn |
| 71 | + // Remove target, dataIn rows as needed |
| 72 | + // Adjust param.library and param.prediction indices |
| 73 | + //------------------------------------------------------------ |
| 74 | + if ( not param.embedded ) { |
| 75 | + // If we support negative tau, this will change |
| 76 | + // For now, assume only positive tau is allowed |
| 77 | + size_t shift = std::max( 0, param.tau * (param.E - 1) ); |
| 78 | + |
| 79 | + std::valarray<double> target_vec_embed( dataIn.NRows() - shift ); |
| 80 | + // Bogus cast to ( std::valarray<double> ) for MSVC |
| 81 | + // as it doesn't export its own slice_array applied to [] |
| 82 | + target_vec_embed = ( std::valarray<double> ) |
| 83 | + target_vec[ std::slice( shift, target_vec.size() - shift, 1 ) ]; |
| 84 | + |
| 85 | + target_vec = target_vec_embed; |
| 86 | + |
| 87 | + DataFrame<double> dataInEmbed( dataIn.NRows() - shift, |
| 88 | + dataIn.NColumns(), |
| 89 | + dataIn.ColumnNames() ); |
| 90 | + |
| 91 | + for ( size_t row = 0; row < dataInEmbed.NRows(); row++ ) { |
| 92 | + dataInEmbed.WriteRow( row, dataIn.Row( row + shift ) ); |
| 93 | + } |
| 94 | + |
| 95 | + // Shift and add time vector if present |
| 96 | + if ( dataIn.Time().size() ) { |
| 97 | + dataInEmbed.Time() = |
| 98 | + std::vector< std::string >( dataIn.Time().size() - shift ); |
| 99 | + |
| 100 | + for ( size_t t = shift; t < dataIn.Time().size(); t++ ) { |
| 101 | + dataInEmbed.Time()[ t - shift ] = dataIn.Time()[ t ]; |
| 102 | + } |
| 103 | + dataInEmbed.TimeName() = dataIn.TimeName(); |
| 104 | + } |
| 105 | + |
| 106 | + // dataIn was passed in by value, so copy constructed. |
| 107 | + // JP Is it OK to reassign it here, then copy into dataEmbedNN? |
| 108 | + dataIn = dataInEmbed; |
| 109 | + |
| 110 | + // Adust param.library and param.prediction vectors of indices |
| 111 | + if ( shift > 0 ) { |
| 112 | + size_t library_len = param.library.size(); |
| 113 | + size_t prediction_len = param.prediction.size(); |
| 114 | + |
| 115 | + // If 0, 1, ... shift are in library or prediction |
| 116 | + // those rows were deleted, delete these elements. |
| 117 | + // First, create a vector of indices to delete |
| 118 | + std::vector< size_t > deleted_elements( shift, 0 ); |
| 119 | + std::iota( deleted_elements.begin(), deleted_elements.end(), 0 ); |
| 120 | + |
| 121 | + // erase elements of row indices that were deleted |
| 122 | + for ( auto element = deleted_elements.begin(); |
| 123 | + element != deleted_elements.end(); element++ ) { |
| 124 | + |
| 125 | + std::vector< size_t >::iterator it; |
| 126 | + it = std::find( param.library.begin(), |
| 127 | + param.library.end(), *element ); |
| 128 | + |
| 129 | + if ( it != param.library.end() ) { |
| 130 | + param.library.erase( it ); |
| 131 | + } |
| 132 | + |
| 133 | + it = std::find( param.prediction.begin(), |
| 134 | + param.prediction.end(), *element ); |
| 135 | + |
| 136 | + if ( it != param.prediction.end() ) { |
| 137 | + param.prediction.erase( it ); |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + // Now offset all values by shift so that vectors indices |
| 142 | + // in library and prediction refer to the same data rows |
| 143 | + // before the deletion/shift. |
| 144 | + for ( auto li = param.library.begin(); |
| 145 | + li != param.library.end(); li++ ) { |
| 146 | + *li = *li - shift; |
| 147 | + } |
| 148 | + for ( auto pi = param.prediction.begin(); |
| 149 | + pi != param.prediction.end(); pi++ ) { |
| 150 | + *pi = *pi - shift; |
| 151 | + } |
| 152 | + } // if ( shift > 0 ) |
| 153 | + } |
| 154 | + |
| 155 | + //---------------------------------------------------------- |
| 156 | + // Nearest neighbors |
| 157 | + //---------------------------------------------------------- |
| 158 | + Neighbors neighbors = FindNeighbors( dataBlock, param ); |
| 159 | + |
| 160 | + // Create struct to return the objects |
| 161 | + DataEmbedNN dataEmbedNN = DataEmbedNN( dataIn, dataBlock, |
| 162 | + target_vec, neighbors ); |
| 163 | + return dataEmbedNN; |
| 164 | +} |
| 165 | + |
| 166 | +//---------------------------------------------------------- |
| 167 | +// Common code for Simplex and Smap output generation |
| 168 | +//---------------------------------------------------------- |
| 169 | +DataFrame<double> FormatOutput( Parameters param, |
| 170 | + std::valarray<double> predictions, |
| 171 | + std::valarray<double> const_predictions, |
| 172 | + std::valarray<double> target_vec, |
| 173 | + std::vector<std::string> time, |
| 174 | + std::string timeName ) |
| 175 | +{ |
| 176 | + |
| 177 | +#ifdef DEBUG_ALL |
| 178 | + std::cout << "FormatOutput() param.prediction.size: " |
| 179 | + << param.prediction.size() << " >>> "; |
| 180 | + for( auto i = 0; i < param.prediction.size(); i++ ) { |
| 181 | + std::cout << param.prediction[i] << ","; |
| 182 | + } std::cout << std::endl; |
| 183 | + std::cout << "FormatOutput() time.size: " << time.size() << " >>> "; |
| 184 | + for( auto i = 0; i < time.size(); i++ ) { |
| 185 | + std::cout << time[i] << ","; |
| 186 | + } std::cout << std::endl; |
| 187 | +#endif |
| 188 | + |
| 189 | + //---------------------------------------------------- |
| 190 | + // Time vector with additional Tp points |
| 191 | + //---------------------------------------------------- |
| 192 | + size_t N_time = time.size(); |
| 193 | + size_t N_row = predictions.size(); |
| 194 | + |
| 195 | + // Populate vector of time strings for output |
| 196 | + std::vector<std::string> timeOut( N_row + param.Tp ); |
| 197 | + |
| 198 | + if ( N_time ) { |
| 199 | + FillTimes( param, time, std::ref( timeOut ) ); |
| 200 | + } |
| 201 | + |
| 202 | + //---------------------------------------------------- |
| 203 | + // Observations: add Tp nan at end |
| 204 | + //---------------------------------------------------- |
| 205 | + std::valarray<double> observations( N_row + param.Tp ); |
| 206 | + std::slice pred_i = std::slice( param.prediction[0], N_row, 1 ); |
| 207 | + |
| 208 | + observations[ std::slice( 0, N_row, 1 ) ] = |
| 209 | + ( std::valarray<double> ) target_vec[ pred_i ]; |
| 210 | + |
| 211 | + for ( size_t i = N_row; i < N_row + param.Tp; i++ ) { |
| 212 | + observations[ i ] = NAN; |
| 213 | + } |
| 214 | + |
| 215 | + //---------------------------------------------------- |
| 216 | + // Predictions: insert Tp nan at start |
| 217 | + //---------------------------------------------------- |
| 218 | + std::valarray<double> predictionsOut( N_row + param.Tp ); |
| 219 | + for ( size_t i = 0; i < param.Tp; i++ ) { |
| 220 | + predictionsOut[ i ] = NAN; |
| 221 | + } |
| 222 | + predictionsOut[ std::slice(param.Tp, N_row, 1) ] = predictions; |
| 223 | + |
| 224 | + std::valarray<double> constPredictionsOut( N_row + param.Tp ); |
| 225 | + if ( param.const_predict ) { |
| 226 | + for ( size_t i = 0; i < param.Tp; i++ ) { |
| 227 | + constPredictionsOut[ i ] = NAN; |
| 228 | + } |
| 229 | + constPredictionsOut[ std::slice(param.Tp, N_row, 1) ] = |
| 230 | + const_predictions; |
| 231 | + } |
| 232 | + |
| 233 | + //---------------------------------------------------- |
| 234 | + // Create output DataFrame |
| 235 | + //---------------------------------------------------- |
| 236 | + size_t dataFrameColumms = param.const_predict ? 3 : 2; |
| 237 | + |
| 238 | + DataFrame<double> dataFrame( N_row + param.Tp, dataFrameColumms ); |
| 239 | + |
| 240 | + if ( param.const_predict ) { |
| 241 | + dataFrame.ColumnNames() = { "Observations", |
| 242 | + "Predictions", "Const_Predictions" }; |
| 243 | + } |
| 244 | + else { |
| 245 | + dataFrame.ColumnNames() = { "Observations", "Predictions" }; |
| 246 | + } |
| 247 | + |
| 248 | + if ( N_time ) { |
| 249 | + dataFrame.TimeName() = timeName; |
| 250 | + dataFrame.Time() = timeOut; |
| 251 | + } |
| 252 | + dataFrame.WriteColumn( 0, observations ); |
| 253 | + dataFrame.WriteColumn( 1, predictionsOut ); |
| 254 | + if ( param.const_predict ) { |
| 255 | + dataFrame.WriteColumn( 2, constPredictionsOut ); |
| 256 | + } |
| 257 | + |
| 258 | +#ifdef DEBUG_ALL |
| 259 | + std::cout << "FormatOutput() time " << timeOut.size() |
| 260 | + << " pred " << predictionsOut.size() |
| 261 | + << " obs " << observations.size() << std::endl; |
| 262 | + std::cout << "FormatOutput() dataFrame -------------------" << std::endl; |
| 263 | + std::cout << dataFrame; |
| 264 | +#endif |
| 265 | + |
| 266 | + return dataFrame; |
| 267 | +} |
| 268 | + |
| 269 | +//---------------------------------------------------------- |
| 270 | +// Copy strings of time values into timeOut. |
| 271 | +// If prediction times exceed times from the data, then |
| 272 | +// create new entries for the additional times. |
| 273 | +//---------------------------------------------------------- |
| 274 | +void FillTimes( Parameters param, |
| 275 | + std::vector<std::string> time, |
| 276 | + std::vector<std::string> &timeOut ) |
| 277 | +{ |
| 278 | + size_t N_time = time.size(); |
| 279 | + size_t N_row = param.prediction.size(); |
| 280 | + size_t max_pred_i = param.prediction[ N_row - 1 ]; |
| 281 | + |
| 282 | + if ( timeOut.size() != N_row + param.Tp ) { |
| 283 | + std::stringstream errMsg; |
| 284 | + errMsg << "FillTimes(): timeOut vector length " << timeOut.size() |
| 285 | + << " is not equal to the number of predictions + Tp " |
| 286 | + << N_row + param.Tp << std::endl; |
| 287 | + throw std::runtime_error( errMsg.str() ); |
| 288 | + } |
| 289 | + |
| 290 | + // Fill in times guaranteed to be in param.prediction indices |
| 291 | + for ( auto i = 0; i < N_row; i++ ) { |
| 292 | + timeOut[ i ] = time[ param.prediction[ i ] ]; |
| 293 | + } |
| 294 | + |
| 295 | + // Now fill in times beyond param.prediction indices |
| 296 | + if ( max_pred_i + param.Tp < N_time ) { |
| 297 | + // All prediction times are available in time, get the rest |
| 298 | + for ( auto i = 0; i < param.Tp; i++ ) { |
| 299 | + timeOut[ N_row + i ] = time[ max_pred_i + i + 1 ]; |
| 300 | + } |
| 301 | + } |
| 302 | + else { |
| 303 | + // Keep track of whether warning of time format already printed |
| 304 | + bool time_format_warning_printed = false; |
| 305 | + |
| 306 | + // Tp introduces time values beyond the range of time |
| 307 | + for ( auto i = N_row; i < N_row + param.Tp; i++ ) { |
| 308 | + std::stringstream tss; |
| 309 | + |
| 310 | + if ( OnlyDigits( time[ max_pred_i ] ) ) { |
| 311 | + // Numeric so add Tp |
| 312 | + tss << std::stod( time[ max_pred_i ] ) + i - N_row + 1; |
| 313 | + } |
| 314 | + else { |
| 315 | + int time_delta = i - N_row + 1; |
| 316 | + //get last two datetimes to compute time diff to add time delta |
| 317 | + std::string time_new( time[ max_pred_i ] ); |
| 318 | + std::string time_old( time[ max_pred_i - 1 ] ); |
| 319 | + std::string new_time = increment_datetime_str( time_old, |
| 320 | + time_new, |
| 321 | + time_delta ); |
| 322 | + |
| 323 | + if ( new_time.size() ) { |
| 324 | + tss << new_time; |
| 325 | + } |
| 326 | + else { |
| 327 | + // Add " +ti" if not a recognized format |
| 328 | + // increment_datetime_str() returns "" |
| 329 | + tss << time[ max_pred_i ] << " +" << i - N_row + 1; |
| 330 | + if ( not time_format_warning_printed ) { |
| 331 | + std::stringstream errMsg; |
| 332 | + errMsg << "FillTimes(): Input time column format " |
| 333 | + << "not recognized. Appending '+ tp' to last " |
| 334 | + << " available time." << std::endl; |
| 335 | + std::cout << errMsg.str(); |
| 336 | + time_format_warning_printed = true; |
| 337 | + } |
| 338 | + } |
| 339 | + } |
| 340 | + |
| 341 | + timeOut[ i ] = tss.str(); |
| 342 | + } |
| 343 | + } |
| 344 | +} |
| 345 | + |
| 346 | +//---------------------------------------------------------- |
| 347 | +// Validate dataFrameIn rows against lib and pred indices |
| 348 | +//---------------------------------------------------------- |
| 349 | +void CheckDataRows( Parameters param, |
| 350 | + DataFrame<double> dataFrameIn, |
| 351 | + std::string call ) |
| 352 | +{ |
| 353 | + // param.prediction & library have been zero-offset in Validate() |
| 354 | + // to convert from user specified data row to array indicies |
| 355 | + size_t prediction_max_i = param.prediction[ param.prediction.size() - 1 ]; |
| 356 | + size_t library_max_i = param.library [ param.library.size() - 1 ]; |
| 357 | + |
| 358 | + size_t shift; |
| 359 | + if ( param.embedded ) { |
| 360 | + shift = 0; |
| 361 | + } |
| 362 | + else { |
| 363 | + shift = std::max( 0, param.tau * (param.E - 1) ); |
| 364 | + } |
| 365 | + |
| 366 | + if ( dataFrameIn.NRows() <= prediction_max_i + shift ) { |
| 367 | + std::stringstream errMsg; |
| 368 | + errMsg << "CheckDataRows(): The prediction index + tau(E-1) " |
| 369 | + << prediction_max_i + shift |
| 370 | + << " equals or exceeds the number of data rows " |
| 371 | + << dataFrameIn.NRows(); |
| 372 | + throw std::runtime_error( errMsg.str() ); |
| 373 | + } |
| 374 | + |
| 375 | + if ( dataFrameIn.NRows() <= library_max_i + shift ) { |
| 376 | + std::stringstream errMsg; |
| 377 | + errMsg << "CheckDataRows(): The library index + tau(E-1) " |
| 378 | + << library_max_i + shift |
| 379 | + << " equals or exceeds the number of data rows " |
| 380 | + << dataFrameIn.NRows(); |
| 381 | + throw std::runtime_error( errMsg.str() ); |
| 382 | + } |
| 383 | +} |
0 commit comments