From 19bdbcf87fac7b0b858efc207138b477efd1e719 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 12 Jul 2024 12:34:21 +0200 Subject: [PATCH 01/41] Corrections rule instantiation continuous data #17 --- src/C++/Explore/condition.cpp | 2 ++ src/C++/Explore/condition.h | 5 ++++- src/C++/Explore/feature.h | 3 ++- src/C++/Explore/rule.cpp | 38 +++++++++++++++++++++++------------ 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/C++/Explore/condition.cpp b/src/C++/Explore/condition.cpp index 267cb000..f96e64c3 100755 --- a/src/C++/Explore/condition.cpp +++ b/src/C++/Explore/condition.cpp @@ -22,6 +22,7 @@ CONDITION::CONDITION() { // Needed for cutoffsets IsSolo = false; NonSoloIncluded = false; + RepeatedFeature = false; NextSame = false; PreviousSame = false; @@ -50,6 +51,7 @@ CONDITION::CONDITION(unsigned int CNumber, string CName, vector CCutoffs // Needed for cutoffsets IsSolo = false; NonSoloIncluded = false; + RepeatedFeature = false; NextSame = false; PreviousSame = false; diff --git a/src/C++/Explore/condition.h b/src/C++/Explore/condition.h index abe68afa..34903db2 100755 --- a/src/C++/Explore/condition.h +++ b/src/C++/Explore/condition.h @@ -42,7 +42,10 @@ class CONDITION { // FeatureOperator bool IsSolo; // FeatureOperator occurs within rule on it's own bool NonSoloIncluded; // FeatureOperator included in a conjunction with size>1 - + + // Feature + bool RepeatedFeature; + // Condition bool PreviousSame; // Feature is equal to previous feature within conjunction (left one) bool NextSame; // Feature is equal to next feature (right one) diff --git a/src/C++/Explore/feature.h b/src/C++/Explore/feature.h index 30376c41..944ae8c1 100755 --- a/src/C++/Explore/feature.h +++ b/src/C++/Explore/feature.h @@ -18,7 +18,7 @@ class FEATURE { private: vector Observations; // Vector of observations enabling direct access - vector Cutoffs; // Vector of cutoffs enabling direct access + // Vector of cutoffs enabling direct access vector LearnClasses; // List 
of pointers to class objects of observations ordered on value for learning vector ValidationClasses; // List of pointers to class objects of observations ordered on value for validation @@ -160,6 +160,7 @@ class FEATURE { string PrintCutoffMethod(); // Print cutoff method information string PrintOperatorMethod(); // Print operator method information + vector Cutoffs; }; #endif diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 606af187..c9a96a71 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -641,14 +641,14 @@ Out: unsigned int, the minimum order of a cutoff Description: Returns the order of the minimal cutoff for a specific FeatureOperator currently used in the rule. **********************************************************************/ -unsigned int RULE::GetMinCutoff(unsigned int FOperator) { +unsigned int RULE::GetMinCutoff(unsigned int Fnum) { CONDITION* CurrentCondition; - unsigned int Result = FeatureOperators[FOperator].Cutoffs.size(); + unsigned int Result = Features[0][Fnum].Cutoffs.size(); for (unsigned int i=0; i1; j++) { + for (unsigned int j=0; jFeatureOperator==FOperator) { + if (CurrentCondition->FeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { if (CurrentCondition->CutoffNumberCutoffNumber; } @@ -2120,7 +2120,7 @@ bool RULE::NextCutoffSet() { ConditionNr--; } } else if (CurrentCondition->Operator==LESS) { - MaxCutoff = GetMinCutoff(CurrentCondition->FeatureOperator); + MaxCutoff = GetMinCutoff(CurrentCondition->FeatureNumber); if (CurrentCondition->CutoffNumber+1 < MaxCutoff) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2137,8 +2137,8 @@ bool RULE::NextCutoffSet() { } } } else { - if (CurrentFeatureOperator->IsSolo && CurrentFeatureOperator->NonSoloIncluded && CurrentConjunction->Size>1) { // && !(CurrentFeatureOperator->Operator == LESS) - if (MaxCutoff == 2 || CurrentFeatureOperator->Operator==GREATER) { // 
MaxCutoff == 2 && Operator==EQUAL? + if (CurrentFeatureOperator->RepeatedFeature && CurrentConjunction->Size>1) { // && !(CurrentFeatureOperator->Operator == LESS) + if (MaxCutoff == 2 || CurrentFeatureOperator->Operator==LESS) { // MaxCutoff == 2 && Operator==EQUAL? MaxCutoff--; // Needed for binary, should be removed for categorical } } @@ -2220,7 +2220,7 @@ bool RULE::NextCutoffSet() { CurrentCondition->CutoffNumber = 0; } else if (CurrentCondition->Operator==GREATER) { // Reset to next cutoff - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureOperator)+1; + CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct // Or first if maximum reached if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { @@ -2247,8 +2247,8 @@ bool RULE::NextCutoffSet() { } // Reset to next cutoff - if (!(CurrentCondition->Operator==GREATER) && CurrentFeatureOperator->IsSolo && CurrentConjunction->Size>1) { - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureOperator)+1; + if (!(CurrentCondition->Operator==LESS) && CurrentFeatureOperator->IsSolo && CurrentConjunction->Size>1) { + CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct } } } @@ -2272,6 +2272,7 @@ bool RULE::NextCutoffSet() { for (; CFOperator != LFOperator; CFOperator++) { CFOperator->IsSolo = false; CFOperator->NonSoloIncluded = false; + CFOperator->RepeatedFeature = false; } // Reset cutoffs of Conditions in rule and find equal features within conjunctions @@ -2315,6 +2316,17 @@ bool RULE::NextCutoffSet() { for (ConjunctionNr = Conjunctions.size()-1; ConjunctionNr>=0; ConjunctionNr--) { if (Conjunctions[ConjunctionNr].Size==1) { FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsSolo=true; + + // Identify occurences of that feature in other terms (of size 1 or more than 1) + for (int C=ConjunctionNr-1; C>=0; C--) { + 
CurrentConjunction = &Conjunctions[C]; + for (ConditionNr=0; ConditionNr<(int)CurrentConjunction->Size; ConditionNr++) { // Iterate through conditions + if (Conjunctions[C].Conditions[ConditionNr].FeatureNumber == + Conjunctions[ConjunctionNr].Conditions[0].FeatureNumber) { + FeatureOperators[Conjunctions[C].Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; + } + } + } } } @@ -2328,7 +2340,7 @@ bool RULE::NextCutoffSet() { if (CurrentConjunction->Size>1) { CurrentFeatureOperator->NonSoloIncluded = true; - if (!(CurrentCondition->Operator==GREATER)) { // If operator = less or equal, start from next cutoff + if (!(CurrentCondition->Operator==LESS)) { // If operator = greater or equal, start from next cutoff if (CurrentCondition->Cutoffs.size()>1) { CurrentCondition->CutoffNumber = 1; } else { @@ -2340,9 +2352,9 @@ bool RULE::NextCutoffSet() { } else { CurrentCondition->CutoffNumber = 0; // TODO: unneccesary? - if (CurrentFeatureOperator->NonSoloIncluded && (CurrentCondition->Operator == GREATER)) { + if (CurrentFeatureOperator->RepeatedFeature && (CurrentCondition->Operator == GREATER)) { if (CurrentCondition->Cutoffs.size()>1) { - CurrentCondition-> CutoffNumber = 1; + CurrentCondition->CutoffNumber = 1; } else { CutoffSetGenerated = false; return false; From ecb2d17258583a303ca83a19786060ca97c4c531 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 12 Jul 2024 14:30:05 +0200 Subject: [PATCH 02/41] Checked up to rule length 2 #17 --- src/C++/Explore/rule.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index c9a96a71..5bebd186 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2247,7 +2247,7 @@ bool RULE::NextCutoffSet() { } // Reset to next cutoff - if (!(CurrentCondition->Operator==LESS) && CurrentFeatureOperator->IsSolo && CurrentConjunction->Size>1) { + if (!(CurrentCondition->Operator==LESS) && (CurrentFeatureOperator->IsSolo || 
CurrentFeatureOperator->RepeatedFeature) && CurrentConjunction->Size>1) { CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct } } @@ -2336,9 +2336,9 @@ bool RULE::NextCutoffSet() { CurrentCondition = &Conjunctions[ConjunctionNr].Conditions[ConditionNr]; CurrentFeatureOperator = &FeatureOperators[CurrentCondition->FeatureOperator]; - if (CurrentFeatureOperator->IsSolo==true) { // Is current condition solo? + if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) { // Is current condition solo? if (CurrentConjunction->Size>1) { - CurrentFeatureOperator->NonSoloIncluded = true; + CurrentFeatureOperator->NonSoloIncluded = true; // Used for variables with EQUAL operator, will never have RepeatedFeature (only one operator) if (!(CurrentCondition->Operator==LESS)) { // If operator = greater or equal, start from next cutoff if (CurrentCondition->Cutoffs.size()>1) { @@ -2352,7 +2352,7 @@ bool RULE::NextCutoffSet() { } else { CurrentCondition->CutoffNumber = 0; // TODO: unneccesary? 
- if (CurrentFeatureOperator->RepeatedFeature && (CurrentCondition->Operator == GREATER)) { + if ((CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) && (CurrentCondition->Operator == GREATER)) { if (CurrentCondition->Cutoffs.size()>1) { CurrentCondition->CutoffNumber = 1; } else { From 11b43e048cd744bc23518b67e30c1a299ad1839b Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 12 Jul 2024 17:05:46 +0200 Subject: [PATCH 03/41] Added IsRepeated and link between operators <-> helper #17 --- src/C++/Explore/condition.cpp | 2 ++ src/C++/Explore/condition.h | 1 + src/C++/Explore/rule.cpp | 28 ++++++++++++++++++---------- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/C++/Explore/condition.cpp b/src/C++/Explore/condition.cpp index f96e64c3..5137b6ec 100755 --- a/src/C++/Explore/condition.cpp +++ b/src/C++/Explore/condition.cpp @@ -21,6 +21,7 @@ CONDITION::CONDITION() { // Needed for cutoffsets IsSolo = false; + IsRepeated = false; NonSoloIncluded = false; RepeatedFeature = false; @@ -50,6 +51,7 @@ CONDITION::CONDITION(unsigned int CNumber, string CName, vector CCutoffs // Needed for cutoffsets IsSolo = false; + IsRepeated = false; NonSoloIncluded = false; RepeatedFeature = false; diff --git a/src/C++/Explore/condition.h b/src/C++/Explore/condition.h index 34903db2..fd29c13f 100755 --- a/src/C++/Explore/condition.h +++ b/src/C++/Explore/condition.h @@ -44,6 +44,7 @@ class CONDITION { bool NonSoloIncluded; // FeatureOperator included in a conjunction with size>1 // Feature + bool IsRepeated; bool RepeatedFeature; // Condition diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 5bebd186..6f4a0285 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2137,10 +2137,12 @@ bool RULE::NextCutoffSet() { } } } else { - if (CurrentFeatureOperator->RepeatedFeature && CurrentConjunction->Size>1) { // && !(CurrentFeatureOperator->Operator == LESS) - if (MaxCutoff == 2 || 
CurrentFeatureOperator->Operator==LESS) { // MaxCutoff == 2 && Operator==EQUAL? - MaxCutoff--; // Needed for binary, should be removed for categorical - } + if (CurrentFeatureOperator->Operator==EQUAL && MaxCutoff==2){ // Needed for binary,should be removed for categorical + if (CurrentFeatureOperator->RepeatedFeature && CurrentConjunction->Size>1) {MaxCutoff--;} + } else if (CurrentFeatureOperator->NonSoloIncluded){ + if (CurrentFeatureOperator->Operator==GREATER) {MaxCutoff--;} + } else if (CurrentFeatureOperator->RepeatedFeature){ + if (CurrentFeatureOperator->Operator==LESS && CurrentConjunction->Size>1) {MaxCutoff--;} } if (CurrentCondition->NextSame) { // For greater, also for equal or less? MaxCutoff--; @@ -2271,6 +2273,7 @@ bool RULE::NextCutoffSet() { vector::iterator LFOperator(FeatureOperators.end()); for (; CFOperator != LFOperator; CFOperator++) { CFOperator->IsSolo = false; + CFOperator->IsRepeated = false; CFOperator->NonSoloIncluded = false; CFOperator->RepeatedFeature = false; } @@ -2321,9 +2324,16 @@ bool RULE::NextCutoffSet() { for (int C=ConjunctionNr-1; C>=0; C--) { CurrentConjunction = &Conjunctions[C]; for (ConditionNr=0; ConditionNr<(int)CurrentConjunction->Size; ConditionNr++) { // Iterate through conditions - if (Conjunctions[C].Conditions[ConditionNr].FeatureNumber == + if (CurrentConjunction->Conditions[ConditionNr].FeatureNumber == Conjunctions[ConjunctionNr].Conditions[0].FeatureNumber) { - FeatureOperators[Conjunctions[C].Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; + + FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsRepeated=true; + + if (CurrentConjunction->Size > 1 && CurrentConjunction->Conditions[ConditionNr].FeatureOperator == + Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator) { + 
FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].NonSoloIncluded = true; + } } } } @@ -2338,9 +2348,7 @@ bool RULE::NextCutoffSet() { if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) { // Is current condition solo? if (CurrentConjunction->Size>1) { - CurrentFeatureOperator->NonSoloIncluded = true; // Used for variables with EQUAL operator, will never have RepeatedFeature (only one operator) - - if (!(CurrentCondition->Operator==LESS)) { // If operator = greater or equal, start from next cutoff + if (!(CurrentCondition->Operator==GREATER && CurrentFeatureOperator->NonSoloIncluded) && !(CurrentCondition->Operator==LESS && CurrentFeatureOperator->RepeatedFeature)) { // If operator = greater or equal, start from next cutoff if (CurrentCondition->Cutoffs.size()>1) { CurrentCondition->CutoffNumber = 1; } else { @@ -2352,7 +2360,7 @@ bool RULE::NextCutoffSet() { } else { CurrentCondition->CutoffNumber = 0; // TODO: unneccesary? - if ((CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) && (CurrentCondition->Operator == GREATER)) { + if ((CurrentFeatureOperator->NonSoloIncluded || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->IsRepeated) && CurrentCondition->Operator == GREATER) { if (CurrentCondition->Cutoffs.size()>1) { CurrentCondition->CutoffNumber = 1; } else { From 02d1b786e354074ea62b0a159376de5b89b86e71 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 16 Jul 2024 16:55:33 +0200 Subject: [PATCH 04/41] Up to rule length 3 continuous; maxCutoff also per feature, RepeatedFeature excludes same operator #17 --- src/C++/Explore/rule.cpp | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 6f4a0285..2cb092b3 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -688,14 +688,15 @@ Out: unsigned int, the maximum order of a cutoff Description: 
Returns the order of the maximum cutoff for a specific FeatureOperator currently used in the rule. **********************************************************************/ -unsigned int RULE::GetMaxCutoff(unsigned int FOperator) { +unsigned int RULE::GetMaxCutoff(unsigned int Fnum) { CONDITION* CurrentCondition; unsigned int Result = 0; for (unsigned int i=0; i1; j++) { CurrentCondition = &Conjunctions[i].Conditions[j]; - if (CurrentCondition->FeatureOperator==FOperator) { + // if (CurrentCondition->FeatureOperator==FOperator) { + if (CurrentCondition->FeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { if (CurrentCondition->CutoffNumber>Result) { Result = CurrentCondition->CutoffNumber; } @@ -2128,7 +2129,7 @@ bool RULE::NextCutoffSet() { ConditionNr--; } } else if (CurrentCondition-> Operator==GREATER){ - MaxCutoff = GetMaxCutoff(CurrentCondition->FeatureOperator); + MaxCutoff = GetMaxCutoff(CurrentCondition->FeatureNumber); if (CurrentCondition->CutoffNumber+1 > MaxCutoff && CurrentCondition->CutoffNumber+1Cutoffs.size()) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2226,7 +2227,11 @@ bool RULE::NextCutoffSet() { // Or first if maximum reached if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { - CurrentCondition->CutoffNumber = 0; + if (!CurrentFeatureOperator->RepeatedFeature){ + CurrentCondition->CutoffNumber = 0; + } else { + CurrentCondition->CutoffNumber = 1; + } } } } else { @@ -2249,7 +2254,7 @@ bool RULE::NextCutoffSet() { } // Reset to next cutoff - if (!(CurrentCondition->Operator==LESS) && (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) && CurrentConjunction->Size>1) { + if ((CurrentFeatureOperator->NonSoloIncluded && !(CurrentCondition->Operator==GREATER)) || (CurrentFeatureOperator->RepeatedFeature && !(CurrentCondition->Operator==LESS))) { CurrentCondition->CutoffNumber = 
GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct } } @@ -2326,14 +2331,15 @@ bool RULE::NextCutoffSet() { for (ConditionNr=0; ConditionNr<(int)CurrentConjunction->Size; ConditionNr++) { // Iterate through conditions if (CurrentConjunction->Conditions[ConditionNr].FeatureNumber == Conjunctions[ConjunctionNr].Conditions[0].FeatureNumber) { - FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsRepeated=true; if (CurrentConjunction->Size > 1 && CurrentConjunction->Conditions[ConditionNr].FeatureOperator == - Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator) { - FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].NonSoloIncluded = true; - } + Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator) { + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].NonSoloIncluded = true; + FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = false; + } } } } @@ -2346,11 +2352,11 @@ bool RULE::NextCutoffSet() { CurrentCondition = &Conjunctions[ConjunctionNr].Conditions[ConditionNr]; CurrentFeatureOperator = &FeatureOperators[CurrentCondition->FeatureOperator]; - if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature) { // Is current condition solo? + if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->NonSoloIncluded) { // Is current condition solo? 
if (CurrentConjunction->Size>1) { - if (!(CurrentCondition->Operator==GREATER && CurrentFeatureOperator->NonSoloIncluded) && !(CurrentCondition->Operator==LESS && CurrentFeatureOperator->RepeatedFeature)) { // If operator = greater or equal, start from next cutoff + if (!(CurrentCondition->Operator==GREATER && CurrentFeatureOperator->NonSoloIncluded) && !(CurrentCondition->Operator==LESS && CurrentFeatureOperator->RepeatedFeature)) { if (CurrentCondition->Cutoffs.size()>1) { - CurrentCondition->CutoffNumber = 1; + CurrentCondition->CutoffNumber = 1; // Then start from next cutoff } else { CutoffSetGenerated = false; return false; @@ -2362,7 +2368,8 @@ bool RULE::NextCutoffSet() { if ((CurrentFeatureOperator->NonSoloIncluded || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->IsRepeated) && CurrentCondition->Operator == GREATER) { if (CurrentCondition->Cutoffs.size()>1) { - CurrentCondition->CutoffNumber = 1; + CurrentCondition->CutoffNumber = GetMaxCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct, or should it be MinCutoff? 
+ // CurrentCondition->CutoffNumber = 1; } else { CutoffSetGenerated = false; return false; From 09cc42e64b3f99baaaa686f7a6862df070626226 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Thu, 18 Jul 2024 11:58:13 +0200 Subject: [PATCH 05/41] Fix revised MaxCutoff for binary data --- src/C++/Explore/rule.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 2cb092b3..5e500a37 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2139,7 +2139,7 @@ bool RULE::NextCutoffSet() { } } else { if (CurrentFeatureOperator->Operator==EQUAL && MaxCutoff==2){ // Needed for binary,should be removed for categorical - if (CurrentFeatureOperator->RepeatedFeature && CurrentConjunction->Size>1) {MaxCutoff--;} + if (CurrentFeatureOperator->NonSoloIncluded) {MaxCutoff--;} } else if (CurrentFeatureOperator->NonSoloIncluded){ if (CurrentFeatureOperator->Operator==GREATER) {MaxCutoff--;} } else if (CurrentFeatureOperator->RepeatedFeature){ From 447d89646de88f46d3311ba9fd2de5fb2b7f6144 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Thu, 18 Jul 2024 14:44:00 +0200 Subject: [PATCH 06/41] Fix repeated continuous variables #17 --- src/C++/Explore/rule.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 5e500a37..d10c6ecf 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2222,15 +2222,19 @@ bool RULE::NextCutoffSet() { } else if (CurrentCondition->Operator==LESS) { CurrentCondition->CutoffNumber = 0; } else if (CurrentCondition->Operator==GREATER) { + CurrentCondition->CutoffNumber = 0; + // Reset to next cutoff - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct - - // Or first if maximum reached - if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { - if (!CurrentFeatureOperator->RepeatedFeature){ - 
CurrentCondition->CutoffNumber = 0; - } else { - CurrentCondition->CutoffNumber = 1; + if (CurrentFeatureOperator->NonSoloIncluded || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->IsRepeated){ + CurrentCondition->CutoffNumber = GetMaxCutoff(CurrentCondition->FeatureNumber)+1; + + // Or first if maximum reached + if (CurrentCondition->CutoffNumber > CurrentCondition->Cutoffs.size()-1) { + if (!CurrentFeatureOperator->RepeatedFeature){ + CurrentCondition->CutoffNumber = 0; + } else { + CurrentCondition->CutoffNumber = 1; + } } } } From 7978933514cf7e0402060936c4deb893581442ed Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 19 Jul 2024 16:53:48 +0200 Subject: [PATCH 07/41] Update identifying RepeatedFeature and allow both NonSoloIncluded and RepeatedFeature at the same time #21 --- src/C++/Explore/rule.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index d10c6ecf..53a96e4b 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2335,14 +2335,13 @@ bool RULE::NextCutoffSet() { for (ConditionNr=0; ConditionNr<(int)CurrentConjunction->Size; ConditionNr++) { // Iterate through conditions if (CurrentConjunction->Conditions[ConditionNr].FeatureNumber == Conjunctions[ConjunctionNr].Conditions[0].FeatureNumber) { - FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; - FeatureOperators[Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator].IsRepeated=true; if (CurrentConjunction->Size > 1 && CurrentConjunction->Conditions[ConditionNr].FeatureOperator == Conjunctions[ConjunctionNr].Conditions[0].FeatureOperator) { FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].NonSoloIncluded = true; - FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = false; + } else { + 
FeatureOperators[CurrentConjunction->Conditions[ConditionNr].FeatureOperator].RepeatedFeature = true; } } } @@ -2358,7 +2357,7 @@ bool RULE::NextCutoffSet() { if (CurrentFeatureOperator->IsSolo || CurrentFeatureOperator->RepeatedFeature || CurrentFeatureOperator->NonSoloIncluded) { // Is current condition solo? if (CurrentConjunction->Size>1) { - if (!(CurrentCondition->Operator==GREATER && CurrentFeatureOperator->NonSoloIncluded) && !(CurrentCondition->Operator==LESS && CurrentFeatureOperator->RepeatedFeature)) { + if ((CurrentFeatureOperator->NonSoloIncluded && !(CurrentCondition->Operator==GREATER)) || (CurrentFeatureOperator->RepeatedFeature && !(CurrentCondition->Operator==LESS))) { if (CurrentCondition->Cutoffs.size()>1) { CurrentCondition->CutoffNumber = 1; // Then start from next cutoff } else { From d74f419aade1e8989607ae81cfab86d04c224827 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Mon, 22 Jul 2024 09:22:42 +0200 Subject: [PATCH 08/41] Restrict MinCutoff to specific range of index, Maxcutoff-- per operator #21 --- src/C++/Explore/rule.cpp | 19 +++++++++---------- src/C++/Explore/rule.h | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 53a96e4b..c9c5278c 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -641,11 +641,11 @@ Out: unsigned int, the minimum order of a cutoff Description: Returns the order of the minimal cutoff for a specific FeatureOperator currently used in the rule. 
**********************************************************************/ -unsigned int RULE::GetMinCutoff(unsigned int Fnum) { +unsigned int RULE::GetMinCutoff(unsigned int Fnum, int ConjunctionNr) { CONDITION* CurrentCondition; unsigned int Result = Features[0][Fnum].Cutoffs.size(); - for (unsigned int i=0; iFeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { @@ -695,7 +695,6 @@ unsigned int RULE::GetMaxCutoff(unsigned int Fnum) { for (unsigned int i=0; i1; j++) { CurrentCondition = &Conjunctions[i].Conditions[j]; - // if (CurrentCondition->FeatureOperator==FOperator) { if (CurrentCondition->FeatureNumber==Fnum && (Conjunctions[i].Size>1 || FeatureOperators[CurrentCondition->FeatureOperator].RepeatedFeature)) { if (CurrentCondition->CutoffNumber>Result) { Result = CurrentCondition->CutoffNumber; @@ -2048,7 +2047,7 @@ bool RULE::NextCutoffSet() { MaxCutoff = CurrentCondition->Cutoffs.size(); if (CurrentConjunction->Size==1 && Conjunctions.size()>1) { // More than one conjunction and current conjunction size = 1 - if (CurrentCondition-> Operator==EQUAL){ + if (CurrentCondition->Operator==EQUAL){ if (CurrentCondition->CutoffNumber+1 < MaxCutoff) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2121,7 +2120,7 @@ bool RULE::NextCutoffSet() { ConditionNr--; } } else if (CurrentCondition->Operator==LESS) { - MaxCutoff = GetMinCutoff(CurrentCondition->FeatureNumber); + MaxCutoff = GetMinCutoff(CurrentCondition->FeatureNumber, ConjunctionNr); if (CurrentCondition->CutoffNumber+1 < MaxCutoff) { CurrentCondition->CutoffNumber++; Incremented = true; @@ -2140,10 +2139,10 @@ bool RULE::NextCutoffSet() { } else { if (CurrentFeatureOperator->Operator==EQUAL && MaxCutoff==2){ // Needed for binary,should be removed for categorical if (CurrentFeatureOperator->NonSoloIncluded) {MaxCutoff--;} - } else if (CurrentFeatureOperator->NonSoloIncluded){ - if (CurrentFeatureOperator->Operator==GREATER) 
{MaxCutoff--;} - } else if (CurrentFeatureOperator->RepeatedFeature){ - if (CurrentFeatureOperator->Operator==LESS && CurrentConjunction->Size>1) {MaxCutoff--;} + } else if (CurrentFeatureOperator->Operator==GREATER) { + if (CurrentFeatureOperator->NonSoloIncluded) {MaxCutoff--;} + } else if (CurrentFeatureOperator->Operator==LESS){ + if (CurrentFeatureOperator->RepeatedFeature) {MaxCutoff--;} } if (CurrentCondition->NextSame) { // For greater, also for equal or less? MaxCutoff--; @@ -2259,7 +2258,7 @@ bool RULE::NextCutoffSet() { // Reset to next cutoff if ((CurrentFeatureOperator->NonSoloIncluded && !(CurrentCondition->Operator==GREATER)) || (CurrentFeatureOperator->RepeatedFeature && !(CurrentCondition->Operator==LESS))) { - CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber)+1; // TODO: check if correct + CurrentCondition->CutoffNumber = GetMinCutoff(CurrentCondition->FeatureNumber, (int)Conjunctions.size())+1; // TODO: check if correct } } } diff --git a/src/C++/Explore/rule.h b/src/C++/Explore/rule.h index 84d7ff28..d6a4961d 100755 --- a/src/C++/Explore/rule.h +++ b/src/C++/Explore/rule.h @@ -148,7 +148,7 @@ unsigned int NoFeatureOperators{}; // vector GetOperators(); // Get a list of operators vector GetCutoffs(); // Get a list of cutoffs - unsigned int GetMinCutoff(unsigned int FOperator); + unsigned int GetMinCutoff(unsigned int FOperator, int ConjunctionNr); bool CutoffsAtMax(int ConjunctionNr, int ConditionNr); unsigned int GetMaxCutoff(unsigned int FOperator); From 34b152847abbaf3651da884212e5a1b30f985ef9 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 23 Jul 2024 10:42:38 +0200 Subject: [PATCH 09/41] Fix building R package; 'no template named unary function' --- src/C++/Explore/set.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/C++/Explore/set.cpp b/src/C++/Explore/set.cpp index de7a7dc6..7e19301b 100755 --- a/src/C++/Explore/set.cpp +++ b/src/C++/Explore/set.cpp @@ -423,7 +423,7 @@ string 
SET::PrintPerformance() { return Result.str(); } -struct AndJibu : public std ::unary_function +struct AndJibu : public std ::__unary_function { const boost::dynamic_bitset<> Source; boost::dynamic_bitset<> Dest; From 8cf7f5b81c10d2041c7c7b613e9dea1bd68eb636 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 23 Jul 2024 10:52:14 +0200 Subject: [PATCH 10/41] Fixed order of data; first binary then continuous #20 --- R/HelperFunctions.R | 3 +++ R/MainFunctions.R | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index c6577485..7583cc56 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -80,6 +80,9 @@ saveData <- function(output_path, train_data, file_name) { binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1)) train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], labels=c(0,1))) + # Order data (first binary then continuous features) + train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++ + # Save data as arff file if (file.exists(paste0(output_path, file_name, ".arff"))) {file.remove(paste0(output_path, file_name, ".arff"))} farff::writeARFF(train_data, paste0(output_path, file_name, ".arff")) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 29bcbe61..c8b27671 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -155,7 +155,7 @@ trainExplore <- function(train_data = NULL, # } coef <- names(cor)[order(-abs(cor))] - train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by LASSO importance + train_data <- train_data[,c(coef,ClassFeature_)] # sort data features by importance } saveData(output_path, train_data, file_name) From db102f645b7be10087a443f45f3b1c854d36fae2 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 23 Jul 2024 13:23:18 +0200 Subject: [PATCH 11/41] Update default settings --- 
R/MainFunctions.R | 27 +++++++++++++++++++-------- inst/settings/template.project | 8 ++++---- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index c8b27671..c39179ab 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -39,20 +39,22 @@ trainExplore <- function(train_data = NULL, StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, PrintPerformance = TRUE, - Subsumption = TRUE, + Subsumption = FALSE, BranchBound = TRUE, Sorted = "none", - Parallel = FALSE) { + Parallel = TRUE, + ParallelMethod = "ONE", + BinaryReduction = FALSE) { if (!dir.exists(output_path)) { dir.create(output_path, recursive = TRUE) @@ -103,6 +105,8 @@ trainExplore <- function(train_data = NULL, checkLogical(BranchBound), checkString(Sorted), checkLogical(Parallel), + checkString(ParallelMethod), + checkLogical(BinaryReduction), add = errorMessage, combine = "and" ) @@ -113,6 +117,7 @@ trainExplore <- function(train_data = NULL, Subsumption <- ifelse(Subsumption == TRUE, "yes", "no") BranchBound <- ifelse(BranchBound == TRUE, "yes", "no") Parallel <- ifelse(Parallel == TRUE, "yes", "no") + BinaryReduction <- ifelse(BinaryReduction == TRUE, "yes", "no") Accuracy <- ifelse(Accuracy == 0, "", Accuracy) BalancedAccuracy <- ifelse(BalancedAccuracy == 0, "", BalancedAccuracy) Specificity <- ifelse(Specificity == 0, "", Specificity) @@ -182,7 +187,9 @@ trainExplore <- function(train_data = NULL, PrintPerformance = PrintPerformance, Subsumption = Subsumption, BranchBound = BranchBound, - Parallel = Parallel) + Parallel = Parallel, + ParallelMethod = ParallelMethod, + BinaryReduction = BinaryReduction) # Train EXPLORE model # TODO: allow to enter settings file 
instead of path? @@ -243,15 +250,17 @@ settingsExplore <- function(settings, ClassFeature, PositiveClass, FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, PrintSettings = "yes", PrintPerformance = "yes", - Subsumption = "yes", + Subsumption = "no", BranchBound = "yes", - Parallel = "no") { + Parallel = "yes", + ParallelMethod = "ONE", + BinaryReduction = "no") { # Insert location training data and cutoff file if train_data is entered @@ -278,6 +287,8 @@ settingsExplore <- function(settings, settings <- changeSetting(settings, parameter = "Subsumption", input = Subsumption) settings <- changeSetting(settings, parameter = "BranchBound", input = BranchBound) settings <- changeSetting(settings, parameter = "Parallel", input = Parallel) + settings <- changeSetting(settings, parameter = "ParallelMethod", input = ParallelMethod) + settings <- changeSetting(settings, parameter = "BinaryReduction", input = BinaryReduction) # Save settings file settings_path <- paste0(output_path, file_name,".project") diff --git a/inst/settings/template.project b/inst/settings/template.project index 79455862..f31805fc 100755 --- a/inst/settings/template.project +++ b/inst/settings/template.project @@ -13,7 +13,7 @@ EndRulelength=3 LearnRatio=0.8 NumberofPartitions=1 OperatorMethod=EXHAUSTIVE -CutoffMethod=RVAC +CutoffMethod=ALL CutoffFile=@CutoffFile ClassFeature=@ClassFeature PositiveClass=@PositiveClass @@ -22,7 +22,7 @@ Rule= FeatureInclude= FeatureRule= [Constraints] -Maximize=ACCURACY +Maximize=BALANCEDACCURACY Accuracy= BalancedAccuracy= Specificity= @@ -43,8 +43,8 @@ PrintPerformance=yes PrintSets=no SavePartitions=no [Run] -Subsumption=yes +Subsumption=no BranchBound=yes -Parallel=no +Parallel=yes ParallelMethod=ONE BinaryReduction=no From 5f7f94f78c355dceaf9671d363cc40b8bac8b736 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Wed, 24 Jul 2024 09:40:52 +0200 Subject: [PATCH 12/41] Update print 
statements for rashomon ratio --- src/C++/Explore/explore.cpp | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index 22bd7e52..a4920773 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -2929,7 +2929,6 @@ bool Explore::RunProject() { #endif do { - // CountCombinations = 0; CountFeatureOperatorPairs = 0; CountCutoffSets = 0; CountCandidatesPartition = 0; @@ -2953,23 +2952,20 @@ if (!Parallel) { int FOSets_old = 0; while (Rule.NextCombinationGenerator()) { - // cout << "FO pairs: " << CountFeatureOperatorPairs - FOSets_old << endl; - // FOSets_old = CountFeatureOperatorPairs; - - cout << "Cutoff sets: " << CountCutoffSets - FOSets_old << endl; - FOSets_old = CountCutoffSets; + // cout << "FO pairs: " << CountFeatureOperatorPairs - FOSets_old << endl; + // FOSets_old = CountFeatureOperatorPairs; if (IsPrintCombinations) Rule.PrintCombination(); StartTimeTermTuple = clock(); while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { - + // cout << "Cutoff sets: " << CountCutoffSets - FOSets_old << endl; + // FOSets_old = CountCutoffSets; if (IsPrintFeatureSets) Rule.PrintFeatureSet(); CountFeatureOperatorPairs++; - // CalculateProgress(); while (Rule.NextCutoffSetGenerator()) { switch (MaximizeMeasure) { @@ -3034,9 +3030,6 @@ if (!Parallel) { // std::stringstream sstr; // TermTupleTiming.Clear(); // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule.GetCountCandidates() << endl << endl; CountCandidatesPartition += Rule.GetCountCandidates(); Rule.ResetCountCandidates(); } @@ -3095,7 +3088,7 @@ if (!Parallel) { while (Rule_i.NextFeatureSetGenerator(0, Rule_i.GetFeatureOperatorSize())) { if (IsPrintFeatureSets) Rule_i.PrintFeatureSet_Thread(); - // CalculateProgress(); + m0.lock(); 
CountFeatureOperatorPairs++; m0.unlock(); @@ -3176,9 +3169,6 @@ if (!Parallel) { // std::stringstream sstr; // TermTupleTiming.Clear(); // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule_i.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule_i.GetCountCandidates() << endl << endl; m3.lock(); CountCandidatesPartition += Rule_i.GetCountCandidates(); m3.unlock(); @@ -3253,7 +3243,6 @@ if (!Parallel) { m0.lock(); CountFeatureOperatorPairs++; m0.unlock(); - // CalculateProgress(); while (Rule_ij.NextCutoffSetGenerator()) { @@ -3331,9 +3320,6 @@ if (!Parallel) { // std::stringstream sstr; // TermTupleTiming.Clear(); // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); -// Rule_ij.PrintCombination(); -// cout << TermTupleTiming.PrintTotal(); -// cout << "Candidates: " << Rule_ij.GetCountCandidates() << endl << endl; m3.lock(); CountCandidatesPartition += Rule_ij.GetCountCandidates(); m3.unlock(); @@ -3368,8 +3354,7 @@ if (!Parallel) { if ((GetPartitionMethod())==CROSS_VALIDATION || (GetPartitionMethod())==HOLDOUT) { // Re-train model with full train set (learn + validate) Population.ResetTestPartitions(); // Sets all partitions to LEARN - // PartitionCandidates.clear(); // Remove all the partition candidates used to find BestLength - PartitionCandidates.Clear(); + PartitionCandidates.Clear(); // Remove all the partition candidates used to find BestLength SetRerun(); @@ -3399,8 +3384,8 @@ if (!Parallel) { cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; - cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition - // cout << "Total Count Candidates:" << CountCandidatesPartition << endl; + cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints 
(accuracy/sensitivity) + cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; } // } From 48af05f0e8fd5a1025ee374231bc62436d64b6ca Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Wed, 24 Jul 2024 13:42:26 +0200 Subject: [PATCH 13/41] Fixes generation of FO pairs for mixed data with binary reduction #20 --- src/C++/Explore/rule.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index c9c5278c..40bdd51c 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -1728,7 +1728,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (unsigned int j=0; j0) { if (BinaryReduction && Conjunctions[ConjunctionNr].Size==1) { // Simply go to next FeatureOperator, no repeats + + // Unless previous feature is continuous and not also term size 1, then repeat so "go back one" + if (FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[Conjunctions[ConjunctionNr-1].Size-1].FeatureOperator].Operator!=EQUAL + && Conjunctions[ConjunctionNr-1].Size!=1) { + FOperatorNr--; + } + } else if (Conjunctions[ConjunctionNr-1].Size>1) { FOperatorNr=0; - NumRepeats = 0; + NumRepeats=0; } else {//allow multiple occurences of nominal features if (FeatureOperators[Conjunctions[ConjunctionNr - 1].Conditions[0].FeatureOperator].Operator == EQUAL && !BinaryReduction) { @@ -1865,7 +1872,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { Condition = &Conjunctions[ConjunctionNr].Conditions[ConditionNr]; // Save reference to condition - if (Conjunctions[ConjunctionNr].Size>1 ) { + if (Conjunctions[ConjunctionNr].Size>1) { PreviousCondition = &FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[ConditionNr].FeatureOperator]; // AM: copy previous term NumRepeats = 0; } else { @@ -1877,7 +1884,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (i=0; i<=ConjunctionNr-1; i++) { // Go 
through all previous conjunctions (front of rule) PreviousConjunction = &Conjunctions[i]; for (unsigned int j = 0; j < PreviousConjunction->Conditions.size(); j++) { - if (FONext == PreviousConjunction->Conditions[j].FeatureOperator) { + if (FONext == PreviousConjunction->Conditions[j].FeatureOperator && FeatureOperators[ PreviousConjunction->Conditions[j].FeatureOperator].Operator==EQUAL) { FONext++; i=0; j=0; @@ -1933,7 +1940,7 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { for (unsigned int j=0; j Date: Thu, 25 Jul 2024 10:37:37 +0200 Subject: [PATCH 14/41] Change: if new rule same as current best - do not update --- src/C++/Explore/rule.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 40bdd51c..76c48743 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -3615,7 +3615,7 @@ bool RULE::CompareBestCandidate(PERFORMANCE CurrentPerformance, bool Initialised break; } - if (CandidateValue<=RuleValue) { // TODO: why = included here? 
+ if (CandidateValueRuleValue) { + if (CandidateValue>=RuleValue) { #ifdef DEBUG_TIMING End = clock(); ExploreTiming.AddTime("EXPLORE::CompareBestCandidate", Start, End); From bfff55c6a1a00b17a91108dbdca3af7506006b82 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Thu, 25 Jul 2024 10:41:28 +0200 Subject: [PATCH 15/41] Remove PrintCutoffSetsBestLength and TRAIN partition - both not used --- src/C++/Explore/explore.cpp | 36 +------ src/C++/Explore/explore.h | 3 - src/C++/Explore/rule.cpp | 176 -------------------------------- src/C++/Explore/rule.h | 2 - src/C++/IOExplore/IOExplore.cpp | 16 --- src/C++/IOExplore/IOExplore.h | 1 - 6 files changed, 2 insertions(+), 232 deletions(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index a4920773..ef8b40f1 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -232,12 +232,12 @@ void Explore::ValidateBestCandidate() { Start = clock(); #endif - if (Initialised) { // TODO: adjust this to train instead of split validation/learn + if (Initialised) { if (!Final) { Rule.SetTestMode(VALIDATION); } else { - Rule.SetTestMode(LEARN); // TODO: don't need new type train, remove? 
+ Rule.SetTestMode(LEARN); // HERE: VALIDATE is also part of LEARN } // cout << endl << "BEST RULES (" << PartitionCandidates.size() << " candidates)" << endl << endl; @@ -2060,19 +2060,6 @@ bool Explore::GetPrintCutoffSets() { return IsPrintCutoffSets; } -/********************************************************************** -Function: GetPrintCutoffSets() -Category: Selectors -Scope: public -In: - -Out: bool -Description: -**********************************************************************/ -bool Explore::GetPrintCutoffSetsBestLength() { - return IsPrintCutoffSetsBestLength; -} - - /********************************************************************** Function: GetPrintPerformance() Category: Selectors @@ -2489,19 +2476,6 @@ void Explore::SetPrintCutoffSets(bool Setting) { Rule.SetPrintCutoffSets(Setting); } -/********************************************************************** -Function: SetPrintconditionSets() -Category: Modifiers -Scope: public -In: bool, yes or no -Out: - -Description: Indicate whether explore has to cout conditionsets. 
-**********************************************************************/ -void Explore::SetPrintCutoffSetsBestLength(bool Setting) { - IsPrintCutoffSetsBestLength = Setting; - Rule.SetPrintCutoffSetsBestLength(Setting); -} - /********************************************************************** Function: SetPrintPerformance() Category: Modifiers @@ -3461,12 +3435,6 @@ void Explore::Induce(int nStart, int nEnd) { PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); } - - - if (IsPrintCutoffSetsBestLength) { - cout << "Candidate model BestLength: "; - Rule.PrintCutoffSet(); - } #ifndef COMMANDVERSION // BreatheCount++; // Increment breathe counter diff --git a/src/C++/Explore/explore.h b/src/C++/Explore/explore.h index e4ec8e79..183bd59f 100755 --- a/src/C++/Explore/explore.h +++ b/src/C++/Explore/explore.h @@ -107,7 +107,6 @@ class Explore { bool IsPrintCombinations; // Print combinations to output bool IsPrintFeatureSets; // Print featuresets to output bool IsPrintCutoffSets; // Print conditionsets to output - bool IsPrintCutoffSetsBestLength; bool IsPrintPerformance; // Print performance to output bool IsPrintSets; // Print sets to output bool IsPrintOperatorMethod; // Print operator-method information to output @@ -211,7 +210,6 @@ class Explore { bool GetPrintCombinations(); // Should combinations be printed to output bool GetPrintFeatureSets(); // Should featuresets be printed to output bool GetPrintCutoffSets(); // Should conditionsets be printed to output - bool GetPrintCutoffSetsBestLength(); // Should conditionsets be printed to output bool GetPrintPerformance(); // Should performance be printed to output bool GetPrintSets(); // Should sets be printed to output bool GetPrintOperatorMethod(); // Should operator-method information be printed to output @@ -258,7 +256,6 @@ class Explore { void SetPrintCombinations(bool Setting); // Print combinations to output void SetPrintFeatureSets(bool Setting); // Print featuresets to output void 
SetPrintCutoffSets(bool Setting); // Print conditionsets to output - void SetPrintCutoffSetsBestLength(bool Setting); // Print conditionsets to output void SetPrintPerformance(bool Setting); // Print performance to output void SetPrintSets(bool Setting); // Print sets to output void SetPrintOperatorMethod(bool Setting); // Print operator-method information to output diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 76c48743..66a7b470 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -2796,18 +2796,6 @@ void RULE::SetPrintCutoffSets(bool Setting) { IsPrintCutoffSets = Setting; } -/********************************************************************** -Function: SetPrintCutoffSets() -Category: Modifiers -Scope: public -In: bool, yes or no -Out: - -Description: Rule must cout conditionsets that it generates. -**********************************************************************/ -void RULE::SetPrintCutoffSetsBestLength(bool Setting) { - IsPrintCutoffSetsBestLength = Setting; -} - /********************************************************************** Function: SetTestMode() @@ -2830,9 +2818,6 @@ void RULE::SetTestMode(PARTITION_TYPE PType) { case VALIDATION: PartitionClasses = (*Features)[FeatureOperators[j].FeatureNumber].GetValidationClasses(); break; - case TRAIN: // both learn and validation set - PartitionClasses = (*Features)[FeatureOperators[j].FeatureNumber].GetTrainClasses(); - break; } FeatureOperators[j].InitialiseSets(PartitionClasses); } @@ -3685,167 +3670,6 @@ CANDIDATE RULE::SaveCandidate(PERFORMANCE_MEASURE MaximizeMeasure, bool Restrict #endif } -/********************************************************************** -Function: BestLength() -Category: Modifiers -Scope: public -In: - -Out: - -Description: is stop criterium met? 
-**********************************************************************/ -int RULE::FindBestLength(bool Initialised, CANDIDATE PartitionCandidates, PARTITION_METHOD PartitionMethod,PERFORMANCE_MEASURE MaximizeMeasure) { - float best; - float current; - int Opt=0; - - CANDIDATE BestCandidate; - - if (Initialised) { - SetTestMode(VALIDATION); - - if (PartitionCandidates.IsValid()) { - // if (PartitionCandidates.size()>0) { - for (unsigned int i=GetMinRuleLength(); i<=GetMaxRuleLength(); i++){ - BestCandidate = ChooseBestCandidate(i, Initialised, PartitionCandidates, MaximizeMeasure); - - if (BestCandidate.Performance.Accuracy.Value != 0) { // Check if BestCandidate not empty - if (SetRule(BestCandidate)) - { - cout << "RULELENGTH " << i << endl << endl; - cout << "Best candidate (within this partition): "; - PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; - BestCandidate.Performance.Print(); - cout << endl; - - if (!(PartitionMethod==RESUBSTITUTION)){ - BestCandidate.Performance = CalculatePerformance(); // Test BestCandidate on validation partition - cout << "Validation-set: "; - BestCandidate.Performance.Print(); - cout << endl; - } - switch (MaximizeMeasure){ - - case ACCURACY: - current = BestCandidate.Performance.Accuracy.Value; - break; - case SENSITIVITY: - current = BestCandidate.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - current = BestCandidate.Performance.Specificity.Value; - break; - case NPV: - current = BestCandidate.Performance.NPV.Value; - break; - case PPV: - current = BestCandidate.Performance.PPV.Value; - break; - case BALANCEDACCURACY: - current = BestCandidate.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - current = BestCandidate.Performance.F1score.Value; - break; - } - if (i==1) { - best = current; - Opt = 1; - } - else { - if (current > best) { - best = current; - Opt = i; - } - } - } - } - } - return Opt; - } else { -#if defined(EXPLORE_MPI_DEBUG) - cout << "--> No Candidates" << endl; -#endif - } 
- } - return 0; -} - - - -/********************************************************************** -Function: ChooseBestCandidate() -Category: Modifiers -Scope: public -In: insigned int, rule length -Out: - -Description: Retrieves the best candidate and puts it in -BestCandidate. -**********************************************************************/ -CANDIDATE RULE::ChooseBestCandidate(unsigned int RuleLength, bool Initialised, CANDIDATE PartitionCandidates, PERFORMANCE_MEASURE MaximizeMeasure) { -#ifdef DEBUG_TIMING - clock_t Start, End; - Start = clock(); -#endif -// bool Found = false; -// CANDIDATE BestCandidate; -// -// if (Initialised) { -// tbb::concurrent_vector::iterator CurrentCandidate(PartitionCandidates.begin()); -// tbb::concurrent_vector::iterator LastCandidate(PartitionCandidates.end()); -// -// // TODO: check if better place to create variable -// BestCandidate = (*CurrentCandidate); -// -// float CurrentValue; -// float BestValue; -// -// while (CurrentCandidate != LastCandidate) { -// CurrentValue = 0; -// if ((*CurrentCandidate).Features.size()==RuleLength){ -// switch (MaximizeMeasure) { -// case SENSITIVITY: -// CurrentValue = (*CurrentCandidate).Performance.Sensitivity.Value; -// break; -// case SPECIFICITY: -// CurrentValue = (*CurrentCandidate).Performance.Specificity.Value; -// break; -// case NPV: -// CurrentValue = (*CurrentCandidate).Performance.NPV.Value; -// break; -// case PPV: -// CurrentValue = (*CurrentCandidate).Performance.PPV.Value; -// break; -// case ACCURACY: -// CurrentValue = (*CurrentCandidate).Performance.Accuracy.Value; -// break; -// } -// -// if (BestValue<=CurrentValue) { -// BestCandidate = (*CurrentCandidate); -// BestValue = CurrentValue; -// Found = true; -// } -// } -// CurrentCandidate++; -// } -// } -// -//#ifdef DEBUG_TIMING -// End = clock(); -// ExploreTiming.AddTime("EXPLORE::ChooseBestCandidate", Start, End); -//#endif -// -// if (Found) { -// return BestCandidate; -// } else { -// return CANDIDATE(); 
-// } - - return PartitionCandidates; -} - /********************************************************************** Function: GetFeatureOperatorSize() Category: . diff --git a/src/C++/Explore/rule.h b/src/C++/Explore/rule.h index d6a4961d..b2e2d631 100755 --- a/src/C++/Explore/rule.h +++ b/src/C++/Explore/rule.h @@ -86,7 +86,6 @@ class RULE { bool IsPrintCombinations{}; bool IsPrintFeatureSets{}; bool IsPrintCutoffSets{}; - bool IsPrintCutoffSetsBestLength{}; vector ROCCurves; @@ -176,7 +175,6 @@ unsigned int NoFeatureOperators{}; // void SetPrintCombinations(bool Setting); void SetPrintFeatureSets(bool Setting); void SetPrintCutoffSets(bool Setting); - void SetPrintCutoffSetsBestLength(bool Setting); void PrintSettings(); void PrintCombination(); // Print partition information of the rule diff --git a/src/C++/IOExplore/IOExplore.cpp b/src/C++/IOExplore/IOExplore.cpp index f69a7561..aa33e922 100644 --- a/src/C++/IOExplore/IOExplore.cpp +++ b/src/C++/IOExplore/IOExplore.cpp @@ -146,7 +146,6 @@ void IOExplore::ClearSettings() { ProjectSettings.PrintCombinations = false; ProjectSettings.PrintFeatureSets = false; ProjectSettings.PrintCutoffSets = false; - ProjectSettings.PrintCutoffSetsBestLength = false; ProjectSettings.PrintPerformance = false; ProjectSettings.PrintSets = false; ProjectSettings.BranchBound = false; @@ -692,10 +691,6 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { if (Project->GetPrintCutoffSets()) { ProjectSettings.PrintCutoffSets = true; } - ProjectSettings.PrintCutoffSetsBestLength = false; - if (Project->GetPrintCutoffSetsBestLength()) { - ProjectSettings.PrintCutoffSetsBestLength = true; - } ProjectSettings.PrintPerformance = false; if (Project->GetPrintPerformance()) { ProjectSettings.PrintPerformance = true; @@ -1498,16 +1493,6 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { } } - if (CurrentHeading.compare("PrintCutoffSetsBestLength")==0) { // Print cutoffsets - if (CurrentValue.compare("yes")==0) { - 
ProjectSettings.PrintCutoffSetsBestLength = true; - } else if (CurrentValue.compare("no")==0) { - ProjectSettings.PrintCutoffSetsBestLength = false; - } else { - ProjectLoadErrors.push_back("Invalid value for print cutoffsets bestlength."); - return false; - } - } if (CurrentHeading.compare("PrintPerformance")==0) { // Print performance if (CurrentValue.compare("yes")==0) { ProjectSettings.PrintPerformance = true; @@ -1718,7 +1703,6 @@ bool IOExplore::SetupExploreFromStruct() { Project->SetPrintCombinations(ProjectSettings.PrintCombinations); Project->SetPrintFeatureSets(ProjectSettings.PrintFeatureSets); Project->SetPrintCutoffSets(ProjectSettings.PrintCutoffSets); - Project->SetPrintCutoffSetsBestLength(ProjectSettings.PrintCutoffSetsBestLength); Project->SetPrintPerformance(ProjectSettings.PrintPerformance); Project->SetPrintSets(ProjectSettings.PrintSets); Project->SetSavePartitions(ProjectSettings.SavePartitions); diff --git a/src/C++/IOExplore/IOExplore.h b/src/C++/IOExplore/IOExplore.h index bf1b25e5..f1af3942 100755 --- a/src/C++/IOExplore/IOExplore.h +++ b/src/C++/IOExplore/IOExplore.h @@ -64,7 +64,6 @@ struct ExploreSettings { bool PrintCombinations; bool PrintFeatureSets; bool PrintCutoffSets; - bool PrintCutoffSetsBestLength; bool PrintPerformance; bool PrintSets; bool SavePartitions; From faa18aaad5028efe8f2f659ade5b8fd47807d998 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Thu, 25 Jul 2024 10:44:54 +0200 Subject: [PATCH 16/41] Clean up and fixes finding best length using HOLDOUT and CROSS_VALIDATION --- src/C++/Explore/candidate.cpp | 2 + src/C++/Explore/explore.cpp | 863 +++++++++++++++++----------------- src/C++/Explore/rule.cpp | 5 +- 3 files changed, 435 insertions(+), 435 deletions(-) diff --git a/src/C++/Explore/candidate.cpp b/src/C++/Explore/candidate.cpp index 987cd7f2..e79a71ea 100755 --- a/src/C++/Explore/candidate.cpp +++ b/src/C++/Explore/candidate.cpp @@ -40,6 +40,8 @@ void CANDIDATE::Clear() { Performance.Accuracy.Value = 0; 
Performance.NPV.Value = 0; Performance.PPV.Value = 0; + Performance.BalancedAccuracy.Value = 0; + Performance.F1score.Value = 0; } /********************************************************************** diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index ef8b40f1..521f7856 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -240,42 +240,31 @@ void Explore::ValidateBestCandidate() { Rule.SetTestMode(LEARN); // HERE: VALIDATE is also part of LEARN } - // cout << endl << "BEST RULES (" << PartitionCandidates.size() << " candidates)" << endl << endl; - cout << endl << "BEST RULES" << endl << endl; - - if(PartitionCandidates.IsValid()) { + if (PartitionCandidates.IsValid()) { CANDIDATE BestCandidate = PartitionCandidates; - // for (unsigned int i=GetMinRuleLength(); i<=GetMaxRuleLength(); i++){ - cout << "RULELENGTH " << BestCandidate.Size() << endl << endl; - // if (ChooseBestCandidate(i)){ + cout << "Best length: " << BestCandidate.Size() << endl; if (Rule.SetRule(BestCandidate)) { - cout << "Best candidate (overall): "; + cout << "Best candidate: "; Rule.PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; + cout << "Performance learn-set: "; BestCandidate.Performance.Print(); - cout << endl; - if (!(GetPartitionMethod()==RESUBSTITUTION)){ + if (!(GetPartitionMethod()==RESUBSTITUTION) && !Final){ BestCandidate.Performance = Rule.CalculatePerformance(); // Test BestCandidate on validation partition - cout << "Validation-set: "; + cout << "Performance validation-set: "; BestCandidate.Performance.Print(); cout << endl; } ProjectCandidates.push_back(BestCandidate); - } -// } else { -// cout << "None." << endl << endl; -// } - // } - PartitionCandidates.Clear(); } else { #if defined(EXPLORE_MPI_DEBUG) cout << "--> No Candidates" << endl; #endif } + + PartitionCandidates.Clear(); } #ifdef DEBUG_TIMING @@ -1157,7 +1146,7 @@ Out: - Description: Partition the population of explore. 
**********************************************************************/ bool Explore::Partition() { - // ValidateBestCandidate(); // Do not remove! Is needed for summarising best candidates at the end of projects (ie. HOLDOUT) + ValidateBestCandidate(); // Do not remove! Is needed for summarising best candidates at the end of projects (ie. HOLDOUT) if (Population.Partition()) { // Will return false with holdout on second call! SetRerun(); // Reset rule (findcutoffs etc) @@ -2884,14 +2873,18 @@ bool Explore::RunProject() { unsigned int Partitionnr = 0; time_t dummy; unsigned int ActiveRuleLength; + int CountCandidatesPartition; int CountFeatureOperatorPairs; int CountCutoffSets; + + CANDIDATE BestCandidate; int BestLengthPartition; int BestLengthFinal = 0; vector BestLength(Rule.GetMaxRuleLength()); - int CPBest_global = 0; - int CTBest_global = 0; + + int CPBest_global; + int CTBest_global; TIMING TermTupleTiming; clock_t StartTimeTermTuple; @@ -2921,402 +2914,429 @@ bool Explore::RunProject() { if (IsPrintCutoffMethod) Population.PrintCutoffMethod(); if (IsPrintCutoffValues) Population.PrintCutoffs(); -if (!Parallel) { - float CandidatePerformance; - int FOSets_old = 0; - - while (Rule.NextCombinationGenerator()) { - // cout << "FO pairs: " << CountFeatureOperatorPairs - FOSets_old << endl; - // FOSets_old = CountFeatureOperatorPairs; - - if (IsPrintCombinations) Rule.PrintCombination(); - - StartTimeTermTuple = clock(); - - while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { - // cout << "Cutoff sets: " << CountCutoffSets - FOSets_old << endl; - // FOSets_old = CountCutoffSets; - - if (IsPrintFeatureSets) Rule.PrintFeatureSet(); - CountFeatureOperatorPairs++; - - while (Rule.NextCutoffSetGenerator()) { - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - 
case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - - if (Rule.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); - } + if (!Parallel) { + float CandidatePerformance = 0; + int count = 0; + + while (Rule.NextCombinationGenerator()) { + // cout << "FO pairs: " << CountFeatureOperatorPairs - count << endl; + // count = CountFeatureOperatorPairs; + + // cout << "Cutoff sets: " << CountCutoffSets - count << endl; + // count = CountCutoffSets; + + if (IsPrintCombinations) Rule.PrintCombination(); + + StartTimeTermTuple = clock(); + + while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { + + if (IsPrintFeatureSets) Rule.PrintFeatureSet(); + CountFeatureOperatorPairs++; + + while (Rule.NextCutoffSetGenerator()) { + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case 
F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } - // TODO: check if inside or outside TestRule - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule.PrintCutoffSet(); - } + if (Rule.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, RestrictionSet, + RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - CountCutoffSets++; - // if (IsUpdateRealtime) CalculateProgress(); + PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); + } -#ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule.PrintCutoffSet(); + } - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } -#endif + CountCutoffSets++; + // if (IsUpdateRealtime) CalculateProgress(); + + #ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } + #endif + } } - } - -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); - CountCandidatesPartition += Rule.GetCountCandidates(); - Rule.ResetCountCandidates(); - } - -} else { - - if (ParallelMethod == ONE) { - vector all_rules; - while(Rule.NextCombinationGenerator()) { - all_rules.push_back(this->Rule); + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, 
clock()); + CountCandidatesPartition += Rule.GetCountCandidates(); + Rule.ResetCountCandidates(); } - tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { - for (int i = r.begin(); i < r.end(); ++i) { - StartTimeTermTuple = clock(); - - RULE Rule_i = RULE(all_rules[i]); // CREATE DEEP COPY - - float CandidatePerformance; - CANDIDATE PotentialCandidate; - - m2.lock(); - - Rule_i.CPBest = CPBest_global; - Rule_i.CTBest = CTBest_global; - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - m2.unlock(); - - if (IsPrintCombinations) Rule_i.PrintCombination(); - - // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_i, tbb::this_task_arena::current_thread_index()); - - while (Rule_i.NextFeatureSetGenerator(0, Rule_i.GetFeatureOperatorSize())) { - if (IsPrintFeatureSets) Rule_i.PrintFeatureSet_Thread(); - - m0.lock(); - CountFeatureOperatorPairs++; - m0.unlock(); - - while (Rule_i.NextCutoffSetGenerator()) { - - if (Rule_i.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PotentialCandidate = Rule_i.SaveCandidate(MaximizeMeasure, - RestrictionSet); - m2.lock(); - - bool change; - switch (MaximizeMeasure) 
{ - case SENSITIVITY: - change = (PotentialCandidate.Performance.Sensitivity.Value > PartitionCandidates.Performance.Sensitivity.Value); - break; - case SPECIFICITY: - change = (PotentialCandidate.Performance.Specificity.Value > PartitionCandidates.Performance.Specificity.Value); - break; - case NPV: - change = (PotentialCandidate.Performance.NPV.Value > PartitionCandidates.Performance.NPV.Value); - break; - case PPV: - change = (PotentialCandidate.Performance.PPV.Value > PartitionCandidates.Performance.PPV.Value); - break; - case ACCURACY: - change = (PotentialCandidate.Performance.Accuracy.Value > PartitionCandidates.Performance.Accuracy.Value); - break; - case BALANCEDACCURACY: - change = (PotentialCandidate.Performance.BalancedAccuracy.Value > PartitionCandidates.Performance.BalancedAccuracy.Value); - break; - case F1SCORE: - change = (PotentialCandidate.Performance.F1score.Value > PartitionCandidates.Performance.F1score.Value); - break; - } - - if (change) { - PartitionCandidates = PotentialCandidate; - - CPBest_global = Rule_i.CPBest; - CTBest_global = Rule_i.CTBest; - } - m2.unlock(); - } + } else { - // TODO: check if inside or outside TestRule - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule_i.PrintCutoffSet(); - } - m1.lock(); - CountCutoffSets++; - m1.unlock(); - // if (IsUpdateRealtime) CalculateProgress(); + CPBest_global = 0; + CTBest_global = 0; -#ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter - - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } -#endif - } - } + if (ParallelMethod == ONE) { -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); - m3.lock(); - 
CountCandidatesPartition += Rule_i.GetCountCandidates(); - m3.unlock(); + vector all_rules; + while (Rule.NextCombinationGenerator()) { + all_rules.push_back(this->Rule); } - // } - }); - } else if (ParallelMethod == TWO) { - vector all_rules; - while(Rule.NextCombinationGenerator()) { - all_rules.push_back(this->Rule); - } - - tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { - - // for (int i = r.begin(); i < r.end(); ++i) - int i = r.begin(); - { - StartTimeTermTuple = clock(); - - // NOTE: blocked_range uses open interval [start,end) - tbb::parallel_for(tbb::blocked_range(0, Rule.GetFeatureOperatorSize() + 1), [&](tbb::blocked_range s) { - - // for (int j = s.begin(); j < s.end(); j++) - { - int j = s.begin(); - - RULE Rule_ij = RULE(all_rules[i]); // CREATE DEEP COPY - - float CandidatePerformance; - CANDIDATE PotentialCandidate; - - m2.lock(); - - Rule_ij.CPBest = CPBest_global; - Rule_ij.CTBest = CTBest_global; - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - m2.unlock(); - - if (IsPrintCombinations) Rule_ij.PrintCombination(); - - // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_ij, tbb::this_task_arena::current_thread_index()); - - while (Rule_ij.NextFeatureSetGenerator(j, j)) 
{ - // TODO: create function that releases all print statements of one thread at once - - if (IsPrintFeatureSets) Rule_ij.PrintFeatureSet_Thread(); - - m0.lock(); - CountFeatureOperatorPairs++; - m0.unlock(); - - while (Rule_ij.NextCutoffSetGenerator()) { - - if (Rule_ij.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, RestrictionSet); - - m2.lock(); - - bool change; - switch (MaximizeMeasure) { - case SENSITIVITY: - change = (PotentialCandidate.Performance.Sensitivity.Value > PartitionCandidates.Performance.Sensitivity.Value); - break; - case SPECIFICITY: - change = (PotentialCandidate.Performance.Specificity.Value > PartitionCandidates.Performance.Specificity.Value); - break; - case NPV: - change = (PotentialCandidate.Performance.NPV.Value > PartitionCandidates.Performance.NPV.Value); - break; - case PPV: - change = (PotentialCandidate.Performance.PPV.Value > PartitionCandidates.Performance.PPV.Value); - break; - case ACCURACY: - change = (PotentialCandidate.Performance.Accuracy.Value > PartitionCandidates.Performance.Accuracy.Value); - break; - case BALANCEDACCURACY: - change = (PotentialCandidate.Performance.BalancedAccuracy.Value > PartitionCandidates.Performance.BalancedAccuracy.Value); - break; - case F1SCORE: - change = (PotentialCandidate.Performance.F1score.Value > PartitionCandidates.Performance.F1score.Value); - break; - } - - if (change) { - PartitionCandidates = PotentialCandidate; - - CPBest_global = Rule_ij.CPBest; - CTBest_global = Rule_ij.CTBest; - } - m2.unlock(); + tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { + for (int i = r.begin(); i < r.end(); ++i) { + StartTimeTermTuple = clock(); + + RULE Rule_i = RULE(all_rules[i]); // CREATE DEEP COPY + + float CandidatePerformance; + CANDIDATE PotentialCandidate; + + m2.lock(); + + 
Rule_i.CPBest = CPBest_global; + Rule_i.CTBest = CTBest_global; + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + m2.unlock(); + + if (IsPrintCombinations) Rule_i.PrintCombination(); + + // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_i, tbb::this_task_arena::current_thread_index()); + + while (Rule_i.NextFeatureSetGenerator(0, Rule_i.GetFeatureOperatorSize())) { + if (IsPrintFeatureSets) Rule_i.PrintFeatureSet_Thread(); + + m0.lock(); + CountFeatureOperatorPairs++; + m0.unlock(); + + while (Rule_i.NextCutoffSetGenerator()) { + + if (Rule_i.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, RestrictionSet, + RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { + + PotentialCandidate = Rule_i.SaveCandidate(MaximizeMeasure, + RestrictionSet); + m2.lock(); + + bool change; + switch (MaximizeMeasure) { + case SENSITIVITY: + change = (PotentialCandidate.Performance.Sensitivity.Value > + PartitionCandidates.Performance.Sensitivity.Value); + break; + case SPECIFICITY: + change = (PotentialCandidate.Performance.Specificity.Value > + PartitionCandidates.Performance.Specificity.Value); + break; + case NPV: + change = (PotentialCandidate.Performance.NPV.Value > + PartitionCandidates.Performance.NPV.Value); + break; + case 
PPV: + change = (PotentialCandidate.Performance.PPV.Value > + PartitionCandidates.Performance.PPV.Value); + break; + case ACCURACY: + change = (PotentialCandidate.Performance.Accuracy.Value > + PartitionCandidates.Performance.Accuracy.Value); + break; + case BALANCEDACCURACY: + change = (PotentialCandidate.Performance.BalancedAccuracy.Value > + PartitionCandidates.Performance.BalancedAccuracy.Value); + break; + case F1SCORE: + change = (PotentialCandidate.Performance.F1score.Value > + PartitionCandidates.Performance.F1score.Value); + break; } - // TODO: check if inside or outside TestRule - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule_ij.PrintCutoffSet(); + if (change) { + PartitionCandidates = PotentialCandidate; + + CPBest_global = Rule_i.CPBest; + CTBest_global = Rule_i.CTBest; } - m1.lock(); - CountCutoffSets++; - m1.unlock(); - // if (IsUpdateRealtime) CalculateProgress(); + m2.unlock(); + } -#ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter - - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } -#endif + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule_i.PrintCutoffSet(); } + m1.lock(); + CountCutoffSets++; + m1.unlock(); + // if (IsUpdateRealtime) CalculateProgress(); + + #ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } + #endif } - -// std::stringstream sstr; -// TermTupleTiming.Clear(); -// 
TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); - m3.lock(); - CountCandidatesPartition += Rule_ij.GetCountCandidates(); - m3.unlock(); } - }); + + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + m3.lock(); + CountCandidatesPartition += Rule_i.GetCountCandidates(); + m3.unlock(); + } + }); + } else if (ParallelMethod == TWO) { + + vector all_rules; + while (Rule.NextCombinationGenerator()) { + all_rules.push_back(this->Rule); } - }); + tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { + + // for (int i = r.begin(); i < r.end(); ++i) + int i = r.begin(); + { + StartTimeTermTuple = clock(); + + // NOTE: blocked_range uses open interval [start,end) + tbb::parallel_for(tbb::blocked_range(0, Rule.GetFeatureOperatorSize() + 1), + [&](tbb::blocked_range s) { + + // for (int j = s.begin(); j < s.end(); j++) + { + int j = s.begin(); + + RULE Rule_ij = RULE(all_rules[i]); // CREATE DEEP COPY + + float CandidatePerformance; + CANDIDATE PotentialCandidate; + + m2.lock(); + + Rule_ij.CPBest = CPBest_global; + Rule_ij.CTBest = CTBest_global; + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + m2.unlock(); + + if (IsPrintCombinations) 
Rule_ij.PrintCombination(); + + // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_ij, tbb::this_task_arena::current_thread_index()); + + while (Rule_ij.NextFeatureSetGenerator(j, j)) { + // TODO: create function that releases all print statements of one thread at once + + if (IsPrintFeatureSets) Rule_ij.PrintFeatureSet_Thread(); + + m0.lock(); + CountFeatureOperatorPairs++; + m0.unlock(); + + while (Rule_ij.NextCutoffSetGenerator()) { + + if (Rule_ij.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, + RestrictionSet, + RuleOutputMethod, IsPrintPerformance, + IsPrintSets)) { + + PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, + RestrictionSet); + + m2.lock(); + + bool change; + switch (MaximizeMeasure) { + case SENSITIVITY: + change = ( + PotentialCandidate.Performance.Sensitivity.Value > + PartitionCandidates.Performance.Sensitivity.Value); + break; + case SPECIFICITY: + change = ( + PotentialCandidate.Performance.Specificity.Value > + PartitionCandidates.Performance.Specificity.Value); + break; + case NPV: + change = (PotentialCandidate.Performance.NPV.Value > + PartitionCandidates.Performance.NPV.Value); + break; + case PPV: + change = (PotentialCandidate.Performance.PPV.Value > + PartitionCandidates.Performance.PPV.Value); + break; + case ACCURACY: + change = (PotentialCandidate.Performance.Accuracy.Value > + PartitionCandidates.Performance.Accuracy.Value); + break; + case BALANCEDACCURACY: + change = ( + PotentialCandidate.Performance.BalancedAccuracy.Value > + PartitionCandidates.Performance.BalancedAccuracy.Value); + break; + case F1SCORE: + change = (PotentialCandidate.Performance.F1score.Value > + PartitionCandidates.Performance.F1score.Value); + break; + } + + if (change) { + PartitionCandidates = PotentialCandidate; + + CPBest_global = Rule_ij.CPBest; + CTBest_global = Rule_ij.CTBest; + } + m2.unlock(); + } + + if (IsPrintCutoffSets) { // Calculate performance of 
current rule in learn set + cout << "Candidate model: "; + Rule_ij.PrintCutoffSet(); + } + m1.lock(); + CountCutoffSets++; + m1.unlock(); + // if (IsUpdateRealtime) CalculateProgress(); + + #ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } + #endif + } + } + + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + m3.lock(); + CountCandidatesPartition += Rule_ij.GetCountCandidates(); + m3.unlock(); + } + }); + } + }); + + } } -} -// TODO: is "Rule" needed? - BestLengthPartition = PartitionCandidates.Size(); // TODO: for multiple projectcandidates? + if (PartitionCandidates.IsValid()) { + BestCandidate = PartitionCandidates; + BestLengthPartition = BestCandidate.Size(); + +// cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; +// cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; +// cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints (accuracy/sensitivity) +// cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; +// cout << endl; - cout << endl << endl; - cout << "Best Length:" << BestLengthPartition << endl; - cout << "====================================================" << endl; + Rule.SetRule(BestCandidate); // Needed for parallel? 
- if (BestLengthPartition != 0) { - // BestLengthPartition = Rule.FindBestLength(Initialised,PartitionCandidates, PartitionMethod, MaximizeMeasure); - // BestLength[BestLengthPartition - 1] = BestLength[BestLengthPartition - 1] + 1; // Calculate performance of current rule in validation set + // Update counter of best length BestLength.at(BestLengthPartition - 1) = BestLength.at(BestLengthPartition - 1) + 1; + + } else { + BestLengthPartition = 0; } } while (Partition()); @@ -3324,7 +3344,12 @@ if (!Parallel) { auto MostFrequent = std::max_element(BestLength.begin(), BestLength.end()); BestLengthFinal = std::distance(std::begin(BestLength), MostFrequent) + 1; - cout << "Results EXPLORE with BestLength " << BestLengthFinal << " on full train set" << endl; + cout << "====================================================" << endl; + cout << endl; + + cout << "RESULT: full train set" << endl; + cout << endl; + if ((GetPartitionMethod())==CROSS_VALIDATION || (GetPartitionMethod())==HOLDOUT) { // Re-train model with full train set (learn + validate) Population.ResetTestPartitions(); // Sets all partitions to LEARN @@ -3332,53 +3357,30 @@ if (!Parallel) { SetRerun(); - Induce(BestLengthFinal, BestLengthFinal); + Induce(BestLengthFinal, BestLengthFinal); // TODO: need to support running in parallel Final = true; ValidateBestCandidate(); // Print results on full train set and save best rule } else { - -#ifndef PARALLEL // Directly print results on full train set and save best rule - if (PartitionCandidates.IsValid()) { - // if (PartitionCandidates.size()>0) { - CANDIDATE BestCandidate = PartitionCandidates; - // CANDIDATE BestCandidate = Rule.ChooseBestCandidate(BestLengthFinal, Initialised, PartitionCandidates, MaximizeMeasure); - // if (ChooseBestCandidate(BestLengthFinal)){ - if (Rule.SetRule(BestCandidate)) { - cout << "Best candidate (overall): "; - Rule.PrintCutoffSet(); - cout << endl; - cout << "Learn-set: "; - BestCandidate.Performance.Print(); - cout << endl; - - 
ProjectCandidates.push_back(BestCandidate); - - cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; - cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; - cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints (accuracy/sensitivity) - cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; - - } - // } - } -# else - Population.ResetTestPartitions(); // Sets all partitions to LEARN - PartitionCandidates.clear(); // Remove all the partition candidates used to find BestLength - - SetRerun(); - - Induce(BestLengthFinal, BestLengthFinal); - - Final = true; - ValidateBestCandidate(); // Print results on full train set and save best rule - -# endif - + cout << "Best length: " << BestLengthPartition << endl; + if (Rule.SetRule(BestCandidate)) { + cout << "Best candidate: "; + Rule.PrintCutoffSet(); + cout << "Performance learn-set: "; + BestCandidate.Performance.Print(); + + ProjectCandidates.push_back(BestCandidate); + } } + cout << endl; + cout << "Total Count Combinations:" << Rule.GetCombinationsGenerated() << endl; + cout << "Total Count Feature Operator Pairs:" << CountFeatureOperatorPairs << endl; + cout << "Total Count Cutoff Sets:" << CountCutoffSets << endl; // = CountCandidatesPartition with restrictions (mandatory features) without constraints (accuracy/sensitivity) + cout << "Total Count Candidates (incl constraints):" << CountCandidatesPartition << endl; + return true; } @@ -3455,7 +3457,6 @@ void Explore::Induce(int nStart, int nEnd) { } } - cout << "Total Count Candidates:" << Rule.GetCountCandidates() << endl; } diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 66a7b470..1e406b83 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -3448,11 +3448,9 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand 
CountCandidates++; if (CompareBestCandidate(CurrentPerformance, Initialised, CandidatePerformance, MaximizeMeasure)) { - // PartitionCandidates = SaveCandidate(CurrentPerformance, PartitionCandidates, MaximizeMeasure, RestrictionSet); Found = true; } } else { - // PartitionCandidates = SaveCandidate(CurrentPerformance, PartitionCandidates, MaximizeMeasure, RestrictionSet); Candidate = true; CountCandidates++; @@ -3488,8 +3486,7 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand End = clock(); ExploreTiming.AddTime("EXPLORE::TestRule", Start, End); #endif - // TODO: indicate when partition candidates NOT updated or return Candidate instead? - // return Candidate; + return Found; } From 39ca5011f7c2ab90f151aac5453ca2a5f070dfea Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Thu, 1 Aug 2024 11:45:33 +0200 Subject: [PATCH 17/41] Clean model string --- R/MainFunctions.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index c39179ab..57625fc9 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -318,6 +318,10 @@ predictExplore <- function(model, test_data) { return(NULL) } + # Clean string + model <- stringr::str_remove_all(model, '\"') + model <- stringr::str_replace_all(model, "=", "==") # TODO: check here! 
+ # Split string all_terms <- stringr::str_split_fixed(model, "OR", n=Inf) From 7a8a60f9a8d22f94817b55ec0fd735e918f012f9 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 2 Aug 2024 10:18:36 +0200 Subject: [PATCH 18/41] Fix two levels binary features (in case one of values not occuring) --- R/HelperFunctions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 7583cc56..1032c2b8 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -78,7 +78,7 @@ saveData <- function(output_path, train_data, file_name) { # Fix col type for binary data binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1)) - train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], labels=c(0,1))) + train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], levels=c("0","1"), labels=c(0,1))) # Order data (first binary then continuous features) train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++ From bd98d6692a720a551ea703c755f4bdfd26d3fe8b Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Mon, 5 Aug 2024 19:48:21 +0200 Subject: [PATCH 19/41] Bug clean string <= and update defaults in modelsCurveExplore --- R/MainFunctions.R | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 57625fc9..b5d35b55 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -320,7 +320,8 @@ predictExplore <- function(model, test_data) { # Clean string model <- stringr::str_remove_all(model, '\"') - model <- stringr::str_replace_all(model, "=", "==") # TODO: check here! 
+ model <- stringr::str_replace_all(model, "=", "==") + model <- stringr::str_replace_all(model, "<=", "<") # to correct initial case <= -> <== -> <= # Split string all_terms <- stringr::str_split_fixed(model, "OR", n=Inf) @@ -343,7 +344,7 @@ predictExplore <- function(model, test_data) { data_model <- cbind(data_model, as.integer(col==length(all_literals))) } - colnames(data_model) <- all_terms + colnames(data_model) <- all_terms # TODO: CHECK HERE WHY DATA_MODEL NO COLUMNS predictions <- as.integer(rowSums(data_model)>0) return(predictions) @@ -369,19 +370,22 @@ modelsCurveExplore <- function(train_data = NULL, StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, PrintSettings = TRUE, PrintPerformance = TRUE, - Subsumption = TRUE, + Subsumption = FALSE, BranchBound = TRUE, - Parallel = FALSE) { + Sorted = "none", + Parallel = TRUE, + ParallelMethod = "ONE", + BinaryReduction = FALSE) { # TODO: only input required variables? # Range of specificities to check From 75f8e05b3199dd8c843736d27836195a0fcb844c Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 6 Aug 2024 08:45:52 +0200 Subject: [PATCH 20/41] Added phi metric for sorting. 
--- R/HelperFunctions.R | 22 +++++++++++++++++++++- R/MainFunctions.R | 4 +++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 1032c2b8..d0bb575b 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -105,4 +105,24 @@ jaccard <- function(a, b) { intersection = length(intersect(a, b)) union = length(a) + length(b) - intersection return (intersection/union) -} \ No newline at end of file +} + +phi <- function(a, b) { + contingency_tb <- table(a, b) + + r.sum <- rowSums(contingency_tb) + c.sum <- colSums(contingency_tb) + + total <- sum(r.sum) + r.sum <- r.sum/total + c.sum <- c.sum/total + + v <- prod(r.sum, c.sum) + phi <- (contingency_tb[1,1] / total - c.sum[1] * r.sum[1] / sqrt(v)) + names(phi) <- NULL + + return(phi) +} + + + diff --git a/R/MainFunctions.R b/R/MainFunctions.R index b5d35b55..8531ca76 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -151,7 +151,9 @@ trainExplore <- function(train_data = NULL, cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) cor(col, train_data[ClassFeature_]==PositiveClass_, method=Sorted)) } else if (Sorted == "jaccard") { cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) jaccard(col, train_data[ClassFeature_]==PositiveClass_)) - } + } else if (Sorted == "phi") { + cor <- sapply(train_data[, -which(names(train_data) == ClassFeature_)], function(col) phi(col, train_data[ClassFeature_]==PositiveClass_)) + } # else if (Sorted == "LASSO") { # model_lasso <- glmnet::cv.glmnet(x=data.matrix(train_data[, -which(names(train_data) == ClassFeature_)]), y = train_data[ClassFeature_]==PositiveClass_, alpha = 1, lambda = 10^seq(3, -2, by = -.1), maxit=10000000, standardize = TRUE, nfolds = 5, family = "binomial") # coef <- as.matrix(coef(model_lasso, s = "lambda.min")) # get importance From f0af7c40b376c96f539e512ce110ed93b6ae3073 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 9 Aug 
2024 16:19:04 +0200 Subject: [PATCH 21/41] Fixed running with constraints BA and F1 score (compilation flags important!) --- inst/CMakeLists.txt | 9 ++--- src/C++/Explore/explore.cpp | 12 +++--- src/C++/Explore/rule.cpp | 6 +++ src/C++/IOExplore/IOExplore.cpp | 65 ++++++++++++++++----------------- 4 files changed, 48 insertions(+), 44 deletions(-) diff --git a/inst/CMakeLists.txt b/inst/CMakeLists.txt index 16bbc7ee..344a7a34 100644 --- a/inst/CMakeLists.txt +++ b/inst/CMakeLists.txt @@ -38,9 +38,9 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ltbb") # set(CMAKE_PREFIX_PATH "/opt/intel/oneapi/tbb/latest/lib/intel64/gcc4.8") # find_library(TBB_LIB tbb) # find_path(TBB_PATH - # HINTS /opt/intel/oneapi/tbb/latest/include - # NAMES tbb/parallel_for.h) - +# HINTS /opt/intel/oneapi/tbb/latest/include +# NAMES tbb/parallel_for.h) + set(SOURCE_FILES Clion/main.cpp ../src/C++/CMExplore/cmdline.h @@ -93,8 +93,7 @@ set(SOURCE_FILES ../src/C++/IOExplore/IOExplore.h ../src/C++/common.cpp ../src/C++/common.h - ../src/C++/stl.h - ../src/C++/stlpmt.lib) + ../src/C++/stl.h) add_executable(Explore ${SOURCE_FILES}) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index 521f7856..1fc5ca70 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -395,12 +395,12 @@ void Explore::SummarisePerformance() { case ACCURACY: CurrentValue = (*CurrentCandidate).Performance.Accuracy.Value; break; - case BALANCEDACCURACY: - CurrentValue = (*CurrentCandidate).Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CurrentValue = (*CurrentCandidate).Performance.F1score.Value; - break; + case BALANCEDACCURACY: + CurrentValue = (*CurrentCandidate).Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CurrentValue = (*CurrentCandidate).Performance.F1score.Value; + break; } // Calculate minimum performance if (CurrentValue RuleValue) { diff --git a/src/C++/IOExplore/IOExplore.cpp b/src/C++/IOExplore/IOExplore.cpp index aa33e922..e02df908 100644 
--- a/src/C++/IOExplore/IOExplore.cpp +++ b/src/C++/IOExplore/IOExplore.cpp @@ -62,8 +62,8 @@ IOExplore::IOExplore() { Dummy.push_back("Accuracy"); Dummy.push_back("PPV"); Dummy.push_back("NPV"); - Dummy.push_back("BalancedAccuracy"); - Dummy.push_back("F1score"); + Dummy.push_back("BalancedAccuracy"); + Dummy.push_back("F1score"); Sections.push_back(Dummy); Dummy.clear(); @@ -585,13 +585,12 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { case NPV: ProjectSettings.Maximize = NPV; break; - - case BALANCEDACCURACY: - ProjectSettings.Maximize = BALANCEDACCURACY; - break; - case F1SCORE: - ProjectSettings.Maximize = F1SCORE; - break; + case BALANCEDACCURACY: + ProjectSettings.Maximize = BALANCEDACCURACY; + break; + case F1SCORE: + ProjectSettings.Maximize = F1SCORE; + break; } vector Constraints = Project->GetConstraints(); @@ -624,16 +623,16 @@ bool IOExplore::SaveExploreToProject(string IOFilename) { ProjectSettings.Accuracy = (*CurrentConstraint).Value; } break; - case BALANCEDACCURACY: - if ((*CurrentConstraint).Value != 0){ - ProjectSettings.BalancedAccuracy = (*CurrentConstraint).Value; - } - break; - case F1SCORE: - if ((*CurrentConstraint).Value != 0){ - ProjectSettings.F1score = (*CurrentConstraint).Value; - } - break; + case BALANCEDACCURACY: + if ((*CurrentConstraint).Value != 0){ + ProjectSettings.BalancedAccuracy = (*CurrentConstraint).Value; + } + break; + case F1SCORE: + if ((*CurrentConstraint).Value != 0){ + ProjectSettings.F1score = (*CurrentConstraint).Value; + } + break; } } ProjectFile.flush(); @@ -865,12 +864,12 @@ bool IOExplore::SaveSettingsToFile(string IOFilename) { case NPV: ProjectFile << "Maximize=NPV" << endl; break; - case BALANCEDACCURACY: - ProjectFile << "Maximize=BALANCEDACCURACY" << endl; - break; - case F1SCORE: - ProjectFile << "Maximize=F1SCORE" << endl; - break; + case BALANCEDACCURACY: + ProjectFile << "Maximize=BALANCEDACCURACY" << endl; + break; + case F1SCORE: + ProjectFile << "Maximize=F1SCORE" << endl; + break; 
} if (ProjectSettings.Sensitivity>0) { ProjectFile << "Sensitivity=" << ProjectSettings.Sensitivity << endl; @@ -887,12 +886,12 @@ bool IOExplore::SaveSettingsToFile(string IOFilename) { if (ProjectSettings.Accuracy>0) { ProjectFile << "Accuracy=" << ProjectSettings.Accuracy << endl; } - if (ProjectSettings.BalancedAccuracy>0) { - ProjectFile << "BalancedAccuracy=" << ProjectSettings.BalancedAccuracy << endl; - } - if (ProjectSettings.F1score>0) { - ProjectFile << "F1score=" << ProjectSettings.F1score << endl; - } + if (ProjectSettings.BalancedAccuracy>0) { + ProjectFile << "BalancedAccuracy=" << ProjectSettings.BalancedAccuracy << endl; + } + if (ProjectSettings.F1score>0) { + ProjectFile << "F1score=" << ProjectSettings.F1score << endl; + } ProjectFile << "[Output]" << endl; switch (ProjectSettings.OutputMethod) { case EVERY: @@ -1366,7 +1365,7 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { ProjectSettings.BalancedAccuracy = atof(CurrentValue.c_str()); } else { - ProjectLoadErrors.push_back("Invalid value for constraint NPV."); + ProjectLoadErrors.push_back("Invalid value for constraint Balanced Accuracy."); return false; } } @@ -1375,7 +1374,7 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { ProjectSettings.F1score = atof(CurrentValue.c_str()); } else { - ProjectLoadErrors.push_back("Invalid value for constraint NPV."); + ProjectLoadErrors.push_back("Invalid value for constraint F1score."); return false; } } From 5f55e568de4d30b472500573d46c01849516c419 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 9 Aug 2024 16:35:40 +0200 Subject: [PATCH 22/41] Fix printing all candidate rules #23 --- src/C++/Explore/explore.cpp | 1 + src/C++/Explore/rule.cpp | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index 
1fc5ca70..0c562b9f 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -243,6 +243,7 @@ void Explore::ValidateBestCandidate() { if (PartitionCandidates.IsValid()) { CANDIDATE BestCandidate = PartitionCandidates; + cout << endl; cout << "Best length: " << BestCandidate.Size() << endl; if (Rule.SetRule(BestCandidate)) { cout << "Best candidate: "; diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index cd339a4a..3ffb3ee7 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -3461,7 +3461,9 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand switch (RuleOutputMethod) { case EVERY: - PrintCutoffSet(); + if (Candidate) { + PrintCutoffSet(); + } if (IsPrintPerformance) { PrintPerformance(); } From e249518b808ea5ddc5e93da2f462362eaf6254ff Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Mon, 12 Aug 2024 09:06:47 +0200 Subject: [PATCH 23/41] Update default parameters and add OutputMethod --- R/MainFunctions.R | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 8531ca76..fe639a13 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -47,13 +47,14 @@ trainExplore <- function(train_data = NULL, Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - PrintPerformance = TRUE, + PrintPerformance = FALSE, Subsumption = FALSE, BranchBound = TRUE, Sorted = "none", Parallel = TRUE, - ParallelMethod = "ONE", + ParallelMethod = "TWO", BinaryReduction = FALSE) { if (!dir.exists(output_path)) { @@ -99,6 +100,7 @@ trainExplore <- function(train_data = NULL, checkDouble(Accuracy), checkDouble(BalancedAccuracy), checkDouble(Specificity), + checkString(OutputMethod), checkLogical(PrintSettings), checkLogical(PrintPerformance), checkLogical(Subsumption), @@ -185,6 +187,7 @@ trainExplore <- function(train_data = NULL, Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, 
Specificity = Specificity, + OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, Subsumption = Subsumption, @@ -232,6 +235,7 @@ trainExplore <- function(train_data = NULL, #' @param Maximize One of list with strings, list = "ACCURACY", ... #' @param Accuracy Float 0-1 -> default = 0 (if 0, make empty = computationally more beneficial) #' @param Specificity float 0-1, default = 0 +#' @param OutputMethod string EVERY, BEST, INCREMENT #' @param PrintSettings True or False #' @param PrintPerformance True or False #' @param Subsumption True or False @@ -257,11 +261,11 @@ settingsExplore <- function(settings, BalancedAccuracy = 0, Specificity = 0, PrintSettings = "yes", - PrintPerformance = "yes", + PrintPerformance = "no", Subsumption = "no", BranchBound = "yes", Parallel = "yes", - ParallelMethod = "ONE", + ParallelMethod = "TWO", BinaryReduction = "no") { @@ -284,6 +288,7 @@ settingsExplore <- function(settings, settings <- changeSetting(settings, parameter = "Accuracy", input = Accuracy) settings <- changeSetting(settings, parameter = "BalancedAccuracy", input = BalancedAccuracy) settings <- changeSetting(settings, parameter = "Specificity", input = Specificity) + settings <- changeSetting(settings, parameter = "OutputMethod", input = OutputMethod) settings <- changeSetting(settings, parameter = "PrintSettings", input = PrintSettings) settings <- changeSetting(settings, parameter = "PrintPerformance", input = PrintPerformance) settings <- changeSetting(settings, parameter = "Subsumption", input = Subsumption) @@ -380,13 +385,14 @@ modelsCurveExplore <- function(train_data = NULL, Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - PrintPerformance = TRUE, + PrintPerformance = FALSE, Subsumption = FALSE, BranchBound = TRUE, Sorted = "none", Parallel = TRUE, - ParallelMethod = "ONE", + ParallelMethod = "TWO", BinaryReduction = FALSE) { # TODO: only input required variables? 
@@ -407,9 +413,9 @@ modelsCurveExplore <- function(train_data = NULL, ClassFeature = ClassFeature, PositiveClass = PositiveClass, FeatureInclude = FeatureInclude, Maximize = "SENSITIVITY", Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = constraint, - PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, + OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, Subsumption = Subsumption, BranchBound = BranchBound, - Parallel = Parallel) + Parallel = Parallel, ParallelMethod = ParallelMethod) return(model) }) From 1f6d8aa351a4d85c7bc51e613a754b00f59bda24 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Mon, 12 Aug 2024 11:04:41 +0200 Subject: [PATCH 24/41] Fix print constraints when printSettings=yes --- src/C++/Explore/explore.cpp | 30 ++++++++++++++---------------- src/C++/IOExplore/IOExplore.cpp | 29 ++++++++++++++--------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index 0c562b9f..ee0cf880 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -836,12 +836,12 @@ void Explore::PrintConstraints() { case ACCURACY: cout << "Accuracy"; break; - case BALANCEDACCURACY: - cout << "Balanced Accuracy"; - break; - case F1SCORE: - cout << "F1 score"; - break; + case BALANCEDACCURACY: + cout << "Balanced Accuracy"; + break; + case F1SCORE: + cout << "F1 score"; + break; } cout << endl; @@ -862,12 +862,12 @@ void Explore::PrintConstraints() { case ACCURACY: cout << "Accuracy >= "; break; - case BALANCEDACCURACY: - cout << "Balanced Accuracy >= "; - break; - case F1SCORE: - cout << "F1 score >= "; - break; + case BALANCEDACCURACY: + cout << "Balanced Accuracy >= "; + break; + case F1SCORE: + cout << "F1 score >= "; + break; } cout << (*CurrentConstraint).Value << endl; } @@ -1217,10 +1217,8 @@ bool Explore::Initialise() { // Print project settings if (IsPrintSettings) { - PrintSettings(); - } - 
if (IsPrintPerformance) { - PrintConstraints(); + PrintSettings(); + PrintConstraints(); } if (IsPrintFeatureOperators) { // Print FeatureOperators diff --git a/src/C++/IOExplore/IOExplore.cpp b/src/C++/IOExplore/IOExplore.cpp index e02df908..d5d82af3 100644 --- a/src/C++/IOExplore/IOExplore.cpp +++ b/src/C++/IOExplore/IOExplore.cpp @@ -1361,23 +1361,22 @@ bool IOExplore::SetupExploreFromProject(string IOFilename) { return false; } } - if (CurrentHeading.compare("BalancedAccuracy")==0) { // Balanced accuracy constraint - if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { - ProjectSettings.BalancedAccuracy = atof(CurrentValue.c_str()); - } else { - ProjectLoadErrors.push_back("Invalid value for constraint Balanced Accuracy."); - return false; - } + if (CurrentHeading.compare("BalancedAccuracy")==0) { // Balanced accuracy constraint + if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { + ProjectSettings.BalancedAccuracy = atof(CurrentValue.c_str()); + } else { + ProjectLoadErrors.push_back("Invalid value for constraint Balanced Accuracy."); + return false; } - - if (CurrentHeading.compare("F1score")==0) { // F1 score constraint - if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { - ProjectSettings.F1score = atof(CurrentValue.c_str()); - } else { - ProjectLoadErrors.push_back("Invalid value for constraint F1score."); - return false; - } + } + if (CurrentHeading.compare("F1score")==0) { // F1 score constraint + if (atof(CurrentValue.c_str())>0 && atof(CurrentValue.c_str())<1) { + ProjectSettings.F1score = atof(CurrentValue.c_str()); + } else { + ProjectLoadErrors.push_back("Invalid value for constraint F1score."); + return false; } + } // Output Settings if (CurrentHeading.compare("OutputMethod")==0) { // Output method (ALL, INCREMENTAL or BEST) if (CurrentValue.compare("EVERY")==0) { From fff1be3629845de38f6388cd20522c293233acf0 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Mon, 12 Aug 2024 11:05:38 +0200 Subject: 
[PATCH 25/41] Adjust reading in trained model and add function candidatesExplore --- NAMESPACE | 1 + R/MainFunctions.R | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index fc2aaccd..4b6088ce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(candidatesExplore) export(modelsCurveExplore) export(predictExplore) export(rocCurveExplore) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index fe639a13..a84cc323 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -205,10 +205,11 @@ trainExplore <- function(train_data = NULL, results <- paste(readLines(getSetting(settings, "OutputFile", type = "value")), collapse="\n") # Load model - rule_string <- stringr::str_extract(results, "Best candidate \\(overall\\):.*?\u000A") + rule_string <- stringr::str_extract_all(results, "Best candidate:.*?\u000A") + rule_string <- rev(unlist(rule_string))[1] # Select the last match as this is the final candidate (str_extract_all returns a list of length 1, so index the unlisted matches; NA if there is no match) # Clean string - rule_string <- stringr::str_replace(rule_string, "Best candidate \\(overall\\):", "") + rule_string <- stringr::str_replace(rule_string, "Best candidate:", "") rule_string <- stringr::str_replace_all(rule_string, " ", "") rule_string <- stringr::str_replace_all(rule_string, "\\n", "") @@ -260,6 +261,7 @@ settingsExplore <- function(settings, Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = "yes", PrintPerformance = "no", Subsumption = "no", @@ -357,6 +359,22 @@ predictExplore <- function(model, test_data) { return(predictions) } +#' Return the number of candidate rules for EXPLORE +#' @param OutputFile output file = paste0(output_path, file_name, ".result") +#' +#' @export +candidatesExplore <- function(OutputFile) { + + # Read in results file + results <- paste(readLines(OutputFile), collapse="\n") + + num_candidates <- stringr::str_extract_all(results, "Total Count Candidates
\\(incl constraints\\):.*?\u000A")[[1]] + num_candidates <- stringr::str_remove_all(num_candidates, "Total Count Candidates \\(incl constraints\\):") + num_candidates <- stringr::str_replace_all(num_candidates, "\\n", "") + + return(as.numeric(num_candidates)) +} + #' modelsCurveExplore # TODO: update documentation? #' From 27e5c1cbe58a3c505a097ba852d2f68c6079d582 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Wed, 14 Aug 2024 22:16:38 +0200 Subject: [PATCH 26/41] Correction parallel TWO change to blocked_range2d and binaryReduction=yes #24 --- src/C++/Explore/explore.cpp | 425 ++++++++++++++++++------------------ src/C++/Explore/rule.cpp | 14 +- 2 files changed, 215 insertions(+), 224 deletions(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index ee0cf880..ed7dde84 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -13,6 +13,7 @@ #include #include #include +#include std::mutex m0; std::mutex m1; @@ -654,7 +655,7 @@ void Explore::PrintSettings() { cout << "SETTINGS" << endl; cout << endl << "PROJECT" << endl << endl; - cout << "Name: "; + cout << "Name: "; if (strlen(Name.c_str())>0) { cout << Name << endl; } else { @@ -799,7 +800,7 @@ void Explore::PrintPerformance() { if (Initialised) { CurrentPerformance.Print(); } - + #ifdef DEBUG_TIMING End = clock(); ExploreTiming.AddTime("EXPLORE::PrintPerformance", Start, End); @@ -1230,7 +1231,7 @@ bool Explore::Initialise() { End = clock(); ExploreTiming.AddTime("EXPLORE::Initialise", Start, End); #endif - + return Initialised; } @@ -1265,7 +1266,7 @@ bool Explore::LimitedInitialise() { NoPartitionsDone = 0; RulesProcessed = 0; - FeatureSetsProcessed = 0; + FeatureSetsProcessed = 0; //PR ExploreComplexity = RuleComplexity(); // Calculate complexity for progress indication ExploreComplexity = Population.GetNoPartitions()*10; InitialiseCPFP(); @@ -1280,7 +1281,7 @@ bool Explore::LimitedInitialise() { End = clock();
ExploreTiming.AddTime("EXPLORE::Initialise", Start, End); #endif - + return Initialised; } @@ -1404,7 +1405,7 @@ Function: GetSeed() Category: Selectors Scope: public In: - -Out: long double, the seed which is a cast from time_t +Out: long double, the seed which is a cast from time_t Description: Get the seed used to randomize the population. **********************************************************************/ long double Explore::GetSeed() { @@ -1705,7 +1706,7 @@ bool Explore::RemoveCutoff(unsigned int FeatureNumber, string CutoffValue) { Population.RemoveCutoff(FeatureNumber, CutoffValue); // Remove the cutoff return true; } - + return false; } @@ -1731,7 +1732,7 @@ bool Explore::RemoveCutoffRange(unsigned int FeatureNumber) { Function: RemoveFeatureCutoffs() Category: Modifiers Scope: public -In: +In: Out: bool, could/could not remove cutoffs for selected feature Description: Remove all cutoffs belonging to a specific feature. **********************************************************************/ @@ -1740,7 +1741,7 @@ bool Explore::RemoveFeatureCutoffs(unsigned int FeatureNumber) { Population.RemoveFeatureCutoffs(FeatureNumber); return true; } - + return false; } @@ -1909,7 +1910,7 @@ Function: GetOperatorMethod() Category: Selectors Scope: public In: - -Out: OPERATOR_METHOD, the operator method +Out: OPERATOR_METHOD, the operator method Description: Returns the method to determine operators of each feature. 
**********************************************************************/ OPERATOR_METHOD Explore::GetOperatorMethod() { @@ -2616,7 +2617,7 @@ void Explore::Start() { cout << endl << "TIMING" << endl << endl; time(&endtime); cout << "Project end: " << ctime(&endtime) << endl; - + std::stringstream sstr; sstr << "RuleLength:" << Rule.GetMaxRuleLength(); RuleLengthTiming.Clear(); @@ -2930,68 +2931,69 @@ bool Explore::RunProject() { while (Rule.NextFeatureSetGenerator(0, Rule.GetFeatureOperatorSize())) { - if (IsPrintFeatureSets) Rule.PrintFeatureSet(); - CountFeatureOperatorPairs++; - - while (Rule.NextCutoffSetGenerator()) { - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - - if (Rule.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, RestrictionSet, - RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { - - PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); - } - - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule.PrintCutoffSet(); - } + if (IsPrintFeatureSets) Rule.PrintFeatureSet(); + CountFeatureOperatorPairs++; + + while (Rule.NextCutoffSetGenerator()) { + + switch (MaximizeMeasure) { + case SENSITIVITY: + CandidatePerformance = 
PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + + if (Rule.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, RestrictionSet, + RuleOutputMethod, IsPrintPerformance, IsPrintSets)) { + + PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); + } + + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule.PrintCutoffSet(); + } + + CountCutoffSets++; + // if (IsUpdateRealtime) CalculateProgress(); - CountCutoffSets++; - // if (IsUpdateRealtime) CalculateProgress(); - - #ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter - - if (BreatheCount>BREATHE_INTERVAL) { - if (PauseFunction()) { // User paused the project - PrintSummary(); - return false; - } - if (CancelFunction()) { // User cancelled the project - PrintSummary(); - CloseFunction(); - return false; - } - BreatheCount = 0; - } - #endif - } - } +#ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter + + if (BreatheCount>BREATHE_INTERVAL) { + if (PauseFunction()) { // User paused the project + PrintSummary(); + return false; + } + if (CancelFunction()) { // User cancelled the project + PrintSummary(); + CloseFunction(); + return false; + } + BreatheCount = 0; + } +#endif + } + } + // } // std::stringstream sstr; // TermTupleTiming.Clear(); @@ -3156,137 +3158,128 @@ bool 
Explore::RunProject() { all_rules.push_back(this->Rule); } - tbb::parallel_for(tbb::blocked_range(0, Rule.GetCombinationsGenerated()), [&](tbb::blocked_range r) { - - // for (int i = r.begin(); i < r.end(); ++i) - int i = r.begin(); - { - StartTimeTermTuple = clock(); - - // NOTE: blocked_range uses open interval [start,end) - tbb::parallel_for(tbb::blocked_range(0, Rule.GetFeatureOperatorSize() + 1), - [&](tbb::blocked_range s) { - - // for (int j = s.begin(); j < s.end(); j++) - { - int j = s.begin(); - - RULE Rule_ij = RULE(all_rules[i]); // CREATE DEEP COPY - - float CandidatePerformance; - CANDIDATE PotentialCandidate; - - m2.lock(); - - Rule_ij.CPBest = CPBest_global; - Rule_ij.CTBest = CTBest_global; - - switch (MaximizeMeasure) { - case SENSITIVITY: - CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; - break; - case SPECIFICITY: - CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; - break; - case NPV: - CandidatePerformance = PartitionCandidates.Performance.NPV.Value; - break; - case PPV: - CandidatePerformance = PartitionCandidates.Performance.PPV.Value; - break; - case ACCURACY: - CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; - break; - case BALANCEDACCURACY: - CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; - break; - case F1SCORE: - CandidatePerformance = PartitionCandidates.Performance.F1score.Value; - break; - } - m2.unlock(); - - if (IsPrintCombinations) Rule_ij.PrintCombination(); - - // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_ij, tbb::this_task_arena::current_thread_index()); - - while (Rule_ij.NextFeatureSetGenerator(j, j)) { - // TODO: create function that releases all print statements of one thread at once - - if (IsPrintFeatureSets) Rule_ij.PrintFeatureSet_Thread(); - - m0.lock(); - CountFeatureOperatorPairs++; - m0.unlock(); - - while (Rule_ij.NextCutoffSetGenerator()) { - - if 
(Rule_ij.TestRule(Initialised, Constraints, - CandidatePerformance, MaximizeMeasure, - RestrictionSet, - RuleOutputMethod, IsPrintPerformance, - IsPrintSets)) { - - PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, - RestrictionSet); - - m2.lock(); - - bool change; - switch (MaximizeMeasure) { - case SENSITIVITY: - change = ( - PotentialCandidate.Performance.Sensitivity.Value > - PartitionCandidates.Performance.Sensitivity.Value); - break; - case SPECIFICITY: - change = ( - PotentialCandidate.Performance.Specificity.Value > - PartitionCandidates.Performance.Specificity.Value); - break; - case NPV: - change = (PotentialCandidate.Performance.NPV.Value > - PartitionCandidates.Performance.NPV.Value); - break; - case PPV: - change = (PotentialCandidate.Performance.PPV.Value > - PartitionCandidates.Performance.PPV.Value); - break; - case ACCURACY: - change = (PotentialCandidate.Performance.Accuracy.Value > - PartitionCandidates.Performance.Accuracy.Value); - break; - case BALANCEDACCURACY: - change = ( - PotentialCandidate.Performance.BalancedAccuracy.Value > - PartitionCandidates.Performance.BalancedAccuracy.Value); - break; - case F1SCORE: - change = (PotentialCandidate.Performance.F1score.Value > - PartitionCandidates.Performance.F1score.Value); - break; - } - - if (change) { - PartitionCandidates = PotentialCandidate; + tbb::parallel_for(tbb::blocked_range2d(0, Rule.GetCombinationsGenerated(), 0, Rule.GetFeatureOperatorSize()),[all_rules, &CPBest_global, &CTBest_global, this, &CountFeatureOperatorPairs, &CountCutoffSets, &CountCandidatesPartition] (const tbb::blocked_range2d &r) + { + for (int i = r.rows().begin(); i < r.rows().end(); ++i) { + + for (int j = r.cols().begin(); j < r.cols().end(); j++) { + + RULE Rule_ij = RULE(all_rules[i]); // CREATE DEEP COPY + + float CandidatePerformance; + CANDIDATE PotentialCandidate; + + m2.lock(); + + Rule_ij.CPBest = CPBest_global; + Rule_ij.CTBest = CTBest_global; + + switch (MaximizeMeasure) { + case SENSITIVITY: 
+ CandidatePerformance = PartitionCandidates.Performance.Sensitivity.Value; + break; + case SPECIFICITY: + CandidatePerformance = PartitionCandidates.Performance.Specificity.Value; + break; + case NPV: + CandidatePerformance = PartitionCandidates.Performance.NPV.Value; + break; + case PPV: + CandidatePerformance = PartitionCandidates.Performance.PPV.Value; + break; + case ACCURACY: + CandidatePerformance = PartitionCandidates.Performance.Accuracy.Value; + break; + case BALANCEDACCURACY: + CandidatePerformance = PartitionCandidates.Performance.BalancedAccuracy.Value; + break; + case F1SCORE: + CandidatePerformance = PartitionCandidates.Performance.F1score.Value; + break; + } + m2.unlock(); + + if (IsPrintCombinations) Rule_ij.PrintCombination(); + + // printf("Combination %d and feature operators set %d and address %p and thread id %d \n", i, j, &Rule_ij, tbb::this_task_arena::current_thread_index()); + + while (Rule_ij.NextFeatureSetGenerator(j, j)) { + // TODO: create function that releases all print statements of one thread at once + + if (IsPrintFeatureSets) Rule_ij.PrintFeatureSet_Thread(); + + m0.lock(); + CountFeatureOperatorPairs++; + m0.unlock(); + + while (Rule_ij.NextCutoffSetGenerator()) { + + if (Rule_ij.TestRule(Initialised, Constraints, + CandidatePerformance, MaximizeMeasure, + RestrictionSet, + RuleOutputMethod, IsPrintPerformance, + IsPrintSets)) { + + PotentialCandidate = Rule_ij.SaveCandidate(MaximizeMeasure, + RestrictionSet); + + m2.lock(); + + bool change; + switch (MaximizeMeasure) { + case SENSITIVITY: + change = ( + PotentialCandidate.Performance.Sensitivity.Value > + PartitionCandidates.Performance.Sensitivity.Value); + break; + case SPECIFICITY: + change = ( + PotentialCandidate.Performance.Specificity.Value > + PartitionCandidates.Performance.Specificity.Value); + break; + case NPV: + change = (PotentialCandidate.Performance.NPV.Value > + PartitionCandidates.Performance.NPV.Value); + break; + case PPV: + change = 
(PotentialCandidate.Performance.PPV.Value > + PartitionCandidates.Performance.PPV.Value); + break; + case ACCURACY: + change = (PotentialCandidate.Performance.Accuracy.Value > + PartitionCandidates.Performance.Accuracy.Value); + break; + case BALANCEDACCURACY: + change = ( + PotentialCandidate.Performance.BalancedAccuracy.Value > + PartitionCandidates.Performance.BalancedAccuracy.Value); + break; + case F1SCORE: + change = (PotentialCandidate.Performance.F1score.Value > + PartitionCandidates.Performance.F1score.Value); + break; + } + + if (change) { + PartitionCandidates = PotentialCandidate; + + CPBest_global = Rule_ij.CPBest; + CTBest_global = Rule_ij.CTBest; + } + m2.unlock(); + } - CPBest_global = Rule_ij.CPBest; - CTBest_global = Rule_ij.CTBest; - } - m2.unlock(); - } - - if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set - cout << "Candidate model: "; - Rule_ij.PrintCutoffSet(); - } - m1.lock(); - CountCutoffSets++; - m1.unlock(); - // if (IsUpdateRealtime) CalculateProgress(); + if (IsPrintCutoffSets) { // Calculate performance of current rule in learn set + cout << "Candidate model: "; + Rule_ij.PrintCutoffSet(); + } + m1.lock(); + CountCutoffSets++; + m1.unlock(); + // if (IsUpdateRealtime) CalculateProgress(); - #ifndef COMMANDVERSION - // BreatheCount++;// Increment breathe counter +#ifndef COMMANDVERSION + // BreatheCount++;// Increment breathe counter if (BreatheCount>BREATHE_INTERVAL) { if (PauseFunction()) { // User paused the project @@ -3300,22 +3293,22 @@ bool Explore::RunProject() { } BreatheCount = 0; } - #endif - } - } - - // std::stringstream sstr; - // TermTupleTiming.Clear(); - // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); - m3.lock(); - CountCandidatesPartition += Rule_ij.GetCountCandidates(); - m3.unlock(); - } - }); - } - }); +#endif + } + } + + // std::stringstream sstr; + // TermTupleTiming.Clear(); + // TermTupleTiming.AddTime(sstr.str(), StartTimeTermTuple, clock()); + m3.lock(); + 
CountCandidatesPartition += Rule_ij.GetCountCandidates(); + m3.unlock(); + + } + } + }); } } @@ -3436,7 +3429,7 @@ void Explore::Induce(int nStart, int nEnd) { PartitionCandidates = Rule.SaveCandidate(MaximizeMeasure, RestrictionSet); } - + #ifndef COMMANDVERSION // BreatheCount++; // Increment breathe counter if (BreatheCount>BREATHE_INTERVAL) { diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 3ffb3ee7..79127029 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -1670,9 +1670,6 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { Start = clock(); #endif - // int FOperatorNr_start = 1; - // int FOperatorNr_end = FOperatorNr_start+1; - // Counters as reference int ConjunctionSize, ConjunctionNr, ConditionNr, FOperatorNr, MaxFOperator; CONDITION* Condition; @@ -1783,8 +1780,8 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { } else { // AM: Algorithm 6 Incremented = true; // First FeatureSet generated for current Combination - // FOperatorNr = 0; FOperatorNr = FOperatorNr_start; + // Iterate through conjunctions (from front to back) for (ConjunctionNr=0; ConjunctionNr<=(int)Conjunctions.size()-1; ConjunctionNr++) { @@ -1793,9 +1790,11 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { if (BinaryReduction && Conjunctions[ConjunctionNr].Size==1) { // Simply go to next FeatureOperator, no repeats - // Unless previous feature is continuous and not also term size 1, then repeat so "go back one" - if (FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[Conjunctions[ConjunctionNr-1].Size-1].FeatureOperator].Operator!=EQUAL - && Conjunctions[ConjunctionNr-1].Size!=1) { + if (FOperatorNr_start > 0 && Conjunctions[ConjunctionNr-1].Size!=1) { // For Parallel = TWO when starting with higher FOperatorNr + FOperatorNr=0; + NumRepeats=0; + } else if 
(FeatureOperators[Conjunctions[ConjunctionNr-1].Conditions[Conjunctions[ConjunctionNr-1].Size-1].FeatureOperator].Operator!=EQUAL + && Conjunctions[ConjunctionNr-1].Size!=1) { // Unless previous feature is continuous and not also term size 1, then repeat so "go back one" FOperatorNr--; } @@ -1926,7 +1925,6 @@ bool RULE::NextFeatureSet(int FOperatorNr_start, int FOperatorNr_end) { } - // Not the conjunction at which we started, conjunction sizes do not match if (ConjunctionNr!=StartConjunctionNr && Conjunctions[ConjunctionNr].Size!=Conjunctions[ConjunctionNr-1].Size) { From e654541a20e36a0f36c7cb7993e7c0e9adc849ec Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Wed, 28 Aug 2024 11:51:02 +0200 Subject: [PATCH 27/41] Change to allow larger # rule counts --- src/C++/Explore/explore.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/C++/Explore/explore.cpp b/src/C++/Explore/explore.cpp index ed7dde84..10e370a5 100755 --- a/src/C++/Explore/explore.cpp +++ b/src/C++/Explore/explore.cpp @@ -2874,9 +2874,9 @@ bool Explore::RunProject() { time_t dummy; unsigned int ActiveRuleLength; - int CountCandidatesPartition; - int CountFeatureOperatorPairs; - int CountCutoffSets; + unsigned int CountCandidatesPartition; + unsigned int CountFeatureOperatorPairs; + unsigned int CountCutoffSets; CANDIDATE BestCandidate; int BestLengthPartition; From fae9e6ebee0f838aca93c14ab9b2c42ab80f9759 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Fri, 30 Aug 2024 15:14:39 +0200 Subject: [PATCH 28/41] Add export candidateModelsExplore --- NAMESPACE | 3 ++- R/MainFunctions.R | 21 ++++++++++++++++++++- src/C++/Explore/rule.cpp | 1 + 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4b6088ce..44c592ef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand -export(candidatesExplore) +export(candidateModelsExplore) +export(candidateNumberExplore) export(modelsCurveExplore) 
export(predictExplore) export(rocCurveExplore) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index a84cc323..a749544b 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -363,7 +363,7 @@ predictExplore <- function(model, test_data) { #' @param OutputFile output file = paste0(output_path, file_name, ".result") #' #' @export -candidatesExplore <- function(OutputFile) { +candidateNumberExplore <- function(OutputFile) { # Read in results file results <- paste(readLines(OutputFile), collapse="\n") @@ -376,6 +376,25 @@ candidatesExplore <- function(OutputFile) { } +#' Return the generated candidate rules for EXPLORE +#' @param OutputFile output file = paste0(output_path, file_name, ".result") +#' @return Character vector with every line of the results file that contains "Candidate model:" +#' @export +candidateModelsExplore <- function(OutputFile) { + + # TODO: this function requires running EXPLORE with OutputMethod = EVERY -> check this in results file first? + + # Read in results file as one newline-joined string + results <- paste(readLines(OutputFile), collapse="\n") + + cand_models_lines <- strsplit(results, "\n") + candidate_models <- grep("Candidate model:", unlist(cand_models_lines), value = TRUE) + # value = TRUE keeps the matching lines themselves; length(candidate_models) is the number of candidate models found + + return(candidate_models) +} + + #' modelsCurveExplore # TODO: update documentation?
#' #' @param output_path A string declaring the path to the settings diff --git a/src/C++/Explore/rule.cpp b/src/C++/Explore/rule.cpp index 79127029..c4599571 100755 --- a/src/C++/Explore/rule.cpp +++ b/src/C++/Explore/rule.cpp @@ -3460,6 +3460,7 @@ bool RULE::TestRule(bool Initialised, vector Constraints, float Cand switch (RuleOutputMethod) { case EVERY: if (Candidate) { + cout << "Candidate model: "; PrintCutoffSet(); } if (IsPrintPerformance) { From 4969782be2f0c647e1429f3cb052811e532214ef Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 3 Sep 2024 14:07:37 +0200 Subject: [PATCH 29/41] Updates parameters in .project file + added test mandatory feature and BA constraint --- DESCRIPTION | 2 +- inst/examples/complexity/binary_10.project | 9 ++- inst/examples/complexity/binary_3.project | 7 +-- .../examples/complexity/categorical_4.project | 9 ++- inst/examples/complexity/continuous_4.project | 9 ++- inst/examples/complexity/mix_4.project | 11 ++-- inst/examples/plp/test_plp.project | 1 - inst/examples/test.project | 1 - inst/examples/tests/iris.project | 1 - inst/examples/train_data.project | 1 - inst/settings/template.project | 5 +- man/modelsCurveExplore.Rd | 14 +++-- man/settingsExplore.Rd | 13 +++-- man/trainExplore.Rd | 18 ++++-- tests/testthat/test-MainFunctions.R | 57 +++++++++++++++++++ 15 files changed, 111 insertions(+), 47 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b0333419..5bafd3d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,7 @@ Imports: pracma Encoding: UTF-8 LinkingTo: Rcpp, BH (>= 1.51.0), RcppParallel -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Suggests: testthat (>= 3.0.0), knitr, diff --git a/inst/examples/complexity/binary_10.project b/inst/examples/complexity/binary_10.project index 6119d60a..551c3420 100644 --- a/inst/examples/complexity/binary_10.project +++ b/inst/examples/complexity/binary_10.project @@ -8,7 +8,7 @@ IncrementalOutputFile=false [Setup] PartitionMethod=RESUBSTITUTION Randomize=no 
-StartRulelength=3 +StartRulelength=1 EndRulelength=3 LearnRatio=0.8 NumberofPartitions=1 @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/binary_3.project b/inst/examples/complexity/binary_3.project index 4719a91d..65d2bfd4 100644 --- a/inst/examples/complexity/binary_3.project +++ b/inst/examples/complexity/binary_3.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/categorical_4.project b/inst/examples/complexity/categorical_4.project index 725de6b0..b4416cfd 100644 --- a/inst/examples/complexity/categorical_4.project +++ b/inst/examples/complexity/categorical_4.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes -PrintCutoffSets=yes -PrintCutOffsetsBestLength=no +PrintCombinations=no +PrintFeatureSets=no +PrintCutoffSets=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/continuous_4.project b/inst/examples/complexity/continuous_4.project index 
d6b6618c..b70e7214 100644 --- a/inst/examples/complexity/continuous_4.project +++ b/inst/examples/complexity/continuous_4.project @@ -34,10 +34,9 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes -PrintCutoffSets=yes -PrintCutOffsetsBestLength=no +PrintCombinations=no +PrintFeatureSets=no +PrintCutoffSets=no PrintPerformance=yes PrintSets=no SavePartitions=no @@ -45,6 +44,6 @@ SavePartitions=no Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/complexity/mix_4.project b/inst/examples/complexity/mix_4.project index 15cde8ac..3ec382c4 100644 --- a/inst/examples/complexity/mix_4.project +++ b/inst/examples/complexity/mix_4.project @@ -25,8 +25,9 @@ FeatureRule= Maximize=BALANCEDACCURACY Accuracy= Specificity= +BalancedAccuracy=0.6268 [Output] -OutputMethod=BEST +OutputMethod=EVERY PrintSettings=yes PrintPartitions=no PrintFeatureOperators=no @@ -34,17 +35,17 @@ PrintCutoffMethod=no PrintCutoffValues=no PrintOperatorMethod=no PrintOperatorValues=no -PrintCombinations=yes -PrintFeatureSets=yes +PrintCombinations=no +PrintFeatureSets=no PrintCutoffSets=no PrintCutOffsetsBestLength=no -PrintPerformance=yes +PrintPerformance=no PrintSets=no SavePartitions=no [Run] Subsumption=no BranchBound=no Parallel=no -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/inst/examples/plp/test_plp.project b/inst/examples/plp/test_plp.project index bb3a5862..cb7a0a6b 100644 --- a/inst/examples/plp/test_plp.project +++ b/inst/examples/plp/test_plp.project @@ -38,7 +38,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/examples/test.project b/inst/examples/test.project index 03f0fbbf..94832eda 100755 --- a/inst/examples/test.project +++ b/inst/examples/test.project @@ -39,7 
+39,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/examples/tests/iris.project b/inst/examples/tests/iris.project index 5dc4d38d..fd2b6aad 100644 --- a/inst/examples/tests/iris.project +++ b/inst/examples/tests/iris.project @@ -38,7 +38,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/examples/train_data.project b/inst/examples/train_data.project index e1ce687f..1d1f2b4e 100644 --- a/inst/examples/train_data.project +++ b/inst/examples/train_data.project @@ -37,7 +37,6 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no PrintPerformance=yes PrintSets=no SavePartitions=no diff --git a/inst/settings/template.project b/inst/settings/template.project index f31805fc..fa612feb 100755 --- a/inst/settings/template.project +++ b/inst/settings/template.project @@ -38,13 +38,12 @@ PrintOperatorValues=no PrintCombinations=no PrintFeatureSets=no PrintCutoffSets=no -PrintCutOffsetsBestLength=no -PrintPerformance=yes +PrintPerformance=no PrintSets=no SavePartitions=no [Run] Subsumption=no BranchBound=yes Parallel=yes -ParallelMethod=ONE +ParallelMethod=TWO BinaryReduction=no diff --git a/man/modelsCurveExplore.Rd b/man/modelsCurveExplore.Rd index 0f0b79c1..767ec77e 100644 --- a/man/modelsCurveExplore.Rd +++ b/man/modelsCurveExplore.Rd @@ -13,19 +13,23 @@ modelsCurveExplore( StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - 
PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, - Parallel = FALSE + Sorted = "none", + Parallel = TRUE, + ParallelMethod = "TWO", + BinaryReduction = FALSE ) } \arguments{ diff --git a/man/settingsExplore.Rd b/man/settingsExplore.Rd index a423f8c0..886a4c0a 100644 --- a/man/settingsExplore.Rd +++ b/man/settingsExplore.Rd @@ -17,15 +17,18 @@ settingsExplore( ClassFeature, PositiveClass, FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = "yes", - PrintPerformance = "yes", - Subsumption = "yes", + PrintPerformance = "no", + Subsumption = "no", BranchBound = "yes", - Parallel = "no" + Parallel = "yes", + ParallelMethod = "TWO", + BinaryReduction = "no" ) } \arguments{ @@ -59,6 +62,8 @@ settingsExplore( \item{Specificity}{float 0-1, default = 0} +\item{OutputMethod}{string EVERY, BEST, INCREMENT} + \item{PrintSettings}{True or False} \item{PrintPerformance}{True or False} diff --git a/man/trainExplore.Rd b/man/trainExplore.Rd index 05e73e3b..5ace43ec 100644 --- a/man/trainExplore.Rd +++ b/man/trainExplore.Rd @@ -13,19 +13,23 @@ trainExplore( StartRulelength = 1, EndRulelength = 3, OperatorMethod = "EXHAUSTIVE", - CutoffMethod = "RVAC", + CutoffMethod = "ALL", ClassFeature = "'class'", PositiveClass = "'Iris-versicolor'", FeatureInclude = "", - Maximize = "ACCURACY", + Maximize = "BALANCEDACCURACY", Accuracy = 0, BalancedAccuracy = 0, Specificity = 0, + OutputMethod = "BEST", PrintSettings = TRUE, - PrintPerformance = TRUE, - Subsumption = TRUE, + PrintPerformance = FALSE, + Subsumption = FALSE, BranchBound = TRUE, - Parallel = FALSE + Sorted = "none", + Parallel = TRUE, + ParallelMethod = "TWO", + BinaryReduction = FALSE ) } \arguments{ @@ -51,7 +55,7 @@ trainExplore( \item{PositiveClass}{1 or string (?) (should be one of elements of column 'ClassFeature' in data train). 
Always provided by the user. The string should be enclused in single quotation marks, e.g. 'class'} -\item{FeatureInclude}{Empty or string (should be name of one of columns in data train)} +\item{FeatureInclude}{Empty or string (should be name of one or more columns in data train separated by ;)} \item{Maximize}{One of list with strings, list = "ACCURACY", "SENSITIVITY", "SPECIFICITY", ...} @@ -69,6 +73,8 @@ trainExplore( \item{BranchBound}{True or False} +\item{Sorted}{One of list with strings, e.g. "none", "jaccard", ... Sort features based on correlation with outcome variable, NOTE: only when train_data is entered} + \item{Parallel}{True or False} } \value{ diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 8dc85a25..02dce8b7 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -110,3 +110,60 @@ test_that("compute AUC", { expect_true(auroc < 100) expect_true(auroc > 0) }) + +test_that("mandatory features", { + ### Tests for EXPLORE using iris dataset + data_path <- system.file("examples", "tests", "iris.arff", package = "Explore") + settings_path <- system.file("examples", "tests", "iris.project", package = "Explore") + output_path <- paste0(tempdir(), "/", "Test1") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + output_path <- paste0(output_path, "/") + data <- farff::readARFF(data_path) + model <- Explore::trainExplore(output_path = output_path, + file_name = "iris", + train_data = data, + ClassFeature = "'class'", + PositiveClass = '"Iris-versicolor"', + FeatureInclude = "'sepalwidth';'sepallength'") + expect_equal(class(model), "character") + # expect_true(is.na(model), info = "Test failed because model is NA") + expect_equal(model, "'sepallength'>4.9AND'sepalwidth'<=3.2AND'petalwidth'<=1.7") +}) + +test_that("balanced accuracy constraint ", { + data_path <- system.file("examples", "complexity", 
"mix_4.arff", package = "Explore") + output_path <- paste0(getwd(), "/", "Test1") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + output_path <- paste0(output_path, "/") + + data <- farff::readARFF(data_path) + data <-as.data.frame(apply(data,2,as.numeric)) + + model_without <- Explore::trainExplore(output_path = output_path, + file_name = "mix_4", + train_data = data, + StartRulelength = 3, + ClassFeature = "'outcomeCount'", + PositiveClass = '"1"') + num_without <- Explore::candidatesExplore(paste0(output_path, "mix_4", ".result")) + + model_with <- Explore::trainExplore(output_path = output_path, + file_name = "mix_4", + train_data = data, + StartRulelength = 3, + ClassFeature = "'outcomeCount'", + PositiveClass = '"1"', + BalancedAccuracy = 0.6, + OutputMethod = "EVERY", + Parallel = FALSE) + num_with <- Explore::candidatesExplore(paste0(output_path, "mix_4", ".result")) + + expect_equal(num_without, 1940) + expect_equal(num_with, 36) +}) \ No newline at end of file From eb8caee94af372caca22cd212770f2a650793773 Mon Sep 17 00:00:00 2001 From: aniekmarkus Date: Tue, 3 Sep 2024 16:16:08 +0200 Subject: [PATCH 30/41] Print rules only once --- R/testExplore.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/testExplore.R b/R/testExplore.R index cc722bd3..a0e9859e 100644 --- a/R/testExplore.R +++ b/R/testExplore.R @@ -40,7 +40,7 @@ testExplore <- function(dataset = "iris", StartRulelength = 2, EndRulelength = 2 Specificity <- 0 PrintSettings <- TRUE PrintPerformance <- TRUE - PrintCutoffSets <- TRUE + PrintCutoffSets <- FALSE Subsumption <- FALSE BranchBound <- FALSE Sorted <- "none" From e95d4709d9e81c3c67adc91d884f59665bdf4d7f Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 24 Sep 2024 14:27:19 +0200 Subject: [PATCH 31/41] unary function change name --- src/C++/Explore/set.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git 
a/src/C++/Explore/set.cpp b/src/C++/Explore/set.cpp index 7e19301b..ff88daa1 100755 --- a/src/C++/Explore/set.cpp +++ b/src/C++/Explore/set.cpp @@ -423,16 +423,16 @@ string SET::PrintPerformance() { return Result.str(); } -struct AndJibu : public std ::__unary_function +struct AndJibu : public std ::unary_function { - const boost::dynamic_bitset<> Source; - boost::dynamic_bitset<> Dest; - AndJibu(const boost::dynamic_bitset<> Source,boost::dynamic_bitset<> Dest) : - Source(Source), Dest(Dest){} - void operator()(int x) - { - Dest[x]=Dest[x] & Source[x]; - } + const boost::dynamic_bitset<> Source; + boost::dynamic_bitset<> Dest; + AndJibu(const boost::dynamic_bitset<> Source,boost::dynamic_bitset<> Dest) : + Source(Source), Dest(Dest){} + void operator()(int x) + { + Dest[x]=Dest[x] & Source[x]; + } }; /********************************************************************** From 1611326b56277d078625992c8a47156d0e60ab1c Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Mon, 30 Sep 2024 13:39:22 +0200 Subject: [PATCH 32/41] output method --- R/MainFunctions.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index f3d33b26..1ac9108c 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -191,7 +191,6 @@ trainExplore <- function(train_data = NULL, Accuracy = Accuracy, BalancedAccuracy = BalancedAccuracy, Specificity = Specificity, - OutputMethod = OutputMethod, PrintSettings = PrintSettings, PrintPerformance = PrintPerformance, PrintCutoffSets = PrintCutoffSets, From 8a1ee3efc90eca4c7a82be83aa50acfa6579d900 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Mon, 30 Sep 2024 15:02:05 +0200 Subject: [PATCH 33/41] convert_logical function --- R/HelperFunctions.R | 18 +++++++++++++++++- R/testExplore.R | 5 ++++- tests/testthat/test-HelperFunctions.R | 17 +++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/test-HelperFunctions.R diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R 
index d0bb575b..2d99ba77 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -78,7 +78,9 @@ saveData <- function(output_path, train_data, file_name) { # Fix col type for binary data binary_cols <- sapply(1:ncol(train_data), function(c) all(train_data[[c]] %in% 0:1)) - train_data[binary_cols] <- lapply(colnames(train_data[binary_cols]), function(c) factor(train_data[[c]], levels=c("0","1"), labels=c(0,1))) + + # Convert TRUE/FALSE to 1/0 + train_data <- convert_logical(train_data) # Order data (first binary then continuous features) train_data <- cbind(train_data[binary_cols],train_data[!binary_cols]) # Order needed for correct functioning of main algorithm in C++ @@ -100,6 +102,20 @@ saveData <- function(output_path, train_data, file_name) { # TODO: Support other file formats? } +convert_logical <- function(train_data) { + + binary_cols <- sapply(train_data, function(col) all(col %in% c(0, 1, TRUE, FALSE))) + + # Convert TRUE/FALSE to 1/0 and create factors + train_data[binary_cols] <- lapply(train_data[binary_cols], function(col) { + col <- as.numeric(as.logical(col)) # Convert TRUE/FALSE to 1/0 + factor(col, levels = c(0, 1), labels = c(0, 1)) # Convert to factors + }) + + return(train_data) + +} + # Correlation metric for binary data. 
jaccard <- function(a, b) { intersection = length(intersect(a, b)) diff --git a/R/testExplore.R b/R/testExplore.R index 872d7e3b..054892b6 100644 --- a/R/testExplore.R +++ b/R/testExplore.R @@ -1,4 +1,7 @@ -testExplore <- function(dataset = "iris", StartRulelength = 2, EndRulelength = 2, BinaryReduction = FALSE) { +testExplore <- function(dataset = "iris", + StartRulelength = 2, + EndRulelength = 2, + BinaryReduction = FALSE) { # dataset = "iris" # dataset = "binary_3" # dataset = "binary_10" diff --git a/tests/testthat/test-HelperFunctions.R b/tests/testthat/test-HelperFunctions.R new file mode 100644 index 00000000..f68a96a8 --- /dev/null +++ b/tests/testthat/test-HelperFunctions.R @@ -0,0 +1,17 @@ +test_that("Convert logical to 0/1", { + train_data <- data.frame(check.names = FALSE, + outcomeCount = c(FALSE,FALSE,FALSE, + FALSE,FALSE,TRUE), + `198124209` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,TRUE), + `316139209` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,FALSE), + `316139210` = c(FALSE,FALSE,FALSE, + FALSE,FALSE,FALSE) + ) + + train_data <- convert_logical(train_data) + + expect_true(all(sapply(train_data, function(col) all(col %in% c(0, 1))))) + +}) From 76e595f56cc30b7169550c7349020808ca33f811 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Mon, 30 Sep 2024 16:06:57 +0200 Subject: [PATCH 34/41] resultsExplore and test --- R/MainFunctions.R | 28 +++++++++++++++-- tests/testthat/test-MainFunctions.R | 49 ++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 1ac9108c..776a27f8 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -226,8 +226,8 @@ trainExplore <- function(train_data = NULL, results <- list("model" = rule_string, - "candidate_models" = candidate_models, - "cutoff_sets" = cutoff_sets) + "candidate_models" = candidate_models, + "cutoff_sets" = cutoff_sets) result <- results[resultType] @@ -379,6 +379,30 @@ predictExplore <- function(model, test_data) { 
return(predictions) } +#' Return a set of results from EXPLORE output file +#' @param outputFile outputfile = paste0(output_path, file_name, ".result") +#' +#' @export +resultsExplore <- function(outputFile) { + + # Read in results file + results <- paste(readLines(OutputFile), collapse="\n") + results_lines <- strsplit(results, "\n") %>% unlist() + + result <- list() + + for (line in results_lines) { + if (grepl(":", line)) { + split_line <- strsplit(line, ":")[[1]] + key <- trimws(split_line[1]) + value <- trimws(split_line[2]) + result[[key]] <- value + } + } + + return(result) +} + #' Return the number of candidate rules for EXPLORE #' @param OutputFile output file = paste0(output_path, file_name, ".result") #' diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 02dce8b7..2c8f1755 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -166,4 +166,51 @@ test_that("balanced accuracy constraint ", { expect_equal(num_without, 1940) expect_equal(num_with, 36) -}) \ No newline at end of file +}) + +test_that("Results Explore", { + + dataset <- "binary_3" + config <- getDataSetPath(dataset = dataset) + + ### Tests for EXPLORE using iris dataset + train_data <- farff::readARFF(config$data_path) + output_path <- paste0(tempdir(), "/", glue::glue("{getRandomId()}"), "/") + file_name <- paste0(dataset, "_train_data") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + + result <- trainExplore(train_data = train_data, + settings_path = NULL, + output_path = output_path, + file_name = file_name, + OutputFile = NULL, + StartRulelength = StartRulelength, + EndRulelength = EndRulelength, + OperatorMethod = "EXHAUSTIVE", + CutoffMethod = "RVAC", + ClassFeature = config$class_feature, + PositiveClass = config$positive_class, + FeatureInclude = "", + Maximize = "ACCURACY", + Accuracy = 0, + BalancedAccuracy = 0, + Specificity = 0, + 
PrintSettings = TRUE, + PrintPerformance = TRUE, + Subsumption = TRUE, + BranchBound = TRUE, + Parallel = FALSE, + PrintCutoffSets = TRUE, + Sorted = "none", + OutputMethod = "EVERY", + BinaryReduction = FALSE) + + + outputFile <- paste0(output_path, file_name, ".result") + results_list <- resultsExplore(outputFile = outputFile) + expect_equal(results_list$`Total Count Cutoff Sets`, "13") + +}) From 7b0abf96357ed672b43da0d8301618915f4ade98 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 13:07:48 +0200 Subject: [PATCH 35/41] bug fix outputfile typo --- R/MainFunctions.R | 2 +- tests/testthat/test-MainFunctions.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 776a27f8..17405825 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -386,7 +386,7 @@ predictExplore <- function(model, test_data) { resultsExplore <- function(outputFile) { # Read in results file - results <- paste(readLines(OutputFile), collapse="\n") + results <- paste(readLines(outputFile), collapse="\n") results_lines <- strsplit(results, "\n") %>% unlist() result <- list() diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 2c8f1755..81994e0c 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -187,8 +187,8 @@ test_that("Results Explore", { output_path = output_path, file_name = file_name, OutputFile = NULL, - StartRulelength = StartRulelength, - EndRulelength = EndRulelength, + StartRulelength = 1, + EndRulelength = 2, OperatorMethod = "EXHAUSTIVE", CutoffMethod = "RVAC", ClassFeature = config$class_feature, From 83e6083fda0b456442384a2710f17bd1c55baee9 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 13:14:02 +0200 Subject: [PATCH 36/41] test corrections --- tests/testthat/test-MainFunctions.R | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git 
a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 81994e0c..7dcb7c67 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -186,31 +186,18 @@ test_that("Results Explore", { settings_path = NULL, output_path = output_path, file_name = file_name, - OutputFile = NULL, StartRulelength = 1, EndRulelength = 2, - OperatorMethod = "EXHAUSTIVE", CutoffMethod = "RVAC", - ClassFeature = config$class_feature, - PositiveClass = config$positive_class, - FeatureInclude = "", + ClassFeature = "'outcomeCount'", + PositiveClass = "\"1\"", Maximize = "ACCURACY", - Accuracy = 0, - BalancedAccuracy = 0, - Specificity = 0, - PrintSettings = TRUE, PrintPerformance = TRUE, - Subsumption = TRUE, - BranchBound = TRUE, - Parallel = FALSE, - PrintCutoffSets = TRUE, - Sorted = "none", - OutputMethod = "EVERY", - BinaryReduction = FALSE) + Subsumption = TRUE) outputFile <- paste0(output_path, file_name, ".result") results_list <- resultsExplore(outputFile = outputFile) - expect_equal(results_list$`Total Count Cutoff Sets`, "13") + expect_equal(results_list$`Total Count Cutoff Sets`, "16") }) From f3cfe0a49f979bc8947baca1856428d78b04b5e1 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 13:14:51 +0200 Subject: [PATCH 37/41] test correction --- tests/testthat/test-MainFunctions.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index 7dcb7c67..c68fef3f 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -172,8 +172,6 @@ test_that("Results Explore", { dataset <- "binary_3" config <- getDataSetPath(dataset = dataset) - - ### Tests for EXPLORE using iris dataset train_data <- farff::readARFF(config$data_path) output_path <- paste0(tempdir(), "/", glue::glue("{getRandomId()}"), "/") file_name <- paste0(dataset, "_train_data") From 47a3237bbc593d771f9d6d65b63e08f74f8d9b1c Mon Sep 17 00:00:00 2001 
From: Cesar Barboza Date: Tue, 1 Oct 2024 13:15:57 +0200 Subject: [PATCH 38/41] test correction --- tests/testthat/test-HelperFunctions.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/testthat/test-HelperFunctions.R b/tests/testthat/test-HelperFunctions.R index f68a96a8..3d27d8d7 100644 --- a/tests/testthat/test-HelperFunctions.R +++ b/tests/testthat/test-HelperFunctions.R @@ -7,11 +7,7 @@ test_that("Convert logical to 0/1", { `316139209` = c(FALSE,FALSE,FALSE, FALSE,FALSE,FALSE), `316139210` = c(FALSE,FALSE,FALSE, - FALSE,FALSE,FALSE) - ) - + FALSE,FALSE,FALSE)) train_data <- convert_logical(train_data) - expect_true(all(sapply(train_data, function(col) all(col %in% c(0, 1))))) - }) From 53e065ef28a94ca0860df90faa55bc60f2ab2598 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 13:32:24 +0200 Subject: [PATCH 39/41] keys in lower and no space, instead underscore and candidate model has all models --- R/MainFunctions.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/R/MainFunctions.R b/R/MainFunctions.R index 17405825..17ff331d 100755 --- a/R/MainFunctions.R +++ b/R/MainFunctions.R @@ -392,11 +392,19 @@ resultsExplore <- function(outputFile) { result <- list() for (line in results_lines) { + # line <- "Candidate model: '198124209' = \"0\"" if (grepl(":", line)) { - split_line <- strsplit(line, ":")[[1]] - key <- trimws(split_line[1]) - value <- trimws(split_line[2]) - result[[key]] <- value + if (grepl("Candidate model", line)) { + split_line <- strsplit(line, ":")[[1]] + key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) + value <- trimws(split_line[2]) + result[[key]] <- c(result[[key]], value) + } else { + split_line <- strsplit(line, ":")[[1]] + key <- trimws(split_line[1]) %>% tolower() %>% gsub(" ", "_", .) 
+ value <- trimws(split_line[2]) + result[[key]] <- value + } } } From c17f82d94c928f488621e955ad6d198e8e697f30 Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 15:05:07 +0200 Subject: [PATCH 40/41] test using train and resultsExplore --- tests/testthat/test-MainFunctions.R | 3 +- tests/testthat/test-testExplore.R | 45 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-MainFunctions.R b/tests/testthat/test-MainFunctions.R index c68fef3f..98f3ddf4 100644 --- a/tests/testthat/test-MainFunctions.R +++ b/tests/testthat/test-MainFunctions.R @@ -196,6 +196,7 @@ test_that("Results Explore", { outputFile <- paste0(output_path, file_name, ".result") results_list <- resultsExplore(outputFile = outputFile) - expect_equal(results_list$`Total Count Cutoff Sets`, "16") + expect_equal(results_list$total_count_cutoff_sets, "16") + expect_length(results_list$candidate_model, 32) }) diff --git a/tests/testthat/test-testExplore.R b/tests/testthat/test-testExplore.R index cb0dd91b..367a7f15 100644 --- a/tests/testthat/test-testExplore.R +++ b/tests/testthat/test-testExplore.R @@ -1,3 +1,48 @@ +test_that("binary_3 trainExplore", { + + dataset <- "binary_3" + config <- getDataSetPath(dataset = dataset) + train_data <- farff::readARFF(config$data_path) + output_path <- paste0(tempdir(), "/", glue::glue("{getRandomId()}"), "/") + file_name <- paste0(dataset, "_train_data") + dir.create(output_path) + if (.Platform$OS.type == "windows") { + output_path <- gsub("\\\\", "/", output_path) + } + + result <- trainExplore(train_data = train_data, + settings_path = NULL, + output_path = output_path, + file_name = file_name, + OutputFile = NULL, + StartRulelength = 1, + EndRulelength = 1, + OperatorMethod = "MEDIAN", + CutoffMethod = "RVAC", + ClassFeature = config$class_feature, + PositiveClass = config$positive_class, + FeatureInclude = "", + Maximize = "ACCURACY", + Accuracy = 0, + BalancedAccuracy = 0, + Specificity 
= 0, + PrintSettings = TRUE, + PrintPerformance = TRUE, + Subsumption = TRUE, + BranchBound = TRUE, + Parallel = FALSE, + PrintCutoffSets = TRUE, + Sorted = "none", + OutputMethod = "EVERY", + BinaryReduction = FALSE) + + outputFile <- paste0(output_path, file_name, ".result") + results_list <- resultsExplore(outputFile = outputFile) + expect_length(results_list$candidate_model, 6) + unlink(output_path, recursive = TRUE) + +}) + test_that("Test binary_3", { # Binary reduction FALSE From 695d13f215b2b6845277f8408104572bdf65cf7d Mon Sep 17 00:00:00 2001 From: Cesar Barboza Date: Tue, 1 Oct 2024 16:16:07 +0200 Subject: [PATCH 41/41] trainExplore --- tests/testthat/test-testExplore.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-testExplore.R b/tests/testthat/test-testExplore.R index 367a7f15..69dc0963 100644 --- a/tests/testthat/test-testExplore.R +++ b/tests/testthat/test-testExplore.R @@ -1,4 +1,4 @@ -test_that("binary_3 trainExplore", { +test_that("binary_3 trainExplore resultsExplore", { dataset <- "binary_3" config <- getDataSetPath(dataset = dataset)