Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
* Improve runtime and memory usage training deep trees for classification and
regression. (See {ml-pull}1340[#1340].)
* Improvement in handling large inference model definitions. (See {ml-pull}1349[#1349].)
* Calculate total feature importance as a new result type. (See {ml-pull}1387[#1387].)

=== Bug Fixes

Expand Down
1 change: 1 addition & 0 deletions include/api/CDataFrameTrainBoostedTreeRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class API_EXPORT CDataFrameTrainBoostedTreeRunner : public CDataFrameAnalysisRun
static const std::string FEATURE_NAME_FIELD_NAME;
static const std::string IMPORTANCE_FIELD_NAME;
static const std::string FEATURE_IMPORTANCE_FIELD_NAME;
static const std::string TOTAL_FEATURE_IMPORTANCE_FIELD_NAME;

public:
~CDataFrameTrainBoostedTreeRunner() override;
Expand Down
2 changes: 2 additions & 0 deletions include/maths/CTreeShapFeatureImportance.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
//! Get the maximum depth of any tree in \p forest.
static std::size_t depth(const TTreeVec& forest);

const TStrVec& columnNames() const;

private:
//! Collects the elements of the path through decision tree that are updated together
struct SPathElement {
Expand Down
37 changes: 36 additions & 1 deletion lib/api/CDataFrameTrainBoostedTreeClassifierRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <maths/CBoostedTreeLoss.h>
#include <maths/CDataFramePredictiveModel.h>
#include <maths/CDataFrameUtils.h>
#include <maths/CLinearAlgebraEigen.h>
#include <maths/COrderings.h>
#include <maths/CTools.h>
#include <maths/CTreeShapFeatureImportance.h>
Expand All @@ -27,6 +28,7 @@
#include <memory>
#include <numeric>
#include <set>
#include <unordered_map>

namespace ml {
namespace api {
Expand Down Expand Up @@ -162,6 +164,9 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
}

if (featureImportance != nullptr) {
using TVector = maths::CDenseVector<double>;
using TTotalShapValues = std::unordered_map<std::size_t, TVector>;
TTotalShapValues totalShapValues;
int numberClasses{static_cast<int>(classValues.size())};
featureImportance->shap(
row, [&](const maths::CTreeShapFeatureImportance::TSizeVec& indices,
Expand All @@ -182,14 +187,44 @@ void CDataFrameTrainBoostedTreeClassifierRunner::writeOneRow(
writer.Key(classValues[j]);
writer.Double(shap[i](j));
}
writer.Key(CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME);
writer.Key(IMPORTANCE_FIELD_NAME);
writer.Double(shap[i].lpNorm<1>());
}
writer.EndObject();
}
}
writer.EndArray();

for (std::size_t i = 0; i < shap.size(); ++i) {
if (shap[i].lpNorm<1>() != 0) {
if (totalShapValues.find(i) != totalShapValues.end()) {
totalShapValues[i] += shap[i].cwiseAbs();
} else {
totalShapValues[i] = shap[i].cwiseAbs();
}
}
}
});
writer.Key(TOTAL_FEATURE_IMPORTANCE_FIELD_NAME);
writer.StartArray();
for (const auto& item : totalShapValues) {
writer.StartObject();
writer.Key(FEATURE_NAME_FIELD_NAME);
writer.String(featureImportance->columnNames()[item.first]);
if (item.second.size() == 1) {
writer.Key(IMPORTANCE_FIELD_NAME);
writer.Double(item.second(0));
} else {
for (int j = 0; j < item.second.size() && j < numberClasses; ++j) {
writer.Key(classValues[j]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will not work for storage in ES. This index stores the information for all trained models and indexing the class names for the feature importances will not scale.

I propose this format:

{
	"feature_name": "c4",
	"importance": 0.4810469375580312,
	"class_importance": [
		{
			"class_name": "foo",
			"importance": 0.24052346877901588
		},
		{
			"class_name": "baz",
			"importance": 0.19615020783390645
		},
		{
			"class_name": "bar",
			"importance": 0.04437326094510882
		}
	]
}

class_importance will be a nested data type that allows aggregations and searches for specific models and classnames.

writer.Double(item.second(j));
}
writer.Key(IMPORTANCE_FIELD_NAME);
writer.Double(item.second.lpNorm<1>());
}
writer.EndObject();
}
writer.EndArray();
}
writer.EndObject();
}
Expand Down
33 changes: 30 additions & 3 deletions lib/api/CDataFrameTrainBoostedTreeRegressionRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <maths/CBoostedTreeFactory.h>
#include <maths/CBoostedTreeLoss.h>
#include <maths/CDataFrameUtils.h>
#include <maths/CLinearAlgebraEigen.h>
#include <maths/CTreeShapFeatureImportance.h>

#include <api/CBoostedTreeInferenceModelBuilder.h>
Expand All @@ -24,6 +25,7 @@
#include <memory>
#include <set>
#include <string>
#include <unordered_map>

namespace ml {
namespace api {
Expand Down Expand Up @@ -109,10 +111,14 @@ void CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow(
writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false);
auto featureImportance = tree.shap();
if (featureImportance != nullptr) {
using TVector = maths::CDenseVector<double>;
using TTotalShapValues = std::unordered_map<std::size_t, TVector>;
TTotalShapValues totalShapValues;
featureImportance->shap(
row, [&writer](const maths::CTreeShapFeatureImportance::TSizeVec& indices,
const TStrVec& featureNames,
const maths::CTreeShapFeatureImportance::TVectorVec& shap) {
row, [&writer, &totalShapValues](
const maths::CTreeShapFeatureImportance::TSizeVec& indices,
const TStrVec& featureNames,
const maths::CTreeShapFeatureImportance::TVectorVec& shap) {
writer.Key(FEATURE_IMPORTANCE_FIELD_NAME);
writer.StartArray();
for (auto i : indices) {
Expand All @@ -126,7 +132,28 @@ void CDataFrameTrainBoostedTreeRegressionRunner::writeOneRow(
}
}
writer.EndArray();

for (int i = 0; i < shap.size(); ++i) {
if (shap[i].lpNorm<1>() != 0) {
if (totalShapValues.find(i) != totalShapValues.end()) {
totalShapValues[i] += shap[i].cwiseAbs();
} else {
totalShapValues[i] = shap[i].cwiseAbs();
}
}
}
});
writer.Key(TOTAL_FEATURE_IMPORTANCE_FIELD_NAME);
writer.StartArray();
for (const auto& item : totalShapValues) {
writer.StartObject();
writer.Key(FEATURE_NAME_FIELD_NAME);
writer.String(featureImportance->columnNames()[item.first]);
writer.Key(IMPORTANCE_FIELD_NAME);
writer.Double(item.second[0]);
writer.EndObject();
}
writer.EndArray();
}
writer.EndObject();
}
Expand Down
1 change: 1 addition & 0 deletions lib/api/CDataFrameTrainBoostedTreeRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ const std::string CDataFrameTrainBoostedTreeRunner::IS_TRAINING_FIELD_NAME{"is_t
const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_NAME_FIELD_NAME{"feature_name"};
const std::string CDataFrameTrainBoostedTreeRunner::IMPORTANCE_FIELD_NAME{"importance"};
const std::string CDataFrameTrainBoostedTreeRunner::FEATURE_IMPORTANCE_FIELD_NAME{"feature_importance"};
const std::string CDataFrameTrainBoostedTreeRunner::TOTAL_FEATURE_IMPORTANCE_FIELD_NAME{"total_feature_importance"};
// clang-format on
}
}
18 changes: 17 additions & 1 deletion lib/api/unittest/CDataFrameAnalyzerFeatureImportanceTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) {

TMeanVarAccumulator bias;
double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0};
bool hasTotalFeatureImportance{false};
for (const auto& result : results.GetArray()) {
if (result.HasMember("row_results")) {
double c1{readShapValue(result, "c1")};
Expand All @@ -456,6 +457,9 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) {
c4Sum += std::fabs(c4);
// assert that no SHAP value for the dependent variable is returned
BOOST_REQUIRE_EQUAL(readShapValue(result, "target"), 0.0);
if (result["row_results"]["results"]["ml"].HasMember("total_feature_importance")) {
hasTotalFeatureImportance = true;
}
}
}

Expand All @@ -471,6 +475,7 @@ BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceAllShap, SFixture) {
BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 5.0); // c3 and c4 within 5% of each other
// make sure the local approximation differs from the prediction always by the same bias (up to a numeric error)
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6);
BOOST_TEST_REQUIRE(hasTotalFeatureImportance);
}

BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoImportance, SFixture) {
Expand Down Expand Up @@ -510,6 +515,7 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) {
auto results{runBinaryClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})};

double c1Sum{0.0}, c2Sum{0.0}, c3Sum{0.0}, c4Sum{0.0};
bool hasTotalFeatureImportance{false};
for (const auto& result : results.GetArray()) {
if (result.HasMember("row_results")) {
double c1{readShapValue(result, "c1")};
Expand All @@ -536,6 +542,10 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) {
c2Sum += std::fabs(c2);
c3Sum += std::fabs(c3);
c4Sum += std::fabs(c4);

if (result["row_results"]["results"]["ml"].HasMember("total_feature_importance")) {
hasTotalFeatureImportance = true;
}
}
}

Expand All @@ -548,13 +558,14 @@ BOOST_FIXTURE_TEST_CASE(testClassificationFeatureImportanceAllShap, SFixture) {
BOOST_REQUIRE_CLOSE(c3Sum, c4Sum, 40.0); // c3 and c4 within 40% of each other
// make sure the local approximation differs from the prediction always by the same bias (up to a numeric error)
BOOST_REQUIRE_SMALL(maths::CBasicStatistics::variance(bias), 1e-6);
BOOST_TEST_REQUIRE(hasTotalFeatureImportance);
}

BOOST_FIXTURE_TEST_CASE(testMultiClassClassificationFeatureImportanceAllShap, SFixture) {

std::size_t topShapValues{4};
auto results{runMultiClassClassification(topShapValues, {0.5, -0.7, 0.2, -0.2})};

bool hasTotalFeatureImportance{false};
for (const auto& result : results.GetArray()) {
if (result.HasMember("row_results")) {
double c1{readShapValue(result, "c1")};
Expand Down Expand Up @@ -584,8 +595,13 @@ BOOST_FIXTURE_TEST_CASE(testMultiClassClassificationFeatureImportanceAllShap, SF
double c4bar{readShapValue(result, "c4", "bar")};
double c4baz{readShapValue(result, "c4", "baz")};
BOOST_REQUIRE_CLOSE(c4, std::abs(c4f) + std::abs(c4bar) + std::abs(c4baz), 1e-6);

if (result["row_results"]["results"]["ml"].HasMember("total_feature_importance")) {
hasTotalFeatureImportance = true;
}
}
}
BOOST_TEST_REQUIRE(hasTotalFeatureImportance);
}

BOOST_FIXTURE_TEST_CASE(testRegressionFeatureImportanceNoShap, SFixture) {
Expand Down
4 changes: 4 additions & 0 deletions lib/maths/CTreeShapFeatureImportance.cc
Original file line number Diff line number Diff line change
Expand Up @@ -362,5 +362,9 @@ void CTreeShapFeatureImportance::unwindPath(CSplitPath& path, int pathIndex, int
}
--nextIndex;
}

//! \return The names of the feature columns, indexed consistently with the
//! SHAP vectors this object produces (used by callers to map a column index
//! from the SHAP callback back to a human-readable feature name).
const CTreeShapFeatureImportance::TStrVec& CTreeShapFeatureImportance::columnNames() const {
    return this->m_ColumnNames;
}
}
}