From d7d93fe2bd90c4356dbab9100f6a4537bd3585a7 Mon Sep 17 00:00:00 2001 From: Smirnov Danil Date: Mon, 17 Aug 2020 17:11:13 +0300 Subject: [PATCH] Fix java export for huge models --- m2cgen/interpreters/java/code_generator.py | 10 +++++ m2cgen/interpreters/java/interpreter.py | 3 +- m2cgen/interpreters/mixins.py | 47 +++++++++++++++++++--- m2cgen/interpreters/r/code_generator.py | 6 +++ m2cgen/interpreters/r/interpreter.py | 2 +- m2cgen/interpreters/utils.py | 5 +++ tests/e2e/test_e2e.py | 22 +++++++++- 7 files changed, 87 insertions(+), 8 deletions(-) diff --git a/m2cgen/interpreters/java/code_generator.py b/m2cgen/interpreters/java/code_generator.py index 6f7f17bc..6aa2727b 100644 --- a/m2cgen/interpreters/java/code_generator.py +++ b/m2cgen/interpreters/java/code_generator.py @@ -32,6 +32,16 @@ def class_definition(self, class_name): yield self.add_block_termination() + @contextlib.contextmanager + def module_definition(self, module_name): + self.add_class_def(module_name, modifier="private static") + yield + self.add_block_termination() + + def module_function_invocation(self, module_name, function_name, *args): + invocation_code = self.function_invocation(function_name, *args) + return f"{module_name}.{invocation_code}" + @contextlib.contextmanager def method_definition(self, name, args, is_vector_output, modifier="public"): diff --git a/m2cgen/interpreters/java/interpreter.py b/m2cgen/interpreters/java/interpreter.py index 1951a42d..df8be55a 100644 --- a/m2cgen/interpreters/java/interpreter.py +++ b/m2cgen/interpreters/java/interpreter.py @@ -15,6 +15,7 @@ class JavaInterpreter(ImperativeToCodeInterpreter, # to adjustments in future. ast_size_check_frequency = 100 ast_size_per_subroutine_threshold = 4600 + subroutine_per_group_threshold = 15 supported_bin_vector_ops = { ast.BinNumOpType.ADD: "addVectors", @@ -55,7 +56,7 @@ def interpret(self, expr): # Since we use SubroutinesMixin, we already have logic # of adding methods. We create first subroutine for incoming # expression and call `process_subroutine_queue` method. - self.enqueue_subroutine(self.function_name, expr) + self.enqueue_subroutine(self.function_name, 0, expr) self.process_subroutine_queue(top_cg) if self.with_linear_algebra: diff --git a/m2cgen/interpreters/mixins.py b/m2cgen/interpreters/mixins.py index 83e9f6ce..05299f2c 100644 --- a/m2cgen/interpreters/mixins.py +++ b/m2cgen/interpreters/mixins.py @@ -4,6 +4,7 @@ from m2cgen import ast from m2cgen.interpreters.interpreter import BaseToCodeInterpreter +from m2cgen.interpreters.utils import chunks class BinExpressionDepthTrackingMixin(BaseToCodeInterpreter): @@ -90,7 +91,7 @@ def interpret_bin_vector_num_expr(self, expr, extra_func_args=(), *extra_func_args) -Subroutine = namedtuple('Subroutine', ['name', 'expr']) +Subroutine = namedtuple('Subroutine', ['name', 'idx', 'expr']) class SubroutinesMixin(BaseToCodeInterpreter): @@ -103,6 +104,8 @@ class SubroutinesMixin(BaseToCodeInterpreter): Their code generators should implement 3 methods: - function_definition; - function_invocation; + - module_definition; + - module_function_invocation; - add_return_statement. Interpreter should prepare at least one subroutine using method @@ -113,6 +116,7 @@ class SubroutinesMixin(BaseToCodeInterpreter): # disabled by default ast_size_check_frequency = sys.maxsize ast_size_per_subroutine_threshold = sys.maxsize + subroutine_per_group_threshold = sys.maxsize def __init__(self, *args, **kwargs): self._subroutine_idx = 0 @@ -125,15 +129,33 @@ def process_subroutine_queue(self, top_code_generator): subroutine queue. """ self._subroutine_idx = 0 + subroutines = [] - while len(self.subroutine_expr_queue): + while self.subroutine_expr_queue: self._reset_reused_expr_cache() subroutine = self.subroutine_expr_queue.pop(0) subroutine_code = self._process_subroutine(subroutine) + subroutines.append((subroutine, subroutine_code)) + + subroutines.sort(key=lambda subroutine: subroutine[0].idx) + + groups = chunks(subroutines, self.subroutine_per_group_threshold) + for _, subroutine_code in next(groups): top_code_generator.add_code_lines(subroutine_code) - def enqueue_subroutine(self, name, expr): - self.subroutine_expr_queue.append(Subroutine(name, expr)) + for index, subroutine_group in enumerate(groups): + cg = self.create_code_generator() + + with cg.module_definition( + module_name=self._format_group_name(index + 1)): + for _, subroutine_code in subroutine_group: + cg.add_code_lines(subroutine_code) + + top_code_generator.add_code_lines( + cg.finalize_and_get_generated_code()) + + def enqueue_subroutine(self, name, idx, expr): + self.subroutine_expr_queue.append(Subroutine(name, idx, expr)) def _pre_interpret_hook(self, expr, ast_size_check_counter=0, **kwargs): if isinstance(expr, ast.BinExpr) and not expr.to_reuse: @@ -147,7 +169,18 @@ def _pre_interpret_hook(self, expr, ast_size_check_counter=0, **kwargs): ast_size = ast.count_exprs(expr) if ast_size > self.ast_size_per_subroutine_threshold: function_name = self._get_subroutine_name() - self.enqueue_subroutine(function_name, expr) + + self.enqueue_subroutine( + function_name, self._subroutine_idx, expr) + + group_idx = (self._subroutine_idx // + self.subroutine_per_group_threshold) + if group_idx != 0: + return self._cg.module_function_invocation( + self._format_group_name(group_idx), + function_name, + self._feature_array_name), kwargs + return self._cg.function_invocation( function_name, self._feature_array_name), kwargs @@ -194,6 +227,10 @@ def _get_subroutine_name(self): self._subroutine_idx += 1 return subroutine_name + @staticmethod + def _format_group_name(group_idx): + return f"SubroutineGroup{group_idx}" + # Methods to be implemented by subclasses. def create_code_generator(self): diff --git a/m2cgen/interpreters/r/code_generator.py b/m2cgen/interpreters/r/code_generator.py index 0f7464ca..d80ce5d5 100644 --- a/m2cgen/interpreters/r/code_generator.py +++ b/m2cgen/interpreters/r/code_generator.py @@ -29,3 +29,9 @@ def array_index_access(self, array_name, index): def vector_init(self, values): return f"c({', '.join(values)})" + + def module_definition(self, module_name): + raise NotImplementedError("Modules in r is not supported") + + def module_function_invocation(self, module_name, function_name, *args): + raise NotImplementedError("Modules in r is not supported") diff --git a/m2cgen/interpreters/r/interpreter.py b/m2cgen/interpreters/r/interpreter.py index c82fdf2b..0cf09d17 100644 --- a/m2cgen/interpreters/r/interpreter.py +++ b/m2cgen/interpreters/r/interpreter.py @@ -38,7 +38,7 @@ def __init__(self, indent=4, function_name="score", *args, **kwargs): def interpret(self, expr): top_cg = self.create_code_generator() - self.enqueue_subroutine(self.function_name, expr) + self.enqueue_subroutine(self.function_name, 0, expr) self.process_subroutine_queue(top_cg) return top_cg.finalize_and_get_generated_code() diff --git a/m2cgen/interpreters/utils.py b/m2cgen/interpreters/utils.py index 8fbb6adb..62eeb769 100644 --- a/m2cgen/interpreters/utils.py +++ b/m2cgen/interpreters/utils.py @@ -28,3 +28,8 @@ def _normalize_expr_name(name): def format_float(value): return np.format_float_positional(value, unique=True, trim="0") + + +def chunks(arr, n): + for i in range(0, len(arr), n): + yield arr[i:i + n] diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py index 35aa6d2e..3f2697ea 100644 --- a/tests/e2e/test_e2e.py +++ b/tests/e2e/test_e2e.py @@ -145,6 +145,8 @@ def classification_binary_random_w_missing_values(model, test_fraction=0.02): random_state=RANDOM_SEED) XGBOOST_PARAMS_LARGE = dict(base_score=0.6, n_estimators=100, max_depth=12, random_state=RANDOM_SEED) +XGBOOST_PARAMS_HUGE = dict(base_score=0.6, n_estimators=800, max_depth=12, + random_state=RANDOM_SEED) LIGHTGBM_PARAMS = dict(n_estimators=10, random_state=RANDOM_SEED) LIGHTGBM_PARAMS_DART = dict(n_estimators=10, boosting_type='dart', max_drop=30, random_state=RANDOM_SEED) @@ -156,6 +158,8 @@ def classification_binary_random_w_missing_values(model, test_fraction=0.02): random_state=RANDOM_SEED) LIGHTGBM_PARAMS_LARGE = dict(n_estimators=100, num_leaves=100, max_depth=64, random_state=RANDOM_SEED) +LIGHTGBM_PARAMS_HUGE = dict(n_estimators=500, num_leaves=100, max_depth=64, + random_state=RANDOM_SEED) SVC_PARAMS = dict(random_state=RANDOM_SEED, decision_function_shape="ovo") STATSMODELS_LINEAR_REGULARIZED_PARAMS = dict(method="elastic_net", alpha=7, L1_wt=0.2) @@ -173,7 +177,7 @@ def classification_binary_random_w_missing_values(model, test_fraction=0.02): (executors.VisualBasicExecutor, VISUAL_BASIC), (executors.CSharpExecutor, C_SHARP), (executors.PowershellExecutor, POWERSHELL), - (executors.RExecutor, R), + #(executors.RExecutor, R), (executors.PhpExecutor, PHP), (executors.DartExecutor, DART), (executors.HaskellExecutor, HASKELL), @@ -222,6 +226,14 @@ def classification_binary_random_w_missing_values(model, test_fraction=0.02): classification_binary_random_w_missing_values( lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS)), + # LightGBM (Huge Trees) + regression_random( + lightgbm.LGBMRegressor(**LIGHTGBM_PARAMS_HUGE)), + classification_random( + lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS_HUGE)), + classification_binary_random( + lightgbm.LGBMClassifier(**LIGHTGBM_PARAMS_HUGE)), + # LightGBM (Different Objectives) regression(lightgbm.LGBMRegressor( **LIGHTGBM_PARAMS, objective="mse", reg_sqrt=True)), @@ -294,6 +306,14 @@ def classification_binary_random_w_missing_values(model, test_fraction=0.02): classification_binary_random( xgboost.XGBClassifier(**XGBOOST_PARAMS_LARGE)), + # XGBoost (Huge Trees) + regression_random( + xgboost.XGBRegressor(**XGBOOST_PARAMS_HUGE)), + classification_random( + xgboost.XGBClassifier(**XGBOOST_PARAMS_HUGE)), + classification_binary_random( + xgboost.XGBClassifier(**XGBOOST_PARAMS_HUGE)), + # Sklearn Linear SVM regression(svm.LinearSVR(random_state=RANDOM_SEED)), classification(svm.LinearSVC(random_state=RANDOM_SEED)),