Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "robynpy"
version = "0.1.2"
version = "0.2.1"
authors = [
{ name="Gufeng Zhou", email="[email protected]" },
{ name="Igor Skokan", email="[email protected]" },
Expand All @@ -27,14 +27,11 @@ dependencies = [
"lmfit",
"plotnine",
"nevergrad",
"PyQt6",
"seaborn",
"tqdm",
"rpy2",
"pytest",
"plotly",
"nlopt",
"cmake",
"ipykernel",
"rpy2==3.5.16",
]
Expand Down
2 changes: 0 additions & 2 deletions python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,8 @@ scipy==1.13.1
lmfit==1.3.2
plotnine==0.13.6
nevergrad==1.0.5
PyQt6==6.7.1
seaborn==0.13.2
tqdm==4.66.5
rpy2==3.5.16
plotly==5.24.1
pytest==8.3.3
nlopt==2.8.0
Expand Down
54 changes: 25 additions & 29 deletions python/src/robyn/modeling/ridge/ridge_data_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,28 +387,26 @@ def _hill_transformation(
gamma: float,
x_marginal: Optional[np.ndarray] = None,
) -> Dict[str, np.ndarray]:
"""Exactly match R's Hill transformation implementation.

Args:
x: Input array
alpha: Shape parameter
gamma: Inflection point parameter
x_marginal: Optional marginal values for carryover effects

Returns:
Dictionary containing x_saturated and inflexion point
"""
"""Exactly match R's Hill transformation implementation."""
x_array = np.array(x)

# Calculate inflexion point exactly like R
inflexion = np.max(x_array) * gamma

if x_marginal is None:
# Regular hill transformation (exactly like R)
x_saturated = x_array**alpha / (x_array**alpha + inflexion**alpha)
else:
# Marginal effect calculation (exactly like R)
x_saturated = x_marginal**alpha / (x_marginal**alpha + inflexion**alpha)
# If all values are 0, R returns NaN
if np.max(x_array) == 0:
return {
"x_saturated": np.full_like(x_array, np.nan),
"inflexion": inflexion,
}

# Use x_marginal if provided, otherwise use x_array
input_array = x_marginal if x_marginal is not None else x_array

# Calculate hill transformation
numerator = input_array**alpha
denominator = input_array**alpha + inflexion**alpha

x_saturated = numerator / denominator

return {"x_saturated": x_saturated, "inflexion": inflexion}

Expand Down Expand Up @@ -451,9 +449,6 @@ def run_transformations(
dt_modAdstocked = self.X_base.drop(
columns=["ds"] if "ds" in self.X_base.columns else []
)
self.logger.debug(
f"After ds drop - dt_modAdstocked shape: {dt_modAdstocked.shape}"
)

# 2. Get window indices - FIXED to match R exactly
window_start = self.mmm_data.mmmdata_spec.rolling_window_start_which
Expand All @@ -471,12 +466,8 @@ def run_transformations(
saturated_immediate_collect = {}
saturated_carryover_collect = {}

# Process media variables
media_vars = self.mmm_data.mmmdata_spec.paid_media_spends

# Process each media variable (including newsletter)
for var in media_vars + ["newsletter"]:

# Process media variables using all_media from spec
for var in self.mmm_data.mmmdata_spec.all_media:
# 1. Adstocking (whole data)
input_data = dt_modAdstocked[var].values
theta = params[f"{var}_thetas"]
Expand Down Expand Up @@ -521,15 +512,19 @@ def run_transformations(

# EXACTLY match R's flow:
# 1. First update dt_modAdstocked with adstocked values (full data)
dt_modAdstocked = dt_modAdstocked.drop(columns=media_vars)
dt_modAdstocked = dt_modAdstocked.drop(
columns=self.mmm_data.mmmdata_spec.all_media
)
for var, values in adstocked_collect.items():
dt_modAdstocked[var] = values

# 2. Then window and create dt_modSaturated (exactly like R)
dt_modSaturated = dt_modAdstocked.iloc[window_indices].copy()

# Drop media columns before binding (exactly like R)
dt_modSaturated = dt_modSaturated.drop(columns=media_vars + ["newsletter"])
dt_modSaturated = dt_modSaturated.drop(
columns=self.mmm_data.mmmdata_spec.all_media
)
for var, values in saturated_total_collect.items():
dt_modSaturated[var] = values

Expand All @@ -545,6 +540,7 @@ def run_transformations(
dt_saturatedCarryover = pd.DataFrame(
saturated_carryover_collect, index=dt_modSaturated.index
).fillna(0)
dt_modSaturated = dt_modSaturated.fillna(0)

# Window y data using same indices
self.y_windowed = self.y_base.iloc[window_indices]
Expand Down
184 changes: 116 additions & 68 deletions python/src/robyn/modeling/ridge/ridge_evaluate_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from tqdm import tqdm
from sklearn.linear_model import Ridge
from sklearn.exceptions import ConvergenceWarning
from typing import Dict, Any, Tuple, Optional, List
from typing import Dict, Any, Tuple, Optional, List, Union
from robyn.modeling.entities.modeloutputs import Trial
from robyn.modeling.entities.enums import NevergradAlgorithm
from robyn.modeling.ridge.ridge_metrics_calculator import RidgeMetricsCalculator
Expand All @@ -16,6 +16,7 @@
import json
from datetime import datetime
import random
from enum import Enum


class RidgeModelEvaluator:
Expand All @@ -27,12 +28,14 @@ def __init__(
ridge_metrics_calculator,
ridge_data_builder,
calibration_input=None,
holidays_data=None,
):
self.mmm_data = mmm_data
self.featurized_mmm_data = featurized_mmm_data
self.ridge_metrics_calculator = ridge_metrics_calculator
self.ridge_data_builder = ridge_data_builder
self.calibration_input = calibration_input
self.holidays_data = holidays_data
self.logger = logging.getLogger(__name__)

def _run_nevergrad_optimization(
Expand Down Expand Up @@ -298,7 +301,14 @@ def _evaluate_model(
# Split dep_var and features like R does
y = dt_modSaturated["dep_var"]
X = dt_modSaturated.drop(columns=["dep_var"])

# After getting dt_modSaturated
self.logger.debug("Step 1 - Initial data check:")
self.logger.debug(
f"dt_modSaturated has NaN: {dt_modSaturated.isna().any().any()}"
)
if dt_modSaturated.isna().any().any():
nan_cols = dt_modSaturated.columns[dt_modSaturated.isna().any()].tolist()
self.logger.debug(f"Columns with NaN in dt_modSaturated: {nan_cols}")
# Continue with existing evaluation logic...
sol_id = f"{trial}_{iter_ng + 1}_1"

Expand Down Expand Up @@ -388,8 +398,22 @@ def _evaluate_model(
x_norm = X_train.to_numpy()
y_norm = y_train.to_numpy()

self.logger.debug("Data quality check:")
self.logger.debug(f"x_norm shape: {x_norm.shape}")
self.logger.debug(f"y_norm shape: {y_norm.shape}")
self.logger.debug(f"x_norm has NaN: {np.isnan(x_norm).any()}")
self.logger.debug(f"y_norm has NaN: {np.isnan(y_norm).any()}")
self.logger.debug(f"x_norm has inf: {np.isinf(x_norm).any()}")
self.logger.debug(f"y_norm has inf: {np.isinf(y_norm).any()}")

# Check value ranges
self.logger.debug(f"x_norm min: {np.min(x_norm)}, max: {np.max(x_norm)}")
self.logger.debug(f"y_norm min: {np.min(y_norm)}, max: {np.max(y_norm)}")

# Get sign control parameters
x_sign, lower_limits, upper_limits, check_factor = self._setup_sign_control(X)
signs_grouped, lower_limits, upper_limits, check_factor = (
self._setup_sign_control(X)
)
params["lower_limits"] = lower_limits
params["upper_limits"] = upper_limits

Expand Down Expand Up @@ -417,7 +441,7 @@ def format_limit_value(val):
),
"organic": list(self.mmm_data.mmmdata_spec.organic_vars),
},
"signs": x_sign,
"signs": signs_grouped,
"factor_variables": {
"is_factor": factor_dict,
"factor_names": [
Expand Down Expand Up @@ -1029,78 +1053,102 @@ def format_limit_value(val):

def _setup_sign_control(
self, X: pd.DataFrame
) -> Tuple[Dict[str, List[str]], List[float], List[float], Dict[str, bool]]:
"""Set up sign control for model variables, matching R's implementation exactly.

Args:
X: Feature DataFrame with dep_var already removed

Returns:
Tuple containing:
- x_sign: Dict mapping variable types to their signs
- lower_limits: List of lower bounds for each variable
- upper_limits: List of upper bounds for each variable
- check_factor: Dict mapping column names to boolean indicating if they are factors
"""
# Define signs grouped by variable type (matching R's structure)
x_sign = {
"prophet": ["default"] * 3, # [trend, season, holiday]
"context": ["default"] * len(self.mmm_data.mmmdata_spec.context_vars),
"paid_media": ["positive"]
* len(self.mmm_data.mmmdata_spec.paid_media_spends),
"organic": "positive", # Single string for organic, matching R
}
) -> Tuple[
Dict[str, Union[List[str], str]], List[float], List[float], Dict[str, bool]
]:
"""Set up sign control for model variables, matching R's implementation exactly."""
# Get prophet variables and signs from holidays_data
prophet_vars = self.holidays_data.prophet_vars
prophet_signs = [
sign.value if isinstance(sign, Enum) else sign
for sign in self.holidays_data.prophet_signs
]

# Check for factor variables
check_factor = {
col: pd.api.types.is_categorical_dtype(X[col]) for col in X.columns
}
# Convert Enum values to strings
context_signs = (
[sign.value for sign in self.mmm_data.mmmdata_spec.context_signs]
if self.mmm_data.mmmdata_spec.context_signs
else ["default"] * len(self.mmm_data.mmmdata_spec.context_vars or [])
)

# Initialize limits for prophet vars
lower_limits = [0] * 3 # trend, season, holiday
upper_limits = [1] * 3
paid_media_signs = (
[sign.value for sign in self.mmm_data.mmmdata_spec.paid_media_signs]
if self.mmm_data.mmmdata_spec.paid_media_signs
else ["positive"] * len(self.mmm_data.mmmdata_spec.paid_media_spends or [])
)

# Handle negative trend case
if "trend" in X.columns and X["trend"].sum() < 0:
lower_limits[0] = -1
upper_limits[0] = 0
organic_signs = (
[sign.value for sign in self.mmm_data.mmmdata_spec.organic_signs]
if self.mmm_data.mmmdata_spec.organic_signs
else ["positive"] * len(self.mmm_data.mmmdata_spec.organic_vars or [])
)

# Handle remaining variables
for col in X.columns[3:]: # Skip prophet vars
if check_factor.get(col, False):
level_n = len(X[col].unique())
if level_n <= 1:
raise ValueError(
f"Factor variable {col} must have more than 1 level"
)
# Combine all signs in order (matching R's vector)
x_sign = prophet_signs + context_signs + paid_media_signs + organic_signs

# Get variable names in same order
var_names = (
prophet_vars
+ (self.mmm_data.mmmdata_spec.context_vars or [])
+ (self.mmm_data.mmmdata_spec.paid_media_spends or [])
+ (self.mmm_data.mmmdata_spec.organic_vars or [])
)

# Create named dictionary like R's named vector
x_sign_dict = dict(zip(var_names, x_sign))

# Get variable type and index
if col in self.mmm_data.mmmdata_spec.context_vars:
sign = "default"
elif col in self.mmm_data.mmmdata_spec.paid_media_spends:
sign = "positive"
else: # organic
sign = "positive"
# Check factors exactly like R - using all columns except dep_var
dt_sign = X.copy()
check_factor = pd.Series(
[pd.api.types.is_categorical_dtype(dt_sign[col]) for col in var_names],
index=var_names,
)

# Initialize limits for prophet vars exactly like R
lower_limits = [0] * len(prophet_signs)
upper_limits = [1] * len(prophet_signs)

# Handle negative trend case exactly like R
trend_loc = X.columns.get_loc("trend") if "trend" in X.columns else -1
if trend_loc >= 0 and X.iloc[:, trend_loc].sum() < 0:
trend_loc = prophet_vars.index("trend")
lower_limits[trend_loc] = -1
upper_limits[trend_loc] = 0

# Handle remaining variables exactly like R's loop
for s in range(len(prophet_signs), len(x_sign)):
col = var_names[s]
if check_factor[col]:
# Use categorical levels if available, otherwise unique values
if pd.api.types.is_categorical_dtype(dt_sign[col]):
level_n = len(dt_sign[col].cat.categories)
else:
level_n = len(dt_sign[col].unique())

if sign == "positive":
if level_n <= 1:
raise ValueError("All factor variables must have more than 1 level")

if x_sign[s] == "positive":
lower_vec = [0] * (level_n - 1)
upper_vec = ["Inf"] * (level_n - 1) # Match R's "Inf"
elif sign == "negative":
lower_vec = ["-Inf"] * (level_n - 1)
upper_vec = [0] * (level_n - 1)
else: # default
lower_vec = ["-Inf"] * (level_n - 1)
upper_vec = ["Inf"] * (level_n - 1)
upper_vec = [float("inf")] * (level_n - 1)
else:
lower_vec = [float("-inf")] * (level_n - 1)
upper_vec = [0 if x_sign[s] == "negative" else float("inf")] * (
level_n - 1
)

lower_limits.extend(lower_vec)
upper_limits.extend(upper_vec)
else:
# Handle non-factor variables
if col in self.mmm_data.mmmdata_spec.context_vars:
lower_limits.append("-Inf")
upper_limits.append("Inf")
else: # paid_media or organic
lower_limits.append(0)
upper_limits.append("Inf")

return x_sign, lower_limits, upper_limits, check_factor
lower_limits.append(0 if x_sign[s] == "positive" else float("-inf"))
upper_limits.append(0 if x_sign[s] == "negative" else float("inf"))

# Create signs grouped structure for logging
signs_grouped = {
"prophet": prophet_signs,
"context": context_signs,
"paid_media": paid_media_signs,
"organic": organic_signs[0] if organic_signs else "positive",
}

return signs_grouped, lower_limits, upper_limits, check_factor
1 change: 1 addition & 0 deletions python/src/robyn/modeling/ridge_model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def __init__(
self.ridge_metrics_calculator,
self.ridge_data_builder,
self.calibration_input,
self.holiday_data,
)
self.logger = logging.getLogger(__name__)

Expand Down
Loading