# Copyright 2022 MTS (Mobile Telesystems)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains CUPED-based data transformation methods for
accelerating experiments.
"""
from typing import Dict, List, Optional, Union
import numpy as np
import pandas as pd
from ambrosia import types
from ambrosia.tools.ab_abstract_component import AbstractVarianceReducer
from ambrosia.tools.back_tools import wrap_cols
class Cuped(AbstractVarianceReducer):
    """
    Class for data CUPED transformation.

    https://towardsdatascience.com/how-to-double-a-b-testing-speed-with-cuped-f80460825a90

    With ``Y`` the target and ``X`` the covariate, the transformation is

        Y_hat = Y - theta * (X - bias)
        theta := cov(X, Y) / (EPSILON + Var(X))
        bias  := mean(X)

    where ``theta`` and ``bias`` are estimated on the data passed to ``fit()``.

    It is important that the mean of the covariate metric does not change
    over time!

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will print in sys.stdout the information
        about the variance reduction.

    Attributes
    ----------
    params : Dict
        Parameters of instance that will be updated after calling fit() method.
        Include:
            - target column name
            - covariate column name
            - name of column after the transformation
            - linear coefficient for CUPED transformation.
            - bias value for mean equality
    verbose : bool
        Verbose info flag.
    fitted : bool
        Flag if class was fitted.

    Examples
    --------
    Suppose we have a dataframe with users info which contains two columns:
    a "target" column and a column with the metric "income". Let us assume
    that, over time, the average of the "income" values does not change. Then
    we can use the CUPED transformation based on "income" data to reduce the
    "target" column variation.

    >>> cuped_transformer = Cuped(verbose=True)
    >>> cuped_transformer.fit_transform(
    ...     dataframe=dataframe,
    ...     target_column='target',
    ...     covariate_column='income',
    ...     transformed_name='cuped_target',
    ...     inplace=True,
    ... )

    Now a new column "cuped_target" has appeared in the dataframe; we can use
    it to design our experiment and estimate variance reduction. For further
    CUPED usage in a future experiment, let us store the parameters:

    >>> cuped_transformer.store_params('cuped_transform_params.json')

    Now we conduct an experiment and want to transform our data to reduce its
    variation:

    >>> cuped_transformation = Cuped()
    >>> cuped_transformation.load_params('cuped_transform_params.json')
    >>> cuped_transformation.transform(
    ...     dataframe=exp_results,
    ...     inplace=True,
    ... )

    Methods
    -------
    get_params_dict()
        Returns dictionary with params if fit() method has been previously
        called.
    load_params_dict(params)
        Load params from a dictionary.
    store_params(store_path)
        Store params to json file if fit() method has been previously called.
    load_params(load_path)
        Load params from a json file.
    fit(dataframe, target_column, covariate_column, transformed_name)
        Fit model using a specific covariate column.
    transform(dataframe, inplace)
        Transform target column after a class instance fitting.
    fit_transform(dataframe, target_column, covariate_column, transformed_name, inplace)
        Combination of fit() and transform() methods.
    """

    # Keys of ``params`` holding the fitted numeric values.
    THETA_NAME: str = "theta"
    BIAS_NAME: str = "bias"
    # These params are numpy objects; they are converted with ``tolist()`` on
    # export and wrapped back with ``np.array`` on import.
    non_serializable_params: List = [THETA_NAME, BIAS_NAME]

    def __init__(self, verbose: bool = True) -> None:
        super().__init__(verbose)
        # Register the CUPED-specific entries next to whatever the parent
        # class already keeps in ``self.params``.
        self.params["covariate_column"] = None
        self.params[Cuped.THETA_NAME] = None
        self.params[Cuped.BIAS_NAME] = None

    def __str__(self) -> str:
        return f"СUPED for {self.params['target_column']}"

    def __call__(self, y: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Apply the fitted CUPED transformation to raw values.

        Parameters
        ----------
        y : np.ndarray
            Target values.
        X : np.ndarray
            Covariate values.

        Returns
        -------
        y_hat : np.ndarray
            ``y - theta * (X - bias)`` computed with the stored parameters.
        """
        self._check_fitted()
        theta = self.params[Cuped.THETA_NAME]
        bias = self.params[Cuped.BIAS_NAME]
        y_hat: np.ndarray = y - theta * (X - bias)
        return y_hat

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Returns
        -------
        params : Dict
            Dictionary with fitted params; ``theta`` and ``bias`` are
            converted to plain Python values via ``tolist()``.
        """
        self._check_fitted()
        exported: Dict = {}
        for key, value in self.params.items():
            if key in Cuped.non_serializable_params:
                # np.asarray makes the conversion work for numpy scalars,
                # arrays and plain Python numbers alike.
                exported[key] = np.asarray(value).tolist()
            else:
                exported[key] = value
        return exported

    def load_params_dict(self, params: Dict) -> None:
        """
        Load model parameters from the dictionary.

        Parameters
        ----------
        params : Dict
            Dictionary with params. It must contain every key currently
            present in ``self.params``.

        Raises
        ------
        TypeError
            If ``params`` lacks one of the expected keys. Nothing is modified
            in that case.
        """
        # Validate everything up front so that a missing key cannot leave the
        # instance with a mix of old and new parameters.
        for parameter in self.params:
            if parameter not in params:
                raise TypeError(f"params argument must contain: {parameter}")
        for parameter in self.params:
            loaded = params[parameter]
            if parameter in Cuped.non_serializable_params:
                loaded = np.array(loaded)
            self.params[parameter] = loaded
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_column: types.ColumnNameType,
        transformed_name: Optional[types.ColumnNameType] = None,
    ) -> None:
        """
        Fit to calculate CUPED parameters for target column using given
        covariate column and data.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for the calculation of CUPED parameters.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_column : ColumnNameType
            Column which will be used as the covariate in CUPED transformation.
        transformed_name : ColumnNameType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.
        """
        # NOTE(review): presumably verifies that both columns exist in the
        # dataframe - confirm against AbstractVarianceReducer._check_cols.
        self._check_cols(dataframe, [target_column, covariate_column])
        covariance: pd.DataFrame = dataframe[[target_column, covariate_column]].cov()
        covariate_variance: float = covariance.loc[covariate_column, covariate_column]
        # EPSILON (taken from the parent class) is added to the denominator,
        # presumably to avoid division by zero for a constant covariate.
        self.params[Cuped.THETA_NAME] = covariance.loc[target_column, covariate_column] / (
            super().EPSILON + covariate_variance
        )
        # Subtracting the covariate mean in __call__ keeps the mean of the
        # transformed target equal to the mean of the original one on fit data.
        self.params[Cuped.BIAS_NAME] = np.mean(dataframe[covariate_column])
        self.params["target_column"] = target_column
        self.params["covariate_column"] = covariate_column
        self.params["transformed_name"] = transformed_name
        self.fitted = True
class MultiCuped(AbstractVarianceReducer):
    """
    Class for data Multi CUPED transformation.

    With ``Y`` the target and ``X`` the matrix of covariates, the
    transformation is

        Y_hat = Y - X theta + bias      (matrix multiplication)
        theta := argmin Var (Y - X theta)
        bias  := mean(X theta)

    where ``theta`` and ``bias`` are estimated on the data passed to ``fit()``.

    It is important that the means of the covariate metrics do not change
    over time!

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will print in sys.stdout the information
        about the variance reduction.

    Attributes
    ----------
    params : Dict
        Parameters of instance that will be updated after calling fit() method.
        Include:
            - target column name
            - covariate columns names
            - name of column after the transformation
            - linear coefficients for Multi CUPED transformation.
            - bias value for mean equality
    verbose : bool
        Verbose info flag.
    fitted : bool
        Flag if class was fitted.

    Examples
    --------
    Suppose we have a dataframe with users info which contains a "target"
    column and the columns "income" and "age". Let us assume that, over time,
    the averages of the "income" and "age" values do not change. Then we can
    use the Multi CUPED transformation based on "income" and "age" data in
    order to reduce the "target" column variation.

    >>> cuped_transformer = MultiCuped(verbose=True)
    >>> cuped_transformer.fit_transform(
    ...     dataframe,
    ...     'target',
    ...     ['income', 'age'],
    ...     transformed_name='cuped_target',
    ...     inplace=True,
    ... )

    Now a new column "cuped_target" has appeared in the dataframe; we can use
    it to design our experiment and estimate variance reduction. For further
    Multi CUPED usage in a future experiment, let us store the parameters:

    >>> cuped_transformer.store_params('cuped_transform_params.json')

    Now we conduct an experiment and want to transform our data to reduce its
    variation:

    >>> cuped_transformation = MultiCuped()
    >>> cuped_transformation.load_params('cuped_transform_params.json')
    >>> cuped_transformation.transform(
    ...     exp_results,
    ...     inplace=True,
    ... )

    Methods
    -------
    get_params_dict()
        Returns dictionary with params if fit() method has been previously
        called.
    load_params_dict(params)
        Load params from a dictionary.
    store_params(store_path)
        Store params to json file if fit() method has been previously called.
    load_params(load_path)
        Load params from a json file.
    fit(dataframe, target_column, covariate_columns, transformed_name)
        Fit model using covariate columns.
    transform(dataframe, inplace)
        Transform target column after a class instance fitting.
    fit_transform(dataframe, target_column, covariate_columns, transformed_name, inplace)
        Combination of fit() and transform() methods.
    """

    # Keys of ``params`` holding the fitted numeric values.
    THETA_NAME: str = "theta"
    BIAS_NAME: str = "bias"
    # These params are numpy objects; they are converted with ``tolist()`` on
    # export and wrapped back with ``np.array`` on import.
    non_serializable_params: List = [THETA_NAME, BIAS_NAME]

    def __init__(self, verbose: bool = True) -> None:
        super().__init__(verbose)
        # Register the Multi CUPED-specific entries next to whatever the
        # parent class already keeps in ``self.params``.
        self.params["covariate_columns"] = None
        self.params[MultiCuped.THETA_NAME] = None
        self.params[MultiCuped.BIAS_NAME] = None

    def __str__(self) -> str:
        return f"Multi СUPED for {self.params['target_column']}"

    def __call__(self, y: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Apply the fitted Multi CUPED transformation to raw values.

        Parameters
        ----------
        y : np.ndarray
            Target values.
        X : np.ndarray
            Covariate values, one column per covariate.

        Returns
        -------
        y_hat : np.ndarray
            ``y - (X @ theta) + bias`` computed with the stored parameters,
            where the matrix product is flattened to one dimension.
        """
        self._check_fitted()
        theta = self.params[MultiCuped.THETA_NAME]
        bias = self.params[MultiCuped.BIAS_NAME]
        y_hat: np.ndarray = y - (X @ theta).reshape(-1) + bias
        return y_hat

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Returns
        -------
        params : Dict
            Dictionary with fitted params; ``theta`` and ``bias`` are
            converted to plain Python values via ``tolist()``.
        """
        self._check_fitted()
        exported: Dict = {}
        for key, value in self.params.items():
            if key in MultiCuped.non_serializable_params:
                # np.asarray makes the conversion work for numpy scalars,
                # arrays and plain Python numbers alike.
                exported[key] = np.asarray(value).tolist()
            else:
                exported[key] = value
        return exported

    def load_params_dict(self, params: Dict) -> None:
        """
        Load model parameters from the dictionary.

        Parameters
        ----------
        params : Dict
            Dictionary with params. It must contain every key currently
            present in ``self.params``.

        Raises
        ------
        TypeError
            If ``params`` lacks one of the expected keys. Nothing is modified
            in that case.
        """
        # Validate everything up front so that a missing key cannot leave the
        # instance with a mix of old and new parameters.
        for parameter in self.params:
            if parameter not in params:
                raise TypeError(f"params argument must contain: {parameter}")
        for parameter in self.params:
            loaded = params[parameter]
            if parameter in MultiCuped.non_serializable_params:
                loaded = np.array(loaded)
            self.params[parameter] = loaded
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_columns: types.ColumnNamesType,
        transformed_name: Optional[types.ColumnNameType] = None,
    ) -> None:
        """
        Fit to calculate Multi CUPED parameters for target column using selected
        covariate columns.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for the calculation of CUPED parameters.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_columns : ColumnNamesType
            Columns which will be used as the covariates in Multi CUPED
            transformation.
        transformed_name : ColumnNameType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the covariance matrix of the covariates is singular.
        """
        # NOTE(review): wrap_cols presumably turns a single column name into a
        # list - confirm against ambrosia.tools.back_tools.
        covariate_columns = wrap_cols(covariate_columns)
        cols_concat: List = [target_column] + covariate_columns
        self._check_cols(dataframe, cols_concat)

        covariance: pd.DataFrame = dataframe[cols_concat].cov()
        # Convert to plain ndarrays before the linear algebra: handing a
        # DataFrame to np.linalg.inv can yield a DataFrame again (depending on
        # the pandas/numpy versions), and theta must be an ndarray for
        # __call__ and get_params_dict to work.
        covariates_cov: np.ndarray = covariance.loc[covariate_columns, covariate_columns].to_numpy()
        num_features: int = len(covariate_columns)
        # Column vector (num_features, 1) of cov(covariate_i, target).
        covariance_target: np.ndarray = (
            covariance.loc[covariate_columns, target_column].to_numpy().reshape(num_features, -1)
        )

        theta: np.ndarray = np.linalg.inv(covariates_cov) @ covariance_target
        self.params[MultiCuped.THETA_NAME] = theta
        # Adding the mean of X @ theta back in __call__ keeps the mean of the
        # transformed target equal to the mean of the original one on fit data.
        self.params[MultiCuped.BIAS_NAME] = (dataframe[covariate_columns].to_numpy() @ theta).reshape(-1).mean()
        self.params["target_column"] = target_column
        self.params["covariate_columns"] = covariate_columns
        self.params["transformed_name"] = transformed_name
        self.fitted = True