Source code for ambrosia.preprocessing.cuped

#  Copyright 2022 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Module contains CUPED-based data transformation methods for the experiment
acceleration.
"""
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd

from ambrosia import types
from ambrosia.tools.ab_abstract_component import AbstractVarianceReducer
from ambrosia.tools.back_tools import wrap_cols


class Cuped(AbstractVarianceReducer):
    """
    Class for data CUPED transformation.

    https://towardsdatascience.com/how-to-double-a-b-testing-speed-with-cuped-f80460825a90

    Y_hat = Y - theta * (X - mean(X))
    theta := cov(X, Y) / (EPSILON + Var(X))

    Here Y is the target column, X is the covariate column and mean(X) is the
    covariate mean estimated during fit() (stored as the "bias" parameter).

    It is important that the mean of the covariate metric does not change over time!

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will print in sys.stdout the information about
        the variance reduction.

    Attributes
    ----------
    params : Dict
        Parameters of instance that will be updated after calling fit() method.
        Include:
            - target column name
            - covariate column name
            - name of column after the transformation
            - linear coefficient for CUPED transformation.
            - bias value for mean equality
    verbose : bool
        Verbose info flag.
    fitted : bool
        Flag if class was fitted.

    Examples
    --------
    Suppose we have the dataframe with users info which contains two columns:
    a "target" column and a column with metric "income". Let us assume that,
    over time, the average of the "income" values does not change. Then, we can
    use CUPED transformation based on "income" data to reduce "target" column
    variation.

    >>> cuped_transformer = Cuped(verbose=True)
    >>> cuped_transformer.fit_transform(
    >>>     dataframe=dataframe,
    >>>     target_column='target',
    >>>     covariate_column='income',
    >>>     transformed_name='cuped_target',
    >>>     inplace=True,
    >>> )

    Now in the dataframe a new column "cuped_target" appeared, we can use it
    to design our experiment and estimate variance reduction.

    For further CUPED usage in the future experiment, let us store the parameters:

    >>> cuped_transformer.store_params('cuped_transform_params.json')

    Now we conduct an experiment and want to transform our data to reduce
    its variation:

    >>> cuped_transformation = Cuped()
    >>> cuped_transformation.load_params('cuped_transform_params.json')
    >>> cuped_transformation.transform(
    >>>     dataframe=exp_results,
    >>>     inplace=True,
    >>> )

    Methods
    -------
    get_params_dict()
        Returns dictionary with params if fit() method has been previously called.
    load_params_dict(params)
        Load params from a dictionary.
    store_params(store_path)
        Store params to json file if fit() method has been previously called.
    load_params(load_path)
        Load params from a json file.
    fit(dataframe, target_column, covariate_column, transformed_name)
        Fit model using a specific covariate column.
    transform(dataframe, inplace)
        Transform target column after a class instance fitting.
    fit_transform(dataframe, target_column, covariate_column, transformed_name, inplace)
        Combination of fit() and transform() methods.
    """

    THETA_NAME: str = "theta"
    BIAS_NAME: str = "bias"
    # Parameters stored as numpy values; converted with tolist() / np.array() on export / import.
    non_serializable_params: List = [THETA_NAME, BIAS_NAME]

    def __init__(self, verbose: bool = True) -> None:
        # The base initializer is relied upon to create ``self.params``;
        # only the CUPED-specific keys are registered here.
        super().__init__(verbose)
        self.params["covariate_column"] = None
        self.params[Cuped.THETA_NAME] = None
        self.params[Cuped.BIAS_NAME] = None

    def __str__(self) -> str:
        # Plain Latin "CUPED" (no look-alike characters) so the text can be searched and compared.
        return f"CUPED for {self.params['target_column']}"

    def __call__(self, y: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Apply the fitted transformation: ``y - theta * (X - bias)``.

        Parameters
        ----------
        y : np.ndarray
            Target values.
        X : np.ndarray
            Covariate values.

        Returns
        -------
        y_hat : np.ndarray
            Transformed target values.
        """
        self._check_fitted()
        theta = self.params[Cuped.THETA_NAME]
        bias = self.params[Cuped.BIAS_NAME]
        y_hat: np.ndarray = y - theta * (X - bias)
        return y_hat

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Values of ``theta`` and ``bias`` are converted to plain Python objects
        via ``tolist()``; all other values are returned unchanged.

        Returns
        -------
        params : Dict
            Dictionary with fitted params.
        """
        self._check_fitted()
        return {
            key: (np.asarray(value).tolist() if key in Cuped.non_serializable_params else value)
            for key, value in self.params.items()
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load model parameters from the dictionary.

        All keys currently present in ``self.params`` must be provided.
        The check is done before anything is assigned, so a failed load
        leaves the instance unchanged.

        Parameters
        ----------
        params : Dict
            Dictionary with params.

        Raises
        ------
        TypeError
            If ``params`` lacks one of the expected keys.
        """
        missing = [name for name in self.params if name not in params]
        if missing:
            raise TypeError(f"params argument must contain: {missing[0]}")
        for name in self.params:
            value = params[name]
            if name in Cuped.non_serializable_params:
                value = np.array(value)
            self.params[name] = value
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_column: types.ColumnNameType,
        transformed_name: Optional[types.ColumnNameType] = None,
    ) -> None:
        """
        Fit to calculate CUPED parameters for target column using given
        covariate column and data.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for the calculation of CUPED parameters.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_column : ColumnNameType
            Column which will be used as the covariate in CUPED transformation.
        transformed_name : ColumnNamesType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.
        """
        self._check_cols(dataframe, [target_column, covariate_column])
        covariance: pd.DataFrame = dataframe[[target_column, covariate_column]].cov()
        covariate_variance: float = covariance.loc[covariate_column, covariate_column]
        # EPSILON (defined on the base class) keeps the division finite for a constant covariate.
        self.params[Cuped.THETA_NAME] = covariance.loc[target_column, covariate_column] / (
            super().EPSILON + covariate_variance
        )
        # Covariate mean; subtracting it in __call__ keeps the target mean unchanged on the fit data.
        self.params[Cuped.BIAS_NAME] = np.mean(dataframe[covariate_column])
        self.params["target_column"] = target_column
        self.params["covariate_column"] = covariate_column
        self.params["transformed_name"] = transformed_name
        self.fitted = True

    def transform(
        self,
        dataframe: pd.DataFrame,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Make CUPED transformation for the target column.

        Could be performed inplace or not.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for CUPED transformation.
        inplace : bool, default: ``False``
            If is ``True``, then method returns ``None`` and
            sets a new column for the original dataframe.
            Otherwise return copied dataframe with a new column.

        Returns
        -------
        result : Union[pd.DataFrame, None]
            Value produced by the inherited ``_return_result`` helper
            (see ``inplace``).
        """
        # Verify the fitted state before reading column names from ``self.params``:
        # on an unfitted instance those entries are not meaningful yet.
        self._check_fitted()
        target_column = self.params["target_column"]
        covariate_column = self.params["covariate_column"]
        self._check_cols(dataframe, [target_column, covariate_column])
        new_target: np.ndarray = self(dataframe[target_column], dataframe[covariate_column])
        if self.verbose:
            old_variance: float = np.var(dataframe[target_column])
            new_variance: float = np.var(new_target)
            self._verbose(old_variance, new_variance)
        return self._return_result(dataframe, new_target, inplace)

    def fit_transform(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_column: types.ColumnNameType,
        transformed_name: Optional[types.ColumnNameType] = None,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Combination of fit() and transform() methods.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for fitting and applying CUPED transformation.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_column : ColumnNameType
            Column which will be used as the covariate.
        transformed_name : ColumnNamesType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.
        inplace : bool, default: ``False``
            If is ``True``, then method returns ``None`` and
            sets a new column for the original dataframe.
            Otherwise return copied dataframe with a new column.

        Returns
        -------
        result : Union[pd.DataFrame, None]
            Same value as returned by transform().
        """
        self.fit(dataframe, target_column, covariate_column, transformed_name)
        return self.transform(dataframe, inplace)
class MultiCuped(AbstractVarianceReducer):
    """
    Class for data Multi CUPED transformation.

    Y_hat = Y - X theta + mean(X theta)   (X theta is a matrix multiplication)
    theta := argmin Var (Y - X theta)

    mean(X theta) is estimated during fit() and stored as the "bias" parameter.

    It is important that the means of the covariate metrics do not change over time!

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will print in sys.stdout the information about
        the variance reduction.

    Attributes
    ----------
    params : Dict
        Parameters of instance that will be updated after calling fit() method.
        Include:
            - target column name
            - covariate columns names
            - name of column after the transformation
            - linear coefficients for Multi CUPED transformation.
            - bias value for mean equality
    verbose : bool
        Verbose info flag.
    fitted : bool
        Flag if class was fitted.

    Examples
    --------
    Suppose we have the dataframe with users info which contains a "target"
    column and columns "income" and "age". Let us assume that, over time, the
    averages of the "income" and "age" values do not change. Then, we can use
    Multi CUPED transformation based on "income" and "age" data in order to
    reduce "target" column variation.

    >>> cuped_transformer = MultiCuped(verbose=True)
    >>> cuped_transformer.fit_transform(
    >>>     dataframe=dataframe,
    >>>     target_column='target',
    >>>     covariate_columns=['income', 'age'],
    >>>     transformed_name='cuped_target',
    >>>     inplace=True,
    >>> )

    Now in the dataframe a new column "cuped_target" appeared, we can use it
    to design our experiment and estimate variance reduction.

    For further Multi CUPED usage in the future experiment, let us store the
    parameters:

    >>> cuped_transformer.store_params('cuped_transform_params.json')

    Now we conduct an experiment and want to transform our data to reduce
    its variation:

    >>> cuped_transformation = MultiCuped()
    >>> cuped_transformation.load_params('cuped_transform_params.json')
    >>> cuped_transformation.transform(
    >>>     exp_results,
    >>>     inplace=True,
    >>> )

    Methods
    -------
    get_params_dict()
        Returns dictionary with params if fit() method has been previously called.
    load_params_dict(params)
        Load params from a dictionary.
    store_params(store_path)
        Store params to json file if fit() method has been previously called.
    load_params(load_path)
        Load params from a json file.
    fit(dataframe, target_column, covariate_columns, transformed_name)
        Fit model using covariate columns.
    transform(dataframe, inplace)
        Transform target column after a class instance fitting.
    fit_transform(dataframe, target_column, covariate_columns, transformed_name, inplace)
        Combination of fit() and transform() methods.
    """

    THETA_NAME: str = "theta"
    BIAS_NAME: str = "bias"
    # Parameters stored as numpy values; converted with tolist() / np.array() on export / import.
    non_serializable_params: List = [THETA_NAME, BIAS_NAME]

    def __init__(self, verbose: bool = True) -> None:
        # The base initializer is relied upon to create ``self.params``;
        # only the Multi CUPED-specific keys are registered here.
        super().__init__(verbose)
        self.params["covariate_columns"] = None
        self.params[MultiCuped.THETA_NAME] = None
        self.params[MultiCuped.BIAS_NAME] = None

    def __str__(self) -> str:
        # Plain Latin "CUPED" (no look-alike characters) so the text can be searched and compared.
        return f"Multi CUPED for {self.params['target_column']}"

    def __call__(self, y: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        Apply the fitted transformation: ``y - (X @ theta) + bias``.

        Parameters
        ----------
        y : np.ndarray
            Target values.
        X : np.ndarray
            Covariate values; multiplied by ``theta`` with ``@`` and the
            product is flattened with ``reshape(-1)``.

        Returns
        -------
        y_hat : np.ndarray
            Transformed target values.
        """
        self._check_fitted()
        theta = self.params[MultiCuped.THETA_NAME]
        bias = self.params[MultiCuped.BIAS_NAME]
        y_hat: np.ndarray = y - (X @ theta).reshape(-1) + bias
        return y_hat

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Values of ``theta`` and ``bias`` are converted to plain Python objects
        via ``tolist()``; all other values are returned unchanged.

        Returns
        -------
        params : Dict
            Dictionary with fitted params.
        """
        self._check_fitted()
        return {
            key: (np.asarray(value).tolist() if key in MultiCuped.non_serializable_params else value)
            for key, value in self.params.items()
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load model parameters from the dictionary.

        All keys currently present in ``self.params`` must be provided.
        The check is done before anything is assigned, so a failed load
        leaves the instance unchanged.

        Parameters
        ----------
        params : Dict
            Dictionary with params.

        Raises
        ------
        TypeError
            If ``params`` lacks one of the expected keys.
        """
        missing = [name for name in self.params if name not in params]
        if missing:
            raise TypeError(f"params argument must contain: {missing[0]}")
        for name in self.params:
            value = params[name]
            if name in MultiCuped.non_serializable_params:
                value = np.array(value)
            self.params[name] = value
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_columns: types.ColumnNamesType,
        transformed_name: Optional[types.ColumnNameType] = None,
    ) -> None:
        """
        Fit to calculate Multi CUPED parameters for target column using
        selected covariate columns.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for the calculation of CUPED parameters.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_columns : ColumnNamesType
            Columns which will be used as the covariates in Multi CUPED
            transformation.
        transformed_name : ColumnNamesType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the covariance matrix of the covariates is singular.
        """
        covariate_columns = wrap_cols(covariate_columns)
        cols_concat: List = [target_column] + covariate_columns
        self._check_cols(dataframe, cols_concat)
        covariance: pd.DataFrame = dataframe[cols_concat].cov()
        num_features: int = len(covariate_columns)
        # Work on plain ndarrays so theta is an ndarray no matter how numpy/pandas wrap results.
        covariates_covariance: np.ndarray = covariance.loc[covariate_columns, covariate_columns].values
        covariance_target: np.ndarray = covariance.loc[covariate_columns, target_column].values.reshape(
            num_features, -1
        )
        # Solve Cov(X) theta = Cov(X, Y) directly instead of forming the explicit inverse.
        self.params[MultiCuped.THETA_NAME] = np.linalg.solve(covariates_covariance, covariance_target)
        # Mean of the fitted linear term; adding it back in __call__ keeps the target mean unchanged
        # on the fit data.
        self.params[MultiCuped.BIAS_NAME] = (
            (dataframe[covariate_columns].values @ self.params[MultiCuped.THETA_NAME]).reshape(-1).mean()
        )
        self.params["target_column"] = target_column
        self.params["covariate_columns"] = covariate_columns
        self.params["transformed_name"] = transformed_name
        self.fitted = True

    def transform(
        self,
        dataframe: pd.DataFrame,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Make Multi CUPED transformation for the target column.

        Could be performed inplace or not.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for Multi CUPED transformation.
        inplace : bool, default: ``False``
            If is ``True``, then method returns ``None`` and
            sets a new column for the original dataframe.
            Otherwise return copied dataframe with a new column.

        Returns
        -------
        result : Union[pd.DataFrame, None]
            Value produced by the inherited ``_return_result`` helper
            (see ``inplace``).
        """
        # The fitted check must come first: on an unfitted instance
        # ``covariate_columns`` is ``None`` and the list concatenation below
        # would fail with an unrelated TypeError.
        self._check_fitted()
        target_column = self.params["target_column"]
        covariate_columns = self.params["covariate_columns"]
        self._check_cols(dataframe, [target_column] + covariate_columns)
        new_target: np.ndarray = self(dataframe[target_column].values, dataframe[covariate_columns].values)
        if self.verbose:
            old_variance: float = np.var(dataframe[target_column])
            new_variance: float = np.var(new_target)
            self._verbose(old_variance, new_variance)
        return self._return_result(dataframe, new_target, inplace)

    def fit_transform(
        self,
        dataframe: pd.DataFrame,
        target_column: types.ColumnNameType,
        covariate_columns: types.ColumnNamesType,
        transformed_name: Optional[types.ColumnNameType] = None,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Combination of fit() and transform() methods.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Table with data for fitting and applying Multi CUPED transformation.
        target_column : ColumnNameType
            Column from the dataframe, for which CUPED transformation will be
            applied.
        covariate_columns : ColumnNamesType
            Columns which will be used as the covariates.
        transformed_name : ColumnNamesType, optional
            Name for the new transformed target column, if is not defined
            it will be generated automatically.
        inplace : bool, default: ``False``
            If is ``True``, then method returns ``None`` and
            sets a new column for the original dataframe.
            Otherwise return copied dataframe with a new column.

        Returns
        -------
        result : Union[pd.DataFrame, None]
            Same value as returned by transform().
        """
        self.fit(dataframe, target_column, covariate_columns, transformed_name)
        return self.transform(dataframe, inplace)