Source code for ambrosia.preprocessing.transformers

#  Copyright 2022 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Module contains tools for metrics transformations during a
preprocessing task.
"""
from typing import Dict, Optional, Union

import numpy as np
import pandas as pd
import scipy.stats as sps

from ambrosia import types
from ambrosia.tools.ab_abstract_component import AbstractFittableTransformer
from ambrosia.tools.back_tools import wrap_cols


class BoxCoxTransformer(AbstractFittableTransformer):
    """
    Unit for a Box-Cox transformation of the pandas data.

    A Box-Cox transformation helps to transform non-normal dependent variables
    into a normal shape. All variables values must be positive.
    Optimal transformation lambdas are selected automatically during
    the transformer fit process (via ``scipy.stats.boxcox``).

    Attributes
    ----------
    column_names : List
        Names of column which will be selected for the transformation.
    lambda_ : np.ndarray
        Array of parameters using during the transformation of the
        selected columns, one value per entry of ``column_names``.
    fitted : bool
        Fit flag.

    Examples
    --------
    >>> boxcox = BoxCoxTransformer()
    >>> boxcox.fit(dataframe, ['column1', 'column2'])
    >>> boxcox.transform(dataframe, inplace=True)
    """

    def __str__(self) -> str:
        return "Box-Cox transformation"

    def __init__(
        self,
    ) -> None:
        """
        BoxCoxTransformer class constructor.
        """
        self.column_names = None
        self.lambda_ = None
        super().__init__()

    def __calculate_lambda_(
        self,
        dataframe: pd.DataFrame,
    ) -> None:
        """
        Estimate one Box-Cox lambda per selected column and store them
        in ``self.lambda_``.
        """
        columns_num: int = len(self.column_names)
        self.lambda_ = np.zeros(columns_num)
        X: np.ndarray = dataframe[self.column_names].to_numpy(dtype=float)
        for num in range(columns_num):
            # scipy.stats.boxcox returns (transformed_data, fitted_lambda)
            # when no lambda is supplied; only the lambda is kept.
            self.lambda_[num] = sps.boxcox(X[:, num])[1]

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Returns
        -------
        params : Dict
            Dictionary with fitted params. ``lambda_`` is converted to a
            plain list.
        """
        self._check_fitted()
        return {
            "column_names": self.column_names,
            "lambda_": self.lambda_.tolist(),
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load instance parameters from the dictionary.

        All required keys are validated before any attribute is assigned,
        so a failed load leaves the instance unchanged.

        Parameters
        ----------
        params : Dict
            Dictionary with params. Must contain ``column_names``
            and ``lambda_``.

        Raises
        ------
        TypeError
            If one of the required keys is missing.
        """
        for key in ("column_names", "lambda_"):
            if key not in params:
                raise TypeError(f"params argument must contain: {key}")
        self.column_names = params["column_names"]
        self.lambda_ = np.array(params["lambda_"], dtype=float)
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
    ):
        """
        Fit to calculate transformation parameters for the selected columns.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to calculate optimal transformation parameters.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.

        Returns
        -------
        self : object
            Instance object.
        """
        self.column_names = wrap_cols(column_names)
        self._check_cols(dataframe, self.column_names)
        self.__calculate_lambda_(dataframe)
        self.fitted = True
        return self

    def transform(self, dataframe: pd.DataFrame, inplace: bool = False) -> Union[pd.DataFrame, None]:
        """
        Apply Box-Cox transformation for the data.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to transform.
        inplace : bool, default: ``False``
            If ``True`` transforms the given dataframe, otherwise copy and
            returns an another one.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self._check_fitted()
        self._check_cols(dataframe, self.column_names)
        transformed: pd.DataFrame = dataframe if inplace else dataframe.copy()
        # Work on an explicit float copy: with integer columns a plain
        # ``.values`` array is integer-typed and writing the results back
        # into it would silently truncate them.
        X: np.ndarray = transformed[self.column_names].to_numpy(dtype=float, copy=True)
        for num in range(len(self.column_names)):
            lmbda = self.lambda_[num]
            if lmbda == 0:
                # lambda == 0 is the logarithmic case of Box-Cox.
                X[:, num] = np.log(X[:, num])
            else:
                X[:, num] = (X[:, num] ** lmbda - 1) / lmbda
        transformed[self.column_names] = X
        return None if inplace else transformed

    def fit_transform(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Fit transformer parameters using given dataframe and transform it.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe for calculation of optimal parameters and further
            transformation.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.
        inplace : bool, default: ``False``
            If ``True`` transforms the given dataframe, otherwise copy and
            returns an another one.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self.fit(dataframe, column_names)
        return self.transform(dataframe, inplace)

    def inverse_transform(self, dataframe: pd.DataFrame, inplace: bool = False) -> Union[pd.DataFrame, None]:
        """
        Apply inverse Box-Cox transformation for the data.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to inverse transform.
        inplace : bool, default: ``False``
            If ``True`` transforms the given dataframe, otherwise copy and
            returns an another one.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self._check_fitted()
        self._check_cols(dataframe, self.column_names)
        transformed: pd.DataFrame = dataframe if inplace else dataframe.copy()
        # Float copy for the same reason as in ``transform``: avoid writing
        # float results into an integer-typed buffer.
        X_tr: np.ndarray = transformed[self.column_names].to_numpy(dtype=float, copy=True)
        for num in range(len(self.column_names)):
            lmbda = self.lambda_[num]
            if lmbda == 0:
                X_tr[:, num] = np.exp(X_tr[:, num])
            else:
                X_tr[:, num] = (X_tr[:, num] * lmbda + 1) ** (1 / lmbda)
        transformed[self.column_names] = X_tr
        return None if inplace else transformed
class LogTransformer(AbstractFittableTransformer):
    """
    Unit for a logarithmic transformation of the pandas data.

    Taking the natural logarithm of a metric can bring its distribution
    closer to a normal shape and reduce the variance. Every value in the
    selected columns must be strictly positive.

    Attributes
    ----------
    column_names : List
        Names of the columns selected for the transformation.
    fitted : bool
        Fit flag.

    Examples
    --------
    >>> log = LogTransformer()
    >>> log.fit(dataframe, ['column1', 'column2'])
    >>> log.transform(dataframe, inplace=True)
    """

    def __str__(self) -> str:
        return "Logarithmic transformation"

    def __init__(self) -> None:
        """
        Create an unfitted LogTransformer.
        """
        self.column_names = None
        super().__init__()

    def get_params_dict(self) -> Dict:
        """
        Return the fitted parameters as a dictionary.

        Returns
        -------
        params : Dict
            Dictionary holding the ``column_names`` entry.
        """
        self._check_fitted()
        return {"column_names": self.column_names}

    def load_params_dict(self, params: Dict) -> None:
        """
        Restore the transformer state from a dictionary.

        Parameters
        ----------
        params : Dict
            Dictionary with params; must contain ``column_names``.

        Raises
        ------
        TypeError
            If ``column_names`` is missing from ``params``.
        """
        if "column_names" not in params:
            raise TypeError(f"params argument must contain: {'column_names'}")
        self.column_names = params["column_names"]
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
    ):
        """
        Remember the names of the columns to transform.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe with metrics.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.

        Returns
        -------
        self : object
            Instance object.
        """
        self.column_names = wrap_cols(column_names)
        self._check_cols(dataframe, self.column_names)
        self.fitted = True
        return self

    def transform(self, dataframe: pd.DataFrame, inplace: bool = False) -> Union[pd.DataFrame, None]:
        """
        Replace the selected columns with their natural logarithm.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to transform.
        inplace : bool, default: ``False``
            If ``True`` the given dataframe is modified, otherwise a copy
            is transformed and returned.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None

        Raises
        ------
        ValueError
            If any value in the selected columns is not positive.
        """
        self._check_fitted()
        self._check_cols(dataframe, self.column_names)
        if inplace:
            target = dataframe
        else:
            target = dataframe.copy()
        all_positive = (target[self.column_names] > 0).all(axis=None)
        if not all_positive:
            raise ValueError(f"All values in columns {self.column_names} must be positive")
        target[self.column_names] = np.log(target[self.column_names].values)
        if inplace:
            return None
        return target

    def fit_transform(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Fit on the given dataframe and transform it in one call.
        Only the column names are fitted.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to transform.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.
        inplace : bool, default: ``False``
            If ``True`` the given dataframe is modified, otherwise a copy
            is transformed and returned.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self.fit(dataframe, column_names)
        result = self.transform(dataframe, inplace)
        return result

    def inverse_transform(self, dataframe: pd.DataFrame, inplace: bool = False) -> Union[pd.DataFrame, None]:
        """
        Undo the log transformation by exponentiating the selected columns.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to inverse transform.
        inplace : bool, default: ``False``
            If ``True`` the given dataframe is modified, otherwise a copy
            is transformed and returned.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self._check_fitted()
        self._check_cols(dataframe, self.column_names)
        if inplace:
            target = dataframe
        else:
            target = dataframe.copy()
        target[self.column_names] = np.exp(target[self.column_names].values)
        if inplace:
            return None
        return target
class LinearizationTransformer(AbstractFittableTransformer):
    """
    Linearization transformer for ratio metrics.

    Converts a ratio metric (numerator / denominator) into a per-unit
    linearized metric intended for use with a t-test:

        linearized_i = numerator_i - ratio * denominator_i

    where ratio = mean(numerator) / mean(denominator), estimated on the
    reference (control group / historical) data passed to fit().

    Attributes
    ----------
    numerator : str
        Column name of the ratio numerator (e.g. "revenue").
    denominator : str
        Column name of the ratio denominator (e.g. "orders").
    transformed_name : str
        Name of the column written by ``transform``.
        Defaults to ``"{numerator}_lin"``.
    ratio : float
        Ratio of the column means estimated during fit.
    fitted : bool
        Fit flag.

    Examples
    --------
    >>> transformer = LinearizationTransformer()
    >>> transformer.fit(control_df, "revenue", "orders", "arpu_lin")
    >>> transformer.transform(experiment_df, inplace=True)
    """

    def __str__(self) -> str:
        return "Linearization transformation"

    def __init__(self) -> None:
        """
        LinearizationTransformer class constructor.
        """
        self.numerator: Optional[str] = None
        self.denominator: Optional[str] = None
        self.transformed_name: Optional[str] = None
        self.ratio: Optional[float] = None
        super().__init__()

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        Returns
        -------
        params : Dict
            Dictionary with fitted params.
        """
        self._check_fitted()
        return {
            "numerator": self.numerator,
            "denominator": self.denominator,
            "transformed_name": self.transformed_name,
            "ratio": self.ratio,
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load instance parameters from the dictionary.

        All required keys are validated before any attribute is assigned,
        so a failed load leaves the instance unchanged.

        Parameters
        ----------
        params : Dict
            Dictionary with params. Must contain ``numerator``,
            ``denominator``, ``transformed_name`` and ``ratio``.

        Raises
        ------
        TypeError
            If one of the required keys is missing.
        """
        required = ("numerator", "denominator", "transformed_name", "ratio")
        for key in required:
            if key not in params:
                raise TypeError(f"params argument must contain: {key}")
        for key in required:
            setattr(self, key, params[key])
        self.fitted = True

    def fit(
        self,
        dataframe: pd.DataFrame,
        numerator: str,
        denominator: str,
        transformed_name: Optional[str] = None,
    ):
        """
        Estimate ratio = mean(numerator) / mean(denominator) on reference data.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Reference dataframe (typically control group or historical data).
        numerator : str
            Column name of the ratio numerator.
        denominator : str
            Column name of the ratio denominator.
        transformed_name : str, optional
            Name for the linearized column. Defaults to ``"{numerator}_lin"``.

        Returns
        -------
        self : object
            Instance object.

        Raises
        ------
        ValueError
            If the denominator mean is zero, or if the resulting ratio is
            not finite (e.g. empty or all-NaN columns).
        """
        self._check_cols(dataframe, [numerator, denominator])
        denom_mean = dataframe[denominator].mean()
        if denom_mean == 0:
            raise ValueError(f"Mean of denominator column '{denominator}' is zero; cannot compute ratio.")
        # Cast to a builtin float so the stored/serialized parameter is not
        # a numpy scalar.
        ratio = float(dataframe[numerator].mean() / denom_mean)
        if not np.isfinite(ratio):
            # A NaN/inf ratio would make every transformed value meaningless,
            # so fail here instead of marking the transformer as fitted.
            raise ValueError(
                f"Ratio of means of columns '{numerator}' and '{denominator}' is not finite; cannot compute ratio."
            )
        # State is only updated after all validation has passed.
        self.numerator = numerator
        self.denominator = denominator
        self.transformed_name = transformed_name if transformed_name is not None else f"{numerator}_lin"
        self.ratio = ratio
        self.fitted = True
        return self

    def transform(self, dataframe: pd.DataFrame, inplace: bool = False) -> Union[pd.DataFrame, None]:
        """
        Apply linearization: transformed = numerator - ratio * denominator.

        The result is written to the ``transformed_name`` column; the
        numerator and denominator columns themselves are not modified
        unless ``transformed_name`` coincides with one of them.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to transform.
        inplace : bool, default: ``False``
            If ``True`` modifies dataframe in place, otherwise returns a copy.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self._check_fitted()
        self._check_cols(dataframe, [self.numerator, self.denominator])
        df = dataframe if inplace else dataframe.copy()
        df[self.transformed_name] = df[self.numerator] - self.ratio * df[self.denominator]
        return None if inplace else df

    def fit_transform(
        self,
        dataframe: pd.DataFrame,
        numerator: str,
        denominator: str,
        transformed_name: Optional[str] = None,
        inplace: bool = False,
    ) -> Union[pd.DataFrame, None]:
        """
        Fit and transform in one step.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Reference dataframe for fitting and transformation.
        numerator : str
            Column name of the ratio numerator.
        denominator : str
            Column name of the ratio denominator.
        transformed_name : str, optional
            Name for the linearized column.
        inplace : bool, default: ``False``
            If ``True`` modifies dataframe in place.

        Returns
        -------
        df : Union[pd.DataFrame, None]
            Transformed dataframe or None
        """
        self.fit(dataframe, numerator, denominator, transformed_name)
        return self.transform(dataframe, inplace)