# Copyright 2022 MTS (Mobile Telesystems)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Module contains tools for outliers removal from data during a
preprocessing task.
"""
from typing import Dict, Iterable, List, Union
import numpy as np
import pandas as pd
from ambrosia import types
from ambrosia.tools import log
from ambrosia.tools.ab_abstract_component import AbstractFittableTransformer
from ambrosia.tools.back_tools import wrap_cols
class RobustPreprocessor(AbstractFittableTransformer):
    """
    Unit for simple robust transformation for avoiding outliers in data.

    It cuts the alpha percentage of distribution from head, tail or both sides
    for each given metric.
    The data distribution structure is assumed to consist of a small alpha
    part of outliers, followed by the normal part of the data with another
    alpha part of outliers at the end of the distribution.

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will show info about the transformation of passed columns.

    Attributes
    ----------
    params : Dict
        Dictionary with operational parameters of the instance
        (``"tail"``, ``"column_names"``, ``"alpha"``, ``"quantiles"``).
        Updated after calling the ``fit`` method.
    verbose : bool
        Verbose info flag.
    available_tails : List
        List of the available tail type names to preprocess.
    non_serializable_params : List
        List of the class parameters that should be converted to lists
        in order to serialize.
    fitted : bool
        Fit flag.

    Examples
    --------
    >>> robust = RobustPreprocessor(verbose=True)
    >>> robust.fit(dataframe, ['column1', 'column2'], alpha=0.05)
    >>> robust.transform(dataframe, inplace=True)

    You can pass one or number of columns, if several columns are passed
    it will drop in total alpha percent of extreme values for each column.
    """

    available_tails: List = ["both", "left", "right"]
    non_serializable_params: List = ["alpha", "quantiles"]

    def __init__(self, verbose: bool = True) -> None:
        """
        RobustPreprocessor class constructor.

        Parameters
        ----------
        verbose : bool, default: ``True``
            Verbose info flag stored on the instance.
        """
        self.params = {
            "tail": None,
            "column_names": None,
            "alpha": None,
            "quantiles": None,
        }
        self.verbose = verbose
        super().__init__()

    def __str__(self) -> str:
        return "Robust preprocessing"

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        The fitted state is verified via ``_check_fitted`` before the
        parameters are exported. Parameters listed in
        ``non_serializable_params`` are converted to plain (nested) lists.

        Returns
        -------
        params : Dict
            Dictionary with fitted params.
        """
        self._check_fitted()
        return {
            # np.asarray makes the conversion work for ndarrays and for
            # values that were assigned as plain sequences alike.
            key: (np.asarray(value).tolist() if key in self.non_serializable_params else value)
            for key, value in self.params.items()
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load prefitted parameters from a dictionary.

        The instance is modified only if every expected parameter is present,
        so a failed load never leaves a partially updated state behind.

        Parameters
        ----------
        params : Dict
            Dictionary with prefitted params. Must contain every key of
            ``self.params``; extra keys are ignored.

        Raises
        ------
        TypeError
            If one of the expected parameters is missing in ``params``.
        """
        # Validate first: nothing is assigned until all keys are known to exist.
        for parameter in self.params:
            if parameter not in params:
                raise TypeError(f"params argument must contain: {parameter}")
        loaded = {
            parameter: (
                np.array(params[parameter])
                if parameter in self.non_serializable_params
                else params[parameter]
            )
            for parameter in self.params
        }
        self.params.update(loaded)
        self.fitted = True

    def __wrap_alpha(self, alpha: Union[float, Iterable]) -> np.ndarray:
        """
        Validate ``alpha`` and expand it to one value per fitted column.

        Parameters
        ----------
        alpha : Union[float, Iterable]
            Either a single real number (Python or NumPy scalar), which is
            repeated for every column in ``self.params["column_names"]``,
            or an iterable holding one value per column.

        Returns
        -------
        alpha : np.ndarray
            One-dimensional array with one alpha value per column.

        Raises
        ------
        ValueError
            If ``alpha`` is neither a real scalar nor an iterable, if its
            length differs from the number of columns, or if any value lies
            outside the half-open interval [0, 0.5).
        """
        columns_num = len(self.params["column_names"])
        # Integers and NumPy scalars are accepted as well as built-in floats,
        # so that e.g. ``alpha=0`` or ``alpha=np.float32(0.1)`` is valid.
        if isinstance(alpha, (int, float, np.integer, np.floating)):
            alpha = np.full(columns_num, float(alpha))
        elif isinstance(alpha, Iterable):
            alpha = np.array(alpha)
        else:
            raise ValueError("Alpha parameter must be float or an iterable")
        # The ndim check comes first: len() of a 0-d array (e.g. built from a
        # string) would raise TypeError instead of the expected ValueError.
        if alpha.ndim != 1 or len(alpha) != columns_num:
            raise ValueError("Alpha length must be equal to the columns number")
        if (alpha < 0).any() or (alpha >= 0.5).any():
            raise ValueError(f"Alpha value must be from 0 to 0.5, but alpha vector = {alpha}")
        return alpha

    def __check_tail(self, tail: str) -> str:
        """
        Return ``tail`` unchanged if it is one of ``available_tails``.

        Raises
        ------
        ValueError
            If ``tail`` is not listed in ``available_tails``.
        """
        if tail not in self.available_tails:
            raise ValueError(f"tail must be one of {self.available_tails}")
        return tail

    def __calculate_quantiles(
        self,
        dataframe: pd.DataFrame,
    ) -> None:
        """
        Compute the cut-off quantiles of every fitted column.

        Reads ``"column_names"``, ``"alpha"`` and ``"tail"`` from
        ``self.params`` and stores the result under ``"quantiles"``:

        - ``tail == "both"``: shape ``(n_columns, 2)`` holding the
          ``alpha / 2`` and ``1 - alpha / 2`` quantiles;
        - ``tail == "left"``: shape ``(n_columns, 1)`` holding the ``alpha``
          quantile;
        - otherwise: shape ``(n_columns, 1)`` holding the ``1 - alpha``
          quantile.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe containing the fitted columns.
        """
        column_names = self.params["column_names"]
        tail = self.params["tail"]
        width = 2 if tail == "both" else 1
        quantiles = np.zeros((len(column_names), width))
        for num, (col, alpha) in enumerate(zip(column_names, self.params["alpha"])):
            if tail == "both":
                # The alpha mass is split evenly between the two tails.
                levels = [alpha / 2, 1 - alpha / 2]
            elif tail == "left":
                levels = [alpha]
            else:
                levels = [1 - alpha]
            quantiles[num, :] = np.quantile(dataframe[col].values, levels)
        self.params["quantiles"] = quantiles

    def fit(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
        alpha: Union[float, Iterable] = 0.05,
        tail: str = "both",
    ) -> "RobustPreprocessor":
        """
        Fit to calculate robust parameters for the selected columns.

        If any validation or computation step fails, the previously stored
        parameters are restored before the exception propagates.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to calculate quantiles.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.
        alpha : Union[float, Iterable], default: ``0.05``
            The percentage of removed data from head and tail.
            A single value is applied to every column; an iterable must
            contain one value per column. Each value must be in [0, 0.5).
        tail : str, default: ``"both"``
            Part of distribution to be removed.
            Can be ``"left"``, ``"right"`` or ``"both"``.

        Returns
        -------
        self : object
            Instance object.
        """
        snapshot = dict(self.params)
        try:
            self.params["column_names"] = wrap_cols(column_names)
            self._check_cols(dataframe, self.params["column_names"])
            self.params["alpha"] = self.__wrap_alpha(alpha)
            self.params["tail"] = self.__check_tail(tail)
            self.__calculate_quantiles(dataframe)
        except Exception:
            # Roll back so a failed re-fit does not leave a mix of old and
            # new parameters on an instance that is still marked as fitted.
            self.params.update(snapshot)
            raise
        self.fitted = True
        return self
class IQRPreprocessor(AbstractFittableTransformer):
    """
    Unit for IQR transformation of the data to exclude outliers.

    It cuts the points from the distribution which are behind the range of
    0.25 quantile - 1.5 * IQR and 0.75 quantile + 1.5 * IQR
    for each given metric.

    Parameters
    ----------
    verbose : bool, default: ``True``
        If ``True`` will show info about the transformation of passed columns.

    Attributes
    ----------
    params : Dict
        Dictionary with operational parameters of the instance
        (``"column_names"``, ``"medians"``, ``"quartiles"``).
        Updated after calling the ``fit`` method.
    verbose : bool
        Verbose info flag.
    non_serializable_params : List
        List of the class parameters that should be converted to lists
        in order to serialize.
    fitted : bool
        Fit flag.

    Examples
    --------
    >>> iqr = IQRPreprocessor(verbose=True)
    >>> iqr.fit(dataframe, ['column1', 'column2'])
    >>> iqr.transform(dataframe, inplace=True)

    You can pass one or number of columns, if several columns are passed
    it will drop extreme values for each column.
    """

    non_serializable_params: List = ["medians", "quartiles"]

    def __init__(self, verbose: bool = True) -> None:
        """
        IQRPreprocessor class constructor.

        Parameters
        ----------
        verbose : bool, default: ``True``
            Verbose info flag stored on the instance.
        """
        self.params = {"column_names": None, "medians": None, "quartiles": None}
        self.verbose = verbose
        super().__init__()

    def __str__(self) -> str:
        return "IQR outliers preprocessing"

    def get_params_dict(self) -> Dict:
        """
        Returns a dictionary with params.

        The fitted state is verified via ``_check_fitted`` before the
        parameters are exported. Parameters listed in
        ``non_serializable_params`` are converted to plain (nested) lists.

        Returns
        -------
        params : Dict
            Dictionary with fitted params.
        """
        self._check_fitted()
        return {
            # np.asarray makes the conversion work for ndarrays and for
            # values that were assigned as plain sequences alike.
            key: (np.asarray(value).tolist() if key in self.non_serializable_params else value)
            for key, value in self.params.items()
        }

    def load_params_dict(self, params: Dict) -> None:
        """
        Load prefitted parameters from a dictionary.

        The instance is modified only if every expected parameter is present,
        so a failed load never leaves a partially updated state behind.

        Parameters
        ----------
        params : Dict
            Dictionary with prefitted params. Must contain every key of
            ``self.params``; extra keys are ignored.

        Raises
        ------
        TypeError
            If one of the expected parameters is missing in ``params``.
        """
        # Validate first: nothing is assigned until all keys are known to exist.
        for parameter in self.params:
            if parameter not in params:
                raise TypeError(f"params argument must contain: {parameter}")
        loaded = {
            parameter: (
                np.array(params[parameter])
                if parameter in self.non_serializable_params
                else params[parameter]
            )
            for parameter in self.params
        }
        self.params.update(loaded)
        self.fitted = True

    def __calculate_params(
        self,
        dataframe: pd.DataFrame,
    ) -> None:
        """
        Compute per-column quartiles and medians of the fitted columns.

        Stores the 0.25 and 0.75 quantiles under ``"quartiles"`` (transposed
        so that each row corresponds to one column) and the medians under
        ``"medians"``.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe containing the columns from
            ``self.params["column_names"]``.
        """
        # Assumes column_names is list-like (as produced by wrap_cols), so the
        # selection yields a 2-D (rows, columns) array -- TODO confirm.
        values: np.ndarray = dataframe[self.params["column_names"]].values
        self.params["quartiles"] = np.quantile(values, (0.25, 0.75), axis=0).T
        self.params["medians"] = np.median(values, axis=0)

    def fit(
        self,
        dataframe: pd.DataFrame,
        column_names: types.ColumnNamesType,
    ) -> "IQRPreprocessor":
        """
        Fit to calculate iqr parameters for the selected columns.

        If any validation or computation step fails, the previously stored
        parameters are restored before the exception propagates.

        Parameters
        ----------
        dataframe : pd.DataFrame
            Dataframe to calculate quantiles.
        column_names : ColumnNamesType
            One or number of columns in the dataframe.

        Returns
        -------
        self : object
            Instance object.
        """
        snapshot = dict(self.params)
        try:
            self.params["column_names"] = wrap_cols(column_names)
            self._check_cols(dataframe, self.params["column_names"])
            self.__calculate_params(dataframe)
        except Exception:
            # Roll back so a failed re-fit does not leave a mix of old and
            # new parameters on an instance that is still marked as fitted.
            self.params.update(snapshot)
            raise
        self.fitted = True
        return self