# Copyright 2022 MTS (Mobile Telesystems)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Module contains ML-based data transformation methods for the experiment
acceleration.
"""
import json
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Union
import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from ambrosia import types
from ambrosia.tools import log
from ambrosia.tools.ab_abstract_component import AbstractVarianceReducer
from ambrosia.tools.back_tools import wrap_cols
[docs]
class MLVarianceReducer(AbstractVarianceReducer):
"""
Machine Learning approach for variance reduction.
Building a model M, we can make a transformation:
Y_hat = Y - M(X) + MEAN(M(X))
It is important that the mean of M(X) does not change over time!
You can choose models from Gradient boosting or Ridge regression or your
own model class, for example ``sklearn.ensemble.RandomForest``, and pass
models params to constructor function for a model assembly.
Parameters
----------
model : str or model type, default: ``"boosting"``
Model which will be used for the transformations.
model_params : Dict, optional
Dictionary with parameters which will be used in constructor
for a model assembly.
scores : Dict[str, Callable], optional
Scores which will be used.
verbose : bool, default: ``True``
If ``True`` will print in sys.stdout the information
about the reduction in variance.
Attributes
----------
model : model type
Model which will be used for the transformations.
params : Dict
Parameters of instance that will be updated after calling fit() method.
Include:
- target column name
- covariate columns names
- name of column after the transformation
- additional train bias equals mean(M(X)).
score : Dict[str, Callable]
Scores which will be used.
verbose : bool
Verbose info flag.
fitted : bool
Fit status flag.
Examples
--------
We have data table with column 'target' and columns 'feature_1',
'feature_2', 'feature_3'. Let us assume, that means of all these metrics
don't change over the time, it can be age for example. We want to reduce
variance using the predictions of some ML model, then we can use this class:
>>> transformer = MLVarianceReducer() # By default CatBoost model will be chosen
>>> transformer.fit_transform(dataframe, 'target', [feature columns], inplace=True, name='new_target')
>>> transformer.store_params('path_ml_params.json', 'path_ml_model.pkl')
Now to transform the experimental data we use the following commands:
>>> transformer = MLVarianceReducer()
>>> transformer.load_params('path_ml_params.json', 'path_ml_model.pkl')
>>> transformer.transform(exp_data, inplace=True)
Methods
-------
get_params_dict()
Returns dict with instance fitted parameters.
load_params_dict(params)
Load parameters from the dict.
store_params(config_store_path, model_store_path)
Store fitted params in a json file and pickle model file.
load_params(config_load_path, model_load_path)
Load params from a json file and pickled model.
fit(**fit_params)
Fit model using a train data.
transform(dataframe, inplace)
Transform target column of a data frame.
fit_transform(dataframe, **fit_params, inplace)
Combination of fit() and transform() methods.
"""
def __set_scorer(self, scores: Optional[Dict[str, Callable]]):
    """
    Store the scoring functions on the instance.

    Parameters
    ----------
    scores : Dict[str, Callable], optional
        Mapping from a score name to a scoring callable. When ``None`` is
        passed, ``{"MSE": mean_squared_error}`` is used instead.
    """
    # Only ``None`` triggers the default; an empty dict is kept as given.
    self.score = {"MSE": mean_squared_error} if scores is None else scores
def __create_model(self) -> None:
    """
    Construct the variance reducing ML model and store it in ``self.model``.

    ``self.model`` may hold one of the following:

    - ``"linear"`` - a ``Ridge`` model is built from ``self.model_params``;
    - ``"boosting"`` - a ``CatBoostRegressor`` is built from
      ``self.model_params``, with ``verbose=False`` unless ``verbose`` is
      given explicitly;
    - a model class or factory - it is called with ``self.model_params``;
    - an already constructed model (a non-class object exposing ``fit`` and
      ``predict``) - it is kept as is, so calling this method again after
      the model was built or loaded does not fail.

    Raises
    ------
    ValueError
        If ``self.model`` is a string other than ``"linear"`` or
        ``"boosting"``.
    """
    model = self.model
    if isinstance(model, str):
        if model == "linear":
            self.model = Ridge(**self.model_params)
        elif model == "boosting":
            # Rebind to a fresh dict so the dictionary handed to the
            # constructor by the caller is not mutated.
            self.model_params = {"verbose": False, **self.model_params}
            self.model = CatBoostRegressor(**self.model_params)
        else:
            raise ValueError(
                f"Unknown model name '{model}', choose 'linear', 'boosting' or pass a model class"
            )
        return
    # An object that is not a class but already exposes fit/predict is a
    # built model; calling it again would raise TypeError.
    already_built = not isinstance(model, type) and hasattr(model, "fit") and hasattr(model, "predict")
    if not already_built:
        self.model = model(**self.model_params)
def __init__(
    self,
    model: Union[str, Any] = "boosting",
    model_params: Optional[Dict] = None,
    scores: Optional[Dict[str, Callable]] = None,
    verbose: bool = True,
) -> None:
    """
    Set up an unfitted reducer.

    Parameters
    ----------
    model : str or model type, default: ``"boosting"``
        Model name or model class, stored as given until the model is built.
    model_params : Dict, optional
        Keyword arguments for the model constructor, an empty dict if omitted.
    scores : Dict[str, Callable], optional
        Scoring callables by name, see ``__set_scorer``.
    verbose : bool, default: ``True``
        Forwarded to the base class initializer.
    """
    super().__init__(verbose)
    # Register the ML-specific entries in self.params with empty values.
    for fit_param in ("covariate_columns", "train_bias"):
        self.params[fit_param] = None
    self.model = model
    self.model_params = model_params if model_params is not None else {}
    self.__set_scorer(scores)
def __str__(self) -> str:
    """
    Return a short description that names the target column.
    """
    target = self.params["target_column"]
    return f"ML approach reduce for {target}"
def __call__(self, y: np.ndarray, X: np.ndarray) -> np.ndarray:
    """
    Apply the variance reducing transformation to target values.

    Computes ``y - model.predict(X) + train_bias``. The instance must be
    fitted, which is verified through ``_check_fitted()``.

    Parameters
    ----------
    y : np.ndarray
        Target values.
    X : np.ndarray
        Covariate values passed to the model's ``predict``.

    Returns
    -------
    y_hat : np.ndarray
        Transformed target values.
    """
    self._check_fitted()
    predicted = self.model.predict(X)
    train_bias = self.params["train_bias"]
    return y - predicted + train_bias
def _verbose_score(self, dataframe: pd.DataFrame, prediction: np.ndarray) -> None:
    """
    Log every configured score of ``prediction`` against the target column.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Table containing the column named by ``self.params["target_column"]``.
    prediction : np.ndarray
        Predicted values handed to each scorer as the second argument.
    """
    for score_name in self.score:
        compute_score = self.score[score_name]
        score_value: float = compute_score(dataframe[self.params["target_column"]], prediction)
        log.info_log(f"Prediction {score_name} score - {score_value:.5f}")
def _check_load_params(self, params: Dict) -> None:
    """
    Validate ``params`` and copy the known entries into ``self.params``.

    Every key currently present in ``self.params`` must also be present in
    ``params``; extra keys of ``params`` are ignored. All keys are validated
    before anything is copied, so a failed load leaves ``self.params``
    untouched instead of half-updated.

    Parameters
    ----------
    params : Dict
        Dictionary with the parameters to load.

    Raises
    ------
    TypeError
        If ``params`` lacks one of the keys of ``self.params``.
    """
    for parameter in self.params:
        if parameter not in params:
            raise TypeError(f"params argument must contain: {parameter}")
    # Only existing keys are reassigned, so the size of self.params is stable.
    self.params.update({parameter: params[parameter] for parameter in self.params})
def get_params_dict(self) -> Dict:
    """
    Collect the fitted parameters together with the model.

    The instance must be fitted, which is verified through
    ``_check_fitted()``.

    Returns
    -------
    params : Dict
        Dictionary with the keys ``target_column``, ``covariate_columns``,
        ``transformed_name``, ``train_bias`` and ``model``.
    """
    self._check_fitted()
    exported_keys = ("target_column", "covariate_columns", "transformed_name", "train_bias")
    params_dict = {key: self.params[key] for key in exported_keys}
    params_dict["model"] = self.model
    return params_dict
def load_params_dict(self, params: Dict) -> None:
    """
    Load instance parameters and the model from a dictionary.

    The parameters are loaded through ``_check_load_params()``; afterwards
    the instance is marked as fitted.

    Parameters
    ----------
    params : Dict
        Dictionary with the instance parameters and a ``"model"`` entry.

    Raises
    ------
    TypeError
        If ``params`` has no ``"model"`` key.
    """
    self._check_load_params(params)
    if "model" not in params:
        raise TypeError(f"params argument must contain: {'model'}")
    self.model = params["model"]
    self.fitted = True
[docs]
def store_params(self, config_store_path: Path, model_store_path: Path) -> None:
    """
    Store the instance parameters as a json file and the model as a
    ``joblib`` dump.

    The instance must be fitted, which is verified through
    ``_check_fitted()``. The model is also reachable as ``instance.model``
    if you prefer to store it by yourself.

    Parameters
    ----------
    config_store_path : Path
        Path where the instance parameters will be stored in a json format.
    model_store_path : Path
        Path where the model will be stored with ``joblib.dump``.
    """
    self._check_fitted()
    # Write-only mode is sufficient; the encoding is pinned so the file does
    # not depend on the platform default.
    with open(config_store_path, "w", encoding="utf-8") as file:
        json.dump(self.params, file)
    joblib.dump(self.model, model_store_path)
[docs]
def load_params(self, config_load_path: Path, model_load_path: Path) -> None:
    """
    Load the instance parameters from a json file and the model from a
    ``joblib`` dump, then mark the instance as fitted.

    Both files are read before the instance is modified, so a failure while
    reading either of them leaves the instance unchanged.

    Parameters
    ----------
    config_load_path : Path
        Path to a json file with the instance parameters.
    model_load_path : Path
        Path to a model file readable by ``joblib.load``.

    Raises
    ------
    TypeError
        If the json file lacks one of the expected parameters
        (raised by ``_check_load_params()``).
    """
    # Read-only mode: "r+" would demand write permission on the config file.
    with open(config_load_path, "r", encoding="utf-8") as file:
        params = json.load(file)
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files that come from a trusted source.
    model = joblib.load(model_load_path)
    self._check_load_params(params)
    self.model = model
    self.fitted = True
[docs]
def fit(
    self,
    dataframe: pd.DataFrame,
    target_column: types.ColumnNameType,
    covariate_columns: types.ColumnNamesType,
    transformed_name: Optional[types.ColumnNamesType] = None,
) -> None:
    """
    Fit model for transformations.

    Builds the model, fits it on the covariates against the target, stores
    the column names in ``self.params`` and remembers the mean prediction on
    the train data as ``train_bias``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Table with data for model fitting.
    target_column : ColumnNameType
        Column from the dataframe, for which transformation will be
        applied.
    covariate_columns: ColumnNamesType
        Columns which will be used for the transformation.
    transformed_name : ColumnNamesType, optional
        Name for the new transformed target column; stored as given
        (``None`` when it is not defined).
    """
    covariate_columns = wrap_cols(covariate_columns)
    self._check_cols(dataframe, [target_column] + covariate_columns)
    self.__create_model()
    # Extract the feature matrix once and reuse it for fitting and for the
    # train prediction below.
    features = dataframe[covariate_columns].values
    self.model.fit(features, dataframe[target_column].values)
    self.params["target_column"] = target_column
    self.params["transformed_name"] = transformed_name
    self.params["covariate_columns"] = covariate_columns
    # A plain float keeps train_bias JSON serializable in store_params() even
    # when the model predicts in a dtype such as float32.
    self.params["train_bias"] = float(np.mean(self.model.predict(features)))
    self.fitted = True