Source code for ambrosia.tester.tester

#  Copyright 2022 MTS (Mobile Telesystems)
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Experiment results evaluation and testing methods.

Module contains `Tester` core class and `test` function which are
designed to evaluate statistical significance of the experiment results
and a magnitude of effect via large number of methods and criteria.

It is recommended to use for parameters such as test method and statistical
criterion the values that were chosen during the experiment design stage.

Currently, experimental results can only be processed and evaluated as
pandas DataFrames or .csv tables. Support for Spark dataframes is under
development and will be available soon.

"""
import itertools
from copy import deepcopy
from typing import Any, Callable, Dict, List, Optional, Union
from warnings import warn

import numpy as np
import pandas as pd

import ambrosia.tools.empirical_tools as empirical_pkg
import ambrosia.tools.multitest as multitest_pkg
import ambrosia.tools.pvalue_tools as pvalue_pkg
import ambrosia.tools.srm as srm_pkg
import ambrosia.tools.stat_criteria as criteria_pkg
from ambrosia import types
from ambrosia.tools.ab_abstract_component import ABStatCriterion, ABToolAbstract, DataframeHandler, StatCriterion

from .binary_result_evaluation import binary_absolute_result, binary_relative_result
from .handlers import TheoreticalTesterHandler, filter_spark_and_make_groups

# Default number of bootstrap resamples; used as the default ``bootstrap_size``
# of ``Tester.__bootstrap_result`` (the "empiric" method).
BOOTSTRAP_SIZE: int = 10000
# Presumably the names of supported dataframe backends; not referenced in the
# visible part of this module - TODO confirm usage.
AVAILABLE: List[str] = ["pandas", "spark"]
# String aliases of the built-in criteria; ``Tester.__theory_handler`` resolves
# a criterion passed by name through this mapping.
AVAILABLE_AB_CRITERIA: Dict[str, ABStatCriterion] = {
    "ttest": criteria_pkg.TtestIndCriterion,
    "ttest_rel": criteria_pkg.TtestRelCriterion,
    "mw": criteria_pkg.MannWhitneyCriterion,
    "wilcoxon": criteria_pkg.WilcoxonCriterion,
}
# Multiple testing correction names as reported by ``multitest_pkg.available_methods()``.
AVAILABLE_MULTITEST_CORRECTIONS: List[str] = multitest_pkg.available_methods()



[docs]
class Tester(ABToolAbstract):
    """
    Unit for evaluating the results of experiments.

    The experiment evaluation result contains:
        - Pvalue for the selected criterion
        - Point effect estimation
        - Corresponding confidence interval for the effect
        - Boolean result - presence / absence of the effect

    Parameters
    ----------
    dataframe : PassedDataType, optional
        Dataframe used with experiment results metrics.
    df_mapping : GroupsInfoType, optional
        Dataframe which contains group labels of objects.
    experiment_results : ExperimentResults, optional
        Dict with separate experiment results for each group.
        Dict keys are used as groups labels, values must be either
        pandas or Spark dataframes.
    column_groups : ColumnNameType, optional
        Column which contains groups label of objects.
    group_labels : GroupLabelsType, optional
        Labels for experimental groups. If ``column_groups`` contains
        at least two values, they will choose for labels.
    id_column : ColumnNameType, optional
        Name of column with objects ids in ``df_mapping`` dataframe.
    first_type_errors : StatErrorType, default: ``0.05``
        I type errors values. Fix P (detect difference for equal) to be less
        than threshold. Used to construct confidence intervals.
    metrics : MetricNameType, optional
        Metrics (columns of dataframe) which is used to calculate
        experiment result.
    metric_funcs : Dict[str, Callable], optional
        Dictionary mapping metric names to callable functions.
        Each function receives a ``pd.DataFrame`` (group data) and must
        return an array-like of numeric values. When provided, the
        function is used instead of column lookup for the corresponding
        metric name. Only supported for pandas DataFrames.

    Attributes
    ----------
    dataframe : PassedDataType
        Dataframe used with experiment results metrics.
    df_mapping : GroupsInfoType
        Dataframe which contains group labels of objects.
    experiment_results : ExperimentResults, optional
        Dict with separate experiment results for each group.
    column_groups : ColumnNameType
        Column which contains groups label of objects.
    group_labels : GroupLabelsType
        Labels for experimental groups.
    id_column : ColumnNameType
        Name of column with objects ids in ``df_mapping`` dataframe.
    first_type_errors : StatErrorType, default: ``0.05``
        I type errors values.
    metrics : MetricNameType
        Columns of dataframe with experiment results.

    Examples
    --------
    We've experimented with adding onboarding to our mobile app and
    would like to know about its results in terms of A/B testing.
    Suppose we have a loaded pandas dataframe with a column responsible
    for the groups in the testing and columns with metric values,
    such as retention. Then you can use the tester class the following way:

    >>> tester = Tester(
    >>>     dataframe=df,
    >>>     column_groups='groups',
    >>>     metrics='retention'
    >>> )
    >>> tester.run()
    >>> # Output
    >>> [{
    >>>     'first_type_error' : 0.05,
    >>>     'pvalue' : 0.03,
    >>>     'effect' : 1.05,
    >>>     'confidence_interval' : (1.01, 1.10),
    >>>     'metric name': 'retention',
    >>>     'group A label': 'A',
    >>>     'group B label': 'B'
    >>> }]

    Notes
    -----
    Basic mathematic methods for evaluating experiments:

        - Theory:
            - Absolute: Using ttest, mann-whitney, others and custom criteria
            - Relative: Using delta method

        - Empiric:
            - Absolute / Relative: Building empirical distribution for T(A, B)

        - Binary:
            - Absolute: Using special binary intervals and
              finding pvalue = inf_a {x : 0 not in interval(x)}
            - Relative: Not implemented yet :(

    Constructors:

    >>> # Empty constructor
    >>> tester = Tester()
    >>> # You can pass Iterable or single object for some parameters
    >>> tester = Tester(
    >>>     dataframe=df,
    >>>     column_groups='groups',
    >>>     metrics=['ltv', 'retention']
    >>> )
    >>> tester = Tester(metrics='retention', first_type_errors=[0.01, 0.05])
    >>> # You can set a separate table containing information about
    >>> # the partitioning in the experiment
    >>> tester = Tester(
    >>>     dataframe=df, # main dataframe with metrics
    >>>     df_mapping=groups, # table with information about groups
    >>>     metrics='metric', # Metric to be tested
    >>>     column_groups='group', # Column in df_mapping with labels
    >>>     id_column='id' # Column with ids in df and df_mapping (for join)
    >>> )

    Setters:

    >>> tester.set_metrics(['ltv', 'retention'])
    >>> tester.set_dataframe(dataframe=dataframe, column_groups='groups')
    >>> # You can set separate data of each group packed in special dict form
    >>> tester.set_experiment_results(experiment_results=experiment_results)

    Run:

    >>> # You can choose effect_type to estimate: relative / absolute
    >>> tester.run('absolute')
    >>> # Also you can choose method
    >>> tester.run('absolute', method='empiric') # empiric for bootstrap
    >>> # One can pass arguments in run() method and they will have
    >>> # higher priority
    >>> tester.run(metrics='ltv', data_a_group=df_a)

    Use a function instead of a class:

    >>> test('absolute', dataframe=df, column_groups='groups', metrics='ltv')
    """

    # This is for avoiding warnings from pytest
    __test__ = False

    def set_experiment_results(self, experiment_results: types.ExperimentResults) -> None:
        """
        Store already separated experiment results on the instance.

        Parameters
        ----------
        experiment_results : ExperimentResults
            Dict with separate experiment results for each group.
            The value is stored as is, without validation or copying.
        """
        self.__experiment_results = experiment_results

    def set_errors(self, first_type_errors: types.StatErrorType) -> None:
        """
        Store the first type error levels on the instance as a numpy array.

        Parameters
        ----------
        first_type_errors : StatErrorType
            Single level or a collection of levels.
            ``None`` is replaced with the default level ``0.05``;
            a single ``float`` is wrapped into a one-element array.
        """
        levels = 0.05 if first_type_errors is None else first_type_errors
        self.__alpha = np.array([levels] if isinstance(levels, float) else levels)

    def set_metrics(self, metrics: types.MetricNamesType) -> None:
        """
        Store the metrics to be tested on the instance.

        Parameters
        ----------
        metrics : MetricNamesType
            Single metric name or a collection of names.
            A single name is wrapped into a one-element list,
            any other value is stored unchanged.
        """
        is_single_name = isinstance(metrics, types.MetricNameType)
        self.__metrics = [metrics] if is_single_name else metrics

    def set_dataframe(
        self,
        dataframe: types.PassedDataType,
        column_groups: types.MetricNameType,
        group_labels: types.GroupLabelsType = None,
        df_mapping: types.GroupsInfoType = None,
        id_column: types.MetricNameType = None,
    ) -> None:
        """
        Split the passed data into experimental groups and store them.

        The pandas handler ``Tester.__filter_data`` and the Spark handler
        ``filter_spark_and_make_groups`` are handed to
        ``DataframeHandler._handle_cases`` together with the arguments below;
        whatever it returns becomes the stored experiment results.

        Parameters
        ----------
        dataframe : PassedDataType
            Dataframe with experiment results metrics.
        column_groups : ColumnNameType
            Column which contains groups label of objects.
        group_labels : GroupLabelsType, optional
            Labels for experimental groups.
        df_mapping : GroupsInfoType, optional
            Dataframe which contains group labels of objects.
        id_column : ColumnNameType, optional
            Name of column with objects ids in ``df_mapping`` dataframe.
        """
        handler = DataframeHandler()
        self.__experiment_results = handler._handle_cases(
            Tester.__filter_data,
            filter_spark_and_make_groups,
            dataframe=dataframe,
            df_mapping=df_mapping,
            column_groups=column_groups,
            group_labels=group_labels,
            id_column=id_column,
        )

    def __init__(
        self,
        dataframe: Optional[types.PassedDataType] = None,
        df_mapping: Optional[types.GroupsInfoType] = None,
        experiment_results: Optional[types.ExperimentResults] = None,
        column_groups: Optional[types.ColumnNameType] = None,
        group_labels: Optional[types.GroupLabelsType] = None,
        id_column: Optional[types.ColumnNameType] = None,
        first_type_errors: types.StatErrorType = 0.05,
        metrics: Optional[types.MetricNamesType] = None,
        metric_funcs: Optional[Dict[str, Callable]] = None,
    ):
        """
        Initialize the tester with data, error levels and metrics.

        When ``dataframe`` is given it is split into groups via
        ``set_dataframe`` and ``experiment_results`` is ignored;
        otherwise ``experiment_results`` is stored as is.
        """
        if dataframe is None:
            self.set_experiment_results(experiment_results=experiment_results)
        else:
            self.set_dataframe(
                dataframe=dataframe,
                column_groups=column_groups,
                group_labels=group_labels,
                df_mapping=df_mapping,
                id_column=id_column,
            )
        self.set_errors(first_type_errors)
        self.set_metrics(metrics)
        # A missing (or empty) mapping is normalized to an empty dict.
        self.__metric_funcs = metric_funcs if metric_funcs else {}

    @staticmethod
    def __filter_data(
        dataframe: types.PassedDataType,
        df_mapping: types.GroupsInfoType,
        column_groups: types.ColumnNameType,
        group_labels: types.GroupLabelsType,
        id_column: types.ColumnNameType,
    ) -> Optional[types.ExperimentResults]:
        """
        Split a pandas dataframe into separate dataframes for each group.

        Parameters
        ----------
        dataframe : PassedDataType
            Dataframe with metrics. If ``None``, nothing is done.
        df_mapping : GroupsInfoType
            Optional table with group labels; when given it is left-joined
            to ``dataframe`` on ``id_column`` and rows containing missing
            values after the join are dropped.
        column_groups : ColumnNameType
            Column which contains groups label of objects.
        group_labels : GroupLabelsType
            Labels of the groups to extract. If ``None``, all non-missing
            unique values of ``column_groups`` are used.
        id_column : ColumnNameType
            Name of the join column, used only with ``df_mapping``.

        Returns
        -------
        experiment_results : Optional[ExperimentResults]
            Dict mapping each group label to the rows of that group,
            or ``None`` if ``dataframe`` is ``None``.

        Raises
        ------
        ValueError
            If a required column is absent or fewer than two group labels
            are passed.
        """
        if dataframe is None:
            return None

        if df_mapping is not None:
            if id_column not in dataframe:
                raise ValueError(f"Column {id_column}, is not in list of df columns")
            if id_column not in df_mapping:
                raise ValueError(f"Column {id_column}, is not in list of df_mapping columns")
            dataframe = dataframe.merge(df_mapping, how="left", on=id_column).dropna()
        if column_groups not in dataframe:
            raise ValueError(f"Column {column_groups}, is not in list of df columns")

        if group_labels is not None:
            if len(group_labels) < 2:
                raise ValueError(f"Group labels must be at least 2, given {group_labels}")
        else:
            # ``unique()`` reports NaN as a value, but ``== NaN`` never matches,
            # so a missing label would produce an always-empty group. Skip it.
            group_labels = dataframe[column_groups].dropna().unique()
        experiment_results: types.ExperimentResults = {
            group_label: dataframe[dataframe[column_groups] == group_label] for group_label in group_labels
        }
        return experiment_results

    @staticmethod
    def __bootstrap_result(
        group_a: types.GroupType,
        group_b: types.GroupType,
        alpha: np.ndarray,
        bootstrap_size: int = BOOTSTRAP_SIZE,
        effect_type: str = "absolute",
        **kwargs,
    ) -> types._SubResultType:
        """
        Evaluate the effect between two groups with the bootstrap approach.

        The point effect is the difference of means (``"absolute"``) or the
        ratio of means minus one (``"relative"``). Confidence intervals and the
        p-value come from ``empirical_pkg.BootstrapStats``. The ``paired`` flag
        is taken out of ``kwargs`` (default ``False``); the remaining keyword
        arguments are forwarded to every bootstrap call.

        Raises
        ------
        ValueError
            If ``effect_type`` is neither ``"absolute"`` nor ``"relative"``.
        """
        if effect_type != "absolute" and effect_type != "relative":
            raise ValueError("Set effect_type as 'absolute' or 'relative'")

        mean_a, mean_b = np.mean(group_a), np.mean(group_b)
        if effect_type == "absolute":
            statistic_name, point_effect = "mean", mean_b - mean_a
        else:
            statistic_name, point_effect = "fraction", mean_b / mean_a - 1

        is_paired: bool = kwargs.pop("paired", False)
        bootstrap = empirical_pkg.BootstrapStats(bootstrap_size=bootstrap_size, metric=statistic_name, paired=is_paired)
        bootstrap.fit(group_a, group_b, **kwargs)
        lower_bounds, upper_bounds = bootstrap.confidence_interval(confidence_level=1 - alpha, **kwargs)
        # Dict values are evaluated top to bottom, so the p-value is still
        # requested right after the confidence intervals.
        return {
            "first_type_error": alpha,
            "pvalue": bootstrap.pvalue_criterion(**kwargs),
            "effect": point_effect,
            "confidence_interval": list(zip(lower_bounds, upper_bounds)),
        }

    @staticmethod
    def __binary_result(
        group_a: types.GroupType, group_b: types.GroupType, alpha: np.ndarray, effect_type: str = "absolute", **kwargs
    ) -> types._SubResultType:
        """
        Evaluate the effect between two groups of a binary metric.

        A warning is emitted when either group holds values other than 0 and 1;
        the calculation is still attempted afterwards.

        Raises
        ------
        ValueError
            If ``effect_type`` is neither ``"absolute"`` nor ``"relative"``.
        """
        allowed_values = {0, 1}
        only_binary = all(set(np.unique(group)).issubset(allowed_values) for group in (group_a, group_b))
        if not only_binary:
            warn("Values for metric is not binary, choose other method, for example ttest!")

        if effect_type == "relative":
            evaluate = binary_relative_result
        elif effect_type == "absolute":
            evaluate = binary_absolute_result
        else:
            raise ValueError(f"``effect_type`` variable could be only  'absolute' or 'relative, got {effect_type}.")
        return evaluate(group_a, group_b, alpha, **kwargs)

    @staticmethod
    def __theory_handler(
        group_a: types.GroupType,
        group_b: types.GroupType,
        alpha: np.ndarray,
        effect_type: str = "absolute",
        criterion: Optional[ABStatCriterion] = None,
        **kwargs,
    ) -> types._SubResultType:
        """
        Evaluate the effect between two groups with a statistical criterion.

        ``criterion`` may be an alias from ``AVAILABLE_AB_CRITERIA`` or any
        object exposing a callable ``get_results``; ``None`` means ``"ttest"``.
        The resolved criterion is called without arguments to create the
        object whose ``get_results`` is then invoked.

        Raises
        ------
        ValueError
            If ``criterion`` is neither a known alias nor exposes ``get_results``.
        """
        chosen: Union[str, StatCriterion] = "ttest" if criterion is None else criterion
        if isinstance(chosen, str) and chosen in AVAILABLE_AB_CRITERIA:
            criterion_factory = AVAILABLE_AB_CRITERIA[chosen]
        else:
            if not callable(getattr(chosen, "get_results", None)):
                raise ValueError(
                    f"Choose correct criterion name from {list(AVAILABLE_AB_CRITERIA)} or pass correct custom class"
                )
            criterion_factory = chosen
        return criterion_factory().get_results(
            group_a=group_a, group_b=group_b, alpha=alpha, effect_type=effect_type, **kwargs
        )

    @staticmethod
    def __as_error_array(first_type_errors: Optional[types.StatErrorType]) -> Optional[np.ndarray]:
        """
        Wrap first type errors into an array, keeping ``None`` untouched.

        Parameters
        ----------
        first_type_errors : StatErrorType, optional
            Single level or a collection of levels.

        Returns
        -------
        errors : Optional[np.ndarray]
            ``None`` if nothing was passed, otherwise a copy of the levels as an
            array with at least one dimension.
        """
        if first_type_errors is None:
            return None
        # ``atleast_1d`` also covers scalars that are not instances of ``float``
        # (e.g. ``np.float32`` or ``int``), which would otherwise stay 0-d.
        return np.atleast_1d(np.array(first_type_errors))

    @staticmethod
    def __warn_on_srm(experiment_results: types.ExperimentResults, expected_ratios: Optional[Dict[Any, float]]) -> None:
        """
        Warn when the group sizes show a Sample Ratio Mismatch.

        Group sizes are obtained with ``srm_pkg.group_size`` and checked by
        ``srm_pkg.check_srm_from_counts``; nothing happens unless the returned
        report has a truthy ``"srm_detected"`` entry.
        """
        observed_sizes = {}
        for label, group_data in experiment_results.items():
            observed_sizes[label] = srm_pkg.group_size(group_data)

        report = srm_pkg.check_srm_from_counts(observed_sizes, expected_ratios=expected_ratios)
        if not report["srm_detected"]:
            return

        message = (
            f"Sample Ratio Mismatch detected: observed group sizes {report['observed']} deviate "
            f"from the expected ratios (chi-square p-value = {report['pvalue']:.3g} < "
            f"{report['alpha']}). The group assignment may be broken and the test results may be "
            "unreliable. If the unequal split is intentional, pass srm_expected_ratios; "
            "to disable this check, set check_srm=False."
        )
        warn(message)

    @staticmethod
    def __pre_run(method: str, args: types._UsageArgumentsType, **kwargs) -> types.TesterResult:
        """
        Test every metric for a single pair of groups.

        Parameters
        ----------
        method : str
            Testing approach: ``"theory"``, ``"empiric"`` or ``"binary"``.
        args : _UsageArgumentsType
            Prepared arguments. The keys read here are ``"metrics"``,
            ``"data_a_group"``, ``"data_b_group"``, ``"alpha"``,
            ``"effect_type"``, ``"criterion"`` and, optionally,
            ``"metric_funcs"``.
        **kwargs : Dict
            Other keyword arguments, forwarded to the chosen approach.

        Returns
        -------
        result : TesterResult
            Dict mapping each metric name to its test result.

        Raises
        ------
        ValueError
            If ``method`` is not one of the accepted values.
        """
        # TODO: add methods to enum
        accepted_methods: List[str] = ["theory", "empiric", "binary"]
        if method not in accepted_methods:
            raise ValueError(f'Choose method from {", ".join(accepted_methods)}')
        result: types.TesterResult = {}
        # Tolerate both a missing key and an explicit ``None``.
        metric_funcs: Dict = args.get("metric_funcs") or {}
        for metric in args["metrics"]:
            metric_func = metric_funcs.get(metric)
            if method == "theory":
                # TODO: Make it SolverClass ~ method
                # solver = SolverClass(...)
                # sub_result = solver.solve()
                # The handler receives the group data and ``metric_func`` itself,
                # so metric values are not extracted here for this method.
                solver = TheoreticalTesterHandler(
                    args["data_a_group"],
                    args["data_b_group"],
                    column=metric,
                    alpha=np.array(args["alpha"]),
                    effect_type=args["effect_type"],
                    criterion=args["criterion"],
                    metric_func=metric_func,
                    **kwargs,
                )
                sub_result = solver.solve()
            else:
                if metric_func is not None:
                    a_values: np.ndarray = np.asarray(metric_func(args["data_a_group"]))
                    b_values: np.ndarray = np.asarray(metric_func(args["data_b_group"]))
                else:
                    a_values = args["data_a_group"][metric].values
                    b_values = args["data_b_group"][metric].values
                # ``method`` was validated above, so anything but "empiric" is "binary".
                evaluate = Tester.__bootstrap_result if method == "empiric" else Tester.__binary_result
                sub_result = evaluate(
                    a_values, b_values, np.array(args["alpha"]), effect_type=args["effect_type"], **kwargs
                )
            result[metric] = sub_result
        return result

    @staticmethod
    def __apply_multitest_correction(
        result: types.TesterResult,
        metrics: List[types.MetricNameType],
        method: str,
        nominal_alpha: np.ndarray,
    ) -> None:
        """
        Adjust the p-values of all tested hypotheses in place.

        One p-value is collected per (group pair, metric) combination and the
        whole vector is passed to ``multitest_pkg.adjust_pvalues`` at once.
        Working on this vector rather than on the flattened result table, where
        every p-value is repeated for each first type error level, keeps the
        rank-based procedures correct. Afterwards each entry gets its adjusted
        p-value, and ``first_type_error`` is set back to ``nominal_alpha``;
        confidence intervals are not modified here.
        """
        coordinates: List = [(test_name, metric) for test_name in result for metric in metrics]
        raw_pvalues = np.asarray([float(result[test_name][metric]["pvalue"]) for test_name, metric in coordinates])
        adjusted: np.ndarray = multitest_pkg.adjust_pvalues(raw_pvalues, method)
        for (test_name, metric), adjusted_pvalue in zip(coordinates, adjusted):
            entry = result[test_name][metric]
            entry["pvalue"] = float(adjusted_pvalue)
            entry["first_type_error"] = nominal_alpha

    @staticmethod
    def as_table(dict_result: types.TesterResult) -> pd.DataFrame:
        """
        Convert the dict output of the tester into a pandas DataFrame.

        Every metric of every group pair becomes a separate frame, extended with
        the metric name and both group labels; the frames are then concatenated.
        Confidence interval bounds are rounded unless the first bound is ``None``.

        Parameters
        ----------
        dict_result : TesterResult
           Tester result as a dictionary.

        Returns
        -------
        result_table : pd.DataFrame
           Table with results.
        """
        frames: List[pd.DataFrame] = []
        for test_result in dict_result.values():
            metric_names = list(test_result)
            for label_key in ("group_a_label", "group_b_label"):
                metric_names.remove(label_key)
            for metric_name in metric_names:
                row = deepcopy(test_result[metric_name])
                row["metric name"] = metric_name
                row["group A label"] = test_result["group_a_label"]
                row["group B label"] = test_result["group_b_label"]
                intervals = row["confidence_interval"]
                if intervals[0][0] is not None:
                    digits = Tester._PRECISION_DIGITS
                    row["confidence_interval"] = [(round(left, digits), round(right, digits)) for left, right in intervals]
                frames.append(pd.DataFrame(row))
        return pd.concat(frames).reset_index(drop=True)


[docs]
    def run(
        self,
        effect_type: str = "absolute",
        method: str = "theory",
        dataframe: Optional[types.PassedDataType] = None,
        df_mapping: Optional[types.GroupsInfoType] = None,
        experiment_results: Optional[types.ExperimentResults] = None,
        id_column: Optional[str] = None,
        column_groups: Optional[str] = None,
        group_labels: Optional[types.GroupLabelsType] = None,
        metrics: Optional[types.MetricNamesType] = None,
        first_type_errors: Optional[types.StatErrorType] = None,
        criterion: Optional[ABStatCriterion] = None,
        correction_method: Union[str, None] = "bonferroni",
        as_table: bool = True,
        metric_funcs: Optional[Dict[str, Callable]] = None,
        check_srm: Optional[bool] = None,
        srm_expected_ratios: Optional[Dict[Any, float]] = None,
        **kwargs,
    ) -> types.TesterResult:
        """
        The main method for testing and evaluating experimental results.

        Every pair of experimental groups is compared on every metric.

        Parameters
        ----------
        effect_type : str, default: ``"absolute"``
           Effect type to calculate.
           Could be ``"absolute"`` or ``"relative"``.
        method : str, default: ``"theory"``
           Type of testing approach.
           Can take the values ``"theory"``, ``"empiric"`` or ``"binary"``.
        dataframe : PassedDataType, optional
           Data used to calculate the results of an experiment.
           When passed, it is split into groups and replaces
           ``experiment_results`` for this call.
        df_mapping : GroupsInfoType, optional
           Dataframe which contains group labels of objects.
        experiment_results : ExperimentResults, optional
            Dict with separate experiment results for each group.
            Dict keys are used as groups labels, values must be either
            pandas or Spark dataframes.
        column_groups : ColumnNameType, optional
            Column which contains groups label of objects.
        group_labels : GroupLabelsType, optional
            Labels for experimental groups.
        id_column : ColumnNameType, optional
            Name of column with objects ids in ``df_mapping`` dataframe.
        first_type_errors : StatErrorType, optional
            I type errors values. Defaults to ``None``, in which case
            no value is passed for this call.
        metrics : MetricNameType, optional
            Columns of dataframe with experiment results.
        criterion : ABStatCriterion, optional
            Statistical criterion for hypotheses testing.
            If ``method`` is ``"theory"`` and no criterion provided,
            ttest for independent samples will be used.
        correction_method : Union[str, None], default: ``"bonferroni"``
            Method for multiple hypothesis testing correction of p-values.
            Supported values: ``"bonferroni"``, ``"sidak"``, ``"holm"``,
            ``"holm-sidak"``, ``"fdr_bh"`` (Benjamini-Hochberg),
            ``"fdr_by"`` (Benjamini-Yekutieli), ``"hommel"``,
            ``"simes-hochberg"``; pass ``None`` to disable correction.
            The family size equals the number of group-pair combinations
            times the number of metrics; the correction is applied only
            when this number is greater than one. For ``"bonferroni"`` and
            ``"sidak"`` confidence intervals are widened accordingly; the
            other, step-wise methods adjust only the p-values and leave
            intervals at the nominal level.
        as_table : bool, default: ``True``
            Return the test results as a pandas dataframe.
            If ``False``, a list of dicts with results will be returned.
        metric_funcs : Dict[str, Callable], optional
            Dictionary mapping metric names to callable functions.
            Each function receives a group ``pd.DataFrame`` and returns
            array-like values. Overrides functions set in constructor
            for matching metric names. Only pandas DataFrames supported.
        check_srm : bool, optional
            Run a Sample Ratio Mismatch check on the group sizes before
            testing and emit a warning if the observed sizes deviate from
            the expected ratios (chi-square test at the ``0.0005`` level).
            A detected mismatch usually means a broken assignment procedure,
            making the test results unreliable. By default the check runs
            only when ``srm_expected_ratios`` is provided; pass ``True`` to
            enable it with equal expected sizes or ``False`` to disable it
            entirely. The check assumes one row per randomization unit
            (e.g. one user); for Spark data it triggers a count job per
            group.
        srm_expected_ratios : Dict[Any, float], optional
            Expected group size ratios for the Sample Ratio Mismatch check,
            mapping group label to its share (normalized internally).
            Pass it when the split is intentionally unequal.
            If ``None``, equal group sizes are expected.
        **kwargs : Dict
            Other keyword arguments. ``alternative`` is validated when
            present and set to ``"two-sided"`` otherwise.

        Returns
        -------
        result : types.TesterResult
            Experiment results as pandas table or list of dicts for each metric
            and first type error.

        Raises
        ------
        ValueError
            If there are fewer than two experimental groups or no metrics
            to test.
        """
        if isinstance(metrics, types.MetricNameType):
            metrics = [metrics]
        first_type_errors = Tester.__as_error_array(first_type_errors)
        if "alternative" in kwargs:
            pvalue_pkg.check_alternative(kwargs["alternative"])
        else:
            kwargs["alternative"] = "two-sided"

        if dataframe is not None:
            experiment_results = DataframeHandler()._handle_cases(
                Tester.__filter_data,
                filter_spark_and_make_groups,
                dataframe=dataframe,
                df_mapping=df_mapping,
                column_groups=column_groups,
                group_labels=group_labels,
                id_column=id_column,
            )

        # Each entry pairs the value stored on the instance with the one passed
        # to this call; the choice between them is made by ``_prepare_arguments``.
        arguments_choice: types._PrepareArgumentsType = {
            "experiment_results": (self.__experiment_results, experiment_results),
            "metrics": (self.__metrics, metrics),
            "alpha": (self.__alpha, first_type_errors),
        }
        chosen_args: types._UsageArgumentsType = Tester._prepare_arguments(arguments_choice)
        chosen_args["effect_type"] = effect_type
        chosen_args["criterion"] = criterion
        # Functions passed to this call override the constructor ones by name.
        chosen_args["metric_funcs"] = {**self.__metric_funcs, **(metric_funcs or {})}

        run_srm_check: bool = check_srm if check_srm is not None else srm_expected_ratios is not None
        if run_srm_check:
            Tester.__warn_on_srm(chosen_args["experiment_results"], srm_expected_ratios)

        # All pairs of groups - comb(n, 2); enumerated once and reused below.
        group_pairs: List = list(itertools.combinations(chosen_args["experiment_results"], 2))
        # Without these checks the call would only fail at the very end with
        # pandas' generic "No objects to concatenate" error.
        if not group_pairs:
            raise ValueError("At least two experimental groups are required to evaluate the experiment results")
        if len(chosen_args["metrics"]) == 0:
            raise ValueError("At least one metric is required to evaluate the experiment results")

        hypothesis_num: int = len(group_pairs) * len(chosen_args["metrics"])
        apply_correction: bool = correction_method is not None and hypothesis_num > 1
        nominal_alpha: np.ndarray = np.array(chosen_args["alpha"])
        if apply_correction:
            correction_method = multitest_pkg.validate_method(correction_method)
            # Stage 1: widen confidence intervals for constant-scaling methods
            # (Bonferroni, Sidak); the step-wise methods keep the nominal level.
            chosen_args["alpha"] = multitest_pkg.alpha_for_confidence_interval(
                nominal_alpha, correction_method, hypothesis_num
            )

        result: types.TesterResult = {}
        for group_a_label, group_b_label in group_pairs:
            test_name = f"group_{group_a_label}_vs_group_{group_b_label}"
            chosen_args["data_a_group"] = chosen_args["experiment_results"][group_a_label]
            chosen_args["data_b_group"] = chosen_args["experiment_results"][group_b_label]
            subresult: types.TesterResult = Tester.__pre_run(method, chosen_args, **kwargs)
            subresult["group_a_label"] = group_a_label
            subresult["group_b_label"] = group_b_label
            result[test_name] = subresult

        # Stage 2: adjust the family of p-values and restore the reported alpha.
        if apply_correction:
            Tester.__apply_multitest_correction(result, chosen_args["metrics"], correction_method, nominal_alpha)

        result = Tester.as_table(result)
        if not as_table:
            result = result.to_dict(orient="records")
        return result





[docs]
def test(
    effect_type: str = "absolute",
    method: str = "theory",
    dataframe: Optional[types.PassedDataType] = None,
    df_mapping: Optional[types.GroupsInfoType] = None,
    experiment_results: Optional[types.ExperimentResults] = None,
    id_column: Optional[str] = None,
    column_groups: Optional[str] = None,
    group_labels: Optional[types.GroupLabelsType] = None,
    metrics: Optional[types.MetricNamesType] = None,
    first_type_errors: Optional[types.StatErrorType] = None,
    criterion: Optional[ABStatCriterion] = None,
    correction_method: Union[str, None] = "bonferroni",
    as_table: bool = True,
    metric_funcs: Optional[Dict[str, Callable]] = None,
    check_srm: Optional[bool] = None,
    srm_expected_ratios: Optional[Dict[Any, float]] = None,
    **kwargs,
) -> types.TesterResult:
    """
    Evaluate experiment results in a single call.

    A functional shortcut for the ``Tester`` class: the data, metrics and
    first type errors are used to build a ``Tester`` instance, and all other
    arguments are handed over to its ``run`` method.

    Parameters
    ----------
    effect_type : str, default: ``"absolute"``
        Effect type to calculate.
        Could be ``"absolute"`` or ``"relative"``.
    method : str, default: ``"theory"``
        Type of testing approach.
        Can take the values ``"theory"``, ``"empiric"`` or ``"binary"``.
    dataframe : PassedDataType, optional
        Data used to calculate the results of an experiment.
    df_mapping : GroupsInfoType, optional
        Dataframe which contains group labels of objects.
    experiment_results : ExperimentResults, optional
        Dict with separate experiment results for each group.
        Dict keys are used as groups labels, values must be either
        pandas or Spark dataframes.
    column_groups : ColumnNameType, optional
        Column which contains groups label of objects.
    group_labels : GroupLabelsType, optional
        Labels for experimental groups.
    id_column : ColumnNameType, optional
        Name of column with objects ids in ``df_mapping`` dataframe.
    first_type_errors : StatErrorType, optional
        I type errors values. If not passed, ``0.05`` is used.
    metrics : MetricNameType, optional
        Columns of dataframe with experiment results.
    criterion : ABStatCriterion, optional
        Statistical criterion for hypotheses testing.
        If ``method`` is ``"theory"`` and no criterion provided,
        ttest for independent samples will be used.
    correction_method : Union[str, None], default: ``"bonferroni"``
        Method for multiple hypothesis testing correction of p-values.
        Supported values: ``"bonferroni"``, ``"sidak"``, ``"holm"``,
        ``"holm-sidak"``, ``"fdr_bh"`` (Benjamini-Hochberg),
        ``"fdr_by"`` (Benjamini-Yekutieli), ``"hommel"``,
        ``"simes-hochberg"``; pass ``None`` to disable correction.
        The family size equals the number of group-pair combinations
        times the number of metrics. For ``"bonferroni"`` and ``"sidak"``
        confidence intervals are widened accordingly; the other, step-wise
        methods adjust only the p-values and leave intervals at the nominal
        level.
    as_table : bool, default: ``True``
        Return the test results as a pandas dataframe.
        If ``False``, a list of dicts with results will be returned.
    metric_funcs : Dict[str, Callable], optional
        Dictionary mapping metric names to callable functions.
        Each function receives a group ``pd.DataFrame`` and returns
        array-like values. Only pandas DataFrames supported.
    check_srm : bool, optional
        Run a Sample Ratio Mismatch check on the group sizes before
        testing and emit a warning if the observed sizes deviate from
        the expected ratios. By default the check runs only when
        ``srm_expected_ratios`` is provided; pass ``True`` to enable it
        with equal expected sizes or ``False`` to disable it entirely.
    srm_expected_ratios : Dict[Any, float], optional
        Expected group size ratios for the Sample Ratio Mismatch check.
        Pass it when the split is intentionally unequal.
    **kwargs : Dict
        Other keyword arguments, forwarded to ``Tester.run``.

    Returns
    -------
    result : types.TesterResult
        Experiment results as pandas table or list of dicts for each metric
        and first type error.
    """
    tester = Tester(
        dataframe=dataframe,
        df_mapping=df_mapping,
        id_column=id_column,
        column_groups=column_groups,
        group_labels=group_labels,
        metrics=metrics,
        first_type_errors=first_type_errors,
    )
    # ``experiment_results`` and ``metric_funcs`` are passed at run time only,
    # not to the constructor.
    return tester.run(
        effect_type=effect_type,
        method=method,
        experiment_results=experiment_results,
        criterion=criterion,
        correction_method=correction_method,
        as_table=as_table,
        metric_funcs=metric_funcs,
        check_srm=check_srm,
        srm_expected_ratios=srm_expected_ratios,
        **kwargs,
    )