# Source code for myforestplot.cleaning_utils

from typing import Union, Optional, List, Dict, Tuple, Any, Callable
import pandas as pd
import numpy as np
import statsmodels


def statsmodels_fitting_result_dataframe(
    res,
    alpha: float = 0.05,
    accessor: Callable[[np.array], np.array] = np.exp,
    ) -> pd.DataFrame:
    """Create category and item columns from the statsmodels result.

    Term names of the form ``name[T.level]`` are split into the original
    column name (category) and its level (item). Any other term keeps its
    full name as both category and item. The ``Intercept`` row is dropped.

    Args:
        res: statsmodels' fitting results. ``conf_int(alpha=...)``, ``params``
            and ``pvalues`` are read from it.
        alpha: The significance level for the confidence interval.
        accessor: Function applied to the confidence interval and to the
            parameters before they are stored (``np.exp`` by default).

    Returns:
        Dataframe whose columns are ``category``, ``item``, the confidence
        interval columns returned by ``res.conf_int``, ``risk`` and
        ``pvalues``.
    """
    # Use the caller's significance level rather than a fixed 0.05.
    df = accessor(res.conf_int(alpha=alpha))
    df["risk"] = accessor(res.params)
    df["pvalues"] = res.pvalues

    categories = []
    items = []
    for term in df.index:
        # Only names with exactly one "[" are treated as categorical levels;
        # anything else falls back to category == item == term name.
        if term.count("[") == 1:
            prefix, suffix = term.split("[")
            # Strip the leading "T." and the trailing "]".
            item = suffix[2:-1]
            # For case of specifying Treatment in formula,
            # e.g. "C(col, Treatment('ref'))" -> "col".
            if "Treatment('" in prefix:
                prefix = prefix.split(",")[0][2:]
            categories.append(prefix)
            items.append(item)
        else:
            categories.append(term)
            items.append(term)

    df = df.reset_index(drop=True)
    df.insert(0, column="category", value=categories)
    df.insert(1, column="item", value=items)

    # drop Intercept.
    df = df[df["category"] != "Intercept"]

    return df


def add_pretty_risk_column(res: pd.DataFrame, risk: str, lower: str, upper: str,
                           fml: str = ".2f", ref: str = "Ref."
                           ) -> pd.Series:
    """Build a pretty risk string column.

    Each row is rendered as ``"<risk> (<lower>, <upper>)"``. Rows whose
    point estimate is missing are rendered as ``ref`` instead.

    Args:
        res: Dataframe containing point estimates and confidence intervals.
        risk: point estimates of risk column name.
        lower: lower confidence interval column name.
        upper: upper confidence interval column name.
        fml: format specification applied to each number in the f-string.
        ref: if point estimate column is empty, insert this string.

    Returns:
        Series of strings sharing the index of ``res``.
    """
    missing = res[risk].isnull()
    # Rows without a point estimate are never formatted, so missing values
    # that do not support the numeric format spec cannot raise.
    pretty = [
        ref if is_missing
        else f"{risk_v:{fml}} ({lower_v:{fml}}, {upper_v:{fml}})"
        for is_missing, risk_v, lower_v, upper_v
        in zip(missing, res[risk], res[lower], res[upper])
    ]
    # Building the Series explicitly keeps the return type a Series even
    # when ``res`` has no rows.
    return pd.Series(pretty, index=res.index, dtype=object)


def count_category_frequency(df: pd.DataFrame,
                             categorical_cols: List[str],
                             impute_continuous: bool = True,
                             ) -> pd.DataFrame:
    """Count category frequency.

    Args:
        df: Original dataframe.
        categorical_cols: Columns for categorical variables.
        impute_continuous: Accepted for interface compatibility; it is not
            used by this function at present.

    Returns:
        Dataframe with columns ``category`` (the column name), ``item``
        (a value found in that column) and ``nobs`` (how often it occurs),
        one row per distinct value, in ``value_counts`` order per column.
    """
    columns = ["category", "item", "nobs"]
    frames = []
    for col in categorical_cols:
        counts = df[col].value_counts()
        # Build the rows explicitly so the result does not depend on how
        # the installed pandas version names the value_counts output.
        frames.append(pd.DataFrame({"category": col,
                                    "item": counts.index.to_numpy(),
                                    "nobs": counts.to_numpy()},
                                   columns=columns))
    if not frames:
        # No categorical columns requested: return an empty table rather
        # than letting pd.concat fail on an empty list.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def sort_category_item(df_: pd.DataFrame,
                       order: List[str],
                       item_order: Optional[Dict[str, List[str]]] = None,
                       ) -> pd.DataFrame:
    """Sort category and item based on the given orders.

    Only rows whose ``category`` appears in ``order`` are kept.

    Args:
        df_ : dataframe containing category and item.
        order : categories are emitted in this order.
        item_order: Dictionary mapping a category to the desired order of
            its items. Items missing from the list are placed after the
            listed ones, keeping their original relative order. Categories
            without an entry keep their rows in the original order.

    Returns:
        Dataframe with the selected rows rearranged; an empty dataframe when
        ``order`` is empty.
    """
    if item_order is None:
        item_order = {}

    parts = []
    for c in order:
        part = df_[df_["category"] == c]
        if c in item_order:
            rank = {item: i for i, item in enumerate(item_order[c])}
            # map() yields NaN for unlisted items, which sort last; a stable
            # sort keeps ties in their original order.
            part = part.sort_values(by="item",
                                    key=lambda s, rank=rank: s.map(rank),
                                    kind="stable")
        parts.append(part)

    if not parts:
        return pd.DataFrame()
    # Concatenate once instead of growing a dataframe inside the loop.
    return pd.concat(parts, axis=0)


def statsmodels_pretty_result_dataframe(
    data: pd.DataFrame,
    res,
    order: List[str],
    cont_cols: Optional[List[str]] = None,
    item_order: Optional[Dict[str, List[str]]] = None,
    fml: str = ".2f",
    accessor: Callable[[np.array], np.array] = np.exp,
) -> pd.DataFrame:
    """Obtain pretty result dataframe from statsmodels results.

    The fitting results (converted by ``accessor``) are outer-merged with
    the per-item observation counts of ``data``, sorted, and given a
    ``risk_pretty`` string column. Items that occur only in the counts have
    no point estimate and are therefore shown with the reference label.

    Args:
        data: original dataframe.
        res: statsmodels results.
        order: categories to include, in the order they should appear.
        cont_cols: columns of ``order`` that are continuous; they are
            excluded from frequency counting. If None, every column in
            ``order`` is counted as categorical.
        item_order: Dictionary containing category names and the order of
            their items.
        fml: formula for f string of pretty risk.
        accessor: Function to access each model result, which is summarized and displayed.

    Returns:
        Dataframe of fitting results and observation counts with an added
        ``risk_pretty`` column.

    Raises:
        Exception: if ``res.nobs`` differs from the number of rows in
            ``data``.
    """
    if res.nobs != data.shape[0]:
        raise Exception(("Some observations were dropped when fitted, "
                         "check number of observations"
                        ))
    if cont_cols is None:
        cate_cols = order
    else:
        cate_cols = [c for c in order if c not in cont_cols]
    df_res = statsmodels_fitting_result_dataframe(res, alpha=0.05, accessor=accessor)
    df_nobs = count_category_frequency(data, cate_cols)
    df_sum = pd.merge(df_res, df_nobs,
                      on=["category", "item"],
                      validate="1:1",
                      how="outer")
    df_sum = sort_category_item(df_sum, order=order, item_order=item_order)
    # 0 and 1 are the lower/upper confidence interval columns carried over
    # from the fitting result dataframe. Pass the caller's format through
    # instead of a fixed ".2f".
    df_sum["risk_pretty"] = add_pretty_risk_column(df_sum,
                                                   risk="risk",
                                                   lower=0,
                                                   upper=1,
                                                   fml=fml
                                                   )
    return df_sum