Source code for myforestplot.cleaning_utils

from typing import Union, Optional, List, Dict, Tuple, Any, Callable
import pandas as pd
import numpy as np
import statsmodels


def statsmodels_fitting_result_dataframe(
    res,
    alpha: float = 0.05,
    accessor: Callable[[np.array], np.array] = np.exp,
) -> pd.DataFrame:
    """Create category and item columns from the statsmodels result.

    Categorical results are divided into the original column name
    (category) and its level (item). For terms without a ``[`` in their
    name (continuous variables), the term name is used for both columns.

    Args:
        res: statsmodels' fitting results. Must provide
            ``conf_int(alpha=...)``, ``params`` and ``pvalues``.
        alpha: The significance level for the confidence interval.
        accessor: Function applied to the confidence interval table and to
            the parameters before they are reported (``np.exp`` by default).
            Its result for the confidence interval must be a DataFrame.

    Returns:
        DataFrame with the columns ``category`` and ``item`` first, followed
        by the confidence interval columns, ``risk`` and ``pvalues``. The
        ``Intercept`` row is dropped.

    Raises:
        ValueError: If a term name contains more than one ``[``, since the
            name is split into exactly two parts.
    """
    # Honour the requested significance level instead of a fixed 0.05.
    df = accessor(res.conf_int(alpha=alpha))
    df["risk"] = accessor(res.params)
    df["pvalues"] = res.pvalues

    categories = []
    items = []
    for ind in df.index:
        if "[" in ind:
            # e.g. "sex[T.male]" -> category "sex", item "male".
            s1, s2 = ind.split("[")
            # For case of specifying Treatment in formula,
            # e.g. "C(sex, Treatment('male'))" -> "sex".
            if "Treatment('" in s1:
                s1 = s1.split(",")[0][2:]
            categories.append(s1)
            # Strip the leading "T." and the trailing "]".
            items.append(s2[2:-1])
        else:
            # Continuous variables: the same name for category and item.
            categories.append(ind)
            items.append(ind)

    # Labels are built as plain lists so that no string is ever written into
    # a float (NaN-initialised) column.
    df = df.reset_index(drop=True)
    df.insert(0, column="category", value=categories)
    df.insert(1, column="item", value=items)

    # drop Intercept.
    df = df[df["category"] != "Intercept"]
    return df
def add_pretty_risk_column(
    res: pd.DataFrame,
    risk: str,
    lower: str,
    upper: str,
    fml: str = ".2f",
    ref: str = "Ref.",
) -> pd.Series:
    """Build a pretty risk string column.

    Each row is rendered as ``"<risk> (<lower>, <upper>)"`` using the format
    specification ``fml``. Rows whose point estimate is missing are rendered
    as ``ref`` instead.

    Args:
        res: Dataframe containing point estimates and confidence intervals.
        risk: Column name of the point estimates of risk.
        lower: Column name of the lower confidence limit.
        upper: Column name of the upper confidence limit.
        fml: Format specification used inside the f-string for each number.
        ref: String inserted when the point estimate is missing.

    Returns:
        Series of strings sharing the index of ``res``. An empty ``res``
        gives an empty Series.
    """
    is_reference = res[risk].isnull()
    # Reference rows are never formatted, so a missing value that cannot be
    # formatted with ``fml`` (e.g. None) does not raise.
    pretty = [
        ref
        if missing
        else f"{risk_v:{fml}} ({lower_v:{fml}}, {upper_v:{fml}})"
        for missing, risk_v, lower_v, upper_v in zip(
            is_reference, res[risk], res[lower], res[upper]
        )
    ]
    return pd.Series(pretty, index=res.index, dtype=object)
def count_category_frequency(
    df: pd.DataFrame,
    categorical_cols: List[str],
    impute_continuous: bool = True,
) -> pd.DataFrame:
    """Count category frequency.

    Args:
        df: Original dataframe.
        categorical_cols: Columns for categorical variables.
        impute_continuous: Currently unused; kept so that existing callers
            passing it keep working.

    Returns:
        DataFrame with the columns ``category`` (the column name), ``item``
        (a value of that column) and ``nobs`` (how often the value occurs).
        Rows follow the order of ``categorical_cols`` and, within a column,
        the order returned by ``value_counts``. An empty ``categorical_cols``
        gives an empty frame with the same three columns.
    """
    out_columns = ["category", "item", "nobs"]

    frames = []
    for col in categorical_cols:
        counts = df[col].value_counts()
        # Build the frame explicitly rather than via stack()/reset_index(),
        # whose level names depend on how value_counts names its result.
        frame = pd.DataFrame({"item": counts.index, "nobs": counts.to_numpy()})
        frame.insert(0, column="category", value=col)
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame(columns=out_columns)
    return pd.concat(frames, ignore_index=True)[out_columns]
def sort_category_item(
    df_: pd.DataFrame,
    order: List[str],
    item_order: Optional[Dict[str, List[str]]] = None,
) -> pd.DataFrame:
    """Sort rows by category and, optionally, by item within a category.

    Args:
        df_: Dataframe containing the columns ``category`` and ``item``.
        order: Categories to keep, in the order they should appear. Rows
            whose category is not listed are dropped.
        item_order: Dictionary mapping a category to the desired order of its
            items. Items not listed for their category are placed after the
            listed ones, keeping their original relative order. Categories
            without an entry keep their original row order.

    Returns:
        Dataframe with the selected rows in the requested order; the original
        index labels are kept. An empty ``order`` gives an empty frame with
        the columns of ``df_``.
    """
    if item_order is None:
        item_order = {}

    pieces = []
    for category in order:
        subset = df_[df_["category"] == category]
        if category in item_order:
            positions = {
                item: pos for pos, item in enumerate(item_order[category])
            }
            # map() turns unlisted items into NaN, which sort_values puts
            # last; this avoids comparing strings with integer positions.
            subset = subset.sort_values(
                by="item",
                key=lambda items, positions=positions: (
                    items.astype(object).map(positions)
                ),
                kind="stable",
            )
        pieces.append(subset)

    if not pieces:
        return df_.iloc[0:0].copy()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pieces, axis=0)
def statsmodels_pretty_result_dataframe(
    data: pd.DataFrame,
    res,
    order: List[str],
    cont_cols: Optional[List[str]] = None,
    item_order: Optional[Dict[str, List[str]]] = None,
    fml: str = ".2f",
    accessor: Callable[[np.array], np.array] = np.exp,
) -> pd.DataFrame:
    """Obtain pretty result dataframe from statsmodels results.

    The fitting results are combined with the per-item frequencies of the
    categorical columns, sorted, and a formatted ``risk_pretty`` column is
    added. Fitting coefficients are converted by ``accessor`` (``np.exp`` by
    default).

    Args:
        data: Original dataframe.
        res: statsmodels results.
        order: Columns to report, in the order they should appear.
        cont_cols: Columns in ``order`` that are continuous. All other
            columns in ``order`` are treated as categorical and their item
            frequencies are counted. If None, every column in ``order`` is
            treated as categorical.
        item_order: Dictionary containing column names and the order of
            their items.
        fml: Format specification for the numbers in the pretty risk string.
        accessor: Function to access each model result, which is summarized
            and displayed.

    Returns:
        Dataframe of the merged fitting results and frequencies, with an
        additional ``risk_pretty`` column.

    Raises:
        ValueError: If the number of observations used in the fit differs
            from the number of rows in ``data``.
    """
    if res.nobs != data.shape[0]:
        raise ValueError(("Some observations were dropped when fitted, "
                          "check number of observations"
                          ))

    if cont_cols is None:
        cate_cols = order
    else:
        cate_cols = [c for c in order if c not in cont_cols]

    df_res = statsmodels_fitting_result_dataframe(res,
                                                  alpha=0.05,
                                                  accessor=accessor)
    df_nobs = count_category_frequency(data, cate_cols)
    # Outer merge keeps items that only appear in the frequency table, i.e.
    # rows without a fitted estimate.
    df_sum = pd.merge(df_res, df_nobs,
                      on=["category", "item"],
                      validate="1:1",
                      how="outer")
    df_sum = sort_category_item(df_sum,
                                order=order,
                                item_order=item_order)
    # The confidence limits are read from the columns labelled 0 and 1.
    # Pass the caller's format through instead of a fixed ".2f".
    df_sum["risk_pretty"] = add_pretty_risk_column(df_sum,
                                                   risk="risk",
                                                   lower=0,
                                                   upper=1,
                                                   fml=fml
                                                   )
    return df_sum