Source code for gdphelper.gdpdescribe

import pandas as pd
import numpy as np


def gdpdescribe(df, x, y, stats=["mean", "sd", "median"], dec=2):
    """
    Calculate summary statistics of a numeric variable, grouped by a
    categorical variable.

    The following descriptive statistics are available (the string in
    parentheses is the name to pass in ``stats``):

    - Mean (``"mean"``)
    - Median (``"median"``)
    - Standard deviation (``"sd"``)
    - Minimum value (``"min"``)
    - Maximum value (``"max"``)
    - Range, i.e. max - min (``"range_"``)
    - Value of the 75th percentile (``"q75"``)
    - Value of the 25th percentile (``"q25"``)
    - Interquartile range, i.e. q75 - q25 (``"iqr"``)
    - Number of missing values (``"nas"``)

    Parameters
    ----------
    df : pd.DataFrame
        pandas dataframe with the variables to analyze
    x : str
        column name of the numeric (integer or float) variable used to
        calculate the descriptive statistics
    y : str
        column name of the grouping variable (object, string or
        categorical dtype); rows whose ``y`` is missing are dropped
    stats : list, default ["mean", "sd", "median"]
        Descriptive statistics to calculate. The rows of the result follow
        the order given here.
    dec : int, default 2
        number of decimal places used when pandas displays floats

    Returns
    -------
    pd.DataFrame
        Table with one row per requested statistic and one column per
        group of ``y``.

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame, ``x`` or ``y`` is not a column of
        ``df``, ``y`` has a distinct value in every row, ``x`` is not
        numeric, ``y`` is not categorical, or ``stats`` is empty or
        contains an unknown statistic name.

    Notes
    -----
    Calling this function changes the global pandas option
    ``display.float_format`` so that floats are displayed with ``dec``
    decimals instead of scientific notation. The returned values themselves
    are not rounded.

    Examples
    --------
    >>> gdpdescribe(df, "Value", "Location",
    ...             stats=["mean", "median", "sd", "min", "max", "range_",
    ...                    "q75", "q25", "iqr", "nas"], dec=3)
    """
    # Avoid scientific notation in the display, use decimal points.
    # NOTE: this is a process-wide pandas setting, kept for backward
    # compatibility with callers that rely on the formatted display.
    pd.set_option("display.float_format", f"{{:.{dec}f}}".format)

    valid_stats = (
        "mean", "median", "sd", "min", "max",
        "range_", "q75", "q25", "iqr", "nas",
    )

    # --- validate input arguments -------------------------------------
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df should be a pd.DataFrame")

    column_names = df.columns.tolist()
    if x not in column_names or y not in column_names:
        raise ValueError("Incorrect Variable names")

    if not df[y].nunique() < df.shape[0]:
        raise ValueError("Variable Y has unique values in every row, cannot group")

    # Accept every integer/float dtype (int32, float32, nullable Int64, ...)
    # rather than only the exact float64/int64 types; bool, datetime and
    # object columns are still rejected.
    x_is_numeric = pd.api.types.is_integer_dtype(
        df[x]
    ) or pd.api.types.is_float_dtype(df[x])
    if not x_is_numeric:
        raise ValueError("Variable X is not numeric")

    # Object columns were always accepted; dedicated string and categorical
    # dtypes are just as valid as grouping keys.
    y_dtype = df[y].dtype
    y_is_categorical = y_dtype.type == np.object_ or isinstance(
        y_dtype, (pd.StringDtype, pd.CategoricalDtype)
    )
    if not y_is_categorical:
        raise ValueError("Variable Y is not categorical")

    # Copy so that any iterable (and the shared default list) is safe to use.
    requested = list(stats)
    if not all(name in valid_stats for name in requested):
        raise ValueError(
            "The statistic to calculate is not correct! \nPlease enter a valid one"
        )
    if not requested:
        raise ValueError("stats must contain at least one statistic")

    # --- calculate only the requested statistics ----------------------
    # Selecting the x column on the groupby keeps the grouping column out of
    # every computation, so no statistic depends on how pandas treats
    # grouping columns inside ``apply``. ``observed=True`` only matters for
    # categorical keys: categories without rows produce no empty group.
    grouped = df.groupby(y, dropna=True, observed=True)[x]

    def _count_missing():
        # Number of missing x values per group.
        return df[x].isna().groupby(df[y], dropna=True, observed=True).sum()

    # statistic name -> (row label in the result, zero-argument calculator).
    # Labels are kept exactly as in earlier releases because callers may
    # index the result by them.
    calculators = {
        "mean": (f"mean {x}", grouped.mean),
        "median": (f"Median {x}", grouped.median),
        "sd": (f"Standard devitation {x}", grouped.std),
        "min": (f"Min {x}", grouped.min),
        "max": (f"Max {x}", grouped.max),
        "range_": (f"Range {x}", lambda: grouped.max() - grouped.min()),
        "q75": (f"Quartile 75th {x}", lambda: grouped.quantile(q=0.75)),
        "q25": (f"Quartile 25th {x}", lambda: grouped.quantile(q=0.25)),
        "iqr": (
            f"IQR {x}",
            lambda: grouped.quantile(q=0.75) - grouped.quantile(q=0.25),
        ),
        "nas": (f"NAs {x}", _count_missing),
    }

    tables = []
    for name in requested:
        label, calculate = calculators[name]
        tables.append(calculate().to_frame(name=label))

    # One column per statistic -> transpose to one row per statistic.
    results = pd.concat(tables, axis=1).T
    return results