# Source code for gdphelper.gdpdescribe

import pandas as pd
import numpy as np


def gdpdescribe(df, x, y, stats=None, dec=2):
    """
    Calculate summary statistics of numeric variable x grouped by variable y.

    The following descriptive statistics are available (the key to pass in
    ``stats`` is shown in parentheses):

      Mean (``"mean"``)
      Median (``"median"``)
      Standard Deviation (``"sd"``)
      Minimum Value (``"min"``)
      Maximum Value (``"max"``)
      Range (``"range_"``)
      Value of 75th percentile (``"q75"``)
      Value of 25th percentile (``"q25"``)
      Interquartile range (``"iqr"``)
      Number of Missing values (``"nas"``)

    Only the requested statistics are computed.

    Parameters
    ----------
    df: pd.Dataframe
        pandas dataframe with the variables to analyze

    x : str
        column name of a pandas dataframe used to calculate the descriptive
        statistics; the column must have a NumPy integer or float dtype

    y: str
      column name of a grouping variable; the column must have object or
      string dtype and must contain at least one repeated value

    stats: list, default ["mean", "sd", "median"]
         Descriptive statistics to calculate, in the order the rows should
         appear. ``None`` selects the default list.

    dec: int
       number of decimal places used when pandas displays floats. This sets
       the global ``display.float_format`` option; the values in the
       returned table are not rounded.

    Returns
    -------
    pd.Dataframe
        Table with one row per requested statistic and one column per
        group of ``y`` (rows with a missing ``y`` are dropped).

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame, ``x`` or ``y`` is not a column of
        ``df``, ``y`` is unique in every row, ``x`` is not numeric, ``y``
        is not an object/string column, ``stats`` is empty or contains an
        unknown name, or ``dec`` is not a non-negative integer.

    Examples
    --------
    >>> gdpdescribe(df, "Value", "Location", stats=["mean", "median", "sd", "min", "max", "range_", "q75", "q25", "iqr", "nas"], dec=3)
    """
    valid_stats = (
        "mean",
        "median",
        "sd",
        "min",
        "max",
        "range_",
        "q75",
        "q25",
        "iqr",
        "nas",
    )

    # validate inputs arguments of the function

    if not isinstance(df, pd.DataFrame):
        raise ValueError("df should be a pd.DataFrame")

    column_names = df.columns.tolist()
    if x not in column_names or y not in column_names:
        raise ValueError("Incorrect Variable names")

    if df[y].nunique() >= df.shape[0]:
        raise ValueError("Variable Y has unique values in every row, cannot group")

    # Accept every NumPy integer/float width, not only the 64-bit ones.
    x_dtype = df[x].dtype
    if not (isinstance(x_dtype, np.dtype) and x_dtype.kind in "iuf"):
        raise ValueError("Variable X is not numeric")

    # Text columns may be stored as object or as pandas' string dtype.
    if not (
        pd.api.types.is_object_dtype(df[y]) or pd.api.types.is_string_dtype(df[y])
    ):
        raise ValueError("Variable Y is not categorical")

    requested = ["mean", "sd", "median"] if stats is None else list(stats)
    if not all(name in valid_stats for name in requested):
        raise ValueError(
            "The statistic to calculate is not correct! Please enter a valid one"
        )
    if not requested:
        raise ValueError("At least one statistic must be requested")

    # bool is a subclass of int but would yield an invalid format spec.
    if (
        isinstance(dec, bool)
        or not isinstance(dec, (int, np.integer))
        or dec < 0
    ):
        raise ValueError("dec should be a non-negative integer")

    # Avoid scientific notation in the display, use decimal points
    pd.set_option("display.float_format", f"{{:.{dec}f}}".format)

    # Group only the numeric column so every statistic is computed on x alone
    # and never touches the (string) grouping column.
    grouped = df.groupby(y, dropna=True)[x]

    # Map each statistic key to (row label, zero-argument calculator).
    # Labels are kept exactly as previously emitted (including spelling and
    # spacing) because callers may index the result by them.
    calculators = {
        "mean": (f"mean {x}", grouped.mean),
        "median": (f"Median {x}", grouped.median),
        "sd": (f"Standard devitation {x}", grouped.std),
        "min": (f"Min {x}", grouped.min),
        "max": (f"Max {x}", grouped.max),
        "range_": (f"Range {x}", lambda: grouped.max() - grouped.min()),
        "q75": (f"Quartile 75th  {x}", lambda: grouped.quantile(q=0.75)),
        "q25": (f"Quartile 25th  {x}", lambda: grouped.quantile(q=0.25)),
        "iqr": (
            f"IQR  {x}",
            lambda: grouped.quantile(q=0.75) - grouped.quantile(q=0.25),
        ),
        "nas": (
            f"NAs  {x}",
            lambda: grouped.agg(lambda values: values.isnull().sum()),
        ),
    }

    tables = []
    for name in requested:
        label, compute = calculators[name]
        tables.append(compute().to_frame(label))

    # One column per statistic -> transpose to one row per statistic.
    results = pd.concat(tables, axis=1).T
    return results