Source code for gdphelper.gdpcleaner

import pandas as pd
def gdpcleaner(gdpdata: pd.DataFrame) -> pd.DataFrame:
    """
    Author: Gabe Fairbrother

    Remove spurious columns, rename relevant columns, and drop rows with NaNs.

    The input frame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    gdpdata: DataFrame
        A loaded dataframe based on a downloaded Open Government GDP at basic
        prices dataset (https://open.canada.ca/en/open-data).

    Returns
    -------
    DataFrame
        A cleaned and simplified DataFrame of the relevant columns for summary
        and visualization. Possible columns (dataset dependent) include:
            Date: Date of data
            Location: Province or Jurisdiction
            Scale: Scale of the Value column (Percent, Millions, etc)
            Unit: Unit of Measure
            Value: Portion of the GDP for the Location and Date
            NAICS_Class: North American Industry Classification System ID
            Industry: Industry of Record
            Sub-sector: Non-profit sub-sector
            Special_Industry: Special Industry Aggregate

    Raises
    ------
    TypeError
        If ``gdpdata`` is not a pandas DataFrame.

    Examples
    --------
    >>> result = gdpcleaner(example_data)
    """
    # Check for DataFrame input argument
    if not isinstance(gdpdata, pd.DataFrame):
        raise TypeError("Argument must be a Pandas DataFrame")

    # Remove spurious columns. Note that a pre-existing 'Value' column is
    # dropped here; 'VALUE' is the column that is kept and renamed to 'Value'
    # below (presumably this avoids a name collision -- confirm with datasets).
    spurious = ['DGUID', 'UOM_ID', 'SCALAR_ID', 'VECTOR', 'COORDINATE',
                'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS', 'Value',
                'Seasonal adjustment']
    # errors='ignore' skips labels that are absent from this dataset, so one
    # call replaces the per-column membership test and repeated frame copies.
    cleaned_frame = gdpdata.drop(columns=spurious, errors='ignore')

    # Drop any rows with null value (only the remaining columns are checked)
    cleaned_frame = cleaned_frame.dropna()

    # Rename relevant columns: fixed names first, then pattern-matched ones
    renames = {'REF_DATE': 'Date', 'GEO': 'Location',
               'SCALAR_FACTOR': 'Scale', 'VALUE': 'Value', 'UOM': 'Unit'}
    for column in cleaned_frame.columns:
        # Non-string labels (e.g. integer column names) cannot match a
        # substring pattern; skip them instead of raising TypeError.
        if not isinstance(column, str):
            continue
        if 'NAICS' in column:
            renames[column] = 'NAICS_Class'
        # Not a spelling mistake, there are multiple similar column headers
        # in different datasets ("aggregate", "aggregation", ...)
        elif 'aggregat' in column:
            renames[column] = 'Special_Industry'

    return cleaned_frame.rename(columns=renames)