import requests, zipfile
import os
import pandas as pd
def gdpimporter(url, filename=None, filetype='csv'):
    """Download a zipped dataset, extract it, and load the data as a DataFrame.

    The archive at ``url`` is saved in the current working directory under
    the last path segment of the URL, extracted there, and then deleted
    (only that archive is deleted; other zip files in the directory are left
    alone).  The metadata file -- the archive entry whose name ends with
    ``MetaData.csv`` -- is read to obtain the title and is deleted afterwards.
    The extracted data csv is left on disk.

    Parameters
    ----------
    url : str
        URL to the zip file (must end with ``.zip``).
    filename : str, optional
        Base name (without the ``.csv`` extension) given to the extracted
        data csv when ``filetype`` is ``'csv'``.  If None, the data file is
        saved as ``open_canada_data.csv``.  This argument has no effect when
        ``filetype`` is ``'all'``; entries then keep their archive names.
    filetype : {'csv', 'all'}, default 'csv'
        The types of files that will be extracted.  If ``'csv'``, only csv
        entries are extracted.  If ``'all'``, every entry is extracted.

    Returns
    -------
    (DataFrame, str)
        A tuple of the data read from the extracted data csv and the title,
        taken as the first index label of the metadata frame as parsed by
        ``pandas.read_csv``.

    Raises
    ------
    TypeError
        If ``filename`` is neither None nor a string.
    ValueError
        If ``filetype`` is not ``'csv'`` or ``'all'``, if ``url`` is not a
        string ending with ``.zip``, or if the archive lacks a data csv or
        a ``MetaData.csv`` entry.
    requests.HTTPError
        If the server answers the download request with an error status.

    Examples
    --------
    >>> gdpimporter("https://www150.statcan.gc.ca/n1/tbl/csv/36100400-eng.zip")  # doctest: +SKIP
    """
    # Validate the arguments before touching the network or the disk.
    if filename is not None and not isinstance(filename, str):
        raise TypeError("'filename' should be either None (default) or a string.")
    if filetype not in ('csv', 'all'):
        raise ValueError("'filetype' should either be 'csv' (by default) or 'all'.")
    if not isinstance(url, str) or not url.endswith('.zip'):
        raise ValueError("'url' should be a valid url of a zipfile that ends with '.zip'.")

    zipname = url.split("/")[-1]  # name of the original zip file

    response = requests.get(url)
    # Fail early on HTTP errors instead of writing an error page to disk and
    # tripping over it later as a corrupt zip file.
    response.raise_for_status()

    data_path = None
    metadata_path = None
    try:
        with open(zipname, "wb") as archive_file:
            archive_file.write(response.content)

        with zipfile.ZipFile(zipname) as archive:
            for info in archive.infolist():
                entry_name = info.filename
                if entry_name.endswith("MetaData.csv"):
                    # Identify the metadata by its suffix rather than by
                    # slicing a fixed number of characters off the zip name.
                    metadata_path = archive.extract(info)
                elif entry_name.endswith(".csv"):
                    if filetype == "csv":
                        # Changing ``filename`` on the ZipInfo makes
                        # ``extract`` write the entry under the new name.
                        if filename is None:
                            info.filename = "open_canada_data.csv"
                        else:
                            info.filename = f"{filename}.csv"
                    # ``extract`` returns the path actually written, so the
                    # data file is found in both 'csv' and 'all' modes.
                    data_path = archive.extract(info)
                elif filetype == "all":
                    archive.extract(info)
    finally:
        # Remove only the archive this call downloaded, even on failure.
        if os.path.exists(zipname):
            os.remove(zipname)

    if metadata_path is None:
        raise ValueError("The zip file does not contain a 'MetaData.csv' file.")

    try:
        metadata = pd.read_csv(metadata_path)
    finally:
        os.remove(metadata_path)  # Clean up the metadata

    if data_path is None:
        raise ValueError("The zip file does not contain a csv data file.")

    data = pd.read_csv(data_path)
    return data, metadata.index[0]