petl is a Python package for Extracting, Transforming and Loading tables of data.
In this tutorial we will show how to use it to simulate a complete ETL pipeline
We will use the data present in https://opendata.ecdc.europa.eu/covid19/casedistribution/json/. The structure of the json file is the following:
{
"records": [
{
"dateRep": "21/04/2020",
"day": "21",
"month": "4",
"year": "2020",
"cases": "35",
"deaths": "2",
"countriesAndTerritories": "Afghanistan",
"geoId": "AF",
"countryterritoryCode": "AFG",
"popData2018": "37172386",
"continentExp": "Asia"
}, {
...
}
    ]
}
So it is not possible to load the JSON file directly into petl.
We will therefore use a Python list object containing the "records" array from the JSON file.
import petl as etl
import requests
import json
import os.path # To check if a file already exists
import datetime # For create a new file every day
import matplotlib.pyplot as plt # only for example
import pandas as pd # only as an example
import math
# Build a dated cache filename so the data is downloaded at most once per day.
today = datetime.date.today()
filename = "covid19-%s.json" % datetime.date.strftime(today, "%Y%m%d")
# Check if today's file exists; download and cache it only if it does not.
if not os.path.isfile(filename):
    conn = requests.get("https://opendata.ecdc.europa.eu/covid19/casedistribution/json/")
    # Fail fast on HTTP errors instead of silently caching an error page.
    conn.raise_for_status()
    # Context manager guarantees the file is closed even if the write fails.
    with open(filename, "w") as f:
        f.write(conn.text)
with open(filename) as f:
    dict_data = json.load(f)
table = etl.fromdicts(dict_data["records"])  # create a petl table from a dict array
table
The data present in the opendata.ecdc.europa.eu table has some problems
print(etl.typeset(table, "cases"))  # cases column is loaded as str
print(table["dateRep"][0])  # dd/mm/yyyy format — impossible to sort as text
table = etl.convert(table, ["cases", "deaths", "day", "month", "year", "popData2018"], float) # convert numeric columns to float (not int: some values are missing; checked later before computing ratios)
print(etl.typeset(table, "cases"))  # now cases column is float
### Create date as a transformation of dateRep (yyyy/mm/dd)
def _to_iso_like(row):
    """Rearrange dateRep ("dd/mm/yyyy") into sortable "yyyy/mm/dd" form."""
    rep = row["dateRep"]
    return rep[6:] + rep[2:6] + rep[0:2]

table = etl.addfield(table, "date", _to_iso_like)
print(table["date"][0])  # check the ISO-like form on the first element
table = etl.sort(table, ['geoId', 'date'])
In this case we encounter a problem because in some records the information about population is missing.
So we must check it before calculating the ratio.
We introduce here a lambda function (inline function).
The function fun checks whether the popData2018 and cases fields are both floats; if so it computes the ratio, otherwise it sets the ratio to NaN.
fun = lambda row: row["cases"] / row["popData2018"] if type(row["cases"]) == float and type(row["popData2018"]) == float else math.nan
table = etl.addfield(table, "casesratio", fun)
year, month and day are not necessary
# Drop the redundant date-part columns in a single cutout call.
table = etl.cutout(table, "day", "month", "year")
Select only european countries using a selection
Also in this case we will use a lambda function (inline function)
def _in_europe(row):
    """Row predicate: keep only records from the European continent."""
    return row["continentExp"] == "Europe"

# Filter to Europe, keep only the columns of interest, then export to CSV.
europe = etl.cut(
    etl.select(table, _in_europe),
    ["date", "countriesAndTerritories", "popData2018", "cases", "deaths", "casesratio"],
)
etl.tocsv(europe, 'europe.csv')
df = etl.todataframe(table)  # petl can also export a table to a pandas DataFrame
# Boolean mask selecting Italian rows.
# NOTE(review): this binding is immediately overwritten by the filtered
# DataFrame a few lines below, so it only serves as a mask example.
italy = df["geoId"] == "IT"
pandas can compute statistical functions directly from a Series
# Keep only dates after 1 March 2020; the "date" strings are yyyy/mm/dd,
# so lexicographic comparison is also chronological.
since_march = df["date"] > '2020/03/01'
italy = df[(df["geoId"] == "IT") & since_march]
france = df[(df["geoId"] == "FR") & since_march]
germany = df[(df["geoId"] == "DE") & since_march]
uk = df[(df["geoId"] == "UK") & since_march]
spain = df[(df["geoId"] == "ES") & since_march]
plt.figure(figsize=(16, 9))
# Negative shift() slides each cumulative curve — presumably to align the
# countries' outbreak phases with Italy's; verify the offsets against intent.
for frame, offset, label in (
    (italy, 0, 'Italy'),
    (france, -10, 'France'),
    (germany, -8, 'Germany'),
    (uk, -16, 'UK'),
    (spain, -6, 'Spain'),
):
    series = frame["cases"].cumsum()
    if offset:
        series = series.shift(offset)
    plt.plot(frame["date"], series, label=label)
plt.xticks(range(0, len(italy), 7), rotation=45)
plt.legend()
plt.show()
europe
Now:
- transform the table into an array of tuples
- create a vector v with the cumulative-sum values
- add a new column "cumsum" using vector v
- use the new field to draw a graph
# Build a per-country running total of cases.
# NOTE: relies on the table being well ordered — sorted by geoId then date
# earlier — so each country's rows are contiguous and chronological.
arr = etl.toarray(europe)
v = []  # one cumulative value per data row
running = 0
prev_country = None  # sentinel: first row always starts a new country
for row in arr:
    country = row[1]      # countriesAndTerritories column
    cases = int(row[3])   # cases column
    if country != prev_country:
        running = cases   # country changed: restart the running total
    else:
        running += cases  # same country: continue to accumulate
    v.append(running)
    prev_country = country
# (An empty table now yields an empty column instead of an IndexError.)
t3 = etl.addcolumn(europe, 'cumsum', v)
t3
Select Italy and France...
def _country_rows(tbl, name):
    """Select the rows of tbl belonging to one country."""
    return etl.select(tbl, lambda row: row["countriesAndTerritories"] == name)

italy = _country_rows(t3, "Italy")
france = _country_rows(t3, "France")
plt.figure(figsize=(16, 9))
# Plot Italy first, then France, matching the original trace order.
for tbl in (italy, france):
    plt.plot(tbl["date"], tbl["cumsum"])
plt.xticks(range(0, len(italy), 7), rotation=45)
plt.show()