Get holiday data from RijksOverheid¶
NOTE THIS SCRIPT HAS BROKEN¶
- Apparently you are no longer able to retrieve data from this source from before 2019-01-01 !
…
parse dataset from url: https://opendata.rijksoverheid.nl/v1/sources/rijksoverheid/infotypes/schoolholidays
Each of these groups may be used for separate models and use different external features.
Scrapes holidays from rijksoverheid.nl and writes them to CSV
Output format:
ds | Herfstvakantie_noord | Herfstvakantie_zuid | Meivakantie_heelNederland
2014-10-10 | 1.0 | 0.0 | 0.0
2014-10-11 | 1.0 | 0.0 | 0.0
2014-10-12 | 1.0 | 0.0 | 0.0
2014-10-13 | 1.0 | 0.0 | 0.0
2014-10-14 | 1.0 | 0.0 | 0.0
2014-10-15 | 1.0 | 0.0 | 0.0
Set parameters¶
Parameters
start_date: String first date to include in output data. Must be possible to parse with pandas.to_datetime() - end_date: String last date to include in output data. Must be possible to parse with pandas.to_datetime() |
| Returns |
CSV file named {output_file} with 1 DateTime column ds and additional columns with binary holiday indicator values
[18]:
# parameters
output_file = "tmp_dump/scraped_holidays.csv"
start_date = "2018-01-01"
end_date = "2020-12-31" # prefix to standardized filename
Load packages¶
Uses conda environment: envs/schiphol-py.yml
[19]:
import re
import urllib.request
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
Utility functions¶
[20]:
def parse_xml_schoolholidays(xml_string):
"""
Function to parse dataset from url: https://opendata.rijksoverheid.nl/v1/sources/rijksoverheid/infotypes/schoolholidays
args:
xml_string (str): string of xml document
returns:
pandas.DataFrame containing data from xml
"""
def get_schoolyear_from_string(x):
schoolyear = re.findall("20[0-9]+", x)[0]
return schoolyear
root = ET.fromstring(xml_string) # Make ElementTree root to make parsing of data easy
# In the following loops data from the xml document is taken, made into a dictionary and a DataFrame is created
data_vacations = []
for document in root.findall("document"):
schoolyear_raw = document.findall("content")[0].findall("contentblock")[0].findall("schoolyear")[0].text
schoolyear = get_schoolyear_from_string(schoolyear_raw)
for vacations in document.findall("content")[0].findall("contentblock")[0].findall("vacations"):
for vacation in vacations.findall("vacation"):
type_vakantie = vacation.findall("type")[0].text.strip()
for regio in vacation.findall("regions"):
region = regio.findall("region")[0].text
region_start_dt = regio.findall("startdate")[0].text
region_end_dt = regio.findall("enddate")[0].text
dict_region = {
"schoolyear":schoolyear,
"region": region,
"type_vakantie":type_vakantie,
"region_start_dt": region_start_dt,
"region_end_dt": region_end_dt
}
data_vacations.append(dict_region)
df_schoolholidays = pd.DataFrame(data_vacations)
return(df_schoolholidays)
def transform_schoolyear_data_to_long_format(df):
"""
Very specific function to transform schoolyear data.
Every date get's a row for every region with an indication (vakantie_ind) if the date is a holiday.
From:
region | region_end_dt | region_start_dt | schoolyear | type_vakantie
noord | 2017-10-29T22:59:00.000Z | 2017-10-21T22:00:00.000Z | 2017 | schoolvakanties
midden | 2017-10-22T21:59:00.000Z | 2017-10-14T22:00:00.000Z | 2017 | schoolvakanties
zuid | 2017-10-22T21:59:00.000Z | 2017-10-14T22:00:00.000Z | 2017 | schoolvakanties
To:
index | type_vakantie | region | vakantie_ind
2017-10-21 00:00:00+00:00 | schoolvakanties | noord | 1.0
2017-10-22 00:00:00+00:00 | schoolvakanties | noord | 1.0
2017-10-23 00:00:00+00:00 | schoolvakanties | noord | 1.0
...
2017-10-29 00:00:00+00:00 | schoolvakanties | noord | 1.0
2017-10-31 00:00:00+00:00 | schoolvakanties | noord | 0.0
2017-10-01 00:00:00+00:00 | schoolvakanties | noord | 0.0
"""
list_df_regions = []
for row in df.iterrows():
record = row[1]
start = record["region_start_dt"]
end = record["region_end_dt"]
schoolyear = record["schoolyear"]
region = record["region"]
type_vakantie = record["type_vakantie"]
index = pd.date_range(start=start, end=end)
n_periods = len(index)
region_list = [region] * n_periods
type_vakantie_list = [type_vakantie] * n_periods
dict_data = {
"type_vakantie": type_vakantie_list,
"region": region_list,
"vakantie_ind": 1,
}
df_region = pd.DataFrame(index=index, data=dict_data)
list_df_regions.append(df_region)
df_holidays = pd.concat(list_df_regions)
list_df_resampled = []
for region in df_holidays["region"].unique():
type_vakantie = df_holidays["type_vakantie"][0]
df = df_holidays[df_holidays["region"]== region].resample("D").max()
df["type_vakantie"] = df["type_vakantie"].fillna(type_vakantie)
df["region"] = df["region"].fillna(region)
df["vakantie_ind"] = df["vakantie_ind"].fillna(0)
list_df_resampled.append(df)
df_holidays = pd.concat(list_df_resampled)
return df_holidays
def pivot_df_holidays_long(df):
"""
Specific function to pivot dataframe holidays in long format.
Input format:
ds | type_vakantie | region | vakantie_ind
2014-10-10 | Herfstvakantie | noord | 1.0
2014-10-11 | Herfstvakantie | noord | 1.0
2014-10-12 | Herfstvakantie | noord | 1.0
2014-10-13 | Herfstvakantie | noord | 1.0
2014-10-14 | Herfstvakantie | noord | 1.0
2014-10-15 | Herfstvakantie | noord | 1.0
...
2022-04-08 | Herfstvakantie | heel Nederland | 0.0
2022-04-09 | Herfstvakantie | heel Nederland | 0.0
2022-04-10 | Herfstvakantie | heel Nederland | 0
Output format:
ds | Herfstvakantie_noord | Herfstvakantie_zuid | Meivakantie_heelNederland
2014-10-10 | 1.0 | 0.0 | 0.0
2014-10-11 | 1.0 | 0.0 | 0.0
2014-10-12 | 1.0 | 0.0 | 0.0
2014-10-13 | 1.0 | 0.0 | 0.0
2014-10-14 | 1.0 | 0.0 | 0.0
2014-10-15 | 1.0 | 0.0 | 0.0
"""
list_df_pivots = []
for i, region in enumerate(df["region"].unique()):
df_pivot = df[df["region"]==region].pivot(index="ds", columns="type_vakantie", values=["vakantie_ind"]).fillna(0)
renamed_columns = []
for column in df_pivot.columns.get_level_values(1):
column_renamed = column + "_" + region.replace(" ", "")
renamed_columns.append(column_renamed)
df_pivot.columns = renamed_columns
df_pivot = df_pivot.reset_index()
list_df_pivots.append(df_pivot)
if i == 0:
df_holidays_pivot = df_pivot
else:
df_holidays_pivot = df_holidays_pivot.merge(df_pivot, on="ds")
df_holidays_pivot = df_holidays_pivot.fillna(0)
return df_holidays_pivot
Main step to gather and write holiday data¶
[21]:
import ssl
# fix certificate errors..
# https://stackoverflow.com/questions/35569042/ssl-certificate-verify-failed-with-python3
ssl._create_default_https_context = ssl._create_unverified_context
[22]:
start, end
[22]:
(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 00:00:00'))
[23]:
df
[23]:
| schoolyear | region | type_vakantie | region_start_dt | region_end_dt | |
|---|---|---|---|---|---|
| 0 | 2019 | noord | Herfstvakantie | 2019-10-19T22:00:00.000Z | 2019-10-27T21:59:00.000Z |
| 1 | 2019 | midden | Herfstvakantie | 2019-10-19T22:00:00.000Z | 2019-10-27T22:59:00.000Z |
| 2 | 2019 | zuid | Herfstvakantie | 2019-10-12T22:00:00.000Z | 2019-10-20T22:59:00.000Z |
| 3 | 2019 | heel Nederland | Kerstvakantie | 2019-12-21T23:00:00.000Z | 2020-01-05T22:59:00.000Z |
| 4 | 2019 | noord | Voorjaarsvakantie | 2020-02-15T23:00:00.000Z | 2020-02-23T22:59:00.000Z |
| ... | ... | ... | ... | ... | ... |
| 72 | 2025 | zuid | Voorjaarsvakantie | 2026-02-14T23:00:00.000Z | 2026-02-22T22:59:00.000Z |
| 73 | 2025 | heel Nederland | Meivakantie | 2026-04-25T22:00:00.000Z | 2026-05-03T21:59:00.000Z |
| 74 | 2025 | noord | Zomervakantie | 2026-07-04T22:00:00.000Z | 2026-08-16T21:59:00.000Z |
| 75 | 2025 | midden | Zomervakantie | 2026-07-18T22:00:00.000Z | 2026-08-30T21:59:00.000Z |
| 76 | 2025 | zuid | Zomervakantie | 2026-07-11T22:00:00.000Z | 2026-08-23T21:59:00.000Z |
77 rows × 5 columns
[24]:
start, end = pd.to_datetime(start_date), pd.to_datetime(end_date)
url_schoolvakanties = "https://opendata.rijksoverheid.nl/v1/sources/rijksoverheid/infotypes/schoolholidays"
# Get XML string
with urllib.request.urlopen(url_schoolvakanties) as url:
xml_string = url.read()
# Parse to dataframe
df = parse_xml_schoolholidays(xml_string)
# Convert to feature data format with 'ds' dates column
df_holidays_long = transform_schoolyear_data_to_long_format(df)
df_holidays_long["ds"] = pd.to_datetime(df_holidays_long.index.date)
df_holidays_long_pivot = pivot_df_holidays_long(df_holidays_long)
df_holidays_long_pivot = df_holidays_long_pivot.drop(columns=["Herfstvakantie_heelNederland"])
df_holidays_long_pivot = df_holidays_long_pivot[["ds"] + [col for col in df_holidays_long_pivot.columns if col != "ds"]]
print("Subset holidays from period: %s to %s" % (start_date, end_date))
df_holidays_long_pivot = df_holidays_long_pivot \
.query("ds >= '%s' and ds <= '%s'" % (start, end))
print("Collected %d holiday features: " % df_holidays_long_pivot.shape[1],
df_holidays_long_pivot.columns.tolist())
df_holidays_long_pivot
Subset holidays from period: 2018-01-01 to 2020-12-31
Collected 12 holiday features: ['ds', 'Herfstvakantie_noord', 'Voorjaarsvakantie_noord', 'Zomervakantie_noord', 'Herfstvakantie_midden', 'Voorjaarsvakantie_midden', 'Zomervakantie_midden', 'Herfstvakantie_zuid', 'Voorjaarsvakantie_zuid', 'Zomervakantie_zuid', 'Kerstvakantie_heelNederland', 'Meivakantie_heelNederland']
[24]:
| ds | Herfstvakantie_noord | Voorjaarsvakantie_noord | Zomervakantie_noord | Herfstvakantie_midden | Voorjaarsvakantie_midden | Zomervakantie_midden | Herfstvakantie_zuid | Voorjaarsvakantie_zuid | Zomervakantie_zuid | Kerstvakantie_heelNederland | Meivakantie_heelNederland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-12-21 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 2019-12-22 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 2019-12-23 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 2019-12-24 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 2019-12-25 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 372 | 2020-12-27 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 373 | 2020-12-28 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 374 | 2020-12-29 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 375 | 2020-12-30 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 376 | 2020-12-31 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
377 rows × 12 columns
[25]:
print("Writing holiday feature data to: %s" % output_file)
df_holidays_long_pivot.to_csv(output_file, header=True, index=False)
Writing holiday feature data to: tmp_dump/scraped_holidays.csv
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-25-b8de810df7fe> in <module>
1 print("Writing holiday feature data to: %s" % output_file)
----> 2 df_holidays_long_pivot.to_csv(output_file, header=True, index=False)
~\anaconda3\envs\schiphol-py\lib\site-packages\pandas\core\generic.py in to_csv(self, path_or_buf, sep, na_rep, float_format, columns, header, index, index_label, mode, encoding, compression, quoting, quotechar, line_terminator, chunksize, date_format, doublequote, escapechar, decimal)
3226 decimal=decimal,
3227 )
-> 3228 formatter.save()
3229
3230 if path_or_buf is None:
~\anaconda3\envs\schiphol-py\lib\site-packages\pandas\io\formats\csvs.py in save(self)
181 self.mode,
182 encoding=self.encoding,
--> 183 compression=self.compression,
184 )
185 close = True
~\anaconda3\envs\schiphol-py\lib\site-packages\pandas\io\common.py in _get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text)
397 if encoding:
398 # Encoding
--> 399 f = open(path_or_buf, mode, encoding=encoding, newline="")
400 elif is_text:
401 # No explicit encoding
FileNotFoundError: [Errno 2] No such file or directory: 'tmp_dump/scraped_holidays.csv'
Assert correct output¶
- Check start/end dates of output match input parameters
- Check all date differences are exactly 1 day
[31]:
# Assert some assumptions about the data
df_date_diffs = df_holidays_long_pivot \
.sort_values("ds") \
["ds"].diff().dt.days
# assert start and end date
df_start_date, df_end_date = min(df_holidays_long_pivot["ds"]), max(df_holidays_long_pivot["ds"])
if df_start_date == pd.to_datetime(start_date) and df_end_date == pd.to_datetime(end_date):
print("SUCCESS: Output data min/max date as expected")
else:
# todo: show exactly where we found unexpected results
print("FAILURE: Output data min/max date don't match specified parameters")
# assert that all dates per group are exactly 1 day apart
date_diff_ok = all(df_date_diffs.fillna(1) == 1)
if date_diff_ok:
print("SUCCESS: Date difference between all observations exactly 1 day")
else:
# todo: show exactly where we found unexpected results
print("FAILURE: Date differences not all exactly 1 day!")
SUCCESS: Output data min/max date as expected
SUCCESS: Date difference between all observations exactly 1 day
Information about processed holiday data¶
[1]:
# unsplit data
df_holidays_long_pivot.head()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-b4a1da2c784b> in <module>
1 # unsplit data
----> 2 df_holidays_long_pivot.head()
NameError: name 'df_holidays_long_pivot' is not defined
[33]:
# describe all columns of the unsplit data
df_holidays_long_pivot.describe(include='all')
[33]:
| ds | Herfstvakantie_noord | Voorjaarsvakantie_noord | Zomervakantie_noord | Herfstvakantie_midden | Voorjaarsvakantie_midden | Zomervakantie_midden | Herfstvakantie_zuid | Voorjaarsvakantie_zuid | Zomervakantie_zuid | Kerstvakantie_heelNederland | Meivakantie_heelNederland | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1826 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 | 1826.000000 |
| unique | 1826 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| top | 2019-07-06 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| freq | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| first | 2017-01-01 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| last | 2021-12-31 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| mean | NaN | 0.023001 | 0.021906 | 0.118291 | 0.024096 | 0.021906 | 0.118291 | 0.023549 | 0.022453 | 0.118291 | 0.040526 | 0.021906 |
| std | NaN | 0.149948 | 0.146416 | 0.323041 | 0.153390 | 0.146416 | 0.323041 | 0.151680 | 0.148194 | 0.323041 | 0.197243 | 0.146416 |
| min | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | NaN | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |