Installation Allocations
#exports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import urllib.parse as urlparse
from urllib.parse import parse_qs
import re
import os
from warnings import warn
from ipypb import track
from IPython.display import JSON
Retrieving Installation Table Links
In this first section we'll retrieve the URLs of the installation allocation tables across the different ETS phases and countries.
We'll start by retrieving the raw search results for a single country.
#exports
def get_country_raw_search(country_code='AT'):
url = 'https://ec.europa.eu/clima/ets/nap.do'
params = {
'languageCode': 'en',
'nap.registryCodeArray': country_code,
'periodCode': '-1',
'search': 'Search',
'currentSortSettings': ''
}
r = requests.get(url, params=params)
return r
r = get_country_raw_search('IS')
r
<Response [200]>
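Before parsing anything it's worth confirming that the request actually succeeded; `requests` provides a built-in helper for this:

```python
# Raise an HTTPError if the server returned a 4xx/5xx status code
r.raise_for_status()
print(r.status_code, r.url)
```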
From the response we can extract a table containing the relevant information on each allocation table, including a link to its installations.
#exports
def extract_search_df(r):
soup = bs(r.text)
results_table = soup.find('table', attrs={'id': 'tblNapSearchResult'})
df_search = (pd
.read_html(str(results_table))
[0]
.iloc[2:, :-3]
.reset_index(drop=True)
.T
.set_index(0)
.T
.reset_index(drop=True)
.rename(columns={
'National Administrator': 'country',
'EU ETS Phase': 'phase',
'For issuance to not new entrants': 'non_new_entrants',
'From NER': 'new_entrants_reserve'
})
)
df_search['installations_link'] = ['https://ec.europa.eu/'+a['href'] for a in soup.findAll('a', text=re.compile('Installations linked to this Allocation Table'))]
return df_search
df_search_country = extract_search_df(r)
df_search_country
|   | country | phase | non_new_entrants | new_entrants_reserve | installations_link |
|---|---------|-------|------------------|----------------------|--------------------|
| 0 | Iceland | Phase 3 (2013-2020) | 11527090 | 172828 | https://ec.europa.eu//clima/ets/napInstallatio... |
It's all well and good doing this for a single country, but we want Europe-wide coverage. We can identify the countries we can query from the option box on the main search page.
#exports
def get_country_codes():
r = get_country_raw_search()
soup = bs(r.text)
registry_code_to_country = {
option['value']: option.text
for option
in soup.find('select', attrs={'name': 'nap.registryCodeArray'}).findAll('option')
}
return registry_code_to_country
registry_code_to_country = get_country_codes()
JSON([registry_code_to_country])
<IPython.core.display.JSON object>
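As a quick sanity check we can print a few of the code/country pairs (the exact contents depend on what the search page currently lists):

```python
# Peek at the first few registry code -> country name mappings
for code, country in list(registry_code_to_country.items())[:3]:
    print(code, country)
```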
We can now use these codes to repeat our search for each country and concatenate the results.
#exports
def get_installation_links_dataframe():
    # Retrieve the registry codes here so the function is self-contained
    registry_code_to_country = get_country_codes()
    df_search = pd.DataFrame()
    for registry_code in registry_code_to_country.keys():
        r = get_country_raw_search(registry_code)
        df_search_country = extract_search_df(r)
        df_search = df_search.append(df_search_country)
    df_search = df_search.reset_index(drop=True)
    null_values_present = df_search.isnull().sum().sum() > 0
    if null_values_present:
        warn('There are null values present in the dataframe')
    return df_search
df_search = get_installation_links_dataframe()
df_search.head()
|   | country | phase | non_new_entrants | new_entrants_reserve | installations_link |
|---|---------|-------|------------------|----------------------|--------------------|
| 0 | Austria | Phase 1 (2005-2007) | 97791309 | 990150 | https://ec.europa.eu//clima/ets/napInstallatio... |
| 1 | Austria | Phase 2 (2008-2012) | 160218569 | 2000000 | https://ec.europa.eu//clima/ets/napInstallatio... |
| 2 | Austria | Phase 3 (2013-2020) | 160295499 | 1893669 | https://ec.europa.eu//clima/ets/napInstallatio... |
| 3 | Belgium | Phase 1 (2005-2007) | 178690906 | 7653297 | https://ec.europa.eu//clima/ets/napInstallatio... |
| 4 | Belgium | Phase 2 (2008-2012) | 283317829 | 9153852 | https://ec.europa.eu//clima/ets/napInstallatio... |
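As a quick check on coverage we can count how many allocation tables (normally one per ETS phase) were found for each country:

```python
# Number of allocation tables retrieved per country
df_search['country'].value_counts().head()
```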
Retrieving Installation Allocation Data
In this section we'll start by separating the root URL and the query parameters from each of the installation links.
#exports
def get_url_root_and_params(installations_link):
url_root = installations_link.split('?')[0]
parsed = urlparse.urlparse(installations_link)
params = {k: v[0] for k, v in parse_qs(parsed.query).items()}
return url_root, params
installations_link = df_search.loc[0, 'installations_link']
root_url, params = get_url_root_and_params(installations_link)
JSON(params)
<IPython.core.display.JSON object>
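To make the behaviour of `get_url_root_and_params` concrete without relying on the exact query string used by the Commission site, here's a minimal example with a made-up URL:

```python
# Hypothetical URL purely for illustration - the real installation links
# carry different parameter names
example_root, example_params = get_url_root_and_params('https://example.com/search?foo=1&bar=2')

assert example_root == 'https://example.com/search'
assert example_params == {'foo': '1', 'bar': '2'}
```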
We also need to pass the page number in the request parameters; to do this we first need to know how many pages there are.
#exports
def get_num_pages(root_url, params):
soup = bs(requests.get(root_url, params=params).text)
soup_pn = soup.find('input', attrs={'name': 'resultList.lastPageNumber'})
if soup_pn is not None:
num_pages = int(soup_pn['value'])
else:
num_pages = 1
return num_pages
num_pages = get_num_pages(root_url, params)
num_pages
11
We're now ready to iterate over multiple pages and combine the results for a single ETS phase in a single country.
#exports
def extract_installation_allocations_df(r):
soup = bs(r.text)
table = soup.find('table', attrs={'id': 'tblNapList'})
df_installation_allocations = (pd
.read_html(str(table))
[0]
.drop([0, 1])
.reset_index(drop=True)
.T
.set_index(0)
.T
.drop(columns=['Options'])
)
return df_installation_allocations
def retry_request(root_url, params, n_retries=5, **kwargs):
    # Keep a reference to the last exception so it can be re-raised if every
    # attempt fails (the name bound by `except ... as e` is cleared once the
    # except block exits)
    last_exception = None
    for i in range(n_retries):
        try:
            r = requests.get(root_url, params=params, **kwargs)
            return r
        except Exception as e:
            last_exception = e
            continue
    raise last_exception
def get_phase_installation_allocations_df(root_url, params, n_retries=5):
df_installation_allocations = pd.DataFrame()
num_pages = get_num_pages(root_url, params)
params['nextList'] = 'Next'
for page_num in range(num_pages):
params['resultList.currentPageNumber'] = page_num
r = retry_request(root_url, params, n_retries=n_retries)
df_installation_allocations_page = extract_installation_allocations_df(r)
df_installation_allocations = df_installation_allocations.append(df_installation_allocations_page)
df_installation_allocations = df_installation_allocations.reset_index(drop=True)
return df_installation_allocations
df_installation_allocations = get_phase_installation_allocations_df(root_url, params)
print(f'DataFrame shape: {df_installation_allocations.shape}')
df_installation_allocations.head(3)
DataFrame shape: (201, 11)
|   | Installation ID | Installation Name | Address City | Account Holder Name | Account Status | Permit ID | Latest Update | 2005 | 2006 | 2007 | Status |
|---|-----------------|-------------------|--------------|---------------------|----------------|-----------|---------------|------|------|------|--------|
| 0 | 1 | Baumit Baustoffe Bad Ischl | Bad Ischl | Calmit GmbH | open | IKA119 | 2009-05-08 09:13:58 | 44894 | 44894 | 44894 | Active |
| 1 | 2 | Breitenfelder Edelstahl Mitterdorf | Mitterdorf | Breitenfeld Edelstahl AG | open | IES069 | 2009-05-08 09:13:58 | 8492 | 8492 | 8492 | Active |
| 2 | 3 | Ziegelwerk Danreiter Ried im Innkreis | Ried | Ziegelwerk Danreiter GmbH & Co KG | open | IZI155 | 2009-05-08 09:13:58 | 7397 | 7397 | 7397 | Active |
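`retry_request` retries immediately after a failure; if the server is throttling requests it may be worth waiting between attempts. A minimal variant (the delay values here are arbitrary):

```python
import time

def retry_request_with_backoff(root_url, params, n_retries=5, base_delay=1, **kwargs):
    # Same idea as retry_request, but waits progressively longer between attempts
    last_exception = None
    for attempt in range(n_retries):
        try:
            return requests.get(root_url, params=params, **kwargs)
        except Exception as e:
            last_exception = e
            time.sleep(base_delay * (attempt + 1))
    raise last_exception
```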
The next step is to repeat this for all countries and ETS phases, then combine the resulting dataframes.
#exports
def get_all_installation_allocations_df(df_search):
col_renaming_map = {
'Installation ID': 'installation_id',
'Installation Name': 'installation_name',
'Address City': 'installation_city',
'Account Holder Name': 'account_holder',
'Account Status': 'account_status',
'Permit ID': 'permit_id',
'Status': 'status'
}
df_installation_allocations = pd.DataFrame()
# Retrieving raw data
for country in track(df_search['country'].unique()):
df_installation_allocations_country = pd.DataFrame()
country_installations_links = df_search.loc[df_search['country']==country, 'installations_link']
for installations_link in track(country_installations_links, label=country):
        root_url, params = get_url_root_and_params(installations_link)
        df_installation_allocations_country_phase = get_phase_installation_allocations_df(root_url, params)
if df_installation_allocations_country.size > 0:
df_installation_allocations_country = pd.merge(df_installation_allocations_country, df_installation_allocations_country_phase, how='outer', on=list(col_renaming_map.keys()))
else:
df_installation_allocations_country = df_installation_allocations_country_phase
df_installation_allocations_country['country'] = country
df_installation_allocations = df_installation_allocations.append(df_installation_allocations_country)
# Collating update datetimes
update_cols = df_installation_allocations.columns[df_installation_allocations.columns.str.contains('Latest Update')]
df_installation_allocations['latest_update'] = df_installation_allocations[update_cols].fillna('').max(axis=1)
df_installation_allocations = df_installation_allocations.drop(columns=update_cols)
# Renaming columns
df_installation_allocations = (df_installation_allocations
.reset_index(drop=True)
.rename(columns=col_renaming_map)
)
# Sorting column order
non_year_cols = ['country'] + list(col_renaming_map.values()) + ['latest_update']
year_cols = sorted(list(set(df_installation_allocations.columns) - set(non_year_cols)))
df_installation_allocations = df_installation_allocations[non_year_cols+year_cols]
# Dropping header rows
    idxs_to_drop = df_installation_allocations['permit_id'].str.contains(r'\*').replace(False, np.nan).dropna().index
df_installation_allocations = df_installation_allocations.drop(idxs_to_drop)
return df_installation_allocations
df_installation_allocations = get_all_installation_allocations_df(df_search)
df_installation_allocations.head()
|   | country | installation_id | installation_name | installation_city | account_holder | account_status | permit_id | status | latest_update | 2005 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 |
|---|---------|-----------------|-------------------|-------------------|----------------|----------------|-----------|--------|---------------|------|-----|------|------|------|------|------|------|------|------|------|------|
| 0 | Austria | 1 | Baumit Baustoffe Bad Ischl | Bad Ischl | Calmit GmbH | open | IKA119 | Active | 2013-12-19 15:47:52 | 44894 | ... | 43171 | 43171 | 42159 | 41426 | 40685 | 39937 | 39180 | 38416 | 37643 | 36866 |
| 1 | Austria | 2 | Breitenfelder Edelstahl Mitterdorf | Mitterdorf | Breitenfeld Edelstahl AG | open | IES069 | Active | 2013-12-19 15:48:16 | 8492 | ... | 26429 | 26429 | 15118 | 14856 | 14590 | 14322 | 14051 | 13776 | 13499 | 13221 |
| 2 | Austria | 3 | Ziegelwerk Danreiter Ried im Innkreis | Ried | Ziegelwerk Danreiter GmbH & Co KG | open | IZI155 | Active | 2013-12-19 15:48:12 | 7397 | ... | 5927 | 5927 | 3494 | 3434 | 3373 | 3311 | 3248 | 3185 | 3120 | 3056 |
| 3 | Austria | 4 | Wienerberger Blindenmarkt | Blindenmarkt | Wienerberger Österreich GmbH | closed | IZI146-1 | Account Closed | 2009-05-08 09:13:58 | 8335 | ... | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| 4 | Austria | 5 | Isomax Dekorative Laminate Wiener Neudorf | Wiener Neudorf | FunderMax GmbH | open | ICH113 | Active | 2013-12-19 15:47:46 | 24003 | ... | 27343 | 27343 | 26223 | 25697 | 25167 | 24636 | 24102 | 23566 | 23026 | 22488 |
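Note that because the tables were parsed with `read_html` and then had their headers reshuffled, the year columns may come back as strings rather than numbers (this can vary with the pandas version). If so, a conversion along these lines is needed before doing any arithmetic:

```python
# Identify the year columns (e.g. 2005-2020) and coerce them to numeric,
# turning 'nan' strings and other non-numeric entries into proper NaNs
year_cols = [col for col in df_installation_allocations.columns if str(col).isdigit()]
df_installation_allocations[year_cols] = df_installation_allocations[year_cols].apply(pd.to_numeric, errors='coerce')
```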
A quick check reveals some issues with the dataset; for example, several permit IDs have duplicate entries.
permit_ids_with_duplicate_idxs = (df_installation_allocations['permit_id'].value_counts()>1).replace(False, np.nan).dropna().index
df_dupe_permit_ids = df_installation_allocations[df_installation_allocations['permit_id'].isin(permit_ids_with_duplicate_idxs)].sort_values('permit_id')
print(f"There are {df_dupe_permit_ids['permit_id'].unique().size} permit ids which have duplicate entries due to inconsistent values (e.g. for `installation_id`)\n")
df_dupe_permit_ids.head(6)
There are 109 permit ids which have duplicate entries due to inconsistent values (e.g. for `installation_id`)
|   | country | installation_id | installation_name | installation_city | account_holder | account_status | permit_id | status | latest_update | 2005 | ... | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | 2020 |
|---|---------|-----------------|-------------------|-------------------|----------------|----------------|-----------|--------|---------------|------|-----|------|------|------|------|------|------|------|------|------|------|
| 4184 | France | 204319 | ARKEMA FRANCE- Usine de Mont | Orthez | ARKEMA FRANCE | open | 5202690 | Active | 2014-01-27 16:54:18 | nan | ... | nan | nan | 26157 | 25703 | 25242 | 24779 | 24308 | 23835 | 23355 | 22873 |
| 2937 | France | 102 | ARKEMA FRANCE - Usine de Mont | MONT | ARKEMA FRANCE | closed | 5202690 | Account Closed | 2012-11-29 14:18:52 | 32802 | ... | 0 | 0 | nan | nan | nan | nan | nan | nan | nan | nan |
| 4165 | France | 204063 | SOCIETE FROMAGERE DE SAINTE CECILE | SAINTE-CECILE | SOCIETE FROMAGERE DE SAINTE CECILE | open | 5301510 | Active | 2014-01-27 16:57:52 | nan | ... | nan | nan | 3864 | 3458 | 3063 | 2680 | 2309 | 1949 | 1602 | 1267 |
| 3913 | France | 1121 | SOCIETE FROMAGERE DE SAINTE CECILE | Sainte Cécile | BOUVIER | closed | 5301510 | Account Closed | 2009-05-08 09:13:58 | 7790 | ... | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
| 4162 | France | 204060 | SOCIETE FROMAGERE DE DOMFRONT | Domfront | SOCIETE FROMAGERE DE DOMFRONT | open | 5302209 | Active | 2014-01-27 16:54:24 | nan | ... | nan | nan | 5780 | 5173 | 4582 | 4009 | 3454 | 2916 | 2396 | 1895 |
| 2961 | France | 127 | SOCIETE FROMAGERE DE DOMFRONT | Domfront | SCOTTO | closed | 5302209 | Account Closed | 2009-05-08 09:13:58 | 11326 | ... | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan |
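One way these duplicates could be resolved (purely a sketch, whether it's appropriate depends on the analysis) is to keep the most recently updated row for each permit ID, setting aside rows without a permit ID:

```python
# Keep only the most recently updated row for each permit id (illustrative only);
# rows with a missing permit_id are excluded here rather than deduplicated
df_deduped = (df_installation_allocations
              .dropna(subset=['permit_id'])
              .sort_values('latest_update')
              .drop_duplicates(subset='permit_id', keep='last'))
```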
Lastly, we'll wrap these steps in a function that will either redownload the data or load it from the specified directory.
#exports
def get_installation_allocations_df(data_dir='data', redownload=False):
if not os.path.exists(data_dir):
os.makedirs(data_dir)
if redownload == True:
df_search = get_installation_links_dataframe()
df_installation_allocations = get_all_installation_allocations_df(df_search)
df_installation_allocations.to_csv(f'{data_dir}/installation_allocations.csv', index=False)
else:
df_installation_allocations = pd.read_csv(f'{data_dir}/installation_allocations.csv')
return df_installation_allocations
redownload_installation_allocations = False
df_installation_allocations = get_installation_allocations_df(
data_dir='../data',
redownload=redownload_installation_allocations
)