"""
Collect `railway station data <http://www.railwaycodes.org.uk/stations/station0.shtm>`_.
"""
import copy
import itertools
import os
import re
import string
import urllib.parse
import bs4
import numpy as np
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle, save_json, load_json
from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
parse_location_name, parse_table, is_internet_connected, print_conn_err, print_connection_error
class Stations:
    """
    A class for collecting railway station data.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str, None
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    :ivar str Name: name of the data
    :ivar str Key: key of the dict-type data
    :ivar str HomeURL: URL of the main homepage
    :ivar str SourceURL: URL of the data web page
    :ivar str LUDKey: key of the last updated date
    :ivar str LUD: last updated date
    :ivar str DataDir: path to the data directory
    :ivar str CurrentDataDir: path to the current data directory
    :ivar str StnKey: key of the dict-type data of railway stations
    :ivar str StnPickle: name of the pickle file of railway station data
    :ivar str BilingualKey: key of the dict-type data of bilingual names
    :ivar str SpStnNameSignKey: key of the dict-type data of sponsored station name signs
    :ivar str NSFOKey: key of the dict-type data of stations not served by SFO
    :ivar str IntlKey: key of the dict-type data of UK international railway stations
    :ivar str TriviaKey: key of the dict-type data of UK railway station trivia
    :ivar str ARKey: key of the dict-type data of UK railway station access rights
    :ivar str BarrierErrKey: key of the dict-type data of railway station barrier error codes

    **Example**::

        >>> from pyrcs.other_assets import Stations

        >>> stn = Stations()

        >>> print(stn.Name)
        Railway station data

        >>> print(stn.SourceURL)
        http://www.railwaycodes.org.uk/stations/station0.shtm
    """

    def __init__(self, data_dir=None, verbose=True):
        """
        Constructor method.

        ``verbose`` is only used here to decide whether the connection-error message
        is printed when no internet connection is detected.
        """
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway station data'
        self.Key = 'Stations'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/stations/station0.shtm')

        self.LUDKey = 'Last updated date'  # key to last updated date
        self.LUD = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False)

        self.StnKey = 'Railway station data'
        self.StnPickle = self.StnKey.lower().replace(" ", "-")

        self.BilingualKey = 'Bilingual names'
        self.SpStnNameSignKey = 'Sponsored signs'
        self.NSFOKey = 'Not served by SFO'
        self.IntlKey = 'International'
        self.TriviaKey = 'Trivia'
        self.ARKey = 'Access rights'
        self.BarrierErrKey = 'Barrier error codes'

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("other-assets", self.Name.lower())

        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_stn(self, *sub_dir, **kwargs):
        """
        Change directory to the data directory of this class and sub-directories
        (and/or a file).

        The path is rooted at ``self.DataDir``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of `os.makedirs`_, e.g. ``mode=0o777``
        :return: path to the backup data directory for ``Stations``
        :rtype: str

        .. _`os.makedirs`: https://docs.python.org/3/library/os.html#os.makedirs

        :meta private:
        """
        return cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

    def get_station_data_catalogue(self, update=False, verbose=False):
        """
        Get catalogue of railway station data.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool or int
        :return: catalogue of railway station data, or ``None`` if it could neither be
            collected nor loaded from a local backup
        :rtype: dict, None

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # stn_data_cat = stn.get_station_data_catalogue(update=True, verbose=True)
            >>> stn_data_cat = stn.get_station_data_catalogue()

            >>> type(stn_data_cat)
            dict
            >>> list(stn_data_cat.keys())
            ['Railway station data',
             'Sponsored signs',
             'International',
             'Trivia',
             'Access rights',
             'Barrier error codes']
        """
        # Name of the backup file is derived from the path of the source URL,
        # e.g. "/stations/station0.shtm" -> "stations-station0.json"
        url_path = urllib.parse.urlparse(self.SourceURL).path.replace('.shtm', '.json')
        cat_json = '-'.join(x for x in url_path.split('/') if x)
        path_to_cat = cd_dat("catalogue", cat_json)

        if os.path.isfile(path_to_cat) and not update:
            return load_json(path_to_cat)

        if verbose == 2:
            print("Collecting a catalogue of {} data".format(self.StnKey.lower()),
                  end=" ... ")

        try:
            source = requests.get(self.SourceURL, headers=fake_requests_headers())
        except requests.exceptions.ConnectionError:
            if verbose == 2:
                print("Failed.")
            print_conn_err(update=update, verbose=verbose)
            # Fall back on the local backup, which may not exist yet
            return load_json(path_to_cat) if os.path.isfile(path_to_cat) else None

        try:
            soup = bs4.BeautifulSoup(source.text, 'lxml')
            cold_soup = soup.find_all('nav')[1]

            hot_soup = {a.text: urllib.parse.urljoin(self.SourceURL, a.get('href'))
                        for a in cold_soup.find_all('a')}

            catalogue = {self.StnKey: None}

            for k, v in hot_soup.items():
                sub_cat = get_catalogue(v, update=True, confirmation_required=False,
                                        json_it=False)

                if sub_cat != hot_soup:
                    # The linked page has a catalogue of its own
                    if k == 'Introduction':
                        catalogue.update({self.StnKey: {k: v, **sub_cat}})
                    else:
                        catalogue.update({k: sub_cat})
                elif k in (self.BilingualKey, self.NSFOKey):
                    # These two pages are filed under the main station data entry,
                    # which may not have been created yet
                    if catalogue[self.StnKey] is None:
                        catalogue[self.StnKey] = {}
                    catalogue[self.StnKey].update({k: v})
                else:
                    catalogue.update({k: v})

            if verbose == 2:
                print("Done.")

            save_json(catalogue, path_to_cat, verbose=verbose)

        except Exception as e:
            print("Failed. {}".format(e))
            catalogue = None

        return catalogue

    @staticmethod
    def _parse_owner_and_operator(x):
        """
        Parse a raw 'Owner'/'Operator' cell.

        :param x: raw text of the cell
        :type x: str
        :return: list of ``(name, date)`` tuples; the first item is the current
            owner/operator and the date it started (or ``None``), followed by any
            previous ones with their ``'from ... to ...'`` periods
        :rtype: list
        """
        x_ = x.strip().replace('\'', '').replace('([, ', '').replace('])', '').replace(
            '\xa0', '')

        cname_pat = re.compile(r'(?=[A-Z]).*(?= from \d+ \w+ [0-9]{4})')
        cdate_pat = re.compile(r'(?<= from )\d+ \w+ [0-9]{4}')
        pdate_pat = re.compile(r'from\s\d+\s\w+\s[0-9]{4} to \d+ \w+ [0-9]{4}')

        # Current and past records are separated by a carriage return, which may
        # appear either escaped (backslash + 'r') or as the control character itself.
        # Only an exact two-way split is accepted.
        for sep in ('\\r', '\r'):
            parts = [y.rstrip(', ').strip(',').strip() for y in x_.split(sep)]
            if len(parts) == 2:
                current_op, past_op = parts
                break
        else:
            current_op, past_op = x_, None

        # Current owner/operator
        name_match = re.search(cname_pat, current_op)
        if name_match and current_op != '':
            current_name = name_match.group(0)
        else:
            current_name = current_op

        from_match = re.search(cdate_pat, current_op)
        current_from = from_match.group(0) if from_match else None

        operators = [(current_name, current_from)]

        if past_op:
            # Past owners/operators: names are what is left between the periods
            past_dates = re.findall(pdate_pat, past_op)
            past_names = [
                y.strip().lstrip('([') for y in re.split(pdate_pat, past_op) if y.strip()]
            operators += list(zip(past_names, past_dates))

        return operators

    def extended_info(self, info_dat, name):
        """
        Get extended information of the owners/operators.

        :param info_dat: raw data of owners/operators
        :type info_dat: pandas.Series
        :param name: original column name of the owners/operators data
        :type name: str
        :return: extended information of the owners/operators, with columns
            ``name``, ``name + '_since'`` and, for each previous record ``i``,
            ``'Prev_<name>_<i>'`` and ``'Prev_<name>_Period_<i>'``
        :rtype: pandas.DataFrame
        """
        col_names_current = [name, name + '_since']

        parsed = list(info_dat.map(self._parse_owner_and_operator))
        if not parsed:
            # Nothing to expand (``max`` below would fail on an empty list)
            return pd.DataFrame(columns=col_names_current)

        length = len(max(parsed, key=len))

        col_names = list(col_names_current)
        for prev_no in range(1, length):
            for col_name in col_names_current:
                col_names.append(
                    '_'.join(['Prev', col_name, str(prev_no)]).replace('_since', '_Period'))

        # Pad every record to the same number of (name, date) pairs and flatten it
        rows = [
            list(itertools.chain.from_iterable(
                record + [(None, None)] * (length - len(record))))
            for record in parsed]

        return pd.DataFrame(rows, columns=col_names)

    @staticmethod
    def _parse_degrees(x):
        """
        Convert the text of a longitude/latitude cell to a number.

        :param x: raw text, optionally prefixed with ``'c.'``
        :type x: str
        :return: the value as a float, or NaN if the text is empty
        :rtype: float
        """
        x_ = x.strip()
        if x_ == '':
            return np.nan
        return float(x_.replace('c.', '') if x_.startswith('c.') else x_)

    @staticmethod
    def _expand_multi_valued_rows(stn_dat, col):
        """
        Split rows whose 'ELR' or 'Degrees Longitude' cells hold several
        space-separated values into one row per value.

        :param stn_dat: station data as parsed from the web page
        :type stn_dat: pandas.DataFrame
        :param col: column names of ``stn_dat``
        :type col: list
        :return: ``stn_dat`` itself if there is nothing to split; otherwise a new
            data frame sorted by 'Station' with a fresh index
        :rtype: pandas.DataFrame
        """
        temp_degree = stn_dat['Degrees Longitude'].str.split(' ')
        temp_elr = stn_dat['ELR'].map(
            lambda x: x.split(' ') if not re.match('^[Ss]ee ', x) else [x])

        if max(temp_degree.map(len).sum(), temp_elr.map(len).sum()) <= len(stn_dat):
            return stn_dat

        temp_col = ['ELR', 'Degrees Longitude', 'Degrees Latitude', 'Grid Reference']

        idx = [j for j in stn_dat.index
               if max(len(temp_degree[j]), len(temp_elr[j])) > 1]

        temp_vals = []
        for i in idx:
            t = max(len(temp_degree[i]), len(temp_elr[i]))
            temp_val = []
            for c in col:
                x_ = stn_dat.loc[i, c]
                if c in temp_col:
                    y = x_.split(' ')
                    if len(y) == 1:
                        y = y * t
                    temp_val.append(y)
                elif c == 'Mileage':
                    y = re.findall(r'\d+m \d+ch|\d+\.\d+km|\w+', x_)
                    if len(y) > t:
                        # Too many pieces: keep mile-and-chain values only
                        y = re.findall(r'\d+m \d+ch', x_)
                    temp_val.append(y)
                else:
                    temp_val.append([x_] * t)

            temp_vals.append(pd.DataFrame(np.array(temp_val, dtype=object).T, columns=col))

        stn_dat = pd.concat(
            [stn_dat.drop(idx, axis='index')] + temp_vals, axis=0, ignore_index=True)

        # NOTE(review): the sort is applied only when rows were expanded - confirm
        # against the upstream source, whose indentation was not available
        stn_dat.sort_values(['Station'], inplace=True)
        stn_dat.index = range(len(stn_dat))

        return stn_dat

    def collect_station_data_by_initial(self, initial, update=False, verbose=False):
        """
        Collect `railway station data <http://www.railwaycodes.org.uk/stations/station0.shtm>`_
        for the given ``initial`` letter.

        :param initial: initial letter of station data
            (including the station name, ELR, mileage, status, owner, operator,
            degrees of longitude and latitude, and grid reference) for specifying URL
        :type initial: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: railway station data for the given ``initial`` letter and
            date of when the data was last updated; both values are ``None`` if
            the data could not be collected
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # sa = stn.collect_station_data_by_initial('a', update=True, verbose=True)
            >>> sa = stn.collect_station_data_by_initial(initial='a')

            >>> type(sa)
            dict
            >>> list(sa.keys())
            ['A', 'Last updated date']

            >>> print(sa['A'].head())
                        Station   ELR  ... Prev_Operator_6 Prev_Operator_Period_6
            0        Abbey Wood   NKL  ...            None                   None
            1        Abbey Wood  XRS3  ...            None                   None
            2              Aber   CAR  ...            None                   None
            3  Abercynon North    ABD  ...            None                   None
            4                     ABD  ...            None                   None
            [5 rows x 28 columns]
        """
        path_to_pickle = self._cdd_stn("a-z", initial.lower() + ".pickle")

        beginning_with = initial.upper()

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        url = self.SourceURL.replace('station0', 'station{}'.format(initial.lower()))

        railway_station_data = {beginning_with: None, self.LUDKey: None}

        if verbose == 2:
            print("Collecting data of {} beginning with \"{}\"".format(
                self.StnKey.lower(), beginning_with), end=" ... ")

        # The catalogue (or its station entry) may be None if it could not be obtained
        stn_data_catalogue = self.get_station_data_catalogue()
        available = (stn_data_catalogue or {}).get(self.StnKey) or {}

        if beginning_with not in available:
            if verbose == 2:
                print("No data is available.")
            return railway_station_data

        try:
            source = requests.get(url, headers=fake_requests_headers())
        except requests.exceptions.ConnectionError:
            if verbose == 2:
                print("Failed.")
            print_conn_err(verbose=verbose)
            return railway_station_data

        try:
            records, header = parse_table(source, parser='lxml')

            # Create a DataFrame of the requested table
            dat = [[x.replace('=', 'See').strip('\xa0') for x in i] for i in records]
            col = [re.sub(r'\n?\r+\n?', ' ', h) for h in header]
            stn_dat = pd.DataFrame(dat, columns=col)

            stn_dat = self._expand_multi_valued_rows(stn_dat, col)

            # Column-wise ``Series.map`` is used rather than ``DataFrame.applymap``,
            # which newer pandas releases deprecate
            for degrees_col in ('Degrees Longitude', 'Degrees Latitude'):
                stn_dat[degrees_col] = stn_dat[degrees_col].map(self._parse_degrees)
            stn_dat['Grid Reference'] = stn_dat['Grid Reference'].map(
                lambda x: x.replace('c.', '') if x.startswith('c.') else x)

            stn_dat[['Station', 'Station_Note']] = stn_dat.Station.map(
                parse_location_name).apply(pd.Series)

            # Owner
            owners = self.extended_info(stn_dat.Owner, name='Owner')
            stn_dat.drop('Owner', axis=1, inplace=True)
            stn_dat = stn_dat.join(owners)

            # Operator
            operators = self.extended_info(stn_dat.Operator, name='Operator')
            stn_dat.drop('Operator', axis=1, inplace=True)
            stn_dat = stn_dat.join(operators)

            last_updated_date = get_last_updated_date(url)

            railway_station_data.update({beginning_with: stn_dat,
                                         self.LUDKey: last_updated_date})

            if verbose == 2:
                print("Done.")

            save_pickle(railway_station_data, path_to_pickle, verbose=verbose)

        except Exception as e:
            print("Failed. {}".format(e))

        return railway_station_data

    def fetch_station_data(self, update=False, pickle_it=False, data_dir=None, verbose=False):
        """
        Fetch `railway station data <http://www.railwaycodes.org.uk/stations/station0.shtm>`_
        from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: railway station data
            (including the station name, ELR, mileage, status, owner, operator,
            degrees of longitude and latitude, and grid reference) and
            date of when the data was last updated; both values are ``None`` if
            no data is available at all
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # rail_stn_data = stn.fetch_station_data(update=True, verbose=True)
            >>> rail_stn_data = stn.fetch_station_data()

            >>> type(rail_stn_data)
            dict
            >>> list(rail_stn_data.keys())
            ['Railway station data', 'Last updated date']

            >>> rail_stn_dat = rail_stn_data['Railway station data']

            >>> type(rail_stn_dat)
            pandas.core.frame.DataFrame
            >>> print(rail_stn_dat.head())
                     Station   ELR  ... Prev_Operator_6 Prev_Operator_Period_6
            2606              MRL1  ...            None                   None
            723                TAT  ...            None                   None
            89                 ABD  ...            None                   None
            90                 CAM  ...            None                   None
            85    Abbey Wood   NKL  ...            None                   None
            [5 rows x 32 columns]
        """
        verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True)

        # Probe the connection once rather than once per letter
        collect_verbose = verbose_ if is_internet_connected() else False

        data_sets = [
            self.collect_station_data_by_initial(x, update=update, verbose=collect_verbose)
            for x in string.ascii_lowercase]

        if all(d[x] is None for d, x in zip(data_sets, string.ascii_uppercase)):
            if update:
                print_conn_err(verbose=verbose)
                print("No data of the {} has been freshly collected.".format(
                    self.StnKey.lower()))
            # Nothing was collected; fall back on the local backups
            data_sets = [
                self.collect_station_data_by_initial(x, update=False, verbose=verbose_)
                for x in string.ascii_lowercase]

        # Widest tables first, so that their column order leads in the combined table
        tables = (d[x] for d, x in zip(data_sets, string.ascii_uppercase))
        stn_dat_tbl = sorted(
            [tbl for tbl in tables if tbl is not None], key=lambda tbl: tbl.shape[1],
            reverse=True)

        if stn_dat_tbl:
            stn_data = pd.concat(stn_dat_tbl, axis=0, ignore_index=True, sort=False)
            stn_data = stn_data.where(pd.notna(stn_data), None)
            stn_data.sort_values(['Station'], inplace=True)
        else:
            # ``pd.concat`` raises on an empty list
            stn_data = None

        latest_update_date = max(
            (d[self.LUDKey] for d in data_sets if d[self.LUDKey] is not None), default=None)

        railway_station_data = {self.StnKey: stn_data, self.LUDKey: latest_update_date}

        if pickle_it and data_dir and stn_data is not None:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(self.CurrentDataDir, self.StnPickle + ".pickle")
            save_pickle(railway_station_data, path_to_pickle, verbose=verbose)

        return railway_station_data