Source code for station

"""
Collect `railway station data <http://www.railwaycodes.org.uk/stations/station0.shtm>`_.
"""

import copy
import itertools
import os
import re
import string
import urllib.parse

import bs4
import numpy as np
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle, save_json, load_json

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
    parse_location_name, parse_table, is_internet_connected, print_conn_err, print_connection_error


class Stations:
    """
    A class for collecting railway station data.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param verbose: whether to print relevant information in console
        as the function runs, defaults to ``True``
    :type verbose: bool or int

    :ivar str Name: name of the data
    :ivar str Key: key of the dict-type data
    :ivar str HomeURL: URL of the main homepage
    :ivar str SourceURL: URL of the data web page
    :ivar str LUDKey: key of the last updated date
    :ivar str LUD: last updated date
    :ivar dict Catalogue: catalogue of the data
    :ivar str DataDir: path to the data directory
    :ivar str CurrentDataDir: path to the current data directory
    :ivar str StnKey: key of the dict-type data of railway stations
    :ivar str StnPickle: name of the pickle file of railway station data
    :ivar str BilingualKey: key of the dict-type data of bilingual names
    :ivar str SpStnNameSignKey: key of the dict-type data of sponsored station name signs
    :ivar str NSFOKey: key of the dict-type data of stations not served by SFO
    :ivar str IntlKey: key of the dict-type data of UK international railway stations
    :ivar str TriviaKey: key of the dict-type data of UK railway station trivia
    :ivar str ARKey: key of the dict-type data of UK railway station access rights
    :ivar str BarrierErrKey: key of the dict-type data of railway station barrier error codes

    **Example**::

        >>> from pyrcs.other_assets import Stations

        >>> stn = Stations()

        >>> print(stn.Name)
        Railway station data

        >>> print(stn.SourceURL)
        http://www.railwaycodes.org.uk/stations/station0.shtm
    """

    def __init__(self, data_dir=None, verbose=True):
        """
        Constructor method.
        """
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway station data'
        self.Key = 'Stations'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/stations/station0.shtm')

        self.LUDKey = 'Last updated date'  # key to last updated date
        self.LUD = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False)

        self.StnKey = 'Railway station data'
        self.StnPickle = self.StnKey.lower().replace(" ", "-")

        self.BilingualKey = 'Bilingual names'
        self.SpStnNameSignKey = 'Sponsored signs'
        self.NSFOKey = 'Not served by SFO'
        self.IntlKey = 'International'
        self.TriviaKey = 'Trivia'
        self.ARKey = 'Access rights'
        self.BarrierErrKey = 'Barrier error codes'

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("other-assets", self.Name.lower())
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_stn(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and subdirectories (and/or a file).

        The directory for this module: ``"\\dat\\other-assets\\stations"``.

        :param sub_dir: subdirectory or subdirectories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of `os.makedirs`_, e.g. ``mode=0o777``
        :return: path to the backup data directory for ``Stations``
        :rtype: str

        .. _`os.makedirs`: https://docs.python.org/3/library/os.html#os.makedirs

        :meta private:
        """
        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path
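
    # An illustrative (hypothetical) call of the helper above:
    # ``Stations()._cdd_stn("a-z", "a.pickle")`` returns a path equivalent to
    # ``os.path.join(DataDir, "a-z", "a.pickle")``, creating the "a-z"
    # subdirectory if it does not yet exist (``cd(..., mkdir=True)`` from
    # pyhelpers joins the path components and makes the directories).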

    def get_station_data_catalogue(self, update=False, verbose=False):
        """
        Get catalogue of railway station data.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: catalogue of railway station data
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # stn_data_cat = stn.get_station_data_catalogue(update=True, verbose=True)
            >>> stn_data_cat = stn.get_station_data_catalogue()

            >>> type(stn_data_cat)
            dict

            >>> list(stn_data_cat.keys())
            ['Railway station data',
             'Sponsored signs',
             'International',
             'Trivia',
             'Access rights',
             'Barrier error codes']
        """

        cat_json = '-'.join(x for x in urllib.parse.urlparse(self.SourceURL).path.replace(
            '.shtm', '.json').split('/') if x)
        path_to_cat = cd_dat("catalogue", cat_json)

        if os.path.isfile(path_to_cat) and not update:
            catalogue = load_json(path_to_cat)

        else:
            if verbose == 2:
                print("Collecting a catalogue of {} data".format(self.StnKey.lower()),
                      end=" ... ")

            try:
                source = requests.get(self.SourceURL, headers=fake_requests_headers())

            except requests.exceptions.ConnectionError:
                print("Failed.") if verbose == 2 else ""
                print_conn_err(update=update, verbose=verbose)

                catalogue = load_json(path_to_cat)

            else:
                try:
                    soup = bs4.BeautifulSoup(source.text, 'lxml')

                    cold_soup = soup.find_all('nav')[1]
                    hot_soup = {a.text: urllib.parse.urljoin(self.SourceURL, a.get('href'))
                                for a in cold_soup.find_all('a')}

                    catalogue = {self.StnKey: None}
                    for k, v in hot_soup.items():
                        sub_cat = get_catalogue(
                            v, update=True, confirmation_required=False, json_it=False)

                        if sub_cat != hot_soup:
                            if k == 'Introduction':
                                catalogue.update({self.StnKey: {k: v, **sub_cat}})
                            else:
                                catalogue.update({k: sub_cat})
                        else:
                            if k in ('Bilingual names', 'Not served by SFO'):
                                catalogue[self.StnKey].update({k: v})
                            else:
                                catalogue.update({k: v})

                    print("Done.") if verbose == 2 else ""

                    save_json(catalogue, path_to_cat, verbose=verbose)

                except Exception as e:
                    print("Failed. {}".format(e))
                    catalogue = None

        return catalogue
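
    # Note the nesting in the method above: the sub-pages 'Bilingual names' and
    # 'Not served by SFO' are filed under catalogue['Railway station data']
    # rather than at the top level, which is why they do not appear among the
    # top-level keys shown in the docstring example.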

    @staticmethod
    def _parse_owner_and_operator(x):
        """
        Parse 'Owner'/'Operator' column.
        """
        x_ = x.strip().replace('\'', '').replace('([, ', '').replace('])', '').replace(
            '\xa0', '')

        cname_pat = re.compile(r'(?=[A-Z]).*(?= from \d+ \w+ [0-9]{4})')
        cdate_pat = re.compile(r'(?<= from )\d+ \w+ [0-9]{4}')
        pdate_pat = re.compile(r'from\s\d+\s\w+\s[0-9]{4} to \d+ \w+ [0-9]{4}')

        try:
            current_op, past_op = [
                y.rstrip(', ').strip(',').strip() for y in x_.split('\\r')]
        except ValueError:
            try:
                current_op, past_op = [
                    y.rstrip(', ').strip(',').strip() for y in x_.split('\r')]
            except ValueError:
                current_op, past_op = x_, None

        # Current operator
        current_name = re.search(cname_pat, current_op)
        if current_name and current_op != '':
            current_name = current_name.group(0)
        else:
            current_name = current_op

        current_from = re.search(cdate_pat, current_op)
        if current_from:
            current_from = current_from.group(0)

        current_operator = [(current_name, current_from)]

        if past_op:
            # Past operators
            past_dates = re.findall(pdate_pat, past_op)
            past_names = [y.strip().lstrip('([') for y in re.split(pdate_pat, past_op)
                          if y.strip()]
            past_operators = [(n, d) for n, d in zip(past_names, past_dates)]
        else:
            past_operators = []

        operators = current_operator + past_operators

        return operators
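
    # A minimal illustration of ``_parse_owner_and_operator`` (the sample cell
    # value below is hypothetical; the current holder and the bracketed list of
    # past holders are separated by a carriage return in the raw data):
    #
    #     >>> Stations._parse_owner_and_operator(
    #     ...     'Arriva Trains Wales from 14 October 2018\r'
    #     ...     '([Previous Operator from 1 April 2006 to 13 October 2018])')
    #     [('Arriva Trains Wales', '14 October 2018'),
    #      ('Previous Operator', 'from 1 April 2006 to 13 October 2018')]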

    def extended_info(self, info_dat, name):
        """
        Get extended information of the owners/operators.

        :param info_dat: raw data of owners/operators
        :type info_dat: pandas.Series
        :param name: original column name of the owners/operators data
        :type name: str
        :return: extended information of the owners/operators
        :rtype: pandas.DataFrame
        """
        temp = list(info_dat.map(self._parse_owner_and_operator))
        length = len(max(temp, key=len))

        col_names_current = [name, name + '_since']
        prev_no = list(itertools.chain.from_iterable(
            itertools.repeat(x, 2) for x in list(range(1, length))))
        col_names_ = zip(col_names_current * (length - 1), prev_no)
        col_names = col_names_current + [
            '_'.join(['Prev', x, str(d)]).replace('_since', '_Period')
            for x, d in col_names_]

        for i in range(len(temp)):
            if len(temp[i]) < length:
                temp[i] += [(None, None)] * (length - len(temp[i]))

        temp2 = pd.DataFrame(temp)
        extended_info = [temp2[c].apply(pd.Series) for c in temp2.columns]
        extended_info = pd.concat(extended_info, axis=1, sort=False)
        extended_info.columns = col_names

        return extended_info
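
    # A sketch of what ``extended_info`` does (hypothetical sample values):
    # given a Series whose rows parse to, say, [('GWR', '1 April 2019')] and
    # [('GWR', '1 April 2019'), ('FGW', 'from 1 April 2006 to 31 March 2019')],
    # the shorter row is padded with (None, None) to equal length and the
    # tuples are unpacked into columns named, for ``name='Operator'``:
    # ['Operator', 'Operator_since', 'Prev_Operator_1', 'Prev_Operator_Period_1'].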

    @staticmethod
    def _parse_degrees(x):
        """
        Parse a degrees (longitude/latitude) value, returning NaN for blanks
        and stripping a 'c.' prefix from approximate values.
        """
        if x == '':
            z = np.nan
        else:
            z = float(x.replace('c.', '') if x.startswith('c.') else x)

        return z
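
    # For example (hypothetical inputs): ``_parse_degrees('')`` returns NaN;
    # ``_parse_degrees('c.51.5033')`` returns 51.5033, stripping the 'c.'
    # ("circa") prefix that marks approximate coordinates in the source data;
    # ``_parse_degrees('-0.1195')`` returns -0.1195.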

    def collect_station_data_by_initial(self, initial, update=False, verbose=False):
        """
        Collect `railway station data
        <http://www.railwaycodes.org.uk/stations/station0.shtm>`_
        for the given ``initial`` letter.

        :param initial: initial letter of station data
            (including the station name, ELR, mileage, status, owner, operator,
            degrees of longitude and latitude, and grid reference) for specifying URL
        :type initial: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: railway station data for the given ``initial`` letter and
            date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # sa = stn.collect_station_data_by_initial('a', update=True, verbose=True)
            >>> sa = stn.collect_station_data_by_initial(initial='a')

            >>> type(sa)
            dict

            >>> list(sa.keys())
            ['A', 'Last updated date']

            >>> print(sa['A'].head())
                       Station   ELR  ... Prev_Operator_6 Prev_Operator_Period_6
            0       Abbey Wood   NKL  ...            None                   None
            1       Abbey Wood  XRS3  ...            None                   None
            2             Aber   CAR  ...            None                   None
            3  Abercynon North   ABD  ...            None                   None
            4                    ABD  ...            None                   None

            [5 rows x 28 columns]
        """

        path_to_pickle = self._cdd_stn("a-z", initial.lower() + ".pickle")

        beginning_with = initial.upper()

        if os.path.isfile(path_to_pickle) and not update:
            railway_station_data = load_pickle(path_to_pickle)

        else:
            url = self.SourceURL.replace('station0', 'station{}'.format(initial.lower()))

            railway_station_data = {beginning_with: None, self.LUDKey: None}

            if verbose == 2:
                print("Collecting data of {} beginning with \"{}\"".format(
                    self.StnKey.lower(), beginning_with), end=" ... ")

            stn_data_catalogue = self.get_station_data_catalogue()

            if beginning_with not in list(stn_data_catalogue[self.StnKey].keys()):
                if verbose == 2:
                    print("No data is available.")

            else:
                try:
                    source = requests.get(url, headers=fake_requests_headers())
                except requests.exceptions.ConnectionError:
                    print("Failed.") if verbose == 2 else ""
                    print_conn_err(verbose=verbose)
                else:
                    try:
                        records, header = parse_table(source, parser='lxml')

                        # Create a DataFrame of the requested table
                        dat = [[x.replace('=', 'See').strip('\xa0') for x in i]
                               for i in records]
                        col = [re.sub(r'\n?\r+\n?', ' ', h) for h in header]
                        stn_dat = pd.DataFrame(dat, columns=col)

                        temp_degree = stn_dat['Degrees Longitude'].str.split(' ')
                        temp_degree_len = temp_degree.map(len).sum()
                        temp_elr = stn_dat['ELR'].map(
                            lambda x: x.split(' ') if not re.match('^[Ss]ee ', x) else [x])
                        temp_elr_len = temp_elr.map(len).sum()

                        if max(temp_degree_len, temp_elr_len) > len(stn_dat):
                            temp_col = ['ELR', 'Degrees Longitude', 'Degrees Latitude',
                                        'Grid Reference']
                            idx = [j for j in stn_dat.index
                                   if max(len(temp_degree[j]), len(temp_elr[j])) > 1]

                            temp_vals = []
                            for i in idx:
                                t = max(len(temp_degree[i]), len(temp_elr[i]))
                                temp_val = []
                                for c in col:
                                    x_ = stn_dat.loc[i, c]
                                    if c in temp_col:
                                        y = x_.split(' ')
                                        if len(y) == 1:
                                            y = y * t
                                        temp_val.append(y)
                                    elif c == 'Mileage':
                                        y = re.findall(r'\d+m \d+ch|\d+\.\d+km|\w+', x_)
                                        if len(y) > t:
                                            y = re.findall(r'\d+m \d+ch', x_)
                                        temp_val.append(y)
                                    else:
                                        temp_val.append([x_] * t)
                                temp_vals.append(pd.DataFrame(
                                    np.array(temp_val, dtype=object).T, columns=col))

                            stn_dat.drop(idx, axis='index', inplace=True)
                            stn_dat = pd.concat(
                                [stn_dat] + temp_vals, axis=0, ignore_index=True)

                        stn_dat.sort_values(['Station'], inplace=True)
                        stn_dat.index = range(len(stn_dat))

                        degrees_col = ['Degrees Longitude', 'Degrees Latitude']
                        stn_dat[degrees_col] = stn_dat[degrees_col].applymap(
                            self._parse_degrees)
                        stn_dat['Grid Reference'] = stn_dat['Grid Reference'].map(
                            lambda x: x.replace('c.', '') if x.startswith('c.') else x)

                        stn_dat[['Station', 'Station_Note']] = stn_dat.Station.map(
                            parse_location_name).apply(pd.Series)

                        # Owners
                        owners = self.extended_info(stn_dat.Owner, name='Owner')
                        stn_dat.drop('Owner', axis=1, inplace=True)
                        stn_dat = stn_dat.join(owners)

                        # Operators
                        operators = self.extended_info(stn_dat.Operator, name='Operator')
                        stn_dat.drop('Operator', axis=1, inplace=True)
                        stn_dat = stn_dat.join(operators)

                        last_updated_date = get_last_updated_date(url)

                        railway_station_data.update({beginning_with: stn_dat,
                                                     self.LUDKey: last_updated_date})

                        print("Done.") if verbose == 2 else ""

                        save_pickle(railway_station_data, path_to_pickle, verbose=verbose)

                    except Exception as e:
                        print("Failed. {}".format(e))

        return railway_station_data

    def fetch_station_data(self, update=False, pickle_it=False, data_dir=None,
                           verbose=False):
        """
        Fetch `railway station data
        <http://www.railwaycodes.org.uk/stations/station0.shtm>`_
        from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: railway station data
            (including the station name, ELR, mileage, status, owner, operator,
            degrees of longitude and latitude, and grid reference) and
            date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Stations

            >>> stn = Stations()

            >>> # rail_stn_data = stn.fetch_station_data(update=True, verbose=True)
            >>> rail_stn_data = stn.fetch_station_data()

            >>> type(rail_stn_data)
            dict

            >>> list(rail_stn_data.keys())
            ['Railway station data', 'Last updated date']

            >>> rail_stn_dat = rail_stn_data['Railway station data']

            >>> type(rail_stn_dat)
            pandas.core.frame.DataFrame

            >>> print(rail_stn_dat.head())
                      Station   ELR  ... Prev_Operator_6 Prev_Operator_Period_6
            2606               MRL1  ...            None                   None
            723                 TAT  ...            None                   None
            89                  ABD  ...            None                   None
            90                  CAM  ...            None                   None
            85     Abbey Wood   NKL  ...            None                   None

            [5 rows x 32 columns]
        """

        verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True)

        data_sets = [
            self.collect_station_data_by_initial(
                x, update=update, verbose=verbose_ if is_internet_connected() else False)
            for x in string.ascii_lowercase]

        if all(d[x] is None for d, x in zip(data_sets, string.ascii_uppercase)):
            if update:
                print_conn_err(verbose=verbose)
                print("No data of the {} has been freshly collected.".format(
                    self.StnKey.lower()))
            data_sets = [
                self.collect_station_data_by_initial(x, update=False, verbose=verbose_)
                for x in string.ascii_lowercase]

        stn_dat_tbl_ = (item[x] for item, x in zip(data_sets, string.ascii_uppercase))
        stn_dat_tbl = sorted([x for x in stn_dat_tbl_ if x is not None],
                             key=lambda x: x.shape[1], reverse=True)
        stn_data = pd.concat(stn_dat_tbl, axis=0, ignore_index=True, sort=False)

        stn_data = stn_data.where(pd.notna(stn_data), None)
        stn_data.sort_values(['Station'], inplace=True)

        last_updated_dates = (d[self.LUDKey] for d in data_sets)
        latest_update_date = max(d for d in last_updated_dates if d is not None)

        railway_station_data = {self.StnKey: stn_data, self.LUDKey: latest_update_date}

        if pickle_it and data_dir:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(self.CurrentDataDir, self.StnPickle + ".pickle")
            save_pickle(railway_station_data, path_to_pickle, verbose=verbose)

        return railway_station_data
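
# A minimal usage sketch (added for illustration, not part of the published
# module): the first run needs an internet connection; subsequent runs are
# served from the pickled backup under the package data directory.
if __name__ == '__main__':
    stations = Stations()

    # Collect data for stations beginning with 'A' (from backup if available)
    sa = stations.collect_station_data_by_initial(initial='a')
    print(sa[stations.LUDKey])  # date the 'A' page was last updated

    # Fetch the full A-Z table and report its dimensions
    rail_stn_data = stations.fetch_station_data()
    print(rail_stn_data[stations.StnKey].shape)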