Source code for pyrcs.other_assets.stations

""" Collecting railway station data.

Data source: http://www.railwaycodes.org.uk/stations/station0.shtm

.. todo::

   Bilingual station names
   Sponsored stations
   Stations not served by their Station Facility Operator (SFO)
   International stations
   Station trivia
"""

import copy
import itertools
import os
import re
import string
import urllib.parse

import numpy as np
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle

from pyrcs.utils import cd_dat, get_last_updated_date, get_station_data_catalogue, homepage_url, parse_location_name, \
    parse_table


[docs]class Stations: """ A class for collecting railway station data. :param data_dir: name of data directory, defaults to ``None`` :type data_dir: str, None :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool **Example**:: from pyrcs.other_assets import Stations stn = Stations() print(stn.Name) # Stations print(stn.SourceURL) # http://www.railwaycodes.org.uk/stations/station0.shtm """ def __init__(self, data_dir=None, update=False): """ Constructor method. """ self.Name = 'Stations' self.HomeURL = homepage_url() self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/stations/station0.shtm') self.StnKey = 'Railway station data' self.BilingualKey = 'Bilingual names' self.SpStnNameSignKey = 'Sponsored signs' self.NSFOKey = 'Not served by SFO' self.IntlKey = 'International' self.TriviaKey = 'Trivia' self.ARKey = 'Access rights' self.BarrierErrKey = 'Barrier error codes' self.LUDKey = 'Last updated date' # key to last updated date self.Catalogue = get_station_data_catalogue(self.SourceURL, self.StnKey, update=update) self.Date = get_last_updated_date(self.SourceURL, parsed=True, as_date_type=False) self.DataDir = validate_input_data_dir(data_dir) if data_dir else cd_dat("other-assets", self.Name.lower()) self.CurrentDataDir = copy.copy(self.DataDir) def cdd_stn(self, *sub_dir, **kwargs): """ Change directory to "dat\\other-assets\\stations\\" and sub-directories (and/or a file) :param sub_dir: sub-directory or sub-directories (and/or a file) :type sub_dir: str :param kwargs: optional parameters of `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_, e.g. ``mode=0o777`` :return: path to the backup data directory for ``Stations`` :rtype: str :meta private: """ path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs) return path
[docs] @staticmethod def parse_current_operator(x): """ Parse 'Operator' column :param x: :return: """ contents = re.split(r'\\r| \[\'|\\\\r| {2}\'\]|\', \'|\\n', x.lstrip(' [\'').rstrip(' \']').lstrip('\n').strip()) contents = [x for x in contents if x != ''] operators = [] for y in contents: # Operators names operator_name = re.search(r'.*(?= \(from \d+ \w+ \d+(.*)?\))', y) operator_name = operator_name.group() if operator_name is not None else '' # Start dates start_date = re.search(r'(?<= \(from )\d+ \w+ \d+( to \d+ \w+ \d+(.*))?(?=\))', y) start_date = start_date.group() if start_date is not None else '' # Form a tuple operators.append((operator_name, start_date)) return operators
[docs] def collect_railway_station_data_by_initial(self, initial, update=False, verbose=False): """ Collect railway station data for the given ``initial`` letter. :param initial: initial letter of station data (including the station name, ELR, mileage, status, owner, operator, degrees of longitude and latitude, and grid reference) for specifying URL :type initial: str :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool, int :return: railway station data for the given ``initial`` letter; and date of when the data was last updated :rtype: dict **Example**:: from pyrcs.other_assets import Stations stn = Stations() update = False initial = 'a' railway_station_data_a = stn.collect_railway_station_data_by_initial(initial, update) print(railway_station_data_a) # {'A': <codes>, # 'Last updated date': <date>} """ path_to_pickle = self.cdd_stn("a-z", initial.lower() + ".pickle") if os.path.isfile(path_to_pickle) and not update: railway_station_data = load_pickle(path_to_pickle) else: url = self.SourceURL.replace('station0', 'station{}'.format(initial.lower())) if initial.upper() not in list(self.Catalogue[self.StnKey].keys()): print("No data is available for signal box codes beginning with \"{}\".".format(initial.upper())) railway_station_table, last_updated_date = None, None else: try: source = requests.get(url, headers=fake_requests_headers()) # Request to get connected to the url records, header = parse_table(source, parser='lxml') # Create a DataFrame of the requested table dat = [[x.replace('=', 'See').strip('\xa0') for x in i] for i in records] col = [re.sub(r'\n?\r+\n?', ' ', h) for h in header] railway_station_table = pd.DataFrame(dat, columns=col) def parse_degrees(x): if x == '': y = np.nan else: y = float(x.replace('c.', '') if x.startswith('c.') else x) return y railway_station_table[['Degrees Longitude', 'Degrees Latitude']] = \ railway_station_table[['Degrees Longitude', 'Degrees Latitude']].applymap(parse_degrees) railway_station_table['Grid Reference'] = railway_station_table['Grid Reference'].map( lambda x: x.replace('c.', '') if x.startswith('c.') else x) railway_station_table[['Station', 'Station_Note']] = \ railway_station_table.Station.map(parse_location_name).apply(pd.Series) # Operator temp = list(railway_station_table.Operator.map(self.parse_current_operator)) length = len(max(temp, key=len)) col_names_current = ['Operator', 'Date'] prev_no = list( itertools.chain.from_iterable(itertools.repeat(x, 2) for x in list(range(1, length)))) col_names = zip(col_names_current * (length - 1), prev_no) col_names = col_names_current + ['_'.join(['Prev', x, str(d)]) for x, d in col_names] for i in range(len(temp)): if len(temp[i]) < length: temp[i] += [(None, None)] * (length - len(temp[i])) temp = pd.DataFrame(temp) operators = [pd.DataFrame(temp)[col].apply(pd.Series) for col in temp.columns] operators = pd.concat(operators, axis=1, sort=False) operators.columns = col_names railway_station_table.drop('Operator', axis=1, inplace=True) railway_station_table = railway_station_table.join(operators) except Exception as e: print("Failed to collect station location codes beginning with \"{}\". {}".format( initial.upper(), e)) railway_station_table = None try: last_updated_date = get_last_updated_date(url) except Exception as e: print("Failed to find the last updated date of the station location codes beginning with " "\"{}\" {}".format(initial.upper(), e)) last_updated_date = None railway_station_data = {initial.upper(): railway_station_table, self.LUDKey: last_updated_date} save_pickle(railway_station_data, path_to_pickle, verbose=verbose) return railway_station_data
[docs] def fetch_railway_station_data(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ Fetch railway station data from local backup. :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param pickle_it: whether to replace the current package data with newly collected data, defaults to ``False`` :type pickle_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str, None :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool, int :return: railway station data (incl. the station name, ELR, mileage, status, owner, operator, degrees of longitude and latitude, and grid reference) and date of when the data was last updated :rtype: dict **Example**:: from pyrcs.other_assets import Stations stn = Stations() update = False pickle_it = False data_dir = None railway_station_data = stn.fetch_railway_station_data(update, pickle_it, data_dir) print(railway_station_data) # {'Railway station data': <codes>, # 'Latest update date': <date>} """ verbose_ = False if data_dir or not verbose else True data_sets = [self.collect_railway_station_data_by_initial(x, update, verbose_) for x in string.ascii_lowercase] railway_station_tables = (item[x] for item, x in zip(data_sets, string.ascii_uppercase)) railway_station_data_ = pd.concat(railway_station_tables, axis=0, ignore_index=True, sort=False) last_updated_dates = (d[self.LUDKey] for d in data_sets) latest_update_date = max(d for d in last_updated_dates if d is not None) railway_station_data = {self.StnKey: railway_station_data_, self.LUDKey: latest_update_date} if pickle_it and data_dir: self.CurrentDataDir = validate_input_data_dir(data_dir) path_to_pickle = os.path.join(self.CurrentDataDir, self.StnKey.lower().replace(" ", "-") + ".pickle") save_pickle(railway_station_data, path_to_pickle, verbose=verbose) return railway_station_data