# Source code for the ``line_name`` module

"""
Collect
British `railway line names <http://www.railwaycodes.org.uk/misc/line_names.shtm>`_.
"""

import copy
import os
import re
import urllib.parse

import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import confirmed, fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
    parse_table, print_conn_err, is_internet_connected, print_connection_error


class LineNames:
    """
    A class for collecting British railway line names.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function
        runs, defaults to ``True``
    :type verbose: bool or int

    **Example**::

        >>> from pyrcs.line_data import LineNames

        >>> ln = LineNames()

        >>> print(ln.Name)
        Railway line names

        >>> print(ln.SourceURL)
        http://www.railwaycodes.org.uk/misc/line_names.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.
        """
        # Warn (rather than fail) when offline; data may still be loadable from backup.
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway line names'
        self.Key = 'Line names'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/misc/line_names.shtm')

        self.LUDKey = 'Last updated date'
        self.Date = get_last_updated_date(
            url=self.SourceURL, parsed=True, as_date_type=False)

        self.Catalogue = get_catalogue(
            page_url=self.SourceURL, update=update, confirmation_required=False)

        # Default data directory: "dat\line-data\line-names" inside the package.
        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("line-data", self.Key.lower().replace(" ", "-"))
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_ln(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\line-data\\line-names"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``LineNames``
        :rtype: str

        :meta private:
        """
        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path
[docs] def collect_line_names(self, confirmation_required=True, verbose=False): """ Collect data of railway line names from source web page. :param confirmation_required: whether to require users to confirm and proceed, defaults to ``True`` :type confirmation_required: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool :return: railway line names and routes data and date of when the data was last updated :rtype: dict or None **Example**:: >>> from pyrcs.line_data import LineNames >>> ln = LineNames() >>> line_names_dat = ln.collect_line_names(confirmation_required=False) >>> type(line_names_dat) <class 'dict'> >>> print(list(line_names_dat.keys())) ['Line names', 'Last updated date'] """ if confirmed("To collect British railway {}?".format(self.Key.lower()), confirmation_required=confirmation_required): if verbose == 2: print("Collecting the railway {}".format(self.Key.lower()), end=" ... ") line_names_data = None try: source = requests.get(self.SourceURL, headers=fake_requests_headers()) except requests.ConnectionError: print("Failed. 
") if verbose == 2 else "" print_conn_err(verbose=verbose) else: try: row_lst, header = parse_table(source, parser='lxml') line_names = pd.DataFrame( [[r.replace('\xa0', '').strip() for r in row] for row in row_lst], columns=header) # Parse route column def parse_route_column(x): if 'Watford - Euston suburban route' in x: route, route_note = 'Watford - Euston suburban route', x elif ', including Moorgate - Farringdon' in x: route_note = 'including Moorgate - Farringdon' route = x.replace(', including Moorgate - Farringdon', '') elif re.match(r'.+(?= \[\')', x): route, route_note = re.split(r' \[\'\(?', x) route_note = route_note.strip(")']") elif re.match(r'.+\)$', x): if re.match(r'.+(?= - \()', x): route, route_note = x, None else: route, route_note = re.split(r' \(\[?\'?', x) route_note = route_note.rstrip('\'])') else: route, route_note = x, None return route, route_note line_names[['Route', 'Route_note']] = \ line_names.Route.map(parse_route_column).apply(pd.Series) last_updated_date = get_last_updated_date(self.SourceURL) line_names_data = {self.Key: line_names, self.LUDKey: last_updated_date} print("Done. ") if verbose == 2 else "" pickle_filename = self.Key.lower().replace(" ", "-") + ".pickle" path_to_pickle = self._cdd_ln(pickle_filename) save_pickle(line_names_data, path_to_pickle, verbose=verbose) except Exception as e: print("Failed. {}".format(e)) return line_names_data
[docs] def fetch_line_names(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ Fetch data of railway line names from local backup. :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param pickle_it: whether to replace the current package data with newly collected data, defaults to ``False`` :type pickle_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str or None :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool :return: railway line names and routes data and date of when the data was last updated :rtype: dict **Example**:: >>> from pyrcs.line_data import LineNames >>> ln = LineNames() >>> line_names_dat = ln.fetch_line_names() >>> type(line_names_dat) <class 'dict'> >>> print(list(line_names_dat.keys())) ['Line names', 'Last updated date'] """ pickle_filename = self.Key.lower().replace(" ", "-") + ".pickle" path_to_pickle = self._cdd_ln(pickle_filename) if os.path.isfile(path_to_pickle) and not update: line_names_data = load_pickle(path_to_pickle) else: verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True) line_names_data = self.collect_line_names(confirmation_required=False, verbose=verbose_) if line_names_data: # line-names is not None if pickle_it and data_dir: self.CurrentDataDir = validate_input_data_dir(data_dir) path_to_pickle = os.path.join(self.CurrentDataDir, pickle_filename) save_pickle(line_names_data, path_to_pickle, verbose=verbose) else: print("No data of the railway {} has been freshly collected.".format( self.Key.lower())) line_names_data = load_pickle(path_to_pickle) return line_names_data