Source code for viaduct

"""
Collect codes of `railway viaducts <http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm>`_.
"""

import copy
import itertools
import os
import re
import socket
import urllib.error
import urllib.parse

import pandas as pd
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.store import load_pickle, save_pickle
from pyhelpers.text import find_similar_str

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
    print_conn_err, is_internet_connected, print_connection_error


class Viaducts:
    """
    A class for collecting railway viaducts.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str, None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    :ivar str Name: name of the data
    :ivar str Key: key of the dict-type data
    :ivar str HomeURL: URL of the main homepage
    :ivar str SourceURL: URL of the data web page
    :ivar str LUDKey: key of the last updated date
    :ivar str LUD: last updated date
    :ivar dict Catalogue: catalogue of the data
    :ivar str DataDir: path to the data directory
    :ivar str CurrentDataDir: path to the current data directory
    :ivar str P1Key: key of the dict-type data of Page 1
    :ivar str P2Key: key of the dict-type data of Page 2
    :ivar str P3Key: key of the dict-type data of Page 3
    :ivar str P4Key: key of the dict-type data of Page 4
    :ivar str P5Key: key of the dict-type data of Page 5
    :ivar str P6Key: key of the dict-type data of Page 6

    **Example**::

        >>> from pyrcs.other_assets import Viaducts

        >>> vdct = Viaducts()

        >>> print(vdct.Name)
        Railway viaducts

        >>> print(vdct.SourceURL)
        http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.
        """
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway viaducts'
        self.Key = 'Viaducts'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/viaducts/viaducts0.shtm')

        self.LUDKey = 'Last updated date'
        self.LUD = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False)

        self.Catalogue = get_catalogue(
            page_url=self.SourceURL, update=update, confirmation_required=False)

        self.P1Key, self.P2Key, self.P3Key, self.P4Key, self.P5Key, self.P6Key = \
            list(self.Catalogue.keys())[1:]

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("other-assets", self.Key.lower())
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_vdct(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\other-assets\\viaducts"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of `os.makedirs`_, e.g. ``mode=0o777``
        :return: path to the backup data directory for ``Viaducts``
        :rtype: str

        .. _`os.makedirs`: https://docs.python.org/3/library/os.html#os.makedirs

        :meta private:
        """

        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path
    def collect_viaduct_codes_by_page(self, page_no, update=False, verbose=False):
        """
        Collect data of railway viaducts for a given page number from source web page.

        :param page_no: page number;
            valid values include ``1``, ``2``, ``3``, ``4``, ``5``, and ``6``
        :type page_no: int, str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data of the given ``page_no`` and
            date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> vdct = Viaducts()

            >>> # vd1 = vdct.collect_viaduct_codes_by_page(1, update=True, verbose=True)
            >>> vd1 = vdct.collect_viaduct_codes_by_page(page_no=1)

            >>> type(vd1)
            dict
            >>> list(vd1.keys())
            ['Page 1 (A-C)', 'Last updated date']

            >>> viaducts_1 = vd1['Page 1 (A-C)']
            >>> print(viaducts_1.head())
                    Name  ... Spans
            0   7 Arches  ...     7
            1    36 Arch  ...    36
            2    42 Arch  ...
            3      A6120  ...
            4       A698  ...

            [5 rows x 7 columns]
        """

        assert page_no in range(1, 7), \
            "Valid \"page_no\" must be one of 1, 2, 3, 4, 5, and 6."

        page_name = find_similar_str(str(page_no), list(self.Catalogue.keys()))

        pickle_filename = re.sub(r"[()]", "", re.sub(r"[ -]", "-", page_name)).lower() + ".pickle"
        path_to_pickle = self._cdd_vdct(pickle_filename)

        if os.path.isfile(path_to_pickle) and not update:
            page_railway_viaducts = load_pickle(path_to_pickle)

        else:
            url = self.Catalogue[page_name]

            page_railway_viaducts = None

            if verbose == 2:
                print("Collecting data of {} on {}".format(self.Key.lower(), page_name),
                      end=" ... ")

            try:
                header, viaducts_table = pd.read_html(url, na_values=[''], keep_default_na=False)

            except (urllib.error.URLError, socket.gaierror):
                print("Failed. ") if verbose == 2 else ""
                print_conn_err(verbose=verbose)

            else:
                try:
                    viaducts_table.columns = header.columns.to_list()
                    viaducts_table.fillna('', inplace=True)

                    last_updated_date = get_last_updated_date(url)

                    print("Done.") if verbose == 2 else ""

                    page_railway_viaducts = {page_name: viaducts_table,
                                             self.LUDKey: last_updated_date}

                    save_pickle(page_railway_viaducts, path_to_pickle, verbose=verbose)

                except Exception as e:
                    print("Failed. {}".format(e))

        return page_railway_viaducts
    def fetch_viaduct_codes(self, update=False, pickle_it=False, data_dir=None, verbose=False):
        """
        Fetch data of railway viaducts from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data and date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> vdct = Viaducts()

            >>> # viaducts_codes = vdct.fetch_viaduct_codes(update=True, verbose=True)
            >>> viaducts_codes = vdct.fetch_viaduct_codes()

            >>> type(viaducts_codes)
            dict
            >>> list(viaducts_codes.keys())
            ['Viaducts', 'Last updated date']

            >>> viaducts_dat = viaducts_codes['Viaducts']
            >>> type(viaducts_dat)
            dict
            >>> list(viaducts_dat.keys())
            ['Page 1 (A-C)', 'Page 2 (D-G)', 'Page 3 (H-K)', 'Page 4 (L-P)',
             'Page 5 (Q-S)', 'Page 6 (T-Z)']

            >>> viaducts_dat_6 = viaducts_dat['Page 6 (T-Z)']
            >>> print(viaducts_dat_6.head())
                     Name  ... Spans
            0        Taff  ...
            1        Taff  ...
            2  Taff River  ...
            3  Taffs Well  ...
            4        Tame  ...     4

            [5 rows x 7 columns]
        """

        verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True)

        page_data = [
            self.collect_viaduct_codes_by_page(
                page_no, update, verbose=verbose_ if is_internet_connected() else False)
            for page_no in range(1, 7)]

        if all(x is None for x in page_data):
            if update:
                print_conn_err(verbose=verbose)
                print("No data of the {} has been freshly collected.".format(self.Key.lower()))
            page_data = [self.collect_viaduct_codes_by_page(x, update=False, verbose=verbose_)
                         for x in range(1, 7)]

        railways_viaducts_data = {
            self.Key: {next(iter(x)): next(iter(x.values())) for x in page_data},
            self.LUDKey: max(next(itertools.islice(iter(x.values()), 1, 2)) for x in page_data)}

        if pickle_it and data_dir:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(
                self.CurrentDataDir, self.Key.lower().replace(" ", "-") + ".pickle")
            save_pickle(railways_viaducts_data, path_to_pickle, verbose=verbose)

        return railways_viaducts_data
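
# A minimal usage sketch, not part of the original module: it assumes pyrcs and its
# dependencies are installed and that an internet connection is available on the
# first run; later runs fall back to the locally pickled backups created above.
if __name__ == '__main__':
    viaducts = Viaducts(verbose=True)

    # Collect a single page (pages 1-6 split the viaducts alphabetically).
    vd1 = viaducts.collect_viaduct_codes_by_page(page_no=1)
    print(list(vd1.keys()))

    # Fetch all six pages combined into one dictionary keyed by 'Viaducts'.
    viaducts_codes = viaducts.fetch_viaduct_codes()
    print(list(viaducts_codes[viaducts.Key].keys()))
    print(viaducts_codes[viaducts.LUDKey])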