
"""
Collect codes of
`railway viaducts <http://www.railwaycodes.org.uk/tunnels/tunnels0.shtm>`_.
"""

import copy
import itertools
import os
import re
import urllib.parse

import pandas as pd
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.store import load_pickle, save_pickle
from pyhelpers.text import find_similar_str

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url


class Viaducts:
    """
    A class for collecting railway viaducts.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str, None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool

    **Example**::

        >>> from pyrcs.other_assets import Viaducts

        >>> viaducts = Viaducts()

        >>> print(viaducts.Name)
        Railway viaducts

        >>> print(viaducts.SourceURL)
        http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm
    """

    def __init__(self, data_dir=None, update=False):
        """
        Constructor method.
        """
        self.Name = 'Railway viaducts'
        self.Key = 'Viaducts'
        self.LUDKey = 'Last updated date'
        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/viaducts/viaducts0.shtm')
        self.Catalogue = \
            get_catalogue(self.SourceURL, update=update, confirmation_required=False)
        self.P1Key, self.P2Key, self.P3Key, self.P4Key, self.P5Key, self.P6Key = \
            list(self.Catalogue.keys())[1:]
        self.Date = get_last_updated_date(self.SourceURL, parsed=True,
                                          as_date_type=False)
        self.DataDir = validate_input_data_dir(data_dir) if data_dir \
            else cd_dat("other-assets", self.Key.lower())
        self.CurrentDataDir = copy.copy(self.DataDir)
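    # A minimal sketch of how ``SourceURL`` is assembled in the constructor above;
    # the homepage value here is written out literally for illustration (in the
    # constructor it comes from ``homepage_url()``):
    #
    #   >>> import urllib.parse
    #   >>> home_url = 'http://www.railwaycodes.org.uk'
    #   >>> urllib.parse.urljoin(home_url, '/viaducts/viaducts0.shtm')
    #   'http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm'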
    def cdd_viaducts(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\other-assets\\viaducts"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``Viaducts``
        :rtype: str

        :meta private:
        """

        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path
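    # A hypothetical sketch of the paths ``cdd_viaducts`` produces; the absolute
    # prefix depends on where the package data directory lives on a given machine,
    # so the output below is illustrative only:
    #
    #   >>> viaducts = Viaducts()
    #   >>> viaducts.cdd_viaducts("page-1-a-c.pickle")  # doctest: +SKIP
    #   '<...>\\dat\\other-assets\\viaducts\\page-1-a-c.pickle'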
    def collect_railway_viaducts_by_page(self, page_no, update=False, verbose=False):
        """
        Collect data of railway viaducts for a given page number from source web page.

        :param page_no: page number;
            valid values include ``1``, ``2``, ``3``, ``4``, ``5``, and ``6``
        :type page_no: int, str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data of the given ``page_no`` and
            date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> viaducts = Viaducts()

            >>> viaducts_1 = viaducts.collect_railway_viaducts_by_page(page_no=1)

            >>> type(viaducts_1)
            <class 'dict'>
            >>> print(list(viaducts_1.keys()))
            ['Page 1 (A-C)', 'Last updated date']
        """

        assert page_no in range(1, 7), \
            "Valid \"page_no\" must be one of 1, 2, 3, 4, 5, and 6."

        page_name = find_similar_str(str(page_no), list(self.Catalogue.keys()))

        pickle_filename = re.sub(
            r"[()]", "", re.sub(r"[ -]", "-", page_name)).lower() + ".pickle"
        path_to_pickle = self.cdd_viaducts(pickle_filename)

        if os.path.isfile(path_to_pickle) and not update:
            page_railway_viaducts = load_pickle(path_to_pickle)

        else:
            url = self.Catalogue[page_name]

            try:
                last_updated_date = get_last_updated_date(url)
            except Exception as e:
                print("Failed to find the last updated date for viaducts data of "
                      "{}. {}".format(page_name, e))
                last_updated_date = None

            try:
                header, viaducts_table = \
                    pd.read_html(url, na_values=[''], keep_default_na=False)
                viaducts_table.columns = header.columns.to_list()
                viaducts_table.fillna('', inplace=True)
            except Exception as e:
                print("Failed to collect viaducts data of {}. {}".format(page_name, e))
                viaducts_table = None

            page_railway_viaducts = {page_name: viaducts_table,
                                     self.LUDKey: last_updated_date}

            save_pickle(page_railway_viaducts, path_to_pickle, verbose=verbose)

        return page_railway_viaducts
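    # A standalone worked example (standard library only) of the pickle-filename
    # derivation used above: spaces and hyphens become hyphens, parentheses are
    # dropped, and the result is lower-cased before ".pickle" is appended:
    #
    #   >>> import re
    #   >>> page_name = 'Page 1 (A-C)'
    #   >>> re.sub(r"[()]", "", re.sub(r"[ -]", "-", page_name)).lower() + ".pickle"
    #   'page-1-a-c.pickle'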
    def fetch_railway_viaducts(self, update=False, pickle_it=False, data_dir=None,
                               verbose=False):
        """
        Fetch data of railway viaducts from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data and date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> viaducts = Viaducts()

            >>> viaducts_data = viaducts.fetch_railway_viaducts()

            >>> type(viaducts_data)
            <class 'dict'>
            >>> print(list(viaducts_data.keys()))
            ['Viaducts', 'Last updated date']

            >>> viaducts_dat = viaducts_data['Viaducts']
            >>> type(viaducts_dat)
            <class 'dict'>
            >>> print(list(viaducts_dat.keys()))
            ['Page 1 (A-C)', 'Page 2 (D-G)', 'Page 3 (H-K)', 'Page 4 (L-P)',
             'Page 5 (Q-S)', 'Page 6 (T-Z)']
        """

        verbose_ = False if data_dir or not verbose else True

        codes = [self.collect_railway_viaducts_by_page(page_no, update, verbose=verbose_)
                 for page_no in range(1, 7)]

        railways_viaducts_data = {
            self.Key: {next(iter(x)): next(iter(x.values())) for x in codes},
            self.LUDKey: max(next(itertools.islice(iter(x.values()), 1, 2))
                             for x in codes)}

        if pickle_it and data_dir:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(
                self.CurrentDataDir, self.Key.lower().replace(" ", "-") + ".pickle")
            save_pickle(railways_viaducts_data, path_to_pickle, verbose=verbose)

        return railways_viaducts_data
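# A minimal standalone sketch of how ``fetch_railway_viaducts`` merges the per-page
# dictionaries: ``next(iter(x))`` picks each page's first key (the page name),
# ``itertools.islice(..., 1, 2)`` picks the second value (the last-updated date),
# and ``max(...)`` keeps the most recent date. The sample data below is made up
# purely for illustration:
#
#   >>> import itertools
#   >>> codes = [{'Page 1 (A-C)': 'table-1', 'Last updated date': '2020-01-01'},
#   ...          {'Page 2 (D-G)': 'table-2', 'Last updated date': '2020-03-01'}]
#   >>> {next(iter(x)): next(iter(x.values())) for x in codes}
#   {'Page 1 (A-C)': 'table-1', 'Page 2 (D-G)': 'table-2'}
#   >>> max(next(itertools.islice(iter(x.values()), 1, 2)) for x in codes)
#   '2020-03-01'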