Source code for viaduct

"""
Collect codes of `railway viaducts <http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm>`_.
"""

import copy
import itertools
import os
import re
import socket
import urllib.error
import urllib.parse

import pandas as pd
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.store import load_pickle, save_pickle
from pyhelpers.text import find_similar_str

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
    print_conn_err, is_internet_connected, print_connection_error


class Viaducts:
    """
    A class for collecting railway viaducts.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str, None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    :ivar str Name: name of the data
    :ivar str Key: key of the dict-type data
    :ivar str HomeURL: URL of the main homepage
    :ivar str SourceURL: URL of the data web page
    :ivar str LUDKey: key of the last updated date
    :ivar str LUD: last updated date
    :ivar dict Catalogue: catalogue of the data
    :ivar str DataDir: path to the data directory
    :ivar str CurrentDataDir: path to the current data directory
    :ivar str P1Key: key of the dict-type data of Page 1
    :ivar str P2Key: key of the dict-type data of Page 2
    :ivar str P3Key: key of the dict-type data of Page 3
    :ivar str P4Key: key of the dict-type data of Page 4
    :ivar str P5Key: key of the dict-type data of Page 5
    :ivar str P6Key: key of the dict-type data of Page 6

    **Example**::

        >>> from pyrcs.other_assets import Viaducts

        >>> vdct = Viaducts()

        >>> print(vdct.Name)
        Railway viaducts

        >>> print(vdct.SourceURL)
        http://www.railwaycodes.org.uk/viaducts/viaducts0.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.
        """
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway viaducts'
        self.Key = 'Viaducts'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/viaducts/viaducts0.shtm')

        self.LUDKey = 'Last updated date'
        self.LUD = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False)

        self.Catalogue = get_catalogue(
            page_url=self.SourceURL, update=update, confirmation_required=False)

        self.P1Key, self.P2Key, self.P3Key, self.P4Key, self.P5Key, self.P6Key = \
            list(self.Catalogue.keys())[1:]

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("other-assets", self.Key.lower())
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_vdct(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\other-assets\\viaducts"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of `os.makedirs`_, e.g. ``mode=0o777``
        :return: path to the backup data directory for ``Viaducts``
        :rtype: str

        .. _`os.makedirs`: https://docs.python.org/3/library/os.html#os.makedirs

        :meta private:
        """

        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path
    def collect_viaduct_codes_by_page(self, page_no, update=False, verbose=False):
        """
        Collect data of railway viaducts for a given page number from source web page.

        :param page_no: page number;
            valid values include ``1``, ``2``, ``3``, ``4``, ``5``, and ``6``
        :type page_no: int, str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data of the given ``page_no`` and
            date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> vdct = Viaducts()

            >>> # vd1 = vdct.collect_viaduct_codes_by_page(1, update=True, verbose=True)
            >>> vd1 = vdct.collect_viaduct_codes_by_page(page_no=1)

            >>> type(vd1)
            dict
            >>> list(vd1.keys())
            ['Page 1 (A-C)', 'Last updated date']

            >>> viaducts_1 = vd1['Page 1 (A-C)']
            >>> print(viaducts_1.head())
                    Name  ... Spans
            0   7 Arches  ...     7
            1    36 Arch  ...    36
            2    42 Arch  ...
            3      A6120  ...
            4       A698  ...

            [5 rows x 7 columns]
        """

        assert page_no in range(1, 7), \
            "Valid \"page_no\" must be one of 1, 2, 3, 4, 5, and 6."

        page_name = find_similar_str(str(page_no), list(self.Catalogue.keys()))

        pickle_filename = re.sub(r"[()]", "", re.sub(r"[ -]", "-", page_name)).lower() + ".pickle"
        path_to_pickle = self._cdd_vdct(pickle_filename)

        if os.path.isfile(path_to_pickle) and not update:
            page_railway_viaducts = load_pickle(path_to_pickle)

        else:
            url = self.Catalogue[page_name]

            page_railway_viaducts = None

            if verbose == 2:
                print("Collecting data of {} on {}".format(self.Key.lower(), page_name),
                      end=" ... ")

            try:
                header, viaducts_table = pd.read_html(url, na_values=[''], keep_default_na=False)

            except (urllib.error.URLError, socket.gaierror):
                print("Failed. ") if verbose == 2 else ""
                print_conn_err(verbose=verbose)

            else:
                try:
                    viaducts_table.columns = header.columns.to_list()
                    viaducts_table.fillna('', inplace=True)

                    last_updated_date = get_last_updated_date(url)

                    print("Done.") if verbose == 2 else ""

                    page_railway_viaducts = {page_name: viaducts_table,
                                             self.LUDKey: last_updated_date}

                    save_pickle(page_railway_viaducts, path_to_pickle, verbose=verbose)

                except Exception as e:
                    print("Failed. {}".format(e))

        return page_railway_viaducts
    def fetch_viaduct_codes(self, update=False, pickle_it=False, data_dir=None, verbose=False):
        """
        Fetch data of railway viaducts from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool
        :return: railway viaducts data and date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.other_assets import Viaducts

            >>> vdct = Viaducts()

            >>> # viaducts_codes = vdct.fetch_viaduct_codes(update=True, verbose=True)
            >>> viaducts_codes = vdct.fetch_viaduct_codes()

            >>> type(viaducts_codes)
            dict
            >>> list(viaducts_codes.keys())
            ['Viaducts', 'Last updated date']

            >>> viaducts_dat = viaducts_codes['Viaducts']
            >>> type(viaducts_dat)
            dict
            >>> list(viaducts_dat.keys())
            ['Page 1 (A-C)', 'Page 2 (D-G)', 'Page 3 (H-K)', 'Page 4 (L-P)',
             'Page 5 (Q-S)', 'Page 6 (T-Z)']

            >>> viaducts_dat_6 = viaducts_dat['Page 6 (T-Z)']
            >>> print(viaducts_dat_6.head())
                     Name  ... Spans
            0        Taff  ...
            1        Taff  ...
            2  Taff River  ...
            3  Taffs Well  ...
            4        Tame  ...     4

            [5 rows x 7 columns]
        """

        verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True)

        page_data = [
            self.collect_viaduct_codes_by_page(
                page_no, update, verbose=verbose_ if is_internet_connected() else False)
            for page_no in range(1, 7)]

        if all(x is None for x in page_data):
            if update:
                print_conn_err(verbose=verbose)
                print("No data of the {} has been freshly collected.".format(self.Key.lower()))
            page_data = [self.collect_viaduct_codes_by_page(x, update=False, verbose=verbose_)
                         for x in range(1, 7)]

        railways_viaducts_data = {
            self.Key: {next(iter(x)): next(iter(x.values())) for x in page_data},
            self.LUDKey: max(next(itertools.islice(iter(x.values()), 1, 2)) for x in page_data)}

        if pickle_it and data_dir:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(
                self.CurrentDataDir, self.Key.lower().replace(" ", "-") + ".pickle")
            save_pickle(railways_viaducts_data, path_to_pickle, verbose=verbose)

        return railways_viaducts_data
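
# A minimal usage sketch, not part of the original module: it assumes pyrcs and its
# dependencies are installed and that an internet connection is available on the
# first run; later runs fall back to the locally pickled backups created above.
if __name__ == '__main__':
    viaducts = Viaducts(verbose=True)

    # Collect a single page (pages 1-6 split the viaducts alphabetically).
    vd1 = viaducts.collect_viaduct_codes_by_page(page_no=1)
    print(list(vd1.keys()))

    # Fetch all six pages combined into one dictionary keyed by 'Viaducts'.
    viaducts_codes = viaducts.fetch_viaduct_codes()
    print(list(viaducts_codes[viaducts.Key].keys()))
    print(viaducts_codes[viaducts.LUDKey])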