# Source code for line_name

"""
Collect
British `railway line names <http://www.railwaycodes.org.uk/misc/line_names.shtm>`_.
"""

import copy
import os
import re
import urllib.parse

import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import confirmed, fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle

from pyrcs.utils import cd_dat, get_catalogue, get_last_updated_date, homepage_url, \
    parse_table, print_conn_err, is_internet_connected, print_connection_error


class LineNames:
    """
    A class for collecting British railway line names.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    **Example**::

        >>> from pyrcs.line_data import LineNames

        >>> ln = LineNames()

        >>> print(ln.Name)
        Railway line names

        >>> print(ln.SourceURL)
        http://www.railwaycodes.org.uk/misc/line_names.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.
        """
        # Being offline only triggers a message here; construction carries on.
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway line names'
        # Also used as the dictionary key of the collected data and
        # (lower-cased, hyphenated) as the folder and pickle file name.
        self.Key = 'Line names'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/misc/line_names.shtm')

        self.LUDKey = 'Last updated date'
        self.Date = get_last_updated_date(
            url=self.SourceURL, parsed=True, as_date_type=False)

        self.Catalogue = get_catalogue(
            page_url=self.SourceURL, update=update, confirmation_required=False)

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("line-data", self.Key.lower().replace(" ", "-"))
        # Starts as a copy of ``DataDir``; ``fetch_line_names`` re-points it
        # when asked to pickle into a different folder.
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _cdd_ln(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\line-data\\line-names"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``LineNames``
        :rtype: str

        :meta private:
        """

        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path

    @staticmethod
    def _parse_route_column(x):
        """
        Split one cell of the 'Route' column into a route and an optional note.

        Cells that match none of the recognised layouts (or that cannot be
        split into exactly a route and a note) are returned unchanged with
        ``None`` as the note, so that a single unexpected cell does not abort
        the whole collection.

        :param x: raw text of a 'Route' cell
        :type x: str
        :return: route and route note (``None`` if there is no note)
        :rtype: tuple

        **Example**::

            >>> LineNames._parse_route_column('A - B (note)')
            ('A - B', 'note')
            >>> LineNames._parse_route_column('A - B')
            ('A - B', None)

        :meta private:
        """

        # Two hard-coded special cases of the source table.
        if 'Watford - Euston suburban route' in x:
            return 'Watford - Euston suburban route', x

        if ', including Moorgate - Farringdon' in x:
            return (x.replace(', including Moorgate - Farringdon', ''),
                    'including Moorgate - Farringdon')

        # Note rendered like a stringified list, e.g. "Route ['(note)']".
        if re.match(r'.+(?= \[\')', x):
            # The match above guarantees at least one separator, and
            # ``maxsplit=1`` guarantees no more than two parts.
            route, route_note = re.split(r' \[\'\(?', x, maxsplit=1)
            return route, route_note.strip(")']")

        # Trailing parenthesised note, e.g. "Route (note)"; a " - (" sequence
        # means the parenthesis belongs to the route itself.
        if re.match(r'.+\)$', x) and not re.match(r'.+(?= - \()', x):
            parts = re.split(r' \(\[?\'?', x, maxsplit=1)
            if len(parts) == 2:
                return parts[0], parts[1].rstrip('\'])')

        return x, None

    def collect_line_names(self, confirmation_required=True, verbose=False):
        """
        Collect data of railway line names from source web page.

        On success the collected data is also pickled into the data directory
        of this class.

        :param confirmation_required: whether to require users to confirm and proceed,
            defaults to ``True``
        :type confirmation_required: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: railway line names and routes data and
            date of when the data was last updated
            (``None`` if not confirmed or if the collection failed)
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import LineNames

            >>> ln = LineNames()

            >>> line_names_dat = ln.collect_line_names(confirmation_required=False)

            >>> type(line_names_dat)
            <class 'dict'>
            >>> print(list(line_names_dat.keys()))
            ['Line names', 'Last updated date']
        """

        if not confirmed("To collect British railway {}?".format(self.Key.lower()),
                         confirmation_required=confirmation_required):
            return None

        if verbose == 2:
            print("Collecting the railway {}".format(self.Key.lower()), end=" ... ")

        line_names_data = None

        try:
            source = requests.get(self.SourceURL, headers=fake_requests_headers())
        except requests.ConnectionError:
            if verbose == 2:
                print("Failed. ")
            print_conn_err(verbose=verbose)
            return line_names_data

        try:
            row_lst, header = parse_table(source, parser='lxml')
            # Drop non-breaking spaces and surrounding whitespace in every cell.
            line_names = pd.DataFrame(
                [[r.replace('\xa0', '').strip() for r in row] for row in row_lst],
                columns=header)

            # Parse route column
            line_names[['Route', 'Route_note']] = \
                line_names.Route.map(self._parse_route_column).apply(pd.Series)

            last_updated_date = get_last_updated_date(self.SourceURL)

            line_names_data = {self.Key: line_names,
                               self.LUDKey: last_updated_date}

            if verbose == 2:
                print("Done. ")

            pickle_filename = self.Key.lower().replace(" ", "-") + ".pickle"
            path_to_pickle = self._cdd_ln(pickle_filename)
            save_pickle(line_names_data, path_to_pickle, verbose=verbose)

        except Exception as e:
            # Deliberately broad: any parsing/saving problem is reported and
            # whatever has been assembled so far (possibly None) is returned.
            print("Failed. {}".format(e))

        return line_names_data

    def fetch_line_names(self, update=False, pickle_it=False, data_dir=None,
                         verbose=False):
        """
        Fetch data of railway line names from local backup.

        If there is no local backup, or ``update=True``, the data is collected
        from the source web page instead; if that fails, the local backup
        (when one exists) is used as a fallback.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: railway line names and routes data and
            date of when the data was last updated
            (``None`` if nothing could be collected and no backup exists)
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import LineNames

            >>> ln = LineNames()

            >>> line_names_dat = ln.fetch_line_names()

            >>> type(line_names_dat)
            <class 'dict'>
            >>> print(list(line_names_dat.keys()))
            ['Line names', 'Last updated date']
        """

        pickle_filename = self.Key.lower().replace(" ", "-") + ".pickle"
        path_to_pickle = self._cdd_ln(pickle_filename)

        # NOTE: the backup is a pickle file and is loaded as-is, so the data
        # directory should only ever point at a trusted location.
        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True)

        line_names_data = self.collect_line_names(confirmation_required=False,
                                                  verbose=verbose_)

        if line_names_data:  # line-names is not None
            if pickle_it and data_dir:
                self.CurrentDataDir = validate_input_data_dir(data_dir)
                path_to_pickle = os.path.join(self.CurrentDataDir, pickle_filename)
                save_pickle(line_names_data, path_to_pickle, verbose=verbose)
        else:
            print("No data of the railway {} has been freshly collected.".format(
                self.Key.lower()))
            # Fall back on the local backup only if there actually is one.
            if os.path.isfile(path_to_pickle):
                line_names_data = load_pickle(path_to_pickle)

        return line_names_data