# Source code for loc_id

"""
Collect
`CRS, NLC, TIPLOC and STANOX codes <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_.
"""

import copy
import os
import re
import string
import urllib.parse

import bs4
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import confirmed, fake_requests_headers, split_list_by_size
from pyhelpers.store import load_json, load_pickle, save, save_pickle

from pyrcs.utils import cd_dat, homepage_url, get_catalogue, get_last_updated_date, \
    parse_date, parse_location_name, parse_table, parse_tr, print_conn_err, \
    is_internet_connected, print_connection_error


class LocationIdentifiers:
    """
    A class for collecting location identifiers (including
    `other systems <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ station).

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the
        function runs, defaults to ``True``
    :type verbose: bool or int

    **Example**::

        >>> from pyrcs.line_data import LocationIdentifiers

        >>> lid = LocationIdentifiers()

        >>> print(lid.Name)
        CRS, NLC, TIPLOC and STANOX codes

        >>> print(lid.SourceURL)
        http://www.railwaycodes.org.uk/crs/CRS0.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.
        """
        # Warn early (rather than fail later) when there is no connection
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'CRS, NLC, TIPLOC and STANOX codes'
        self.Key = 'Location codes'  # key to location codes

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/crs/CRS0.shtm')

        self.LUDKey = 'Last updated date'  # key to last updated date
        self.Date = get_last_updated_date(
            url=self.SourceURL, parsed=True, as_date_type=False)

        self.Catalogue = get_catalogue(
            page_url=self.SourceURL, update=update, confirmation_required=False)

        # Default data directory derived from the class name, e.g.
        # "crs-nlc-tiploc-stanox", unless the user supplies one
        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat(
                "line-data",
                re.sub(r',| codes| and', '', self.Name.lower()).replace(" ", "-"))
        self.CurrentDataDir = copy.copy(self.DataDir)

        self.OSKey = 'Other systems'  # key to other systems codes
        self.OSPickle = self.OSKey.lower().replace(" ", "-")

        self.ANKey = 'Additional notes'  # key to additional notes
        self.MSCENKey = 'Multiple station codes explanatory note'
        self.MSCENPickle = self.MSCENKey.lower().replace(" ", "-")

    def _cdd_locid(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\line-data\\crs-nlc-tiploc-stanox"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``LocationIdentifiers``
        :rtype: str

        :meta private:
        """
        return cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)
[docs] @staticmethod def amendment_to_loc_names(): """ Create a replacement dictionary for location name amendments. :return: dictionary of regular-expression amendments to location names :rtype: dict **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> loc_name_amendment_dict = lid.amendment_to_loc_names() >>> print(list(loc_name_amendment_dict.keys())) ['Location'] """ location_name_amendment_dict = { 'Location': {re.compile(r' And | \+ '): ' & ', re.compile(r'-By-'): '-by-', re.compile(r'-In-'): '-in-', re.compile(r'-En-Le-'): '-en-le-', re.compile(r'-La-'): '-la-', re.compile(r'-Le-'): '-le-', re.compile(r'-On-'): '-on-', re.compile(r'-The-'): '-the-', re.compile(r' Of '): ' of ', re.compile(r'-Super-'): '-super-', re.compile(r'-Upon-'): '-upon-', re.compile(r'-Under-'): '-under-', re.compile(r'-Y-'): '-y-'}} return location_name_amendment_dict
[docs] @staticmethod def parse_note_page(note_url, parser='lxml', verbose=False): """ Parse addition note page. :param note_url: URL link of the target web page :type note_url: str :param parser: the `parser`_ to use for `bs4.BeautifulSoup`_, defaults to ``'lxml'`` :type parser: str :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: parsed texts :rtype: list .. _`parser`: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ index.html#specifying-the-parser-to-use .. _`bs4.BeautifulSoup`: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.html **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> url = lid.HomeURL + '/crs/CRS2.shtm' >>> parsed_note_ = lid.parse_note_page(url, parser='lxml') >>> print(parsed_note_[3].head()) Location CRS CRS_alt1 CRS_alt2 0 Glasgow Queen Street GLQ GQL 1 Glasgow Central GLC GCL 2 Heworth HEW HEZ 3 Highbury & Islington HHY HII XHZ 4 Lichfield Trent Valley LTV LIF """ try: source = requests.get(note_url, headers=fake_requests_headers()) except requests.ConnectionError: print_conn_err(verbose=verbose) return None web_page_text = bs4.BeautifulSoup(source.text, parser).find_all(['p', 'pre']) parsed_text = [x.text for x in web_page_text if isinstance(x.next_element, str)] parsed_note = [] for x in parsed_text: if '\n' in x: text = re.sub('\t+', ',', x).replace('\t', ' '). \ replace('\xa0', '').split('\n') else: text = x.replace('\t', ' ').replace('\xa0', '') if isinstance(text, list): text = [t.split(',') for t in text if t != ''] temp = pd.DataFrame( text, columns=['Location', 'CRS', 'CRS_alt1', 'CRS_alt2']).fillna('') parsed_note.append(temp) else: to_remove = ['click the link', 'click your browser', 'Thank you', 'shown below'] if text != '' and not any(t in text for t in to_remove): parsed_note.append(text) return parsed_note
[docs] def collect_explanatory_note(self, confirmation_required=True, verbose=False): """ Collect note about CRS code from source web page. :param confirmation_required: whether to prompt a message for confirmation to proceed, defaults to ``True`` :type confirmation_required: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: data of multiple station codes explanatory note :rtype: dict, None **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> exp_note = lid.collect_explanatory_note( ... confirmation_required=False) >>> type(exp_note) <class 'dict'> >>> print(list(exp_note.keys())) ['Multiple station codes explanatory note', 'Notes', 'Last updated date'] """ if confirmed("To collect data of {}?".format(self.MSCENKey.lower()), confirmation_required=confirmation_required): if verbose == 2: print("Collecting data of {}".format(self.MSCENKey.lower()), end=" ... ") note_url = self.HomeURL + '/crs/CRS2.shtm' explanatory_note_ = self.parse_note_page(note_url, verbose=False) if explanatory_note_ is None: print("Failed. ") if verbose == 2 else "" if not is_internet_connected(): print_conn_err(verbose=verbose) explanatory_note = None else: try: explanatory_note, notes = {}, [] for x in explanatory_note_: if isinstance(x, str): if 'Last update' in x: explanatory_note.update( {self.LUDKey: parse_date(x, as_date_type=False)}) else: notes.append(x) else: explanatory_note.update({self.MSCENKey: x}) explanatory_note.update({'Notes': notes}) # Rearrange the dict explanatory_note = {k: explanatory_note[k] for k in [self.MSCENKey, 'Notes', self.LUDKey]} print("Done.") if verbose == 2 else "" save_pickle(explanatory_note, self._cdd_locid(self.MSCENPickle + ".pickle"), verbose=verbose) except Exception as e: print("Failed. {}.".format(e)) explanatory_note = None return explanatory_note
[docs] def fetch_explanatory_note(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ Fetch multiple station codes explanatory note from local backup. :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param pickle_it: whether to replace the current package data with newly collected data, defaults to ``False`` :type pickle_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str or None :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: data of multiple station codes explanatory note :rtype: dict **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> exp_note = lid.fetch_explanatory_note( ... update=False, pickle_it=False, data_dir=None, verbose=True) >>> type(exp_note) <class 'dict'> >>> print(list(exp_note.keys())) ['Multiple station codes explanatory note', 'Notes', 'Last updated date'] """ path_to_pickle = self._cdd_locid(self.MSCENPickle + ".pickle") if os.path.isfile(path_to_pickle) and not update: explanatory_note = load_pickle(path_to_pickle) else: verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True) explanatory_note = self.collect_explanatory_note(confirmation_required=False, verbose=verbose_) if explanatory_note: # additional_note is not None if pickle_it and data_dir: self.CurrentDataDir = validate_input_data_dir(data_dir) path_to_pickle = os.path.join(self.CurrentDataDir, self.MSCENPickle + ".pickle") save_pickle(explanatory_note, path_to_pickle, verbose=verbose) else: print("No data of {} has been freshly collected.".format( self.MSCENKey.lower())) explanatory_note = load_pickle(path_to_pickle) return explanatory_note
[docs] def collect_other_systems_codes(self, confirmation_required=True, verbose=False): """ Collect data of `other systems' codes <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ from source web page. :param confirmation_required: whether to require users to confirm and proceed, defaults to ``True`` :type confirmation_required: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: codes of other systems :rtype: dict, None **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> os_codes = lid.collect_other_systems_codes(confirmation_required=False) >>> type(os_codes) <class 'dict'> >>> print(list(os_codes.keys())) ['Other systems', 'Last updated date'] """ if confirmed("To collect data of {}?".format(self.OSKey.lower()), confirmation_required=confirmation_required): url = self.Catalogue['Other systems'] if verbose == 2: print("Collecting data of {}".format(self.OSKey.lower()), end=" ... ") other_systems_codes = None try: source = requests.get(url, headers=fake_requests_headers()) except requests.ConnectionError: print("Failed. 
") if verbose == 2 else "" print_conn_err(verbose=verbose) else: try: web_page_text = bs4.BeautifulSoup(source.text, 'lxml') # Get system name system_names = [k.text for k in web_page_text.find_all('h3')] # Parse table data for each system table_data = list( split_list_by_size(web_page_text.find_all('table'), sub_len=2)) tables = [] for table in table_data: headers = [x.text for x in table[0].find_all('th')] tbl_dat = table[1].find_all('tr') tbl_data = pd.DataFrame(parse_tr(headers, tbl_dat), columns=headers) tables.append(tbl_data) # Make a dict other_systems_codes = {self.OSKey: dict(zip(system_names, tables)), self.LUDKey: get_last_updated_date(url)} print("Done.") if verbose == 2 else "" save_pickle(other_systems_codes, self._cdd_locid(self.OSPickle + ".pickle"), verbose=verbose) except Exception as e: print("Failed. {}.".format(e)) return other_systems_codes
[docs] def fetch_other_systems_codes(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ Fetch data of `other systems' codes <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ from local backup. :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param pickle_it: whether to replace the current package data with newly collected data, defaults to ``False`` :type pickle_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str or None :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: codes of other systems :rtype: dict **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> os_codes = lid.fetch_other_systems_codes() >>> type(os_codes) <class 'dict'> >>> print(list(os_codes.keys())) ['Other systems', 'Last updated date'] """ path_to_pickle = self._cdd_locid(self.OSPickle + ".pickle") if os.path.isfile(path_to_pickle) and not update: other_systems_codes = load_pickle(path_to_pickle) else: verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True) other_systems_codes = self.collect_other_systems_codes( confirmation_required=False, verbose=verbose_) if other_systems_codes: # other_systems_codes is not None if pickle_it and data_dir: self.CurrentDataDir = validate_input_data_dir(data_dir) path_to_pickle = os.path.join( self.CurrentDataDir, self.OSPickle + ".pickle") save_pickle(other_systems_codes, path_to_pickle, verbose=verbose) else: print("No data of {} has been freshly collected.".format( self.OSKey.lower())) other_systems_codes = load_pickle(path_to_pickle) return other_systems_codes
[docs] def collect_loc_codes_by_initial(self, initial, update=False, verbose=False): """ Collect `CRS, NLC, TIPLOC, STANME and STANOX codes <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_ for a given ``initial`` letter. :param initial: initial letter of station/junction name or certain word for specifying URL :type initial: str :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: data of location codes for the given ``initial`` letter; and date of when the data was last updated :rtype: dict **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> location_codes_a = lid.collect_loc_codes_by_initial(initial='a') >>> type(location_codes_a) <class 'dict'> >>> print(list(location_codes_a.keys())) ['A', 'Additional notes', 'Last updated date'] """ assert initial in string.ascii_letters beginning_with = initial.upper() path_to_pickle = self._cdd_locid("a-z", initial.lower() + ".pickle") if os.path.isfile(path_to_pickle) and not update: location_codes_initial = load_pickle(path_to_pickle) else: url = self.Catalogue[beginning_with] if verbose == 2: print("Collecting data of locations starting with \"{}\"".format( beginning_with), end=" ... ") location_codes_initial = {beginning_with: None, self.ANKey: None, self.LUDKey: None} try: source = requests.get(url, headers=fake_requests_headers()) except requests.ConnectionError: print("Failed. 
") if verbose == 2 else "" print_conn_err(verbose=verbose) else: try: tbl_lst, header = parse_table(source, parser='lxml') # Get a raw DataFrame reps = {'\b-\b': '', '\xa0\xa0': ' ', '&half;': ' and 1/2'} pattern = re.compile("|".join(reps.keys())) tbl_lst = [ [pattern.sub(lambda x: reps[x.group(0)], item) for item in record] for record in tbl_lst] loc_codes = pd.DataFrame(tbl_lst, columns=header) loc_codes.replace({'\xa0': ''}, regex=True, inplace=True) # Collect additional information as note loc_codes[['Location', 'Location_Note']] = \ loc_codes.Location.map(parse_location_name).apply(pd.Series) # CRS, NLC, TIPLOC, STANME drop_pattern = re.compile(r'[Ff]ormerly|[Ss]ee[ also]|Also .[\w ,]+') idx = [loc_codes[loc_codes.CRS == x].index[0] for x in loc_codes.CRS if re.match(drop_pattern, x)] loc_codes.drop(labels=idx, axis=0, inplace=True) # Collect others note def collect_others_note(other_note_x): n = re.search(r'(?<=[\[(\'])[\w,? ]+(?=[)\]\'])', other_note_x) note = n.group() if n is not None else '' return note # Strip others note def strip_others_note(other_note_x): d = re.search(r'[\w ,]+(?= [\[(\'])', other_note_x) dat = d.group() if d is not None else other_note_x return dat other_codes_col = loc_codes.columns[1:-1] other_notes_col = [x + '_Note' for x in other_codes_col] loc_codes[other_notes_col] = \ loc_codes[other_codes_col].applymap(collect_others_note) loc_codes[other_codes_col] = \ loc_codes[other_codes_col].applymap(strip_others_note) # Parse STANOX note def parse_stanox_note(x): if x == '-': dat, note = '', '' else: d = re.search(r'[\w *,]+(?= [\[(\'])', x) dat = d.group() if d is not None else x note = 'Pseudo STANOX' if '*' in dat else '' n = re.search(r'(?<=[\[(\'])[\w, ]+.(?=[)\]\'])', x) if n is not None: note = '; '.join(x for x in [note, n.group()] if x != '') if '(' not in note and note.endswith(')'): note = note.rstrip(')') dat = dat.rstrip('*') if '*' in dat else dat return dat, note if not loc_codes.empty: loc_codes[['STANOX', 
'STANOX_Note']] = loc_codes.STANOX.map( parse_stanox_note).apply(pd.Series) else: # It is likely that no data is available on the web page # for the given 'key_word' loc_codes['STANOX_Note'] = loc_codes.STANOX if any('see note' in crs_note for crs_note in loc_codes.CRS_Note): loc_idx = [i for i, crs_note in enumerate(loc_codes.CRS_Note) if 'see note' in crs_note] web_page_text = bs4.BeautifulSoup(source.text, 'lxml') note_urls = [urllib.parse.urljoin( self.Catalogue[beginning_with], x['href']) for x in web_page_text.find_all('a', href=True, text='note')] add_notes = [ self.parse_note_page(note_url) for note_url in note_urls] additional_notes = dict( zip(loc_codes.CRS.iloc[loc_idx], add_notes)) else: additional_notes = None loc_codes = loc_codes.replace( self.amendment_to_loc_names(), regex=True) loc_codes.STANOX = loc_codes.STANOX.replace({'-': ''}) loc_codes.index = range(len(loc_codes)) # Rearrange index last_updated_date = get_last_updated_date(url) print("Done.") if verbose == 2 else "" location_codes_initial.update({beginning_with: loc_codes, self.ANKey: additional_notes, self.LUDKey: last_updated_date}) save_pickle(location_codes_initial, path_to_pickle, verbose=verbose) except Exception as e: print("Failed. {}.".format(e)) return location_codes_initial
[docs] def fetch_location_codes(self, update=False, pickle_it=False, data_dir=None, verbose=False): """ Fetch `CRS, NLC, TIPLOC, STANME and STANOX codes <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_ from local backup. :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param pickle_it: whether to replace the current package data with newly collected data, defaults to ``False`` :type pickle_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str or None :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: data of location codes and date of when the data was last updated :rtype: dict **Example**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> loc_codes = lid.fetch_location_codes() >>> type(loc_codes) <class 'dict'> >>> print(list(loc_codes.keys())) ['Location codes', 'Other systems', 'Additional notes', 'Last updated date'] """ verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True) # Get every data table data = [ self.collect_loc_codes_by_initial( x, update, verbose=verbose_ if is_internet_connected() else False) for x in string.ascii_lowercase] if all(d[x] is None for d, x in zip(data, string.ascii_uppercase)): if update: print_conn_err(verbose=verbose) print("No data of the {} has been freshly collected.".format( self.Key.lower())) data = [self.collect_loc_codes_by_initial(x, update=False, verbose=verbose_) for x in string.ascii_lowercase] # Select DataFrames only location_codes_data = (item[x] for item, x in zip(data, string.ascii_uppercase)) location_codes_data_table = pd.concat(location_codes_data, axis=0, ignore_index=True, sort=False) # Likely errors (spotted occasionally) idx = location_codes_data_table[ location_codes_data_table.Location == 'Selby Melmerby Estates'].index values = 
location_codes_data_table.loc[idx, 'STANME':'STANOX'].values location_codes_data_table.loc[idx, 'STANME':'STANOX'] = ['', ''] idx = location_codes_data_table[ location_codes_data_table.Location == 'Selby Potter Group'].index location_codes_data_table.loc[idx, 'STANME':'STANOX'] = values # Get the latest updated date last_updated_dates = (item[self.LUDKey] for item, _ in zip(data, string.ascii_uppercase)) latest_update_date = max(d for d in last_updated_dates if d is not None) # Get other systems codes other_systems_codes = self.fetch_other_systems_codes( update=update, verbose=verbose_)[self.OSKey] # Get additional note additional_notes = self.fetch_explanatory_note( update=update, verbose=verbose_) # Create a dict to include all information location_codes = {self.Key: location_codes_data_table, self.OSKey: other_systems_codes, self.ANKey: additional_notes, self.LUDKey: latest_update_date} if pickle_it and data_dir: self.CurrentDataDir = validate_input_data_dir(data_dir) path_to_pickle = os.path.join(self.CurrentDataDir, self.Key.lower().replace(" ", "-") + ".pickle") save_pickle(location_codes, path_to_pickle, verbose=verbose) return location_codes
[docs] def make_loc_id_dict(self, keys, initials=None, drop_duplicates=False, as_dict=False, main_key=None, save_it=False, data_dir=None, update=False, verbose=False): """ Make a dict/dataframe for location code data for the given ``keys``. :param keys: one or a sublist of ['CRS', 'NLC', 'TIPLOC', 'STANOX', 'STANME'] :type keys: str, list :param initials: one or a sequence of initials for which the location codes are used, defaults to ``None`` :type initials: str, list, None :param drop_duplicates: whether to drop duplicates, defaults to ``False`` :type drop_duplicates: bool :param as_dict: whether to return a dictionary, defaults to ``False`` :type as_dict: bool :param main_key: key of the returned dictionary if ``as_dict`` is ``True``, defaults to ``None`` :type main_key: str or None :param save_it: whether to save the location codes dictionary, defaults to ``False`` :type save_it: bool :param data_dir: name of package data folder, defaults to ``None`` :type data_dir: str or None :param update: whether to check on update and proceed to update the package data, defaults to ``False`` :type update: bool :param verbose: whether to print relevant information in console as the function runs, defaults to ``False`` :type verbose: bool or int :return: dictionary or a data frame for location code data for the given ``keys`` :rtype: dict, pandas.DataFrame, None **Examples**:: >>> from pyrcs.line_data import LocationIdentifiers >>> lid = LocationIdentifiers() >>> key = 'STANOX' >>> stanox_dictionary = lid.make_loc_id_dict(key) >>> print(stanox_dictionary.head()) Location STANOX 00005 Aachen 04309 Abbeyhill Junction 04311 Abbeyhill Signal E811 04308 Abbeyhill Turnback Sidings 88601 Abbey Wood >>> keys_ = ['STANOX', 'TIPLOC'] >>> initial_ = 'a' >>> stanox_dictionary = lid.make_loc_id_dict(keys_, initial_) >>> print(stanox_dictionary.head()) Location STANOX TIPLOC 00005 AACHEN Aachen 04309 ABHLJN Abbeyhill Junction 04311 ABHL811 Abbeyhill Signal E811 04308 ABHLTB Abbeyhill 
Turnback Sidings 88601 ABWD Abbey Wood >>> keys_ = ['STANOX', 'TIPLOC'] >>> initial_ = 'b' >>> stanox_dictionary = lid.make_loc_id_dict( ... keys_, initial_, as_dict=True, main_key='Data') >>> type(stanox_dictionary) <class 'dict'> >>> print(list(stanox_dictionary['Data'].keys())[:5]) [('55115', ''), ('23490', 'BABWTHL'), ('38306', 'BACHE'), ('66021', 'BADESCL'), ('81003', 'BADMTN')] """ valid_keys = {'CRS', 'NLC', 'TIPLOC', 'STANOX', 'STANME'} if isinstance(keys, str): assert keys in valid_keys keys = [keys] elif isinstance(keys, list): assert all(x in valid_keys for x in keys) if initials: if isinstance(initials, str): assert initials in string.ascii_letters initials = [initials] else: # e.g. isinstance(initials, list) assert all(x in string.ascii_letters for x in initials) if main_key: assert isinstance(main_key, str) dat_dir = validate_input_data_dir(data_dir) if data_dir else self.DataDir path_to_file = os.path.join( dat_dir, "-".join(keys) + ("" if initials is None else "-" + "".join(initials)) + (".json" if as_dict and len(keys) == 1 else ".pickle")) if os.path.isfile(path_to_file) and not update: if as_dict: location_codes_dictionary = load_json(path_to_file) else: location_codes_dictionary = load_pickle(path_to_file) else: if initials is None: location_codes = self.fetch_location_codes(verbose=verbose)[self.Key] else: temp = [ self.collect_loc_codes_by_initial(initial, verbose=verbose)[ initial.upper()] for initial in initials] location_codes = pd.concat(temp, axis=0, ignore_index=True, sort=False) if verbose == 2: print("To make/update a location code dictionary", end=" ... 
") # Deep cleansing location_code try: key_location_codes = location_codes[['Location'] + keys] key_location_codes = key_location_codes.query( ' | '.join(['{} != \'\''.format(k) for k in keys])) if drop_duplicates: location_codes_subset = key_location_codes.drop_duplicates( subset=keys, keep='first') location_codes_duplicated = None else: # drop_duplicates is False or None location_codes_subset = key_location_codes.drop_duplicates( subset=keys, keep=False) # dupl_temp_1 = key_location_codes[ key_location_codes.duplicated(['Location'] + keys, keep=False)] dupl_temp_2 = key_location_codes[ key_location_codes.duplicated(keys, keep=False)] duplicated_1 = dupl_temp_2[ dupl_temp_1.eq(dupl_temp_2)].dropna().drop_duplicates() duplicated_2 = dupl_temp_2[~dupl_temp_1.eq(dupl_temp_2)].dropna() duplicated = pd.concat( [duplicated_1, duplicated_2], axis=0, sort=False) location_codes_duplicated = duplicated.groupby(keys).agg(tuple) location_codes_duplicated.Location = \ location_codes_duplicated.Location.map( lambda x: x[0] if len(set(x)) == 1 else x) location_codes_subset.set_index(keys, inplace=True) location_codes_ref = pd.concat( [location_codes_subset, location_codes_duplicated], axis=0, sort=False) if as_dict: location_codes_ref_dict = location_codes_ref.to_dict() if main_key is None: location_codes_dictionary = location_codes_ref_dict['Location'] else: location_codes_ref_dict[main_key] = \ location_codes_ref_dict.pop('Location') location_codes_dictionary = location_codes_ref_dict else: location_codes_dictionary = location_codes_ref print("Successfully.") if verbose == 2 else "" if save_it: save(location_codes_dictionary, path_to_file, verbose=verbose) except Exception as e: print("Failed. {}.".format(e)) location_codes_dictionary = None return location_codes_dictionary