"""
Collect
`CRS, NLC, TIPLOC and STANOX codes <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_.
"""
import copy
import os
import re
import string
import urllib.parse
import bs4
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import confirmed, fake_requests_headers, split_list_by_size
from pyhelpers.store import load_json, load_pickle, save, save_pickle
from pyrcs.utils import cd_dat, homepage_url, get_catalogue, get_last_updated_date, \
parse_date, parse_location_name, parse_table, parse_tr, print_conn_err, \
is_internet_connected, print_connection_error
class LocationIdentifiers:
    """
    A class for collecting location identifiers
    (including `other systems <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ station).

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    **Example**::

        >>> from pyrcs.line_data import LocationIdentifiers

        >>> lid = LocationIdentifiers()

        >>> print(lid.Name)
        CRS, NLC, TIPLOC and STANOX codes

        >>> print(lid.SourceURL)
        http://www.railwaycodes.org.uk/crs/CRS0.shtm
    """

    def __init__(self, data_dir=None, update=False, verbose=True):
        """
        Constructor method.

        Sets the names/keys used by this class, reads the last-updated date and the
        catalogue of the source page, and resolves the data directory.
        """
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'CRS, NLC, TIPLOC and STANOX codes'
        self.Key = 'Location codes'  # key to location codes

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/crs/CRS0.shtm')

        self.LUDKey = 'Last updated date'  # key to last updated date
        self.Date = get_last_updated_date(url=self.SourceURL, parsed=True,
                                          as_date_type=False)

        self.Catalogue = get_catalogue(page_url=self.SourceURL, update=update,
                                       confirmation_required=False)

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            # Directory name derived from the class name: "crs-nlc-tiploc-stanox"
            self.DataDir = cd_dat(
                "line-data",
                re.sub(r',| codes| and', '', self.Name.lower()).replace(" ", "-"))
        self.CurrentDataDir = copy.copy(self.DataDir)

        self.OSKey = 'Other systems'  # key to other systems codes
        self.OSPickle = self.OSKey.lower().replace(" ", "-")

        self.ANKey = 'Additional notes'  # key to additional notes

        self.MSCENKey = 'Multiple station codes explanatory note'
        self.MSCENPickle = self.MSCENKey.lower().replace(" ", "-")

    def _cdd_locid(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\line-data\\crs-nlc-tiploc-stanox"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``LocationIdentifiers``
        :rtype: str

        :meta private:
        """
        return cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

    def _fetch_backed_up(self, collect, pickle_name, description, update, pickle_it,
                         data_dir, verbose):
        """
        Load a pickled backup, or collect fresh data and fall back to the backup.

        :param collect: bound ``collect_*`` method accepting
            ``confirmation_required`` and ``verbose``
        :param pickle_name: file name (without extension) of the backup pickle
        :type pickle_name: str
        :param description: text used in the "No data of ..." message
        :type description: str
        :return: the data, or ``None`` if nothing was collected and no backup exists
        :rtype: dict or None

        :meta private:
        """
        path_to_pickle = self._cdd_locid(pickle_name + ".pickle")

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True)
        data = collect(confirmation_required=False, verbose=verbose_)

        if data:
            if pickle_it and data_dir:
                self.CurrentDataDir = validate_input_data_dir(data_dir)
                save_pickle(data,
                            os.path.join(self.CurrentDataDir, pickle_name + ".pickle"),
                            verbose=verbose)
            return data

        print("No data of {} has been freshly collected.".format(description))
        # Only fall back to the backup if there is one to read.
        if os.path.isfile(path_to_pickle):
            return load_pickle(path_to_pickle)
        return None

    @staticmethod
    def amendment_to_loc_names():
        """
        Create a replacement dictionary for location name amendments.

        :return: dictionary of regular-expression amendments to location names
        :rtype: dict

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> loc_name_amendment_dict = lid.amendment_to_loc_names()

            >>> print(list(loc_name_amendment_dict.keys()))
            ['Location']
        """
        location_name_amendment_dict = {
            'Location': {re.compile(r' And | \+ '): ' & ',
                         re.compile(r'-By-'): '-by-',
                         re.compile(r'-In-'): '-in-',
                         re.compile(r'-En-Le-'): '-en-le-',
                         re.compile(r'-La-'): '-la-',
                         re.compile(r'-Le-'): '-le-',
                         re.compile(r'-On-'): '-on-',
                         re.compile(r'-The-'): '-the-',
                         re.compile(r' Of '): ' of ',
                         re.compile(r'-Super-'): '-super-',
                         re.compile(r'-Upon-'): '-upon-',
                         re.compile(r'-Under-'): '-under-',
                         re.compile(r'-Y-'): '-y-'}}

        return location_name_amendment_dict

    @staticmethod
    def _parse_note_texts(texts):
        """
        Turn the text of ``<p>``/``<pre>`` elements of a note page into parsed notes.

        Multi-line texts are read as tab-separated tables; single-line texts are kept
        as strings unless they are empty or contain navigation boilerplate.

        :param texts: texts extracted from the note page
        :type texts: list
        :return: data frames (for tables) and strings (for plain notes), in page order
        :rtype: list

        :meta private:
        """
        columns = ['Location', 'CRS', 'CRS_alt1', 'CRS_alt2']
        to_remove = ['click the link',
                     'click your browser',
                     'Thank you',
                     'shown below']

        parsed_note = []
        for x in texts:
            if '\n' in x:
                lines = re.sub('\t+', ',', x).replace('\xa0', '').split('\n')
                rows = [t.split(',') for t in lines if t != '']
                # Pad short rows so that a table without alternative codes
                # still fits the fixed set of columns.
                rows = [row + [''] * (len(columns) - len(row)) for row in rows]
                parsed_note.append(pd.DataFrame(rows, columns=columns).fillna(''))
            else:
                text = x.replace('\t', ' ').replace('\xa0', '')
                if text != '' and not any(t in text for t in to_remove):
                    parsed_note.append(text)

        return parsed_note

    @staticmethod
    def parse_note_page(note_url, parser='lxml', verbose=False):
        """
        Parse an additional note page.

        :param note_url: URL link of the target web page
        :type note_url: str
        :param parser: the `parser`_ to use for `bs4.BeautifulSoup`_,
            defaults to ``'lxml'``
        :type parser: str
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: parsed texts, or ``None`` if the page cannot be requested
        :rtype: list or None

        .. _`parser`:
            https://www.crummy.com/software/BeautifulSoup/bs4/doc/
            index.html#specifying-the-parser-to-use
        .. _`bs4.BeautifulSoup`:
            https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.html

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> url = lid.HomeURL + '/crs/CRS2.shtm'

            >>> parsed_note_ = lid.parse_note_page(url, parser='lxml')

            >>> print(parsed_note_[3].head())
                              Location  CRS CRS_alt1 CRS_alt2
            0     Glasgow Queen Street  GLQ      GQL
            1          Glasgow Central  GLC      GCL
            2                  Heworth  HEW      HEZ
            3     Highbury & Islington  HHY      HII      XHZ
            4   Lichfield Trent Valley  LTV      LIF
        """
        try:
            source = requests.get(note_url, headers=fake_requests_headers())
        except requests.ConnectionError:
            print_conn_err(verbose=verbose)
            return None

        web_page_text = bs4.BeautifulSoup(source.text, parser).find_all(['p', 'pre'])
        # Keep only the elements whose first child is plain text
        parsed_text = [x.text for x in web_page_text if isinstance(x.next_element, str)]

        return LocationIdentifiers._parse_note_texts(parsed_text)

    def collect_explanatory_note(self, confirmation_required=True, verbose=False):
        """
        Collect note about CRS code from source web page.

        :param confirmation_required: whether to prompt a message
            for confirmation to proceed, defaults to ``True``
        :type confirmation_required: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: data of multiple station codes explanatory note
            (``None`` if not confirmed or if the collection fails)
        :rtype: dict, None

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> exp_note = lid.collect_explanatory_note(
            ...     confirmation_required=False)

            >>> type(exp_note)
            <class 'dict'>
            >>> print(list(exp_note.keys()))
            ['Multiple station codes explanatory note', 'Notes', 'Last updated date']
        """
        if not confirmed("To collect data of {}?".format(self.MSCENKey.lower()),
                         confirmation_required=confirmation_required):
            return None

        if verbose == 2:
            print("Collecting data of {}".format(self.MSCENKey.lower()), end=" ... ")

        # Built with urljoin (as SourceURL is) so that a trailing slash on HomeURL
        # cannot produce a double slash in the path.
        note_url = urllib.parse.urljoin(self.HomeURL, '/crs/CRS2.shtm')
        explanatory_note_ = self.parse_note_page(note_url, verbose=False)

        if explanatory_note_ is None:
            if verbose == 2:
                print("Failed. ")
            if not is_internet_connected():
                print_conn_err(verbose=verbose)
            return None

        try:
            explanatory_note, notes = {}, []

            for x in explanatory_note_:
                if isinstance(x, str):
                    if 'Last update' in x:
                        explanatory_note.update(
                            {self.LUDKey: parse_date(x, as_date_type=False)})
                    else:
                        notes.append(x)
                else:
                    explanatory_note.update({self.MSCENKey: x})

            explanatory_note.update({'Notes': notes})

            # Rearrange the dict
            explanatory_note = {k: explanatory_note[k]
                                for k in [self.MSCENKey, 'Notes', self.LUDKey]}

            if verbose == 2:
                print("Done.")

            save_pickle(explanatory_note,
                        self._cdd_locid(self.MSCENPickle + ".pickle"),
                        verbose=verbose)

        except Exception as e:
            # Best effort: any parsing problem is reported, not raised
            print("Failed. {}.".format(e))
            explanatory_note = None

        return explanatory_note

    def fetch_explanatory_note(self, update=False, pickle_it=False, data_dir=None,
                               verbose=False):
        """
        Fetch multiple station codes explanatory note from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: data of multiple station codes explanatory note
            (``None`` if nothing could be collected and no local backup exists)
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> exp_note = lid.fetch_explanatory_note(
            ...     update=False, pickle_it=False, data_dir=None, verbose=True)

            >>> type(exp_note)
            <class 'dict'>
            >>> print(list(exp_note.keys()))
            ['Multiple station codes explanatory note', 'Notes', 'Last updated date']
        """
        return self._fetch_backed_up(
            self.collect_explanatory_note, self.MSCENPickle, self.MSCENKey.lower(),
            update, pickle_it, data_dir, verbose)

    def collect_other_systems_codes(self, confirmation_required=True, verbose=False):
        """
        Collect data of `other systems' codes
        <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ from source web page.

        :param confirmation_required: whether to require users to confirm and proceed,
            defaults to ``True``
        :type confirmation_required: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: codes of other systems
            (``None`` if not confirmed or if the collection fails)
        :rtype: dict, None

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> os_codes = lid.collect_other_systems_codes(confirmation_required=False)

            >>> type(os_codes)
            <class 'dict'>
            >>> print(list(os_codes.keys()))
            ['Other systems', 'Last updated date']
        """
        if not confirmed("To collect data of {}?".format(self.OSKey.lower()),
                         confirmation_required=confirmation_required):
            return None

        url = self.Catalogue['Other systems']

        if verbose == 2:
            print("Collecting data of {}".format(self.OSKey.lower()), end=" ... ")

        other_systems_codes = None

        try:
            source = requests.get(url, headers=fake_requests_headers())
        except requests.ConnectionError:
            if verbose == 2:
                print("Failed. ")
            print_conn_err(verbose=verbose)
            return other_systems_codes

        try:
            web_page_text = bs4.BeautifulSoup(source.text, 'lxml')

            # Get system name
            system_names = [k.text for k in web_page_text.find_all('h3')]

            # Parse table data for each system;
            # the tables are consumed in pairs: (header table, data table)
            table_data = list(
                split_list_by_size(web_page_text.find_all('table'), sub_len=2))

            tables = []
            for table in table_data:
                headers = [x.text for x in table[0].find_all('th')]
                tbl_dat = table[1].find_all('tr')
                tbl_data = pd.DataFrame(parse_tr(headers, tbl_dat), columns=headers)
                tables.append(tbl_data)

            # Make a dict
            other_systems_codes = {self.OSKey: dict(zip(system_names, tables)),
                                   self.LUDKey: get_last_updated_date(url)}

            if verbose == 2:
                print("Done.")

            save_pickle(other_systems_codes,
                        self._cdd_locid(self.OSPickle + ".pickle"),
                        verbose=verbose)

        except Exception as e:
            # Best effort: any parsing problem is reported, not raised
            print("Failed. {}.".format(e))

        return other_systems_codes

    def fetch_other_systems_codes(self, update=False, pickle_it=False, data_dir=None,
                                  verbose=False):
        """
        Fetch data of `other systems' codes
        <http://www.railwaycodes.org.uk/crs/CRS1.shtm>`_ from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: codes of other systems
            (``None`` if nothing could be collected and no local backup exists)
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> os_codes = lid.fetch_other_systems_codes()

            >>> type(os_codes)
            <class 'dict'>
            >>> print(list(os_codes.keys()))
            ['Other systems', 'Last updated date']
        """
        return self._fetch_backed_up(
            self.collect_other_systems_codes, self.OSPickle, self.OSKey.lower(),
            update, pickle_it, data_dir, verbose)

    @staticmethod
    def _collect_others_note(other_note_x):
        """
        Extract the text enclosed in brackets/parentheses/quotes of a code cell.

        :return: the enclosed note, or ``''`` if there is none
        :rtype: str

        :meta private:
        """
        n = re.search(r'(?<=[\[(\'])[\w,? ]+(?=[)\]\'])', other_note_x)
        return n.group() if n is not None else ''

    @staticmethod
    def _strip_others_note(other_note_x):
        """
        Extract the code that precedes a bracketed/quoted note of a code cell.

        :return: the code without its note, or the input unchanged if no note follows
        :rtype: str

        :meta private:
        """
        d = re.search(r'[\w ,]+(?= [\[(\'])', other_note_x)
        return d.group() if d is not None else other_note_x

    @staticmethod
    def _parse_stanox_note(x):
        """
        Split a STANOX cell into the code and its note.

        A cell of ``'-'`` yields two empty strings; a ``'*'`` in the code is removed
        and recorded as ``'Pseudo STANOX'`` in the note.

        :return: STANOX code and note
        :rtype: tuple

        :meta private:
        """
        if x == '-':
            return '', ''

        d = re.search(r'[\w *,]+(?= [\[(\'])', x)
        dat = d.group() if d is not None else x
        note = 'Pseudo STANOX' if '*' in dat else ''

        n = re.search(r'(?<=[\[(\'])[\w, ]+.(?=[)\]\'])', x)
        if n is not None:
            note = '; '.join(part for part in [note, n.group()] if part != '')
        # The trailing wildcard of the note pattern may capture a closing ')'
        if '(' not in note and note.endswith(')'):
            note = note.rstrip(')')

        dat = dat.rstrip('*') if '*' in dat else dat

        return dat, note

    def collect_loc_codes_by_initial(self, initial, update=False, verbose=False):
        """
        Collect `CRS, NLC, TIPLOC, STANME and STANOX codes
        <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_ for a given ``initial`` letter.

        :param initial: initial letter of station/junction name or certain word
            for specifying URL (a single ASCII letter)
        :type initial: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: data of location codes for the given ``initial`` letter;
            and date of when the data was last updated
        :rtype: dict
        :raises AssertionError: if ``initial`` is not a single ASCII letter

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> location_codes_a = lid.collect_loc_codes_by_initial(initial='a')

            >>> type(location_codes_a)
            <class 'dict'>
            >>> print(list(location_codes_a.keys()))
            ['A', 'Additional notes', 'Last updated date']
        """
        # `x in string.ascii_letters` alone is a substring test ('ab' and '' pass),
        # hence the explicit type and length checks.
        assert isinstance(initial, str) and len(initial) == 1 \
            and initial in string.ascii_letters, \
            "`initial` must be a single ASCII letter."

        beginning_with = initial.upper()

        path_to_pickle = self._cdd_locid("a-z", initial.lower() + ".pickle")

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        url = self.Catalogue[beginning_with]

        if verbose == 2:
            print("Collecting data of locations starting with \"{}\"".format(
                beginning_with), end=" ... ")

        location_codes_initial = {beginning_with: None,
                                  self.ANKey: None,
                                  self.LUDKey: None}

        try:
            source = requests.get(url, headers=fake_requests_headers())
        except requests.ConnectionError:
            if verbose == 2:
                print("Failed. ")
            print_conn_err(verbose=verbose)
            return location_codes_initial

        try:
            tbl_lst, header = parse_table(source, parser='lxml')

            # Get a raw DataFrame
            # NOTE(review): '\b' below is a backspace character (non-raw string),
            # not a regex word boundary - confirm this is what was intended.
            reps = {'\b-\b': '', '\xa0\xa0': ' ', '½': ' and 1/2'}
            pattern = re.compile("|".join(reps.keys()))
            tbl_lst = [
                [pattern.sub(lambda x: reps[x.group(0)], item) for item in record]
                for record in tbl_lst]
            loc_codes = pd.DataFrame(tbl_lst, columns=header)
            loc_codes.replace({'\xa0': ''}, regex=True, inplace=True)

            # Collect additional information as note
            loc_codes[['Location', 'Location_Note']] = \
                loc_codes.Location.map(parse_location_name).apply(pd.Series)

            # CRS, NLC, TIPLOC, STANME
            # Drop every row whose CRS cell is a cross-reference rather than a code.
            # Each matching row's own label is used, so rows sharing the same
            # cross-reference text are all dropped.
            drop_pattern = re.compile(r'[Ff]ormerly|[Ss]ee[ also]|Also .[\w ,]+')
            idx = [i for i, x in zip(loc_codes.index, loc_codes.CRS)
                   if drop_pattern.match(x)]
            loc_codes.drop(labels=idx, axis=0, inplace=True)

            # Collect and then strip others note;
            # all columns between 'Location' and the trailing 'Location_Note'
            other_codes_col = loc_codes.columns[1:-1]
            other_notes_col = [x + '_Note' for x in other_codes_col]
            loc_codes[other_notes_col] = \
                loc_codes[other_codes_col].applymap(self._collect_others_note)
            loc_codes[other_codes_col] = \
                loc_codes[other_codes_col].applymap(self._strip_others_note)

            # Parse STANOX note
            if not loc_codes.empty:
                loc_codes[['STANOX', 'STANOX_Note']] = loc_codes.STANOX.map(
                    self._parse_stanox_note).apply(pd.Series)
            else:
                # It is likely that no data is available on the web page
                # for the given 'key_word'
                loc_codes['STANOX_Note'] = loc_codes.STANOX

            if any('see note' in crs_note for crs_note in loc_codes.CRS_Note):
                loc_idx = [i for i, crs_note in enumerate(loc_codes.CRS_Note)
                           if 'see note' in crs_note]

                web_page_text = bs4.BeautifulSoup(source.text, 'lxml')

                note_urls = [
                    urllib.parse.urljoin(self.Catalogue[beginning_with], x['href'])
                    for x in web_page_text.find_all('a', href=True, text='note')]
                add_notes = [self.parse_note_page(note_url) for note_url in note_urls]

                additional_notes = dict(zip(loc_codes.CRS.iloc[loc_idx], add_notes))
            else:
                additional_notes = None

            loc_codes = loc_codes.replace(self.amendment_to_loc_names(), regex=True)

            loc_codes.STANOX = loc_codes.STANOX.replace({'-': ''})

            loc_codes.index = range(len(loc_codes))  # Rearrange index

            last_updated_date = get_last_updated_date(url)

            if verbose == 2:
                print("Done.")

            location_codes_initial.update({beginning_with: loc_codes,
                                           self.ANKey: additional_notes,
                                           self.LUDKey: last_updated_date})

            save_pickle(location_codes_initial, path_to_pickle, verbose=verbose)

        except Exception as e:
            # Best effort: any parsing problem is reported, not raised
            print("Failed. {}.".format(e))

        return location_codes_initial

    def fetch_location_codes(self, update=False, pickle_it=False, data_dir=None,
                             verbose=False):
        """
        Fetch `CRS, NLC, TIPLOC, STANME and STANOX codes
        <http://www.railwaycodes.org.uk/crs/CRS0.shtm>`_ from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: data of location codes and date of when the data was last updated
        :rtype: dict

        **Example**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> loc_codes = lid.fetch_location_codes()

            >>> type(loc_codes)
            <class 'dict'>
            >>> print(list(loc_codes.keys()))
            ['Location codes',
             'Other systems',
             'Additional notes',
             'Last updated date']
        """
        verbose_ = False if (data_dir or not verbose) else (2 if verbose == 2 else True)

        # Probe the connection once rather than once per letter
        verbose_online = verbose_ if is_internet_connected() else False

        # Get every data table
        data = [self.collect_loc_codes_by_initial(x, update, verbose=verbose_online)
                for x in string.ascii_lowercase]

        if all(d[x] is None for d, x in zip(data, string.ascii_uppercase)):
            if update:
                print_conn_err(verbose=verbose)
                print("No data of the {} has been freshly collected.".format(
                    self.Key.lower()))
            # Fall back to whatever has been backed up locally
            data = [self.collect_loc_codes_by_initial(x, update=False, verbose=verbose_)
                    for x in string.ascii_lowercase]

        # Select DataFrames only
        location_codes_data = (item[x] for item, x in zip(data, string.ascii_uppercase))
        location_codes_data_table = pd.concat(location_codes_data, axis=0,
                                              ignore_index=True, sort=False)

        # Likely errors (spotted occasionally):
        # move the STANME/STANOX of 'Selby Melmerby Estates' to 'Selby Potter Group'.
        # Applied only when both locations are present in equal number, so that
        # neither row is blanked or assigned from a mismatched selection.
        idx_from = location_codes_data_table[
            location_codes_data_table.Location == 'Selby Melmerby Estates'].index
        idx_to = location_codes_data_table[
            location_codes_data_table.Location == 'Selby Potter Group'].index
        if len(idx_from) > 0 and len(idx_from) == len(idx_to):
            values = location_codes_data_table.loc[idx_from, 'STANME':'STANOX'].values
            location_codes_data_table.loc[idx_from, 'STANME':'STANOX'] = ['', '']
            location_codes_data_table.loc[idx_to, 'STANME':'STANOX'] = values

        # Get the latest updated date (None if no date is available at all)
        latest_update_date = max(
            (item[self.LUDKey] for item in data if item[self.LUDKey] is not None),
            default=None)

        # Get other systems codes
        other_systems = self.fetch_other_systems_codes(update=update, verbose=verbose_)
        other_systems_codes = other_systems[self.OSKey] if other_systems else None

        # Get additional note
        additional_notes = self.fetch_explanatory_note(update=update, verbose=verbose_)

        # Create a dict to include all information
        location_codes = {self.Key: location_codes_data_table,
                          self.OSKey: other_systems_codes,
                          self.ANKey: additional_notes,
                          self.LUDKey: latest_update_date}

        if pickle_it and data_dir:
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(self.CurrentDataDir,
                                          self.Key.lower().replace(" ", "-") + ".pickle")
            save_pickle(location_codes, path_to_pickle, verbose=verbose)

        return location_codes

    def make_loc_id_dict(self, keys, initials=None, drop_duplicates=False, as_dict=False,
                         main_key=None, save_it=False, data_dir=None, update=False,
                         verbose=False):
        """
        Make a dict/dataframe for location code data for the given ``keys``.

        :param keys: one or a sublist of ['CRS', 'NLC', 'TIPLOC', 'STANOX', 'STANME']
        :type keys: str, list
        :param initials: one or a sequence of initials (single ASCII letters)
            for which the location codes are used, defaults to ``None``
        :type initials: str, list, None
        :param drop_duplicates: whether to drop duplicates, defaults to ``False``
        :type drop_duplicates: bool
        :param as_dict: whether to return a dictionary, defaults to ``False``
        :type as_dict: bool
        :param main_key: key of the returned dictionary if ``as_dict`` is ``True``,
            defaults to ``None``
        :type main_key: str or None
        :param save_it: whether to save the location codes dictionary,
            defaults to ``False``
        :type save_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console
            as the function runs, defaults to ``False``
        :type verbose: bool or int
        :return: dictionary or a data frame for location code data for the given ``keys``
        :rtype: dict, pandas.DataFrame, None
        :raises AssertionError: if ``keys``, ``initials`` or ``main_key`` is invalid

        **Examples**::

            >>> from pyrcs.line_data import LocationIdentifiers

            >>> lid = LocationIdentifiers()

            >>> key = 'STANOX'

            >>> stanox_dictionary = lid.make_loc_id_dict(key)

            >>> print(stanox_dictionary.head())
                                      Location
            STANOX
            00005                       Aachen
            04309           Abbeyhill Junction
            04311         Abbeyhill Signal E811
            04308   Abbeyhill Turnback Sidings
            88601                   Abbey Wood

            >>> keys_ = ['STANOX', 'TIPLOC']
            >>> initial_ = 'a'

            >>> stanox_dictionary = lid.make_loc_id_dict(keys_, initial_)

            >>> print(stanox_dictionary.head())
                                              Location
            STANOX TIPLOC
            00005  AACHEN                       Aachen
            04309  ABHLJN           Abbeyhill Junction
            04311  ABHL811       Abbeyhill Signal E811
            04308  ABHLTB   Abbeyhill Turnback Sidings
            88601  ABWD                     Abbey Wood

            >>> keys_ = ['STANOX', 'TIPLOC']
            >>> initial_ = 'b'

            >>> stanox_dictionary = lid.make_loc_id_dict(
            ...     keys_, initial_, as_dict=True, main_key='Data')

            >>> type(stanox_dictionary)
            <class 'dict'>
            >>> print(list(stanox_dictionary['Data'].keys())[:5])
            [('55115', ''),
             ('23490', 'BABWTHL'),
             ('38306', 'BACHE'),
             ('66021', 'BADESCL'),
             ('81003', 'BADMTN')]
        """
        valid_keys = {'CRS', 'NLC', 'TIPLOC', 'STANOX', 'STANME'}

        # Normalise `keys` to a list so that any sequence of keys is validated
        keys = [keys] if isinstance(keys, str) else list(keys)
        assert all(x in valid_keys for x in keys)

        if initials:
            initials = [initials] if isinstance(initials, str) else list(initials)
            # `x in string.ascii_letters` alone is a substring test ('ab' passes)
            assert all(isinstance(x, str) and len(x) == 1 and x in string.ascii_letters
                       for x in initials)
        else:
            # Empty `initials` ('' or []) means "all initials", like None
            initials = None

        if main_key:
            assert isinstance(main_key, str)

        dat_dir = validate_input_data_dir(data_dir) if data_dir else self.DataDir
        path_to_file = os.path.join(
            dat_dir,
            "-".join(keys) + ("" if initials is None else "-" + "".join(initials)) +
            (".json" if as_dict and len(keys) == 1 else ".pickle"))

        if os.path.isfile(path_to_file) and not update:
            # Choose the reader by file type: with several keys the backup is a
            # pickle even when `as_dict` is True.
            if path_to_file.endswith(".json"):
                return load_json(path_to_file)
            return load_pickle(path_to_file)

        if initials is None:
            location_codes = self.fetch_location_codes(verbose=verbose)[self.Key]
        else:
            temp = [
                self.collect_loc_codes_by_initial(initial, verbose=verbose)[
                    initial.upper()]
                for initial in initials]
            location_codes = pd.concat(temp, axis=0, ignore_index=True, sort=False)

        if verbose == 2:
            print("To make/update a location code dictionary", end=" ... ")

        # Deep cleansing location_code
        try:
            key_location_codes = location_codes[['Location'] + keys]
            # Keep the rows where at least one of the keys is not empty
            key_location_codes = key_location_codes.query(
                ' | '.join(['{} != \'\''.format(k) for k in keys]))

            if drop_duplicates:
                location_codes_subset = key_location_codes.drop_duplicates(
                    subset=keys, keep='first')
                location_codes_duplicated = None

            else:  # drop_duplicates is False or None
                location_codes_subset = key_location_codes.drop_duplicates(
                    subset=keys, keep=False)
                # Rows duplicated on (Location + keys) vs. rows duplicated on keys only
                dupl_temp_1 = key_location_codes[
                    key_location_codes.duplicated(['Location'] + keys, keep=False)]
                dupl_temp_2 = key_location_codes[
                    key_location_codes.duplicated(keys, keep=False)]
                duplicated_1 = dupl_temp_2[
                    dupl_temp_1.eq(dupl_temp_2)].dropna().drop_duplicates()
                duplicated_2 = dupl_temp_2[~dupl_temp_1.eq(dupl_temp_2)].dropna()
                duplicated = pd.concat(
                    [duplicated_1, duplicated_2], axis=0, sort=False)
                # One row per code(s), with all the location names sharing it
                location_codes_duplicated = duplicated.groupby(keys).agg(tuple)
                location_codes_duplicated.Location = \
                    location_codes_duplicated.Location.map(
                        lambda x: x[0] if len(set(x)) == 1 else x)

            location_codes_subset.set_index(keys, inplace=True)
            location_codes_ref = pd.concat(
                [location_codes_subset, location_codes_duplicated], axis=0,
                sort=False)

            if as_dict:
                location_codes_ref_dict = location_codes_ref.to_dict()
                if main_key is None:
                    location_codes_dictionary = location_codes_ref_dict['Location']
                else:
                    location_codes_ref_dict[main_key] = \
                        location_codes_ref_dict.pop('Location')
                    location_codes_dictionary = location_codes_ref_dict
            else:
                location_codes_dictionary = location_codes_ref

            if verbose == 2:
                print("Successfully.")

            if save_it:
                save(location_codes_dictionary, path_to_file, verbose=verbose)

        except Exception as e:
            # Best effort: any processing problem is reported, not raised
            print("Failed. {}.".format(e))
            location_codes_dictionary = None

        return location_codes_dictionary