# Source code for pyrcs.line_data.elrs_mileages

""" Collecting Engineer's Line References (ELRs) codes.

Data source: http://www.railwaycodes.org.uk/elrs/elr0.shtm
"""

import copy
import itertools
import os
import re
import string
import urllib.parse

import bs4
import measurement.measures
import numpy as np
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import confirmed, fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle
from pyhelpers.text import remove_punctuation

from pyrcs.utils import cd_dat, homepage_url
from pyrcs.utils import get_catalogue, get_last_updated_date, is_str_float, parse_table
from pyrcs.utils import mile_chain_to_nr_mileage, nr_mileage_to_mile_chain, yards_to_nr_mileage


class ELRMileages:
    """
    A class for collecting Engineer's Line References (ELRs) codes.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str, None
    :param update: whether to check on update and proceed to update the package data,
        defaults to ``False``
    :type update: bool

    **Example**::

        from pyrcs.line_data import ELRMileages

        em = ELRMileages()

        print(em.Name)
        # ELRs and mileages

        print(em.SourceURL)
        # http://www.railwaycodes.org.uk/elrs/elr0.shtm
    """

    def __init__(self, data_dir=None, update=False):
        """
        Constructor method.
        """
        self.Name = "ELRs and mileages"
        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/elrs/elr0.shtm')
        self.Catalogue = get_catalogue(self.SourceURL, update=update, confirmation_required=False)
        self.Date = get_last_updated_date(self.SourceURL, parsed=True, as_date_type=False)
        self.Key = 'ELRs'  # key to ELRs and mileages
        self.LUDKey = 'Last updated date'  # key to last updated date
        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("line-data", self.Name.lower().replace(" ", "-"))
        self.CurrentDataDir = copy.copy(self.DataDir)

    def cdd_em(self, *sub_dir, **kwargs):
        """
        Change directory to "dat\\line-data\\elrs-and-mileages" and sub-directories (and/or a file)

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for ``ELRMileages``
        :rtype: str

        :meta private:
        """
        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)
        return path

    def _path_to_mileage_pickle(self, elr):
        """
        Build the path of the pickle file that backs up the mileage file of ``elr``.

        :param elr: ELR, e.g. ``'CJD'``
        :type elr: str
        :return: path to the pickle file
        :rtype: str

        :meta private:
        """
        path_to_pickle = self.cdd_em("mileage-files", elr[0].lower(), elr + ".pickle")
        # "prn.pickle" is renamed; presumably because PRN is a reserved device name on Windows
        if os.path.basename(path_to_pickle) == "prn.pickle":
            path_to_pickle = path_to_pickle.replace("prn.pickle", "prn_x.pickle")
        return path_to_pickle

    @staticmethod
    def _select_current_measure(mileage, key_pat):
        """
        Pick one table out of a mileage file that holds several measures.

        :param mileage: parsed mileage data, or a dict of such data keyed by measure name
        :type mileage: pandas.DataFrame, dict
        :param key_pat: compiled pattern identifying the preferred measure name
        :return: a single table of mileage data
        :rtype: pandas.DataFrame

        :meta private:
        """
        if not isinstance(mileage, dict):
            return mileage
        preferred = [k for k in mileage if re.match(key_pat, k)]
        # Fall back to the first measure rather than failing when no name is recognised
        return mileage[preferred[0]] if preferred else next(iter(mileage.values()))

    @staticmethod
    def identify_multiple_measures(mileage_data):
        """
        Identify the scraped data of mileage file if it has multiple measures and,
        if so, preprocess it.

        Rows whose 'Mileage' is empty are treated as headings that separate measures.
        Note that ``mileage_data`` may be modified in place (heading text and a missing
        mileage on the last heading row).

        :param mileage_data: scraped raw mileage file from source web page
        :type mileage_data: pandas.DataFrame
        :return: the (preprocessed) data, or a dict of data keyed by measure name
        :rtype: pandas.DataFrame, dict
        """
        has_mileage = mileage_data.Mileage.map(bool).astype(bool)
        heading_rows = mileage_data[~has_mileage]
        if heading_rows.empty:
            return mileage_data

        headings, sep_rows_idx = heading_rows.Node.tolist(), heading_rows.index[-1]

        if '1949 measure' in headings:
            mileage_data.Node = mileage_data.Node.str.replace('1949 measure', 'Current measure')
            headings = [re.sub(r'1949 ', 'Current ', x) for x in headings]

        def split_at(position):
            # Positional split, equivalent to np.split(mileage_data, [position], axis=0)
            return mileage_data.iloc[:position], mileage_data.iloc[position:]

        if 'Distances in km' in headings:
            km_data = mileage_data[~mileage_data.Node.str.contains('Distances in km')].copy()
            km_data.Mileage = km_data.Mileage.map(
                lambda x: nr_mileage_to_mile_chain(
                    yards_to_nr_mileage(measurement.measures.Distance(km=x).yd))).tolist()
            return km_data

        if 'One measure' in headings:
            position = mileage_data[mileage_data.Node.str.contains('Alternative measure')].index[0]
            first, second = split_at(position)
            return {
                'One measure': first[~first.Node.str.contains('One measure')],
                'Alternative measure': second[~second.Node.str.contains('Alternative measure')]}

        if 'This line has two \'legs\':' in headings:
            legs = mileage_data.iloc[1:].drop_duplicates()
            legs.index = range(len(legs))
            return legs

        known_headings = [' '.join(x) for x in itertools.product(
            ('Current', 'Later', 'One', 'Original', 'Former', 'Alternative', 'Usual', 'Earlier'),
            ('measure', 'route'))]
        present = [x for x in known_headings if x in headings]

        if len(present) == 1:
            first, second = split_at(sep_rows_idx)
            heading = headings[0]
            if re.match(r'(Original)|(Former)|(Alternative)|(Usual)', heading):
                measure_ = re.sub(r'(Original)|(Former)|(Alternative)|(Usual)', r'Current', heading)
            else:
                measure_ = re.sub(r'(Current)|(Later)|(One)', r'Previous', heading)
            return {measure_: first.loc[0:sep_rows_idx, :],
                    heading: second.loc[sep_rows_idx + 1:, :]}

        if len(present) == 2:  # e.g. elr='BTJ'
            position = mileage_data[mileage_data.Node.isin(present)].index[-1]
            first, second = split_at(position)
            names = [first[first.Node.isin(present)].Node.iloc[0],
                     second[second.Node.isin(present)].Node.iloc[0]]
            first = first[~first.Node.isin(present)]
            second = second[~second.Node.isin(present)]
            return dict(zip(names, [first, second]))

        # No recognised heading: borrow the mileage of the preceding row, if there is one
        if mileage_data.loc[sep_rows_idx, 'Mileage'] == '' \
                and (sep_rows_idx - 1) in mileage_data.index:
            mileage_data.loc[sep_rows_idx, 'Mileage'] = mileage_data.loc[sep_rows_idx - 1, 'Mileage']
        return mileage_data

    @staticmethod
    def parse_mileage_col(mileage):
        """
        Parse the 'Mileage' column of a mileage file.

        :param mileage: raw mileages
        :type mileage: pandas.Series
        :return: a table with columns 'Mileage', 'Mileage_Note' and 'Miles_Chains'
        :rtype: pandas.DataFrame
        """
        # Work on a re-indexed copy so that the caller's object is left untouched
        mileage = mileage.reset_index(drop=True)

        in_km = mileage.str.match('.*km')
        if any(in_km):
            if all(in_km):
                temp_mileage = mileage.str.replace('km', '').map(
                    lambda x: yards_to_nr_mileage(
                        measurement.measures.Distance(km=x.replace('≈', '')).british_yd))
                miles_chains = temp_mileage.map(lambda x: nr_mileage_to_mile_chain(x))  # Might be wrong!
            else:
                miles_chains = mileage.map(lambda x: re.sub(r'/?\d+\.\d+km/?', '', x))
                temp_mileage = miles_chains.map(lambda x: mile_chain_to_nr_mileage(x))
            mileage_note = [x + ' (Approximate)' if x.startswith('≈') else x for x in list(mileage)]

        else:
            if all(mileage.map(is_str_float)):
                temp_mileage = mileage
                mileage_note = [''] * len(temp_mileage)
            else:
                temp_mileage, mileage_note = [], []
                for m in mileage:
                    if m == '':
                        temp_mileage.append(m)
                        mileage_note.append('Unknown')
                    elif m.startswith('(') and m.endswith(')'):
                        found = re.search(r'\d+\.\d+', m)
                        temp_mileage.append(found.group(0) if found else '')
                        mileage_note.append('Not on this route but given for reference')
                    elif m.startswith('≈') or m.endswith('?'):
                        temp_mileage.append(m.strip('≈').strip('?'))
                        mileage_note.append('Approximate')
                    elif re.match(r'\d+\.\d+/\s?\d+\.\d+', m):
                        m1, m2 = m.split('/')
                        temp_mileage.append(m1)
                        mileage_note.append(m2.strip() + ' (Alternative)')
                    else:
                        temp_mileage.append(m.strip(' ').replace(' ', '.'))
                        mileage_note.append('')
            miles_chains = temp_mileage.copy()
            temp_mileage = [mile_chain_to_nr_mileage(m) for m in temp_mileage]

        parsed_mileage = pd.DataFrame(
            {'Mileage': temp_mileage, 'Mileage_Note': mileage_note, 'Miles_Chains': miles_chains})
        return parsed_mileage

    @staticmethod
    def parse_node_col(node):
        """
        Parse the 'Node' column of a mileage file.

        :param node: raw node descriptions
        :type node: pandas.Series
        :return: a table with columns 'Node', 'Connection', 'Link_<n>' and, for every link,
            'Link_<n>_ELR' and 'Link_<n>_Mile_Chain'
        :rtype: pandas.DataFrame
        """
        with_conn_pat = re.compile(
            r'\w+.*( \(\d+\.\d+\))?(/| and \w+)? with ([A-Z]).*(\d)?( \(\d+\.\d+\))?')
        slash_pat = re.compile(r'[A-Z]{3}(\d)?( \(\d+.\d+\))? ?/ ?[A-Z]{3}(\d)?( \(\d+.\d+\))?')
        pat1 = re.compile(r'([A-Z]{3}(\d)?$)|((\w\s?)*\w$)')
        pat2 = re.compile(r'([A-Z]{3}(\d)?$)|(([\w\s&]?)*(\s\(\d+\.\d+\))?$)')
        pat3 = re.compile(r'[A-Z]{3}(\d)?(\s\(\d+.\d+\))?\s\[.*?\]$')
        pat4 = re.compile(r'[A-Z]{3}(\d)?\s\(\d+\.\d+km\)')

        def preprocess_node_x(node_x):
            # Split "<node> with <connections>" into its two parts
            if re.match(with_conn_pat, node_x):
                node_name = [x.group() for x in re.finditer(r'\w+.*(?= with)', node_x)]
                conn_node = [x.group() for x in re.finditer(r'(?<= with )[^*]+', node_x)]
            else:
                node_name, conn_node = [node_x], [None]
            return node_name + conn_node

        prep_node = pd.DataFrame([preprocess_node_x(n) for n in node], columns=['Node', 'Connection'])

        def parse_nodes(prep_nodes):
            # One list of connection strings per row
            conn_node_lst = []
            for n in prep_nodes.Connection:
                if isinstance(n, str):
                    if re.match(slash_pat, n):
                        m = [x.strip() for x in n.split('/')]
                    else:
                        m = n.split(' and ')
                    if len(m) > 2:
                        m = [' and '.join(m[:2]), ' and '.join(m[2:])]
                else:  # no connection for this row
                    m = [None]
                conn_node_lst.append(m)

            # Expand every multi-connection row by position; looking rows up with
            # list.index() would skip rows that repeat an earlier connection list
            for i, parts in enumerate(conn_node_lst):
                if len(parts) > 1:
                    pieces = [x.replace('later ', '').rstrip(',').split(' and ')
                              for x in parts if isinstance(x, str)]
                    flattened = [v for lst in pieces for v in lst]
                    conn_node_lst[i] = [v for x in flattened for v in x.split(', ')]

            most_conn = max((len(c) for c in conn_node_lst), default=0)
            return pd.DataFrame(conn_node_lst,
                                columns=['Link_{}'.format(n + 1) for n in range(most_conn)])

        conn_nodes = parse_nodes(prep_node)

        def uncouple_elr_mileage(node_x):
            # e.g. x = 'ECM5 (44.64)' or x = 'DNT'; always returns [ELR, mile_chain]
            if not isinstance(node_x, str):
                return ['', '']
            if re.match(pat1, node_x):
                y = [node_x, '']
            elif re.match(pat2, node_x):
                y = [z[:-1] if re.match(r'\d+.\d+\)', z) else z.strip() for z in node_x.split('(')]
                y[0] = '' if len(y[0]) > 4 else y[0]
            elif re.match(pat3, node_x):
                try:
                    y = [re.search(r'[A-Z]{3}(\d)?', node_x).group(0),
                         re.search(r'\d+\.\d+', node_x).group(0)]
                except AttributeError:
                    y = [re.search(r'[A-Z]{3}(\d)?', node_x).group(0), '']
            elif re.match(pat4, node_x):
                y = [re.search(r'[A-Z]{3}(\d)?', node_x).group(0),
                     nr_mileage_to_mile_chain(yards_to_nr_mileage(
                         measurement.measures.Distance(
                             km=re.search(r'\d+\.\d+', node_x).group(0)).yd))]
            else:
                y = [node_x, ''] if len(node_x) <= 4 else ['', '']
            y[0] = y[0] if len(y[0]) <= 4 else ''
            # Pad, so that every row yields exactly two items
            return (y + [''])[:2]

        link_cols = [x for x in conn_nodes.columns if re.match(r'^(Link_\d)', x)]
        link_frames = [
            pd.DataFrame([uncouple_elr_mileage(x) for x in conn_nodes[col]],
                         columns=[col + '_ELR', col + '_Mile_Chain'])
            for col in link_cols]

        parsed_node_and_conn = pd.concat([prep_node, conn_nodes] + link_frames, axis=1, sort=False)
        return parsed_node_and_conn

    def parse_mileage_data(self, mileage_data):
        """
        Parse scraped data of mileage file.

        :param mileage_data: preprocessed data of mileage file scraped from source web page
        :type mileage_data: pandas.DataFrame
        :return: parsed data of mileage file
        :rtype: pandas.DataFrame
        """
        mileage, node = mileage_data.iloc[:, 0], mileage_data.iloc[:, 1]
        parsed_mileage = self.parse_mileage_col(mileage)
        parsed_node_and_conn = self.parse_node_col(node)
        parsed_dat = pd.concat([parsed_mileage, parsed_node_and_conn], axis=1, sort=False)
        return parsed_dat

    def collect_elr_by_initial(self, initial, update=False, verbose=False):
        """
        Collect Engineer's Line References (ELRs) for the given initial letter
        from source web page.

        :param initial: initial letter of an ELR, e.g. ``'a'``, ``'z'``
        :type initial: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: data of ELRs whose names start with the given ``initial`` and
            date of when the data was last updated (both ``None`` if the collection failed)
        :rtype: dict

        **Example**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            elrs_a = em.collect_elr_by_initial('a', update=False)

            print(elrs_a)
            # {'A': <codes>,
            #  'Last updated date': <date>}
        """
        # A bare ``in string.ascii_letters`` would also accept '' and multi-letter substrings
        assert isinstance(initial, str) and len(initial) == 1 and initial in string.ascii_letters

        beginning_with = initial.upper()
        path_to_pickle = self.cdd_em("a-z", beginning_with.lower() + ".pickle")

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        url = self.Catalogue[beginning_with]  # Specify the requested URL

        if verbose == 2:
            print("Collecting data of ELRs beginning with \"{}\"".format(beginning_with.upper()),
                  end=" ... ")

        try:
            source = requests.get(url, headers=fake_requests_headers())
            records, header = parse_table(source, parser='lxml')
            # Create a DataFrame of the requested table
            data = pd.DataFrame(
                [[x.replace('=', 'See').strip('\xa0') for x in i] for i in records], columns=header)
            # A dictionary containing both the DataFrame and its last updated date
            elrs = {beginning_with: data, self.LUDKey: get_last_updated_date(url)}

            if verbose == 2:
                print("Done. ")

            save_pickle(elrs, path_to_pickle, verbose=verbose)

        except Exception as e:  # e.g the requested URL is not available:
            print("Failed. {}".format(e))
            elrs = {beginning_with: None, self.LUDKey: None}

        return elrs

    def fetch_elr(self, update=False, pickle_it=False, data_dir=None, verbose=False):
        """
        Fetch ELRs and mileages from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: data of all available ELRs and date of when the data was last updated
            (both ``None`` if nothing could be collected)
        :rtype: dict

        **Example**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            elrs_data = em.fetch_elr(update=False, pickle_it=False, data_dir=None)

            print(elrs_data)
            # {'ELRs': <codes>,
            #  'Last updated date': <date>}
        """
        verbose_ = False if data_dir or not verbose else True
        data = [self.collect_elr_by_initial(x, update, verbose=verbose_)
                for x in string.ascii_lowercase]

        # Select DataFrames only; letters that failed to be collected are left out
        tables = [item[x] for item, x in zip(data, string.ascii_uppercase)]
        tables = [t for t in tables if t is not None]
        elrs_data_table = pd.concat(tables, axis=0, ignore_index=True, sort=False) if tables else None

        # Get the latest updated date
        last_updated_dates = [item[self.LUDKey] for item in data if item[self.LUDKey] is not None]
        latest_update_date = max(last_updated_dates, default=None)

        elrs_data = {self.Key: elrs_data_table, self.LUDKey: latest_update_date}

        if pickle_it and data_dir:
            pickle_filename = self.Name.lower().replace(" ", "-") + ".pickle"
            self.CurrentDataDir = validate_input_data_dir(data_dir)
            path_to_pickle = os.path.join(self.CurrentDataDir, pickle_filename)
            save_pickle(elrs_data, path_to_pickle, verbose=verbose)

        return elrs_data

    def collect_mileage_file_by_elr(self, elr, parsed=True, confirmation_required=True,
                                    pickle_it=False, verbose=False):
        """
        Collect mileage file for the given ELR from source web page.

        If the ELR has no page of its own, the record in the list of ELRs is used instead:
        either to follow a reference to another ELR, or to build a two-row table from the
        listed mileages.

        :param elr: ELR, e.g. 'CJD', 'MLA', 'FED'
        :type elr: str
        :param parsed: whether to parse the scraped mileage data
        :type parsed: bool
        :param confirmation_required: whether to prompt a message for confirmation to proceed,
            defaults to ``True``
        :type confirmation_required: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: mileage file for the given ``elr``; ``None`` if ``elr`` is empty,
            the collection is not confirmed or it failed
        :rtype: dict, None

        .. note::

            - In some cases, mileages are unknown hence left blank,
              e.g. ANI2, Orton Junction with ROB (~3.05)
            - Mileages in parentheses are not on that ELR, but are included for reference,
              e.g. ANL, (8.67) NORTHOLT [London Underground]
            - As with the main ELR list, mileages preceded by a tilde (~) are approximate.

        **Examples**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            mileage_file = em.collect_mileage_file_by_elr('CJD')
            # To collect mileage file of "CJD"? [No]|Yes:
            # >? yes

            print(mileage_file)
            # {'ELR': 'CJD',
            #  'Line': 'Challoch Junction to Dumfries Line',
            #  'Sub-Line': '',
            #  'Mileage': <codes>,
            #  'Notes': <notes>}

            mileage_file = em.collect_mileage_file_by_elr('WHG?')
            # To collect mileage file of "WHG"? [No]|Yes:
            # >? yes

            print(mileage_file)
            # {'ELR': 'WHG',
            #  'Line': 'West Hartlepool Goods Branch',
            #  'Sub-Line': '',
            #  'Mileage': <codes>,
            #  'Notes': ''}
        """
        elr = remove_punctuation(elr)

        mileage_file = None

        if elr == '':
            return mileage_file

        if not confirmed("To collect mileage file of \"{}\"?".format(elr.upper()),
                         confirmation_required=confirmation_required):
            return mileage_file

        if verbose == 2:
            print("Collecting mileage file of \"{}\"".format(elr.upper()), end=" ... ")

        try:
            # The URL of the mileage file for the ELR
            url = self.HomeURL + '/elrs/_mileages/{}/{}.shtm'.format(elr[0].lower(), elr.lower())
            source = requests.get(url, headers=fake_requests_headers())
            source_text = bs4.BeautifulSoup(source.text, 'lxml')

            line_name, sub_line_name = source_text.find('h3').text, source_text.find('h4')

            if line_name == '"404" error: page not found':
                # The table of ELRs is keyed by the upper-case initial
                initial = elr[0].upper()
                elr_dat = self.collect_elr_by_initial(initial, verbose=verbose)[initial]
                elr_dat = elr_dat[elr_dat.ELR == elr.upper()]

                notes = elr_dat.Notes.values[0]
                if re.match(r'(Now( part of)? |= |See )[A-Z]{3}(\d)?$', notes):
                    # The record only refers to another ELR; collect that one instead
                    new_elr = re.search(r'(?<= )[A-Z]{3}(\d)?', notes).group(0)
                    return self.collect_mileage_file_by_elr(
                        elr=new_elr, parsed=parsed, confirmation_required=confirmation_required,
                        pickle_it=pickle_it, verbose=verbose)

                line_name, mileages, datum = elr_dat[['Line name', 'Mileages', 'Datum']].values[0]

                if re.match(r'(\w ?)+ \((\w+ \w+)+.\)', line_name):
                    line_name_ = re.search(r'(?<=\w \()(\w+ \w+)+.(?=\))', line_name).group(0)
                    try:
                        location_a, _, location_b = re.split(r' (and|&|to) ', line_name_)
                        line_name = re.search(r'(\w+ \w+)+.(?= \((\w ?)+\))', line_name).group(0)
                    except ValueError:
                        location_a, _, location_b = re.split(r' (and|&|to) ', notes)
                        line_name = line_name_

                elif mileages.startswith('0.00') and datum != '':
                    location_a = datum
                    location_b = re.split(r' (and|&|to) ', line_name)[2] \
                        if location_a in line_name else line_name

                else:
                    # Both ends are labelled with the line name. (Earlier attempts to split
                    # 'notes'/'line_name' here were always overwritten, hence removed.)
                    location_a = location_b = line_name if line_name else ''

                miles_chains, locations = mileages.split(' - '), [location_a, location_b]
                parsed_content = [[m, loc] for m, loc in zip(miles_chains, locations)]

            else:
                line_name = line_name.split('\t')[1]
                parsed_content = [x.strip().split('\t', 1)
                                  for x in source_text.find('pre').text.splitlines() if x != '']
                # NOTE(review): the first replace() looks like a no-op as written; it may
                # originally have targeted a double or non-breaking space - confirm
                parsed_content = [[y.replace(' ', ' ').replace('\t', ' ') for y in x]
                                  for x in parsed_content]
                parsed_content = [[''] + x if (len(x) == 1) & ('Note that' not in x[0]) else x
                                  for x in parsed_content]

            sub_headers = sub_line_name.text.split('\t')[1] if sub_line_name else ''

            # Make a dict of line information
            line_info = {'ELR': elr, 'Line': line_name, 'Sub-Line': sub_headers}

            # Search for note (a single-item row)
            note_temp = min(parsed_content, key=len)
            notes = note_temp[0] if len(note_temp) == 1 else ''
            if notes:
                if ' Revised distances are thus:' in notes:
                    parsed_content[parsed_content.index(note_temp)] = ['', 'Current measure']
                    notes = notes.replace(' Revised distances are thus:', '')
                else:
                    parsed_content.remove(note_temp)

            # Create a table of the mileage data
            mileage_data = pd.DataFrame(parsed_content, columns=['Mileage', 'Node'])

            # Check if there is any missing note
            if mileage_data.iloc[-1].Mileage == '':
                last_node = mileage_data.iloc[-1].Node
                notes = [notes, last_node] if notes else last_node
                mileage_data = mileage_data[:-1]

            if len(mileage_data.iloc[-1].Mileage) > 6:
                last_mileage = mileage_data.iloc[-1].Mileage
                notes = [notes, last_mileage] if notes else last_mileage
                mileage_data = mileage_data[:-1]

            # Identify if there are multiple (both current and former) measures
            mileage_data = self.identify_multiple_measures(mileage_data)

            if parsed:
                if isinstance(mileage_data, dict) and len(mileage_data) > 1:
                    mileage_data = {h: self.parse_mileage_data(dat)
                                    for h, dat in mileage_data.items()}
                else:
                    mileage_data = self.parse_mileage_data(mileage_data)

            mileage_file = {**line_info, 'Mileage': mileage_data, 'Notes': notes}

            if verbose == 2:
                print("Done. ")

            if pickle_it:
                save_pickle(mileage_file, self._path_to_mileage_pickle(elr), verbose=verbose)

        except Exception as e:
            print("Failed. {}.".format(e))
            mileage_file = None

        return mileage_file

    def fetch_mileage_file(self, elr, update=False, pickle_it=False, data_dir=None, verbose=False):
        """
        Fetch mileage file for the given ELR from local backup.

        :param elr: ELR, e.g. 'CJD', 'MLA', 'FED'
        :type elr: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data with newly collected data,
            defaults to ``False``
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: mileage file (codes), line name and, if any, additional information/notes
        :rtype: dict, None

        **Example**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            mileage_file = em.fetch_mileage_file('MLA', update=False, pickle_it=False)

            print(mileage_file)
            # {'ELR': 'MLA',
            #  'Line': 'Maryhill Park Junction to Anniesland Line',
            #  'Sub-Line': '',
            #  'Mileage': <codes>,
            #  'Notes': <notes>}
        """
        path_to_pickle = self._path_to_mileage_pickle(elr)

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        verbose_ = False if data_dir or not verbose else True
        mileage_file = self.collect_mileage_file_by_elr(
            elr, parsed=True, confirmation_required=False, pickle_it=pickle_it, verbose=verbose_)

        if mileage_file:
            if pickle_it and data_dir:
                self.CurrentDataDir = validate_input_data_dir(data_dir)
                path_to_pickle = os.path.join(self.CurrentDataDir, os.path.basename(path_to_pickle))
                save_pickle(mileage_file, path_to_pickle, verbose=verbose)
        else:
            print("No mileage file has been collected for \"{}\".".format(elr.upper()))

        return mileage_file

    @staticmethod
    def search_conn(start_elr, start_em, end_elr, end_em):
        """
        Search for connection between two ELR-and-mileage pairs.

        The first row of ``start_em`` that mentions ``end_elr`` (case-insensitively, in any
        column) is taken as the connection point.

        :param start_elr: start ELR
        :type start_elr: str
        :param start_em: mileage file of the start ELR
        :type start_em: pandas.DataFrame
        :param end_elr: end ELR
        :type end_elr: str
        :param end_em: mileage file of the end ELR
        :type end_em: pandas.DataFrame
        :return: connection, in the form
            (<end mileage of the start ELR>, <start mileage of the end ELR>);
            ``('', '')`` if no connection is found
        :rtype: tuple

        **Example**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            start_elr = 'AAM'
            start_em = em.collect_mileage_file_by_elr(start_elr)['Mileage']
            end_elr = 'ANZ'
            end_em = em.collect_mileage_file_by_elr(end_elr)['Mileage']

            start_dest_mileage, end_orig_mileage = em.search_conn(
                start_elr, start_em, end_elr, end_em)

            print(start_dest_mileage)
            # 0.0396
            print(end_orig_mileage)
            # 84.1364
        """
        start_mask = start_em.apply(lambda x: x.str.contains(end_elr, case=False).any(), axis=1)
        start_temp = start_em[start_mask]

        if start_temp.empty:
            return '', ''

        # Get exact location
        row = start_temp.iloc[0]

        # Mileage of the Start ELR
        start_dest_mileage = row['Mileage']

        # Prefer the mile-chain column that is paired with the link naming the end ELR;
        # otherwise fall back to the first mile-chain column
        mile_chain_cols = [x for x in start_temp.columns if re.match(r'.*_Mile_Chain', x)]
        mile_chain_col = mile_chain_cols[0] if mile_chain_cols else None
        for col in mile_chain_cols:
            linked_elr = row.get(col.replace('_Mile_Chain', '_ELR'))
            if isinstance(linked_elr, str) and linked_elr.upper() == end_elr.upper():
                mile_chain_col = col
                break

        # Mileage of the End ELR
        end_orig_mile_chain = row[mile_chain_col] if mile_chain_col is not None else ''

        if isinstance(end_orig_mile_chain, str) and end_orig_mile_chain \
                and end_orig_mile_chain != 'Unknown':
            end_orig_mileage = mile_chain_to_nr_mileage(end_orig_mile_chain)
        else:
            # No mileage given on the start side; look the start ELR up on the end side
            end_mask = end_em.apply(lambda x: x.str.contains(start_elr, case=False).any(), axis=1)
            end_temp = end_em[end_mask]
            if not end_temp.empty:
                end_orig_mileage = end_temp.Mileage.iloc[0]
            else:
                end_orig_mileage = start_dest_mileage

        return start_dest_mileage, end_orig_mileage

    def get_conn_mileages(self, start_elr, end_elr, update=False, pickle_mileage_file=False,
                          data_dir=None, verbose=False):
        """
        Get to end and start mileages for StartELR and EndELR, respectively,
        for the connection point

        A direct connection is searched for first; failing that, every ELR linked from the
        start ELR is tried as a single intermediate ELR.

        :param start_elr: start ELR
        :type start_elr: str
        :param end_elr: end ELR
        :type end_elr: str
        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_mileage_file: whether to replace the current mileage file with newly
            collected data, defaults to ``False``
        :type pickle_mileage_file: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str, None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool, int
        :return: connection ELR and mileages between the given ``start_elr`` and ``end_elr``,
            in the form (<start_dest_mileage>, <conn_elr>, <conn_orig_mileage>,
            <conn_dest_mileage>, <end_orig_mileage>); empty strings where not applicable
        :rtype: tuple

        **Example**::

            from pyrcs.line_data import ELRMileages

            em = ELRMileages()

            start_dest_mileage, conn_elr, conn_orig_mileage, conn_dest_mileage, end_orig_mileage = \\
                em.get_conn_mileages('NAY', 'LTN2')

            print(start_dest_mileage)
            # 5.1606
            print(conn_elr)
            # NOL
            print(conn_orig_mileage)
            # 5.1606
            print(conn_dest_mileage)
            # 0.0638
            print(end_orig_mileage)
            # 123.1320
        """
        no_connection = ('', '', '', '', '')

        start_file = self.fetch_mileage_file(
            start_elr, update, pickle_mileage_file, data_dir, verbose=verbose)
        end_file = self.fetch_mileage_file(
            end_elr, update, pickle_mileage_file, data_dir, verbose=verbose)

        if start_file is None or end_file is None:
            return no_connection

        key_pat = re.compile(r'(Current\s)|(One\s)|(Later\s)|(Usual\s)')

        start_elr, end_elr = start_file['ELR'], end_file['ELR']
        start_em = self._select_current_measure(start_file['Mileage'], key_pat)
        end_em = self._select_current_measure(end_file['Mileage'], key_pat)

        start_dest_mileage, end_orig_mileage = self.search_conn(start_elr, start_em, end_elr, end_em)

        if start_dest_mileage or end_orig_mileage:
            # Direct connection
            return start_dest_mileage, '', '', '', end_orig_mileage

        link_cols = [x for x in start_em.columns if re.match(r'Link_\d_ELR.?', x)]

        for link_col in link_cols:
            linked = start_em[link_col]
            candidates = linked[linked.map(bool).astype(bool)].dropna()

            for candidate in candidates:
                conn_file = self.fetch_mileage_file(candidate, update=update)
                if conn_file is None:
                    continue

                conn_elr = conn_file['ELR']
                conn_em = self._select_current_measure(conn_file['Mileage'], key_pat)

                start_dest, conn_orig = self.search_conn(start_elr, start_em, conn_elr, conn_em)
                conn_dest, end_orig = self.search_conn(conn_elr, conn_em, end_elr, end_em)

                if not (conn_dest and end_orig):
                    continue

                if not start_dest:
                    on_link = start_em[start_em[link_col].isin([conn_elr, candidate])]
                    start_dest = on_link.Mileage.values[0]
                if not conn_orig:
                    link_col_conn = \
                        conn_em.where(conn_em == start_elr).dropna(axis=1, how='all').columns[0]
                    conn_orig = conn_em[conn_em[link_col_conn] == start_elr].Mileage.values[0]

                # Values are only reported once the whole chain has been established
                return start_dest, conn_elr, conn_orig, conn_dest, end_orig

        return no_connection