Source code for trk_diagr

"""
Collect British `railway track diagrams <http://www.railwaycodes.org.uk/track/diagrams0.shtm>`_.
"""

import copy
import os
import urllib.parse

import bs4
import pandas as pd
import requests
from pyhelpers.dir import cd, validate_input_data_dir
from pyhelpers.ops import fake_requests_headers
from pyhelpers.store import load_pickle, save_pickle

from pyrcs.utils import cd_dat, confirmed, get_last_updated_date, homepage_url, \
    print_conn_err, is_internet_connected, print_connection_error


class TrackDiagrams:
    """
    A class for collecting British railway track diagrams.

    :param data_dir: name of data directory, defaults to ``None``
    :type data_dir: str or None
    :param verbose: whether to print relevant information in console as the function runs,
        defaults to ``True``
    :type verbose: bool or int

    **Example**::

        >>> from pyrcs.line_data import TrackDiagrams

        >>> td = TrackDiagrams()

        >>> print(td.Name)
        Railway track diagrams (some samples)

        >>> print(td.SourceURL)
        http://www.railwaycodes.org.uk/track/diagrams0.shtm
    """

    def __init__(self, data_dir=None, verbose=True):
        """
        Set up names, source URLs, the last-updated date and the data directories.
        """
        # Only a notice is printed when offline; construction carries on regardless.
        if not is_internet_connected():
            print_connection_error(verbose=verbose)

        self.Name = 'Railway track diagrams (some samples)'
        self.Key = 'Track diagrams'

        self.HomeURL = homepage_url()
        self.SourceURL = urllib.parse.urljoin(self.HomeURL, '/track/diagrams0.shtm')

        self.LUDKey = 'Last updated date'
        self.Date = get_last_updated_date(url=self.SourceURL, parsed=True, as_date_type=False)

        if data_dir:
            self.DataDir = validate_input_data_dir(data_dir)
        else:
            self.DataDir = cd_dat("line-data", self.Key.lower().replace(" ", "-"))

        # Starts as a copy of DataDir; fetch_sample_catalogue() may repoint it.
        self.CurrentDataDir = copy.copy(self.DataDir)

    def _pickle_filename(self):
        """
        Make the filename of the pickle file that stores the sample catalogue.

        :return: ``self.Key`` in lower case, with spaces replaced by hyphens,
            followed by ``".pickle"``
        :rtype: str

        :meta private:
        """

        return self.Key.lower().replace(" ", "-") + ".pickle"

    def _cdd_td(self, *sub_dir, **kwargs):
        """
        Change directory to package data directory and sub-directories (and/or a file).

        The directory for this module: ``"\\dat\\line-data\\track-diagrams"``.

        :param sub_dir: sub-directory or sub-directories (and/or a file)
        :type sub_dir: str
        :param kwargs: optional parameters of
            `os.makedirs <https://docs.python.org/3/library/os.html#os.makedirs>`_,
            e.g. ``mode=0o777``
        :return: path to the backup data directory for track diagrams
            (joined with ``sub_dir``, if given)
        :rtype: str

        :meta private:
        """

        path = cd(self.DataDir, *sub_dir, mkdir=True, **kwargs)

        return path

    def get_track_diagrams_items(self, update=False, verbose=False):
        """
        Get catalogue of track diagrams.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool or int
        :return: catalogue of track diagrams, i.e. ``{self.Key: <set of section headings>}``;
            ``None`` if the page cannot be parsed, or if the connection fails and
            no local copy of the catalogue exists
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import TrackDiagrams

            >>> td = TrackDiagrams()

            >>> track_diagrams_items = td.get_track_diagrams_items()

            >>> type(track_diagrams_items)
            <class 'dict'>
            >>> print(list(track_diagrams_items.keys())[0])
            Track diagrams
        """

        # NOTE: the catalogue file carries a '.json' suffix although it is read and written
        # with the pickle helpers; the name is kept so previously saved files are still found.
        cat_json = '-'.join(
            x for x in urllib.parse.urlparse(self.SourceURL).path.replace(
                '.shtm', '.json').split('/') if x)
        path_to_cat = cd_dat("catalogue", cat_json)

        if os.path.isfile(path_to_cat) and not update:
            return load_pickle(path_to_cat)

        if verbose == 2:
            print("Collecting a list of {} items".format(self.Key.lower()), end=" ... ")

        try:
            source = requests.get(self.SourceURL, headers=fake_requests_headers())

        except requests.exceptions.ConnectionError:
            if verbose == 2:
                print("Failed. ")
            print_conn_err(update=update, verbose=verbose)

            # Fall back on the local copy, but only if there is one: this branch is also
            # reached when the catalogue file has never been saved.
            items = load_pickle(path_to_cat) if os.path.isfile(path_to_cat) else None

        else:
            try:
                soup = bs4.BeautifulSoup(source.text, 'lxml')
                # Headings of the page sections (<h3> elements without a 'class' attribute)
                h3 = {x.get_text(strip=True)
                      for x in soup.find_all('h3', text=True, attrs={'class': None})}
                items = {self.Key: h3}

                if verbose == 2:
                    print("Done. ")

                save_pickle(items, path_to_cat, verbose=verbose)

            except Exception as e:
                print("Failed. {}".format(e))
                items = None

        return items

    def collect_sample_catalogue(self, confirmation_required=True, verbose=False):
        """
        Collect catalogue of sample railway track diagrams from source web page.

        :param confirmation_required: whether to require users to confirm and proceed,
            defaults to ``True``
        :type confirmation_required: bool
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool or int
        :return: catalogue of sample railway track diagrams and
            date of when the data was last updated;
            ``None`` if the collection is not confirmed or fails
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import TrackDiagrams

            >>> td = TrackDiagrams()

            >>> track_diagrams_catalog = td.collect_sample_catalogue()
            To collect the catalogue of sample track diagrams? [No]|Yes: yes

            >>> type(track_diagrams_catalog)
            <class 'dict'>
            >>> print(list(track_diagrams_catalog.keys()))
            ['Track diagrams', 'Last updated date']
        """

        if not confirmed("To collect the catalogue of sample {}?".format(self.Key.lower()),
                         confirmation_required=confirmation_required):
            return None

        if verbose == 2:
            print("Collecting the catalogue of sample {}".format(self.Key.lower()),
                  end=" ... ")

        track_diagrams_catalogue = None

        try:
            source = requests.get(self.SourceURL, headers=fake_requests_headers())

        except requests.exceptions.ConnectionError:
            if verbose == 2:
                print("Failed. ")
            print_conn_err(verbose=verbose)

        else:
            try:
                track_diagrams_catalogue_ = {}

                soup = bs4.BeautifulSoup(source.text, 'lxml')

                # Walk through the section headings (<h3> without a 'class' attribute)
                h3 = soup.find('h3', text=True, attrs={'class': None})
                while h3:
                    # The 'Miscellaneous' section is handled differently from the others
                    is_misc = h3.text == 'Miscellaneous'

                    # Description
                    if is_misc:
                        desc = [x.text for x in h3.find_next_siblings('p')]
                    else:
                        desc = h3.find_next_sibling('p').text.replace('\xa0', '')

                    # Extract details
                    cold_soup = h3.find_next('div', attrs={'class': 'columns'})
                    if cold_soup:
                        info = [x.text for x in cold_soup.find_all('p')
                                if x.string != '\xa0']
                        urls = [urllib.parse.urljoin(self.SourceURL, a.get('href'))
                                for a in cold_soup.find_all('a')]
                    else:
                        # No 'columns' block: follow the chain of <a> elements instead
                        cold_soup = h3.find_next('a', attrs={'target': '_blank'})
                        info, urls = [], []
                        while cold_soup:
                            info.append(cold_soup.text)
                            urls.append(
                                urllib.parse.urljoin(self.SourceURL, cold_soup['href']))
                            if is_misc:
                                cold_soup = cold_soup.find_next('a')
                            else:
                                cold_soup = cold_soup.find_next_sibling('a')

                    meta = pd.DataFrame(zip(info, urls), columns=['Description', 'FileURL'])

                    track_diagrams_catalogue_[h3.text] = (desc, meta)

                    h3 = h3.find_next_sibling('h3')

                track_diagrams_catalogue = {self.Key: track_diagrams_catalogue_,
                                            self.LUDKey: self.Date}

                if verbose == 2:
                    print("Done. ")

                # The catalogue is assigned before saving, so it is still returned
                # even if writing the pickle file fails.
                path_to_pickle = self._cdd_td(self._pickle_filename())
                save_pickle(track_diagrams_catalogue, path_to_pickle, verbose=verbose)

            except Exception as e:
                print("Failed. {}".format(e))

        return track_diagrams_catalogue

    def fetch_sample_catalogue(self, update=False, pickle_it=False, data_dir=None,
                               verbose=False):
        """
        Fetch catalogue of sample railway track diagrams from local backup.

        :param update: whether to check on update and proceed to update the package data,
            defaults to ``False``
        :type update: bool
        :param pickle_it: whether to replace the current package data
            with newly collected data, defaults to ``False``;
            it only takes effect when ``data_dir`` is also given
        :type pickle_it: bool
        :param data_dir: name of package data folder, defaults to ``None``
        :type data_dir: str or None
        :param verbose: whether to print relevant information in console as the function runs,
            defaults to ``False``
        :type verbose: bool or int
        :return: catalogue of sample railway track diagrams and
            date of when the data was last updated;
            ``None`` if nothing could be collected and no local backup exists
        :rtype: dict or None

        **Example**::

            >>> from pyrcs.line_data import TrackDiagrams

            >>> td = TrackDiagrams()

            >>> track_diagrams_catalog = td.fetch_sample_catalogue()

            >>> td_dat = track_diagrams_catalog['Track diagrams']

            >>> type(td_dat)
            <class 'dict'>
            >>> print(list(td_dat.keys()))
            ['Main line diagrams', 'Tram systems', 'London Underground', 'Miscellaneous']
        """

        pickle_filename = self._pickle_filename()
        path_to_pickle = self._cdd_td(pickle_filename)

        if os.path.isfile(path_to_pickle) and not update:
            return load_pickle(path_to_pickle)

        # Collection messages are silenced when a custom data_dir is given
        verbose_ = False if data_dir or not verbose else (2 if verbose == 2 else True)

        track_diagrams_catalogue = self.collect_sample_catalogue(
            confirmation_required=False, verbose=verbose_)

        if track_diagrams_catalogue:
            if pickle_it and data_dir:
                self.CurrentDataDir = validate_input_data_dir(data_dir)
                path_to_pickle = os.path.join(self.CurrentDataDir, pickle_filename)
                save_pickle(track_diagrams_catalogue, path_to_pickle, verbose=verbose)

        else:
            print("No data of the sample {} catalogue "
                  "has been freshly collected.".format(self.Key.lower()))
            # This branch is reached when the backup is missing or an update was requested,
            # so the backup may not exist; only load it when it does.
            if os.path.isfile(path_to_pickle):
                track_diagrams_catalogue = load_pickle(path_to_pickle)
            else:
                track_diagrams_catalogue = None

        return track_diagrams_catalogue