Source code for tick.dataset.download_helper

# License: BSD 3 clause

"""Helper to download and cache datasets from the tick_datasets github
repository

Inspired from
https://github.com/scikit-learn/scikit-learn/blob/14031f6/sklearn/datasets/twenty_newsgroups.py
"""
import logging
from urllib.request import urlopen

import os

import shutil

import numpy as np
from sklearn.datasets import load_svmlight_file
import math

import warnings

logger = logging.getLogger(__name__)

BASE_URL = ("https://raw.githubusercontent.com/X-DataInitiative/tick-datasets"
            "/master/%s")

_TICK_HOME_ENV = 'TICK_DATASETS'
_TICK_DEFAULT_HOME = os.path.join('~', 'tick_datasets')


def download_dataset(dataset_url, dataset_path, data_home=None, verbose=True):
    """Downloads dataset from given URL and stores it locally

    Parameters
    ----------
    dataset_url : `str`
        URL of the dataset, for example
        "https://archive.ics.uci.edu/ml/machine-learning-databases/url/url_svmlight.tar.gz"
    dataset_path : `str`
        Path at which the dataset will be saved in the `data_home` folder
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.
    verbose : `bool`, default=True
        If True, download progress bar will be printed
    Returns
    -------
    cache_path : `str`
        File path of the downloaded data
    """
    data_home = get_data_home(data_home)
    cache_path = os.path.join(data_home, dataset_path)
    cache_dir = os.path.dirname(cache_path)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    if verbose:
        logger.warning("Downloading dataset from %s", dataset_url)
    opener = urlopen(dataset_url)
    chunk_size = 4096
    with open(cache_path, 'wb') as f:
        n_chunks = 0
        file_size = opener.length
        last_percent = -1
        while True:
            data = opener.read(chunk_size)
            if data:
                percent = chunk_size * n_chunks / file_size
                if verbose:
                    progress_bar(percent, length=file_size,
                                 last_progress=last_percent)
                    last_percent = percent
                f.write(data)
                n_chunks += 1
            else:
                if verbose:
                    progress_bar(1, length=file_size)
                break

    return cache_path


def download_tick_dataset(dataset_path, data_home=None, verbose=True):
    """Downloads dataset from tick_datasets github repository and stores it
    locally
    Parameters
    ----------
    dataset_path : `str`
        Dataset path on tick_datasets github repository. For example
        "binary/adult/adult.trn.bz2" for adult train dataset
    data_home : `str`, optional, default=None
        Specify a download and cache folder for the datasets. If None
        and not configured with TICK_DATASETS environement variable
        all tick datasets are stored in '~/tick_datasets' subfolders.
    verbose : `bool`, default=True
        If True, download progress bar will be printed
    Returns
    -------
    cache_path : `str`
        File path of the downloaded data
    """
    dataset_url = BASE_URL % dataset_path
    download_dataset(dataset_url, dataset_path, data_home=data_home,
                     verbose=verbose)


[docs]def fetch_tick_dataset(dataset_path, data_home=None, n_features=None, verbose=True): """Fetch dataset from tick_datasets github repository. Uses cache if this dataset has already been downloaded. Parameters ---------- dataset_path : `str` Dataset path on tick_datasets github repository. For example "binary/adult/adult.trn.bz2" for adult train dataset data_home : `str`, optional, default=None Specify a download and cache folder for the datasets. If None and not configured with TICK_DATASETS environement variable all tick datasets are stored in '~/tick_datasets' subfolders. n_features : `int`, optional, default=None The number of features to use. If None, it will be inferred. This argument is useful to load several files that are subsets of a bigger sliced dataset: each subset might not have examples of every feature, hence the inferred shape might vary from one slice to another. verbose : `bool`, default=True If True, download progress bar will be printed Returns ------- output : `np.ndarray` or `dict` or `tuple` Dataset. Its format will depend on queried dataset. """ data_home = get_data_home(data_home) cache_path = os.path.join(data_home, dataset_path) dataset = None if os.path.exists(cache_path): try: dataset = load_dataset(dataset_path, data_home=data_home, n_features=n_features) except Exception as e: print(80 * '_') print('Cache loading failed') print(80 * '_') print(e) if dataset is None: download_tick_dataset(dataset_path, data_home=data_home, verbose=verbose) dataset = load_dataset(dataset_path, data_home=data_home, n_features=n_features) return dataset
def load_dataset(dataset_path, data_home=None, n_features=None): """Load dataset from given path Parameters ---------- dataset_path : `str` Dataset relative path data_home : `str`, optional, default=None Specify a download and cache folder for the datasets. If None and not configured with TICK_DATASETS environement variable all tick datasets are stored in '~/tick_datasets' subfolders. n_features : `int`, optional, default=None The number of features to use. If None, it will be inferred. This argument is useful to load several files that are subsets of a bigger sliced dataset: each subset might not have examples of every feature, hence the inferred shape might vary from one slice to another. Returns ------- output : `np.ndarray` or `dict` or `tuple` Dataset. Its format will depend on queried dataset. """ data_home = get_data_home(data_home) cache_path = os.path.join(data_home, dataset_path) if cache_path.endswith(".npz"): dataset = np.load(cache_path, allow_pickle=True) # If we have only one numpy array we return it directly otherwise # we return the row dictionary if len(dataset.keys()) == 1: key_0 = dataset.keys()[0] dataset = dataset[key_0] else: dataset = dataset.items() else: dataset = load_svmlight_file(cache_path, n_features=n_features) return dataset def get_data_home(data_home=None): """Return the path of the tick data dir. This folder is used by some large dataset loaders to avoid downloading the data several times. By default the data dir is set to a folder named 'tick_datasets' in the user home folder. Alternatively, it can be set by the 'TICK_DATASETS' environment variable or programmatically by giving an explicit folder path. The '~' symbol is expanded to the user home folder. If the folder does not already exist, it is automatically created. """ if data_home is None: if _TICK_HOME_ENV in os.environ: data_home = os.environ[_TICK_HOME_ENV] else: data_home = _TICK_DEFAULT_HOME warnings.warn('{} environment variable was not set. Saving dataset' ' to the default location {}' .format(_TICK_HOME_ENV, _TICK_DEFAULT_HOME)) data_home = os.path.expanduser(data_home) if not os.path.exists(data_home): os.makedirs(data_home) return data_home def clear_dataset(dataset_path, data_home=None): """Clear dataset from cache folder """ data_home = get_data_home(data_home) cache_path = os.path.join(data_home, dataset_path) if os.path.exists(cache_path): os.remove(cache_path) def clear_data_home(data_home=None): """Delete all the content of the data home cache. """ data_home = get_data_home(data_home) shutil.rmtree(data_home) def convert_size(size_bytes): """Convert raw bytes into human readable size References ---------- http://stackoverflow.com/questions/5194057/better-way-to-convert-file-sizes-in-python """ if size_bytes == 0: return '0B' size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) s = round(size_bytes / p, 2) return '%s %s' % (s, size_name[i]) def progress_bar(progress, width=40, length=None, last_progress=None): """Print progress bar to sys.stdout Parameters ---------- progress : `float` Reached progress between 0 (just started) and 1 (finished) width : `int` Total width of the progress bar length : `int` Size in bytes of the downloaded file """ if length: size = "(%s)" % convert_size(length) else: size = '' n_bars = int(progress * width) if last_progress is not None and n_bars == int(last_progress * width): return bar = "[%s%s]" % ("=" * n_bars, " " * (width - n_bars)) print("\r%s %s %2d%%" % (size, bar, progress * 100), flush=True, end="") if progress == 1: print()