Source code for sndata.utils.downloads

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""The ``downloads`` module supports downloading data files in various file
formats.
"""

import sys
import tarfile
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import IO, Union

import requests
from tqdm import tqdm


def download_file(
        url: str,
        destination: Union[str, Path, IO] = None,
        force: bool = False,
        timeout: float = 15,
        verbose: bool = True):
    """Download content from a url to a file

    If ``destination`` is a path that already exists, skip the download
    unless ``force`` is also ``True``. Missing parent directories of a path
    destination are created before downloading. If the download fails, the
    partially written file at that path is removed so a later call does not
    mistake it for a finished download.

    If ``destination`` is a file object, the content is written to it and
    the object is left open for the caller to manage.

    Args:
        url: URL of the file to download
        destination: Path or file object to download to. Despite the
            ``None`` default a value is required; ``None`` fails when the
            content is written.
        force: Re-Download locally available data (Default: False)
        timeout: Seconds before raising timeout error (Default: 15)
        verbose: Print status to stdout

    Raises:
        requests.HTTPError: If the server responds with an error status
    """

    if not isinstance(destination, (str, Path)):
        _write_url_content(url, destination, timeout, verbose)
        return

    path = Path(destination)
    if path.exists() and not force:
        return

    path.parent.mkdir(exist_ok=True, parents=True)
    out_file = path.open('wb')
    try:
        _write_url_content(url, out_file, timeout, verbose)

    except BaseException:
        # Close before unlinking (required on some platforms) and drop the
        # incomplete file so the exists-check above cannot treat it as done
        out_file.close()
        path.unlink()
        raise

    else:
        out_file.close()


def _write_url_content(url: str, out_file: IO, timeout: float, verbose: bool):
    """Fetch ``url`` and write the response body to an open binary file"""

    if not verbose:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        out_file.write(response.content)
        return

    tqdm.write(f'Fetching {url}', file=sys.stdout)

    # The context manager releases the streamed connection even when the
    # download is interrupted part way through
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail on HTTP errors instead of saving the error page as data
        response.raise_for_status()

        # Servers may omit content-length, in which case the total is 0
        total = int(response.headers.get('content-length', 0))
        chunk_size = 1024
        with tqdm(total=total, unit='B', unit_scale=True,
                  unit_divisor=chunk_size, file=sys.stdout) as pbar:
            for data in response.iter_content(chunk_size=chunk_size):
                pbar.update(out_file.write(data))


def download_tar(
        url: str,
        out_dir: Union[str, Path],
        mode: str = 'r:gz',
        force: bool = False,
        timeout: float = 15,
        skip_exists: Union[str, Path] = None
):
    """Download a tar archive and extract it to a given output directory

    The archive is downloaded to a temporary file and extracted member by
    member. ``out_dir`` is created if it does not exist.

    Args:
        url: URL of the file to download
        out_dir: The directory to extract file contents to
        mode: Mode used to open the archive (Default: r:gz)
        force: Download even if ``skip_exists`` exists (Default: False)
        timeout: Seconds before raising timeout error (Default: 15)
        skip_exists: Optionally skip the download if given path exists
    """

    out_dir = Path(out_dir)

    # Skip the download if the marker path already exists
    if skip_exists and Path(skip_exists).exists() and not force:
        return

    # The archive comes from the network, so use the ``data`` extraction
    # filter where this Python version provides it. It rejects members that
    # would be written outside of ``out_dir``. Older interpreters without
    # ``tarfile.data_filter`` extract unfiltered, as before.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    # Download data to file and decompress
    with NamedTemporaryFile() as temp_file:
        download_file(url, destination=temp_file, timeout=timeout)

        # Writing to the file moves us to the end of the file
        # We move back to the beginning so we can decompress the data
        temp_file.seek(0)

        out_dir.mkdir(parents=True, exist_ok=True)
        with tarfile.open(fileobj=temp_file, mode=mode) as data_archive:
            for member in data_archive:
                try:
                    data_archive.extract(
                        member, path=out_dir, **extract_kwargs)

                except IOError:
                    # If output path already exists, delete it and try again
                    (out_dir / member.name).unlink()
                    data_archive.extract(
                        member, path=out_dir, **extract_kwargs)