Source code for sndata.utils.downloads

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""The ``downloads`` module supports downloading data files in various file
formats.
"""

import sys
import tarfile
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import IO, Union

import requests
from tqdm import tqdm


def download_file(
        url: str,
        destination: Union[str, Path, IO] = None,
        force: bool = False,
        timeout: float = 15,
        verbose: bool = True):
    """Download content from a url to a file

    If ``destination`` is a path but already exists, skip the download
    unless ``force`` is also ``True``.

    Args:
        url: URL of the file to download
        destination: Path or file object to download to
        force: Re-Download locally available data (Default: False)
        timeout: Seconds before raising timeout error (Default: 15)
        verbose: Print status to stdout

    Raises:
        requests.HTTPError: If the server responds with an error status code
    """

    destination_is_path = isinstance(destination, (str, Path))
    if destination_is_path:
        path = Path(destination)
        # Skip the download when the file is already cached locally
        if (not force) and path.exists():
            return

        path.parent.mkdir(exist_ok=True, parents=True)
        destination = path.open('wb')

    try:
        if verbose:
            tqdm.write(f'Fetching {url}', file=sys.stdout)

            # Stream the download so large files never sit fully in memory.
            # The context manager guarantees the connection is released.
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Bug fix: the original verbose branch never checked the
                # response status, silently saving HTTP error pages to disk
                response.raise_for_status()

                total = int(response.headers.get('content-length', 0))
                chunk_size = 1024
                with tqdm(total=total, unit='B', unit_scale=True,
                          unit_divisor=chunk_size, file=sys.stdout) as pbar:
                    for data in response.iter_content(chunk_size=chunk_size):
                        # ``write`` returns the number of bytes written,
                        # which doubles as the progress increment
                        pbar.update(destination.write(data))

        else:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            # Bug fix: the original wrote ``response.content`` twice here,
            # duplicating the downloaded payload in the output file
            destination.write(response.content)

    finally:
        # Only close handles we opened ourselves; caller-supplied file
        # objects remain the caller's responsibility
        if destination_is_path:
            destination.close()
def download_tar(
        url: str,
        out_dir: str,
        mode: str = 'r:gz',
        force: bool = False,
        timeout: float = 15,
        skip_exists: str = None
):
    """Download and unzip a .tar.gz file to a given output directory

    Args:
        url: URL of the file to download
        out_dir: The directory to unzip file contents to
        mode: Compression mode (Default: r:gz)
        force: Re-Download locally available data (Default: False)
        timeout: Seconds before raising timeout error (Default: 15)
        skip_exists: Optionally skip the download if given path exists
    """

    output_path = Path(out_dir)

    # Nothing to do when the caller's sentinel path is already on disk
    # (unless a re-download was explicitly forced)
    if skip_exists and Path(skip_exists).exists() and not force:
        return

    # Fetch the archive into a throwaway file, then decompress it
    with NamedTemporaryFile() as archive_file:
        download_file(url, destination=archive_file, timeout=timeout)

        # The download leaves the cursor at the end of the file;
        # rewind so the archive can be read from the start
        archive_file.seek(0)

        output_path.mkdir(parents=True, exist_ok=True)
        with tarfile.open(fileobj=archive_file, mode=mode) as archive:
            # NOTE(review): members are extracted without path sanitization,
            # so a crafted archive could write outside ``out_dir``
            # (CVE-2007-4559) — consider tarfile's ``filter='data'``
            for member in archive:
                try:
                    archive.extract(member, path=output_path)

                except IOError:
                    # If output path already exists, delete it and try again
                    (output_path / member.name).unlink()
                    archive.extract(member, path=output_path)