Coverage for src/download_dataset.py: 0% (39 statements)

import zipfile
from pathlib import Path
from typing import Dict

from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf


def print_extracted_files(extract_dir: Path):
    print(f"Successfully extracted to {extract_dir}")

    # Materialize the listing once: iterdir() returns a one-shot iterator,
    # so the count below would otherwise run on an already-exhausted iterator
    # and always report zero.
    extracted_files = list(Path(extract_dir).iterdir())
    print("Extracted files:")
    for extracted_file in extracted_files[:5]:
        print(f"- {extracted_file.stem}")
    if len(extracted_files) > 5:
        print(f"... and {len(extracted_files) - 5} more files")


def extract_files(file_path: str, extract_dir: Path, main_subfolders: Dict):
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        # Get the list of all members in the zip
        image_file_list = zip_ref.namelist()

        # Extract all files, rewriting their paths
        for image_file in image_file_list:
            # Skip directory entries; only regular files are written out
            if image_file.endswith("/"):
                continue

            # Read the member and strip the archive's top-level imagery
            # folder so files land directly under extract_dir
            source = zip_ref.read(image_file)
            target_path = extract_dir / Path(image_file).relative_to(main_subfolders["aerial_imagery"])

            # Create parent directories if they don't exist
            target_path.parent.mkdir(parents=True, exist_ok=True)

            with open(target_path, "wb") as f:
                f.write(source)

    print_extracted_files(extract_dir)
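

# A minimal sketch of the path rewrite performed in extract_files, under the
# hypothetical assumption that the archive's top-level folder matches
# main_subfolders["aerial_imagery"]; the directory and member names below are
# placeholders, not values from the real dataset:
#
#     extract_dir = Path("data/Quercus_petraea")
#     main_subfolders = {"aerial_imagery": "imagery"}
#     member = "imagery/tile_0001.tif"
#     extract_dir / Path(member).relative_to(main_subfolders["aerial_imagery"])
#     # -> data/Quercus_petraea/tile_0001.tif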


def download_data(species_folders: Dict, main_subfolders: Dict, dataset_folder: Path):
    """Download the requested species archives of the PureForest dataset from the Hugging Face Hub and extract them."""
    for filename in species_folders:
        print(f"\nProcessing {species_folders[filename]}...")

        extract_dir = Path(dataset_folder) / filename
        # Skip species whose directory already contains files
        if extract_dir.exists() and any(extract_dir.iterdir()):
            print(
                f"\nSkipping {species_folders[filename]}: already downloaded and "
                f"extracted in {extract_dir}. Remove the directory manually to force a re-download."
            )
            continue

        # Download only if the archive is not already in the local cache
        # (the hub client verifies the cached file against the remote hash)
        file_path = hf_hub_download(
            repo_id="IGNF/PureForest",
            filename=species_folders[filename],
            repo_type="dataset",
            local_files_only=False,  # check the cache, fall back to downloading
            force_download=False,  # re-download only if the hash changed
        )

        # Create a directory for the extracted files
        extract_dir.mkdir(exist_ok=True, parents=True)

        try:
            extract_files(file_path, extract_dir, main_subfolders)
        except zipfile.BadZipFile:
            print(f"Error: {filename} is not a valid zip file")


if __name__ == "__main__":
    config = OmegaConf.load("src/config.yaml")

    download_data(config.dataset.species_folders, config.dataset.main_subfolders, config.dataset.folder)
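

# A minimal sketch of the src/config.yaml layout this script expects; the key
# names mirror the attribute accesses above, while the species name, archive
# filename, and folder values are hypothetical placeholders:
#
#     dataset:
#       folder: data/PureForest
#       species_folders:
#         Quercus_petraea: Quercus_petraea.zip
#       main_subfolders:
#         aerial_imagery: imagery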