Coverage for src/download_dataset.py: 0%

39 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-07-01 17:37 +0000

1import zipfile 

2from pathlib import Path 

3from huggingface_hub import hf_hub_download 

4 

5from typing import Dict 

6from omegaconf import OmegaConf 

7 

8 

def print_extracted_files(extract_dir: Path):
    """Print a short summary of the files extracted into *extract_dir*.

    Lists up to the first five entries by stem; if more exist, prints how
    many additional files there are instead of listing them all.
    """
    print(f"Successfully extracted to {extract_dir}")

    # Materialize the directory listing ONCE. Path.iterdir() returns a
    # one-shot generator: the original code consumed it for the first five
    # entries and then called len(list(...)) on the exhausted generator,
    # which is always 0 — so the "... and N more files" line never printed.
    extracted_files = list(Path(extract_dir).iterdir())
    print("Extracted files:")
    for extracted_file in extracted_files[:5]:
        print(f"- {extracted_file.stem}")
    if len(extracted_files) > 5:
        print(f"... and {len(extracted_files) - 5} more files")

18 

19 

def extract_files(file_path: str, extract_dir: Path, main_subfolders: Dict):
    """Extract a PureForest zip archive into *extract_dir*.

    Each member path is rewritten relative to the configured
    ``main_subfolders["aerial_imagery"]`` prefix, so the archive's top-level
    folder is stripped on disk. Prints a summary of the result.

    Raises:
        zipfile.BadZipFile: if *file_path* is not a valid zip archive.
        ValueError: if a member path is not under the configured prefix
            (from ``Path.relative_to``) — TODO confirm all members share it.
    """
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        # Extract every regular file, rewriting its path on the way out.
        for member in zip_ref.namelist():
            # Skip directory entries: they carry no data, and opening a
            # path that ends in "/" as a file would fail (or leave a bogus
            # empty regular file where a directory belongs).
            if member.endswith("/"):
                continue

            source = zip_ref.read(member)

            target_path = extract_dir / Path(member).relative_to(main_subfolders["aerial_imagery"])

            # Create parent directories as needed.
            target_path.parent.mkdir(parents=True, exist_ok=True)

            with open(target_path, "wb") as f:
                f.write(source)

    print_extracted_files(extract_dir)

39 

40 

def download_data(species_folders: Dict, main_subfolders: Dict, dataset_folder: Path):
    """Download and extract the configured PureForest archives from Hugging Face.

    Args:
        species_folders: mapping of local folder name -> archive filename in
            the ``IGNF/PureForest`` dataset repository.
        main_subfolders: mapping passed through to ``extract_files`` to strip
            the archive's top-level prefix (expects key ``"aerial_imagery"``).
        dataset_folder: root directory the archives are extracted under.

    A species whose target directory already contains files is skipped;
    remove the directory manually to force a re-download.
    """

    for filename in species_folders:
        print(f"\nProcessing {species_folders[filename]}...")

        extract_dir = Path(dataset_folder) / filename
        # Check if directory already contains files (skip if so)
        if extract_dir.exists() and any(extract_dir.iterdir()):
            print(f"\nSkipping {species_folders[filename]}: already downloaded and extracted in {extract_dir}. Remove manually for re-download")
            continue

        # hf_hub_download reuses the local cache and only re-fetches when
        # the remote hash differs.
        file_path = hf_hub_download(
            repo_id="IGNF/PureForest",
            filename=species_folders[filename],
            repo_type="dataset",
            local_files_only=False,  # Will check cache and hash
            force_download=False,  # Only download if hash changed
        )

        # Create a directory for the extracted files
        extract_dir.mkdir(exist_ok=True, parents=True)

        try:
            extract_files(file_path, extract_dir, main_subfolders)
        except zipfile.BadZipFile:
            # Name the offending archive: the original message printed a
            # literal placeholder instead of the downloaded file's path.
            print(f"Error: {file_path} is not a valid zip file")

if __name__ == "__main__":
    # Load the dataset settings and kick off the download.
    config = OmegaConf.load("src/config.yaml")

    dataset_config = config.dataset
    download_data(
        dataset_config.species_folders,
        dataset_config.main_subfolders,
        dataset_config.folder,
    )