Overview:

  1. select files in directory
  2. select a sample of files
  3. write sampled filepaths into csv
  4. zip sampled files
  5. unzip sampled files
  6. read sampled filepaths from csv
from pathlib import Path
from fastai.vision import get_image_files
import numpy as np
from zipfile import ZipFile
# Root folder: holds the dataset and receives every generated artifact.
path = Path('/home/dc/coronahack/source/nih-chest-xrays')

# Folder containing the files that may end up in the zip.
src_dir = path.joinpath('data')

# CSV listing of the sampled file paths (created/overwritten).
csv_dst = path.joinpath('nih-chest-xrays_sample-2000.csv')

# Zip archive of the sampled files (created/overwritten).
zip_dst = path.joinpath('nih-chest-xrays_sample-2000.zip')

# Folder the archive gets extracted into (created/overwritten).
unzip_dst = path.joinpath('sample-2000')

Create Zip

1. Select files in specified directory

(e.g. all image files in the directory and its subdirectories)

# Collect every image file under src_dir (subfolders included), then put the
# paths in sorted order.
files = list(get_image_files(src_dir, recurse=True))
files.sort()
# Notebook preview: total number of files and the first five paths.
len(files), files[:5]
(112120,
 [PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_001/images/00000001_000.png'),
  PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_001/images/00000001_001.png'),
  PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_001/images/00000001_002.png'),
  PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_001/images/00000002_000.png'),
  PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_001/images/00000003_000.png')])

2. Randomly sample n files from list

(optional: set seed)

# Number of files to draw. The draw is made without replacement, so n must not
# exceed len(files).
n = 2000

# Pick a fresh seed and report it, so this exact sample can be reproduced later
# by pinning the printed value below. dtype=np.int64 keeps the upper bound
# valid on platforms whose default NumPy integer is 32-bit.
seed = int(np.random.randint(0, 2**32 - 1, dtype=np.int64))
# seed = 0  # uncomment (or paste a printed seed) to pin the sample
np.random.seed(seed)
print('seed:', seed)

# Draw n distinct paths; the result is a NumPy object array of Path objects.
sample_paths = np.random.choice(files, n, replace=False)
sample_paths
array([PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_007/images/00014129_003.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_008/images/00017368_000.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_003/images/00005798_002.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_010/images/00021488_002.png'), ...,
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_004/images/00007094_000.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_009/images/00019415_001.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_002/images/00002271_000.png'),
       PosixPath('/home/dc/coronahack/source/nih-chest-xrays/data/images_002/images/00003658_005.png')], dtype=object)

3. Write the original file paths to the csv_dst file

# Notebook preview: whether the CSV already exists, together with its path.
csv_dst.exists(), csv_dst
(True,
 PosixPath('/home/dc/coronahack/source/nih-chest-xrays/nih-chest-xrays_sample-2000.csv'))
# Write the sampled paths as text, one per line. The builtin `str` replaces the
# `np.str` alias, which NumPy deprecated in 1.20 and removed in 1.24.
np.savetxt(csv_dst, sample_paths.astype(str), fmt='%s', delimiter=',')

4. Zip files in list into zip_dst file

# Notebook preview: whether the zip archive already exists, together with its path.
zip_dst.exists(), zip_dst
(True,
 PosixPath('/home/dc/coronahack/source/nih-chest-xrays/nih-chest-xrays_sample-2000.zip'))
# Open the archive for writing and add each sampled file to it.
# NOTE(review): no arcname is passed, so each entry is stored under its full
# source path (minus the leading '/'); confirm that archive layout is intended.
with ZipFile(zip_dst, mode='w') as zf:
    for fn in sample_paths:
        zf.write(fn)

Unzip files

5. Unzip files into unzip_dst folder

# Ensure the extraction folder exists: missing parents are created, and an
# already existing folder is not an error.
unzip_dst.mkdir(exist_ok=True, parents=True)
# Notebook preview: confirm the folder exists and show its path.
unzip_dst.exists(), unzip_dst
(True, PosixPath('/home/dc/coronahack/source/nih-chest-xrays/sample-2000'))
# Open the archive read-only and unpack all of its members beneath unzip_dst.
with ZipFile(zip_dst, mode='r') as zf:
    # zf.printdir() # print zip contents
    zf.extractall(path=unzip_dst)

6. Load the original file paths from the csv_dst file

# Notebook preview: confirm the CSV exists before reading it, and show its path.
csv_dst.exists(), csv_dst
(True,
 PosixPath('/home/dc/coronahack/source/nih-chest-xrays/nih-chest-xrays_sample-2000.csv'))
# Read the stored paths back as an array of strings. The builtin `str` replaces
# the `np.str` alias, which NumPy deprecated in 1.20 and removed in 1.24.
np.loadtxt(csv_dst, dtype=str, delimiter=',')
array(['/home/dc/coronahack/source/nih-chest-xrays/data/images_007/images/00014129_003.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_008/images/00017368_000.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_003/images/00005798_002.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_010/images/00021488_002.png', ...,
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_004/images/00007094_000.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_009/images/00019415_001.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_002/images/00002271_000.png',
       '/home/dc/coronahack/source/nih-chest-xrays/data/images_002/images/00003658_005.png'], dtype='<U82')