Source code for bioimageloader.collections._bbbc041

import json
import os.path
from functools import cached_property
from pathlib import Path
from typing import List, Optional, Sequence, Union

import albumentations
import numpy as np
from skimage.util import img_as_float32

from ..base import Dataset
from ..utils import imread_asarray


class BBBC041(Dataset):
    """P. vivax (malaria) infected human blood smears [1]_

    Images are in .png or .jpg format. There are 3 sets of images consisting
    of 1364 images (~80,000 cells) with different researchers having prepared
    each one: from Brazil (Stefanie Lopes), from Southeast Asia (Benoit
    Malleret), and time course (Gabriel Rangel). Blood smears were stained
    with Giemsa reagent.

    These images were contributed by Jane Hung of MIT and the Broad Institute
    in Cambridge, MA. [1]_

    There is also a GitHub repository that lists malaria parasite imaging
    datasets (blood smears) [2]_.

    Parameters
    ----------
    root_dir : str
        Path to root directory
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines augmentation
        in sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    grayscale : bool, default: False
        Convert images to grayscale
    grayscale_mode : {'cv2', 'equal', Sequence[float]}, default: 'cv2'
        How to convert to grayscale. If set to 'cv2', it follows opencv
        implementation. Else if set to 'equal', it sums up values along
        channel axis, then divides it by the number of expected channels.
    training : bool, default: True
        Load training set if True, else load testing one
    **kwargs
        Accepted for signature compatibility; not used by this constructor.

    Notes
    -----
    - Label categories: all 7 cats, ['difficult', 'gametocyte', 'leukocyte',
      'red blood cell', 'ring', 'schizont', 'trophozoite']
    - 1208/120 training/test split (1328 in total). So not 1364 images as
      written in the description.
    - png and jpg extension; training images are in RGB space in PNG format,
      while test images are in YUV space in JPEG format.
    - YUV will be automatically detected when read and cast to RGB
    - Two resolutions; depending on training/test: (1200, 1600) for training,
      (1383, 1944) for test

    References
    ----------
    .. [1] https://bbbc.broadinstitute.org/BBBC041
    .. [2] https://github.com/tobsecret/Awesome_Malaria_Parasite_Imaging_Datasets

    See Also
    --------
    Dataset : Super class
    DatasetInterface : Interface
    """

    # Dataset's acronym
    acronym = 'BBBC041'

    def __init__(
        self,
        root_dir: str,
        *,
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        grayscale: bool = False,
        grayscale_mode: Union[str, Sequence[float]] = 'cv2',
        # specific to this dataset
        training: bool = True,
        **kwargs
    ):
        # Settings are stored on underscore-prefixed attributes; the public
        # accessors (e.g. ``self.root_dir`` used below) are not defined in
        # this class and are presumably provided by ``Dataset`` -- confirm.
        self._root_dir = root_dir
        self._transforms = transforms
        self._num_samples = num_samples
        self._grayscale = grayscale
        self._grayscale_mode = grayscale_mode
        # specific to this one here
        self.training = training

    def get_image(self, p: Path) -> np.ndarray:
        """Read the image file at ``p`` and return it converted to float32."""
        img = imread_asarray(p)
        return img_as_float32(img)

    @cached_property
    def file_list(self) -> List[Path]:
        """Image paths, one per metadata entry, resolved under ``root_dir``.

        Computed once and cached on the instance.
        """
        root_dir = self.root_dir
        # Path names in the json start with '/'; strip it so that joining
        # keeps them relative to ``root_dir`` instead of replacing it.
        return [root_dir / name.lstrip('/') for name in self.ids]

    @cached_property
    def ids(self) -> List[str]:
        """The ``['image']['pathname']`` value of every metadata entry."""
        return [d['image']['pathname'] for d in self.metadata]

    @cached_property
    def metadata(self) -> List[dict]:
        """Parsed content of ``training.json`` or ``test.json``.

        The file is selected by ``self.training`` and read from
        ``self.root_dir``. Loaded once and cached on the instance.
        """
        name = 'training.json' if self.training else 'test.json'
        p = self.root_dir / name
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(p, 'r', encoding='utf-8') as f:
            return json.load(f)