import json
import os.path
from functools import cached_property
from pathlib import Path
from typing import List, Optional, Sequence, Union
import albumentations
import numpy as np
from skimage.util import img_as_float32
from ..base import Dataset
from ..utils import imread_asarray
class BBBC041(Dataset):
    """P. vivax (malaria) infected human blood smears [1]_

    Images are in .png or .jpg format. There are 3 sets of images consisting
    of 1364 images (~80,000 cells) with different researchers having prepared
    each one: from Brazil (Stefanie Lopes), from Southeast Asia (Benoit
    Malleret), and time course (Gabriel Rangel). Blood smears were stained
    with Giemsa reagent.

    These images were contributed by Jane Hung of MIT and the Broad Institute
    in Cambridge, MA. [1]_

    There is also a GitHub repository that lists malaria parasite imaging
    datasets (blood smears) [2]_.

    Parameters
    ----------
    root_dir : str
        Path to root directory
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines augmentation
        in sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    grayscale : bool, default: False
        Convert images to grayscale
    grayscale_mode : {'cv2', 'equal', Sequence[float]}, default: 'cv2'
        How to convert to grayscale. If set to 'cv2', it follows opencv
        implementation. Else if set to 'equal', it sums up values along
        channel axis, then divides it by the number of expected channels.
    training : bool, default: True
        Load training set if True, else load testing one
    **kwargs
        Accepted but not used by this class.

    Notes
    -----
    - Label categories: all 7 cats, ['difficult', 'gametocyte', 'leukocyte',
      'red blood cell', 'ring', 'schizont', 'trophozoite']
    - 1208/120 training/test split, i.e. 1328 images in total, not the 1364
      images written in the description.
    - png and jpg extension; training images are in RGB space in PNG format,
      while test images are in YUV space in JPEG format.
    - YUV will be automatically detected when read and cast to RGB
    - Two resolutions; depending on training/test: (1200, 1600) for training,
      (1383, 1944) for test

    References
    ----------
    .. [1] https://bbbc.broadinstitute.org/BBBC041
    .. [2] https://github.com/tobsecret/Awesome_Malaria_Parasite_Imaging_Datasets

    See Also
    --------
    Dataset : Base class
    MaskDataset
    DatasetInterface : Interface
    """
    # Dataset's acronym
    acronym = 'BBBC041'

    def __init__(
        self,
        root_dir: str,
        *,
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        grayscale: bool = False,
        grayscale_mode: Union[str, Sequence[float]] = 'cv2',
        # specific to this dataset
        training: bool = True,
        **kwargs
    ):
        # Options are only stored here; nothing is read from disk until
        # ``metadata``/``ids``/``file_list`` is first accessed.
        self._root_dir = root_dir
        self._transforms = transforms
        self._num_samples = num_samples
        self._grayscale = grayscale
        self._grayscale_mode = grayscale_mode
        # specific to this one here
        self.training = training

    def get_image(self, p: Path) -> np.ndarray:
        """Read the image at ``p`` and return it converted to float32.

        Parameters
        ----------
        p : Path
            Path to an image file

        Returns
        -------
        np.ndarray
            Output of ``imread_asarray`` passed through ``img_as_float32``
        """
        return img_as_float32(imread_asarray(p))

    @cached_property
    def file_list(self) -> List[Path]:
        """Image paths, one per entry in ``ids``, resolved under ``root_dir``.

        Computed once and cached on first access.
        """
        # NOTE(review): ``self.root_dir`` is not defined in this class; it is
        # presumably a path-like property provided by the base class.
        root_dir = self.root_dir
        # Don't know why they put '/' at front in json; strip it so the
        # pathname is joined relative to ``root_dir`` instead of replacing it.
        return [root_dir / name.lstrip('/') for name in self.ids]

    @cached_property
    def ids(self) -> list:
        """Values of ``['image']['pathname']`` for each metadata record.

        Computed once and cached on first access.
        """
        return [record['image']['pathname'] for record in self.metadata]

    @cached_property
    def metadata(self) -> List[dict]:
        """Parsed content of ``training.json`` or ``test.json``.

        The file is selected by ``self.training`` and read from ``root_dir``.
        Computed once and cached on first access.
        """
        name = 'training.json' if self.training else 'test.json'
        # JSON is UTF-8 encoded; be explicit so that parsing does not depend
        # on the platform's locale encoding.
        with open(self.root_dir / name, 'r', encoding='utf-8') as f:
            return json.load(f)