Source code for bioimageloader.collections._bbbc039

from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional

import albumentations
import cv2
import numpy as np
import scipy.ndimage as ndi
import tifffile
from PIL import Image

from ..base import MaskDataset


class BBBC039(MaskDataset):
    """Nuclei of U2OS cells in a chemical screen [1]_

    This data set has a total of 200 fields of view of nuclei captured with
    fluorescence microscopy using the Hoechst stain. These images are a sample
    of the larger BBBC022 chemical screen. The images are stored as TIFF files
    with 520x696 pixels at 16 bits.

    Parameters
    ----------
    root_dir : str
        Path to root directory
    output : {'both', 'image', 'mask'}, default: 'both'
        Change outputs. 'both' returns {'image': image, 'mask': mask}.
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines augmentation in
        sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    training : bool, default: True
        Load training set if True, else load testing one

    Notes
    -----
    - Split (training/validation/test)
        - `training=True` combines 'training' with 'validation'
    - Annotate objs not touching each other with 1 and use 2, 3, ... for the
      touching ones. It is great and clever, but it does not follow the form of
      other instance segmented masks. ``get_mask()`` will make an instance
      labeled mask (each obj has unique labels). After labeling max label is 231
      for training, and 202 for test. So masks of dtype UINT8 are enough for
      this data; ``get_mask()`` only falls back to a wider unsigned dtype when
      a mask holds more than 255 instances.
    - Max label is 3 (in original annotation)
    - Sample of larger BBBC022 and did manual segmentation
    - Possible overlap some with DSB2018
    - Mask is png but (instance) value is only stored in RED channel
    - Images are normalized by 4095 (2**12 - 1)

    References
    ----------
    .. [1] https://bbbc.broadinstitute.org/BBBC039

    See Also
    --------
    MaskDataset : Super class
    Dataset : Base class
    DatasetInterface : Interface

    """

    # Dataset's acronym
    acronym = 'BBBC039'
    # Divisor used by ``get_image`` to scale raw pixel values
    _max_val = 4095  # 2**12 - 1

    def __init__(
        self,
        root_dir: str,
        *,
        output: str = 'both',
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        # specific to this dataset
        training: bool = True,
        **kwargs
    ):
        # NOTE(review): the base class initializer is not invoked here and
        # ``**kwargs`` is accepted but never read in this method -- confirm
        # that both are intended.
        self._root_dir = root_dir
        self._output = output
        self._transforms = transforms
        self._num_samples = num_samples
        # specific to this dataset
        self.training = training

    def get_image(self, p: Path) -> np.ndarray:
        """Read a TIFF image, scale it by ``_max_val`` and expand it to RGB.

        Parameters
        ----------
        p : Path
            Path to a TIFF image

        Returns
        -------
        np.ndarray
            Image divided by ``np.float32(self._max_val)`` and converted with
            ``cv2.COLOR_GRAY2RGB``

        """
        img = tifffile.imread(p)
        img = img / np.float32(self._max_val)
        return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

    def get_mask(self, p: Path) -> np.ndarray:
        """Read an annotation and relabel it so each object has a unique id.

        Only the first (red) channel of the file is used. Connected components
        are labeled separately for every annotation value 1, 2, ..., max, and
        the labels of each value are offset by the number of objects found for
        the smaller values.

        Parameters
        ----------
        p : Path
            Path to an annotation image

        Returns
        -------
        np.ndarray
            Instance labeled mask. An annotation without any object is
            returned as is. Otherwise the dtype is the smallest unsigned
            integer type that can hold the largest label, i.e. UINT8 as long
            as there are at most 255 objects.

        """
        mask = np.asarray(Image.open(p))[..., 0]
        # Plain int, so that ``max_val + 1`` cannot wrap around in uint8
        max_val = int(mask.max())
        if max_val == 0:
            return mask
        # Accumulate in a wide dtype: offsetting labels in uint8 would
        # silently wrap around once more than 255 objects are found
        inst_mask = np.zeros(mask.shape, dtype=np.int64)
        n_labels = 0
        for m in range(1, max_val + 1):
            labeled, n = ndi.label(mask == m)
            # Pixels of different annotation values never coincide, so plain
            # assignment into the foreground of this value is enough
            foreground = labeled > 0
            inst_mask[foreground] = labeled[foreground] + n_labels
            n_labels += int(n)
        return inst_mask.astype(np.min_scalar_type(n_labels))

    @cached_property
    def file_list(self) -> List[Path]:
        """Paths of images, ``<root_dir>/images/<id>`` with suffix '.tif'.

        ``self.root_dir`` is not defined in this class; presumably it is
        provided by the base class as a ``Path`` -- TODO confirm.
        """
        parent = self.root_dir / 'images'
        return [(parent / name).with_suffix('.tif') for name in self.ids]

    @cached_property
    def anno_dict(self) -> Dict[int, Path]:
        """Paths of annotations, ``<root_dir>/masks/<id>``, keyed by position
        in ``ids``.
        """
        parent = self.root_dir / 'masks'
        return dict(enumerate(parent / name for name in self.ids))

    @cached_property
    def ids(self) -> List[str]:
        """Ids listed in the metadata files of the selected split.

        Reads 'training.txt' followed by 'validation.txt' from
        ``<root_dir>/metadata`` if ``training`` is True, else 'test.txt'.
        """
        def _readlines(path) -> List[str]:
            with open(path, 'r') as f:
                stripped = (line.strip() for line in f)
                # Skip blank lines; an empty id would resolve to the parent
                # directory itself instead of a file
                return [name for name in stripped if name]

        meta_dir = self.root_dir / 'metadata'
        if self.training:
            # Combine training and validation
            _ids = _readlines(meta_dir / 'training.txt')
            _ids += _readlines(meta_dir / 'validation.txt')
        else:
            _ids = _readlines(meta_dir / 'test.txt')
        return _ids