# Source code for bioimageloader.collections._bbbc018

from functools import cached_property, partial
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

import albumentations
import cv2
import numpy as np
from PIL import Image
from skimage.util import img_as_float32

from ..base import MaskDataset
from ..types import BundledPath
from ..utils import bundle_list, stack_channels, stack_channels_to_rgb


class BBBC018(MaskDataset):
    """Human HT29 colon-cancer cells (diverse phenotypes)

    The image set consists of 56 fields of view (4 from each of 14 samples).
    Because there are three channels, there are 168 image files. (The samples
    were stained with Hoechst 33342, pH3, and phalloidin. Hoechst 33342 is a DNA
    stain that labels the nucleus. Phospho-histone H3 indicates mitosis.
    Phalloidin labels actin, which is present in the cytoplasm.) The samples are
    the top-scoring sample from each of Jones et al.'s classifiers, as listed in
    the file SamplesScores.zip in their supplement. The files are in DIB format,
    as produced by the Cellomics ArrayScan instrument at the Whitehead–MIT
    Bioimaging Center. We recommend using Bio-Formats to read the DIB files.
    Each image is 512 x 512 pixels.

    The filenames are of the form wellidx-channel.DIB, where wellidx is the
    five-digit well index (from Jones et al.'s supplement) and channel is either
    DNA, actin, or pH3, depending on the channel.

    Parameters
    ----------
    root_dir : str
        Path to root directory
    output : {'both', 'image', 'mask'}, default: 'both'
        Change outputs. 'both' returns {'image': image, 'mask': mask}.
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines augmentation in
        sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    grayscale : bool, default: False
        Convert images to grayscale
    grayscale_mode : {'equal', 'cv2', Sequence[float]}, default: 'equal'
        How to convert to grayscale. If set to 'cv2', it follows opencv
        implementation. Else if set to 'equal', it sums up values along channel
        axis, then divides it by the number of expected channels.
    image_ch : {'DNA', 'actin', 'pH3'}, default: ('DNA', 'actin', 'pH3')
        Which channel(s) to load as image. Give it as a Sequence; a bare string
        such as ``'DNA'`` is accepted and treated as the one-element sequence
        ``('DNA',)``.
    anno_ch : {'DNA', 'actin'}, default: ('DNA',)
        Which channel(s) to load as annotation. Give it as a Sequence; a bare
        string is accepted and treated as a one-element sequence.
    drop_missing_pairs : bool, default: True
        Valid only if `output='both'`. It will drop images that do not have mask
        pairs.

    Other Parameters
    ----------------
    **kwargs
        Accepted for signature compatibility; not used by this constructor.

    Warnings
    --------
    BBBC018_v1_images/10779 annotation is missing. len(anno_dict) =
    len(file_list) - 1; ind={26}

        - PosixPath('BBBC018_v1_images/10779-DNA.DIB')
        - PosixPath('BBBC018_v1_images/10779-actin.DIB')
        - PosixPath('BBBC018_v1_images/10779-pH3.DIB')

    This one is not properly saved after annotation. It has annotation overlaid
    on top of the image. Need to filter ``mask == 255``.

        - 'BBBC018_v1_outlines/17675-nuclei.png'

    Notes
    -----
    - Every DIB has 3 channels (Order = (DNA,actin,pH3)). The second one is the
      object.
    - DNA -> Nuclei
    - Actin -> Cell
    - Annotation is outline one, but every anno is closed so binary_fill_holes
      works fine
    - For some reason annotation is y inverted

    References
    ----------
    .. [1] https://bbbc.broadinstitute.org/BBBC018

    See Also
    --------
    MaskDataset : Super class
    Dataset : Base class
    DatasetInterface : Interface

    """

    # Dataset's acronym
    acronym = 'BBBC018'

    def __init__(
        self,
        root_dir: str,
        *,
        output: str = 'both',
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        grayscale: bool = False,
        grayscale_mode: Union[str, Sequence[float]] = 'equal',
        # specific to this dataset
        image_ch: Sequence[str] = ('DNA', 'actin', 'pH3'),
        anno_ch: Sequence[str] = ('DNA',),
        drop_missing_pairs: bool = True,
        **kwargs
    ):
        """Store the configuration and optionally drop images without masks.

        See the class docstring for the meaning of each parameter.
        """
        self._root_dir = root_dir
        self._output = output
        self._transforms = transforms
        self._num_samples = num_samples
        self._grayscale = grayscale
        self._grayscale_mode = grayscale_mode
        # Channel selections are dispatched on ``len()`` further down, so a
        # bare string must be wrapped first: ``len('DNA') == 3`` would
        # otherwise be mistaken for a three-channel request.
        self.image_ch = self._as_channel_seq(image_ch)
        self.anno_ch = self._as_channel_seq(anno_ch)
        self.drop_missing_pairs = drop_missing_pairs

        # NOTE(review): ``output`` and ``_drop_missing_pairs`` are not defined
        # in this class; presumably they come from ``MaskDataset`` -- confirm.
        if self.output == 'both' and self.drop_missing_pairs:
            self.file_list, self.anno_dict = self._drop_missing_pairs()

    @staticmethod
    def _as_channel_seq(ch: Union[str, Sequence[str]]) -> Sequence[str]:
        """Wrap a bare channel name in a tuple; pass sequences through."""
        return (ch,) if isinstance(ch, str) else ch

    @staticmethod
    def _sample_name(p: Union[Path, BundledPath]) -> str:
        """Return the part of the file stem before the first ``-``.

        For a bundle (a list of paths) the first path is used.
        """
        first = p[0] if isinstance(p, list) else p
        return first.stem.split('-')[0]

    @staticmethod
    def _imread_handler(p: Path) -> np.ndarray:
        """Read the image at ``p`` and return its second channel (index 1)."""
        # The context manager closes the file handle deterministically;
        # ``np.asarray`` has already copied the pixel data out by then.
        with Image.open(p) as img:
            return np.asarray(img)[..., 1]

    @staticmethod
    def _read_mask(p: Path) -> np.ndarray:
        """Read one annotation file and normalise it.

        Applies the ``mask == 255`` filter to '17675-nuclei.png' (see the
        Warnings section of the class docstring) and scales boolean masks to
        uint8 with values 0/255. Non-boolean masks are returned as read.
        """
        with Image.open(p) as img:
            mask = np.asarray(img)
        if p.name == '17675-nuclei.png':
            # 'BBBC018_v1_outlines/17675-nuclei.png' has the annotation
            # overlaid on the image; only pixels equal to 255 are annotation.
            mask = mask == 255
        if mask.dtype == np.bool_:
            mask = 255 * mask.astype(np.uint8)
        return mask

    def get_image(self, p: Union[Path, BundledPath]) -> np.ndarray:
        """Load an image as float32.

        Parameters
        ----------
        p : Path or BundledPath
            A single path (one channel, replicated to 3 channels with
            ``cv2.COLOR_GRAY2RGB``) or a bundle of paths that is handed to
            ``stack_channels_to_rgb``.

        Raises
        ------
        NotImplementedError
            If ``p`` is a bundle and ``image_ch`` does not have 3 entries.
        """
        # Second channel has objects
        # Order = (DNA,actin,pH3)
        if isinstance(p, Path):
            gray = self._imread_handler(p)
            return img_as_float32(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
        if len(self.image_ch) != 3:
            raise NotImplementedError
        # order = {
        #     'DNA': 2,
        #     'actin': 0,
        #     'pH3': 1,
        # }
        img = stack_channels_to_rgb(self._imread_handler, p, 2, 0, 1)
        return img_as_float32(img)

    def get_mask(self, p: Union[Path, BundledPath]) -> np.ndarray:
        """Load an annotation mask, flipped along the first (row) axis.

        A single path is read directly. A bundle is read file by file with
        the same reader and combined by ``stack_channels``, so the
        '17675-nuclei.png' fix and the bool -> uint8 scaling apply to every
        file on both paths.
        """
        if isinstance(p, Path):
            mask = self._read_mask(p)
        else:
            # NOTE(review): assumes ``stack_channels`` only stacks whatever
            # the handler returns -- confirm against ..utils.
            mask = stack_channels(self._read_mask, p)
        # For some reason mask is -y
        return np.ascontiguousarray(mask[::-1, ...])

    @cached_property
    def file_list(self) -> Union[List[Path], List[BundledPath]]:
        """Sorted image paths under 'BBBC018_v1_images'.

        With three entries in ``image_ch`` every ``*.DIB`` file is collected
        and grouped by ``bundle_list(..., 3)``. With one entry only the files
        named ``*-<channel>.DIB`` are returned as plain paths.

        Raises
        ------
        NotImplementedError
            If ``image_ch`` has exactly two entries.
        ValueError
            If ``image_ch`` is empty or has more than three entries.
        """
        # NOTE(review): ``root_dir`` is used as a Path here; presumably the
        # base class exposes it as one -- confirm.
        root_dir = self.root_dir
        parent = 'BBBC018_v1_images'
        ch = self.image_ch
        n_ch = len(ch)
        # Order = (DNA,actin,pH3)
        if n_ch == 3:
            return bundle_list(sorted(root_dir.glob(f'{parent}/*.DIB')), 3)
        if n_ch == 2:
            raise NotImplementedError
        if n_ch == 1:
            return sorted((root_dir / parent).glob(f'*-{ch[0]}.DIB'))
        raise ValueError("Set `image_ch` in ('DNA', 'actin', 'pH3')")

    @cached_property
    def anno_dict(self) -> Union[Dict[int, Path], Dict[int, BundledPath]]:
        """Map ``file_list`` indices to existing annotation file(s).

        Annotation files live in 'BBBC018_v1_outlines' and are named
        ``<sample>-nuclei.png`` ('DNA') or ``<sample>-cells.png`` ('actin').
        Indices whose annotation file does not exist are left out.

        With one entry in ``anno_ch`` each value is a single path. With two
        entries each value is a list holding whichever of the two files
        exist, so it may contain a single path.

        Raises
        ------
        KeyError
            If a requested channel is neither 'DNA' nor 'actin'.
        ValueError
            If ``anno_ch`` does not have one or two entries.
        """
        anno_dir = self.root_dir / 'BBBC018_v1_outlines'
        stain_to_target = {'DNA': 'nuclei',
                           'actin': 'cells'}
        ch = self.anno_ch
        if len(ch) == 1:
            target = stain_to_target[ch[0]]
            anno_dict: Dict[int, Path] = {}
            for i, p in enumerate(self.file_list):
                fn = anno_dir / f'{self._sample_name(p)}-{target}.png'
                if fn.exists():
                    anno_dict[i] = fn
            return anno_dict
        if len(ch) == 2:
            anno_bdict: Dict[int, BundledPath] = {}
            for i, p in enumerate(self.file_list):
                name = self._sample_name(p)
                # Lazy generator: the channel lookup happens per file, in
                # the order given by ``anno_ch``.
                candidates = (
                    anno_dir / f'{name}-{stain_to_target[c]}.png' for c in ch
                )
                lst_fn = [fn for fn in candidates if fn.exists()]
                if lst_fn:
                    anno_bdict[i] = lst_fn
            return anno_bdict
        raise ValueError(
            "Set `anno_ch` to one or two channels from ('DNA', 'actin')"
        )