Source code for bioimageloader.collections._livecell

import warnings
from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional

import albumentations
import cv2
import numpy as np
import tifffile
from pycocotools import coco
from skimage.util import img_as_float32

from ..base import MaskDataset


[docs]class LIVECell(MaskDataset):
    """LIVECell: A large-scale dataset for label-free live cell segmentation
    [1]_

    “LIVECell - A large-scale dataset for label-free live cell segmentation” by
    Edlund et al. 2021 [2]_

    Light microscopy is a cheap, accessible, non-invasive modality that when
    combined with well-established protocols of two-dimensional cell culture
    facilitates high-throughput quantitative imaging to study biological
    phenomena. Accurate segmentation of individual cells enables exploration of
    complex biological questions, but this requires sophisticated imaging
    processing pipelines due to the low contrast and high object density. Deep
    learning-based methods are considered state-of-the-art for most computer
    vision problems but require vast amounts of annotated data, for which there
    is no suitable resource available in the field of label-free cellular
    imaging. To address this gap we present LIVECell, a high-quality, manually
    annotated and expert-validated dataset that is the largest of its kind to
    date, consisting of over 1.6 million cells from a diverse set of cell
    morphologies and culture densities. To further demonstrate its utility, we
    provide convolutional neural network-based models trained and evaluated on
    LIVECell.

    Parameters
    ----------
    root_dir : str
        Path to root directory
    output : {'both', 'image', 'mask'}, default: 'both'
        Change outputs. 'both' returns {'image': image, 'mask': mask}.
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines augmentation in
        sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    training : bool, default: True
        Load training set if True, else load testing one
    mask_tif : bool, default: False
        Use saved COCO annotations as tif mask images in a new ./root_dir/masks
        directory. It will greatly improve loading speed. Available after
        calling ``save_coco_to_tif()``.

    Notes
    -----
    - Annotation in MS COCO format [3]_. Parsing it takes time.
    - Currently not supporting dynamically parsing COCO annotation due to slow
      speed. Pre-parse masks in .tif format by calling ``save_coco_to_tif()``.
    - Validation set is originally separated from training set. Currently they
      are combined when ``training=True``.
    - Single cells subsets are not covered

    References
    ----------
    .. [1] https://sartorius-research.github.io/LIVECell/
    .. [2] https://www.nature.com/articles/s41592-021-01249-6
    .. [3] https://cocodataset.org/

    See Also
    --------
    MaskDataset : Super class
    Dataset : Base class
    DatasetInterface : Interface

    """
    # Set acronym
    acronym = 'LIVECell'

    def __init__(
        self,
        root_dir: str,
        *,
        output: str = 'both',
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        # specific to this dataset
        training: bool = True,
        mask_tif: bool = False,
        **kwargs
    ):
        """Store the configuration and verify pre-parsed masks if requested.

        When ``mask_tif`` is False only a warning is emitted, because masks
        cannot be parsed from the COCO annotations on the fly. When it is True,
        both mask directories written by ``save_coco_to_tif()`` must exist and
        contain at least one entry.

        Raises
        ------
        FileNotFoundError
            If ``mask_tif`` is True and 'masks/livecell_train_val_masks' or
            'masks/livecell_test_masks' under the root directory is missing or
            empty.
        """
        self._root_dir = root_dir
        self._output = output
        self._transforms = transforms
        self._num_samples = num_samples
        # specific to this one here
        self.training = training
        self.mask_tif = mask_tif

        if not self.mask_tif:
            msg = ("LIVECell dataset does not currently support dynamically "
                   "parsing annotation in MS COCO format due to performance "
                   "issue. Please create masks in .tif format by calling "
                   "`save_coco_to_tif()` and then set `mask_tif=True`")
            warnings.warn(msg, stacklevel=2)
            return

        # `self.root_dir` is not defined in this class; presumably a Path-like
        # property inherited from MaskDataset (it is combined with `/` below).
        masks_root = self.root_dir / 'masks'
        required_dirs = (
            masks_root / 'livecell_train_val_masks',
            masks_root / 'livecell_test_masks',
        )
        for mask_dir in required_dirs:
            # `is_dir()` rather than `exists()`: a stray regular file with that
            # name would otherwise make `iterdir()` fail with an unrelated error.
            if not mask_dir.is_dir():
                raise FileNotFoundError(
                    f"No masks in .tif format: directory '{mask_dir}' does "
                    "not exist. Call `save_coco_to_tif()` first."
                )
            if not any(mask_dir.iterdir()):
                raise FileNotFoundError(
                    f"No masks in .tif format: directory '{mask_dir}' is "
                    "empty. Call `save_coco_to_tif()` first."
                )

[docs]    def get_image(self, p: Path) -> np.ndarray:
        img = tifffile.imread(p)
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        return img_as_float32(img)

[docs]    def get_mask(self, p: Path) -> np.ndarray:
        mask = tifffile.imread(p)
        return mask

    @cached_property
    def file_list(self) -> List[Path]:
        """Sorted paths of the ``.tif`` images of the selected split.

        Train/val images are read from 'images/livecell_train_val_images' when
        ``self.training`` is True, otherwise test images are read from
        'images/livecell_test_images'. The result is cached after first access.
        """
        # Call MaskDataset.root_dir
        images_root = self.root_dir / 'images'
        # The test split lives in 'livecell_test_images'; the previous value
        # 'livecell_test_masks' is the name of the *mask* directory and does
        # not exist under 'images/', which made the test set appear empty.
        split_dir = ('livecell_train_val_images' if self.training
                     else 'livecell_test_images')
        return sorted((images_root / split_dir).glob('*.tif'))

    @cached_property
    def anno_dict(self) -> Dict[int, Path]:
        """Map a running index to each ``.tif`` mask path of the selected split.

        Masks are looked up in 'masks/livecell_train_val_masks' when
        ``self.training`` is True and in 'masks/livecell_test_masks' otherwise.
        Paths are sorted before being numbered from 0.
        """
        masks_root = self.root_dir / 'masks'
        if self.training:
            mask_dir = masks_root / 'livecell_train_val_masks'
        else:
            mask_dir = masks_root / 'livecell_test_masks'
        ordered_masks = sorted(mask_dir.glob('*.tif'))
        return dict(enumerate(ordered_masks))

[docs]    def save_coco_to_tif(self):
        """Save the masks as tif files

        Read training/val or test annotations from json. Make tif files under
        'masks/livecell_train_val_masks' and 'masks/livecell_test_masks'.

        Initialize a new instance with setting ``mask_tif=True`` to load saved
        masks.
        """
        print("making instances masks and saving as tif files")
        # makedirs
        mask_train_dir = self.root_dir / 'masks' / 'livecell_train_val_masks'
        mask_test_dir = self.root_dir / 'masks' / 'livecell_test_masks'
        mask_train_dir.mkdir(parents=True, exist_ok=True)
        mask_test_dir.mkdir(parents=True, exist_ok=True)
        # training
        self.coco_tr = coco.COCO(self.root_dir  / 'livecell_coco_train.json')
        img_tr = self.coco_tr.loadImgs(self.coco_tr.getImgIds())
        self.coco_val = coco.COCO(self.root_dir / 'livecell_coco_val.json')
        img_val = self.coco_val.loadImgs(self.coco_val.getImgIds())
        self.anno_dictionary = img_val + img_tr
        # loop
        print(f'training total: {len(self.anno_dictionary)}')
        for ind, img in enumerate(self.anno_dictionary, 1):
            try:
                annIds = self.coco_tr.getAnnIds(imgIds=img["id"], iscrowd=None)
                anns = self.coco_tr.loadAnns(annIds)
                mask = self.coco_tr.annToMask(anns[0])
                mask = mask.astype(np.int32)
                for i in range(len(anns)):
                    mask |= self.coco_tr.annToMask(anns[i]) * i
            except:
                annIds = self.coco_val.getAnnIds(imgIds=img["id"], iscrowd=None)
                anns = self.coco_val.loadAnns(annIds)
                mask = self.coco_val.annToMask(anns[0])
                mask = mask.astype(np.int32)
                for i in range(len(anns)):
                    mask |= self.coco_val.annToMask(anns[i]) * i
            tifffile.imsave(mask_train_dir / img['file_name'], mask)
            print(ind, end=' ')
        print("Done!")
        # test
        print("making instances masks and saving as tif files")
        self.coco_te = coco.COCO(self.root_dir / 'livecell_coco_test.json')
        img_te = self.coco_te.loadImgs(self.coco_te.getImgIds())
        self.anno_dictionary = img_te
        print(f'test total: {len(self.anno_dictionary)}')
        for ind, img in enumerate(self.anno_dictionary, 1):
            annIds = self.coco_te.getAnnIds(imgIds=img['id'], iscrowd=None)
            anns = self.coco_te.loadAnns(annIds)
            mask = self.coco_te.annToMask(anns[0])
            mask = mask.astype(np.int32)
            for i in range(len(anns)):
                mask |= self.coco_te.annToMask(anns[i]) * i
            tifffile.imsave(mask_test_dir / img['file_name'], mask)
            print(ind, end=' ')
        print("Done!")