Source code for bioimageloader.collections._ucsb

import re
from functools import cached_property, partial
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

import albumentations
import numpy as np
import tifffile
from skimage.util import img_as_float32

from ..base import MaskDataset


[docs]class UCSB(MaskDataset):
    """A biosegmentation benchmark for evaluation of bioimage analysis methods

    Parameters
    ----------
    root_dir : str
        Path to root directory
    output : {'both',' image', 'mask'}, default: 'both'
        Change outputs. 'both' returns {'image': image, 'mask': mask}.
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines
        augmentation in sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    grayscale : bool, default: False
        Convert images to grayscale
    grayscale_mode : {'cv2', 'equal', Sequence[float]}, default: 'cv2'
        How to convert to grayscale. If set to 'cv2', it follows opencv
        implementation. Else if set to 'equal', it sums up values along
        channel axis, then divides it by the number of expected channels.
    category : {'benign', 'malignant'}, default: ('malignant',)
        Select which category of output you want

    See Also
    --------
    MaskDataset : Super class
    Dataset : Base class
    DatasetInterface : Interface

    Notes
    -----
    - 32 'benign', 26 'malignant' images (58 images in total)
    - 58x768x896 -> ~600 patches. Thus, the defulat `num_samples=900` (x1.5).
    - Images are not fully annotated

    References
    ----------
    .. [1] E. Drelie Gelasca, B. Obara, D. Fedorov, K. Kvilekval, and B.
       Manjunath, “A biosegmentation benchmark for evaluation of bioimage
       analysis methods,” BMC Bioinformatics, vol. 10, p. 368, Nov. 2009, doi:
       10.1186/1471-2105-10-368.
    """
    # Dataset's acronym
    acronym = 'UCSB'

    def __init__(
        self,
        root_dir: str,
        *,
        output: str = 'both',
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        grayscale: bool = False,
        grayscale_mode: Union[str, Sequence[float]] = 'cv2',
        # specific to this dataset
        category: Sequence[str] = ('malignant',),
        **kwargs
    ):
        self._root_dir = root_dir
        self._output = output
        self._transforms = transforms
        self._num_samples = num_samples
        self._grayscale = grayscale
        self._grayscale_mode = grayscale_mode
        # specific to this dataset
        self.category = category
        if not any([cat in ('benign', 'malignant') for cat in category]):
            raise ValueError("Set `category` in ('benign', 'malignant') in sequence")

[docs]    def get_image(self, p: Path) -> np.ndarray:
        tif = tifffile.imread(p)
        return img_as_float32(tif)

[docs]    def get_mask(self, p: Path) -> np.ndarray:
        tif = tifffile.imread(p)
        return tif

    @staticmethod
    def _filter_category(p: Path, category: str):
        return re.search(category, p.stem)

    @cached_property
    def file_list(self) -> List[Path]:
        root_dir = self.root_dir
        parent = 'Breast Cancer Cells'
        file_list = root_dir.glob(f'{parent}/*.tif')
        if len(cat := self.category) == 1:
            file_list = filter(partial(self._filter_category, category=cat[0]),
                               file_list)
        return sorted(file_list)

    @cached_property
    def anno_dict(self) -> Dict[int, Path]:
        root_dir = self.root_dir
        parent = 'Breast Cancer Cells GroundTruth'
        anno_list = sorted(root_dir.glob(f'{parent}/*.TIF'))
        if len(cat := self.category) == 1:
            anno_list = filter(partial(self._filter_category, category=cat[0]),
                               anno_list)
        anno_dict = dict((k, v) for k, v in enumerate(anno_list))
        return anno_dict