Source code for bioimageloader.collections._bbbc039

from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional

import albumentations
import cv2
import numpy as np
import scipy.ndimage as ndi
import tifffile
from PIL import Image

from ..base import MaskDataset


[docs]class BBBC039(MaskDataset): """Nuclei of U2OS cells in a chemical screen [1]_ This data set has a total of 200 fields of view of nuclei captured with fluorescence microscopy using the Hoechst stain. These images are a sample of the larger BBBC022 chemical screen. The images are stored as TIFF files with 520x696 pixels at 16 bits. Parameters ---------- root_dir : str Path to root directory output : {'both', 'image', 'mask'}, default: 'both' Change outputs. 'both' returns {'image': image, 'mask': mask}. transforms : albumentations.Compose, optional An instance of Compose (albumentations pkg) that defines augmentation in sequence. num_samples : int, optional Useful when ``transforms`` is set. Define the total length of the dataset. If it is set, it overwrites ``__len__``. training : bool, default: True Load training set if True, else load testing one Notes ----- - Split (training/valiadation/test) - `training=True` combines 'training' with 'validation' - Annotate objs not touching each other with 1 and use 2, 3, ... for the touching ones. It is great and clever, but it does not follow the form of other instance segmented masks. ``get_mask()`` will make a instance labeled mask (each obj has unique labels). After labeling max label is 231 for training, and 202 for test. So having masks of dtype UINT8 is fine. - Max label is 3 (in original annotation) - Sample of larger BBBC022 and did manual segmentation - Possible overlap some with DSB2018 - Mask is png but (instance) value is only stored in RED channel - Maximum value is 2**12 References ---------- .. [1] https://bbbc.broadinstitute.org/BBBC039 See Also -------- MaskDataset : Super class Dataset : Base class DatasetInterface : Interface """ # Dataset's acronym acronym = 'BBBC039' _max_val = 4095 # 2**12 def __init__( self, root_dir: str, *, output: str = 'both', transforms: Optional[albumentations.Compose] = None, num_samples: Optional[int] = None, # specific to this dataset training: bool = True, **kwargs ): self._root_dir = root_dir self._output = output self._transforms = transforms self._num_samples = num_samples # specific to this dataset self.training = training
[docs] def get_image(self, p: Path) -> np.ndarray: img = tifffile.imread(p) img = img / np.float32(self._max_val) return cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
[docs] def get_mask(self, p: Path) -> np.ndarray: mask = np.asarray(Image.open(p))[..., 0] max_val = mask.max() if max_val == 0: return mask else: inst_mask, n_labels = ndi.label(mask == 1, output='uint8') # int32 by default for m in range(2, max_val+1): labeled, n = ndi.label(mask == m, output='uint8') inst_mask += np.where(labeled == 0, 0, labeled + n_labels) n_labels += n return inst_mask
@cached_property def file_list(self) -> List[Path]: root_dir = self.root_dir parent = root_dir / 'images' file_list = [] for name in self.ids: p = parent / name file_list.append(p.with_suffix('.tif')) return file_list @cached_property def anno_dict(self) -> Dict[int, Path]: root_dir = self.root_dir parent = root_dir / 'masks' anno_list = [] for name in self.ids: p = parent / name anno_list.append(p) return dict((k, v) for k, v in enumerate(anno_list)) @cached_property def ids(self) -> list: def _readlines(path): with open(path, 'r') as f: lines = f.readlines() return list(map(lambda s: s.strip(), lines)) meta_dir = self.root_dir / 'metadata' if self.training: # Combine training and validation meta_file = meta_dir / 'training.txt' _ids = _readlines(meta_file) meta_file = meta_dir / 'validation.txt' _ids += _readlines(meta_file) else: meta_file = meta_dir / 'test.txt' _ids = _readlines(meta_file) return _ids