import os.path
from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional
import albumentations
import numpy as np
from PIL import Image
from skimage.util import img_as_float32
try:
from gimpformats.gimpXcfDocument import GimpDocument
except ModuleNotFoundError as e:
print("Install `gimpformats` pkg")
raise e
from ..base import MaskDataset
from ..utils import imread_asarray
class MurphyLab(MaskDataset):
    """Nuclei Segmentation In Microscope Cell Images: A Hand-Segmented Dataset
    And Comparison Of Algorithms [1]_

    Parameters
    ----------
    root_dir : str or pathlib.Path
        Path to root directory
    output : {'both', 'image', 'mask'}, default: 'both'
        Change outputs. 'both' returns {'image': image, 'mask': mask}.
    transforms : albumentations.Compose, optional
        An instance of Compose (albumentations pkg) that defines
        augmentation in sequence.
    num_samples : int, optional
        Useful when ``transforms`` is set. Define the total length of the
        dataset. If it is set, it overwrites ``__len__``.
    drop_missing_pairs : bool, default: True
        Valid only if `output='both'`. It will drop images that do not have
        mask pairs.
    drop_broken_files : bool, default: True
        Drop broken files that cannot be read
    filled_mask : bool, default: False
        Use saved filled masks through `fill_save_mask()` method instead of
        default boundary masks. If one would want to use manually modified
        masks, the annotation files should have the same name as '*.xcf'
        with modified suffix by '.png'. Files named '*-filled.png' are also
        accepted and take precedence when both exist.

    Warnings
    --------
    This dataset has many issues whose details can be found below. The
    simplest way is to drop those that cause issues. It is recommended to not
    opt out ``drop_missing_pairs()`` and ``drop_broken_files()``. Otherwise,
    it will meet exceptions.

    If one wants filled hole, ``fill_save_mask()`` function will fill holes
    with some tricks to handle edge cases and save them as .png format. Then
    set ``filled_mask`` argument to True to load them.

    Read more in Notes section

    Notes
    -----
    - 4 channel PNG format annotation mask even though mask is binary. But it
      is not grayscale binary. They put value 255 only in red channel.
    - Two annotation formats; Photoshop and GIMP. It seems that two annotators
      worked separately. segmented-ashariff will be ignored. In total, 97
      segmented images (out of 100)
    - 3 missing segmentations: ind={31, 43, 75}
        ./data/images/dna-images/gnf/dna-31.png
        ./data/images/dna-images/gnf/dna-43.png
        ./data/images/dna-images/ic100/dna-25.png
    - Manually filled annotation to make masks using GIMP
    - 2009_ISBI_2DNuclei_code_data/data/images/segmented-lpc/ic100/dna-15.xcf
      does not have 'borders' layer like the others. This one alone has
      'border' layer.

    References
    ----------
    .. [1] L. P. Coelho, A. Shariff, and R. F. Murphy, “Nuclear segmentation in
       microscope cell images: A hand-segmented dataset and comparison of
       algorithms,” in 2009 IEEE International Symposium on Biomedical Imaging:
       From Nano to Macro, Jun. 2009, pp. 518–521, doi:
       10.1109/ISBI.2009.5193098.

    See Also
    --------
    MaskDataset : Super class
    Dataset : Base class
    DatasetInterface : Interface

    """

    # Dataset's acronym
    acronym = 'MurphyLab'

    def __init__(
        self,
        # Interface requirement
        root_dir: str,
        *,
        output: str = 'both',
        transforms: Optional[albumentations.Compose] = None,
        num_samples: Optional[int] = None,
        # Specific to this dataset
        drop_missing_pairs: bool = True,
        drop_broken_files: bool = True,
        filled_mask: bool = False,
        **kwargs
    ):
        """Store the configuration and prune unusable entries.

        Extra keyword arguments are accepted but not used by this
        constructor.
        """
        # Interface and super-class arguments
        self._root_dir = os.path.join(root_dir, 'data', 'images')
        self._output = output
        self._transforms = transforms
        self._num_samples = num_samples
        # Specific to this dataset
        self.drop_missing_pairs = drop_missing_pairs
        self.drop_broken_files = drop_broken_files
        self.filled_mask = filled_mask

        # Assigning to `file_list`/`anno_dict` stores the pruned values on the
        # instance, overriding the lazily computed cached properties below.
        if self.output == 'both' and self.drop_missing_pairs:
            self.file_list, self.anno_dict = self._drop_missing_pairs()
        if self.output == 'both' and self.drop_broken_files:
            self.file_list, self.anno_dict = self._drop_broken_files()

    def get_image(self, p: Path) -> np.ndarray:
        """Read the image at ``p`` and convert it with ``img_as_float32``."""
        img = imread_asarray(p)
        return img_as_float32(img)

    def get_mask(self, p: Path) -> np.ndarray:
        """Read the annotation at ``p`` as a single-channel array.

        With ``filled_mask`` set, ``p`` is opened as a regular image file and
        returned as float32. Otherwise ``p`` is opened as a GIMP document and
        the first channel of its boundary layer is returned.
        """
        if self.filled_mask:
            # Context manager closes the underlying file handle.
            with Image.open(p) as img:
                mask = np.asarray(img, dtype=np.float32)
            # Masks written by ``fill_save_mask()`` are single-channel (2-D);
            # blindly indexing ``[..., 0]`` would return only the first
            # column. Take the first channel only for multi-channel files.
            if mask.ndim == 3:
                mask = mask[..., 0]
            return mask

        doc = GimpDocument(p.as_posix())
        ind_layer = self._borders_layer_index(
            [layer.name for layer in doc.layers]
        )
        # get image
        layer_borders = doc.getLayer(ind_layer)
        return np.asarray(layer_borders.image)[..., 0]

    @staticmethod
    def _borders_layer_index(layer_names: List[str]) -> int:
        """Return the index of the boundary layer in ``layer_names``.

        Raises
        ------
        ValueError
            If neither 'borders' nor 'border' is present.
        """
        try:
            return layer_names.index('borders')
        except ValueError:
            # '/data/images/segmented-lpc/ic100/dna-15.xcf' do not have
            # 'borders' but 'border' layer.
            return layer_names.index('border')

    @cached_property
    def file_list(self) -> List[Path]:
        """Sorted list of '<root_dir>/dna-images/*/*.png' paths."""
        root_dir = self.root_dir
        parent = 'dna-images'
        # dna-images
        file_list = sorted(
            root_dir.glob(f'{parent}/*/*.png'), key=self._sort_key
        )
        return file_list

    @staticmethod
    def _sort_key(p: Path, zfill: int = 2) -> str:
        """Build a sort key from the parent folder and zero-padded stem parts.

        Each '-'-separated part of the stem is left-padded with zeros to
        ``zfill`` characters so that e.g. 'dna-5' sorts before 'dna-10'.
        """
        split = p.stem.split('-')
        return '-'.join([p.parent.stem] + [s.zfill(zfill) for s in split])

    @cached_property
    def anno_dict(self) -> Dict[int, Path]:
        """Map indices of ``file_list`` to existing annotation files.

        Annotations are looked up under 'segmented-lpc/<subfolder>/' next to
        'dna-images'. Images without an existing annotation get no entry, so
        keys may be sparse.
        """
        anno_dict = {}
        for i, p in enumerate(self.file_list):
            # Ignore 'segmented-ashariff`. It seems that Ashariff got bored
            # after 10 images.
            anno_dir = p.parents[2] / 'segmented-lpc' / p.parent.stem
            if self.filled_mask:
                # ``fill_save_mask()`` writes '<stem>.png'; '<stem>-filled.png'
                # is kept as the first choice for backward compatibility.
                candidates = (
                    anno_dir / (p.stem + '-filled.png'),
                    anno_dir / (p.stem + '.png'),
                )
            else:
                candidates = (anno_dir / (p.stem + '.xcf'),)
            for anno in candidates:
                if anno.exists():
                    anno_dict[i] = anno
                    break
        return anno_dict

    def _drop_broken_files(self):
        """Drop broken files

        '/data/images/segmented-lpc/ic100/dna-46.xcf' cannot be read by
        ``gimpformats``

        Returns
        -------
        file_list, anno_dict
            New containers without the broken entry. Keys of ``anno_dict``
            greater than the dropped index are shifted down by one so that
            they keep pointing at the same images in ``file_list``. If the
            broken file is not present, the current containers are returned
            unchanged.
        """
        file_list = self.file_list
        anno_dict = self.anno_dict
        broken = next(
            (
                i for i, p in anno_dict.items()
                if (p.parent.name, p.name) == ('ic100', 'dna-46.xcf')
            ),
            None,
        )
        if broken is None:
            return file_list, anno_dict

        file_list = [p for i, p in enumerate(file_list) if i != broken]
        # Shift rather than re-enumerate: keys are indices into `file_list`
        # and may be sparse when missing pairs were not dropped.
        anno_dict = {
            (i - 1 if i > broken else i): p
            for i, p in anno_dict.items()
            if i != broken
        }
        return file_list, anno_dict

    def fill_save_mask(self):
        """Fill holes from boundary mask with some tricks

        Requires scipy and scikit-image. Install dependency with pip option
        ``pip install bioimageloader[process]``.

        Every path in ``anno_dict`` is opened as a GIMP document, so this is
        meant to be called on an instance created with ``filled_mask=False``.
        Results are saved next to the '.xcf' files with '.png' suffix.

        Note that this does not result perfect filled masks. Those not entirely
        closed by this algorithm (36, 40, 63).

        Other issues: ``ind=63``: 'border' not 'borders', ``ind=93``
        ``GimpDocument`` cannot read it...

        """
        from scipy.ndimage import binary_fill_holes
        from skimage.morphology import dilation, erosion

        def fill_holes(
            img: np.ndarray,
            w_edge: int,
            w_pad: int
        ) -> np.ndarray:
            # Explicit end indices (instead of ``x[a:-a]``) so that a zero
            # width does not yield an empty slice.
            height, width = img.shape[:2]
            # cut edges because many are not closed
            edge_cut = img[w_edge:height - w_edge, w_edge:width - w_edge]
            # dilate to connect some boundaries
            dilated = dilation(edge_cut)
            # pad with reflection mode to close boundary at edges
            filled_pad = binary_fill_holes(
                np.pad(dilated, w_pad, mode='reflect')
            )
            # back to original shape
            w_rev = w_pad - w_edge
            filled_pad = filled_pad[w_rev:w_rev + height, w_rev:w_rev + width]
            # erode, because we dilated
            return erosion(filled_pad)

        for k, p in self.anno_dict.items():
            # read gimp document .xcf
            try:
                doc = GimpDocument(p.as_posix())
            except TypeError:
                # '/data/images/segmented-lpc/ic100/dna-46.xcf' cannot be read
                # through gimpformats library. You can open it GIMP though.
                print(f"Cannot open '{k}: {p}' with gimpformats lib")
                continue
            # get 'borders' layer
            layers = [layer.name for layer in doc.layers]
            if 'borders' not in layers:
                # '/data/images/segmented-lpc/ic100/dna-15.xcf' do not have
                # 'borders' but 'border' layer.
                print(f"Exception layer['border'] '{k}: {p}'")
            ind_layer = self._borders_layer_index(layers)
            # get image
            layer_borders = doc.getLayer(ind_layer)
            mask = np.asarray(layer_borders.image)
            # fill_holes, it has 3 channels and red channel has annotation
            filled = fill_holes(mask[..., 0], w_edge=15, w_pad=200)
            # save
            img = Image.fromarray(filled)
            print(f"Saving '{k}: {p}'")
            img.save(p.with_suffix('.png'))