import os.path
from functools import cached_property
from pathlib import Path
from typing import Dict, List, Optional
import albumentations
import numpy as np
from PIL import Image
from skimage.util import img_as_float32
# `gimpformats` is needed to read the GIMP '.xcf' annotation documents. Print an
# installation hint and re-raise the original error when it is not installed.
try:
    from gimpformats.gimpXcfDocument import GimpDocument
except ModuleNotFoundError:
    print("Install `gimpformats` pkg")
    # Bare `raise` keeps the original traceback intact.
    raise
from ..base import MaskDataset
from ..utils import imread_asarray
[docs]class MurphyLab(MaskDataset):
"""Nuclei Segmentation In Microscope Cell Images: A Hand-Segmented Dataset
And Comparison Of Algorithms [1]_
root_dir : str or pathlib.Path
Path to root directory
output : {'both', 'image', 'mask'}, default: 'both'
Change outputs. 'both' returns {'image': image, 'mask': mask}.
transforms : albumentations.Compose, optional
An instance of Compose (albumentations pkg) that defines
augmentation in sequence.
num_samples : int, optional
Useful when ``transforms`` is set. Define the total length of the
dataset. If it is set, it overwrites ``__len__``.
drop_missing_pairs : bool, default: True
Valid only if `output='both'`. It will drop images that do not have
mask pairs.
drop_broken_files : bool, default: True
Drop broken files that cannot be read
filled_mask : bool, default: False
Use saved filled masks through `fill_save_mask()` method instead of
default boundary masks. If one would want to use manually modified
masks, the annotation files should have the same name as '*.xcf'
with modified suffix by '.png'.
This dataset has many issues whose details can be found below. The simplest
way is to drop the files that cause issues. It is recommended not to disable
``drop_missing_pairs`` and ``drop_broken_files``. Otherwise, loading will
raise exceptions.
If one wants filled hole, ``fill_save_mask()`` function will fill holes with
some tricks to handle edge cases and save them as .png format. Then set
``filled_mask`` argument to True to load them.
Read more in Notes section
- 4 channel PNG format annotation mask even though mask is binary. But it is
not grayscale binary. They put value 255 only in red channel.
- Two annotation formats; Photoshop and GIMP. It seems that two annotators
worked separately. segmented-ashariff will be ignored. In total, 97
segmented images (out of 100)
- 3 missing segmentations: ind={31, 43, 75}
- Manually filled annotation to make masks using GIMP
- 2009_ISBI_2DNuclei_code_data/data/images/segmented-lpc/ic100/dna-15.xcf
does not have 'borders' layer like the others. This one alone has
'border' layer.
.. [1] L. P. Coelho, A. Shariff, and R. F. Murphy, “Nuclear segmentation in
microscope cell images: A hand-segmented dataset and comparison of
algorithms,” in 2009 IEEE International Symposium on Biomedical Imaging:
From Nano to Macro, Jun. 2009, pp. 518–521, doi: 10.1109/ISBI.2009.5193098.
See Also
MaskDataset : Super class
Dataset : Base class
DatasetInterface : Interface
# Dataset's acronym
acronym = 'MurphyLab'
# Constructor: stores the common dataset arguments and the MurphyLab-specific
# flags, then optionally prunes samples that cannot be used.
# NOTE(review): this listing appears to have lost parts of the signature (the
# leading `self` parameter and the line closing the parameter list, possibly
# more) - confirm against the original file before editing the code below.
def __init__(
# Interface requirement
root_dir: str,
output: str = 'both',
transforms: Optional[albumentations.Compose] = None,
num_samples: Optional[int] = None,
# Specific to this dataset
drop_missing_pairs: bool = True,
drop_broken_files: bool = True,
filled_mask: bool = False,
# Interface and super-class arguments
# Everything this class reads lives under '<root_dir>/data/images'.
self._root_dir = os.path.join(root_dir, 'data', 'images')
self._output = output
self._transforms = transforms
self._num_samples = num_samples
# Specific to this dataset
self.drop_missing_pairs = drop_missing_pairs
self.drop_broken_files = drop_broken_files
self.filled_mask = filled_mask
# Pruning is only applied when both image and mask are requested. Each step
# replaces `file_list` and `anno_dict` on the instance with its result.
# NOTE(review): `self.output` is presumably exposed by the base class on top
# of `_output`, and `_drop_missing_pairs` is not defined in this listing -
# confirm both.
if self.output == 'both' and self.drop_missing_pairs:
self.file_list, self.anno_dict = self._drop_missing_pairs()
# Runs after the missing-pair pruning, so it operates on the already
# replaced `file_list` / `anno_dict` when both flags are set.
if self.output == 'both' and self.drop_broken_files:
self.file_list, self.anno_dict = self._drop_broken_files()
def get_image(self, p: Path) -> np.ndarray:
    """Read the image at ``p`` and return it converted with ``img_as_float32``.

    Parameters
    ----------
    p : pathlib.Path
        Path to the image file, passed unchanged to ``imread_asarray``.

    Returns
    -------
    numpy.ndarray
        Result of ``img_as_float32`` applied to the loaded array.

    """
    return img_as_float32(imread_asarray(p))
def get_mask(self, p: Path) -> np.ndarray:
    """Load the annotation mask stored at ``p``.

    Parameters
    ----------
    p : pathlib.Path
        Path to a saved filled mask image when ``self.filled_mask`` is set,
        otherwise path to a GIMP '.xcf' annotation document.

    Returns
    -------
    numpy.ndarray
        The first channel (``[..., 0]``) of the loaded mask. Filled masks are
        converted to ``float32``; '.xcf' layers keep the dtype of the layer
        image.

    Raises
    ------
    ValueError
        If the '.xcf' document has neither a 'borders' nor a 'border' layer.

    """
    if self.filled_mask:
        # NOTE(review): the loader call was missing from the listing;
        # `Image.open` (PIL is imported by this module) is assumed - confirm.
        # The context manager closes the file handle once the pixel data has
        # been copied into the array.
        with Image.open(p) as mask:
            return np.asarray(mask, dtype=np.float32)[..., 0]
    doc = GimpDocument(p.as_posix())
    # NOTE(review): the comprehension element was missing from the listing;
    # `layer.name` is assumed because the list is searched by layer name.
    layers = [layer.name for layer in doc.layers]
    try:
        ind_layer = layers.index('borders')
    except ValueError:
        # '/data/images/segmented-lpc/ic100/dna-15.xcf' does not have a
        # 'borders' layer but a 'border' layer.
        ind_layer = layers.index('border')
    # get image
    layer_borders = doc.getLayer(ind_layer)
    mask = layer_borders.image
    return np.asarray(mask)[..., 0]
@cached_property
def file_list(self) -> List[Path]:
    """Sorted paths of all images matching ``<root_dir>/dna-images/*/*.png``.

    The value is computed on first access and then stored on the instance,
    which also allows it to be replaced by plain assignment.

    Returns
    -------
    list of pathlib.Path
        Image paths ordered by ``self._sort_key``.

    """
    # NOTE(review): `self.root_dir` must provide `.glob`, so it is presumably a
    # `pathlib.Path` exposed by the base class - confirm.
    parent = 'dna-images'
    return sorted(
        self.root_dir.glob(f'{parent}/*/*.png'),
        key=self._sort_key,
    )
@staticmethod
def _sort_key(p, zfill=2):
    """Build a sort key that orders numbered files numerically.

    The file stem is split on '-', every piece is left-padded with zeros to
    ``zfill`` characters, and the pieces are joined again behind the stem of
    the parent directory. E.g. ``gnf/dna-1.png`` -> ``'gnf-dna-01'``, so that
    ``dna-2`` sorts before ``dna-10``.

    Parameters
    ----------
    p : pathlib.Path
        Path of the file to build a key for.
    zfill : int, default: 2
        Minimum width each '-'-separated piece is padded to.

    Returns
    -------
    str
        The sort key.

    """
    padded = [piece.zfill(zfill) for piece in p.stem.split('-')]
    return '-'.join([p.parent.stem, *padded])
@cached_property
def anno_dict(self) -> Dict[int, Path]:
    """Map indices of ``self.file_list`` to their existing annotation files.

    For the image ``<base>/dna-images/<group>/<stem>.png`` the annotation is
    looked up at ``<base>/segmented-lpc/<group>/<stem>.xcf``, or at
    ``<base>/segmented-lpc/<group>/<stem>-filled.png`` when
    ``self.filled_mask`` is set. Images without an annotation file on disk
    get no entry.

    The value is computed on first access and then stored on the instance,
    which also allows it to be replaced by plain assignment.

    Returns
    -------
    dict of int to pathlib.Path
        Index in ``self.file_list`` -> path of the annotation file.

    """
    suffix = '-filled.png' if self.filled_mask else '.xcf'
    found = {}
    for idx, img_path in enumerate(self.file_list):
        # Ignore 'segmented-ashariff'. It seems that Ashariff got bored
        # after 10 images.
        candidate = (
            img_path.parents[2]
            / 'segmented-lpc'
            / img_path.parent.stem
            / (img_path.stem + suffix)
        )
        if candidate.exists():
            found[idx] = candidate
    return found
def _drop_broken_files(self):
"""Drop broken files

'/data/images/segmented-lpc/ic100/dna-46.xcf' cannot be read and is the only
file this method looks for.

Returns the pair ``(file_list, anno_dict)``, where ``anno_dict`` has been
re-keyed with consecutive integers starting at 0.
"""
file_list = self.file_list
anno_dict = self.anno_dict
for i, p in anno_dict.items():
# Match on '<parent directory>/<file name>' rather than on the file name alone.
# NOTE(review): in this listing the operands of the join below and the body of
# the `if` are missing - presumably the matching entry is removed from both
# `file_list` and `anno_dict`; confirm against the original file.
if '/'.join([,]) == 'ic100/dna-46.xcf':
# Re-key the remaining annotations with consecutive indices 0..n-1.
anno_dict = dict((i, v) for i, v in enumerate(anno_dict.values()))
return file_list, anno_dict
def fill_save_mask(self):
    """Fill holes from boundary masks with some tricks and save them as '.png'

    Requires scipy and scikit-image. Install the dependencies with the pip
    option ``pip install bioimageloader[process]``.

    Every entry of ``self.anno_dict`` is opened as a GIMP '.xcf' document, so
    this is meant to be run on boundary annotations, not on already filled
    masks. Documents that cannot be opened are reported and skipped.

    Note that this does not result in perfectly filled masks. Some are not
    entirely closed by this algorithm (36, 40, 63).

    Other issues: ``ind=63``: 'border' not 'borders', ``ind=93``
    ``GimpDocument`` cannot read it...

    Raises
    ------
    ValueError
        If a document has neither a 'borders' nor a 'border' layer.

    """
    # Imported here because they belong to an optional extra.
    from scipy.ndimage import binary_fill_holes
    from skimage.morphology import dilation, erosion

    def fill_holes(
        img: np.ndarray,
        w_edge: int,
        w_pad: int
    ) -> np.ndarray:
        """Fill closed boundaries of ``img`` and return a same-shape mask."""
        # cut edges because many are not closed; explicit end indices keep
        # the slice valid when the margin is 0 (``x[0:-0]`` would be empty)
        edge_cut = img[
            w_edge:img.shape[0] - w_edge,
            w_edge:img.shape[1] - w_edge,
        ]
        # dilate to connect some boundaries
        dilated = dilation(edge_cut)
        # pad with reflection mode to close boundaries at the edges
        filled_pad = binary_fill_holes(
            np.pad(dilated, w_pad, mode='reflect')
        )
        # back to original shape
        w_rev = w_pad - w_edge
        filled_pad = filled_pad[
            w_rev:filled_pad.shape[0] - w_rev,
            w_rev:filled_pad.shape[1] - w_rev,
        ]
        # erode, because we dilated
        return erosion(filled_pad)

    for k, p in self.anno_dict.items():
        # read gimp document .xcf
        try:
            doc = GimpDocument(p.as_posix())
        except TypeError:
            # '/data/images/segmented-lpc/ic100/dna-46.xcf' cannot be read
            # through gimpformats library. You can open it in GIMP though.
            print(f"Cannot open '{k}: {p}' with gimpformats lib")
            # Nothing to process for this entry; `doc` was never created.
            continue
        # get 'borders' layer
        # NOTE(review): the comprehension element was missing from the
        # listing; `layer.name` is assumed because the list is searched by
        # layer name.
        layers = [layer.name for layer in doc.layers]
        try:
            ind_layer = layers.index('borders')
        except ValueError:
            # '/data/images/segmented-lpc/ic100/dna-15.xcf' does not have a
            # 'borders' layer but a 'border' layer.
            print(f"Exception layer['border'] '{k}: {p}'")
            ind_layer = layers.index('border')
        # get image
        layer_borders = doc.getLayer(ind_layer)
        mask = np.asarray(layer_borders.image)
        # fill_holes, it has 3 channels and red channel has annotation
        filled = fill_holes(mask[..., 0], w_edge=15, w_pad=200)
        # save
        img = Image.fromarray(filled)
        print(f"Saving '{k}: {p}'")
        # NOTE(review): the start of this call was missing from the listing;
        # saving next to the '.xcf' with a '.png' suffix is assumed - confirm.
        img.save(p.with_suffix('.png'))