Source code for bioimageloader.collections._compath

import xml.etree.ElementTree as ET
from functools import cached_property
from pathlib import Path
from typing import Dict, Optional, Sequence, Union

import albumentations
import numpy as np
import tifffile
from PIL import Image
from skimage.draw import polygon
from skimage.util import img_as_float32

from ..base import MaskDataset


[docs]class ComputationalPathology(MaskDataset): """A Dataset and a Technique for Generalized Nuclear Segmentation for Computational Pathology [1]_ Parameters ---------- root_dir : str Path to root directory output : {'both', 'image', 'mask'}, default: 'both' Change outputs. 'both' returns {'image': image, 'mask': mask}. transforms : albumentations.Compose, optional An instance of Compose (albumentations pkg) that defines augmentation in sequence. num_samples : int, optional Useful when ```transforms``` is set. Define the total length of the dataset. If it is set, it overwrites ``__len__``. grayscale : bool, default: False Convert images to grayscale grayscale_mode : {'cv2', 'equal', Sequence[float]}, default: 'cv2' How to convert to grayscale. If set to 'cv2', it follows opencv implementation. Else if set to 'equal', it sums up values along channel axis, then divides it by the number of expected channels. mask_tif : bool, default: False Instead of parsing every xml file to reconstruct mask image arrays, use pre-drawn mask tif files which should reside in the same folder as annotation xml files. Notes ----- - Resolution of all images is (1000,1000) - gt is converted from annotation recorded in xml format - gt has dtype of torch.float64, converted from numpy.uint16, and it has value 'num_objects' * 255 because it is base-transformed - The origianl dataset provides annotation in xml format, which takes long time to parse and to reconstruct mask images dynamically during training. Drawing masks beforehand makes training much faster. Use ``mask_tif`` in that case. - When ``augmenters`` is provided, set the ``num_samples`` argument 30x1000x1000 -> 16x30=480 patches. Thus, the default ``num_samples=720`` (x1.5) - dtype of 'gt' is int16. However, to make batching easier, it will be casted to float32 - Be careful about types of augmenters; avoid interpolation References ---------- .. [1] N. Kumar, R. Verma, S. Sharma, S. Bhargava, A. Vahadane, and A. Sethi, “A Dataset and a Technique for Generalized Nuclear Segmentation for Computational Pathology,” IEEE Transactions on Medical Imaging, vol. 36, no. 7, pp. 1550–1560, Jul. 2017, doi: 10.1109/TMI.2017.2677499. See Also -------- MaskDataset : Super class Dataset : Base class DatasetInterface : Interface """ # Dataset's acronym acronym = 'ComPath' # Hard code resolution to parse annotation (.xml) _resolution = (1000, 1000) def __init__( self, root_dir: str, *, output: str = 'both', transforms: Optional[albumentations.Compose] = None, num_samples: Optional[int] = None, grayscale: bool = False, grayscale_mode: Union[str, Sequence[float]] = 'cv2', # specific to this dataset mask_tif: bool = False, **kwargs ): self._root_dir = root_dir self._output = output self._transforms = transforms self._num_samples = num_samples self._grayscale = grayscale self._grayscale_mode = grayscale_mode # specific to this dataset self.mask_tif = mask_tif
[docs] def get_image(self, p: Path) -> np.ndarray: img = Image.open(p) if img.mode == 'RGBA': img = img.convert(mode='RGB') return img_as_float32(np.asarray(img))
[docs] def get_mask(self, p: Path) -> np.ndarray: if self.mask_tif: if p.suffix == '.xml': raise ValueError( "Use `save_xml_to_tif()` then set `mask_tif` to True" ) mask = tifffile.imread(p) return mask.astype(np.int16) # Parse xml mask = self._parse_xml_to_array(p) return mask
@classmethod def _parse_xml_to_array(cls, f_anno) -> np.ndarray: """This dataset provides annotation in .xml format Consider pre-generating mask image using ``save_xml_to_tif()`` """ tree = ET.parse(f_anno) root = tree.getroot() rr = [] cc = [] for region in root.iter('Region'): r = [] c = [] # print(region.attrib) # print(region.find('Vertices')) if (vertices := region.find('Vertices')) is not None: for v in vertices: # print(v.attrib) r.append(v.attrib['Y']) c.append(v.attrib['X']) rr.append(np.array(r, dtype=np.float16)) cc.append(np.array(c, dtype=np.float16)) # X, Y = anno['X'], anno['Y'] mask = np.zeros(cls._resolution, dtype=np.int16) for i, (x, y) in enumerate(zip(cc, rr), 1): r, c = polygon(y, x, shape=cls._resolution) if len(rr) == 0 and len(cc) == 0: continue mask[r, c] = i return mask # return {'X': cc, 'Y': rr}
[docs] def save_xml_to_tif(self): """Parse .xml to mask and write it as tiff file Having masks in images is much faster than parsing .xml for each call. This func iterates through ``anno_dict``, parse and save each in .tif format in the same annotation directory. Re-initiate an instance with ``mask_tif`` argument to load them. """ if self.output not in ['mask', 'both']: raise ValueError("Set output either to 'mask' or 'both'") for i, p in self.anno_dict.items(): mask = self._parse_xml_to_array(p) fname = p.with_suffix('.tif') tifffile.imwrite( fname, data=mask, compression='zlib' ) print(f"[{i}/{len(self.anno_dict) - 1}] Wrote '{fname}'")
@cached_property def file_list(self) -> list: root_dir = self.root_dir parent = 'Tissue images' file_list = sorted(root_dir.glob(f'{parent}/*.png')) return file_list @cached_property def anno_dict(self) -> Dict[int, Path]: root_dir = self.root_dir parent = 'Annotations' ext = 'xml' if self.mask_tif: ext = 'tif' anno_dict = dict((k, v) for k, v in enumerate( sorted(root_dir.glob(f'{parent}/*.{ext}')) )) return anno_dict