changeset 1:aba3655fdef0 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit 897a7dc7cb43e45d6f0fdfe2b2970e59f20f8853"
author watsocam
date Fri, 11 Mar 2022 23:35:52 +0000
parents 928db0f952e3
children 46b897eb2c8e
files ParseInput.py SingleCellDataExtraction.py macros.xml quantification.xml
diffstat 4 files changed, 206 insertions(+), 163 deletions(-) [+]
line wrap: on
line diff
--- a/ParseInput.py	Fri Mar 12 00:19:24 2021 +0000
+++ b/ParseInput.py	Fri Mar 11 23:35:52 2022 +0000
@@ -8,15 +8,39 @@
 
 #if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-   parser.add_argument('--masks',nargs='*')
-   parser.add_argument('--image')
-   parser.add_argument('--channel_names')
-   parser.add_argument('--output')
+   parser.add_argument('--masks',nargs='+', required=True)
+   parser.add_argument('--image', required=True)
+   parser.add_argument('--channel_names', required=True)
+   parser.add_argument('--output', required=True)
+   parser.add_argument(
+      '--mask_props', nargs = "+",
+      help="""
+         Space separated list of additional metrics to be calculated for every mask.
+         This is for metrics that depend only on the cell mask. If the metric depends
+         on signal intensity, use --intensity-props instead.
+         See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops
+      """
+   )
+   parser.add_argument(
+      '--intensity_props', nargs = "+",
+      help="""
+         Space separated list of additional metrics to be calculated for every marker separately.
+         By default only mean intensity is calculated.
+         If the metric doesn't depend on signal intensity, use --mask-props instead.
+         See list at https://scikit-image.org/docs/dev/api/skimage.measure.html#regionprops
+         Additionally available is gini_index, which calculates a single number
+         between 0 and 1, representing how unequal the signal is distributed in each region.
+         See https://en.wikipedia.org/wiki/Gini_coefficient
+      """
+   )
    #parser.add_argument('--suffix')
    args = parser.parse_args()
    #Create a dictionary object to pass to the next function
    dict = {'masks': args.masks, 'image': args.image,\
-    'channel_names': args.channel_names,'output':args.output}
+    'channel_names': args.channel_names,'output':args.output,
+    'intensity_props': set(args.intensity_props if args.intensity_props is not None else []).union(["intensity_mean"]),
+    'mask_props': args.mask_props,
+   }
    #Print the dictionary object
    print(dict)
    #Return the dictionary
--- a/SingleCellDataExtraction.py	Fri Mar 12 00:19:24 2021 +0000
+++ b/SingleCellDataExtraction.py	Fri Mar 11 23:35:52 2022 +0000
@@ -8,83 +8,93 @@
 import numpy as np
 import os
 import skimage.measure as measure
+import tifffile
+
 from pathlib import Path
-import csv
 
 import sys
 
 
-def MaskChannel(mask_loaded,image_loaded_z):
+def gini_index(mask, intensity):
+    x = intensity[mask]
+    sorted_x = np.sort(x)
+    n = len(x)
+    cumx = np.cumsum(sorted_x, dtype=float)
+    return (n + 1 - 2 * np.sum(cumx) / cumx[-1]) / n
+
+def intensity_median(mask, intensity):
+    return np.median(intensity[mask])
+
+def MaskChannel(mask_loaded, image_loaded_z, intensity_props=["intensity_mean"]):
     """Function for quantifying a single channel image
 
     Returns a table with CellID according to the mask and the mean pixel intensity
     for the given channel for each cell"""
-    print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
-    print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
-    dat = measure.regionprops(mask_loaded, image_loaded_z)
-    n = len(dat)
-    intensity_z = np.empty(n)
-    for i in range(n):
-        intensity_z[i] = dat[i].mean_intensity
-        # Clear reference to avoid memory leak -- see MaskIDs for explanation.
-        dat[i] = None
-    return intensity_z
+    # Look for regionprops in skimage
+    builtin_props = set(intensity_props).intersection(measure._regionprops.PROP_VALS)
+    # Otherwise look for them in this module
+    extra_props = set(intensity_props).difference(measure._regionprops.PROP_VALS)
+    dat = measure.regionprops_table(
+        mask_loaded, image_loaded_z,
+        properties = tuple(builtin_props),
+        extra_properties = [globals()[n] for n in extra_props]
+    )
+    return dat
 
 
-def MaskIDs(mask):
+def MaskIDs(mask, mask_props=None):
     """This function will extract the CellIDs and the XY positions for each
     cell based on that cells centroid
 
     Returns a dictionary object"""
 
-    dat = measure.regionprops(mask)
-    n = len(dat)
+    all_mask_props = set(["label", "centroid", "area", "major_axis_length", "minor_axis_length", "eccentricity", "solidity", "extent", "orientation"])
+    if mask_props is not None:
+        all_mask_props = all_mask_props.union(mask_props)
 
-    # Pre-allocate numpy arrays for all properties we'll calculate.
-    labels = np.empty(n, int)
-    xcoords = np.empty(n)
-    ycoords = np.empty(n)
-    area = np.empty(n, int)
-    minor_axis_length = np.empty(n)
-    major_axis_length = np.empty(n)
-    eccentricity = np.empty(n)
-    solidity = np.empty(n)
-    extent = np.empty(n)
-    orientation = np.empty(n)
+    dat = measure.regionprops_table(
+        mask,
+        properties=all_mask_props
+    )
 
-    for i in range(n):
-        labels[i] = dat[i].label
-        xcoords[i] = dat[i].centroid[1]
-        ycoords[i] = dat[i].centroid[0]
-        area[i] = dat[i].area
-        major_axis_length[i] = dat[i].major_axis_length
-        minor_axis_length[i] = dat[i].minor_axis_length
-        eccentricity[i] = dat[i].eccentricity
-        solidity[i] = dat[i].solidity
-        extent[i] = dat[i].extent
-        orientation[i] = dat[i].orientation
-        # By clearing the reference to each RegionProperties object, we allow it
-        # and its cache to be garbage collected immediately. Otherwise memory
-        # usage creeps up needlessly while this function is executing.
-        dat[i] = None
+    name_map = {
+        "CellID": "label",
+        "X_centroid": "centroid-1",
+        "Y_centroid": "centroid-0",
+        "Area": "area",
+        "MajorAxisLength": "major_axis_length",
+        "MinorAxisLength": "minor_axis_length",
+        "Eccentricity": "eccentricity",
+        "Solidity": "solidity",
+        "Extent": "extent",
+        "Orientation": "orientation",
+    }
+    for new_name, old_name in name_map.items():
+        dat[new_name] = dat[old_name]
+    for old_name in set(name_map.values()):
+        del dat[old_name]
+
+    return dat
 
-    IDs = {
-        "CellID": labels,
-        "X_centroid": xcoords,
-        "Y_centroid": ycoords,
-        "column_centroid": xcoords,
-        "row_centroid": ycoords,
-        "Area": area,
-        "MajorAxisLength": major_axis_length,
-        "MinorAxisLength": minor_axis_length,
-        "Eccentricity": eccentricity,
-        "Solidity": solidity,
-        "Extent": extent,
-        "Orientation": orientation,
-    }
+def n_channels(image):
+    """Returns the number of channel in the input image. Supports [OME]TIFF and HDF5."""
+
+    image_path = Path(image)
 
-    return IDs
+    if image_path.suffix in ['.tiff', '.tif', '.btf']:
+        s = tifffile.TiffFile(image).series[0]
+        ndim = len(s.shape)
+        if ndim == 2: return 1
+        elif ndim == 3: return min(s.shape)
+        else: raise Exception('mcquant supports only 2D/3D images.')
 
+    elif image_path.suffix in ['.h5', '.hdf5']:
+        f = h5py.File(image, 'r')
+        dat_name = list(f.keys())[0]
+        return f[dat_name].shape[3]
+
+    else:
+        raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
 
 def PrepareData(image,z):
     """Function for preparing input for maskzstack function. Connecting function
@@ -94,38 +104,26 @@
     print(f'{image_path} at {z}', file=sys.stderr)
 
     #Check to see if image tif(f)
-    if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf':
-        #Check to see if the image is ome.tif(f)
-        if  image.endswith(('.ome.tif','.ome.tiff')):
-            #Read the image
-            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
-            #print('OME TIF(F) found')
-        else:
-            #Read the image
-            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
-            #print('TIF(F) found')
-            # Remove extra axis
-            #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
+    if image_path.suffix in ['.tiff', '.tif', '.btf']:
+        image_loaded_z = tifffile.imread(image, key=z)
 
     #Check to see if image is hdf5
-    elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5':
+    elif image_path.suffix in ['.h5', '.hdf5']:
         #Read the image
-        f = h5py.File(image,'r+')
+        f = h5py.File(image,'r')
         #Get the dataset name from the h5 file
         dat_name = list(f.keys())[0]
-        ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
-        #Get the image data
-        image_loaded = np.array(f[dat_name])
-        #Remove the first axis (ilastik convention)
-        image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
-        ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
-        ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
+        #Retrieve the z^th channel
+        image_loaded_z = f[dat_name][0,:,:,z]
+
+    else:
+        raise Exception('mcquant currently supports [OME]TIFF and HDF5 formats only')
 
     #Return the objects
     return image_loaded_z
 
 
-def MaskZstack(masks_loaded,image,channel_names_loaded):
+def MaskZstack(masks_loaded,image,channel_names_loaded, mask_props=None, intensity_props=["intensity_mean"]):
     """This function will extract the stats for each cell mask through each channel
     in the input image
 
@@ -135,8 +133,7 @@
 
     #Get the names of the keys for the masks dictionary
     mask_names = list(masks_loaded.keys())
-    #Get the CellIDs for this dataset by using only a single mask (first mask)
-    IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
+
     #Create empty dictionary to store channel results per mask
     dict_of_chan = {m_name: [] for m_name in mask_names}
     #Get the z channel and the associated channel name from list of channel names
@@ -149,84 +146,79 @@
         #Iterate through number of masks to extract single cell data
         for nm in range(len(mask_names)):
             #Use the above information to mask z stack
-            dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z))
+            dict_of_chan[mask_names[nm]].append(
+                MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z, intensity_props=intensity_props)
+            )
         #Print progress
         print("Finished "+str(z))
 
-    #Iterate through the rest of the masks to modify names of channels and convert to data table
+    # Column order according to histoCAT convention (Move xy position to end with spatial information)
+    last_cols = (
+        "X_centroid",
+        "Y_centroid",
+        "column_centroid",
+        "row_centroid",
+        "Area",
+        "MajorAxisLength",
+        "MinorAxisLength",
+        "Eccentricity",
+        "Solidity",
+        "Extent",
+        "Orientation",
+    )
+    def col_sort(x):
+        if x == "CellID":
+            return -2
+        try:
+            return last_cols.index(x)
+        except ValueError:
+            return -1
+
+    #Iterate through the masks and format quantifications for each mask and property
     for nm in mask_names:
-        #Check if this is the first mask
-        if nm == mask_names[0]:
-            #Create channel names for this mask
-            new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
-            #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY
-            dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1)
-            #Get the name of the columns in the dataframe so we can reorder to histoCAT convention
-            cols = list(dict_of_chan[nm].columns.values)
-            #Reorder the list (Move xy position to end with spatial information)
-            cols.append(cols.pop(cols.index("X_centroid")))
-            cols.append(cols.pop(cols.index("Y_centroid")))
-            cols.append(cols.pop(cols.index("column_centroid")))
-            cols.append(cols.pop(cols.index("row_centroid")))
-            cols.append(cols.pop(cols.index("Area")))
-            cols.append(cols.pop(cols.index("MajorAxisLength")))
-            cols.append(cols.pop(cols.index("MinorAxisLength")))
-            cols.append(cols.pop(cols.index("Eccentricity")))
-            cols.append(cols.pop(cols.index("Solidity")))
-            cols.append(cols.pop(cols.index("Extent")))
-            cols.append(cols.pop(cols.index("Orientation")))
-            #Reindex the dataframe with new order
-            dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols)
-        #Otherwise, add no spatial information
-        else:
-            #Create channel names for this mask
-            new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
-            #Use the above information to mask z stack
-            dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))
+        mask_dict = {}
+        # Mean intensity is default property, stored without suffix
+        mask_dict.update(
+            zip(channel_names_loaded, [x["intensity_mean"] for x in dict_of_chan[nm]])
+        )
+        # All other properties are suffixed with their names
+        for prop_n in set(dict_of_chan[nm][0].keys()).difference(["intensity_mean"]):
+            mask_dict.update(
+                zip([f"{n}_{prop_n}" for n in channel_names_loaded], [x[prop_n] for x in dict_of_chan[nm]])
+            )
+        # Get the cell IDs and mask properties
+        mask_properties = pd.DataFrame(MaskIDs(masks_loaded[nm], mask_props=mask_props))
+        mask_dict.update(mask_properties)
+        dict_of_chan[nm] = pd.DataFrame(mask_dict).reindex(columns=sorted(mask_dict.keys(), key=col_sort))
 
-    #Concatenate all data from all masks to return
-    dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1)
+    # Return the dict of dataframes for each mask
+    return dict_of_chan
 
-    #Return the dataframe
-    return dat
-
-
-def ExtractSingleCells(masks,image,channel_names,output):
+def ExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
     """Function for extracting single cell information from input
     path containing single-cell masks, z_stack path, and channel_names path."""
 
     #Create pathlib object for output
     output = Path(output)
 
-    #Check if header available
-    #sniffer = csv.Sniffer()
-    #sniffer.has_header(open(channel_names).readline())
-    #If header not available
-    #if not sniffer:
-        #If header available
-        #channel_names_loaded = pd.read_csv(channel_names)
-        #channel_names_loaded_list = list(channel_names_loaded.marker_name)
-    #else:
-        #print("negative")
-        #old one column version
-        #channel_names_loaded = pd.read_csv(channel_names,header=None)
-        #Add a column index for ease
-        #channel_names_loaded.columns = ["marker"]
-        #channel_names_loaded = list(channel_names_loaded.marker.values)
-
     #Read csv channel names
     channel_names_loaded = pd.read_csv(channel_names)
-    #Check for size of columns
-    if channel_names_loaded.shape[1] > 1:
+    #Check for the presence of `marker_name` column
+    if 'marker_name' in channel_names_loaded:
         #Get the marker_name column if more than one column (CyCIF structure)
         channel_names_loaded_list = list(channel_names_loaded.marker_name)
-    else:
-        #old one column version -- re-read the csv file and add column name
+    #Consider the old one-marker-per-line plain text format
+    elif channel_names_loaded.shape[1] == 1:
+        #re-read the csv file and add column name
         channel_names_loaded = pd.read_csv(channel_names, header = None)
-        #Add a column index for ease and for standardization
-        channel_names_loaded.columns = ["marker"]
-        channel_names_loaded_list = list(channel_names_loaded.marker)
+        channel_names_loaded_list = list(channel_names_loaded.iloc[:,0])
+    else:
+        raise Exception('%s must contain the marker_name column'%channel_names)
 
+    #Contrast against the number of markers in the image
+    if len(channel_names_loaded_list) != n_channels(image):
+        raise Exception("The number of channels in %s doesn't match the image"%channel_names)
+    
     #Check for unique marker names -- create new list to store new names
     channel_names_loaded_checked = []
     for idx,val in enumerate(channel_names_loaded_list):
@@ -238,9 +230,6 @@
             #Otherwise, leave channel name
             channel_names_loaded_checked.append(val)
 
-    #Clear small memory amount by clearing old channel names
-    channel_names_loaded, channel_names_loaded_list = None, None
-
     #Read the masks
     masks_loaded = {}
     #iterate through mask paths and read images to add to dictionary object
@@ -249,22 +238,30 @@
         m_name = m_full_name.split('.')[0]
         masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')})
 
-    scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked)
+    scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked, mask_props=mask_props, intensity_props=intensity_props)
     #Write the singe cell data to a csv file using the image name
 
     im_full_name = os.path.basename(image)
     im_name = im_full_name.split('.')[0]
-    scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)
+
+    # iterate through each mask and export csv with mask name as suffix
+    for k,v in scdata_z.items():
+        # export the csv for this mask name
+        scdata_z[k].to_csv(
+                            str(Path(os.path.join(str(output),
+                            str(im_name+"_{}"+".csv").format(k)))),
+                            index=False
+                            )
 
 
-def MultiExtractSingleCells(masks,image,channel_names,output):
+def MultiExtractSingleCells(masks,image,channel_names,output, mask_props=None, intensity_props=["intensity_mean"]):
     """Function for iterating over a list of z_stacks and output locations to
     export single-cell data from image masks"""
 
     print("Extracting single-cell data for "+str(image)+'...')
 
     #Run the ExtractSingleCells function for this image
-    ExtractSingleCells(masks,image,channel_names,output)
+    ExtractSingleCells(masks,image,channel_names,output, mask_props=mask_props, intensity_props=intensity_props)
 
     #Print update
     im_full_name = os.path.basename(image)
--- a/macros.xml	Fri Mar 12 00:19:24 2021 +0000
+++ b/macros.xml	Fri Mar 11 23:35:52 2022 +0000
@@ -2,12 +2,13 @@
 <macros>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="3.6.10">python</requirement>
-            <requirement type="package" version="0.17.2">scikit-image</requirement>
-            <requirement type="package" version="2.10.0">h5py</requirement>
-            <requirement type="package" version="1.0.4">pandas</requirement>
-            <requirement type="package" version="1.18.5">numpy</requirement>
-            <requirement type="package" version="1.0.1">pathlib</requirement>
+            <container type="docker">labsyspharm/quantification:@VERSION@</container>
+            <requirement type="package" version="3.9">python</requirement>
+            <requirement type="package" version="0.18.0">scikit-image</requirement>
+            <requirement type="package">h5py</requirement>
+            <requirement type="package">pandas</requirement>
+            <requirement type="package">numpy</requirement>
+            <requirement type="package">pathlib</requirement>
         </requirements>
     </xml>
 
@@ -19,6 +20,13 @@
         </citations>
     </xml>
 
-    <token name="@VERSION@">1.3.1</token>
-    <token name="@CMD_BEGIN@">python ${__tool_directory__}/CommandSingleCellExtraction.py</token>
+    <token name="@VERSION@">1.5.1</token>
+    <token name="@CMD_BEGIN@"><![CDATA[
+    QUANT_PATH="";
+    if [ -f "/app/CommandSingleCellExtraction.py" ]; then
+        export QUANT_PATH="/app/CommandSingleCellExtraction.py";
+    else
+        export QUANT_PATH="${__tool_directory__}/CommandSingleCellExtraction.py";
+    fi;
+    ]]></token>
 </macros>
--- a/quantification.xml	Fri Mar 12 00:19:24 2021 +0000
+++ b/quantification.xml	Fri Mar 11 23:35:52 2022 +0000
@@ -18,6 +18,7 @@
 
     @CMD_BEGIN@
     
+    python \$QUANT_PATH
     --masks 
     '${primary_mask.name}'.ome.tiff
     #if $supp_masks
@@ -28,9 +29,17 @@
 
     --image '${image.name}'.ome.tiff
     --output ./tool_out
+   
+    #if $mask_props
+    --mask_props $mask_props
+    #end if
+    #if $intensity_props
+    --intensity_props $intensity_props
+    #end if
+
     --channel_names '$channel_names';
 
-    mv ./tool_out/*.csv ./tool_out/quantified.csv;
+    cp tool_out/*cellMasks.csv cellMasks.csv
     ]]></command>
 
     <inputs>
@@ -38,11 +47,16 @@
         <param name="primary_mask" type="data" format="tiff" label="Primary Cell Mask"/>
         <param name="supp_masks" type="data" multiple="true" optional="true" format="tiff" label="Additional Cell Masks"/>
         <param name="channel_names" type="data" format="csv" label="Marker Channels"/>
+        <param name="mask_props" type="text" label="Mask Metrics" help="Space separated list of additional metrics to be calculated for every mask."/>
+        <param name="intensity_props" type="text" label="Intensity Metrics" help="Space separated list of additional metrics to be calculated for every marker separately."/>
     </inputs>
 
     <outputs>
-        <data format="csv" name="quant_out" from_work_dir="./tool_out/quantified.csv" label="${tool.name} on ${on_string}"/>
-    </outputs>
+        <data format="csv" name="cellmask" from_work_dir="cellMasks.csv" label="CellMaskQuant"/>
+        <collection type="list" name="quantification" label="${tool.name} on ${on_string}">
+            <discover_datasets pattern="__designation_and_ext__" format="csv" directory="tool_out/" visible="true"/>
+        </collection>
+     </outputs>
     <help><![CDATA[
 # Single cell quantification
 Module for single-cell data extraction given a segmentation mask and multi-channel image. The CSV structure is aligned with histoCAT output.