view SingleCellDataExtraction.py @ 0:928db0f952e3 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit a4349062e9177b5e60fb7c49115c57299e0d648d-dirty"
author perssond
date Fri, 12 Mar 2021 00:19:24 +0000
parents
children aba3655fdef0
line wrap: on
line source

#Functions for reading in single cell imaging data
#Joshua Hess

#Import necessary modules
import skimage.io
import h5py
import pandas as pd
import numpy as np
import os
import skimage.measure as measure
from pathlib import Path
import csv

import sys


def MaskChannel(mask_loaded,image_loaded_z):
    """Function for quantifying a single channel image

    Returns a table with CellID according to the mask and the mean pixel intensity
    for the given channel for each cell"""
    print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
    print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
    dat = measure.regionprops(mask_loaded, image_loaded_z)
    n = len(dat)
    intensity_z = np.empty(n)
    for i in range(n):
        intensity_z[i] = dat[i].mean_intensity
        # Clear reference to avoid memory leak -- see MaskIDs for explanation.
        dat[i] = None
    return intensity_z


def MaskIDs(mask):
    """This function will extract the CellIDs and the XY positions for each
    cell based on that cells centroid

    Returns a dictionary object"""

    dat = measure.regionprops(mask)
    n = len(dat)

    # Pre-allocate numpy arrays for all properties we'll calculate.
    labels = np.empty(n, int)
    xcoords = np.empty(n)
    ycoords = np.empty(n)
    area = np.empty(n, int)
    minor_axis_length = np.empty(n)
    major_axis_length = np.empty(n)
    eccentricity = np.empty(n)
    solidity = np.empty(n)
    extent = np.empty(n)
    orientation = np.empty(n)

    for i in range(n):
        labels[i] = dat[i].label
        xcoords[i] = dat[i].centroid[1]
        ycoords[i] = dat[i].centroid[0]
        area[i] = dat[i].area
        major_axis_length[i] = dat[i].major_axis_length
        minor_axis_length[i] = dat[i].minor_axis_length
        eccentricity[i] = dat[i].eccentricity
        solidity[i] = dat[i].solidity
        extent[i] = dat[i].extent
        orientation[i] = dat[i].orientation
        # By clearing the reference to each RegionProperties object, we allow it
        # and its cache to be garbage collected immediately. Otherwise memory
        # usage creeps up needlessly while this function is executing.
        dat[i] = None

    IDs = {
        "CellID": labels,
        "X_centroid": xcoords,
        "Y_centroid": ycoords,
        "column_centroid": xcoords,
        "row_centroid": ycoords,
        "Area": area,
        "MajorAxisLength": major_axis_length,
        "MinorAxisLength": minor_axis_length,
        "Eccentricity": eccentricity,
        "Solidity": solidity,
        "Extent": extent,
        "Orientation": orientation,
    }

    return IDs


def PrepareData(image,z):
    """Function for preparing input for maskzstack function. Connecting function
    to use with mc micro ilastik pipeline"""

    image_path = Path(image)
    print(f'{image_path} at {z}', file=sys.stderr)

    #Check to see if image tif(f)
    if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf':
        #Check to see if the image is ome.tif(f)
        if  image.endswith(('.ome.tif','.ome.tiff')):
            #Read the image
            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
            #print('OME TIF(F) found')
        else:
            #Read the image
            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
            #print('TIF(F) found')
            # Remove extra axis
            #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))

    #Check to see if image is hdf5
    elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5':
        #Read the image
        f = h5py.File(image,'r+')
        #Get the dataset name from the h5 file
        dat_name = list(f.keys())[0]
        ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
        #Get the image data
        image_loaded = np.array(f[dat_name])
        #Remove the first axis (ilastik convention)
        image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
        ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
        ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)

    #Return the objects
    return image_loaded_z


def MaskZstack(masks_loaded,image,channel_names_loaded):
    """This function will extract the stats for each cell mask through each channel
    in the input image

    mask_loaded: dictionary containing Tiff masks that represents the cells in your image.

    z_stack: Multichannel z stack image"""

    #Get the names of the keys for the masks dictionary
    mask_names = list(masks_loaded.keys())
    #Get the CellIDs for this dataset by using only a single mask (first mask)
    IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
    #Create empty dictionary to store channel results per mask
    dict_of_chan = {m_name: [] for m_name in mask_names}
    #Get the z channel and the associated channel name from list of channel names
    print(f'channels: {channel_names_loaded}', file=sys.stderr)
    print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr)
    for z in range(len(channel_names_loaded)):
        #Run the data Prep function
        image_loaded_z = PrepareData(image,z)

        #Iterate through number of masks to extract single cell data
        for nm in range(len(mask_names)):
            #Use the above information to mask z stack
            dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z))
        #Print progress
        print("Finished "+str(z))

    #Iterate through the rest of the masks to modify names of channels and convert to data table
    for nm in mask_names:
        #Check if this is the first mask
        if nm == mask_names[0]:
            #Create channel names for this mask
            new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
            #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY
            dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1)
            #Get the name of the columns in the dataframe so we can reorder to histoCAT convention
            cols = list(dict_of_chan[nm].columns.values)
            #Reorder the list (Move xy position to end with spatial information)
            cols.append(cols.pop(cols.index("X_centroid")))
            cols.append(cols.pop(cols.index("Y_centroid")))
            cols.append(cols.pop(cols.index("column_centroid")))
            cols.append(cols.pop(cols.index("row_centroid")))
            cols.append(cols.pop(cols.index("Area")))
            cols.append(cols.pop(cols.index("MajorAxisLength")))
            cols.append(cols.pop(cols.index("MinorAxisLength")))
            cols.append(cols.pop(cols.index("Eccentricity")))
            cols.append(cols.pop(cols.index("Solidity")))
            cols.append(cols.pop(cols.index("Extent")))
            cols.append(cols.pop(cols.index("Orientation")))
            #Reindex the dataframe with new order
            dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols)
        #Otherwise, add no spatial information
        else:
            #Create channel names for this mask
            new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))]
            #Use the above information to mask z stack
            dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))

    #Concatenate all data from all masks to return
    dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1)

    #Return the dataframe
    return dat


def ExtractSingleCells(masks,image,channel_names,output):
    """Function for extracting single cell information from input
    path containing single-cell masks, z_stack path, and channel_names path."""

    #Create pathlib object for output
    output = Path(output)

    #Check if header available
    #sniffer = csv.Sniffer()
    #sniffer.has_header(open(channel_names).readline())
    #If header not available
    #if not sniffer:
        #If header available
        #channel_names_loaded = pd.read_csv(channel_names)
        #channel_names_loaded_list = list(channel_names_loaded.marker_name)
    #else:
        #print("negative")
        #old one column version
        #channel_names_loaded = pd.read_csv(channel_names,header=None)
        #Add a column index for ease
        #channel_names_loaded.columns = ["marker"]
        #channel_names_loaded = list(channel_names_loaded.marker.values)

    #Read csv channel names
    channel_names_loaded = pd.read_csv(channel_names)
    #Check for size of columns
    if channel_names_loaded.shape[1] > 1:
        #Get the marker_name column if more than one column (CyCIF structure)
        channel_names_loaded_list = list(channel_names_loaded.marker_name)
    else:
        #old one column version -- re-read the csv file and add column name
        channel_names_loaded = pd.read_csv(channel_names, header = None)
        #Add a column index for ease and for standardization
        channel_names_loaded.columns = ["marker"]
        channel_names_loaded_list = list(channel_names_loaded.marker)

    #Check for unique marker names -- create new list to store new names
    channel_names_loaded_checked = []
    for idx,val in enumerate(channel_names_loaded_list):
        #Check for unique value
        if channel_names_loaded_list.count(val) > 1:
            #If unique count greater than one, add suffix
            channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1))
        else:
            #Otherwise, leave channel name
            channel_names_loaded_checked.append(val)

    #Clear small memory amount by clearing old channel names
    channel_names_loaded, channel_names_loaded_list = None, None

    #Read the masks
    masks_loaded = {}
    #iterate through mask paths and read images to add to dictionary object
    for m in masks:
        m_full_name = os.path.basename(m)
        m_name = m_full_name.split('.')[0]
        masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')})

    scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked)
    #Write the singe cell data to a csv file using the image name

    im_full_name = os.path.basename(image)
    im_name = im_full_name.split('.')[0]
    scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)


def MultiExtractSingleCells(masks,image,channel_names,output):
    """Function for iterating over a list of z_stacks and output locations to
    export single-cell data from image masks"""

    print("Extracting single-cell data for "+str(image)+'...')

    #Run the ExtractSingleCells function for this image
    ExtractSingleCells(masks,image,channel_names,output)

    #Print update
    im_full_name = os.path.basename(image)
    im_name = im_full_name.split('.')[0]
    print("Finished "+str(im_name))