Mercurial > repos > perssond > quantification
diff SingleCellDataExtraction.py @ 0:928db0f952e3 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit a4349062e9177b5e60fb7c49115c57299e0d648d-dirty"
author | perssond |
---|---|
date | Fri, 12 Mar 2021 00:19:24 +0000 |
parents | |
children | aba3655fdef0 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SingleCellDataExtraction.py Fri Mar 12 00:19:24 2021 +0000 @@ -0,0 +1,272 @@ +#Functions for reading in single cell imaging data +#Joshua Hess + +#Import necessary modules +import skimage.io +import h5py +import pandas as pd +import numpy as np +import os +import skimage.measure as measure +from pathlib import Path +import csv + +import sys + + +def MaskChannel(mask_loaded,image_loaded_z): + """Function for quantifying a single channel image + + Returns a table with CellID according to the mask and the mean pixel intensity + for the given channel for each cell""" + print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr) + print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr) + dat = measure.regionprops(mask_loaded, image_loaded_z) + n = len(dat) + intensity_z = np.empty(n) + for i in range(n): + intensity_z[i] = dat[i].mean_intensity + # Clear reference to avoid memory leak -- see MaskIDs for explanation. + dat[i] = None + return intensity_z + + +def MaskIDs(mask): + """This function will extract the CellIDs and the XY positions for each + cell based on that cells centroid + + Returns a dictionary object""" + + dat = measure.regionprops(mask) + n = len(dat) + + # Pre-allocate numpy arrays for all properties we'll calculate. + labels = np.empty(n, int) + xcoords = np.empty(n) + ycoords = np.empty(n) + area = np.empty(n, int) + minor_axis_length = np.empty(n) + major_axis_length = np.empty(n) + eccentricity = np.empty(n) + solidity = np.empty(n) + extent = np.empty(n) + orientation = np.empty(n) + + for i in range(n): + labels[i] = dat[i].label + xcoords[i] = dat[i].centroid[1] + ycoords[i] = dat[i].centroid[0] + area[i] = dat[i].area + major_axis_length[i] = dat[i].major_axis_length + minor_axis_length[i] = dat[i].minor_axis_length + eccentricity[i] = dat[i].eccentricity + solidity[i] = dat[i].solidity + extent[i] = dat[i].extent + orientation[i] = dat[i].orientation + # By clearing the reference to each RegionProperties object, we allow it + # and its cache to be garbage collected immediately. Otherwise memory + # usage creeps up needlessly while this function is executing. + dat[i] = None + + IDs = { + "CellID": labels, + "X_centroid": xcoords, + "Y_centroid": ycoords, + "column_centroid": xcoords, + "row_centroid": ycoords, + "Area": area, + "MajorAxisLength": major_axis_length, + "MinorAxisLength": minor_axis_length, + "Eccentricity": eccentricity, + "Solidity": solidity, + "Extent": extent, + "Orientation": orientation, + } + + return IDs + + +def PrepareData(image,z): + """Function for preparing input for maskzstack function. Connecting function + to use with mc micro ilastik pipeline""" + + image_path = Path(image) + print(f'{image_path} at {z}', file=sys.stderr) + + #Check to see if image tif(f) + if image_path.suffix == '.tiff' or image_path.suffix == '.tif' or image_path.suffix == '.btf': + #Check to see if the image is ome.tif(f) + if image.endswith(('.ome.tif','.ome.tiff')): + #Read the image + image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') + #print('OME TIF(F) found') + else: + #Read the image + image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile') + #print('TIF(F) found') + # Remove extra axis + #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4])) + + #Check to see if image is hdf5 + elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5': + #Read the image + f = h5py.File(image,'r+') + #Get the dataset name from the h5 file + dat_name = list(f.keys())[0] + ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data' + #Get the image data + image_loaded = np.array(f[dat_name]) + #Remove the first axis (ilastik convention) + image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3])) + ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be + ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1) + + #Return the objects + return image_loaded_z + + +def MaskZstack(masks_loaded,image,channel_names_loaded): + """This function will extract the stats for each cell mask through each channel + in the input image + + mask_loaded: dictionary containing Tiff masks that represents the cells in your image. + + z_stack: Multichannel z stack image""" + + #Get the names of the keys for the masks dictionary + mask_names = list(masks_loaded.keys()) + #Get the CellIDs for this dataset by using only a single mask (first mask) + IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]])) + #Create empty dictionary to store channel results per mask + dict_of_chan = {m_name: [] for m_name in mask_names} + #Get the z channel and the associated channel name from list of channel names + print(f'channels: {channel_names_loaded}', file=sys.stderr) + print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr) + for z in range(len(channel_names_loaded)): + #Run the data Prep function + image_loaded_z = PrepareData(image,z) + + #Iterate through number of masks to extract single cell data + for nm in range(len(mask_names)): + #Use the above information to mask z stack + dict_of_chan[mask_names[nm]].append(MaskChannel(masks_loaded[mask_names[nm]],image_loaded_z)) + #Print progress + print("Finished "+str(z)) + + #Iterate through the rest of the masks to modify names of channels and convert to data table + for nm in mask_names: + #Check if this is the first mask + if nm == mask_names[0]: + #Create channel names for this mask + new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] + #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY + dict_of_chan[nm] = pd.concat([IDs,pd.DataFrame(dict(zip(new_names,dict_of_chan[nm])))],axis=1) + #Get the name of the columns in the dataframe so we can reorder to histoCAT convention + cols = list(dict_of_chan[nm].columns.values) + #Reorder the list (Move xy position to end with spatial information) + cols.append(cols.pop(cols.index("X_centroid"))) + cols.append(cols.pop(cols.index("Y_centroid"))) + cols.append(cols.pop(cols.index("column_centroid"))) + cols.append(cols.pop(cols.index("row_centroid"))) + cols.append(cols.pop(cols.index("Area"))) + cols.append(cols.pop(cols.index("MajorAxisLength"))) + cols.append(cols.pop(cols.index("MinorAxisLength"))) + cols.append(cols.pop(cols.index("Eccentricity"))) + cols.append(cols.pop(cols.index("Solidity"))) + cols.append(cols.pop(cols.index("Extent"))) + cols.append(cols.pop(cols.index("Orientation"))) + #Reindex the dataframe with new order + dict_of_chan[nm] = dict_of_chan[nm].reindex(columns=cols) + #Otherwise, add no spatial information + else: + #Create channel names for this mask + new_names = [channel_names_loaded[i]+"_"+str(nm) for i in range(len(channel_names_loaded))] + #Use the above information to mask z stack + dict_of_chan[nm] = pd.DataFrame(dict(zip(new_names,dict_of_chan[nm]))) + + #Concatenate all data from all masks to return + dat = pd.concat([dict_of_chan[nm] for nm in mask_names],axis=1) + + #Return the dataframe + return dat + + +def ExtractSingleCells(masks,image,channel_names,output): + """Function for extracting single cell information from input + path containing single-cell masks, z_stack path, and channel_names path.""" + + #Create pathlib object for output + output = Path(output) + + #Check if header available + #sniffer = csv.Sniffer() + #sniffer.has_header(open(channel_names).readline()) + #If header not available + #if not sniffer: + #If header available + #channel_names_loaded = pd.read_csv(channel_names) + #channel_names_loaded_list = list(channel_names_loaded.marker_name) + #else: + #print("negative") + #old one column version + #channel_names_loaded = pd.read_csv(channel_names,header=None) + #Add a column index for ease + #channel_names_loaded.columns = ["marker"] + #channel_names_loaded = list(channel_names_loaded.marker.values) + + #Read csv channel names + channel_names_loaded = pd.read_csv(channel_names) + #Check for size of columns + if channel_names_loaded.shape[1] > 1: + #Get the marker_name column if more than one column (CyCIF structure) + channel_names_loaded_list = list(channel_names_loaded.marker_name) + else: + #old one column version -- re-read the csv file and add column name + channel_names_loaded = pd.read_csv(channel_names, header = None) + #Add a column index for ease and for standardization + channel_names_loaded.columns = ["marker"] + channel_names_loaded_list = list(channel_names_loaded.marker) + + #Check for unique marker names -- create new list to store new names + channel_names_loaded_checked = [] + for idx,val in enumerate(channel_names_loaded_list): + #Check for unique value + if channel_names_loaded_list.count(val) > 1: + #If unique count greater than one, add suffix + channel_names_loaded_checked.append(val + "_"+ str(channel_names_loaded_list[:idx].count(val) + 1)) + else: + #Otherwise, leave channel name + channel_names_loaded_checked.append(val) + + #Clear small memory amount by clearing old channel names + channel_names_loaded, channel_names_loaded_list = None, None + + #Read the masks + masks_loaded = {} + #iterate through mask paths and read images to add to dictionary object + for m in masks: + m_full_name = os.path.basename(m) + m_name = m_full_name.split('.')[0] + masks_loaded.update({str(m_name):skimage.io.imread(m,plugin='tifffile')}) + + scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked) + #Write the singe cell data to a csv file using the image name + + im_full_name = os.path.basename(image) + im_name = im_full_name.split('.')[0] + scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False) + + +def MultiExtractSingleCells(masks,image,channel_names,output): + """Function for iterating over a list of z_stacks and output locations to + export single-cell data from image masks""" + + print("Extracting single-cell data for "+str(image)+'...') + + #Run the ExtractSingleCells function for this image + ExtractSingleCells(masks,image,channel_names,output) + + #Print update + im_full_name = os.path.basename(image) + im_name = im_full_name.split('.')[0] + print("Finished "+str(im_name))