Mercurial > repos > perssond > quantification
comparison SingleCellDataExtraction.py @ 0:928db0f952e3 draft
"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit a4349062e9177b5e60fb7c49115c57299e0d648d-dirty"
| author | perssond |
|---|---|
| date | Fri, 12 Mar 2021 00:19:24 +0000 |
| parents | |
| children | aba3655fdef0 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:928db0f952e3 |
|---|---|
| 1 #Functions for reading in single cell imaging data | |
| 2 #Joshua Hess | |
| 3 | |
| 4 #Import necessary modules | |
| 5 import skimage.io | |
| 6 import h5py | |
| 7 import pandas as pd | |
| 8 import numpy as np | |
| 9 import os | |
| 10 import skimage.measure as measure | |
| 11 from pathlib import Path | |
| 12 import csv | |
| 13 | |
| 14 import sys | |
| 15 | |
| 16 | |
def MaskChannel(mask_loaded,image_loaded_z):
    """Quantify a single channel image against a labelled mask.

    Runs ``skimage.measure.regionprops`` on ``mask_loaded`` with
    ``image_loaded_z`` as the intensity image and collects the mean
    intensity of every region.

    Returns a 1-D NumPy float array with one mean intensity per region,
    in the order regionprops reports the regions."""
    print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
    print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
    regions = measure.regionprops(mask_loaded, image_loaded_z)
    intensity_z = np.empty(len(regions))
    # Walk the regions from the back, popping each one off the list as it is
    # measured. Dropping the list's reference right away lets every
    # RegionProperties object (and its cache) be reclaimed immediately
    # instead of piling up while the loop runs.
    slot = len(regions)
    while regions:
        slot -= 1
        intensity_z[slot] = regions.pop().mean_intensity
    return intensity_z
| 32 | |
| 33 | |
def MaskIDs(mask):
    """Collect per-region identifiers, centroids and shape descriptors.

    Runs ``skimage.measure.regionprops`` on ``mask`` and records, for every
    region, its label, centroid, area and a set of shape properties.

    Returns a dictionary mapping column names to NumPy arrays (one entry per
    region). "column_centroid"/"row_centroid" refer to the very same arrays
    as "X_centroid"/"Y_centroid"."""

    # (output column name, RegionProperties attribute) for the float-valued
    # shape descriptors, in the order they appear in the result.
    shape_columns = (
        ("MajorAxisLength", "major_axis_length"),
        ("MinorAxisLength", "minor_axis_length"),
        ("Eccentricity", "eccentricity"),
        ("Solidity", "solidity"),
        ("Extent", "extent"),
        ("Orientation", "orientation"),
    )

    regions = measure.regionprops(mask)
    count = len(regions)

    # Allocate every output array up front.
    labels = np.empty(count, int)
    area = np.empty(count, int)
    xcoords = np.empty(count)
    ycoords = np.empty(count)
    shape_values = {column: np.empty(count) for column, _ in shape_columns}

    for idx in range(count):
        # Detach the RegionProperties object from the list as soon as we pick
        # it up, so it and its cache can be garbage collected once the loop
        # moves on rather than accumulating for the whole function call.
        region = regions[idx]
        regions[idx] = None

        labels[idx] = region.label
        centroid = region.centroid
        xcoords[idx] = centroid[1]
        ycoords[idx] = centroid[0]
        area[idx] = region.area
        for column, prop in shape_columns:
            shape_values[column][idx] = getattr(region, prop)

    IDs = {
        "CellID": labels,
        "X_centroid": xcoords,
        "Y_centroid": ycoords,
        "column_centroid": xcoords,
        "row_centroid": ycoords,
        "Area": area,
    }
    # Append the shape descriptors after the positional columns.
    IDs.update(shape_values)

    return IDs
| 87 | |
| 88 | |
def PrepareData(image,z):
    """Load plane ``z`` of the image stored at ``image``.

    Connecting function used by MaskZstack (mcmicro / ilastik pipeline).

    Parameters
    ----------
    image : path of the image file. Supported suffixes are .tif, .tiff and
        .btf (read through skimage's tifffile plugin) and .h5 / .hdf5.
    z : index of the plane (channel) to load.

    Returns
    -------
    The array holding the requested plane.

    Raises
    ------
    ValueError
        If the file suffix is not one of the supported formats.
    """

    image_path = Path(image)
    print(f'{image_path} at {z}', file=sys.stderr)

    suffix = image_path.suffix

    #TIFF family: OME-TIFF and plain TIFF are read exactly the same way
    if suffix in ('.tiff', '.tif', '.btf'):
        image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')

    #HDF5
    elif suffix in ('.h5', '.hdf5'):
        #Read-only access is enough; the context manager closes the file
        with h5py.File(image,'r') as f:
            #Use the first dataset in the file
            ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
            dat_name = list(f.keys())[0]
            image_loaded = np.array(f[dat_name])
        #Remove the first axis (ilastik convention)
        image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
        #NOTE(review): assumes the remaining leading axis indexes the channel
        #(i.e. the stack is channel-first after dropping the first axis) --
        #TODO confirm. Data exported from the ilastik fiji plugin may need its
        #axes swapped before this selection.
        image_loaded_z = image_loaded[z]

    else:
        #Fail with an explicit message instead of an unbound-variable error
        raise ValueError(
            f"Unsupported image format '{suffix}' for {image_path}; "
            "expected .tif, .tiff, .btf, .h5 or .hdf5"
        )

    #Return the requested plane
    return image_loaded_z
| 126 | |
| 127 | |
def MaskZstack(masks_loaded,image,channel_names_loaded):
    """Quantify every channel of an image for every mask.

    masks_loaded: dictionary mapping a mask name to its labelled mask array.

    image: path of the multichannel image; channels are loaded one at a
    time through PrepareData.

    channel_names_loaded: list of channel names; its length determines how
    many channels are read.

    Returns a single pandas DataFrame. The first mask contributes CellID, its
    channel columns and then the spatial/shape columns; every further mask
    contributes only its channel columns. Channel columns are named
    "<channel>_<mask name>"."""

    #Names of the masks, in dictionary order
    mask_names = list(masks_loaded.keys())
    #CellIDs and spatial features come from the first mask only
    IDs = pd.DataFrame(MaskIDs(masks_loaded[mask_names[0]]))
    #One list of per-channel intensity arrays for each mask
    dict_of_chan = {m_name: [] for m_name in mask_names}

    print(f'channels: {channel_names_loaded}', file=sys.stderr)
    print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr)
    for z in range(len(channel_names_loaded)):
        #Load just this channel
        image_loaded_z = PrepareData(image,z)

        #Quantify the channel under every mask
        for m_name in mask_names:
            dict_of_chan[m_name].append(MaskChannel(masks_loaded[m_name],image_loaded_z))
        #Print progress
        print("Finished "+str(z))

    #Columns moved behind the channel columns (histoCAT convention), in order
    trailing_columns = (
        "X_centroid",
        "Y_centroid",
        "column_centroid",
        "row_centroid",
        "Area",
        "MajorAxisLength",
        "MinorAxisLength",
        "Eccentricity",
        "Solidity",
        "Extent",
        "Orientation",
    )

    #Turn the collected intensities into one table per mask
    tables = []
    for position, m_name in enumerate(mask_names):
        #Channel names carry the mask name as suffix
        new_names = [channel+"_"+str(m_name) for channel in channel_names_loaded]
        table = pd.DataFrame(dict(zip(new_names,dict_of_chan[m_name])))
        #Only the first mask carries the CellIDs and spatial information
        if position == 0:
            table = pd.concat([IDs,table],axis=1)
            cols = list(table.columns.values)
            #Move each spatial column to the end, one after the other
            for column in trailing_columns:
                cols.append(cols.pop(cols.index(column)))
            table = table.reindex(columns=cols)
        tables.append(table)

    #Join the tables of all masks side by side
    return pd.concat(tables,axis=1)
| 192 | |
| 193 | |
def ExtractSingleCells(masks,image,channel_names,output):
    """Extract single-cell data for one image and write it to a csv file.

    masks: iterable of paths to the mask images.
    image: path of the multichannel image.
    channel_names: path of the csv file listing the channel names. A file with
    more than one column must provide a "marker_name" column; a one-column
    file is read without header.
    output: directory receiving "<image name>.csv", where the image name is
    the part of the image's base name before the first dot."""

    #Create pathlib object for output
    output = Path(output)

    #Read csv channel names
    channel_table = pd.read_csv(channel_names)
    if channel_table.shape[1] > 1:
        #More than one column (CyCIF structure): take the marker_name column
        raw_names = list(channel_table.marker_name)
    else:
        #Old one column version -- re-read without header and name the column
        channel_table = pd.read_csv(channel_names, header = None)
        channel_table.columns = ["marker"]
        raw_names = list(channel_table.marker)

    #Count how often every marker name occurs
    totals = {}
    for name in raw_names:
        totals[name] = totals.get(name, 0) + 1

    #Names occurring more than once get a running "_<n>" suffix (1-based)
    occurrences = {}
    channel_names_loaded_checked = []
    for name in raw_names:
        if totals[name] > 1:
            occurrences[name] = occurrences.get(name, 0) + 1
            channel_names_loaded_checked.append(name + "_"+ str(occurrences[name]))
        else:
            channel_names_loaded_checked.append(name)

    #Release the intermediate channel name objects
    del channel_table, raw_names

    #Read every mask, keyed by the part of its file name before the first dot
    masks_loaded = {}
    for mask_path in masks:
        mask_key = os.path.basename(mask_path).split('.')[0]
        masks_loaded[str(mask_key)] = skimage.io.imread(mask_path,plugin='tifffile')

    scdata_z = MaskZstack(masks_loaded,image,channel_names_loaded_checked)

    #Write the single cell data to a csv file named after the image
    im_name = os.path.basename(image).split('.')[0]
    csv_path = output / (im_name+".csv")
    scdata_z.to_csv(str(csv_path),index=False)
| 258 | |
| 259 | |
def MultiExtractSingleCells(masks,image,channel_names,output):
    """Run ExtractSingleCells for one image, printing a message before and
    after the extraction.

    All arguments are passed to ExtractSingleCells unchanged."""

    #Announce the start
    print(f"Extracting single-cell data for {image!s}...")

    #Do the actual extraction for this image
    ExtractSingleCells(masks,image,channel_names,output)

    #Announce completion, using the image file name up to its first dot
    im_name = os.path.basename(image).split('.')[0]
    print(f"Finished {im_name!s}")
