comparison SingleCellDataExtraction.py @ 0:928db0f952e3 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/quantification commit a4349062e9177b5e60fb7c49115c57299e0d648d-dirty"
author perssond
date Fri, 12 Mar 2021 00:19:24 +0000
parents
children aba3655fdef0
comparison
equal deleted inserted replaced
-1:000000000000 0:928db0f952e3
1 #Functions for reading in single cell imaging data
2 #Joshua Hess
3
4 #Import necessary modules
5 import skimage.io
6 import h5py
7 import pandas as pd
8 import numpy as np
9 import os
10 import skimage.measure as measure
11 from pathlib import Path
12 import csv
13
14 import sys
15
16
def MaskChannel(mask_loaded,image_loaded_z):
    """Quantify a single channel image against a labeled mask.

    Returns a 1-D numpy array with the mean pixel intensity of
    ``image_loaded_z`` inside each region of ``mask_loaded``, in the order
    the regions are reported by ``measure.regionprops``."""
    print(f'Mask loaded: {mask_loaded.shape}', file=sys.stderr)
    print(f'Image loaded: {image_loaded_z.shape}', file=sys.stderr)
    regions = measure.regionprops(mask_loaded, image_loaded_z)
    mean_per_region = np.empty(len(regions))
    for idx, region in enumerate(regions):
        mean_per_region[idx] = region.mean_intensity
        # Drop the list's reference so each region object (and whatever it
        # caches) can be reclaimed while the loop is still running.
        regions[idx] = None
    return mean_per_region
32
33
def MaskIDs(mask):
    """Collect the CellIDs, centroid positions and shape measurements of
    every labeled region in a mask.

    Returns a dictionary mapping column names to numpy arrays holding one
    value per region. The centroid arrays are stored twice, under the
    X/Y names and under the column/row names."""

    regions = measure.regionprops(mask)
    count = len(regions)

    # Integer-valued properties.
    cell_ids = np.empty(count, int)
    areas = np.empty(count, int)
    # Floating-point properties.
    col_centers = np.empty(count)
    row_centers = np.empty(count)
    major_axes = np.empty(count)
    minor_axes = np.empty(count)
    eccentricities = np.empty(count)
    solidities = np.empty(count)
    extents = np.empty(count)
    orientations = np.empty(count)

    for idx, region in enumerate(regions):
        cell_ids[idx] = region.label
        # centroid is indexed as (row, column)
        center = region.centroid
        col_centers[idx] = center[1]
        row_centers[idx] = center[0]
        areas[idx] = region.area
        major_axes[idx] = region.major_axis_length
        minor_axes[idx] = region.minor_axis_length
        eccentricities[idx] = region.eccentricity
        solidities[idx] = region.solidity
        extents[idx] = region.extent
        orientations[idx] = region.orientation
        # Release the list's reference to this region so it and its cache
        # can be garbage collected instead of accumulating during the loop.
        regions[idx] = None

    return {
        "CellID": cell_ids,
        "X_centroid": col_centers,
        "Y_centroid": row_centers,
        "column_centroid": col_centers,
        "row_centroid": row_centers,
        "Area": areas,
        "MajorAxisLength": major_axes,
        "MinorAxisLength": minor_axes,
        "Eccentricity": eccentricities,
        "Solidity": solidities,
        "Extent": extents,
        "Orientation": orientations,
    }
87
88
def PrepareData(image,z):
    """Load one channel of an image for MaskZstack.

    Connecting function for use with the mcmicro / ilastik pipeline.

    Parameters
    ----------
    image : str or path-like
        Path to the image. The format is chosen from the file extension:
        '.tif', '.tiff' and '.btf' are read as TIFF, '.h5' and '.hdf5' as HDF5.
    z : int
        Index of the channel to load.

    Returns
    -------
    The image data of channel ``z``.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported formats.
    """

    image_path = Path(image)
    print(f'{image_path} at {z}', file=sys.stderr)

    suffix = image_path.suffix

    #TIFF family -- plain and ome.tif(f) files are read the same way, with
    #img_num selecting the requested plane
    if suffix in ('.tiff', '.tif', '.btf'):
        return skimage.io.imread(image,img_num=z,plugin='tifffile')

    #HDF5
    if suffix in ('.h5', '.hdf5'):
        #Nothing is written, so open read-only; the with-block closes the handle
        with h5py.File(image,'r') as f:
            #Get the dataset name from the h5 file
            dat_name = list(f.keys())[0]
            ###If the hdf5 is exported from ilastik fiji plugin, the dat_name will be 'data'
            image_loaded = np.array(f[dat_name])
        #Remove the first axis (ilastik convention)
        image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[2],image_loaded.shape[3]))
        ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
        ###switched --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
        #NOTE(review): assumes the remaining axes are (row, column, channel),
        #which is the layout the swapaxes note above converts to -- confirm
        #against the exporter that produced the file
        return image_loaded[:, :, z]

    #Fail with a clear message instead of an unbound-variable error
    raise ValueError(f"Unsupported image format '{suffix}' for image: {image}")
126
127
def MaskZstack(masks_loaded,image,channel_names_loaded):
    """Extract the per-cell statistics for every mask through every channel
    of the input image.

    masks_loaded: dictionary of mask name -> labeled mask. The first mask
    supplies the CellIDs and the spatial/shape columns.

    image: path of the multichannel image; channels are loaded one at a time.

    channel_names_loaded: list of channel names, one per channel to read.

    Returns a single dataframe with the columns of all masks side by side."""

    mask_names = list(masks_loaded.keys())
    first_mask = mask_names[0]
    #CellIDs and spatial features come from the first mask only
    id_table = pd.DataFrame(MaskIDs(masks_loaded[first_mask]))
    #One list of per-channel intensity arrays for each mask
    intensities = {m_name: [] for m_name in mask_names}

    print(f'channels: {channel_names_loaded}', file=sys.stderr)
    print(f'num channels: {len(channel_names_loaded)}', file=sys.stderr)
    for z in range(len(channel_names_loaded)):
        #Load just this channel, then quantify it under every mask
        image_loaded_z = PrepareData(image,z)
        for m_name in mask_names:
            intensities[m_name].append(MaskChannel(masks_loaded[m_name],image_loaded_z))
        #Print progress
        print("Finished "+str(z))

    #Columns moved behind the channel columns, in this order (histoCAT convention)
    trailing_cols = [
        "X_centroid",
        "Y_centroid",
        "column_centroid",
        "row_centroid",
        "Area",
        "MajorAxisLength",
        "MinorAxisLength",
        "Eccentricity",
        "Solidity",
        "Extent",
        "Orientation",
    ]

    tables = []
    for m_name in mask_names:
        #Channel columns are named <channel>_<mask>
        col_names = [ch+"_"+str(m_name) for ch in channel_names_loaded]
        table = pd.DataFrame(dict(zip(col_names,intensities[m_name])))
        #Only the first mask carries the CellIDs and spatial information
        if m_name == first_mask:
            table = pd.concat([id_table,table],axis=1)
            cols = list(table.columns.values)
            for col in trailing_cols:
                cols.append(cols.pop(cols.index(col)))
            table = table.reindex(columns=cols)
        tables.append(table)

    #Concatenate all data from all masks to return
    return pd.concat(tables,axis=1)
192
193
def ExtractSingleCells(masks,image,channel_names,output):
    """Extract single-cell information for one image and write it to a csv file.

    Parameters
    ----------
    masks : iterable of str
        Paths to the mask images. Each mask is keyed by its file name up to
        the first '.'.
    image : str
        Path to the multichannel image to quantify.
    channel_names : str
        Path to a csv file of channel names. A file with more than one column
        must have a ``marker_name`` column; a one-column file is read without
        a header, one name per row.
    output : str or path-like
        Directory the result is written to, as ``<image name>.csv`` where the
        image name is the file name up to the first '.'.
    """

    #Create pathlib object for output
    output = Path(output)

    #Read csv channel names
    channel_table = pd.read_csv(channel_names)
    if channel_table.shape[1] > 1:
        #Get the marker_name column if more than one column (CyCIF structure)
        raw_names = list(channel_table.marker_name)
    else:
        #old one column version -- re-read the csv file without a header
        channel_table = pd.read_csv(channel_names, header = None)
        channel_table.columns = ["marker"]
        raw_names = list(channel_table.marker)

    #read_csv infers numeric dtypes, so purely numeric marker names arrive as
    #numbers; the names are concatenated with strings below and when the
    #output columns are built, so make every name a string first
    raw_names = [str(name) for name in raw_names]

    #Make marker names unique: every occurrence of a repeated name gets a
    #running _1, _2, ... suffix; names that occur once are left alone
    channel_names_checked = []
    for idx,val in enumerate(raw_names):
        if raw_names.count(val) > 1:
            channel_names_checked.append(val + "_" + str(raw_names[:idx].count(val) + 1))
        else:
            channel_names_checked.append(val)

    #Read the masks into a dictionary keyed by mask name
    masks_loaded = {}
    for m in masks:
        m_name = os.path.basename(m).split('.')[0]
        masks_loaded[m_name] = skimage.io.imread(m,plugin='tifffile')

    scdata_z = MaskZstack(masks_loaded,image,channel_names_checked)

    #Write the single cell data to a csv file using the image name
    im_name = os.path.basename(image).split('.')[0]
    scdata_z.to_csv(str(output / (im_name + ".csv")),index=False)
258
259
def MultiExtractSingleCells(masks,image,channel_names,output):
    """Run the single-cell extraction for one image and report progress on
    standard output before and after."""

    print("Extracting single-cell data for "+str(image)+'...')

    #Delegate the actual work
    ExtractSingleCells(masks,image,channel_names,output)

    #Report completion using the image file name up to its first '.'
    short_name, _, _ = os.path.basename(image).partition('.')
    print("Finished "+str(short_name))