Mercurial > repos > ecology > xarray_select
comparison xarray_tool.py @ 2:123a9a629bef draft
"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit 57b6d23e3734d883e71081c78e77964d61be82ba"
author | ecology |
---|---|
date | Sun, 06 Jun 2021 08:51:41 +0000 |
parents | 6baac361495b |
children | bf595d613af4 |
Comparison legend: equal, deleted, inserted, replaced
1:6baac361495b | 2:123a9a629bef |
---|---|
2 # - getting metadata information | 2 # - getting metadata information |
3 # - select data and save results in csv file for further post-processing | 3 # - select data and save results in csv file for further post-processing |
4 | 4 |
5 import argparse | 5 import argparse |
6 import csv | 6 import csv |
7 import os | |
7 import warnings | 8 import warnings |
8 | 9 |
9 import geopandas as gdp | 10 import geopandas as gdp |
10 | 11 |
11 import pandas as pd | 12 import pandas as pd |
19 class XarrayTool (): | 20 class XarrayTool (): |
20 def __init__(self, infile, outfile_info="", outfile_summary="", | 21 def __init__(self, infile, outfile_info="", outfile_summary="", |
21 select="", outfile="", outputdir="", latname="", | 22 select="", outfile="", outputdir="", latname="", |
22 latvalN="", latvalS="", lonname="", lonvalE="", | 23 latvalN="", latvalS="", lonname="", lonvalE="", |
23 lonvalW="", filter_list="", coords="", time="", | 24 lonvalW="", filter_list="", coords="", time="", |
24 verbose=False | 25 verbose=False, no_missing=False, coords_info=None, |
25 ): | 26 tolerance=None): |
26 self.infile = infile | 27 self.infile = infile |
27 self.outfile_info = outfile_info | 28 self.outfile_info = outfile_info |
28 self.outfile_summary = outfile_summary | 29 self.outfile_summary = outfile_summary |
29 self.select = select | 30 self.select = select |
30 self.outfile = outfile | 31 self.outfile = outfile |
31 self.outputdir = outputdir | 32 self.outputdir = outputdir |
32 self.latname = latname | 33 self.latname = latname |
34 if tolerance != "" and tolerance is not None: | |
35 self.tolerance = float(tolerance) | |
36 else: | |
37 self.tolerance = -1 | |
33 if latvalN != "" and latvalN is not None: | 38 if latvalN != "" and latvalN is not None: |
34 self.latvalN = float(latvalN) | 39 self.latvalN = float(latvalN) |
35 else: | 40 else: |
36 self.latvalN = "" | 41 self.latvalN = "" |
37 if latvalS != "" and latvalS is not None: | 42 if latvalS != "" and latvalS is not None: |
49 self.lonvalW = "" | 54 self.lonvalW = "" |
50 self.filter = filter_list | 55 self.filter = filter_list |
51 self.time = time | 56 self.time = time |
52 self.coords = coords | 57 self.coords = coords |
53 self.verbose = verbose | 58 self.verbose = verbose |
59 self.no_missing = no_missing | |
54 # initialization | 60 # initialization |
55 self.dset = None | 61 self.dset = None |
56 self.gset = None | 62 self.gset = None |
63 self.coords_info = coords_info | |
57 if self.verbose: | 64 if self.verbose: |
58 print("infile: ", self.infile) | 65 print("infile: ", self.infile) |
59 print("outfile_info: ", self.outfile_info) | 66 print("outfile_info: ", self.outfile_info) |
60 print("outfile_summary: ", self.outfile_summary) | 67 print("outfile_summary: ", self.outfile_summary) |
61 print("outfile: ", self.outfile) | 68 print("outfile: ", self.outfile) |
69 print("lonvalE: ", self.lonvalE) | 76 print("lonvalE: ", self.lonvalE) |
70 print("lonvalW: ", self.lonvalW) | 77 print("lonvalW: ", self.lonvalW) |
71 print("filter: ", self.filter) | 78 print("filter: ", self.filter) |
72 print("time: ", self.time) | 79 print("time: ", self.time) |
73 print("coords: ", self.coords) | 80 print("coords: ", self.coords) |
81 print("coords_info: ", self.coords_info) | |
74 | 82 |
75 def info(self): | 83 def info(self): |
76 f = open(self.outfile_info, 'w') | 84 f = open(self.outfile_info, 'w') |
77 ds = xr.open_dataset(self.infile) | 85 ds = xr.open_dataset(self.infile) |
78 ds.info(f) | 86 ds.info(f) |
111 if (op == 'bi'): | 119 if (op == 'bi'): |
112 rl = float(split_filter[3]) | 120 rl = float(split_filter[3]) |
113 if filter_varname == self.select: | 121 if filter_varname == self.select: |
114 # filter on values of the selected variable | 122 # filter on values of the selected variable |
115 if op == 'bi': | 123 if op == 'bi': |
116 self.dset = self.dset.where((self.dset <= rl) & (self.dset >= ll)) | 124 self.dset = self.dset.where( |
125 (self.dset <= rl) & (self.dset >= ll) | |
126 ) | |
117 elif op == 'le': | 127 elif op == 'le': |
118 self.dset = self.dset.where(self.dset <= ll) | 128 self.dset = self.dset.where(self.dset <= ll) |
119 elif op == 'ge': | 129 elif op == 'ge': |
120 self.dset = self.dset.where(self.dset >= ll) | 130 self.dset = self.dset.where(self.dset >= ll) |
121 elif op == 'e': | 131 elif op == 'e': |
139 self.datetime_selection() | 149 self.datetime_selection() |
140 if self.filter: | 150 if self.filter: |
141 self.filter_selection() | 151 self.filter_selection() |
142 | 152 |
143 self.area_selection() | 153 self.area_selection() |
144 # convert to dataframe | 154 if self.gset.count() > 1: |
145 self.gset = self.gset.to_dataframe().dropna(how='all').reset_index() | 155 # convert to dataframe if several rows and cols |
146 self.gset.to_csv(self.outfile, header=True, sep='\t') | 156 self.gset = self.gset.to_dataframe().dropna(how='all'). \ |
157 reset_index() | |
158 self.gset.to_csv(self.outfile, header=True, sep='\t') | |
159 else: | |
160 data = { | |
161 self.latname: [self.gset[self.latname].values], | |
162 self.lonname: [self.gset[self.lonname].values], | |
163 self.select: [self.gset.values] | |
164 } | |
165 | |
166 df = pd.DataFrame(data, columns=[self.latname, self.lonname, | |
167 self.select]) | |
168 df.to_csv(self.outfile, header=True, sep='\t') | |
147 | 169 |
148 def datetime_selection(self): | 170 def datetime_selection(self): |
149 split_filter = self.time.split('#') | 171 split_filter = self.time.split('#') |
150 time_varname = split_filter[0] | 172 time_varname = split_filter[0] |
151 op = split_filter[1] | 173 op = split_filter[1] |
163 def filter_selection(self): | 185 def filter_selection(self): |
164 for single_filter in self.filter: | 186 for single_filter in self.filter: |
165 self.rowfilter(single_filter) | 187 self.rowfilter(single_filter) |
166 | 188 |
167 def area_selection(self): | 189 def area_selection(self): |
190 | |
168 if self.latvalS != "" and self.lonvalW != "": | 191 if self.latvalS != "" and self.lonvalW != "": |
169 # Select geographical area | 192 # Select geographical area |
170 self.gset = self.dset.sel({self.latname: | 193 self.gset = self.dset.sel({self.latname: |
171 slice(self.latvalS, self.latvalN), | 194 slice(self.latvalS, self.latvalN), |
172 self.lonname: | 195 self.lonname: |
173 slice(self.lonvalW, self.lonvalE)}) | 196 slice(self.lonvalW, self.lonvalE)}) |
174 elif self.latvalN != "" and self.lonvalE != "": | 197 elif self.latvalN != "" and self.lonvalE != "": |
175 # select nearest location | 198 # select nearest location |
176 self.nearest_location() # find nearest location without NaN values | 199 if self.no_missing: |
177 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | 200 self.nearest_latvalN = self.latvalN |
178 self.lonname: self.nearest_lonvalE}, | 201 self.nearest_lonvalE = self.lonvalE |
179 method='nearest') | 202 else: |
203 # find nearest location without NaN values | |
204 self.nearest_location() | |
205 if self.tolerance > 0: | |
206 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
207 self.lonname: self.nearest_lonvalE}, | |
208 method='nearest', | |
209 tolerance=self.tolerance) | |
210 else: | |
211 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
212 self.lonname: self.nearest_lonvalE}, | |
213 method='nearest') | |
180 else: | 214 else: |
181 self.gset = self.dset | 215 self.gset = self.dset |
182 | 216 |
183 def nearest_location(self): | 217 def nearest_location(self): |
184 # Build a geopandas dataframe with all first elements in each dimension | 218 # Build a geopandas dataframe with all first elements in each dimension |
204 def selection_from_coords(self): | 238 def selection_from_coords(self): |
205 fcoords = pd.read_csv(self.coords, sep='\t') | 239 fcoords = pd.read_csv(self.coords, sep='\t') |
206 for row in fcoords.itertuples(): | 240 for row in fcoords.itertuples(): |
207 self.latvalN = row[0] | 241 self.latvalN = row[0] |
208 self.lonvalE = row[1] | 242 self.lonvalE = row[1] |
209 self.outfile = (self.outputdir + '/' + self.select + '_' + str(row.Index) + '.tabular') | 243 self.outfile = (os.path.join(self.outputdir, |
244 self.select + '_' + | |
245 str(row.Index) + '.tabular')) | |
210 self.selection() | 246 self.selection() |
247 | |
248 def get_coords_info(self): | |
249 ds = xr.open_dataset(self.infile) | |
250 for c in ds.coords: | |
251 filename = os.path.join(self.coords_info, | |
252 c.strip() + | |
253 '.tabular') | |
254 pd = ds.coords[c].to_pandas() | |
255 pd.index = range(len(pd)) | |
256 pd.to_csv(filename, header=False, sep='\t') | |
211 | 257 |
212 | 258 |
213 if __name__ == '__main__': | 259 if __name__ == '__main__': |
214 warnings.filterwarnings("ignore") | 260 warnings.filterwarnings("ignore") |
215 parser = argparse.ArgumentParser() | 261 parser = argparse.ArgumentParser() |
253 parser.add_argument( | 299 parser.add_argument( |
254 '--lonvalW', | 300 '--lonvalW', |
255 help='West longitude value' | 301 help='West longitude value' |
256 ) | 302 ) |
257 parser.add_argument( | 303 parser.add_argument( |
304 '--tolerance', | |
305 help='Maximum distance between original and selected value for ' | |
306 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance' | |
307 ) | |
308 parser.add_argument( | |
258 '--coords', | 309 '--coords', |
259 help='Input file containing Latitude and Longitude' | 310 help='Input file containing Latitude and Longitude' |
260 'for geographical selection' | 311 'for geographical selection' |
261 ) | 312 ) |
262 parser.add_argument( | 313 parser.add_argument( |
314 '--coords_info', | |
315 help='output-folder where for each coordinate, coordinate values ' | |
316 ' are being printed in the corresponding outputfile' | |
317 ) | |
318 parser.add_argument( | |
263 '--filter', | 319 '--filter', |
264 nargs="*", | 320 nargs="*", |
265 help='Filter list variable#operator#value_s#value_e' | 321 help='Filter list variable#operator#value_s#value_e' |
266 ) | 322 ) |
267 parser.add_argument( | 323 parser.add_argument( |
279 '(valid only when --select)' | 335 '(valid only when --select)' |
280 ) | 336 ) |
281 parser.add_argument( | 337 parser.add_argument( |
282 "-v", "--verbose", | 338 "-v", "--verbose", |
283 help="switch on verbose mode", | 339 help="switch on verbose mode", |
340 action="store_true" | |
341 ) | |
342 parser.add_argument( | |
343 "--no_missing", | |
344 help="""Do not take into account possible null/missing values | |
345 (only valid for single location)""", | |
284 action="store_true" | 346 action="store_true" |
285 ) | 347 ) |
286 args = parser.parse_args() | 348 args = parser.parse_args() |
287 | 349 |
288 p = XarrayTool(args.infile, args.info, args.summary, args.select, | 350 p = XarrayTool(args.infile, args.info, args.summary, args.select, |
289 args.outfile, args.outputdir, args.latname, | 351 args.outfile, args.outputdir, args.latname, |
290 args.latvalN, args.latvalS, args.lonname, | 352 args.latvalN, args.latvalS, args.lonname, |
291 args.lonvalE, args.lonvalW, args.filter, | 353 args.lonvalE, args.lonvalW, args.filter, |
292 args.coords, args.time, args.verbose) | 354 args.coords, args.time, args.verbose, |
355 args.no_missing, args.coords_info, args.tolerance) | |
293 if args.info: | 356 if args.info: |
294 p.info() | 357 p.info() |
295 if args.summary: | 358 if args.summary: |
296 p.summary() | 359 p.summary() |
297 if args.coords: | 360 if args.coords: |
298 p.selection_from_coords() | 361 p.selection_from_coords() |
299 elif args.select: | 362 elif args.select: |
300 p.selection() | 363 p.selection() |
364 elif args.coords_info: | |
365 p.get_coords_info() |