comparison xarray_select.py @ 4:b393815e4cb7 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit fd8ad4d97db7b1fd3876ff63e14280474e06fdf7
author ecology
date Sun, 31 Jul 2022 21:20:41 +0000
parents
children
comparison
equal deleted inserted replaced
3:bf595d613af4 4:b393815e4cb7
1 # xarray tool for:
2 # - getting metadata information
3 # - select data and save results in csv file for further post-processing
4
5 import argparse
6 import os
7 import warnings
8
9 import geopandas as gdp
10
11 import pandas as pd
12
13 from shapely.geometry import Point
14 from shapely.ops import nearest_points
15
16 import xarray as xr
17
18
19 class XarraySelect ():
20 def __init__(self, infile, select="", outfile="", outputdir="",
21 latname="", latvalN="", latvalS="", lonname="",
22 lonvalE="", lonvalW="", filter_list="", coords="",
23 time="", verbose=False, no_missing=False,
24 tolerance=None):
25 self.infile = infile
26 self.select = select
27 self.outfile = outfile
28 self.outputdir = outputdir
29 self.latname = latname
30 if tolerance != "" and tolerance is not None:
31 self.tolerance = float(tolerance)
32 else:
33 self.tolerance = -1
34 if latvalN != "" and latvalN is not None:
35 self.latvalN = float(latvalN)
36 else:
37 self.latvalN = ""
38 if latvalS != "" and latvalS is not None:
39 self.latvalS = float(latvalS)
40 else:
41 self.latvalS = ""
42 self.lonname = lonname
43 if lonvalE != "" and lonvalE is not None:
44 self.lonvalE = float(lonvalE)
45 else:
46 self.lonvalE = ""
47 if lonvalW != "" and lonvalW is not None:
48 self.lonvalW = float(lonvalW)
49 else:
50 self.lonvalW = ""
51 self.filter = filter_list
52 self.time = time
53 self.coords = coords
54 self.verbose = verbose
55 self.no_missing = no_missing
56 # initialization
57 self.dset = None
58 self.gset = None
59 if self.verbose:
60 print("infile: ", self.infile)
61 print("outfile: ", self.outfile)
62 print("select: ", self.select)
63 print("outfile: ", self.outfile)
64 print("outputdir: ", self.outputdir)
65 print("latname: ", self.latname)
66 print("latvalN: ", self.latvalN)
67 print("latvalS: ", self.latvalS)
68 print("lonname: ", self.lonname)
69 print("lonvalE: ", self.lonvalE)
70 print("lonvalW: ", self.lonvalW)
71 print("filter: ", self.filter)
72 print("time: ", self.time)
73 print("coords: ", self.coords)
74
75 def rowfilter(self, single_filter):
76 split_filter = single_filter.split('#')
77 filter_varname = split_filter[0]
78 op = split_filter[1]
79 ll = float(split_filter[2])
80 if (op == 'bi'):
81 rl = float(split_filter[3])
82 if filter_varname == self.select:
83 # filter on values of the selected variable
84 if op == 'bi':
85 self.dset = self.dset.where(
86 (self.dset <= rl) & (self.dset >= ll)
87 )
88 elif op == 'le':
89 self.dset = self.dset.where(self.dset <= ll)
90 elif op == 'ge':
91 self.dset = self.dset.where(self.dset >= ll)
92 elif op == 'e':
93 self.dset = self.dset.where(self.dset == ll)
94 else: # filter on other dimensions of the selected variable
95 if op == 'bi':
96 self.dset = self.dset.sel({filter_varname: slice(ll, rl)})
97 elif op == 'le':
98 self.dset = self.dset.sel({filter_varname: slice(None, ll)})
99 elif op == 'ge':
100 self.dset = self.dset.sel({filter_varname: slice(ll, None)})
101 elif op == 'e':
102 self.dset = self.dset.sel({filter_varname: ll},
103 method='nearest')
104
105 def selection(self):
106 if self.dset is None:
107 self.ds = xr.open_dataset(self.infile)
108 self.dset = self.ds[self.select] # select variable
109 if self.time:
110 self.datetime_selection()
111 if self.filter:
112 self.filter_selection()
113
114 self.area_selection()
115 if self.gset.count() > 1:
116 # convert to dataframe if several rows and cols
117 self.gset = self.gset.to_dataframe().dropna(how='all'). \
118 reset_index()
119 self.gset.to_csv(self.outfile, header=True, sep='\t')
120 else:
121 data = {
122 self.latname: [self.gset[self.latname].values],
123 self.lonname: [self.gset[self.lonname].values],
124 self.select: [self.gset.values]
125 }
126
127 df = pd.DataFrame(data, columns=[self.latname, self.lonname,
128 self.select])
129 df.to_csv(self.outfile, header=True, sep='\t')
130
131 def datetime_selection(self):
132 split_filter = self.time.split('#')
133 time_varname = split_filter[0]
134 op = split_filter[1]
135 ll = split_filter[2]
136 if (op == 'sl'):
137 rl = split_filter[3]
138 self.dset = self.dset.sel({time_varname: slice(ll, rl)})
139 elif (op == 'to'):
140 self.dset = self.dset.sel({time_varname: slice(None, ll)})
141 elif (op == 'from'):
142 self.dset = self.dset.sel({time_varname: slice(ll, None)})
143 elif (op == 'is'):
144 self.dset = self.dset.sel({time_varname: ll}, method='nearest')
145
146 def filter_selection(self):
147 for single_filter in self.filter:
148 self.rowfilter(single_filter)
149
150 def area_selection(self):
151
152 if self.latvalS != "" and self.lonvalW != "":
153 # Select geographical area
154 self.gset = self.dset.sel({self.latname:
155 slice(self.latvalS, self.latvalN),
156 self.lonname:
157 slice(self.lonvalW, self.lonvalE)})
158 elif self.latvalN != "" and self.lonvalE != "":
159 # select nearest location
160 if self.no_missing:
161 self.nearest_latvalN = self.latvalN
162 self.nearest_lonvalE = self.lonvalE
163 else:
164 # find nearest location without NaN values
165 self.nearest_location()
166 if self.tolerance > 0:
167 self.gset = self.dset.sel({self.latname: self.nearest_latvalN,
168 self.lonname: self.nearest_lonvalE},
169 method='nearest',
170 tolerance=self.tolerance)
171 else:
172 self.gset = self.dset.sel({self.latname: self.nearest_latvalN,
173 self.lonname: self.nearest_lonvalE},
174 method='nearest')
175 else:
176 self.gset = self.dset
177
178 def nearest_location(self):
179 # Build a geopandas dataframe with all first elements in each dimension
180 # so we assume null values correspond to a mask that is the same for
181 # all dimensions in the dataset.
182 dsel_frame = self.dset
183 for dim in self.dset.dims:
184 if dim != self.latname and dim != self.lonname:
185 dsel_frame = dsel_frame.isel({dim: 0})
186 # transform to pandas dataframe
187 dff = dsel_frame.to_dataframe().dropna().reset_index()
188 # transform to geopandas to collocate
189 gdf = gdp.GeoDataFrame(dff,
190 geometry=gdp.points_from_xy(dff[self.lonname],
191 dff[self.latname]))
192 # Find nearest location where values are not null
193 point = Point(self.lonvalE, self.latvalN)
194 multipoint = gdf.geometry.unary_union
195 queried_geom, nearest_geom = nearest_points(point, multipoint)
196 self.nearest_latvalN = nearest_geom.y
197 self.nearest_lonvalE = nearest_geom.x
198
199 def selection_from_coords(self):
200 fcoords = pd.read_csv(self.coords, sep='\t')
201 for row in fcoords.itertuples():
202 self.latvalN = row[0]
203 self.lonvalE = row[1]
204 self.outfile = (os.path.join(self.outputdir,
205 self.select + '_' +
206 str(row.Index) + '.tabular'))
207 self.selection()
208
209
210 if __name__ == '__main__':
211 warnings.filterwarnings("ignore")
212 parser = argparse.ArgumentParser()
213
214 parser.add_argument(
215 'infile',
216 help='netCDF input filename'
217 )
218 parser.add_argument(
219 '--select',
220 help='Variable name to select'
221 )
222 parser.add_argument(
223 '--latname',
224 help='Latitude name'
225 )
226 parser.add_argument(
227 '--latvalN',
228 help='North latitude value'
229 )
230 parser.add_argument(
231 '--latvalS',
232 help='South latitude value'
233 )
234 parser.add_argument(
235 '--lonname',
236 help='Longitude name'
237 )
238 parser.add_argument(
239 '--lonvalE',
240 help='East longitude value'
241 )
242 parser.add_argument(
243 '--lonvalW',
244 help='West longitude value'
245 )
246 parser.add_argument(
247 '--tolerance',
248 help='Maximum distance between original and selected value for '
249 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance'
250 )
251 parser.add_argument(
252 '--coords',
253 help='Input file containing Latitude and Longitude'
254 'for geographical selection'
255 )
256 parser.add_argument(
257 '--filter',
258 nargs="*",
259 help='Filter list variable#operator#value_s#value_e'
260 )
261 parser.add_argument(
262 '--time',
263 help='select timeseries variable#operator#value_s[#value_e]'
264 )
265 parser.add_argument(
266 '--outfile',
267 help='csv outfile for storing results of the selection'
268 '(valid only when --select)'
269 )
270 parser.add_argument(
271 '--outputdir',
272 help='folder name for storing results with multiple selections'
273 '(valid only when --select)'
274 )
275 parser.add_argument(
276 "-v", "--verbose",
277 help="switch on verbose mode",
278 action="store_true"
279 )
280 parser.add_argument(
281 "--no_missing",
282 help="""Do not take into account possible null/missing values
283 (only valid for single location)""",
284 action="store_true"
285 )
286 args = parser.parse_args()
287
288 p = XarraySelect(args.infile, args.select, args.outfile, args.outputdir,
289 args.latname, args.latvalN, args.latvalS, args.lonname,
290 args.lonvalE, args.lonvalW, args.filter,
291 args.coords, args.time, args.verbose,
292 args.no_missing, args.tolerance)
293 if args.select:
294 p.selection()