Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/galaxy/tool_util/verify/__init__.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 """Module of utilities for verifying test results.""" | |
2 | |
3 import difflib | |
4 import filecmp | |
5 import hashlib | |
6 import json | |
7 import logging | |
8 import os | |
9 import os.path | |
10 import re | |
11 import shutil | |
12 import tempfile | |
13 | |
14 try: | |
15 import pysam | |
16 except ImportError: | |
17 pysam = None | |
18 | |
19 from galaxy.tool_util.parser.util import ( | |
20 DEFAULT_DELTA, | |
21 DEFAULT_DELTA_FRAC | |
22 ) | |
23 from galaxy.util import unicodify | |
24 from galaxy.util.compression_utils import get_fileobj | |
25 from .asserts import verify_assertions | |
26 from .test_data import TestDataResolver | |
27 | |
28 log = logging.getLogger(__name__) | |
29 | |
30 DEFAULT_TEST_DATA_RESOLVER = TestDataResolver() | |
31 | |
32 | |
def verify(
    item_label,
    output_content,
    attributes,
    filename=None,
    get_filecontent=None,
    get_filename=None,
    keep_outputs_dir=None,
    verify_extra_files=None,
    mode='file',
):
    """Verify the content of a test output using test definitions described by attributes.

    Throw an informative assertion error if any of these tests fail.

    :param item_label: human-readable label used to prefix error messages
    :param output_content: raw bytes of the produced output
    :param attributes: dict of test attributes (``assert_list``, ``md5``,
        ``checksum``, ``object``, ``compare``, ``ftype``, ``extra_files``,
        ``lines_diff``, ...); may be ``None``
    :param filename: name of the expected test-data file to compare against
    :param get_filecontent: callable mapping a test-data filename to its bytes
    :param get_filename: callable mapping a test-data filename to a local path;
        when not supplied one is synthesized from ``get_filecontent``
    :param keep_outputs_dir: if set (via ``GALAXY_TEST_SAVE``), copy produced
        outputs there for post-mortem inspection
    :param verify_extra_files: callable used to verify any ``extra_files``
    :param mode: ``'file'`` (default) or ``'directory'`` when verifying a file
        inside an ``extra_files_path`` directory (then ``filename`` is already
        a path on disk)
    """
    if get_filename is None:
        if get_filecontent is None:
            get_filecontent = DEFAULT_TEST_DATA_RESOLVER.get_filecontent

        def get_filename(filename):
            # Materialize the expected content to a local temp file so the
            # file-based comparison helpers below can be used.
            file_content = get_filecontent(filename)
            local_name = make_temp_fname(fname=filename)
            with open(local_name, 'wb') as f:
                f.write(file_content)
            return local_name

    # Check assertions...
    # BUGFIX: guard the lookup -- ``attributes`` may legitimately be None
    # here; previously attributes.get() was called unconditionally and
    # raised AttributeError instead of skipping the assertion checks.
    assertions = attributes.get("assert_list") if attributes is not None else None
    if assertions is not None:
        try:
            verify_assertions(output_content, assertions)
        except AssertionError as err:
            errmsg = '%s different than expected\n' % (item_label)
            errmsg += unicodify(err)
            raise AssertionError(errmsg)

    # Verify checksum attributes...
    # works with older Galaxy style md5=<expected_sum> or cwltest
    # style checksum=<hash_type>$<hash>.
    expected_checksum_type = None
    expected_checksum = None
    if attributes is not None and attributes.get("md5") is not None:
        expected_checksum_type = "md5"
        expected_checksum = attributes.get("md5")
    elif attributes is not None and attributes.get("checksum") is not None:
        expected_checksum_type, expected_checksum = attributes.get("checksum").split("$", 1)

    if expected_checksum_type:
        try:
            _verify_checksum(output_content, expected_checksum_type, expected_checksum)
        except AssertionError as err:
            errmsg = '%s different than expected\n' % (item_label)
            errmsg += unicodify(err)
            raise AssertionError(errmsg)

    if attributes is None:
        attributes = {}

    # expected object might be None, so don't pull unless available
    has_expected_object = 'object' in attributes
    if has_expected_object:
        # Object comparison and file comparison are mutually exclusive.
        assert filename is None
        expected_object = attributes.get('object')
        actual_object = json.loads(output_content)

        expected_object_type = type(expected_object)
        actual_object_type = type(actual_object)

        if expected_object_type != actual_object_type:
            message = f"Type mismatch between expected object ({expected_object_type}) and actual object ({actual_object_type})"
            raise AssertionError(message)

        if expected_object != actual_object:
            message = f"Expected object ({expected_object}) does not match actual object ({actual_object})"
            raise AssertionError(message)

    elif filename is not None:
        # Write the produced output to disk for the file comparators.
        temp_name = make_temp_fname(fname=filename)
        with open(temp_name, 'wb') as f:
            f.write(output_content)

        # If the server's env has GALAXY_TEST_SAVE, save the output file to that
        # directory.
        # This needs to be done before the call to `get_filename()` because that
        # may raise an exception if `filename` does not exist (e.g. when
        # generating a tool output file from scratch with
        # `planemo test --update_test_data`).
        if keep_outputs_dir:
            ofn = os.path.join(keep_outputs_dir, filename)
            out_dir = os.path.dirname(ofn)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            log.debug('keep_outputs_dir: %s, ofn: %s', keep_outputs_dir, ofn)
            try:
                shutil.copy(temp_name, ofn)
            except Exception:
                # Saving is best-effort; a failure must not fail the test.
                log.exception('Could not save output file %s to %s', temp_name, ofn)
            else:
                log.debug('## GALAXY_TEST_SAVE=%s. saved %s', keep_outputs_dir, ofn)

        if mode == 'directory':
            # if verifying a file inside a extra_files_path directory
            # filename already point to a file that exists on disk
            local_name = filename
        else:
            local_name = get_filename(filename)

        compare = attributes.get('compare', 'diff')
        try:
            if attributes.get('ftype', None) in ['bam', 'qname_sorted.bam', 'qname_input_sorted.bam', 'unsorted.bam', 'cram']:
                # Convert both sides to SAM text for a readable diff; fall
                # back to raw BAM comparison when conversion fails (e.g.
                # pysam unavailable).
                try:
                    local_fh, temp_name = _bam_to_sam(local_name, temp_name)
                    local_name = local_fh.name
                except Exception as e:
                    log.warning("%s. Will compare BAM files", unicodify(e))
            if compare == 'diff':
                files_diff(local_name, temp_name, attributes=attributes)
            elif compare == 're_match':
                files_re_match(local_name, temp_name, attributes=attributes)
            elif compare == 're_match_multiline':
                files_re_match_multiline(local_name, temp_name, attributes=attributes)
            elif compare == 'sim_size':
                files_delta(local_name, temp_name, attributes=attributes)
            elif compare == "contains":
                files_contains(local_name, temp_name, attributes=attributes)
            else:
                raise Exception('Unimplemented Compare type: %s' % compare)
        except AssertionError as err:
            errmsg = f'{item_label} different than expected, difference (using {compare}):\n'
            errmsg += f"( {local_name} v. {temp_name} )\n"
            errmsg += unicodify(err)
            raise AssertionError(errmsg)
        finally:
            if 'GALAXY_TEST_NO_CLEANUP' not in os.environ:
                os.remove(temp_name)

    if verify_extra_files:
        extra_files = attributes.get('extra_files', None)
        if extra_files:
            verify_extra_files(extra_files)
174 | |
175 | |
def make_temp_fname(fname=None):
    """Safe temp name - preserve the file extension for tools that interpret it.

    Returns the path of a freshly created temporary file whose name ends
    with the basename of *fname*; the file is left on disk for the caller.
    """
    basename = os.path.split(fname)[-1]  # ignore any directory components
    fd, path = tempfile.mkstemp(prefix='tmp', suffix=basename)
    os.close(fd)
    return path
181 | |
182 | |
def _bam_to_sam(local_name, temp_name):
    """Convert the local (test-data) and history BAM files to SAM.

    Returns ``(local_sam_filehandle, history_sam_path)``. The open handle is
    returned (rather than just its name) so the auto-deleting temp file stays
    alive for the caller. Removes *temp_name* after a successful conversion.
    """
    local_sam = tempfile.NamedTemporaryFile(suffix='.sam', prefix='local_bam_converted_to_sam_')
    with tempfile.NamedTemporaryFile(suffix='.sam', prefix='history_bam_converted_to_sam_', delete=False) as history_sam:
        try:
            pysam.view('-h', '-o%s' % local_sam.name, local_name)
        except Exception as e:
            raise Exception("Converting local (test-data) BAM to SAM failed: %s" % unicodify(e))
        try:
            pysam.view('-h', '-o%s' % history_sam.name, temp_name)
        except Exception as e:
            raise Exception("Converting history BAM to SAM failed: %s" % unicodify(e))
    # Original history BAM is no longer needed once the SAM copy exists.
    os.remove(temp_name)
    return local_sam, history_sam.name
198 | |
199 | |
200 def _verify_checksum(data, checksum_type, expected_checksum_value): | |
201 if checksum_type not in ["md5", "sha1", "sha256", "sha512"]: | |
202 raise Exception("Unimplemented hash algorithm [%s] encountered." % checksum_type) | |
203 | |
204 h = hashlib.new(checksum_type) | |
205 h.update(data) | |
206 actual_checksum_value = h.hexdigest() | |
207 if expected_checksum_value != actual_checksum_value: | |
208 template = "Output checksum [%s] does not match expected [%s] (using hash algorithm %s)." | |
209 message = template % (actual_checksum_value, expected_checksum_value, checksum_type) | |
210 raise AssertionError(message) | |
211 | |
212 | |
def files_delta(file1, file2, attributes=None):
    """Check the contents of 2 files for size differences.

    Fails when the absolute size difference exceeds ``delta`` bytes, or when
    *file2*'s size falls outside ``delta_frac`` (a fraction of *file1*'s size,
    skipped when None).
    """
    attributes = {} if attributes is None else attributes
    max_delta = attributes.get('delta', DEFAULT_DELTA)
    max_frac = attributes.get('delta_frac', DEFAULT_DELTA_FRAC)
    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if abs(size1 - size2) > max_delta:
        raise AssertionError('Files %s=%db but %s=%db - compare by size (delta=%s) failed' % (file1, size1, file2, size2, max_delta))
    if max_frac is not None:
        within_fraction = size1 - (size1 * max_frac) <= size2 <= size1 + (size1 * max_frac)
        if not within_fraction:
            raise AssertionError('Files %s=%db but %s=%db - compare by size (delta_frac=%s) failed' % (file1, size1, file2, size2, max_frac))
225 | |
226 | |
def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff(diff):
        # Count only real +/- diff lines, excluding the '+++'/'---' headers.
        count = 0
        for line in diff:
            if (line.startswith('+') and not line.startswith('+++')) or (line.startswith('-') and not line.startswith('---')):
                count += 1
        return count

    # Fast path: byte-identical files need no further inspection.
    if not filecmp.cmp(file1, file2, shallow=False):
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if decompress:
            # None means all compressed formats are allowed
            compressed_formats = None
        else:
            compressed_formats = []
        is_pdf = False
        try:
            # Read both sides as text (file2 = history/produced output,
            # file1 = local expected test-data).
            with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
                history_data = fh.readlines()
            with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
                local_file = fh.readlines()
        except UnicodeDecodeError:
            # Non-UTF-8 content: only PDFs get the lenient binary path below.
            if file1.endswith('.pdf') or file2.endswith('.pdf'):
                is_pdf = True
                # Replace non-Unicode characters using unicodify(),
                # difflib.unified_diff doesn't work on list of bytes
                history_data = [unicodify(l) for l in get_fileobj(file2, mode='rb', compressed_formats=compressed_formats)]
                local_file = [unicodify(l) for l in get_fileobj(file1, mode='rb', compressed_formats=compressed_formats)]
            else:
                raise AssertionError("Binary data detected, not displaying diff")
        if attributes.get('sort', False):
            # Order-insensitive comparison when requested.
            local_file.sort()
            history_data.sort()
        allowed_diff_count = int(attributes.get('lines_diff', 0))
        diff = list(difflib.unified_diff(local_file, history_data, "local_file", "history_data"))
        diff_lines = get_lines_diff(diff)
        if diff_lines > allowed_diff_count:
            # Truncate the reported diff unless raw output is requested.
            if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                diff_slice = diff
            else:
                if len(diff) < 60:
                    diff_slice = diff[0:40]
                else:
                    diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
            # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
            # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
            # PDF forgiveness can probably be handled better by not special casing by __extension__ here
            # and instead using lines_diff or a regular expression matching
            # or by creating and using a specialized pdf comparison function
            if is_pdf:
                # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                # new file, so we need to handle these differences. As long as the rest of the PDF file does
                # not differ we're ok.
                valid_diff_strs = ['description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator']
                valid_diff = False
                invalid_diff_lines = 0
                for line in diff_slice:
                    # Make sure to lower case strings before checking.
                    line = line.lower()
                    # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                    if (line.startswith('+') or line.startswith('-')) and line.find('local_file') < 0 and line.find('history_data') < 0:
                        for vdf in valid_diff_strs:
                            if line.find(vdf) < 0:
                                valid_diff = False
                            else:
                                valid_diff = True
                                # Stop checking as soon as we know we have a valid difference
                                break
                        if not valid_diff:
                            invalid_diff_lines += 1
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d, found pdf invalid diff = %d" % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                if invalid_diff_lines > allowed_diff_count:
                    # Print out diff_slice so we can see what failed
                    log.info("###### diff_slice ######")
                    raise AssertionError("".join(diff_slice))
            else:
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d" % (file1, file2, allowed_diff_count, diff_lines))
                raise AssertionError("".join(diff_slice))
308 | |
309 | |
def files_re_match(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match.

    *file1* supplies one regular expression per line; each must match the
    corresponding line of *file2*. Up to ``lines_diff`` mismatching lines are
    tolerated. Falls back to byte-mode matching for non-UTF-8 content.
    """
    join_char = ''
    to_strip = os.linesep
    try:
        with open(file2, encoding='utf-8') as fh:
            history_data = fh.readlines()
        with open(file1, encoding='utf-8') as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        # Binary content: re-read both files in bytes mode and switch the
        # join/strip tokens to their byte equivalents.
        join_char = b''
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    assert len(local_file) == len(history_data), 'Data File and Regular Expression File contain a different number of lines (%d != %d)\nHistory Data (first 40 lines):\n%s' % (len(local_file), len(history_data), join_char.join(history_data[:40]))
    attributes = {} if attributes is None else attributes
    if attributes.get('sort', False):
        history_data.sort()
    lines_diff = int(attributes.get('lines_diff', 0))
    diffs = []
    for regex_line, data_line in zip(local_file, history_data):
        regex_line = regex_line.rstrip(to_strip)
        data_line = data_line.rstrip(to_strip)
        if not re.match(regex_line, data_line):
            diffs.append(f'Regular Expression: {regex_line}, Data file: {data_line}\n')
            # Fail as soon as the mismatch budget is exhausted.
            if len(diffs) > lines_diff:
                raise AssertionError("Regular expression did not match data file (allowed variants=%i):\n%s" % (lines_diff, "".join(diffs)))
342 | |
343 | |
def files_re_match_multiline(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match in multiline mode.

    *file1* holds a single (possibly multiline) regular expression that must
    match *file2*'s content from its start. Falls back to byte-mode matching
    for non-UTF-8 content. ``lines_diff`` does not apply here.
    """
    glue = ''
    try:
        with open(file2, encoding='utf-8') as fh:
            data_lines = fh.readlines()
        with open(file1, encoding='utf-8') as fh:
            pattern = fh.read()
    except UnicodeDecodeError:
        # Binary content: redo both reads in bytes mode.
        glue = b''
        with open(file2, 'rb') as fh:
            data_lines = fh.readlines()
        with open(file1, 'rb') as fh:
            pattern = fh.read()
    attributes = {} if attributes is None else attributes
    if attributes.get('sort', False):
        data_lines.sort()
    # lines_diff not applicable to multiline matching
    assert re.match(pattern, glue.join(data_lines), re.MULTILINE), "Multiline Regular expression did not match data file"
365 | |
366 | |
def files_contains(file1, file2, attributes=None):
    """Check the contents of file2 for substrings found in file1, on a per-line basis.

    Each line of *file1* (minus its trailing line separator) must occur
    somewhere in *file2*; up to ``lines_diff`` missing lines are tolerated.
    Falls back to byte-mode comparison for non-UTF-8 content.
    """
    # TODO: allow forcing ordering of contains
    line_end = os.linesep
    try:
        with open(file2, encoding='utf-8') as fh:
            haystack = fh.read()
        with open(file1, encoding='utf-8') as fh:
            needles = fh.readlines()
    except UnicodeDecodeError:
        # Binary content: redo both reads in bytes mode.
        line_end = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            haystack = fh.read()
        with open(file1, 'rb') as fh:
            needles = fh.readlines()
    attributes = {} if attributes is None else attributes
    allowed_missing = int(attributes.get('lines_diff', 0))
    missing = 0
    for needle in needles:
        needle = needle.rstrip(line_end)
        if needle not in haystack:
            missing += 1
            # Fail as soon as the mismatch budget is exhausted.
            if missing > allowed_missing:
                raise AssertionError("Failed to find '%s' in history data. (lines_diff=%i)" % (needle, allowed_missing))