Mercurial > repos > shellac > sam_consensus_v3
diff env/lib/python3.9/site-packages/galaxy/tool_util/verify/__init__.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
line wrap: on
line diff
"""Module of utilities for verifying test results."""

import difflib
import filecmp
import hashlib
import json
import logging
import os
import os.path
import re
import shutil
import tempfile

try:
    import pysam
except ImportError:
    # pysam is optional; BAM/CRAM outputs are then compared as raw binaries.
    pysam = None

from galaxy.tool_util.parser.util import (
    DEFAULT_DELTA,
    DEFAULT_DELTA_FRAC
)
from galaxy.util import unicodify
from galaxy.util.compression_utils import get_fileobj
from .asserts import verify_assertions
from .test_data import TestDataResolver

log = logging.getLogger(__name__)

DEFAULT_TEST_DATA_RESOLVER = TestDataResolver()


def verify(
    item_label,
    output_content,
    attributes,
    filename=None,
    get_filecontent=None,
    get_filename=None,
    keep_outputs_dir=None,
    verify_extra_files=None,
    mode='file',
):
    """Verify the content of a test output using test definitions described by attributes.

    Throw an informative assertion error if any of these tests fail.

    :param item_label: human-readable label used in error messages.
    :param output_content: raw bytes of the produced output.
    :param attributes: dict of test directives (``assert_list``, ``md5``,
        ``checksum``, ``object``, ``compare``, ``ftype``, ...) or None.
    :param filename: name of the expected-output file in test data, if any.
    :param get_filecontent: callable returning expected file bytes by name.
    :param get_filename: callable returning a local path for an expected file.
    :param keep_outputs_dir: if set, copy the produced output there for later
        inspection (GALAXY_TEST_SAVE behavior).
    :param verify_extra_files: callback used to verify ``extra_files`` entries.
    :param mode: ``'file'`` or ``'directory'`` (extra_files_path contents).
    """
    # FIX: normalize attributes up front. The original code called
    # attributes.get("assert_list") before its "attributes is not None"
    # check, raising AttributeError when attributes was None even though
    # None is explicitly supported further down.
    if attributes is None:
        attributes = {}

    if get_filename is None:
        if get_filecontent is None:
            get_filecontent = DEFAULT_TEST_DATA_RESOLVER.get_filecontent

        def get_filename(filename):
            # Materialize expected content into a local temp file so the
            # file-based comparison helpers below can be used uniformly.
            file_content = get_filecontent(filename)
            local_name = make_temp_fname(fname=filename)
            with open(local_name, 'wb') as f:
                f.write(file_content)
            return local_name

    # Check assertions...
    assertions = attributes.get("assert_list", None)
    if assertions is not None:
        try:
            verify_assertions(output_content, attributes["assert_list"])
        except AssertionError as err:
            errmsg = '%s different than expected\n' % (item_label)
            errmsg += unicodify(err)
            raise AssertionError(errmsg)

    # Verify checksum attributes...
    # works with older Galaxy style md5=<expected_sum> or cwltest
    # style checksum=<hash_type>$<hash>.
    expected_checksum_type = None
    expected_checksum = None
    if attributes.get("md5", None) is not None:
        expected_checksum_type = "md5"
        expected_checksum = attributes.get("md5")
    elif attributes.get("checksum", None) is not None:
        checksum_value = attributes.get("checksum", None)
        expected_checksum_type, expected_checksum = checksum_value.split("$", 1)

    if expected_checksum_type:
        try:
            _verify_checksum(output_content, expected_checksum_type, expected_checksum)
        except AssertionError as err:
            errmsg = '%s different than expected\n' % (item_label)
            errmsg += unicodify(err)
            raise AssertionError(errmsg)

    # expected object might be None, so don't pull unless available
    has_expected_object = 'object' in attributes
    if has_expected_object:
        # Object comparison and file comparison are mutually exclusive.
        assert filename is None
        expected_object = attributes.get('object')
        actual_object = json.loads(output_content)

        expected_object_type = type(expected_object)
        actual_object_type = type(actual_object)

        if expected_object_type != actual_object_type:
            message = f"Type mismatch between expected object ({expected_object_type}) and actual object ({actual_object_type})"
            raise AssertionError(message)

        if expected_object != actual_object:
            message = f"Expected object ({expected_object}) does not match actual object ({actual_object})"
            raise AssertionError(message)

    elif filename is not None:
        temp_name = make_temp_fname(fname=filename)
        with open(temp_name, 'wb') as f:
            f.write(output_content)

        # If the server's env has GALAXY_TEST_SAVE, save the output file to that
        # directory.
        # This needs to be done before the call to `get_filename()` because that
        # may raise an exception if `filename` does not exist (e.g. when
        # generating a tool output file from scratch with
        # `planemo test --update_test_data`).
        if keep_outputs_dir:
            ofn = os.path.join(keep_outputs_dir, filename)
            out_dir = os.path.dirname(ofn)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            log.debug('keep_outputs_dir: %s, ofn: %s', keep_outputs_dir, ofn)
            try:
                shutil.copy(temp_name, ofn)
            except Exception:
                # Saving is best-effort diagnostics; never fail the test here.
                log.exception('Could not save output file %s to %s', temp_name, ofn)
            else:
                log.debug('## GALAXY_TEST_SAVE=%s. saved %s', keep_outputs_dir, ofn)

        if mode == 'directory':
            # if verifying a file inside a extra_files_path directory
            # filename already point to a file that exists on disk
            local_name = filename
        else:
            local_name = get_filename(filename)

        compare = attributes.get('compare', 'diff')
        try:
            if attributes.get('ftype', None) in ['bam', 'qname_sorted.bam', 'qname_input_sorted.bam', 'unsorted.bam', 'cram']:
                try:
                    # Compare human-readable SAM text instead of binary BAM.
                    local_fh, temp_name = _bam_to_sam(local_name, temp_name)
                    local_name = local_fh.name
                except Exception as e:
                    log.warning("%s. Will compare BAM files", unicodify(e))
            if compare == 'diff':
                files_diff(local_name, temp_name, attributes=attributes)
            elif compare == 're_match':
                files_re_match(local_name, temp_name, attributes=attributes)
            elif compare == 're_match_multiline':
                files_re_match_multiline(local_name, temp_name, attributes=attributes)
            elif compare == 'sim_size':
                files_delta(local_name, temp_name, attributes=attributes)
            elif compare == "contains":
                files_contains(local_name, temp_name, attributes=attributes)
            else:
                raise Exception('Unimplemented Compare type: %s' % compare)
        except AssertionError as err:
            errmsg = f'{item_label} different than expected, difference (using {compare}):\n'
            errmsg += f"( {local_name} v. {temp_name} )\n"
            errmsg += unicodify(err)
            raise AssertionError(errmsg)
        finally:
            if 'GALAXY_TEST_NO_CLEANUP' not in os.environ:
                os.remove(temp_name)

    if verify_extra_files:
        extra_files = attributes.get('extra_files', None)
        if extra_files:
            verify_extra_files(extra_files)


def make_temp_fname(fname=None):
    """Safe temp name - preserve the file extension for tools that interpret it.

    NOTE(review): the whole basename of ``fname`` (not just its extension)
    is used as the temp-file suffix; callers must pass a non-None ``fname``.
    """
    suffix = os.path.split(fname)[-1]  # ignore full path
    with tempfile.NamedTemporaryFile(prefix='tmp', suffix=suffix, delete=False) as temp:
        return temp.name


def _bam_to_sam(local_name, temp_name):
    """Convert the two BAM files to SAM via ``pysam.view`` for text comparison.

    Returns the (open) converted local file object and the path of the
    converted history file; removes the original history BAM on success.
    """
    temp_local = tempfile.NamedTemporaryFile(suffix='.sam', prefix='local_bam_converted_to_sam_')
    with tempfile.NamedTemporaryFile(suffix='.sam', prefix='history_bam_converted_to_sam_', delete=False) as temp:
        try:
            pysam.view('-h', '-o%s' % temp_local.name, local_name)
        except Exception as e:
            msg = "Converting local (test-data) BAM to SAM failed: %s" % unicodify(e)
            raise Exception(msg)
        try:
            pysam.view('-h', '-o%s' % temp.name, temp_name)
        except Exception as e:
            msg = "Converting history BAM to SAM failed: %s" % unicodify(e)
            raise Exception(msg)
    os.remove(temp_name)
    return temp_local, temp.name


def _verify_checksum(data, checksum_type, expected_checksum_value):
    """Assert that hashing ``data`` with ``checksum_type`` yields the expected digest."""
    if checksum_type not in ["md5", "sha1", "sha256", "sha512"]:
        raise Exception("Unimplemented hash algorithm [%s] encountered." % checksum_type)

    h = hashlib.new(checksum_type)
    h.update(data)
    actual_checksum_value = h.hexdigest()
    if expected_checksum_value != actual_checksum_value:
        template = "Output checksum [%s] does not match expected [%s] (using hash algorithm %s)."
        message = template % (actual_checksum_value, expected_checksum_value, checksum_type)
        raise AssertionError(message)


def files_delta(file1, file2, attributes=None):
    """Check the contents of 2 files for size differences."""
    if attributes is None:
        attributes = {}
    delta = attributes.get('delta', DEFAULT_DELTA)
    delta_frac = attributes.get('delta_frac', DEFAULT_DELTA_FRAC)
    s1 = os.path.getsize(file1)
    s2 = os.path.getsize(file2)
    if abs(s1 - s2) > delta:
        raise AssertionError('Files %s=%db but %s=%db - compare by size (delta=%s) failed' % (file1, s1, file2, s2, delta))
    if delta_frac is not None and not (s1 - (s1 * delta_frac) <= s2 <= s1 + (s1 * delta_frac)):
        raise AssertionError('Files %s=%db but %s=%db - compare by size (delta_frac=%s) failed' % (file1, s1, file2, s2, delta_frac))


def files_diff(file1, file2, attributes=None):
    """Check the contents of 2 files for differences."""
    def get_lines_diff(diff):
        # Count only real content lines (+/-), not the +++/--- file headers.
        count = 0
        for line in diff:
            if (line.startswith('+') and not line.startswith('+++')) or (line.startswith('-') and not line.startswith('---')):
                count += 1
        return count

    if not filecmp.cmp(file1, file2, shallow=False):
        if attributes is None:
            attributes = {}
        decompress = attributes.get("decompress", None)
        if decompress:
            # None means all compressed formats are allowed
            compressed_formats = None
        else:
            compressed_formats = []
        is_pdf = False
        try:
            with get_fileobj(file2, compressed_formats=compressed_formats) as fh:
                history_data = fh.readlines()
            with get_fileobj(file1, compressed_formats=compressed_formats) as fh:
                local_file = fh.readlines()
        except UnicodeDecodeError:
            if file1.endswith('.pdf') or file2.endswith('.pdf'):
                is_pdf = True
                # Replace non-Unicode characters using unicodify(),
                # difflib.unified_diff doesn't work on list of bytes
                history_data = [unicodify(line) for line in get_fileobj(file2, mode='rb', compressed_formats=compressed_formats)]
                local_file = [unicodify(line) for line in get_fileobj(file1, mode='rb', compressed_formats=compressed_formats)]
            else:
                raise AssertionError("Binary data detected, not displaying diff")
        if attributes.get('sort', False):
            local_file.sort()
            history_data.sort()
        allowed_diff_count = int(attributes.get('lines_diff', 0))
        diff = list(difflib.unified_diff(local_file, history_data, "local_file", "history_data"))
        diff_lines = get_lines_diff(diff)
        if diff_lines > allowed_diff_count:
            if 'GALAXY_TEST_RAW_DIFF' in os.environ:
                diff_slice = diff
            else:
                if len(diff) < 60:
                    diff_slice = diff[0:40]
                else:
                    diff_slice = diff[:25] + ["********\n", "*SNIP *\n", "********\n"] + diff[-25:]
            # FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
            # due to unknown desired behavior when used in conjunction with a non-zero lines_diff
            # PDF forgiveness can probably be handled better by not special casing by __extension__ here
            # and instead using lines_diff or a regular expression matching
            # or by creating and using a specialized pdf comparison function
            if is_pdf:
                # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                # new file, so we need to handle these differences. As long as the rest of the PDF file does
                # not differ we're ok.
                valid_diff_strs = ['description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator']
                valid_diff = False
                invalid_diff_lines = 0
                for line in diff_slice:
                    # Make sure to lower case strings before checking.
                    line = line.lower()
                    # Diff lines will always start with a + or - character, but handle special cases: '--- local_file \n', '+++ history_data \n'
                    if (line.startswith('+') or line.startswith('-')) and line.find('local_file') < 0 and line.find('history_data') < 0:
                        for vdf in valid_diff_strs:
                            if line.find(vdf) < 0:
                                valid_diff = False
                            else:
                                valid_diff = True
                                # Stop checking as soon as we know we have a valid difference
                                break
                        if not valid_diff:
                            invalid_diff_lines += 1
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d, found pdf invalid diff = %d" % (file1, file2, allowed_diff_count, diff_lines, invalid_diff_lines))
                if invalid_diff_lines > allowed_diff_count:
                    # Print out diff_slice so we can see what failed
                    log.info("###### diff_slice ######")
                    raise AssertionError("".join(diff_slice))
            else:
                log.info("## files diff on '%s' and '%s': lines_diff = %d, found diff = %d" % (file1, file2, allowed_diff_count, diff_lines))
                raise AssertionError("".join(diff_slice))


def files_re_match(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match."""
    join_char = ''
    to_strip = os.linesep
    try:
        with open(file2, encoding='utf-8') as fh:
            history_data = fh.readlines()
        with open(file1, encoding='utf-8') as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        # Fall back to a bytes-mode comparison for binary content.
        join_char = b''
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    assert len(local_file) == len(history_data), 'Data File and Regular Expression File contain a different number of lines (%d != %d)\nHistory Data (first 40 lines):\n%s' % (len(local_file), len(history_data), join_char.join(history_data[:40]))
    if attributes is None:
        attributes = {}
    if attributes.get('sort', False):
        history_data.sort()
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    diffs = []
    for regex_line, data_line in zip(local_file, history_data):
        regex_line = regex_line.rstrip(to_strip)
        data_line = data_line.rstrip(to_strip)
        if not re.match(regex_line, data_line):
            line_diff_count += 1
            diffs.append(f'Regular Expression: {regex_line}, Data file: {data_line}\n')
        if line_diff_count > lines_diff:
            raise AssertionError("Regular expression did not match data file (allowed variants=%i):\n%s" % (lines_diff, "".join(diffs)))


def files_re_match_multiline(file1, file2, attributes=None):
    """Check the contents of 2 files for differences using re.match in multiline mode."""
    join_char = ''
    try:
        with open(file2, encoding='utf-8') as fh:
            history_data = fh.readlines()
        with open(file1, encoding='utf-8') as fh:
            local_file = fh.read()
    except UnicodeDecodeError:
        join_char = b''
        with open(file2, 'rb') as fh:
            history_data = fh.readlines()
        with open(file1, 'rb') as fh:
            local_file = fh.read()
    if attributes is None:
        attributes = {}
    if attributes.get('sort', False):
        history_data.sort()
    history_data = join_char.join(history_data)
    # lines_diff not applicable to multiline matching
    assert re.match(local_file, history_data, re.MULTILINE), "Multiline Regular expression did not match data file"


def files_contains(file1, file2, attributes=None):
    """Check the contents of file2 for substrings found in file1, on a per-line basis."""
    # TODO: allow forcing ordering of contains
    to_strip = os.linesep
    try:
        with open(file2, encoding='utf-8') as fh:
            history_data = fh.read()
        with open(file1, encoding='utf-8') as fh:
            local_file = fh.readlines()
    except UnicodeDecodeError:
        to_strip = os.linesep.encode('utf-8')
        with open(file2, 'rb') as fh:
            history_data = fh.read()
        with open(file1, 'rb') as fh:
            local_file = fh.readlines()
    if attributes is None:
        attributes = {}
    lines_diff = int(attributes.get('lines_diff', 0))
    line_diff_count = 0
    for contains in local_file:
        contains = contains.rstrip(to_strip)
        if contains not in history_data:
            line_diff_count += 1
        if line_diff_count > lines_diff:
            raise AssertionError("Failed to find '%s' in history data. (lines_diff=%i)" % (contains, lines_diff))