# HG changeset patch # User iuc # Date 1629491746 0 # Node ID c092052ed67317a4f8a7035560a488b506249423 # Parent 397e5f0eb3ef5019dba764b933a58d52afa2a1b5 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ivar/ commit f09d0bee3e957564beccb1bdb3610de02f639ec7" diff -r 397e5f0eb3ef -r c092052ed673 ivar_trim.xml --- a/ivar_trim.xml Thu Aug 05 12:46:37 2021 +0000 +++ b/ivar_trim.xml Fri Aug 20 20:35:46 2021 +0000 @@ -1,4 +1,4 @@ - + Trim reads in aligned BAM macros.xml @@ -61,7 +61,7 @@ + help="When you select Yes, reads that are not fully contained in any amplicon will be dropped before primer trimming. This option is currently marked as [Experimental] in ivar, but nevertheless recommended here. Info on amplicons can be computed from suitable primer BED files (see tool help below) or provided by the user. "> @@ -150,10 +150,25 @@ Optionally, the tool can also discard reads that do not fully map to within any amplicon. Such reads are likely to be wet-lab or mapping artefacts and removing them can increase variant calling precision. To calculate the extent of -expected amplicons the tool requires an additional amplicon info dataset that -lists the names of primers that together form any given amplicon. Primer names -(exactly matching those in the primer info dataset) need to be TAB-separated -with one line per amplicon. +expected amplicons the tool needs to know which primers work together to form +an amplicon. The tool can try to deduce this info from the names of the primers +found in the primer info dataset. This will require a primer naming scheme +following the regex pattern:: + + .*_(?P<num>\d+).*_(?P<name>L(?:EFT)?|R(?:IGHT)?) 
+ +*i.e.*, the following schemes will work (and get parsed as): + +- ``nCoV-2019_1_LEFT`` (forward primer of amplicon 1) + +- ``400_2_out_R`` (reverse primer of amplicon 2) + +- ``QIAseq_163-2_LEFT`` (forward primer of amplicon 163) + +Alternatively, you can specify the amplicon information explicitly through a +dataset that lists the names of primers that together form any given amplicon. +In it, primer names (exactly matching those in the primer info dataset) need to +be TAB-separated with one line per amplicon. If the primer scheme has more than two primers contributing to a given amplicon (in schemes using alternate primers), you can (in this Galaxy tool only) specify all of them on one line and the tool will calculate the maximum extent diff -r 397e5f0eb3ef -r c092052ed673 write_amplicon_info_file.py --- a/write_amplicon_info_file.py Thu Aug 05 12:46:37 2021 +0000 +++ b/write_amplicon_info_file.py Fri Aug 20 20:35:46 2021 +0000 @@ -3,48 +3,49 @@ import argparse import re -AMPLICON_NAME_RE = r'.*_(?P<num>\d+)_[^0-9]*(?P<name>L(?:EFT)?|R(?:IGHT)?)' - -def primer_info_to_position(name): - position = 0 - re_match = re.match(AMPLICON_NAME_RE, name) - if re_match is None: - raise ValueError("{} does not match expected amplicon name format".format(name)) - side = re_match.group('name') - num = re_match.group('num') - if side == 'RIGHT' or side == 'R': - position += 1000 - if num is not None: - position += int(num) - return position +AMPLICON_PAT = re.compile(r'.*_(?P<num>\d+).*_(?P<name>L(?:EFT)?|R(?:IGHT)?)') def write_amplicon_info_file(bed_file, amplicon_info_file): amplicon_sets = {} - amplicon_ids = set() for line in bed_file: fields = line.strip().split('\t') + start = int(fields[1]) name = fields[3] - re_match = re.match(AMPLICON_NAME_RE, name) + re_match = AMPLICON_PAT.match(name) if re_match is None: - raise ValueError("{} does not match expected amplicon name format".format(name)) + raise ValueError( + '{} does not match expected amplicon name format'.format(name) + ) 
amplicon_id = int(re_match.group('num')) amplicon_set = amplicon_sets.get(amplicon_id, []) - amplicon_set.append(name) + amplicon_set.append((name, start)) amplicon_sets[amplicon_id] = amplicon_set - amplicon_ids.add(amplicon_id) - for id in sorted(list(amplicon_ids)): - amplicon_info = '\t'.join([name for name in sorted(amplicon_sets[id], key=primer_info_to_position)]) + '\n' + # write amplicons sorted by number with primers sorted by start position + for id in sorted(amplicon_sets): + amplicon_info = '\t'.join( + [name for name, start in sorted( + amplicon_sets[id], key=lambda x: x[1] + )] + ) + '\n' amplicon_info_file.write(amplicon_info) amplicon_info_file.close() if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Write an amplicon info file for iVar from a BED file describing primer positions') - parser.add_argument('bed_file', type=argparse.FileType(), help='Primer BED file') - parser.add_argument('amplicon_info_file', type=argparse.FileType('w'), help='Output file: amplicon info file in TSV format') + parser = argparse.ArgumentParser( + description='Write an amplicon info file for iVar ' + 'from a BED file describing primer positions' + ) + parser.add_argument( + 'bed_file', type=argparse.FileType(), help='Primer BED file' + ) + parser.add_argument( + 'amplicon_info_file', type=argparse.FileType('w'), + help='Output file: amplicon info file in TSV format' + ) args = parser.parse_args() write_amplicon_info_file(args.bed_file, args.amplicon_info_file)