Mercurial > repos > iuc > ivar_trim

--- a/ivar_trim.xml	Thu Aug 05 12:46:37 2021 +0000
+++ b/ivar_trim.xml	Fri Aug 20 20:35:46 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="ivar_trim" name="ivar trim" version="@VERSION@+galaxy1">
+<tool id="ivar_trim" name="ivar trim" version="@VERSION@+galaxy2">
     <description>Trim reads in aligned BAM</description>
     <macros>
         <import>macros.xml</import>
@@ -61,7 +61,7 @@
         <conditional name="amplicons">
             <param name="filter_by" type="select"
             label="Filter reads based on amplicon info"
-            help="When you select Yes reads that are not fully contained in any amplicon will be dropped before primer trimming. Info on amplicons can be computed from the primer BED file or provided by the user. This option is currently marked as [Experimental] in ivar, but nevertheless recommended here.">
+            help="When you select Yes, reads that are not fully contained in any amplicon will be dropped before primer trimming. This option is currently marked as [Experimental] in ivar, but nevertheless recommended here. Info on amplicons can be computed from suitable primer BED files (see tool help below) or provided by the user.">
                 <option value="">No, allow reads to extend beyond amplicon boundaries</option>
                 <option value="yes_compute">Yes, drop reads that extend beyond amplicon boundaries</option>
                 <option value="yes">Yes, drop reads that extend beyond amplicon boundaries and use my amplicon info file</option>
@@ -150,10 +150,25 @@
 Optionally, the tool can also discard reads that do not fully map to within any
 amplicon. Such reads are likely to be wet-lab or mapping artefacts and removing
 them can increase variant calling precision. To calculate the extent of
-expected amplicons the tool requires an additional amplicon info dataset that
-lists the names of primers that together form any given amplicon. Primer names
-(exactly matching those in the primer info dataset) need to be TAB-separated
-with one line per amplicon.
+expected amplicons the tool needs to know which primers work together to form
+an amplicon. The tool can try to deduce this info from the names of the primers
+found in the primer info dataset. This will require a primer naming scheme
+following the regex pattern::
+
+  .*_(?P<amplicon_number>\d+).*_(?P<primer_orientation>L(?:EFT)?|R(?:IGHT)?)
+
+*e.g.*, the following primer names will work (and get parsed as shown):
+
+- ``nCoV-2019_1_LEFT`` (forward primer of amplicon 1)
+
+- ``400_2_out_R`` (reverse primer of amplicon 2)
+
+- ``QIAseq_163-2_LEFT`` (forward primer of amplicon 163)
+
+Alternatively, you can specify the amplicon information explicitly through a
+dataset that lists the names of primers that together form any given amplicon.
+In it, primer names (exactly matching those in the primer info dataset) need to
+be TAB-separated with one line per amplicon.
 If the primer scheme has more than two primers contributing to a given amplicon
 (in schemes using alternate primers), you can (in this Galaxy tool only)
 specify all of them on one line and the tool will calculate the maximum extent
--- a/write_amplicon_info_file.py	Thu Aug 05 12:46:37 2021 +0000
+++ b/write_amplicon_info_file.py	Fri Aug 20 20:35:46 2021 +0000
@@ -3,48 +3,49 @@
 import argparse
 import re

-AMPLICON_NAME_RE = r'.*_(?P<num>\d+)_[^0-9]*(?P<name>L(?:EFT)?|R(?:IGHT)?)'

-
-def primer_info_to_position(name):
-    position = 0
-    re_match = re.match(AMPLICON_NAME_RE, name)
-    if re_match is None:
-        raise ValueError("{} does not match expected amplicon name format".format(name))
-    side = re_match.group('name')
-    num = re_match.group('num')
-    if side == 'RIGHT' or side == 'R':
-        position += 1000
-    if num is not None:
-        position += int(num)
-    return position
+AMPLICON_PAT = re.compile(r'.*_(?P<num>\d+).*_(?P<name>L(?:EFT)?|R(?:IGHT)?)')


 def write_amplicon_info_file(bed_file, amplicon_info_file):
     amplicon_sets = {}
-    amplicon_ids = set()
     for line in bed_file:
         fields = line.strip().split('\t')
+        start = int(fields[1])
         name = fields[3]
-        re_match = re.match(AMPLICON_NAME_RE, name)
+        re_match = AMPLICON_PAT.match(name)
         if re_match is None:
-            raise ValueError("{} does not match expected amplicon name format".format(name))
+            raise ValueError(
+                '{} does not match expected amplicon name format'.format(name)
+            )
         amplicon_id = int(re_match.group('num'))
         amplicon_set = amplicon_sets.get(amplicon_id, [])
-        amplicon_set.append(name)
+        amplicon_set.append((name, start))
         amplicon_sets[amplicon_id] = amplicon_set
-        amplicon_ids.add(amplicon_id)

-    for id in sorted(list(amplicon_ids)):
-        amplicon_info = '\t'.join([name for name in sorted(amplicon_sets[id], key=primer_info_to_position)]) + '\n'
+    # write amplicons sorted by number with primers sorted by start position
+    for id in sorted(amplicon_sets):
+        amplicon_info = '\t'.join(
+            [name for name, start in sorted(
+                amplicon_sets[id], key=lambda x: x[1]
+            )]
+        ) + '\n'
         amplicon_info_file.write(amplicon_info)
     amplicon_info_file.close()


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Write an amplicon info file for iVar from a BED file describing primer positions')
-    parser.add_argument('bed_file', type=argparse.FileType(), help='Primer BED file')
-    parser.add_argument('amplicon_info_file', type=argparse.FileType('w'), help='Output file: amplicon info file in TSV format')
+    parser = argparse.ArgumentParser(
+        description='Write an amplicon info file for iVar '
+                    'from a BED file describing primer positions'
+    )
+    parser.add_argument(
+        'bed_file', type=argparse.FileType(), help='Primer BED file'
+    )
+    parser.add_argument(
+        'amplicon_info_file', type=argparse.FileType('w'),
+        help='Output file: amplicon info file in TSV format'
+    )
     args = parser.parse_args()

     write_amplicon_info_file(args.bed_file, args.amplicon_info_file)