diff data_manager/install_primer_scheme_bedfiles.py @ 6:dd451e45681c draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_primer_scheme_bedfiles commit 4880dcfcdddd9ed8415ccde01b2f8e2c28dab5c3"
author iuc
date Tue, 16 Nov 2021 08:22:08 +0000
parents 157a5c9ea72a
children
line wrap: on
line diff
--- a/data_manager/install_primer_scheme_bedfiles.py	Sat Apr 24 20:56:25 2021 +0000
+++ b/data_manager/install_primer_scheme_bedfiles.py	Tue Nov 16 08:22:08 2021 +0000
@@ -18,33 +18,50 @@
 DATA_TABLE_NAME = "primer_scheme_bedfiles"
 
 
-def write_artic_style_bed(input_file, bed_output_filename):
+def convert_and_write_bed(input_file, bed_output_filename, scheme_name, force_string=True):
     with open(bed_output_filename, "w") as bed_output_file:
         for line in input_file:
-            fields = line.split("\t")
-            if len(fields) < 6:
-                # too short to encode the strand format
-                exit("invalid format in BED file: {}".format(line.rstrip()))
+            fields = line.strip().split("\t")
+            if "Midnight" in scheme_name:
+                # Midnight primers are distributed in a tabular file, not a BED file
+                if line.startswith("Primer Name"):
+                    continue
+                if len(fields) != 8:
+                    exit("Unexpected format in Midnight primer file: {}".format(line.rstrip()))
+                (primer_name, _, pool, _, _, _, start, end) = fields
+                strand = '+' if primer_name.endswith('LEFT') else '-'
+                if strand == '-':
+                    start, end = end, start
+                fields = ["MN908947.3", start, end, primer_name, pool, strand]
+            else:
+                if len(fields) < 5:
+                    # too short to encode the "ARTIC style BED" format
+                    exit("invalid format in BED file: {}".format(line.rstrip()))
+            # 'BED' format used by ARTIC pipeline uses
+            # chrom  start  end  primer_name  pool_name
+            # see this: https://github.com/artic-network/fieldbioinformatics/blob/master/artic/vcftagprimersites.py#L76
+            # for ARTIC minion and
+            # this: https://github.com/andersen-lab/ivar/blob/master/src/primer_bed.cpp#L125
+            # for ivar trim (ivar trim treats the file as BED following the standard but also allows the ARTIC format)
             try:
-                # try and parse field 5 as a number
-                score = float(fields[4])
+                float(fields[4])
             except ValueError:
-                # Alright, this is an ARTIC-style bed,
-                # which is actually against the specs, but required by the
-                # ARTIC pipeline.
+                # this is a string, we can leave it as is
                 pass
             else:
-                # This is a regular bed with numbers in the score column.
-                # We need to "fix" it for the ARTIC pipeline.
-                fields[4] = '_{0}'.format(score)
-            bed_output_file.write("\t".join(fields))
+                # ensure that it is forced to be a string
+                fields[4] = '_{0}'.format(fields[4])
+            print('\t'.join(fields), file=bed_output_file)
 
 
-def fetch_artic_primers(output_directory, primers):
+def fetch_primers(output_directory, primers):
     primer_sets = {
         "SARS-CoV-2-ARTICv1": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed",
         "SARS-CoV-2-ARTICv2": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed",
         "SARS-CoV-2-ARTICv3": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed",
+        "SARS-CoV-2-ARTICv4": "https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V4/SARS-CoV-2.scheme.bed",
+        "VarSkip-V1a": "https://raw.githubusercontent.com/nebiolabs/VarSkip/main/schemes/NEB_VarSkip/V1a/NEB_VarSkip.scheme.bed",
+        "Midnight-v1": "https://zenodo.org/record/3897530/files/SARS-CoV-2_primer_sets_RBK004_nanopore_sequencing.tab?download=1"
     }
 
     data = []
@@ -62,19 +79,23 @@
             )
             exit(response.status_code)
         bed_output_filename = os.path.join(output_directory, name + ".bed")
-        write_artic_style_bed(StringIO(response.text), bed_output_filename)
-        description = name[:-2] + " " + name[-2:] + " primer set"
+        convert_and_write_bed(StringIO(response.text), bed_output_filename, name)
+        if 'ARTIC' in name:
+            # split the vX from the rest of the name in ARTIC primer set description
+            description = name[:-2] + " " + name[-2:] + " primer set"
+        else:
+            description = name + " primer set"
         data.append(dict(value=name, path=bed_output_filename, description=description))
     return data
 
 
 def install_primer_file(
-    output_directory, input_filename, primer_name, primer_description
+    output_directory, input_filename, scheme_name, primer_description
 ):
-    name = re.sub(r"\W", "", str(primer_name).replace(" ", "_"))
+    name = re.sub(r"[^\w-]", "", str(scheme_name).replace(" ", "_"))
     output_filename = os.path.join(output_directory, name + ".bed")
     with open(input_filename) as input_file:
-        write_artic_style_bed(input_file, output_filename)
+        convert_and_write_bed(input_file, output_filename, scheme_name)
     data = [dict(value=name, description=primer_description, path=output_filename)]
     return data
 
@@ -86,7 +107,7 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Fetch ARTIC SARS-CoV-2 primer files for Galaxy/IRIDA use"
+        description="Fetch ARTIC, VarSkip and Midnight SARS-CoV-2 primer files for Galaxy/IRIDA use"
     )
     parser.add_argument(
         "--output_directory", default="tmp", help="Directory to write output to"
@@ -146,12 +167,10 @@
 
     data_manager_dict = {}
     data_manager_dict["data_tables"] = config.get("data_tables", {})
-    data_manager_dict["data_tables"][DATA_TABLE_NAME] = data_manager_dict[
-        "data_tables"
-    ].get(DATA_TABLE_NAME, [])
+    data_manager_dict["data_tables"][DATA_TABLE_NAME] = []
 
     if args.artic_primers:
-        data = fetch_artic_primers(output_directory, args.artic_primers)
+        data = fetch_primers(output_directory, args.artic_primers)
     else:
         data = install_primer_file(
             output_directory,