Repository 'medaka_consensus_pipeline'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/medaka_consensus_pipeline

Changeset 13:72b5e9dda577 (2021-11-18)
Previous changeset 12:0e9e7fcdd543 (2021-09-17) Next changeset 14:cb34f00cc10f (2022-06-27)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 52289bc7b99bfa8a3bda46cb35cea98399419dab"
modified:
convert_VCF_info_fields.py
macros.xml
medaka_consensus.xml
added:
test-data/basecalls.fastq.gz
removed:
test-data/basecalls.fastq
b
diff -r 0e9e7fcdd543 -r 72b5e9dda577 convert_VCF_info_fields.py
--- a/convert_VCF_info_fields.py Fri Sep 17 20:23:12 2021 +0000
+++ b/convert_VCF_info_fields.py Thu Nov 18 20:01:28 2021 +0000
[
@@ -24,10 +24,10 @@
 
 
 def parseInfoField(info):
-    info_fields = info.split(';')
+    info_fields = info.split(";")
     info_dict = OrderedDict()
     for info_field in info_fields:
-        code, val = info_field.split('=')
+        code, val = info_field.split("=")
         info_dict[code] = val
     return info_dict
 
@@ -40,7 +40,7 @@
     and multiple alternate alleles with simple ref, alt allele counterparts.
     """
 
-    in_vcf = open(in_vcf_filepath, 'r')
+    in_vcf = open(in_vcf_filepath, "r")
     # medaka INFO fields that do not make sense after splitting of
     # multi-allelic records
     # DP will be overwritten with the value of DPSP because medaka tools
@@ -48,8 +48,8 @@
     # (https://github.com/nanoporetech/medaka/issues/192).
     # DPS, which is as unreliable as DP, gets skipped and the code
     # calculates the spanning reads equivalent DPSPS instead.
-    to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'}
-    struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>')
+    to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"}
+    struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")
     header_lines = []
     contig_ids = set()
     contig_ids_simple = set()
@@ -59,8 +59,8 @@
     # - redundant contig information
     while True:
         line = in_vcf.readline()
-        if line[:2] != '##':
-            assert line.startswith('#CHROM')
+        if line[:2] != "##":
+            assert line.startswith("#CHROM")
             break
         if line in header_lines:
             # the annotate tool may generate lines already written by
@@ -69,12 +69,12 @@
         match = struct_meta_pat.match(line)
         if match:
             match_type, match_id, match_misc = match.groups()
-            if match_type == 'INFO':
-                if match_id == 'DPSP':
-                    line = line.replace('DPSP', 'DP')
+            if match_type == "INFO":
+                if match_id == "DPSP":
+                    line = line.replace("DPSP", "DP")
                 elif match_id in to_skip:
                     continue
-            elif match_type == 'contig':
+            elif match_type == "contig":
                 contig_ids.add(match_id)
                 if not match_misc:
                     # the annotate tools writes its own contig info,
@@ -87,7 +87,7 @@
     # Lets check the above assumption about each ID-only contig line
     # having a more complete counterpart.
     assert not (contig_ids_simple - contig_ids)
-    header_lines.insert(1, '##convert_VCF_info_fields=0.2\n')
+    header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
     header_lines += [
         '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
         '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
@@ -96,47 +96,34 @@
         '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
         '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
         '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
-        line
+        line,
     ]
 
-    with open(out_vcf_filepath, 'w') as out_vcf:
+    with open(out_vcf_filepath, "w") as out_vcf:
         out_vcf.writelines(header_lines)
         for line in in_vcf:
-            fields = line.split('\t')
+            fields = line.split("\t")
             info_dict = parseInfoField(fields[7])
-            sr_list = [int(x) for x in info_dict["SR"].split(',')]
-            sc_list = [int(x) for x in info_dict["SC"].split(',')]
+            sr_list = [int(x) for x in info_dict["SR"].split(",")]
+            sc_list = [int(x) for x in info_dict["SC"].split(",")]
             if len(sr_list) != len(sc_list):
-                print(
-                    'WARNING - SR and SC are different lengths, '
-                    'skipping variant'
-                )
+                print("WARNING - SR and SC are different lengths, " "skipping variant")
                 print(line.strip())  # Print the line for debugging purposes
                 continue
-            variant_list = fields[4].split(',')
-            dpsp = int(info_dict['DPSP'])
+            variant_list = fields[4].split(",")
+            dpsp = int(info_dict["DPSP"])
             ref_fwd, ref_rev = 0, 1
-            dpspf, dpspr = (int(x) for x in info_dict['AR'].split(','))
+            dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
             for i in range(0, len(sr_list), 2):
                 dpspf += sr_list[i]
                 dpspr += sr_list[i + 1]
             for j, i in enumerate(range(2, len(sr_list), 2)):
-                dp4 = (
-                    sr_list[ref_fwd],
-                    sr_list[ref_rev],
-                    sr_list[i],
-                    sr_list[i + 1]
-                )
+                dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
                 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
                 _, p_val = scipy.stats.fisher_exact(dp2x2)
                 sb = pval_to_phredqual(p_val)
 
-                as_ = (
-                    sc_list[ref_fwd],
-                    sc_list[ref_rev],
-                    sc_list[i],
-                    sc_list[i + 1]
-                )
+                as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
 
                 info = []
                 for code in info_dict:
@@ -145,31 +132,31 @@
                     val = info_dict[code]
                     info.append("%s=%s" % (code, val))
 
-                info.append('DP=%d' % dpsp)
-                info.append('DPSPS=%d,%d' % (dpspf, dpspr))
+                info.append("DP=%d" % dpsp)
+                info.append("DPSPS=%d,%d" % (dpspf, dpspr))
 
                 if dpsp == 0:
-                    info.append('AF=NaN')
+                    info.append("AF=NaN")
                 else:
                     af = (dp4[2] + dp4[3]) / dpsp
-                    info.append('AF=%.6f' % af)
+                    info.append("AF=%.6f" % af)
                 if dpspf == 0:
-                    info.append('FAF=NaN')
+                    info.append("FAF=NaN")
                 else:
                     faf = dp4[2] / dpspf
-                    info.append('FAF=%.6f' % faf)
+                    info.append("FAF=%.6f" % faf)
                 if dpspr == 0:
-                    info.append('RAF=NaN')
+                    info.append("RAF=NaN")
                 else:
                     raf = dp4[3] / dpspr
-                    info.append('RAF=%.6f' % raf)
-                info.append('SB=%d' % sb)
-                info.append('DP4=%d,%d,%d,%d' % dp4)
-                info.append('AS=%d,%d,%d,%d' % as_)
-                new_info = ';'.join(info)
+                    info.append("RAF=%.6f" % raf)
+                info.append("SB=%d" % sb)
+                info.append("DP4=%d,%d,%d,%d" % dp4)
+                info.append("AS=%d,%d,%d,%d" % as_)
+                new_info = ";".join(info)
                 fields[4] = variant_list[j]
                 fields[7] = new_info
-                out_vcf.write('\t'.join(fields))
+                out_vcf.write("\t".join(fields))
     in_vcf.close()
 
 
b
diff -r 0e9e7fcdd543 -r 72b5e9dda577 macros.xml
--- a/macros.xml Fri Sep 17 20:23:12 2021 +0000
+++ b/macros.xml Thu Nov 18 20:01:28 2021 +0000
[
@@ -1,10 +1,10 @@
-<?xml version="1.0"?>
 <macros>
-    <token name="@TOOL_VERSION@">1.3.2</token>
+    <token name="@TOOL_VERSION@">1.4.4</token>
+    <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">20.01</token>
     <xml name="bio_tools">
         <xrefs>
-            <xref type="bio.tools">khmer</xref>
+            <xref type="bio.tools">medaka</xref>
         </xrefs>
     </xml>
     <xml name="requirements">
@@ -52,22 +52,51 @@
         <param argument="@ARGUMENT@" type="integer" value="100" min="1" label="Set inference batch size"/>
     </xml>
     <xml name="model" token_argument="-m" token_label="Select model">
-        <param argument="@ARGUMENT@" type="select" label="@LABEL@">
+        <param argument="@ARGUMENT@" type="select" label="@LABEL@" help="For best results it is important to specify the correct model, 
+            according to the basecaller used. Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION 
+            or PromethION), iii) the basecaller variant, and iv) the basecaller version">
+            <option value="r103_fast_g507">r103_fast_g507</option>
+            <option value="r103_fast_snp_g507">r103_fast_snp_g507</option>
+            <option value="r103_fast_variant_g507">r103_fast_variant_g507</option>
+            <option value="r103_hac_g507">r103_hac_g507</option>
+            <option value="r103_hac_snp_g507">r103_hac_snp_g507</option>
+            <option value="r103_hac_variant_g507">r103_hac_variant_g507</option>
             <option value="r103_min_high_g345">r103_min_high_g345</option>
             <option value="r103_min_high_g360">r103_min_high_g360</option>
             <option value="r103_prom_high_g360">r103_prom_high_g360</option>
             <option value="r103_prom_snp_g3210">r103_prom_snp_g3210</option>
             <option value="r103_prom_variant_g3210">r103_prom_variant_g3210</option>
+            <option value="r103_sup_g507">r103_sup_g507</option>
+            <option value="r103_sup_snp_g507">r103_sup_snp_g507</option>
+            <option value="r103_sup_variant_g507">r103_sup_variant_g507</option>
+            <option value="r104_e81_fast_g5015">r104_e81_fast_g5015</option>
+            <option value="r104_e81_hac_g5015">r104_e81_hac_g5015</option>
+            <option value="r104_e81_sup_g5015">r104_e81_sup_g5015</option>
             <option value="r10_min_high_g303">r10_min_high_g303</option>
             <option value="r10_min_high_g340">r10_min_high_g340</option>
             <option value="r941_min_fast_g303">r941_min_fast_g303</option>
+            <option value="r941_min_fast_g507">r941_min_fast_g507</option>
+            <option value="r941_min_fast_snp_g507">r941_min_fast_snp_g507</option>
+            <option value="r941_min_fast_variant_g507">r941_min_fast_variant_g507</option>
+            <option value="r941_min_hac_g507">r941_min_hac_g507</option>
+            <option value="r941_min_hac_snp_g507">r941_min_hac_snp_g507</option>
+            <option value="r941_min_hac_variant_g507">r941_min_hac_variant_g507</option>
             <option value="r941_min_high_g303">r941_min_high_g303</option>
             <option value="r941_min_high_g330">r941_min_high_g330</option>
             <option value="r941_min_high_g340_rle">r941_min_high_g340_rle</option>
             <option value="r941_min_high_g344">r941_min_high_g344</option>
             <option value="r941_min_high_g351">r941_min_high_g351</option>
             <option value="r941_min_high_g360" selected="true">r941_min_high_g360</option>
+            <option value="r941_min_sup_g507">r941_min_sup_g507</option>
+            <option value="r941_min_sup_snp_g507">r941_min_sup_snp_g507</option>
+            <option value="r941_min_sup_variant_g507">r941_min_sup_variant_g507</option>
             <option value="r941_prom_fast_g303">r941_prom_fast_g303</option>
+            <option value="r941_prom_fast_g507">r941_prom_fast_g507</option>
+            <option value="r941_prom_fast_snp_g507">r941_prom_fast_snp_g507</option>
+            <option value="r941_prom_fast_variant_g507">r941_prom_fast_variant_g507</option>
+            <option value="r941_prom_hac_g507">r941_prom_hac_g507</option>
+            <option value="r941_prom_hac_snp_g507">r941_prom_hac_snp_g507</option>
+            <option value="r941_prom_hac_variant_g507">r941_prom_hac_variant_g507</option>
             <option value="r941_prom_high_g303">r941_prom_high_g303</option>
             <option value="r941_prom_high_g330">r941_prom_high_g330</option>
             <option value="r941_prom_high_g344">r941_prom_high_g344</option>
@@ -76,6 +105,9 @@
             <option value="r941_prom_snp_g303">r941_prom_snp_g303</option>
             <option value="r941_prom_snp_g322">r941_prom_snp_g322</option>
             <option value="r941_prom_snp_g360">r941_prom_snp_g360</option>
+            <option value="r941_prom_sup_g507">r941_prom_sup_g507</option>
+            <option value="r941_prom_sup_snp_g507">r941_prom_sup_snp_g507</option>
+            <option value="r941_prom_sup_variant_g507">r941_prom_sup_variant_g507</option>
             <option value="r941_prom_variant_g303">r941_prom_variant_g303</option>
             <option value="r941_prom_variant_g322">r941_prom_variant_g322</option>
             <option value="r941_prom_variant_g360">r941_prom_variant_g360</option>
@@ -111,6 +143,28 @@
 
 This task is performed using neural networks applied from a pileup of individual sequencing reads against a draft assembly. It outperforms graph-based methods operating on basecalled data, and can be competitive with state-of-the-art signal-based methods, whilst being much faster.
     ]]></token>
+
+    <token name="@MODELS@"><![CDATA[
+
+----
+
+.. class:: infomark
+
+**Models**
+
+For best results it is important to specify the correct model (the -m option) according to the basecaller used. Allowed values can be found by running medaka tools list\_models.
+
+Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION or PromethION), iii) the basecaller variant, and iv) the basecaller version, with the format:
+
+    ::
+
+        {pore}_{device}_{caller variant}_{caller version}
+
+For example, the model named r941_min_fast_g303 should be used with data from MinION (or GridION) R9.4.1 flowcells using the fast Guppy basecaller version 3.0.3. By contrast, the model 
r941_prom_hac_g303 should be used with PromethION data and the high accuracy basecaller (termed "hac" in Guppy configuration files). Where a version of Guppy has been used without an exactly corresponding medaka model, the medaka model with the highest version equal to or less than the Guppy version should be selected.
+          
+    ]]></token>
+
     <token name="@REFERENCES@"><![CDATA[
 More information is available in the `manual <https://nanoporetech.github.io/medaka/index.html>`_ and `github <https://github.com/nanoporetech/medaka>`_.
     ]]></token>
b
diff -r 0e9e7fcdd543 -r 72b5e9dda577 medaka_consensus.xml
--- a/medaka_consensus.xml Fri Sep 17 20:23:12 2021 +0000
+++ b/medaka_consensus.xml Thu Nov 18 20:01:28 2021 +0000
b
@@ -1,4 +1,4 @@
-<tool id="medaka_consensus_pipeline" name="medaka consensus pipeline" version="@TOOL_VERSION@+galaxy0" profile="@PROFILE@">
+<tool id="medaka_consensus_pipeline" name="medaka consensus pipeline" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>Assembly polishing via neural networks</description>
     <macros>
         <import>macros.xml</import>
@@ -55,7 +55,7 @@
     <tests>
         <!-- #1 default -->
         <test expect_num_outputs="3">
-            <param name="i" value="basecalls.fastq"/>
+            <param name="i" value="basecalls.fastq.gz"/>
             <param name="d" value="assembly.fasta"/>
             <output name="out_consensus">
                 <assert_contents>
@@ -71,13 +71,13 @@
             </output>
             <output name="out_calls">
                 <assert_contents>
-                    <has_size value="343197" delta="100"/>
+                    <has_size value="343341" delta="100"/>
                 </assert_contents>
             </output>
         </test>
         <!-- #2 -->
         <test expect_num_outputs="5">
-            <param name="i" value="basecalls.fastq"/>
+            <param name="i" value="basecalls.fastq.gz"/>
             <param name="d" value="assembly.fasta"/>
             <param name="m" value="r941_min_fast_g303"/>
             <param name="b" value="99"/>
@@ -95,7 +95,7 @@
             </output>
             <output name="out_calls">
                 <assert_contents>
-                    <has_size value="343197" delta="100"/>
+                    <has_size value="343340" delta="100"/>
                 </assert_contents>
             </output>
             <output name="out_log">
@@ -119,10 +119,18 @@
 
 The *medaka_consensus* pipeline performs assembly polishing via neural networks.
 
+----
+
+.. class:: infomark
+
 **Input**
 
 An *assembly* in .fasta format and *basecalls* in .fasta or .fastq format are required. See `Creating a Draft Assembly  <https://nanoporetech.github.io/medaka/walkthrough.html#basecalling-and-draft-assembly>`_ for a detailed example of one method of obtaining these.
 
+----
+
+.. class:: infomark
+
 **Output**
 
 - Consensus polished assembly (FASTA)
@@ -132,6 +140,12 @@
 - Variants: VCF of changes (VCF)
 - Polished: BED file of polished regions (BED)
 
+@MODELS@
+
+----
+
+.. class:: infomark
+
 **References**
 
 @REFERENCES@
b
diff -r 0e9e7fcdd543 -r 72b5e9dda577 test-data/basecalls.fastq
--- a/test-data/basecalls.fastq Fri Sep 17 20:23:12 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b"@@ -1,500 +0,0 @@\n-@215f1e0c-27d1-4446-bca5-cb0fd6a8c054\n-TTGCAGTATCTGCGTCAGAATCGCATTCCAGCCGCAGGTGTTCAGCGGTGCGTACAATTACAGCATTATGTTAAATTTTATAATTGTCTTTAGTCATTGCGCTATGATTTGTAGGTGGAGTTGTTTCATATTTTGATATAAATTCTTTTATCTCACCAAACAGTCATCGTCAGTATCAATGGTTTTGTTCTTCATTGATATTACCTATTCTAGAATCTGATTCCTTGTACAACTAAGGTAAACTTGAACTTTGCTTTCTTTTCATTATTTGGGTAAGCGTAATGATGTAGGATCTAGGATATATCTATAATAATAATTCTTGTTCTTTTTTTATTTTATTGAATATGAATCAATACCTGTCTAAGCACACCTCTTTCTTACGCGTGAGCAATGGTTAGATGGATCTTCTGGCGGCCTTGCGCCATTCCTGTTTGTTC\n-+\n-?B4-)&'%<ABC>:883110,;*-$/+%)(.(($$#%&$$#%14+6-/=1@5@A>GE@F@<B8B..0F>?AA13EJHTIDHBDAJB@BOM908=B;=>49/(?>>?>?@?(004?C57/9<B>DEEE@@@C?=>=<912+,*'1/1396;78+&$&,15(85::C@>BA>9(,</(B:;G?B;@K=<HF?JF5**+&&0/#%$;>BCB=(4<.=91<4@ADC9:AC$=9>555=3412;6;+=>?@>2$%+839@BH=CAOHFCI<.98<-0/*$%)0**()(')&$)$474)+/,'267432&&'&''''$%&'*')>?C3=;467===95489G:./5229,$$&$(&&$%%(''')+8%/0%.-%0$')&'*=566(-(685''57./4%)44+/*$$$'42:;6($$##$$$'-0(%67112$$'$())'7;:\n-@baf1c4f7-8f02-4c92-be5f-0431ca399c18\n-GGTATTACTTCGTTCAGTTACGTATTGCTGCGCGCACTCTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACAGGGGAGTTCGTGCACACAGCCCGAAGCTTGGAGCGAACGACCTACGCGAACCGAGATACCTACAGCGTGAGCTATGAAAGCGCCACGCTTCCCGAAGGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCGGGGGAAACACACCTGGTATCTTTTTATAGTCCTGTCCGGGTTTCGCCACCTCTGGCAGGCGTCGATTTTTGTGATGCTCGTCAGAGCGGAATTTATGGAAACGCCTGCGGCGCTGGCTTCCTCCGATGCTGCTTTTTGCTCACATATTCTTTCCGACTTTATCCCCCGATTCTGTGGATAACCGTATTACCGCTGCAGGTGAGCTGACACGCTCGCCGCAGTCGAACGACCGAGAGCGTAGCGAGTCAGTGAGCGAGGAAGCGGAAGAGCGCCTATGTGACATTTTCTCCTTACGCTCTGTTGTGCCGTTCGGCATCCTGTCTGAGCGTTATCTCTCTGTGCTATTTTTACTTCAAAGCGTGTCTGGATGCTGTTCTGGAGTTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGCAGCCTGTTCGTCAGCATCGGGTGCGTCCAGTTTTCGAGCAGCATTCAGGCTCTGACTTTTATGAATCCCGCCATGTTGGTACGGCTTTTCTTTCTTGTTCATCTTTTCGTTTTCTCCGTTCTGTCTGTCATCTGCGTTGTGTATGATTATATCGCGTACCACTTTTCGGCTGTTTGCTGCCGTTATTCTGCGCCGCTTGGCTTTTTGACGGGCATTTCTGTCAGACAACACTGTCACTGCCAAAAAACTGCCGTGCCTTTGTCGGTAATTCGAGCTTGCTGACAGGACAGGATGTACAATTGTTATACCGCGCAT
ACATGCACGCTATTACAATTGCCCTGGTCAGGAGCTTTGCCCCGACACCCATGTCCAGATACGGAGCCATGTTTTGCTGACAAAACGAAGTGGAAGTAATACGCGCAGGCGGGCTATCAGTCCCCTGTTCGTCTGACGGCAGAGAGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAGACCGTTTCGGTTTTTACGGGCGGCAGCTCTCGGTAAGAAAGTTAACCCACTGACTGATGATCGGGTACTGAAAGAAGTTATGAGACTGGGAGCGTTACAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGACAGGAGTATGCGTTTGATCGCTATTACGGGTATCACGTGCCCTGTTATCAGACTTATGGCAGATTGGCTTCCCGGAGAGAAAACTGTCGAAAACAGACGGTATGAACACCGTAAGCTCCCAAAGTGATCACCATTCGCTTTCATGCATAGCTATGCAGCGAGCTGAAAACGATCCTGACGCATCCTTCCTGTTTTCCCGGGGTAAACATCTTTTTTGCAGTGTCTGCGTCAGAATCGCGTTCAGCGCGTTTCAGCAGTGCGTACAATTACAGCATTATGTTAAATTTTATAGTGTCTTTAGTCATTTGCGCTATGATTTTGTAGGTGGAGTTGCTTCATATTTTTGATATAAAGTTCTTTTCCCATCTGCTAGAGCAGTCATCGTCAGTATCAATGGTTTGTTCTTCATTGATATGCCTATTCTAGAATCTGGTCCTTATTACAACTAAGGTAAACTTGAACTTTTGCTTTCTTTTCATTATTTGGGTAAGCGTAATCAGGATCTAAGGATGTAATAATAGATTCTTGTTCTTTTTTATTTTATTGAATAGATCGTCTATATATCTAAACACCTCTTTCTTACACGCGTGAGCAATAGTTGGAGTCTTCTATTGCGCCATTCTTGTTTTTCCTGGCATGATAAAGAAGATATTTTTGTGCTTCCCGTTAATGGCTATTTCATTGTGCGAGCTTGGTGATAAATGTTAAGCTCATTTAATGTTGAAATTATCTGATCTTTCCGTGTCAGAAAAACGATTCCGTCGGCATCGATGATTAAGAATATACTTTCTTTTCCATTAAAGACTCTATGCTTTTAAATGCTCTAAGGGTTTGTTTTAGTTTGTCTTTCCATTTTATTTTTTCAACATAAACATCATTAATTCCTTCTATTTCAAGTAGATTTAGGAAGTGAAAATCATCATCACCCTCAACAAAAAATGACTTTTTTATAATCATTTATTTTCTCGGTTTCATTTATCTCACCTCCCAGCCATTTTCAACTGAAAAGGAAAATCATCCATGTTAAATTGCATTGCTGTAGGTTTTGGGAATTGCTTGTCTCTTCCCAGTCTTATGTAAGATATATCTTTCTTTTTCATCTAATTCGTTTATCGCACATATTACATCATGGCTGTGTGTTGTTGTAAATATTTGATTATTATTGGTTTTGCTGCTTCTATCAAAGCGTTGATCATATCTTTGATTACTGAGTAATGAATTCCATTTTCTATCTCATCAATGAGAATAATTGAGTCTTTTTGAACTAAGATTGAGCTAATGAAAGACAGCGCTCTGCTTACTCCTTCACCAAGCATTGATATTTCATTTAATTCACTTAGTCCTATATCTAAAAAAATCTCTTTGTTTTGATCTGTTGCTATTATTGCAATATCGTTAATTCTATTATCTATTCTTTTAGGTTGCCTATTATTTCATCTTTCTTTTGTGTTAGTAGTTTGACCGTTTTTATCGTTGCTTCTTTGTTTATGCTTCTTGATGTGGTTATTACCGCGCTGCGGTTTTGAACTCATATAGCTTACACACTTTTTAGTTTGTTTTATTTCTCCGTTTATTTCTTGTGCTTTTTGTGTTATTTTTATAGTAATCAATTCTTCTGGGGGTTCTTTTCCGTTCTGTGTTTCACTAACCTTAATCGTTAAAACATTCCCCTTTTCCT
GAGAATA"..b"=8+,$/-.$#+**'*;<=@==<..8-/0.1+,5003:><+>7=79227:678><:>99//16<==(@AGCBAE@>F/A@;BBCA?@:?FONPIBJGDDE>@<.%+,+$%)-/8)3==B>%$>=?@<IB;<>=<>D;0FB;8:=??@++.-500*'?:<80B8=<773007718..)5&&=/.01/2289,0<>8B>@?))+287%789;C?533*)(#&%-''-3;-*)(%35<5469<76:EF?<;=>87597<;3>9,&%+&3/3=996&556;=BG*3(.,<6)3).)+AA;@B*5&;>956?;;7/.10)00/,'%(>/299.)7.876.GCF;27-,=9-3--**$1,*)'/:&-0--,.35;:947873.306''*B=:>15B;GBG557><=@B99;666889;89<A?MEMI:?>?FCEIA7.?$--,+$.847'*22.*)24-'(86BA=73>BB6&*73;,$465:?)?((6-+++))$(--.,*<2>49474987=;=:;;:66537<=D@>DFP>-/&'/'258740.4*&'%&-5779,+...-&'+/1.0))4,-57?=?EGBB?8/,*%4&%08AB<<8-=DB7?10%$(+5-5:=8*)&%'%,'4AB>>&69GI?<899//:@;>1>BECA?2@%'./9GCDHAC@JJ:ADDE;D3,/7>>7125C;67;3772>;$7=A=H<77@B>E>A.<KDRPJA?><>@BAA00CAG.$:'?DDMFCDDEBB<-/:@?BFHE?<5<9+=@472<2=%+567;5A7&5=/0*++()78CCHDB>:;2EJ?>1)89?94/00?B?HKP/B88.2-7:976(+((,0:97==HEB;6..C66H://2ABFF;;3>:8C12AGH<AD81/28?A@HDGA<=BBHE?;ABE=?C',?F?B95<=64=@@AD>66;F?5:7@B?G:235>A64433/0&'(...1:;;BEDAOL?>?B@A:F=3D?>?&492+,-69)).7668@4;2,:DDEDB@80+*&.,#&($',('&/8:1/A-+,**;<96;9;><A/%6**$$%,6765A/1:=@B<=7,?ACDAC7D/446B>($DC>1.05-,++?9?A<>;17>AFHFC@A4-*-,21'&'(511.6/221$$&$%'&&&%014*??CC=<:FDCBB448@8:>65A35)565233>?FD;9DF<2<?D>?77<@F>?9*-.:84:?9832)+?+(*+25=))8//-<@@EB3+?HDB*&7-957*,/-*832?8@<=3-;908<4@96;(3&847((,&$$$<,5:/71,+*8669;<-:5;JDIE5,<+-?@1:A9@CD=.E/7A45::;8CECA@657A9>FD>AFBGD*$D-2BEA>;?4>C621?38B>C=>?::;@87%8981;<-4,2@AAA9<:52238>?A;@:ACCC3?:=0+/$&31@<,&:95)4>=018'/':9B6/)))9@85/*330:8;19E50@C<=6>4;6>?,8@@((G9:B::;CE=?9@8>ECF<FNJ7;KCCGHB+FE:>=?@>=A@EC?GBB389738//.;=A<:1)4==>HG7892AB90,)%&%#&%&2=C;@CI;0+D99=0084:5,,4;:@?FA88679:=7B@DA@B;<=:58;?>=;C?+/C).-*;3417=::=65FBDAC25+*<9?DC>BA;1)&&/6A=B?D0.5*/*(-/31,-$%/(4@<=:BCBDGBE78935%%;=78D>0.0.BEKH<@@DEH4:;HGA==:@<>AG,:A@=<:<GHF9-7(0=9<8;=?3AA+,$156>??A?ACBA32+,(/>??<;?.7&&&(*()$(59511/*000@A=BBA>?@IHNB8CA::-0235:;>?77:8;A>AA=:>=>66344CA1429B9:<@@@@A8927DCHGGOHFCA?A@EC4**+:66ADD@?CE>IHC49C;<=2(42(4685D$(?B<;:=>6?DB55AE@55@::(;?>II?CD=I@IC;IFUN:JHADG5-)88?=;<<
85@D?9;@AA-/.256>DCC;:==+=:3627//080/*&.@<=5C@FE4942?D22'<?::;==20)'::;(9(%5+6454<4B;%+;A7)DC:MJLH<D8<CE7A9CCCG879@2GCD5;35.('&')*+,777$((+)17-7F69B9(&/15BA@BC==B0*-021:633$$7===E9;=D91?00;3D>777<?4,'=;@/CBG<=?;7?8)*.+*..**,,./.-CJGA@C@@?110BB<A@9327<=@@GLJ94/999662;<FEB@?@N@7:AAOH<2255),,/3C;HFPE?@BB>259;58@<>@GDC@@C9:=@A;<<>AA128@@B89>09?@+93><6;449<<9?==?>=::43.0-5%$$$$%43/,+7DGFTG=;:D?@=<8A>@<>?QGC<0,$,8:.2.06;:7>,=?==>+7465898<<NBD<<<<;<H>1%4<@KH:9>E;=@:<>IG)+33>9;9:9/12(555<AAD@:-,-:;?*/*-*%$(21.+1166=?BG*%%0+,'133.-'%%?@A9D=BDB=>=DFDEEI<8KMH5.;,,,-6A>:=79=73&$(3.68?8F>EF<ABBE?C;44HH?;33;444&;;(-<AMG@30C9896?5@:=AA94,+2481+>*@=:@B@BB>>3(((*).79:BECBED-07?ESAFDE<;+B944**+1>CFIID<;7/17>RNDEJA7:-.<??@DDHG<=41;<AB@=30,&+/%$)%$;AEGGCFFAA@HEB>-.,)*</AB<3.3*-%%++,6+))(\n-@ef1c09bf-a870-4994-85e6-196592dd4143\n-GTTGTACTTCGTTCGGTTACGTATTGCTGCGCGCGCACTCTCTGTCGTCAACGGCGTCAGATATTATTATAAGAGACAGCTCCTTCACCAAGCATTGATATTTCATTTAATTCGCCAGTCCTATATAAAAAAAATCTCTTTGTTTGATCTGTTGCTATTATTGCAATATCGTTAATTCTATTATCTATTCTTTTTAGGTTACCTATTATTTCATCTTTTCTTTTTGTGTTAGTAGTTTTGATACGTTTTATCGTTGCTTCTTCATTTATGTTTCTTGATGTGGTTATTATCGCTTGCGGTTTTGAATTCATATAGCTTACCTTTTTTAGTTTGTTTTATTTCTCCGTTTATTTCTTGTGCTTTTTTGTGTTATTTTTATAGTAATCAATTCTTCTGGGGGTTCTTTTTCCGTTCTGTGTTTCACTAACCTTAATCATTTAAAACATTTCCCCCCTTTCCTGAGAGCCGAGGGGTTTGTCGCAATGGAGGGTGTTTGATAATATTGACTCAACATTGGGGGACACGCTTGATTCTGTCTCTTATATACAATCTCCGAGCCCACGAGACCAGAGTGCGCGCAGCAATGCGTAACT\n-+\n-)8;541>AA?9F3@'.FD:->?E?D?AB56<70+)*&%)/02-2<<67;$$/$06518..1(*',)&(24')$&)16:6@@89EA=3)703,3''+0=85MG>B>@C?:;:,>$%%).))6-&'&%BD???@>=:?B;<?D>76750:>C9EE9D=@<8./1-.;>GE?A<NIDDECAGCFIC>BECA@;D7==491<E1%==A9HE<DD>D@>?@CM;)=@KHD9BABA98@A<BENCA;:6;3<=CAA@@@FI>78834.''24DA99@BI?MFFDD<>27*=<:?($4$%.11705A@MD?<8A=@7<@4C1ACCB@>GDB?8,1(8=F==:?A?FCC9A3><<=;79<C==<%?)9>?IG>='>,AF=CAJIC6AB@>0?9<68&*8K@C?467.<=5134LID564>:621=ACEH<@E:8<343/BA7C779<?AD@>9?BD23DB;889@:@=>=**342,+68:4CELD@G2C?@@BDC9)=GDHH@E@FBBGBGJEDBBA007/65$%<?FKB=B?((+EBJ>>?;00:;9;60+&&;798>>&924&ACG8>>=BC@?8:2;EA7.BB1
+481254D>:5/#%\n"
b
diff -r 0e9e7fcdd543 -r 72b5e9dda577 test-data/basecalls.fastq.gz
b
Binary file test-data/basecalls.fastq.gz has changed