Repository 'bionano_scaffold'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/bionano_scaffold

Changeset 4:8cc3862f8b8e (2021-05-25)
Previous changeset 3:295c0e28f4ee (2021-05-23) Next changeset 5:96cacb31d571 (2021-05-29)
Commit message:
"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
modified:
bionano_scaffold.xml
macros.xml
added:
remove_fake_cut_sites.py
test-data/test_05_report.txt
b
diff -r 295c0e28f4ee -r 8cc3862f8b8e bionano_scaffold.xml
--- a/bionano_scaffold.xml Sun May 23 17:23:03 2021 +0000
+++ b/bionano_scaffold.xml Tue May 25 20:12:52 2021 +0000
[
b'@@ -7,6 +7,8 @@\n     <expand macro="requirements"/>\n     <command detect_errors="exit_code"><![CDATA[\n         #set RefAligner = \'/usr/local/bin/RefAligner\'\n+        #set output_file_NCBI = \'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NCBI.fasta\'\n+        #set output_file_not_scaffolded = \'hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD_NOT_SCAFFOLDED.fasta\'\n         ## softlinks do not work\n         cp \'${ngs_fasta}\' ./ngs.fasta\n         && cp \'${bionano_cmap}\' ./bionano.cmap\n@@ -33,8 +35,7 @@\n         -r $RefAligner\n         #if $conflict_resolution\n             -M \'${conflict_resolution}\'\n-        #end if\n-        #if not $conflict_resolution\n+        #else\n             -B $conflict_filter_genome\n             -N $conflict_filter_sequence\n         #end if\n@@ -54,7 +55,10 @@\n         -f\n         $zip_file\n         -o ./\n-\n+        && cat $output_file_NCBI $output_file_not_scaffolded > total_contigs_raw.fasta\n+        #if $trim_cut_sites\n+            && python \'$__tool_directory__/remove_fake_cut_sites.py\' \'total_contigs_raw.fasta\' \'total_contigs_trimmed.fasta\' \'output.log\'\n+        #end if        \n     ]]>    </command>\n     <configfiles>\n         <configfile name="vgp_mode"><![CDATA[\n@@ -70,7 +74,7 @@\n                         <flag attr="RAmem" val0="3" val1="1"/>\n                     </global>\n                     <fasta2cmap>\n-                        <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. Avalible enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/>\n+                        <flag attr="enzyme" val0="$configuration_options.enzyme" display="Enzyme" group="FASTA to CMAP digestion" description="Define single enzyme for in-silico FASTA to CMAP digestion. 
Available enzymes: BspQI, BbvCI, BsmI, BsrDI, BssSI, DLE1."/>\n                         <flag attr="channelNum" val0="1" display="Channel number" group="FASTA to CMAP digestion" description="Specify the channel the enzyme was used."/>\n                         <flag attr="minLabels" val0="0" display="Minimum label sites" group="FASTA to CMAP digestion" description="Specify minimum number of label sites per digested contig."/>\n                         <flag attr="minLength" val0="0" display="Minimum length (Kb)" group="FASTA to CMAP digestion" description="Specify minimum length in Kb of each digested contig."/>\n@@ -384,6 +388,7 @@\n             <option value="3">Exclude conflicting contig</option>\n         </param>\n         <param name="zip_file" argument="-z" type="boolean" truevalue="-z results.zip" falsevalue="" checked="false" label="Generate an output package in ZIP format" help="The hybrid scaffold output package (.zip) can be imported into Access for visualization" />\n+        <param name="trim_cut_sites" type="boolean" checked="true" label="Remove BioNano cut sites" help="This option removes the spurious BioNano cut sites that are inserted into gaps in some assemblies, replacing them with Ns." 
/>\n         <!-- \n         \n         Those options have been disabled because the Docker container doesn\'t include the required packages\n@@ -415,16 +420,22 @@\n         -->\n     </inputs>\n     <outputs>\n-        <data name="ngs_contigs_scaffold_fasta" format="fasta" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.fasta" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (fasta)"/>\n-        <data name="ngs_contigs_scaffold_agp" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.agp" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (agp)"/>\n-        <data name="ngs_contigs_scaffold_gap" format="txt" from_work_dir="hybrid_scaffolds/bionano_bppAdjust_cmap_ngs_fasta_NGScontigs_HYBRID_SCAFFOLD.gap" label="${tool.name} on ${on_string}: NGScontigs hybrid scaffold (gap)"/'..b'aram name="zip_file" value="true"/>\n-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">\n+            <param name="trim_cut_sites" value="false"/>\n+            <output name="ngs_contigs" ftype="fasta">\n                 <assert_contents>\n                     <has_size value="4753369" delta="100" />\n                     <has_n_lines n="2"/>\n                     <has_line line=">Super-Scaffold_1"/>\n                 </assert_contents>\n             </output>\n-            <output name="ngs_contigs_scaffold_agp" file="test_03.agp" ftype="txt"/>\n-            <output name="ngs_contigs_scaffold_gap" file="test_03.gap" ftype="txt"/>\n             <output name="report" file="test_03_report.txt" ftype="txt"/>\n             <output name="results" ftype="zip">\n                 <assert_contents>\n@@ -542,7 +550,7 @@\n                 <has_text text="hybridScaffold"/>\n             </assert_stdout>\n         </test>\n-        <test expect_num_outputs="5">\n+        <test expect_num_outputs="3">\n             <param name="ngs_fasta" 
value="assembly.fasta.gz"/>\n             <param name="bionano_cmap" value="colormap_assembly.cmap"/>\n             <param name="conflict_filter_genome" value="2"/>\n@@ -552,15 +560,14 @@\n                 <param name="enzyme" value="BspQI"/>\n             </conditional>\n             <param name="zip_file" value="true"/>\n-            <output name="ngs_contigs_scaffold_fasta" ftype="fasta">\n+            <param name="trim_cut_sites" value="false"/>\n+            <output name="ngs_contigs" ftype="fasta">\n                 <assert_contents>\n                     <has_size value="4753369" delta="100" />\n                     <has_n_lines n="2"/>\n                     <has_line line=">Super-Scaffold_1"/>\n                 </assert_contents>\n             </output>\n-            <output name="ngs_contigs_scaffold_agp" file="test_04.agp" ftype="txt"/>\n-            <output name="ngs_contigs_scaffold_gap" file="test_04.gap" ftype="txt"/>\n             <output name="report" file="test_04_report.txt" ftype="txt"/>\n             <output name="results" ftype="zip">\n                 <assert_contents>\n@@ -583,6 +590,43 @@\n                 <has_text text="hybridScaffold"/>\n             </assert_stdout>\n         </test>\n+        <test expect_num_outputs="3">\n+            <param name="ngs_fasta" value="assembly.fasta.gz"/>\n+            <param name="bionano_cmap" value="colormap_assembly.cmap"/>\n+            <param name="conflict_filter_genome" value="3"/>\n+            <param name="conflict_filter_sequence" value="3"/>\n+            <conditional name="configuration_options">\n+                <param name="configuration" value="file"/>\n+                <param name="configuration_file" value="configuration.xml"/>\n+            </conditional>\n+            <param name="trim_cut_sites" value="true"/>\n+            <output name="ngs_contigs_trimmed" ftype="fasta">\n+                <assert_contents>\n+                    <has_size value="4832591" delta="300" />\n+            
    </assert_contents>\n+            </output>\n+            <output name="ngs_contigs_trimmed_report" ftype="txt">\n+                <assert_contents>\n+                    <has_size value="0" />\n+                </assert_contents>\n+            </output>\n+            <output name="report" file="test_05_report.txt" ftype="txt"/>\n+            <assert_stdout>\n+                <has_text text=\'attr="maxmem" val0="8"\'/>\n+            </assert_stdout>\n+            <assert_stdout>\n+                <has_text text=\'attr="maxthreads" val0="1"\'/>\n+            </assert_stdout>\n+            <assert_stdout>\n+                <has_text text=\'attr="insertThreads" val0="1"\'/>\n+            </assert_stdout>\n+             <assert_stdout>\n+                <has_text text=\'attr="maxvirtmem" val0="8"\'/>\n+            </assert_stdout>\n+            <assert_stdout>\n+                <has_text text="hybridScaffold"/>\n+            </assert_stdout>\n+        </test>\n     </tests>\n     <help><![CDATA[\n .. class:: infomark\n'
b
diff -r 295c0e28f4ee -r 8cc3862f8b8e macros.xml
--- a/macros.xml Sun May 23 17:23:03 2021 +0000
+++ b/macros.xml Tue May 25 20:12:52 2021 +0000
b
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">3.6.1</token>
-    <token name="@GALAXY_TOOL_VERSION@">galaxy0</token>
+    <token name="@GALAXY_TOOL_VERSION@">galaxy1</token>
     <token name="@BIONANO_SUPPORT_TEXT@">
 Bionano Genomics has agreed to provide the licensed Bionano Solve
 software to enable the VGP to package the software in a container.
@@ -23,7 +23,7 @@
     </xml>
     <xml name="requirements">
         <requirements>
-            <container type="docker">bionanodocker/bionano-docker-scaffold:latest</container>
+            <container type="docker">quay.io/galaxy/bionano-docker-scaffold:1.6.01-bio</container>
         </requirements>
     </xml>
     <macro name="sanitize_string" >
b
diff -r 295c0e28f4ee -r 8cc3862f8b8e remove_fake_cut_sites.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_fake_cut_sites.py Tue May 25 20:12:52 2021 +0000
[
@@ -0,0 +1,97 @@
+import re
+import sys
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+
+# Cut-site sequences to mask; each is searched in both orientations.
+CUT_SITES = (
+    Seq("CTTAAG"),
+    Seq("CTTCTCG"),
+    Seq("GCTCTTC"),
+    Seq("CCTCAGC"),
+    Seq("GAATGC"),
+    Seq("GCAATG"),
+    Seq("ATCGAT"),
+    Seq("CACGAG"),
+)
+
+# Number of Ns that must flank a cut site on each side for it to be masked.
+N_FLANK_LENGTH = 1
+
+# A run of 1-10 non-N bases enclosed by Ns. The closing N is a lookahead so
+# that neighbouring runs sharing a single N are each counted.
+POSSIBLE_FAKE_SITE = re.compile("N[^N]{1,10}(?=N)")
+
+
+def _build_cut_site_patterns():
+    """Return (compiled pattern, replacement) pairs in search order.
+
+    The flanking Ns are matched with lookarounds so they are not consumed:
+    two cut sites separated by one shared N are then both found, which a
+    pattern that consumes its flanks would miss.
+    """
+    flank = "N" * N_FLANK_LENGTH
+    patterns = []
+    for cut_site in CUT_SITES:
+        for oriented_site in (cut_site, cut_site.reverse_complement()):
+            site = str(oriented_site)
+            pattern = re.compile("(?<={0}){1}(?={0})".format(flank, site))
+            patterns.append((pattern, "N" * len(site)))
+    return patterns
+
+
+def _mask_fake_cut_sites(sequence, patterns):
+    """Replace each N-flanked cut site in an upper-case sequence with Ns.
+
+    Returns the masked sequence and the number of sites replaced.
+    """
+    change_count = 0
+    for pattern, replacement in patterns:
+        sequence, changes = pattern.subn(replacement, sequence)
+        change_count += changes
+    return sequence, change_count
+
+
+def main():
+    """Mask fake cut sites in a FASTA file and log per-record counts.
+
+    Command line: INPUT_FASTA OUTPUT_FASTA LOG_FILE.
+    """
+    if len(sys.argv) < 4:
+        sys.exit(
+            "usage: remove_fake_cut_sites.py INPUT_FASTA OUTPUT_FASTA LOG_FILE"
+        )
+
+    fasta_file, output_file, log_file = sys.argv[1:4]
+    patterns = _build_cut_site_patterns()
+
+    # Context managers close all three files even if parsing raises.
+    with open(fasta_file, "r") as fasta_input_handle, open(
+        output_file, "w"
+    ) as output_handle, open(log_file, "w") as log_handle:
+        for record in SeqIO.parse(fasta_input_handle, "fasta"):
+            # The record is written back upper-cased, as before.
+            sequence, change_count = _mask_fake_cut_sites(
+                str(record.seq).upper(), patterns
+            )
+            record.seq = Seq(sequence)
+
+            if change_count > 0:
+                log_handle.write(
+                    "{} : {} changes\n".format(record.id, change_count)
+                )
+            SeqIO.write([record], output_handle, "fasta")
+
+            # Finally, count what still looks like a short N-flanked insert
+            possible_fake_cut_sites = POSSIBLE_FAKE_SITE.findall(sequence)
+            if possible_fake_cut_sites:
+                log_handle.write(
+                    "{} : {} possible non-standard fake cut sites\n".format(
+                        record.id, len(possible_fake_cut_sites)
+                    )
+                )
+
+
+if __name__ == "__main__":
+    main()
b
diff -r 295c0e28f4ee -r 8cc3862f8b8e test-data/test_05_report.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_05_report.txt Tue May 25 20:12:52 2021 +0000
b
@@ -0,0 +1,45 @@
+Original BioNano Genome Map statistics:
+Count  = 2
+Min length (Mbp) = 0.720
+Median length (Mbp) = 2.313
+Mean length (Mbp) = 2.313
+N50 length (Mbp) = 3.906
+Max length (Mbp) = 3.906
+Total length (Mbp) = 4.625
+
+Original NGS sequences statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+NGS FASTA sequence in hybrid scaffold statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+Hybrid scaffold FASTA statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+
+Hybrid scaffold FASTA plus not scaffolded NGS FASTA statistics:
+Count  = 1
+Min length (Mbp) = 4.753
+Median length (Mbp) = 4.753
+Mean length (Mbp) = 4.753
+N50 length (Mbp) = 4.753
+Max length (Mbp) = 4.753
+Total length (Mbp) = 4.753
+