changeset 0:bdf781f2658b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/blastx_to_scaffold commit 1353e75b8459213e88f32744a759ce4d7b43826d
author artbio
date Sun, 15 Oct 2017 13:16:03 -0400
parents
children ec3b8341f551
files blastx_to_scaffold.py blastx_to_scaffold.xml test-data/blastx.tab test-data/contigs.fa test-data/scaffold.fa
diffstat 5 files changed, 248 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blastx_to_scaffold.py	Sun Oct 15 13:16:03 2017 -0400
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+import argparse
+
+
+def insert_newlines(string, every=60):
+    """Return `string` wrapped into lines of at most `every` characters.
+
+    Consecutive chunks are separated by a newline; nothing is appended
+    after the last chunk, so an empty string is returned unchanged.
+    """
+    offsets = range(0, len(string), every)
+    return '\n'.join(string[start:start + every] for start in offsets)
+
+
+def Parser():
+    """Parse the command line and return the argparse namespace.
+
+    All three options are mandatory: a missing one makes argparse print
+    a usage message and exit, instead of failing later when the script
+    tries to open a path that is None.
+
+    Returns:
+        Namespace with the attributes `sequences`, `blastx_tab` and
+        `output`, each holding a file path as a string.
+    """
+    the_parser = argparse.ArgumentParser(
+        description="Generate DNA scaffold from blastx alignment of Contigs")
+    the_parser.add_argument('--sequences', action="store", type=str,
+                            required=True,
+                            help="input sequence file in fasta format")
+    the_parser.add_argument('--blastx-tab', dest="blastx_tab", action="store",
+                            type=str, required=True,
+                            help="13-columns tabular blastx output")
+    the_parser.add_argument('--output', action="store", type=str,
+                            required=True,
+                            help="output file path, fasta format")
+    return the_parser.parse_args()
+
+
+# IUPAC nucleotide codes and their complements (both cases), used to
+# reverse-complement contig segments aligned on the minus strand.
+_COMPLEMENT = dict(zip("ACGTURYSWKMBDHVNacgturyswkmbdhvn",
+                       "TGCAAYRSWMKVHDBNtgcaayrswmkvhdbn"))
+
+
+def _reverse_complement(dna):
+    """Return the reverse complement of `dna`; unknown symbols are kept."""
+    return "".join(_COMPLEMENT.get(base, base) for base in reversed(dna))
+
+
+def _read_fasta(path):
+    """Return a {header: sequence} dict for the FASTA file at `path`.
+
+    Keys are the full header lines without the leading ">". The first
+    word of each header is added as an extra key when it is not already
+    one, because blastx reports only that word as the query id.
+    """
+    parts = {}
+    order = []
+    header = None
+    with open(path, "r") as handle:
+        for line in handle:
+            # rstrip (not [:-1]) keeps the last base of a file that has
+            # no trailing newline and also removes Windows "\r".
+            line = line.rstrip("\r\n")
+            if not line:
+                continue
+            if line[0] == ">":
+                header = line[1:]
+                if header not in parts:
+                    order.append(header)
+                # A repeated header restarts its sequence.
+                parts[header] = []
+            elif header is None:
+                raise ValueError(
+                    "%s: sequence data found before any '>' header" % path)
+            else:
+                parts[header].append(line)
+    contigs = {name: "".join(parts[name]) for name in order}
+    for name in order:
+        words = name.split()
+        if words:
+            # setdefault: an exact full-header key always wins.
+            contigs.setdefault(words[0], contigs[name])
+    return contigs
+
+
+def _read_hits(path):
+    """Return (subject_length, hits) read from the blastx table at `path`.
+
+    `subject_length` is column 13 of the first alignment. `hits` is a
+    list of (query_id, query_start, query_stop, subject_start,
+    subject_stop) tuples taken from columns 1 and 7 to 10.
+    """
+    subject_length = None
+    hits = []
+    with open(path, "r") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            fields = line.rstrip("\r\n").split("\t")
+            if len(fields) < 10:
+                raise ValueError(
+                    "%s: expected at least 10 tab-separated columns, got %d"
+                    % (path, len(fields)))
+            if subject_length is None:
+                if len(fields) < 13:
+                    raise ValueError(
+                        "%s: column 13 (subject length) is missing" % path)
+                subject_length = int(fields[12])
+            hits.append((fields[0], int(fields[6]), int(fields[7]),
+                         int(fields[8]), int(fields[9])))
+    if subject_length is None:
+        raise ValueError("%s: no alignment found" % path)
+    return subject_length, hits
+
+
+def __main__():
+    """Build a DNA scaffold of the blastx subject and write it as FASTA.
+
+    The scaffold starts as one "NNN" codon per subject residue (subject
+    length = column 13 of the first alignment). For each alignment, the
+    codons of the aligned subject range are replaced by the aligned part
+    of the contig (query columns 7-8, subject columns 9-10, 1-based and
+    inclusive). Alignments whose query start is greater than the query
+    stop are taken from the minus strand and reverse-complemented. The
+    result is written to --output under the header ">Scaffold", wrapped
+    by insert_newlines().
+
+    Raises:
+        KeyError: if an alignment names a contig absent from --sequences.
+        ValueError: on malformed input, or when an alignment needs a
+            subject position that is out of range or already consumed
+            by a previous alignment.
+    """
+    args = Parser()
+    contigs = _read_fasta(args.sequences)
+    subject_length, hits = _read_hits(args.blastx_tab)
+    # One placeholder codon per subject residue, keyed by 1-based position.
+    scaffold = {pos: "NNN" for pos in range(1, subject_length + 1)}
+    for query_id, query_start, query_stop, subject_start, subject_stop in hits:
+        contig = contigs[query_id]
+        if query_start <= query_stop:
+            segment = contig[query_start - 1:query_stop]
+        else:
+            segment = _reverse_complement(contig[query_stop - 1:query_start])
+        # Drop every placeholder of the aligned range except the last
+        # one, which then receives the whole DNA segment.
+        for position in range(subject_start, subject_stop):
+            if position not in scaffold:
+                raise ValueError(
+                    "subject position %d required by %s is out of range "
+                    "or already covered by another alignment"
+                    % (position, query_id))
+            del scaffold[position]
+        # NOTE(review): if this alignment starts exactly on the last
+        # subject position of a previous one, that previous segment was
+        # just deleted above without any error - confirm this is wanted.
+        scaffold[subject_stop] = segment
+    ordered = "".join(scaffold[position] for position in sorted(scaffold))
+    with open(args.output, "w") as out:
+        out.write(">Scaffold\n")
+        out.write("%s\n" % insert_newlines(ordered))
+
+
+# Run the scaffold builder only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blastx_to_scaffold.xml	Sun Oct 15 13:16:03 2017 -0400
@@ -0,0 +1,42 @@
+<tool id="blastx2scaffold" name="blastx_to_scaffold" version="1.0.0">
+<description>Generate DNA scaffold from blastx alignment of Contigs</description>
+<requirements>
+</requirements>
+<!-- The script is called through $__tool_directory__ (the "interpreter"
+     attribute is deprecated) and every dataset path is single-quoted so
+     that it reaches the script as one argument. -->
+<command><![CDATA[
+        python '$__tool_directory__/blastx_to_scaffold.py'
+                              --sequences '$sequences'
+                              --blastx-tab '$blastx_tab'
+                              --output '$output'
+]]></command>
+<inputs>
+<param name="sequences" type="data" format="fasta" label="Select a fasta contigs file"/>
+<param name="blastx_tab" type="data" format="tabular" label="Select a blastx output from your history" help="must have 13 columns, with column 13 containing the subject length; the other columns are the 12 standard ones"/>
+</inputs>
+<outputs>
+ <data format="fasta" name="output"/>
+</outputs>
+
+<tests>
+  <test>
+    <param name="sequences" value="contigs.fa" ftype="fasta"/>
+    <param name="blastx_tab" value="blastx.tab" ftype="tabular"/>
+    <output name="output" file="scaffold.fa" ftype="fasta"/>
+  </test>
+</tests>
+
+<help>
+
+**What it Does**
+
+This tool starts from DNA contigs that were aligned to a subject protein sequence with blastx.
+The contigs must be provided in fasta format. The blastx output must be tabular: the 12 standard columns plus a 13th column holding the length of the blastx subject.
+The final scaffold is a DNA sequence.
+Regions of the subject protein to which no contig was aligned are replaced by Ns in this scaffold.
+
+**Attribution**
+
+This Galaxy tool was created by drosofff@gmail.com on 28/05/2015
+
+</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastx.tab	Sun Oct 15 13:16:03 2017 -0400
@@ -0,0 +1,2 @@
+Contig1	gi|81971654|sp|Q9IJX4.1|POLN_CRPVC_RecName:_Full_Replicase_polyprotein;_Contains:_RecName:_Full_Pro	43.46	451	247	3	6	1343	237	684	6e-128	  397	1771
+Contig2	gi|81971654|sp|Q9IJX4.1|POLN_CRPVC_RecName:_Full_Replicase_polyprotein;_Contains:_RecName:_Full_Pro	52.99	536	236	4	6	1571	1217	1750	0.0	  580	1771
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/contigs.fa	Sun Oct 15 13:16:03 2017 -0400
@@ -0,0 +1,52 @@
+>Contig1
+TAGATAAGGTTTGCTCATTTCTTGAGGATACTTTACCAGGTATGGTCGAGCACGTTACGC
+TCGTAGCACAAAATACATCCGCGTCAGCCAAGGTGTTATCTGACGAGTTGATCAAATCAA
+TGCTTTGCATTGTTTTGATTTGCTTGTTGATTGAAACCAAGTTCTATAAGACCGCTTTCG
+CGGTACTTATAGTGGTTGCTCTACGTGTTTTCGGGTACAGTGAGCAAATAATTGAGACAG
+CTATGGACATGTATCGCGTAATTAGGGCTCCAAAGGCTCAAGGTAATATGGAAGATGTCG
+TTTTCCATCCGTGGTTGAACACGTGTGGAAAGTTGATTTTCCTACTTATCGCTGTCCTGT
+GTCTCAAGAAATTACCAGGAAAGAACGACGTAGACACTTTCATGCGCAGGCTCGACAGCT
+TACCCAAAGCTGTTAAGGGTGCGACACAACTACATGAATGGGTGTCAAAATACTTCGATC
+TCTCTTTGGATCACGTCAAGGCGATGATTGTTGGTAAATCTTGTGCCGAAATGAAGAAGG
+CTGAATCATCAAGCGCCAAAGTTTTGGCTTGGGCCGCTAGAGTTCAAGATTTCGTCAGAT
+TGGAACAACGAAGTAAAATCGATAGTGATATCGCTGTCGCCAACGAGGCTGAAGCCTTGT
+ATCACACTGGATTGCAATTTGCAGGAGACACTCTGTTACCTCCAGAATTGCACAAGGTCG
+TGAACAGTGCGCTAAGACCAGCCCGCGATATATATGAGTACGTCACCCGCTCCCCAATAA
+AAGGAGGGAGTCGTAAGATGAGACCCTTGATGATTTGGCTAGCTGGCCAGTCAGGAATTG
+GGAAAACCTCTATGGTGGATCCTCTATGTATCGATTTGCTTCGAGCAATGGGTTATGTGG
+GACCTGAACATCTCCACTCGTTGGTGTATGGCCGCCAAGTTGAGACGGAGTACTGGGATG
+GTTACAAAGCCCACAAGATAGTGATCTATGATGATGCTTTTCAGCTGAAAGATGATGCTG
+TGAACAGGAATTTGGAGGTATTTGAGGTTATACGTTCTTGCAACACGTATCCTCAACACC
+TTCATATGGCTTGTCTCTCGGATAAAAACACTTTTTCAGTAGCGGAAGTGTACATCTACA
+CTACCAACGAAATGAATGTCAAACTTGAGTCGCTGACTCATGAACAAGCATTCTACAACC
+GCATGAGTGAAAACGCGTTCACTGTGCGTCCAAAAGAGGCTTATCGTCTAGTCGAAGAAG
+GATCAACCGGCAATAAGCAGTATCGTTTGGACAAAACGAAAACCAAAGGAGCTATCGATC
+TCGATGTGTACGAATTCGTGCGC
+>Contig2
+CTAGAATCACAGCTCAGATGAGTTTTGAGGCACCGAAGGACGCAATTGAAGGACCGTGTC
+AAACCCCGGAAGGATTGTTCGCCCCTATTGGCAAAGCGCCGATAGGCGTAGGGATGTCCA
+CCAAGACGGCTATACGACCTTCACGCTTGTATGGAAGAATAACTAAACCGACAACTGCAC
+CATCATACTTGGGTAAAGACGCGCTTTATCGTGGATTGACCAAGTGTGGTGTTCGCACAG
+TTAATATTCAACCAGAATACATAGACGCAGCGGCGAATGACGTCGCACGCTATGTGTTAA
+ACCAGCATGTTGGTCACGTGGATAGGGAACGATACACACGTATATTGTCGTACGAGGAGG
+CTGTTAAGGGCGTGCCGTACGATGATTTCATGAAGTCAGTGACTCGAGTCACTTCCCCTG
+GTTACCCCTATTGCTTGGATACTGGAAACATGCCAGGGAAAAGCAAATGGATGGGGCTCG
+AACAAGATTTCGATATGACAAGTCCAGCTGCTTTGGCTTTGAGGAAAGATGTTGAAAGTT
+TGTTGGAAGATTGCAAAAATGGCTTAGTCCGTGATGTGGTGTTTGTCGACACTCTCAAGG
+ATGAAAGGCGCGAGCTGATAAAGGTGGAAGCAAAGAAGACTCGAGTCTTTTCTGCTGGAC
+CACAGCATTTTGTAATAGCTTTCCGGCAATACTTTCTTCCATTCTCTGCCTGGGTCATGC
+ATAACAGAATCGAAAACGAAGTAGCCGTTGGAACAAACCCCTTCTCAATGGATTGGCACA
+ACATTGCTGTGCGTATGCGTAGTAAAGGGAGACACATTATTGCTGGAGATTTTAGCAATT
+TTGATGGATCCCTCAACGCCCAAGTTCTCTGGACAATATTTTGGAAGATATTTGTCCCGT
+GGCTTAATGATATTGAACCACTTGGTACACCCAAGAATGAGGAGAATCTGCGGGTCTGCA
+CGAGTCTATGGACGCACTTGGTGCACTCCGTGCACATTTGTGGAGATAACTTGTACATGT
+GGACACATTCTCAACCATCGGGCAATCCCTTCACGGTGATAATCAATAGTTTGTATAACT
+CAGTTATCATGCGTGTCGTGTGGCAATACATAATGGCGAAAGAAGAACCTAAGTTACGCA
+CAATGAACCATTTCAATCAACATGTTGCTATGGTTTCATATGGTGATGACAATCTACTTA
+ACATCTCGGAAGGGGTAATTGATATCTTCAACCAACTTACCATCTCGGAAGCCATGCGTT
+GGATAGGACACGAATACACAGATGAAACGAAAACAGGCGAGGCTGCGCCCTATCGGACAT
+TGGAAGAAGTCCGTTTCCTTAAAAGAGGGTTCAGAATGGATCACCTCTTGTGTCGGTGGG
+TAGCTCCTTTGAAGAAGGATGTCATCTACGAAATGCTTAATTGGACGCGCAAAGGGATTA
+ACCCAGATGATGTGACGATGATGATCATTGATACAGCATTTAGGGAGATCTCTTATCACG
+GAAGGGAAGCTTTCGAGAAGCTGCGAGGGCAGATACTTGAGCAGCGGGATGTGTTGGTTG
+AATATCCTCAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/scaffold.fa	Sun Oct 15 13:16:03 2017 -0400
@@ -0,0 +1,89 @@
+>Scaffold
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGGTTTGCTCA
+TTTCTTGAGGATACTTTACCAGGTATGGTCGAGCACGTTACGCTCGTAGCACAAAATACA
+TCCGCGTCAGCCAAGGTGTTATCTGACGAGTTGATCAAATCAATGCTTTGCATTGTTTTG
+ATTTGCTTGTTGATTGAAACCAAGTTCTATAAGACCGCTTTCGCGGTACTTATAGTGGTT
+GCTCTACGTGTTTTCGGGTACAGTGAGCAAATAATTGAGACAGCTATGGACATGTATCGC
+GTAATTAGGGCTCCAAAGGCTCAAGGTAATATGGAAGATGTCGTTTTCCATCCGTGGTTG
+AACACGTGTGGAAAGTTGATTTTCCTACTTATCGCTGTCCTGTGTCTCAAGAAATTACCA
+GGAAAGAACGACGTAGACACTTTCATGCGCAGGCTCGACAGCTTACCCAAAGCTGTTAAG
+GGTGCGACACAACTACATGAATGGGTGTCAAAATACTTCGATCTCTCTTTGGATCACGTC
+AAGGCGATGATTGTTGGTAAATCTTGTGCCGAAATGAAGAAGGCTGAATCATCAAGCGCC
+AAAGTTTTGGCTTGGGCCGCTAGAGTTCAAGATTTCGTCAGATTGGAACAACGAAGTAAA
+ATCGATAGTGATATCGCTGTCGCCAACGAGGCTGAAGCCTTGTATCACACTGGATTGCAA
+TTTGCAGGAGACACTCTGTTACCTCCAGAATTGCACAAGGTCGTGAACAGTGCGCTAAGA
+CCAGCCCGCGATATATATGAGTACGTCACCCGCTCCCCAATAAAAGGAGGGAGTCGTAAG
+ATGAGACCCTTGATGATTTGGCTAGCTGGCCAGTCAGGAATTGGGAAAACCTCTATGGTG
+GATCCTCTATGTATCGATTTGCTTCGAGCAATGGGTTATGTGGGACCTGAACATCTCCAC
+TCGTTGGTGTATGGCCGCCAAGTTGAGACGGAGTACTGGGATGGTTACAAAGCCCACAAG
+ATAGTGATCTATGATGATGCTTTTCAGCTGAAAGATGATGCTGTGAACAGGAATTTGGAG
+GTATTTGAGGTTATACGTTCTTGCAACACGTATCCTCAACACCTTCATATGGCTTGTCTC
+TCGGATAAAAACACTTTTTCAGTAGCGGAAGTGTACATCTACACTACCAACGAAATGAAT
+GTCAAACTTGAGTCGCTGACTCATGAACAAGCATTCTACAACCGCATGAGTGAAAACGCG
+TTCACTGTGCGTCCAAAAGAGGCTTATCGTCTAGTCGAAGAAGGATCAACCGGCAATAAG
+CAGTATCGTTTGGACAAAACGAAAACCAAAGGAGCTATCGATCTCGATGTGTACGAATTC
+GTGCGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCACAGCTCAGATGAGT
+TTTGAGGCACCGAAGGACGCAATTGAAGGACCGTGTCAAACCCCGGAAGGATTGTTCGCC
+CCTATTGGCAAAGCGCCGATAGGCGTAGGGATGTCCACCAAGACGGCTATACGACCTTCA
+CGCTTGTATGGAAGAATAACTAAACCGACAACTGCACCATCATACTTGGGTAAAGACGCG
+CTTTATCGTGGATTGACCAAGTGTGGTGTTCGCACAGTTAATATTCAACCAGAATACATA
+GACGCAGCGGCGAATGACGTCGCACGCTATGTGTTAAACCAGCATGTTGGTCACGTGGAT
+AGGGAACGATACACACGTATATTGTCGTACGAGGAGGCTGTTAAGGGCGTGCCGTACGAT
+GATTTCATGAAGTCAGTGACTCGAGTCACTTCCCCTGGTTACCCCTATTGCTTGGATACT
+GGAAACATGCCAGGGAAAAGCAAATGGATGGGGCTCGAACAAGATTTCGATATGACAAGT
+CCAGCTGCTTTGGCTTTGAGGAAAGATGTTGAAAGTTTGTTGGAAGATTGCAAAAATGGC
+TTAGTCCGTGATGTGGTGTTTGTCGACACTCTCAAGGATGAAAGGCGCGAGCTGATAAAG
+GTGGAAGCAAAGAAGACTCGAGTCTTTTCTGCTGGACCACAGCATTTTGTAATAGCTTTC
+CGGCAATACTTTCTTCCATTCTCTGCCTGGGTCATGCATAACAGAATCGAAAACGAAGTA
+GCCGTTGGAACAAACCCCTTCTCAATGGATTGGCACAACATTGCTGTGCGTATGCGTAGT
+AAAGGGAGACACATTATTGCTGGAGATTTTAGCAATTTTGATGGATCCCTCAACGCCCAA
+GTTCTCTGGACAATATTTTGGAAGATATTTGTCCCGTGGCTTAATGATATTGAACCACTT
+GGTACACCCAAGAATGAGGAGAATCTGCGGGTCTGCACGAGTCTATGGACGCACTTGGTG
+CACTCCGTGCACATTTGTGGAGATAACTTGTACATGTGGACACATTCTCAACCATCGGGC
+AATCCCTTCACGGTGATAATCAATAGTTTGTATAACTCAGTTATCATGCGTGTCGTGTGG
+CAATACATAATGGCGAAAGAAGAACCTAAGTTACGCACAATGAACCATTTCAATCAACAT
+GTTGCTATGGTTTCATATGGTGATGACAATCTACTTAACATCTCGGAAGGGGTAATTGAT
+ATCTTCAACCAACTTACCATCTCGGAAGCCATGCGTTGGATAGGACACGAATACACAGAT
+GAAACGAAAACAGGCGAGGCTGCGCCCTATCGGACATTGGAAGAAGTCCGTTTCCTTAAA
+AGAGGGTTCAGAATGGATCACCTCTTGTGTCGGTGGGTAGCTCCTTTGAAGAAGGATGTC
+ATCTACGAAATGCTTAATTGGACGCGCAAAGGGATTAACCCAGATGATGTGACGATGATG
+ATCATTGATACAGCATTTAGGGAGATCTCTTATCACGGAAGGGAAGCTTTCGAGAAGCTG
+CGAGGGCAGATACTTGAGCAGCGGGATGTGTTGGTTGAATATCCTCAANNNNNNNNNNNN
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN