Repository 'remove_fasta_subsequences'
hg clone https://toolshed.g2.bx.psu.edu/repos/pravs/remove_fasta_subsequences

Changeset 0:9ec27561593e (2017-08-02)
Next changeset 1:d49328dfeceb (2017-08-02)
Commit message:
planemo upload
added:
removeFastaSubSequence.py
removeFastaSubSequence.xml
test-data/test_query.fasta
test-data/test_ref.fasta
test-data/uniqSeq_test_query.fasta
tool_dependencies.xml
b
diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/removeFastaSubSequence.py Wed Aug 02 18:09:53 2017 -0400
[
@@ -0,0 +1,42 @@
+
+# This program checks whether each sequence in a query FASTA file occurs as a substring
+# of any sequence in the reference FASTA file. Query sequences that do occur are removed,
+# so the output file contains only the sequences unique to the query FASTA file.
+
+def main():
+    import sys
+    from Bio import SeqIO
+    
+    ref_fastaFile = sys.argv[1].strip()
+    query_fastaFile = sys.argv[2].strip()
+    
+    x = SeqIO.to_dict(SeqIO.parse(ref_fastaFile, "fasta"))
+    y = x.values()
+    b = []
+    for a in y:
+        b.append(str(a.seq))
+    ref_fastaSeq = "#".join(b)
+    
+    outfh = open(sys.argv[3].strip(), "w")
+    
+    x = SeqIO.to_dict(SeqIO.parse(query_fastaFile, "fasta"))
+    y = x.values()
+    count = 0
+    for a in x.keys():
+        seq = str(x[a].seq)
+        desc = str(x[a].description)
+        if ref_fastaSeq.find(seq) < 0:
+            outfh.write(">" + desc + "\n" + seq + "\n")
+        else:
+            count = count + 1
+    print >> sys.stdout,"Total Number of Sequences Removed: %d" % count
+    outfh.close()
+    return None
+
+if __name__ == "__main__":
+    main()
+    
+    
+    
+
+
b
diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/removeFastaSubSequence.xml Wed Aug 02 18:09:53 2017 -0400
[
@@ -0,0 +1,88 @@
+
+<tool id="removeFastaSubSequence" name="Remove Fasta Substring Sequence" version="1.0.0">
+  <description>Removes query sequences that occur as a substring of a sequence in a reference FASTA file.</description>
+  <requirements>
+      <requirement type="package" version="1.70">biopython</requirement>
+  </requirements>
+  <command interpreter="python"><![CDATA[removeFastaSubSequence.py $ref_fastafile $query_fastafile $output]]></command>
+  <inputs>
+    <param name="ref_fastafile" type="data" format="fasta">
+      <label>Input Reference Fasta File</label>
+    </param>
+    <param name="query_fastafile" type="data" format="fasta">
+      <label>Input Query Fasta File</label>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="fasta" name="output" label="uniqSeq_${query_fastafile.name.rsplit('.',1)[0]}.fasta" />
+  </outputs>
+  
+  <tests> 
+    <test>
+      <param name="ref_fastafile" value="test_ref.fasta" />
+      <param name="query_fastafile" value="test_query.fasta" />
+      <output name="output" file="uniqSeq_test_query.fasta">
+        <assert_contents>
+            <has_text text="ENSMUST00000193003" />
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+  
+  
+  <help>
+This program removes from the query FASTA file every sequence that occurs as a substring of a sequence in the reference FASTA file.
+
+EXAMPLE:
+
+----
+
+Ref sequences:
+
+>reference_seq_1
+
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
+
+>reference_seq_2
+
+RGLCISGLEKEVQVQSRQAEGPVHLWLRKGSTSAE
+
+----
+
+Query Sequences:
+
+>query_seq_1
+
+TKTILNYAVLSPCLSPGHVLGC
+
+
+>query_seq_2
+
+LDKDHLELCCTLSLPFSWACSWVLVL
+
+
+>query_seq_3
+
+LWGVPRGLCISG
+
+----
+
+Output Sequences:
+
+>query_seq_1
+
+TKTILNYAVLSPCLSPGHVLGC
+
+
+>query_seq_3
+
+LWGVPRGLCISG
+
+----
+
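+The output sequence file contains only query_seq_1 and query_seq_3. query_seq_2 is removed because its sequence "LDKDHLELCCTLSLPFSWACSWVLVL" is
+present as a substring of reference_seq_1's sequence "TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP". query_seq_3 is kept because its sequence only matches across the boundary between reference_seq_1 and reference_seq_2, not within a single reference sequence.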
+
+  </help>
+</tool>
b
diff -r 000000000000 -r 9ec27561593e test-data/test_query.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query.fasta Wed Aug 02 18:09:53 2017 -0400
[
@@ -0,0 +1,10 @@
+>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+EGPVHLWLRKGSTSAE
+>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+RGLCISGLEKEVQVQSRQA
+>generic|ENSMUST00000193003.1_3| [28 - 75] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+LCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPV
+>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TKTILNYAVLSPCLSPGHVLGC
+>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
b
diff -r 000000000000 -r 9ec27561593e test-data/test_ref.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_ref.fasta Wed Aug 02 18:09:53 2017 -0400
b
@@ -0,0 +1,10 @@
+>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2
+MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV
+>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2
+MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH
+>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1
+MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF
b
diff -r 000000000000 -r 9ec27561593e test-data/uniqSeq_test_query.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/uniqSeq_test_query.fasta Wed Aug 02 18:09:53 2017 -0400
[
@@ -0,0 +1,8 @@
+>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
+>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TKTILNYAVLSPCLSPGHVLGC
+>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+EGPVHLWLRKGSTSAE
+>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+RGLCISGLEKEVQVQSRQA
b
diff -r 000000000000 -r 9ec27561593e tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Aug 02 18:09:53 2017 -0400
b
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  
+</tool_dependency>
\ No newline at end of file