# HG changeset patch
# User pravs
# Date 1501711793 14400
# Node ID 9ec27561593ec3f779ebf61fd0fe497bd73d48d0
# planemo upload
# diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/removeFastaSubSequence.py Wed Aug 02 18:09:53 2017 -0400

# This program checks whether each sequence in a query FASTA file occurs as a
# sub-string of any sequence in a reference FASTA file. Query sequences that
# do occur in the reference are removed; the output file therefore contains
# only the sequences that are unique to the query FASTA file.


def join_reference_sequences(sequences, separator="#"):
    """Concatenate reference sequences into one searchable string.

    The separator is placed between consecutive sequences so that a query
    cannot match across the boundary of two different reference entries.

    :param sequences: iterable of reference sequences as plain strings.
    :param separator: string inserted between sequences (default ``"#"``).
    :returns: the joined string; ``""`` when ``sequences`` is empty.
    """
    return separator.join(sequences)


def split_unique_queries(queries, ref_fastaSeq):
    """Separate query entries that are absent from the reference.

    :param queries: iterable of ``(description, sequence)`` string pairs.
    :param ref_fastaSeq: searchable reference string, as produced by
        ``join_reference_sequences``.
    :returns: ``(kept, removed)`` where ``kept`` is the list of pairs whose
        sequence is not a sub-string of ``ref_fastaSeq`` (input order is
        preserved) and ``removed`` is the number of pairs dropped.
    """
    kept = []
    removed = 0
    for desc, seq in queries:
        # Same test as the historical ``ref.find(seq) < 0``; note that an
        # empty sequence is a sub-string of anything and is therefore removed.
        if seq in ref_fastaSeq:
            removed += 1
        else:
            kept.append((desc, seq))
    return kept, removed


def main(argv=None):
    """Filter a query FASTA file against a reference FASTA file.

    Command line: ``<reference.fasta> <query.fasta> <output.fasta>``.
    Writes every query record whose sequence is not contained in any
    reference sequence to the output file (one header line, one sequence
    line per record) and reports the number of removed records on stdout.

    :param argv: argument vector including the program name at index 0;
        defaults to ``sys.argv``.
    :returns: ``None``.
    :raises SystemExit: when fewer than three file arguments are supplied.
    """
    import sys

    if argv is None:
        argv = sys.argv
    # Validate before touching Biopython or the file system so that a bad
    # invocation yields a usage message instead of a bare IndexError.
    if len(argv) < 4:
        sys.exit(
            "Usage: removeFastaSubSequence.py "
            "<reference.fasta> <query.fasta> <output.fasta>"
        )

    from Bio import SeqIO

    ref_fastaFile = argv[1].strip()
    query_fastaFile = argv[2].strip()
    out_fastaFile = argv[3].strip()

    # Records are streamed with SeqIO.parse instead of SeqIO.to_dict: the
    # dict was never used for lookup, it rejects repeated record ids, and on
    # Python 2 it scrambled the order of the output.
    ref_fastaSeq = join_reference_sequences(
        str(record.seq) for record in SeqIO.parse(ref_fastaFile, "fasta")
    )
    queries = (
        (str(record.description), str(record.seq))
        for record in SeqIO.parse(query_fastaFile, "fasta")
    )
    kept, count = split_unique_queries(queries, ref_fastaSeq)

    # The context manager guarantees the output is flushed and closed even
    # if a write fails part-way through.
    with open(out_fastaFile, "w") as outfh:
        for desc, seq in kept:
            outfh.write(">" + desc + "\n" + seq + "\n")

    # sys.stdout.write works on both Python 2 and Python 3, unlike the
    # Python-2-only ``print >> sys.stdout`` statement.
    sys.stdout.write("Total Number of Sequences Removed: %d\n" % count)
    return None


if __name__ == "__main__":
    main()


# ---- Remainder of the patch text that shared this line with the script ----
# diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/removeFastaSubSequence.xml Wed Aug 02 18:09:53 2017 -0400
# @@ -0,0 +1,88 @@
# Removes sequences that are subsequence in a reference Fasta File.
# biopython
# This program removes the sequences from the query fasta file that are present as subsequence in a reference fasta file.
+ +EXAMPLE: + +---- + +Ref sequences: + +>reference_seq_1 + +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP + +>reference_seq_2 + +RGLCISGLEKEVQVQSRQAEGPVHLWLRKGSTSAE + +---- + +Query Sequences: + +>query_seq_1 + +TKTILNYAVLSPCLSPGHVLGC + + +>query_seq_2 + +LDKDHLELCCTLSLPFSWACSWVLVL + + +>query_seq_3 + +LWGVPRGLCISG + +---- + +Output Sequences: + +>query_seq_1 + +TKTILNYAVLSPCLSPGHVLGC + + +>query_seq_3 + +LWGVPRGLCISG + +---- + +Output Sequence file will have only query_seq_1 and query_seq_3. query_seq_2 is removed because query_seq_2's sequence "LDKDHLELCCTLSLPFSWACSWVLVL" is +present as substring in reference_seq_1's sequence "TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP". + + + diff -r 000000000000 -r 9ec27561593e test-data/test_query.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_query.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,10 @@ +>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +EGPVHLWLRKGSTSAE +>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +RGLCISGLEKEVQVQSRQA +>generic|ENSMUST00000193003.1_3| [28 - 75] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +LCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPV +>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TKTILNYAVLSPCLSPGHVLGC +>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP diff -r 000000000000 -r 9ec27561593e test-data/test_ref.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_ref.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,10 @@ +>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2 
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2 +MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV +>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2 
+MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH +>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1 +MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF diff -r 000000000000 -r 9ec27561593e test-data/uniqSeq_test_query.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/uniqSeq_test_query.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,8 @@ +>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP +>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TKTILNYAVLSPCLSPGHVLGC +>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +EGPVHLWLRKGSTSAE +>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +RGLCISGLEKEVQVQSRQA diff -r 000000000000 -r 9ec27561593e 
tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,4 @@ + + + + \ No newline at end of file