# HG changeset patch # User galaxyp # Date 1550266711 18000 # Node ID 3c623e81be778d4a29b8233643041955e0ee3877 # Parent 1bd985f1493890b130e3070318f54de49daeb5b5 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76 diff -r 1bd985f14938 -r 3c623e81be77 filter_by_fasta_ids.py --- a/filter_by_fasta_ids.py Sat Apr 28 03:49:28 2018 -0400 +++ b/filter_by_fasta_ids.py Fri Feb 15 16:38:31 2019 -0500 @@ -41,17 +41,16 @@ yield Sequence(header, sequence_parts) -def target_match(targets, header): +def target_match(targets, search_entry, pattern='>([^| ]+)'): ''' Matches ''' - # Remove '>' and initial spaces from the header - header = header[1:].lstrip().upper() - # Search for an exact match among the targets - if header in targets: - return header - # Try to find an exact match for the first "word" in the header - header = header.split()[0] - if header in targets: - return header + search_entry = search_entry.upper() + m = re.search(pattern,search_entry) + if m: + target = m.group(len(m.groups())) + if target in targets: + return target + else: + print( 'No ID match: %s' % search_entry, file=sys.stdout) return None @@ -64,6 +63,7 @@ parser.add_argument('-d', help='Path to discarded entries file') header_criteria = parser.add_mutually_exclusive_group() header_criteria.add_argument('--id_list', help='Path to the ID list file') + parser.add_argument('--pattern', help='regex earch attern for ID in Fasta entry') header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match') sequence_criteria = parser.add_mutually_exclusive_group() sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length') @@ -71,7 +71,14 @@ parser.add_argument('--max_length', type=int, help='Maximum sequence length') parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences') options = parser.parse_args() - + + + if options.pattern: + pattern = options.pattern + if not re.match('^.*[(](?![?]:).*[)].*$',pattern): + print('pattern: "%s" did not include capture group "()" in regex ' % pattern) + exit(1) + if options.min_length is not None and options.max_length is None: options.max_length = sys.maxsize if options.header_regexp: @@ -100,12 +107,13 @@ for entry in homd_db: print_entry = True if options.id_list: - target_matched_results = target_match(targets, entry.header) + target_matched_results = target_match(targets, entry.header, pattern=pattern) if target_matched_results: work_summary['found'] += 1 targets.remove(target_matched_results) else: print_entry = False + elif options.header_regexp: if regexp.search(entry.header) is None: print_entry = False diff -r 1bd985f14938 -r 3c623e81be77 filter_by_fasta_ids.xml --- a/filter_by_fasta_ids.xml Sat Apr 28 03:49:28 2018 -0400 +++ b/filter_by_fasta_ids.xml Fri Feb 15 16:38:31 2019 -0500 @@ -1,4 +1,4 @@ - + on the headers and/or the sequences @@ -23,6 +23,11 @@ -i '$input' #if $header_criteria.header_criteria_select == 'id_list' --id_list '$header_criteria.identifiers' + #if $header_criteria.id_regex.find == 'pattern': + --pattern '$header_criteria.id_regex.pattern' + #elif $header_criteria.id_regex.find == 'beginning': + --pattern '$header_criteria.id_regex.pattern' + #end if #elif $header_criteria.header_criteria_select == 'regexp' --header_regexp '$header_criteria.regexp' #end if @@ -51,6 +56,30 @@ + + + + + + Default: >ID will use search pattern >([^| ]+) to input ID; Use custom regex to change + + + + + + ^.*[(](?![?]:).*[)].*$ + + + + + search pattern must contain %s where the ID will be substituted. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$ + + ^.*[(](?![?]:).*[)].*$ + + + + + @@ -88,6 +117,15 @@ + + + + + + + + + diff -r 1bd985f14938 -r 3c623e81be77 test-data/ids_sp.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ids_sp.txt Fri Feb 15 16:38:31 2019 -0500 @@ -0,0 +1,4 @@ +Q9EST3 +Q9EST3-2 +P34968 +Q9D2R0 diff -r 1bd985f14938 -r 3c623e81be77 test-data/input_sp.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_sp.fasta Fri Feb 15 16:38:31 2019 -0500 @@ -0,0 +1,10 @@ +>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2 +MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 +MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2 +MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV +>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2 +MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH +>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1 +MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF diff -r 1bd985f14938 -r 3c623e81be77 test-data/output_sp_dedup.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_sp_dedup.fasta Fri Feb 15 16:38:31 2019 -0500 @@ -0,0 +1,8 @@ +>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2 +MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 +MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2 +MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV +>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1 +MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF