changeset 3:3c623e81be77 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/filter_by_fasta_ids commit 0556e0fe5aa17c84033a75a45baeb3a4c2b5ff76
author galaxyp
date Fri, 15 Feb 2019 16:38:31 -0500
parents 1bd985f14938
children cd22452edec2
files filter_by_fasta_ids.py filter_by_fasta_ids.xml test-data/ids_sp.txt test-data/input_sp.fasta test-data/output_sp_dedup.fasta
diffstat 5 files changed, 81 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/filter_by_fasta_ids.py	Sat Apr 28 03:49:28 2018 -0400
+++ b/filter_by_fasta_ids.py	Fri Feb 15 16:38:31 2019 -0500
@@ -41,17 +41,16 @@
             yield Sequence(header, sequence_parts)
 
 
-def target_match(targets, header):
+def target_match(targets, search_entry, pattern='>([^| ]+)'):
     ''' Matches '''
-    # Remove '>' and initial spaces from the header
-    header = header[1:].lstrip().upper()
-    # Search for an exact match among the targets
-    if header in targets:
-        return header
-    # Try to find an exact match for the first "word" in the header
-    header = header.split()[0]
-    if header in targets:
-        return header
+    search_entry = search_entry.upper()
+    m = re.search(pattern,search_entry)
+    if m:
+        target = m.group(len(m.groups()))
+        if target in targets:
+            return target
+    else:
+         print( 'No ID match: %s' % search_entry, file=sys.stdout)
     return None
 
 
@@ -64,6 +63,7 @@
     parser.add_argument('-d', help='Path to discarded entries file')
     header_criteria = parser.add_mutually_exclusive_group()
     header_criteria.add_argument('--id_list', help='Path to the ID list file')
+    parser.add_argument('--pattern', help='regex search pattern for ID in Fasta entry')
     header_criteria.add_argument('--header_regexp', help='Regular expression pattern the header should match')
     sequence_criteria = parser.add_mutually_exclusive_group()
     sequence_criteria.add_argument('--min_length', type=int, help='Minimum sequence length')
@@ -71,7 +71,14 @@
     parser.add_argument('--max_length', type=int, help='Maximum sequence length')
     parser.add_argument('--dedup', action='store_true', default=False, help='Whether to remove duplicate sequences')
     options = parser.parse_args()
-
+    
+    
+    if options.pattern:
+        pattern =  options.pattern 
+        if not re.match('^.*[(](?![?]:).*[)].*$',pattern):
+            print('pattern: "%s" did not include capture group "()" in regex ' % pattern)
+            exit(1)
+    
     if options.min_length is not None and options.max_length is None:
         options.max_length = sys.maxsize
     if options.header_regexp:
@@ -100,12 +107,13 @@
         for entry in homd_db:
             print_entry = True
             if options.id_list:
-                target_matched_results = target_match(targets, entry.header)
+                target_matched_results = target_match(targets, entry.header, pattern=pattern)
                 if target_matched_results:
                     work_summary['found'] += 1
                     targets.remove(target_matched_results)
                 else:
                     print_entry = False
+            
             elif options.header_regexp:
                 if regexp.search(entry.header) is None:
                     print_entry = False
--- a/filter_by_fasta_ids.xml	Sat Apr 28 03:49:28 2018 -0400
+++ b/filter_by_fasta_ids.xml	Fri Feb 15 16:38:31 2019 -0500
@@ -1,4 +1,4 @@
-<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.0">
+<tool id="filter_by_fasta_ids" name="Filter FASTA" version="2.1">
     <description>on the headers and/or the sequences</description>
     <macros>
         <xml name="regexp_macro" token_label="Regular expression pattern">
@@ -23,6 +23,11 @@
 -i '$input'
 #if $header_criteria.header_criteria_select == 'id_list'
     --id_list '$header_criteria.identifiers'
+    #if $header_criteria.id_regex.find == 'pattern':
+        --pattern '$header_criteria.id_regex.pattern'
+    #elif $header_criteria.id_regex.find == 'beginning':
+        --pattern '$header_criteria.id_regex.pattern'
+    #end if
 #elif $header_criteria.header_criteria_select == 'regexp'
     --header_regexp '$header_criteria.regexp'
 #end if
@@ -51,6 +56,30 @@
             <when value="" />
             <when value="id_list">
                 <param name="identifiers" type="data" format="txt" label="List of IDs to extract sequences for"/>
+                
+                
+                <conditional name="id_regex">
+                    <param name="find" type="select" label="Match IDs by">
+                        <option value="beginning">Default: ID is expected at the beginning: &gt;ID </option>
+                        <help>Default: the ID is read from the beginning of the header (&gt;ID) using search pattern &gt;([^| ]+); choose a custom regex to change this</help>
+                        <option value="pattern">Custom regex pattern</option>
+                    </param>
+                    <when value="beginning">
+                        <param name="pattern" type="hidden" value=">([^| ]+)" label="regex search pattern for ID" >
+                            <sanitizer sanitize="False"/>
+                            <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator>
+                        </param>
+                    </when>
+                    <when value="pattern">
+                        <param name="pattern" type="text" value="" label="regex search pattern for ID">
+                            <help>search pattern must contain a capture group "(...)" that returns the ID; if there are several groups, the last one is used. Use this for Uniprot Acc: >.+?\|(.+?)\|.*$ </help>
+                            <sanitizer sanitize="False"/>
+                            <validator type="regex" message="must include a group that returns an ID">^.*[(](?![?]:).*[)].*$</validator>
+                        </param>
+                    </when>
+                </conditional>
+                
+                
             </when>
             <when value="regexp">
                 <expand macro="regexp_macro" label="Regular expression pattern the header should match" />
@@ -88,6 +117,15 @@
             <param name="dedup" value="True" />
             <output name="output" file="output_dedup.fasta" />
         </test>
+        <test expect_num_outputs="1">
+            <param name="input" ftype="fasta" value="input_sp.fasta" />
+            <param name="header_criteria_select" value="id_list" />
+            <param name="find" value="pattern" />
+            <param name="pattern" value=">.+?\|(.+?)\|.*$" />
+            <param name="identifiers" ftype="txt" value="ids_sp.txt" />
+            <param name="dedup" value="True" />
+            <output name="output" file="output_sp_dedup.fasta" />
+        </test>
         <test expect_num_outputs="2">
             <param name="input" ftype="fasta" value="input.fasta" />
             <param name="header_criteria_select" value="id_list" />
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ids_sp.txt	Fri Feb 15 16:38:31 2019 -0500
@@ -0,0 +1,4 @@
+Q9EST3
+Q9EST3-2
+P34968
+Q9D2R0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_sp.fasta	Fri Feb 15 16:38:31 2019 -0500
@@ -0,0 +1,10 @@
+>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2
+MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV
+>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2
+MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH
+>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1
+MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_sp_dedup.fasta	Fri Feb 15 16:38:31 2019 -0500
@@ -0,0 +1,8 @@
+>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2
+MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV
+>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1
+MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF