# HG changeset patch
# User galaxyp
# Date 1486150076 18000
# Node ID 9ad0d336e5edd71c127dbd57774c74f649e8d008
# Parent  379c41d859aa9c89bcd27def13c1e6ee72471f8f
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe

diff -r 379c41d859aa -r 9ad0d336e5ed fasta_merge_files_and_filter_unique_sequences.py
--- a/fasta_merge_files_and_filter_unique_sequences.py	Wed Feb 01 13:24:16 2017 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.py	Fri Feb 03 14:27:56 2017 -0500
@@ -1,19 +1,22 @@
 #!/usr/bin/env python
 import os
 import sys
+import re
 
 class Sequence:
     ''' Holds protein sequence information '''
     def __init__(self):
         self.header = ""
+        self.accession = ""
         self.sequence = ""
 
 class FASTAReader:
     """
         FASTA db iterator. Returns a single FASTA sequence object.
     """
-    def __init__(self, fasta_name):
+    def __init__(self, fasta_name, accession_parser):
         self.fasta_file = open(fasta_name)
+        self.accession_parser = accession_parser
 
     def __iter__(self):
         return self
@@ -30,6 +33,11 @@
         seq = Sequence()
         seq.header = line.rstrip().replace('\n','').replace('\r','')
 
+        m = re.search(self.accession_parser, seq.header)
+        if not m or len(m.groups()) < 1 or len(m.group(1)) == 0:
+          sys.exit("Could not parse accession from '%s'" % seq.header)
+        seq.accession = m.group(1)
+
         while True:
             tail = self.fasta_file.tell()
             line = self.fasta_file.readline()
@@ -47,7 +55,7 @@
 
 def main():
     seen_sequences = dict([])
-    seen_headers = set([])
+    seen_accessions = set([])
 
     out_file = open(sys.argv[1], 'w')
     if sys.argv[2] == "sequence":
@@ -57,26 +65,30 @@
     else:
         sys.exit("2nd argument must be 'sequence' or 'accession'")
 
-    for fasta_file in sys.argv[3:]:
+    accession_parser = sys.argv[3]
+    for key, value in { '\'' :'__sq__', '\\' : '__backslash__' }.items():
+      accession_parser = accession_parser.replace(value, key)
+
+    for fasta_file in sys.argv[4:]:
         print("Reading entries from '%s'" % fasta_file)
-        fa_reader = FASTAReader(fasta_file)
+        fa_reader = FASTAReader(fasta_file, accession_parser)
         for protein in fa_reader:
             if unique_sequences:
-                if protein.header in seen_headers:
-                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                if protein.accession in seen_accessions:
+                    print("Skipping protein '%s' with duplicate accession" % protein.header)
                     continue
-                elif protein.sequence in seen_sequences:
-                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
+                elif hash(protein.sequence) in seen_sequences:
+                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[hash(protein.sequence)]))
                     continue
                 else:
-                    seen_sequences[protein.sequence] = protein.header
-                    seen_headers.add(protein.header)
+                    seen_sequences[hash(protein.sequence)] = protein.header
+                    seen_accessions.add(protein.accession)
             else:
-                if protein.header in seen_headers:
-                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                if protein.accession in seen_accessions:
+                    print("Skipping protein '%s' with duplicate accession" % protein.header)
                     continue
                 else:
-                    seen_headers.add(protein.header)
+                    seen_accessions.add(protein.accession)
             out_file.write(protein.header)
             out_file.write(os.linesep)
             out_file.write(protein.sequence)
diff -r 379c41d859aa -r 9ad0d336e5ed fasta_merge_files_and_filter_unique_sequences.xml
--- a/fasta_merge_files_and_filter_unique_sequences.xml	Wed Feb 01 13:24:16 2017 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.xml	Fri Feb 03 14:27:56 2017 -0500
@@ -5,7 +5,7 @@
     </requirements>
     <command>
         python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
-        '$output' $uniqueness_criterion
+        '$output' $uniqueness_criterion '$accession_parser'
         #for $input in $inputs:
             '$input'
         #end for
@@ -16,6 +16,19 @@
             <option value="sequence" selected="true">Accession and Sequence</option>
             <option value="accession">Accession Only</option>
         </param>
+        <param name="accession_parser" type="text" label="Accession Parsing Regex" value="^&gt;([^ ]+).*$" help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)">
+          <sanitizer>
+            <valid>
+              <add preset="string.printable"/>
+              <remove value="&#92;" />
+              <remove value="&apos;" />
+            </valid>
+            <mapping initial="none">
+              <add source="&#92;" target="__backslash__" />
+              <add source="&apos;" target="__sq__"/>
+            </mapping>
+          </sanitizer>
+        </param>
     </inputs>
     <outputs>
         <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/>
@@ -24,19 +37,23 @@
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
           <param name="uniqueness_criterion" value="sequence" />
+          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
           <output name="output" file="res-sequence.fa" ftype="fasta" />
           <assert_stdout>
             <has_line line="Skipping protein '&gt;one_2' with duplicate sequence (first seen as '&gt;one')" />
             <has_line line="Skipping protein '&gt;two_2' with duplicate sequence (first seen as '&gt;two')" />
-            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
+            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
           </assert_stdout>
         </test>
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
           <param name="uniqueness_criterion" value="accession" />
+          <param name="accession_parser" value="^&gt;([^ |]+).*$" />
           <output name="output" file="res-accession.fa" ftype="fasta" />
           <assert_stdout>
-            <has_line line="Skipping protein '&gt;three_2' with duplicate header" />
+            <has_line line="Skipping protein '&gt;three_2|456' with duplicate accession" />
+            <has_line line="Skipping protein '&gt;three_2 789' with duplicate accession" />
           </assert_stdout>
         </test>
     </tests>
@@ -49,7 +66,7 @@
 If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output.
 Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.
 
-In the context of this script, the accession is the entire header line.
+The default accession parser will treat everything in the header before the first space as the accession.
 
 ------
 
diff -r 379c41d859aa -r 9ad0d336e5ed test-data/2.fa
--- a/test-data/2.fa	Wed Feb 01 13:24:16 2017 -0500
+++ b/test-data/2.fa	Fri Feb 03 14:27:56 2017 -0500
@@ -2,7 +2,9 @@
 ACGTACGT
 >two_2
 GGTGTGTACGT
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
->three_2
+>three_2|456
+ACGTACGACTTTGGTTGTGT
+>three_2 789
 ACGTACGACTTTGGTTGTGTT
diff -r 379c41d859aa -r 9ad0d336e5ed test-data/res-accession.fa
--- a/test-data/res-accession.fa	Wed Feb 01 13:24:16 2017 -0500
+++ b/test-data/res-accession.fa	Fri Feb 03 14:27:56 2017 -0500
@@ -8,5 +8,5 @@
 ACGTACGT
 >two_2
 GGTGTGTACGT
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
\ No newline at end of file
diff -r 379c41d859aa -r 9ad0d336e5ed test-data/res-sequence.fa
--- a/test-data/res-sequence.fa	Wed Feb 01 13:24:16 2017 -0500
+++ b/test-data/res-sequence.fa	Fri Feb 03 14:27:56 2017 -0500
@@ -4,5 +4,5 @@
 GGTGTGTACGT
 >three
 ACGTACG
->three_2
+>three_2|123
 ACGTACGACTTTGGTTGTGT
\ No newline at end of file