Galaxy |

Changeset 2:379c41d859aa (2017-02-01)

Previous changeset 1:74144834b0bd (2016-12-16) Next changeset 3:9ad0d336e5ed (2017-02-03)

Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 240d1baaa04767c7d6ad6e36c854c2b54093e92f

modified:
README.md
fasta_merge_files_and_filter_unique_sequences.py
fasta_merge_files_and_filter_unique_sequences.xml
test-data/2.fa

added:
test-data/res-accession.fa
test-data/res-sequence.fa

removed:
test-data/res.fa

diff -r 74144834b0bd -r 379c41d859aa README.md
--- a/README.md Fri Dec 16 05:19:27 2016 -0500
+++ b/README.md Wed Feb 01 13:24:16 2017 -0500

@@ -9,7 +9,7 @@
Description
-----------

-Merge FASTA files, keeping only unique sequences.
+Merge FASTA files, keeping either only unique sequences or only unique header/accession lines.

GalaxyP Community
@@ -44,4 +44,5 @@
Authors and contributors:

* John Chilton <jmchilton@gmail.com>
+* Matt Chambers <matt.chambers42@gmail.com>
* Minnesota Supercomputing Institute, University of Minnesota

diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.py
--- a/fasta_merge_files_and_filter_unique_sequences.py Fri Dec 16 05:19:27 2016 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.py Wed Feb 01 13:24:16 2017 -0500

[

@@ -46,20 +46,41 @@

def main():
-    seen_sequences = set([])
+    seen_sequences = dict([])
+    seen_headers = set([])

     out_file = open(sys.argv[1], 'w')
-    for fasta_file in sys.argv[2:]:
+    if sys.argv[2] == "sequence":
+        unique_sequences = True
+    elif sys.argv[2] == "accession":
+        unique_sequences = False
+    else:
+        sys.exit("2nd argument must be 'sequence' or 'accession'")
+
+    for fasta_file in sys.argv[3:]:
+        print("Reading entries from '%s'" % fasta_file)
         fa_reader = FASTAReader(fasta_file)
         for protein in fa_reader:
-            if protein.sequence in seen_sequences:
-                pass
+            if unique_sequences:
+                if protein.header in seen_headers:
+                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                    continue
+                elif protein.sequence in seen_sequences:
+                    print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence]))
+                    continue
+                else:
+                    seen_sequences[protein.sequence] = protein.header
+                    seen_headers.add(protein.header)
             else:
-                seen_sequences.add(protein.sequence)
-                out_file.write(protein.header)
-                out_file.write(os.linesep)
-                out_file.write(protein.sequence)
-                out_file.write(os.linesep)
+                if protein.header in seen_headers:
+                    print("Skipping protein '%s' with duplicate header" % protein.header)
+                    continue
+                else:
+                    seen_headers.add(protein.header)
+            out_file.write(protein.header)
+            out_file.write(os.linesep)
+            out_file.write(protein.sequence)
+            out_file.write(os.linesep)
     out_file.close()

if __name__ == "__main__":

diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.xml
--- a/fasta_merge_files_and_filter_unique_sequences.xml Fri Dec 16 05:19:27 2016 -0500
+++ b/fasta_merge_files_and_filter_unique_sequences.xml Wed Feb 01 13:24:16 2017 -0500

@@ -5,13 +5,17 @@
     </requirements>
     <command>
         python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py'
-        '$output'
+        '$output' $uniqueness_criterion
         #for $input in $inputs:
             '$input'
         #end for
     </command>
     <inputs>
         <param name="inputs" format="fasta" multiple="True" type="data" label="Input FASTA files"/>
+        <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?">
+            <option value="sequence" selected="true">Accession and Sequence</option>
+            <option value="accession">Accession Only</option>
+        </param>
     </inputs>
     <outputs>
         <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/>
@@ -19,7 +23,21 @@
     <tests>
         <test>
           <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
-          <output name="output" file="res.fa" ftype="fasta" />
+          <param name="uniqueness_criterion" value="sequence" />
+          <output name="output" file="res-sequence.fa" ftype="fasta" />
+          <assert_stdout>
+            <has_line line="Skipping protein '>one_2' with duplicate sequence (first seen as '>one')" />
+            <has_line line="Skipping protein '>two_2' with duplicate sequence (first seen as '>two')" />
+            <has_line line="Skipping protein '>three_2' with duplicate header" />
+          </assert_stdout>
+        </test>
+        <test>
+          <param name="inputs" value="1.fa,2.fa" ftype="fasta" />
+          <param name="uniqueness_criterion" value="accession" />
+          <output name="output" file="res-accession.fa" ftype="fasta" />
+          <assert_stdout>
+            <has_line line="Skipping protein '>three_2' with duplicate header" />
+          </assert_stdout>
         </test>
     </tests>
     <help>
@@ -27,7 +45,11 @@
**What it does**

Concatenate FASTA database files together.
-Only first appearence of each unique sequence will appear in output.
+
+If the uniqueness criterion is "Accession and Sequence", only the first appearance of each unique sequence will appear in the output.
+Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output.
+
+In the context of this script, the accession is the entire header line.

------

diff -r 74144834b0bd -r 379c41d859aa test-data/2.fa
--- a/test-data/2.fa Fri Dec 16 05:19:27 2016 -0500
+++ b/test-data/2.fa Wed Feb 01 13:24:16 2017 -0500

@@ -4,3 +4,5 @@
GGTGTGTACGT
>three_2
ACGTACGACTTTGGTTGTGT
+>three_2
+ACGTACGACTTTGGTTGTGTT

diff -r 74144834b0bd -r 379c41d859aa test-data/res-accession.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/res-accession.fa Wed Feb 01 13:24:16 2017 -0500

@@ -0,0 +1,12 @@
+>one
+ACGTACGT
+>two
+GGTGTGTACGT
+>three
+ACGTACG
+>one_2
+ACGTACGT
+>two_2
+GGTGTGTACGT
+>three_2
+ACGTACGACTTTGGTTGTGT
\ No newline at end of file

diff -r 74144834b0bd -r 379c41d859aa test-data/res-sequence.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/res-sequence.fa Wed Feb 01 13:24:16 2017 -0500

@@ -0,0 +1,8 @@
+>one
+ACGTACGT
+>two
+GGTGTGTACGT
+>three
+ACGTACG
+>three_2
+ACGTACGACTTTGGTTGTGT
\ No newline at end of file

diff -r 74144834b0bd -r 379c41d859aa test-data/res.fa
--- a/test-data/res.fa Fri Dec 16 05:19:27 2016 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,8 +0,0 @@
->one
-ACGTACGT
->two
-GGTGTGTACGT
->three
-ACGTACG
->three_2
-ACGTACGACTTTGGTTGTGT