Previous changeset 1:74144834b0bd (2016-12-16) Next changeset 3:9ad0d336e5ed (2017-02-03) |
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 240d1baaa04767c7d6ad6e36c854c2b54093e92f |
modified:
README.md fasta_merge_files_and_filter_unique_sequences.py fasta_merge_files_and_filter_unique_sequences.xml test-data/2.fa |
added:
test-data/res-accession.fa test-data/res-sequence.fa |
removed:
test-data/res.fa |
b |
diff -r 74144834b0bd -r 379c41d859aa README.md --- a/README.md Fri Dec 16 05:19:27 2016 -0500 +++ b/README.md Wed Feb 01 13:24:16 2017 -0500 |
b |
@@ -9,7 +9,7 @@ Description ----------- -Merge FASTA files, keeping only unique sequences. +Merge FASTA files, keeping either only unique sequences or only unique header/accession lines. GalaxyP Community @@ -44,4 +44,5 @@ Authors and contributors: * John Chilton <jmchilton@gmail.com> +* Matt Chambers <matt.chambers42@gmail.com> * Minnesota Supercomputing Institute, University of Minnesota |
b |
diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.py --- a/fasta_merge_files_and_filter_unique_sequences.py Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.py Wed Feb 01 13:24:16 2017 -0500 |
[ |
@@ -46,20 +46,41 @@ def main(): - seen_sequences = set([]) + seen_sequences = dict([]) + seen_headers = set([]) out_file = open(sys.argv[1], 'w') - for fasta_file in sys.argv[2:]: + if sys.argv[2] == "sequence": + unique_sequences = True + elif sys.argv[2] == "accession": + unique_sequences = False + else: + sys.exit("2nd argument must be 'sequence' or 'accession'") + + for fasta_file in sys.argv[3:]: + print("Reading entries from '%s'" % fasta_file) fa_reader = FASTAReader(fasta_file) for protein in fa_reader: - if protein.sequence in seen_sequences: - pass + if unique_sequences: + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + elif protein.sequence in seen_sequences: + print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence])) + continue + else: + seen_sequences[protein.sequence] = protein.header + seen_headers.add(protein.header) else: - seen_sequences.add(protein.sequence) - out_file.write(protein.header) - out_file.write(os.linesep) - out_file.write(protein.sequence) - out_file.write(os.linesep) + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + else: + seen_headers.add(protein.header) + out_file.write(protein.header) + out_file.write(os.linesep) + out_file.write(protein.sequence) + out_file.write(os.linesep) out_file.close() if __name__ == "__main__": |
b |
diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.xml --- a/fasta_merge_files_and_filter_unique_sequences.xml Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.xml Wed Feb 01 13:24:16 2017 -0500 |
b |
@@ -5,13 +5,17 @@ </requirements> <command> python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py' - '$output' + '$output' $uniqueness_criterion #for $input in $inputs: '$input' #end for </command> <inputs> <param name="inputs" format="fasta" multiple="True" type="data" label="Input FASTA files"/> + <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?"> + <option value="sequence" selected="true">Accession and Sequence</option> + <option value="accession">Accession Only</option> + </param> </inputs> <outputs> <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/> @@ -19,7 +23,21 @@ <tests> <test> <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> - <output name="output" file="res.fa" ftype="fasta" /> + <param name="uniqueness_criterion" value="sequence" /> + <output name="output" file="res-sequence.fa" ftype="fasta" /> + <assert_stdout> + <has_line line="Skipping protein '>one_2' with duplicate sequence (first seen as '>one')" /> + <has_line line="Skipping protein '>two_2' with duplicate sequence (first seen as '>two')" /> + <has_line line="Skipping protein '>three_2' with duplicate header" /> + </assert_stdout> + </test> + <test> + <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> + <param name="uniqueness_criterion" value="accession" /> + <output name="output" file="res-accession.fa" ftype="fasta" /> + <assert_stdout> + <has_line line="Skipping protein '>three_2' with duplicate header" /> + </assert_stdout> </test> </tests> <help> @@ -27,7 +45,11 @@ **What it does** Concatenate FASTA database files together. -Only first appearence of each unique sequence will appear in output. + +If the uniqueness criterion is "Accession and Sequence", only the first appearance of each unique sequence will appear in the output. +Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output. 
+ +In the context of this script, the accession is the entire header line. ------ |
b |
diff -r 74144834b0bd -r 379c41d859aa test-data/2.fa --- a/test-data/2.fa Fri Dec 16 05:19:27 2016 -0500 +++ b/test-data/2.fa Wed Feb 01 13:24:16 2017 -0500 |
b |
@@ -4,3 +4,5 @@ GGTGTGTACGT >three_2 ACGTACGACTTTGGTTGTGT +>three_2 +ACGTACGACTTTGGTTGTGTT |
b |
diff -r 74144834b0bd -r 379c41d859aa test-data/res-accession.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/res-accession.fa Wed Feb 01 13:24:16 2017 -0500 |
b |
@@ -0,0 +1,12 @@ +>one +ACGTACGT +>two +GGTGTGTACGT +>three +ACGTACG +>one_2 +ACGTACGT +>two_2 +GGTGTGTACGT +>three_2 +ACGTACGACTTTGGTTGTGT \ No newline at end of file |
b |
diff -r 74144834b0bd -r 379c41d859aa test-data/res-sequence.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/res-sequence.fa Wed Feb 01 13:24:16 2017 -0500 |
b |
@@ -0,0 +1,8 @@ +>one +ACGTACGT +>two +GGTGTGTACGT +>three +ACGTACG +>three_2 +ACGTACGACTTTGGTTGTGT \ No newline at end of file |
b |
diff -r 74144834b0bd -r 379c41d859aa test-data/res.fa --- a/test-data/res.fa Fri Dec 16 05:19:27 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,8 +0,0 @@ ->one -ACGTACGT ->two -GGTGTGTACGT ->three -ACGTACG ->three_2 -ACGTACGACTTTGGTTGTGT |