# HG changeset patch # User galaxyp # Date 1485973456 18000 # Node ID 379c41d859aa9c89bcd27def13c1e6ee72471f8f # Parent 74144834b0bdb1e0610e2bc009d1f991cb778350 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 240d1baaa04767c7d6ad6e36c854c2b54093e92f diff -r 74144834b0bd -r 379c41d859aa README.md --- a/README.md Fri Dec 16 05:19:27 2016 -0500 +++ b/README.md Wed Feb 01 13:24:16 2017 -0500 @@ -9,7 +9,7 @@ Description ----------- -Merge FASTA files, keeping only unique sequences. +Merge FASTA files, keeping either only unique sequences or only unique header/accession lines. GalaxyP Community @@ -44,4 +44,5 @@ Authors and contributors: * John Chilton +* Matt Chambers * Minnesota Supercomputing Institute, Univeristy of Minnesota diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.py --- a/fasta_merge_files_and_filter_unique_sequences.py Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.py Wed Feb 01 13:24:16 2017 -0500 @@ -46,20 +46,41 @@ def main(): - seen_sequences = set([]) + seen_sequences = dict([]) + seen_headers = set([]) out_file = open(sys.argv[1], 'w') - for fasta_file in sys.argv[2:]: + if sys.argv[2] == "sequence": + unique_sequences = True + elif sys.argv[2] == "accession": + unique_sequences = False + else: + sys.exit("2nd argument must be 'sequence' or 'accession'") + + for fasta_file in sys.argv[3:]: + print("Reading entries from '%s'" % fasta_file) fa_reader = FASTAReader(fasta_file) for protein in fa_reader: - if protein.sequence in seen_sequences: - pass + if unique_sequences: + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + elif protein.sequence in seen_sequences: + print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence])) + continue + else: + 
seen_sequences[protein.sequence] = protein.header + seen_headers.add(protein.header) else: - seen_sequences.add(protein.sequence) - out_file.write(protein.header) - out_file.write(os.linesep) - out_file.write(protein.sequence) - out_file.write(os.linesep) + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + else: + seen_headers.add(protein.header) + out_file.write(protein.header) + out_file.write(os.linesep) + out_file.write(protein.sequence) + out_file.write(os.linesep) out_file.close() if __name__ == "__main__": diff -r 74144834b0bd -r 379c41d859aa fasta_merge_files_and_filter_unique_sequences.xml --- a/fasta_merge_files_and_filter_unique_sequences.xml Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.xml Wed Feb 01 13:24:16 2017 -0500 @@ -5,13 +5,17 @@ python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py' - '$output' + '$output' $uniqueness_criterion #for $input in $inputs: '$input' #end for + + + + @@ -19,7 +23,21 @@ - + + + + + + + + + + + + + + + @@ -27,7 +45,11 @@ **What it does** Concatenate FASTA database files together. -Only first appearence of each unique sequence will appear in output. + +If the uniqueness criterion is "Accession and Sequence", only the first appearance of each unique sequence will appear in the output. +Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output. + +In the context of this script, the accession is the entire header line. 
------ diff -r 74144834b0bd -r 379c41d859aa test-data/2.fa --- a/test-data/2.fa Fri Dec 16 05:19:27 2016 -0500 +++ b/test-data/2.fa Wed Feb 01 13:24:16 2017 -0500 @@ -4,3 +4,5 @@ GGTGTGTACGT >three_2 ACGTACGACTTTGGTTGTGT +>three_2 +ACGTACGACTTTGGTTGTGTT diff -r 74144834b0bd -r 379c41d859aa test-data/res-accession.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/res-accession.fa Wed Feb 01 13:24:16 2017 -0500 @@ -0,0 +1,12 @@ +>one +ACGTACGT +>two +GGTGTGTACGT +>three +ACGTACG +>one_2 +ACGTACGT +>two_2 +GGTGTGTACGT +>three_2 +ACGTACGACTTTGGTTGTGT \ No newline at end of file diff -r 74144834b0bd -r 379c41d859aa test-data/res-sequence.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/res-sequence.fa Wed Feb 01 13:24:16 2017 -0500 @@ -0,0 +1,8 @@ +>one +ACGTACGT +>two +GGTGTGTACGT +>three +ACGTACG +>three_2 +ACGTACGACTTTGGTTGTGT \ No newline at end of file diff -r 74144834b0bd -r 379c41d859aa test-data/res.fa --- a/test-data/res.fa Fri Dec 16 05:19:27 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ ->one -ACGTACGT ->two -GGTGTGTACGT ->three -ACGTACG ->three_2 -ACGTACGACTTTGGTTGTGT