Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
changeset 2:379c41d859aa draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 240d1baaa04767c7d6ad6e36c854c2b54093e92f
author | galaxyp |
---|---|
date | Wed, 01 Feb 2017 13:24:16 -0500 |
parents | 74144834b0bd |
children | 9ad0d336e5ed |
files | README.md fasta_merge_files_and_filter_unique_sequences.py fasta_merge_files_and_filter_unique_sequences.xml test-data/2.fa test-data/res-accession.fa test-data/res-sequence.fa test-data/res.fa |
diffstat | 7 files changed, 79 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/README.md Fri Dec 16 05:19:27 2016 -0500 +++ b/README.md Wed Feb 01 13:24:16 2017 -0500 @@ -9,7 +9,7 @@ Description ----------- -Merge FASTA files, keeping only unique sequences. +Merge FASTA files, keeping either only unique sequences or only unique header/accession lines. GalaxyP Community @@ -44,4 +44,5 @@ Authors and contributors: * John Chilton <jmchilton@gmail.com> +* Matt Chambers <matt.chambers42@gmail.com> * Minnesota Supercomputing Institute, Univeristy of Minnesota
--- a/fasta_merge_files_and_filter_unique_sequences.py Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.py Wed Feb 01 13:24:16 2017 -0500 @@ -46,20 +46,41 @@ def main(): - seen_sequences = set([]) + seen_sequences = dict([]) + seen_headers = set([]) out_file = open(sys.argv[1], 'w') - for fasta_file in sys.argv[2:]: + if sys.argv[2] == "sequence": + unique_sequences = True + elif sys.argv[2] == "accession": + unique_sequences = False + else: + sys.exit("2nd argument must be 'sequence' or 'accession'") + + for fasta_file in sys.argv[3:]: + print("Reading entries from '%s'" % fasta_file) fa_reader = FASTAReader(fasta_file) for protein in fa_reader: - if protein.sequence in seen_sequences: - pass + if unique_sequences: + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + elif protein.sequence in seen_sequences: + print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence])) + continue + else: + seen_sequences[protein.sequence] = protein.header + seen_headers.add(protein.header) else: - seen_sequences.add(protein.sequence) - out_file.write(protein.header) - out_file.write(os.linesep) - out_file.write(protein.sequence) - out_file.write(os.linesep) + if protein.header in seen_headers: + print("Skipping protein '%s' with duplicate header" % protein.header) + continue + else: + seen_headers.add(protein.header) + out_file.write(protein.header) + out_file.write(os.linesep) + out_file.write(protein.sequence) + out_file.write(os.linesep) out_file.close() if __name__ == "__main__":
--- a/fasta_merge_files_and_filter_unique_sequences.xml Fri Dec 16 05:19:27 2016 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.xml Wed Feb 01 13:24:16 2017 -0500 @@ -5,13 +5,17 @@ </requirements> <command> python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py' - '$output' + '$output' $uniqueness_criterion #for $input in $inputs: '$input' #end for </command> <inputs> <param name="inputs" format="fasta" multiple="True" type="data" label="Input FASTA files"/> + <param name="uniqueness_criterion" type="select" label="How are sequences judged to be unique?"> + <option value="sequence" selected="true">Accession and Sequence</option> + <option value="accession">Accession Only</option> + </param> </inputs> <outputs> <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/> @@ -19,7 +23,21 @@ <tests> <test> <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> - <output name="output" file="res.fa" ftype="fasta" /> + <param name="uniqueness_criterion" value="sequence" /> + <output name="output" file="res-sequence.fa" ftype="fasta" /> + <assert_stdout> + <has_line line="Skipping protein '>one_2' with duplicate sequence (first seen as '>one')" /> + <has_line line="Skipping protein '>two_2' with duplicate sequence (first seen as '>two')" /> + <has_line line="Skipping protein '>three_2' with duplicate header" /> + </assert_stdout> + </test> + <test> + <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> + <param name="uniqueness_criterion" value="accession" /> + <output name="output" file="res-accession.fa" ftype="fasta" /> + <assert_stdout> + <has_line line="Skipping protein '>three_2' with duplicate header" /> + </assert_stdout> </test> </tests> <help> @@ -27,7 +45,11 @@ **What it does** Concatenate FASTA database files together. -Only first appearence of each unique sequence will appear in output. + +If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output. +Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output. + +In the context of this script, the accession is the entire header line. ------
--- a/test-data/2.fa Fri Dec 16 05:19:27 2016 -0500 +++ b/test-data/2.fa Wed Feb 01 13:24:16 2017 -0500 @@ -4,3 +4,5 @@ GGTGTGTACGT >three_2 ACGTACGACTTTGGTTGTGT +>three_2 +ACGTACGACTTTGGTTGTGTT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/res-accession.fa Wed Feb 01 13:24:16 2017 -0500 @@ -0,0 +1,12 @@ +>one +ACGTACGT +>two +GGTGTGTACGT +>three +ACGTACG +>one_2 +ACGTACGT +>two_2 +GGTGTGTACGT +>three_2 +ACGTACGACTTTGGTTGTGT \ No newline at end of file