Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
changeset 3:9ad0d336e5ed draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 96128b1b32e31c88f08201fd59a07fb1057aafbe
author | galaxyp |
---|---|
date | Fri, 03 Feb 2017 14:27:56 -0500 |
parents | 379c41d859aa |
children | 8462a4e9f86e |
files | fasta_merge_files_and_filter_unique_sequences.py fasta_merge_files_and_filter_unique_sequences.xml test-data/2.fa test-data/res-accession.fa test-data/res-sequence.fa |
diffstat | 5 files changed, 52 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta_merge_files_and_filter_unique_sequences.py Wed Feb 01 13:24:16 2017 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.py Fri Feb 03 14:27:56 2017 -0500 @@ -1,19 +1,22 @@ #!/usr/bin/env python import os import sys +import re class Sequence: ''' Holds protein sequence information ''' def __init__(self): self.header = "" + self.accession = "" self.sequence = "" class FASTAReader: """ FASTA db iterator. Returns a single FASTA sequence object. """ - def __init__(self, fasta_name): + def __init__(self, fasta_name, accession_parser): self.fasta_file = open(fasta_name) + self.accession_parser = accession_parser def __iter__(self): return self @@ -30,6 +33,11 @@ seq = Sequence() seq.header = line.rstrip().replace('\n','').replace('\r','') + m = re.search(self.accession_parser, seq.header) + if not m or len(m.groups()) < 1 or len(m.group(1)) == 0: + sys.exit("Could not parse accession from '%s'" % seq.header) + seq.accession = m.group(1) + while True: tail = self.fasta_file.tell() line = self.fasta_file.readline() @@ -47,7 +55,7 @@ def main(): seen_sequences = dict([]) - seen_headers = set([]) + seen_accessions = set([]) out_file = open(sys.argv[1], 'w') if sys.argv[2] == "sequence": @@ -57,26 +65,30 @@ else: sys.exit("2nd argument must be 'sequence' or 'accession'") - for fasta_file in sys.argv[3:]: + accession_parser = sys.argv[3] + for key, value in { '\'' :'__sq__', '\\' : '__backslash__' }.items(): + accession_parser = accession_parser.replace(value, key) + + for fasta_file in sys.argv[4:]: print("Reading entries from '%s'" % fasta_file) - fa_reader = FASTAReader(fasta_file) + fa_reader = FASTAReader(fasta_file, accession_parser) for protein in fa_reader: if unique_sequences: - if protein.header in seen_headers: - print("Skipping protein '%s' with duplicate header" % protein.header) + if protein.accession in seen_accessions: + print("Skipping protein '%s' with duplicate accession" % protein.header) continue - elif protein.sequence in seen_sequences: - print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[protein.sequence])) + elif hash(protein.sequence) in seen_sequences: + print("Skipping protein '%s' with duplicate sequence (first seen as '%s')" % (protein.header, seen_sequences[hash(protein.sequence)])) continue else: - seen_sequences[protein.sequence] = protein.header - seen_headers.add(protein.header) + seen_sequences[hash(protein.sequence)] = protein.header + seen_accessions.add(protein.accession) else: - if protein.header in seen_headers: - print("Skipping protein '%s' with duplicate header" % protein.header) + if protein.accession in seen_accessions: + print("Skipping protein '%s' with duplicate accession" % protein.header) continue else: - seen_headers.add(protein.header) + seen_accessions.add(protein.accession) out_file.write(protein.header) out_file.write(os.linesep) out_file.write(protein.sequence)
--- a/fasta_merge_files_and_filter_unique_sequences.xml Wed Feb 01 13:24:16 2017 -0500 +++ b/fasta_merge_files_and_filter_unique_sequences.xml Fri Feb 03 14:27:56 2017 -0500 @@ -5,7 +5,7 @@ </requirements> <command> python '$__tool_directory__/fasta_merge_files_and_filter_unique_sequences.py' - '$output' $uniqueness_criterion + '$output' $uniqueness_criterion '$accession_parser' #for $input in $inputs: '$input' #end for @@ -16,6 +16,19 @@ <option value="sequence" selected="true">Accession and Sequence</option> <option value="accession">Accession Only</option> </param> + <param name="accession_parser" type="text" label="Accession Parsing Regex" value="^>([^ ]+).*$" help="Regular expression with 1 capture group; the capture group is the accession (which must be unique)"> + <sanitizer> + <valid> + <add preset="string.printable"/> + <remove value="\" /> + <remove value="'" /> + </valid> + <mapping initial="none"> + <add source="\" target="__backslash__" /> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> </inputs> <outputs> <data format="fasta" name="output" label="Merged and Filtered FASTA from ${on_string}"/> @@ -24,19 +37,23 @@ <test> <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> <param name="uniqueness_criterion" value="sequence" /> + <param name="accession_parser" value="^>([^ |]+).*$" /> <output name="output" file="res-sequence.fa" ftype="fasta" /> <assert_stdout> <has_line line="Skipping protein '>one_2' with duplicate sequence (first seen as '>one')" /> <has_line line="Skipping protein '>two_2' with duplicate sequence (first seen as '>two')" /> - <has_line line="Skipping protein '>three_2' with duplicate header" /> + <has_line line="Skipping protein '>three_2|456' with duplicate accession" /> + <has_line line="Skipping protein '>three_2 789' with duplicate accession" /> </assert_stdout> </test> <test> <param name="inputs" value="1.fa,2.fa" ftype="fasta" /> <param name="uniqueness_criterion" value="accession" /> + <param name="accession_parser" value="^>([^ |]+).*$" /> <output name="output" file="res-accession.fa" ftype="fasta" /> <assert_stdout> - <has_line line="Skipping protein '>three_2' with duplicate header" /> + <has_line line="Skipping protein '>three_2|456' with duplicate accession" /> + <has_line line="Skipping protein '>three_2 789' with duplicate accession" /> </assert_stdout> </test> </tests> @@ -49,7 +66,7 @@ If the uniqueness criterion is "Accession and Sequence", only the first appearence of each unique sequence will appear in the output. Otherwise, duplicate sequences are allowed, but only the first appearance of each accession will appear in the output. -In the context of this script, the accession is the entire header line. +The default accession parser will treat everything in the header before the first space as the accession. ------
--- a/test-data/2.fa Wed Feb 01 13:24:16 2017 -0500 +++ b/test-data/2.fa Fri Feb 03 14:27:56 2017 -0500 @@ -2,7 +2,9 @@ ACGTACGT >two_2 GGTGTGTACGT ->three_2 +>three_2|123 ACGTACGACTTTGGTTGTGT ->three_2 +>three_2|456 +ACGTACGACTTTGGTTGTGT +>three_2 789 ACGTACGACTTTGGTTGTGTT