Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
view fasta_merge_files_and_filter_unique_sequences.py @ 1:74144834b0bd draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
author | galaxyp |
---|---|
date | Fri, 16 Dec 2016 05:19:27 -0500 |
parents | 2904d46167da |
children | 379c41d859aa |
line wrap: on
line source
#!/usr/bin/env python
"""Merge FASTA files, keeping one record per distinct sequence.

Command line: OUTPUT_FASTA INPUT_FASTA [INPUT_FASTA ...]

Records are read from the input files in the order given.  The first record
seen for a given sequence string is written to the output file; every later
record carrying an identical sequence is dropped, whatever its header says.
"""
import os  # noqa: F401 -- no longer used here, kept so existing imports survive
import sys


def _clean_line(line):
    """Return *line* without trailing whitespace and without any '\\r'."""
    return line.rstrip().replace('\r', '')


class Sequence(object):
    """Holds protein sequence information.

    Attributes are plain strings filled in by the reader:
    ``header`` is the full header line (including the leading '>') and
    ``sequence`` is the record's sequence lines joined together.
    """

    def __init__(self):
        self.header = ""
        self.sequence = ""


class FASTAReader(object):
    """FASTA db iterator. Returns a single FASTA sequence object per step.

    The underlying file is opened in ``__init__`` and closed automatically
    once the end of the file is reached.  The reader can also be used as a
    context manager (or closed explicitly with ``close()``) so the file is
    released even when iteration stops early.
    """

    def __init__(self, fasta_name):
        self.fasta_file = open(fasta_name)
        # Header line of the *next* record, read while scanning the previous
        # record's sequence lines.  Remembering it replaces a tell()/seek()
        # push-back on the file object.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never swallow the caller's exception

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.fasta_file.close()

    def __next__(self):
        """Return the next record as a ``Sequence``.

        Lines that appear before the first '>' header are skipped.

        Raises:
            StopIteration: when no further header line exists, or the reader
                has already been closed.
        """
        if self.fasta_file.closed:
            raise StopIteration

        header = self._pending_header
        self._pending_header = None
        # No look-ahead header available: scan forward to the next one.
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                self.close()
                raise StopIteration
            if line.startswith('>'):
                header = line

        seq = Sequence()
        seq.header = _clean_line(header)

        # Collect sequence lines and join once at the end.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                # End of file: release the handle now; the next call will
                # see the closed file and stop iteration.
                self.close()
                break
            if line.startswith('>'):
                self._pending_header = line
                break
            chunks.append(_clean_line(line))
        seq.sequence = ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__


def main(argv=None):
    """Write the unique-sequence records of the input files to the output.

    Args:
        argv: optional list ``[output_path, input_path, ...]``.  When omitted
            (the default), ``sys.argv[1:]`` is used.

    Raises:
        IndexError: if no output path is supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    seen_sequences = set()
    # Context managers guarantee both output and input files are closed even
    # if reading or writing fails part-way through.
    with open(args[0], 'w') as out_file:
        for fasta_file in args[1:]:
            with FASTAReader(fasta_file) as fa_reader:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)
                    # Use '\n', not os.linesep: the file is in text mode, so
                    # the platform newline translation is applied on write.
                    out_file.write(protein.header + '\n')
                    out_file.write(protein.sequence + '\n')


if __name__ == "__main__":
    main()