Mercurial > repos > galaxyp > fasta_merge_files_and_filter_unique_sequences
diff fasta_merge_files_and_filter_unique_sequences.py @ 0:2904d46167da draft
Uploaded
author | galaxyp |
---|---|
date | Fri, 26 Sep 2014 14:02:14 -0400 |
parents | |
children | 74144834b0bd |
line wrap: on
line diff
#!/usr/bin/env python
"""Merge FASTA files, keeping only the first record of each distinct sequence.

Usage: fasta_merge_files_and_filter_unique_sequences.py OUTPUT INPUT [INPUT ...]

Records are compared by sequence text only; headers are ignored for the
purpose of duplicate detection.  The first record seen for a given sequence
(in command-line and file order) is the one written to OUTPUT.
"""
import os
import sys


#====================================================================== Classes
class Sequence(object):
    """Holds protein sequence information.

    Attributes:
        header: the full header line, including the leading '>', without
            trailing whitespace.
        sequence: the residues of the record joined into one string.
    """

    def __init__(self, header="", sequence=""):
        self.header = header
        self.sequence = sequence


def _clean_line(line):
    """Return *line* without trailing whitespace or stray CR/LF characters."""
    return line.rstrip().replace('\n', '').replace('\r', '')


class FASTAReader(object):
    """FASTA db iterator.  Each iteration step returns one Sequence object.

    The reader owns the file handle it opens; call close() when done or use
    the reader as a context manager:

        with FASTAReader(path) as reader:
            for record in reader:
                ...
    """

    def __init__(self, fasta_name):
        self.fasta_file = open(fasta_name)
        # Header line of the next record, read while scanning for the end of
        # the previous one.  Buffering it avoids tell()/seek() on every line,
        # so non-seekable inputs work as well.
        self._pending_header = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying file.  Safe to call more than once."""
        self.fasta_file.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next record; raise StopIteration at end of file.

        Lines that appear before the first '>' header are skipped.
        """
        header = self._pending_header
        self._pending_header = None

        # No buffered header: scan forward to the next header line.
        while header is None:
            line = self.fasta_file.readline()
            if not line:
                raise StopIteration
            if line.startswith('>'):
                header = line

        seq = Sequence()
        seq.header = _clean_line(header)

        # Collect sequence lines until the next header or end of file.
        chunks = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                break
            if line.startswith('>'):
                # Start of the following record: keep it for the next call.
                self._pending_header = line
                break
            chunks.append(_clean_line(line))
        seq.sequence = ''.join(chunks)
        return seq

    # Python 2/3 compat
    next = __next__


def main():
    """Write the unique-sequence merge of sys.argv[2:] to sys.argv[1]."""
    if len(sys.argv) < 2:
        sys.exit("usage: %s OUTPUT INPUT [INPUT ...]" % sys.argv[0])

    seen_sequences = set()

    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            with FASTAReader(fasta_file) as fa_reader:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)

                    # The file is opened in text mode, which already maps
                    # '\n' to the platform line ending; writing os.linesep
                    # here would produce '\r\r\n' on Windows.
                    out_file.write(protein.header)
                    out_file.write('\n')
                    out_file.write(protein.sequence)
                    out_file.write('\n')


if __name__ == "__main__":
    main()