view fasta_merge_files_and_filter_unique_sequences.py @ 1:74144834b0bd draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/fasta_merge_files_and_filter_unique_sequences commit 9f9eba8df62b4db1ef35718d880a1bcda7457b99
author galaxyp
date Fri, 16 Dec 2016 05:19:27 -0500
parents 2904d46167da
children 379c41d859aa
line wrap: on
line source

#!/usr/bin/env python
import os
import sys

class Sequence:
    """
        Plain record for one FASTA entry.

        header   -- the description line of the entry (empty until set)
        sequence -- the residue string of the entry (empty until set)
    """
    def __init__(self):
        # A new record starts out blank; whoever creates it assigns the fields.
        self.header, self.sequence = "", ""

class FASTAReader:
    """
        FASTA db iterator. Returns a single FASTA sequence object.

        Each iteration step yields one Sequence: its ``header`` is the '>'
        line with trailing whitespace removed, and its ``sequence`` is the
        concatenation of the lines that follow, up to the next '>' line or
        the end of the file. Lines before the first header are skipped.

        The underlying file is closed as soon as the end of the file is
        reached. ``close()`` or a ``with`` block releases it earlier.
    """
    def __init__(self, fasta_name):
        ''' Open fasta_name for reading; errors from open() propagate. '''
        self.fasta_file = open(fasta_name)
        # Header line that was read while collecting the previous record.
        # Buffering it replaces the tell()/seek() rewind, which is slow on
        # text-mode files.
        self._pending_header = None

    def __iter__(self):
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        ''' Release the file handle; safe to call more than once. '''
        self._pending_header = None
        self.fasta_file.close()

    @staticmethod
    def _clean(line):
        ''' Strip trailing whitespace and any stray CR/LF characters. '''
        return line.rstrip().replace('\n', '').replace('\r', '')

    def __next__(self):
        ''' Return the next Sequence, or raise StopIteration at end of file. '''
        # Once the file is closed the iterator stays exhausted instead of
        # failing with ValueError on a closed file.
        if self.fasta_file.closed:
            raise StopIteration

        line = self._pending_header
        self._pending_header = None
        if line is None:
            # No buffered header: skip forward to the next '>' line.
            while True:
                line = self.fasta_file.readline()
                if not line:
                    self.close()
                    raise StopIteration
                if line.startswith('>'):
                    break

        seq = Sequence()
        seq.header = self._clean(line)

        # Collect the pieces and join once rather than growing a string.
        parts = []
        while True:
            line = self.fasta_file.readline()
            if not line:
                # End of file: this is the last record, release the handle.
                self.close()
                break
            if line.startswith('>'):
                # Start of the next record; keep it for the next call.
                self._pending_header = line
                break
            parts.append(self._clean(line))
        seq.sequence = "".join(parts)
        return seq

    # Python 2/3 compat
    next = __next__


def main():
    """
        Merge FASTA files, keeping one record per distinct sequence.

        sys.argv[1] is the output path; every further argument is an input
        FASTA file, processed in the order given. The first record seen
        for a given sequence string is written (header line, then the
        sequence on a single line); later records with an identical
        sequence are dropped, whatever their header.

        Exits with a usage message if no output path is given.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s OUTPUT_FASTA [INPUT_FASTA ...]"
                 % os.path.basename(sys.argv[0]))

    seen_sequences = set()

    # 'with' guarantees the output is flushed and closed even on error.
    with open(sys.argv[1], 'w') as out_file:
        for fasta_file in sys.argv[2:]:
            fa_reader = FASTAReader(fasta_file)
            try:
                for protein in fa_reader:
                    if protein.sequence in seen_sequences:
                        continue
                    seen_sequences.add(protein.sequence)
                    # Write '\n', not os.linesep: the file is in text mode,
                    # which already translates newlines for the platform
                    # (os.linesep would give '\r\r\n' on Windows).
                    out_file.write(protein.header + '\n')
                    out_file.write(protein.sequence + '\n')
            finally:
                # Release the input handle before moving to the next file.
                fa_reader.fasta_file.close()

# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()