diff fasta_merge_files_and_filter_unique_sequences.py @ 0:2904d46167da draft

Uploaded
author galaxyp
date Fri, 26 Sep 2014 14:02:14 -0400
parents
children 74144834b0bd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_merge_files_and_filter_unique_sequences.py	Fri Sep 26 14:02:14 2014 -0400
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+import sys,os
+
+#====================================================================== Classes
+class Sequence:
+    ''' One FASTA record: the '>' header line plus its sequence text '''
+    def __init__(self):
+        # Both fields start out empty; the parser assigns them afterwards.
+        self.header = self.sequence = ""
+
+class FASTAReader:
+    """
+        FASTA db iterator. Each iteration step returns one Sequence object.
+
+        A record starts at a line beginning with '>' (stored, stripped of
+        trailing whitespace, in ``header``) and runs up to the next such
+        line or end of file; the lines in between are stripped and joined
+        into ``sequence``. Anything before the first '>' line is skipped.
+
+        The file named by ``fasta_name`` is opened on construction and is
+        closed automatically once the last record has been returned. Call
+        close(), or use the reader in a ``with`` statement, to release it
+        earlier.
+    """
+    def __init__(self, fasta_name):
+        self.fasta_file = open(fasta_name)
+        # Header line of the *next* record, read ahead while collecting the
+        # body of the current one. Buffering it replaces a tell()/seek()
+        # round trip per line, which is slow on text-mode files and fails
+        # on non-seekable inputs.
+        self._pending = None
+
+    def __iter__(self):
+        return self
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def close(self):
+        ''' Close the underlying file; safe to call more than once '''
+        self.fasta_file.close()
+
+    def __next__(self):
+        ''' Return the next record; raise StopIteration when exhausted '''
+        line, self._pending = self._pending, None
+
+        # Advance to the next header line unless one is already buffered.
+        while not line or not line.startswith('>'):
+            # A closed file means we already hit EOF (or close() was
+            # called): keep raising StopIteration instead of ValueError.
+            line = '' if self.fasta_file.closed else self.fasta_file.readline()
+            if not line:
+                self.close()
+                raise StopIteration
+
+        seq = Sequence()
+        seq.header = line.rstrip().replace('\n','').replace('\r','')
+
+        # Collect body lines and join once at the end; repeated string
+        # concatenation is quadratic for long records.
+        chunks = []
+        for line in iter(self.fasta_file.readline, ''):
+            if line.startswith('>'):
+                # Start of the following record: keep it for the next call.
+                self._pending = line
+                break
+            chunks.append(line.rstrip().replace('\n','').replace('\r',''))
+        seq.sequence = ''.join(chunks)
+        return seq
+
+    # Python 2/3 compat
+    next = __next__
+
+
+def main():
+    ''' Merge FASTA files, dropping records whose sequence was already seen.
+
+        Command line: OUTPUT_FASTA [INPUT_FASTA ...]
+
+        sys.argv[1] is the output path; every following argument is an
+        input FASTA file, read in the order given. A record is written
+        (header line, then the whole sequence on one line) only the first
+        time its sequence occurs across all inputs. Headers are not
+        compared, so the header of the first occurrence is the one kept.
+
+        Exits with a usage message if no output path is given.
+    '''
+    if len(sys.argv) < 2:
+        sys.exit("usage: %s OUTPUT_FASTA [INPUT_FASTA ...]" % sys.argv[0])
+
+    seen_sequences = set()
+
+    # 'with' guarantees the output is flushed and closed even on error.
+    with open(sys.argv[1], 'w') as out_file:
+        for fasta_file in sys.argv[2:]:
+            fa_reader = FASTAReader(fasta_file)
+            try:
+                for protein in fa_reader:
+                    if protein.sequence in seen_sequences:
+                        continue
+                    seen_sequences.add(protein.sequence)
+
+                    # Text-mode files translate '\n' to the platform line
+                    # ending themselves; writing os.linesep here would
+                    # produce '\r\r\n' on Windows.
+                    out_file.write(protein.header + '\n')
+                    out_file.write(protein.sequence + '\n')
+            finally:
+                # FASTAReader opens its file in the constructor; release
+                # the handle whether or not reading/writing succeeded.
+                fa_reader.fasta_file.close()
+
+# Run the merge only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()