annotate glimmer2seq.py @ 0:841357e0acbf draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 10:09:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
841357e0acbf Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
841357e0acbf Uploaded
bgruening
parents:
diff changeset
2 """
841357e0acbf Uploaded
bgruening
parents:
diff changeset
3 Input: DNA FASTA file + Glimmer ORF file
841357e0acbf Uploaded
bgruening
parents:
diff changeset
4 Output: ORF sequences as FASTA file
841357e0acbf Uploaded
bgruening
parents:
diff changeset
5 Author: Bjoern Gruening
841357e0acbf Uploaded
bgruening
parents:
diff changeset
6 """
841357e0acbf Uploaded
bgruening
parents:
diff changeset
7 import sys, os
841357e0acbf Uploaded
bgruening
parents:
diff changeset
8 from Bio import SeqIO
841357e0acbf Uploaded
bgruening
parents:
diff changeset
9 from Bio.SeqRecord import SeqRecord
841357e0acbf Uploaded
bgruening
parents:
diff changeset
10
841357e0acbf Uploaded
bgruening
parents:
diff changeset
def glimmer2seq(glimmer_prediction=None, genome_sequence=None, outfile=None):
    """
    Write the ORFs listed in a Glimmer prediction file as FASTA records.

    Arguments:
        glimmer_prediction: path of the Glimmer ORF file. Lines starting
            with '>' name the FASTA record (matched against its full
            description) that the following ORF lines refer to.
        genome_sequence: path of the DNA FASTA file.
        outfile: destination handed unchanged to SeqIO.write().

    Any argument left as None is taken from sys.argv[1], sys.argv[2] and
    sys.argv[3] respectively, looked up at call time. The script exits
    with an error status if an argument is missing and the command line
    does not provide it.

    ORF coordinates are treated as 1-based and inclusive. When the start
    is greater than the end, the reverse complement of the spanned region
    is written.

    Raises:
        KeyError: a '>' header has no FASTA record with that description.
        ValueError: an ORF line has fewer than three fields, non-integer
            coordinates, or cannot be attributed to a FASTA record.
    """
    # Resolve the command line lazily: evaluating sys.argv[...] in the
    # signature would raise IndexError at definition time, before any
    # argument check can run.
    if glimmer_prediction is None or genome_sequence is None or outfile is None:
        if len(sys.argv) < 4:
            # Message goes to stderr and the exit status is non-zero.
            sys.exit("Missing input values.")
        if glimmer_prediction is None:
            glimmer_prediction = sys.argv[1]
        if genome_sequence is None:
            genome_sequence = sys.argv[2]
        if outfile is None:
            outfile = sys.argv[3]

    seq_records = []
    # Context managers close both inputs even when parsing fails.
    with open(glimmer_prediction, "r") as glimmerfile:
        with open(genome_sequence) as sequence:
            records = list(SeqIO.parse(sequence, "fasta"))
        sequences = dict((record.description, record) for record in records)

        # A prediction file without '>' headers is only unambiguous when
        # the FASTA file holds exactly one record; use it in that case.
        entry = records[0] if len(records) == 1 else None

        for line in glimmerfile:
            if not line.strip():
                continue
            if line.startswith('>'):
                entry = sequences[line[1:].strip()]
                continue

            # Split on whitespace instead of slicing fixed columns so that
            # wider identifiers or coordinates are still read correctly.
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Malformed Glimmer ORF line: %r" % line)
            orf_name = fields[0]
            orf_start = int(fields[1])
            orf_end = int(fields[2])

            if entry is None:
                raise ValueError(
                    "ORF %s appears before any '>' header line." % orf_name
                )

            # NOTE(review): start > end is always read as a reverse-strand
            # ORF; ORFs wrapping around the origin of a circular genome are
            # presumably not handled - confirm against the Glimmer output.
            if orf_start <= orf_end:
                orf_seq = entry.seq[orf_start - 1:orf_end]
            else:
                orf_seq = entry.seq[orf_end - 1:orf_start].reverse_complement()
            seq_records.append(
                SeqRecord(orf_seq, id=orf_name, description=entry.description)
            )

    SeqIO.write(seq_records, outfile, "fasta")
841357e0acbf Uploaded
bgruening
parents:
diff changeset
42
841357e0acbf Uploaded
bgruening
parents:
diff changeset
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    glimmer2seq()