annotate genbank_to_fasta.py @ 0:bcdd1a35e545 draft default tip

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Fri, 22 Apr 2016 12:09:14 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
1 #!/usr/bin/env python
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
2 # coding: utf-8
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
3
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
4 import argparse
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
5 import doctest # This will test if the functions are working
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
6
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
7
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
def get_id(line):
    """
    Return the entry name found on an ``ID`` header line.

    The name is the first whitespace-delimited token that follows the
    ``ID`` line code. Splitting on arbitrary whitespace (rather than on a
    single space) means headers whose fields are separated by several
    spaces or by tabs are handled as well.

    :param line: one line of text from the input file.
    :returns: the entry name, or ``None`` when ``line`` does not start
        with ``ID`` or has nothing after the line code.

    >>> line = 'ID TE standard; DNA; INV; 7411 BP.'
    >>> 'TE' == get_id(line)
    True
    >>> 'TE' == get_id('ID   TE   standard; DNA; INV; 7411 BP.')
    True
    >>> get_id('SQ Sequence 7411 BP;') is None
    True

    """
    if not line.startswith("ID"):
        return None
    fields = line.split()
    # fields[0] is the line code itself; the entry name is the next token.
    if len(fields) < 2:
        return None
    return fields[1]
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
21
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
22
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
def get_seq(line):
    """
    Return the residues of one sequence line with layout characters removed.

    Every digit (the running position counter) and every whitespace
    character is dropped; all other characters are kept in their original
    order. Whitespace is matched with ``str.isspace`` so that tabs and a
    trailing newline are removed along with plain spaces.

    :param line: one line taken from the sequence section of an entry.
    :returns: the line's remaining characters joined into a single string
        (an empty string when nothing is left).

    >>> line = "AGTGACATAT TCACATACAA AACCACATAA CATAGAGTAA ACATATTGAA AAGCCGCATA 60"
    >>> 'AGTGACATATTCACATACAAAACCACATAACATAGAGTAAACATATTGAAAAGCCGCATA' == get_seq(line)
    True
    >>> 'ACGTACGT' == get_seq("ACGT\\tACGT 8\\n")
    True

    """
    return "".join(
        char for char in line
        if not (char.isdigit() or char.isspace())
    )
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
40
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
41
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
def make_seq_dictionary(input_file_handle):
    """
    Collect the sequence of every entry in a multi-entry flat file.

    The file is scanned line by line with a small state machine:

    * a line starting with ``ID`` opens a new entry (named by ``get_id``);
    * a line starting with ``SQ`` marks the start of that entry's sequence;
    * every following line is cleaned with ``get_seq`` and appended;
    * a line starting with ``//`` closes the entry.

    NOTE(review): the ID / SQ / // line codes look like the EMBL flat-file
    layout although the script calls the input "genbank" - confirm with
    the data actually fed to this tool.

    :param input_file_handle: an iterable of text lines, e.g. an open file.
    :returns: a dictionary mapping each entry name to its full sequence.
        An entry whose ``ID`` line appears more than once keeps only the
        sequence read after the last occurrence.
    :raises ValueError: if an ``ID`` line carries no name, or if a ``SQ``
        line is met while no entry is open.
    """
    # Fragments are gathered in lists and joined once at the end, which
    # avoids rebuilding an ever longer string for every sequence line.
    fragments = {}
    current_id = None
    in_sequence = False
    for line in input_file_handle:
        line = line.strip()  # drop indentation and the line terminator
        if line.startswith("ID"):
            current_id = get_id(line)
            if current_id is None:
                raise ValueError("ID line without an entry name: %r" % line)
            fragments[current_id] = []
            # A new header always ends any sequence section still open, so
            # the header itself can never be mistaken for sequence data.
            in_sequence = False
            continue
        if line.startswith("SQ"):
            if current_id is None:
                raise ValueError(
                    "sequence section (SQ) found before any ID line")
            in_sequence = True
            continue
        if line.startswith("//"):
            # End of entry: forget the name so that a following entry that
            # lacks an ID line cannot append to this one.
            in_sequence = False
            current_id = None
            continue
        if in_sequence:
            fragments[current_id].append(get_seq(line))
    return dict(
        (entry_id, "".join(parts)) for entry_id, parts in fragments.items()
    )
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
63
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
64
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
def write_seq_d_to_file(seq_d, output):
    """
    Write each entry of ``seq_d`` to ``output`` as a two-line FASTA record.

    A record is a ``>name`` header line followed by the whole sequence on
    one line. Records are written in the dictionary's iteration order.

    :param seq_d: dictionary mapping entry names to sequences.
    :param output: a writable text stream (anything with ``write``).
    """
    for name in seq_d:
        header = ">%s\n" % name
        body = "%s\n" % seq_d[name]
        output.write(header)
        output.write(body)
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
72
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
# Help text shown by ``--help``. Note the trailing space on the first
# fragment: adjacent string literals are concatenated without a separator.
description = (
    "This script will extract ID names and sequences from a multigenbank "
    "file and format them into a multifasta file."
)


def build_parser():
    """
    Build the command-line parser for this script.

    The description is passed by keyword: the first positional parameter
    of ``argparse.ArgumentParser`` is ``prog``, so passing it positionally
    would replace the program name in the usage line instead.

    :returns: an ``argparse.ArgumentParser`` expecting two positional
        arguments, ``input`` and ``output``.
    """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input", help="A multi-genbank file.")
    parser.add_argument("output", help="Name of the output fasta file.")
    return parser


def open_input(path):
    """
    Open ``path`` for reading as text, as UTF-8 where that is supported.

    :param path: name of the file to read.
    :returns: an open file object, to be closed by the caller.
    """
    try:
        return open(path, encoding="utf-8")
    except TypeError:
        # Python 2's built-in open() has no ``encoding`` parameter. Only
        # the open call is guarded, so a TypeError raised while parsing is
        # never mistaken for this case.
        return open(path)


def main(argv=None):
    """
    Convert the input file named on the command line to a multi-FASTA file.

    :param argv: list of argument strings; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    with open_input(args.input) as input_file_handle:
        seq_d = make_seq_dictionary(input_file_handle)

    with open(args.output, "w") as output:
        write_seq_d_to_file(seq_d, output)


# Run only when executed as a script, so that the module can be imported
# (for instance by doctest) without parsing the command line.
if __name__ == "__main__":
    main()
bcdd1a35e545 planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff changeset
92