comparison genbank_to_fasta.py @ 0:bcdd1a35e545 draft default tip

planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author portiahollyoak
date Fri, 22 Apr 2016 12:09:14 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bcdd1a35e545
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 import argparse
5 import doctest # This will test if the functions are working
6
7
def get_id(line):
    """
    Return the ID name from an ``ID`` line, or None for any other line.

    The name is the second whitespace-delimited token of the line, so it
    is found no matter how many spaces (or tabs) separate the columns.

    Raises IndexError if the line starts with "ID" but contains no
    second token.

    >>> line = 'ID TE standard; DNA; INV; 7411 BP.'
    >>> 'TE' == get_id(line)
    True
    >>> 'TE' == get_id('ID   TE         standard; DNA; INV; 7411 BP.')
    True
    >>> get_id('SQ Sequence 7411 BP;') is None
    True

    """
    if not line.startswith("ID"):
        return None
    # split() without an argument collapses runs of whitespace; splitting
    # on a single " " would return an empty name for padded columns.
    return line.split()[1]
21
22
def get_seq(line):
    """
    Return the sequence letters of one sequence line.

    Every digit (the position counter) and every whitespace character
    (spaces, tabs, line endings) is removed; all other characters are
    kept in their original order.

    >>> line = "AGTGACATAT TCACATACAA AACCACATAA CATAGAGTAA ACATATTGAA AAGCCGCATA 60"
    >>> 'AGTGACATATTCACATACAAAACCACATAACATAGAGTAAACATATTGAAAAGCCGCATA' == get_seq(line)
    True
    >>> get_seq("AGTG\\tACAT 60\\n") == 'AGTGACAT'
    True

    """
    # isspace() rather than == " " so that tabs and newlines on a line
    # that was not stripped beforehand do not end up in the sequence.
    return "".join(
        char for char in line if not (char.isdigit() or char.isspace())
    )
40
41
def make_seq_dictionary(input_file_handle):
    """
    Loop over a multi-record file and return a dictionary that maps
    each record ID to its sequence.

    A record is read as: an ``ID`` line (the name is taken with
    get_id), an ``SQ`` line that opens the sequence block, sequence
    lines (cleaned with get_seq), and a ``//`` line that closes the
    block. All other lines are ignored. If an ID occurs twice, the
    later record replaces the earlier one.

    input_file_handle: any iterable of text lines (e.g. an open file).

    Raises ValueError if sequence data is found before any ID line.
    """
    chunks_by_id = {}  # id -> list of sequence pieces, joined at the end
    current_chunks = None  # piece list of the record being read
    in_sequence = False  # True between an 'SQ' line and the next '//'
    for raw_line in input_file_handle:
        line = raw_line.strip()  # strips any leading or trailing whitespace
        if in_sequence:
            # Inside a sequence block only '//' is a keyword; testing for
            # 'ID'/'SQ' here would misread a stripped sequence line that
            # happens to begin with those letters.
            if line.startswith("//"):
                in_sequence = False
                continue
            if current_chunks is None:
                raise ValueError("Sequence data found before any ID line.")
            current_chunks.append(get_seq(line))
        elif line.startswith("ID"):
            current_chunks = []  # a repeated ID starts over, as before
            chunks_by_id[get_id(line)] = current_chunks
        elif line.startswith("SQ"):
            in_sequence = True
    # Join once per record: growing a string with += for every line
    # copies the whole sequence each time.
    return dict(
        (seq_id, "".join(chunks)) for seq_id, chunks in chunks_by_id.items()
    )
63
64
def write_seq_d_to_file(seq_d, output):
    """
    Write the sequence dictionary to ``output`` as FASTA records.

    For every (key, value) pair two lines are written: a header line
    made of ">" followed by the key, then a line holding the value.
    ``output`` only needs a ``write`` method.
    """
    for record in seq_d.items():
        header, sequence = record
        output.write(">%s\n" % header)
        output.write("%s\n" % sequence)
72
# Help text shown by ``--help``. The first literal ends with a space so
# the two adjacent literals do not fuse into "multigenbankfile".
description = (
    "This script will extract ID names and sequences from a multigenbank "
    "file and format them into a multifasta file."
)


def build_parser():
    """Return the command-line parser (positional ``input`` and ``output``)."""
    # The text must be passed by keyword: the first positional parameter
    # of ArgumentParser is ``prog`` (the program name), not the description.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input", help="A multi-genbank file.")
    parser.add_argument("output", help="Name of the output fasta file.")
    return parser


def main(argv=None):
    """
    Convert the input file named on the command line to a fasta file.

    argv: list of command-line arguments; None means use sys.argv[1:].
    """
    args = build_parser().parse_args(argv)

    # Only the open() call is guarded: a built-in open() that does not
    # accept ``encoding`` (Python 2) raises TypeError, and a TypeError
    # raised while parsing must not trigger a silent second parse.
    try:
        input_file_handle = open(args.input, encoding="utf-8")
    except TypeError:
        input_file_handle = open(args.input)

    with input_file_handle:
        seq_d = make_seq_dictionary(input_file_handle)

    with open(args.output, "w") as output:
        write_seq_d_to_file(seq_d, output)


# Run only when executed as a script, so importing the module (e.g. to
# run its doctests) does not parse sys.argv or touch any file.
if __name__ == "__main__":
    main()
92