Mercurial > repos > portiahollyoak > genbank_to_fasta
annotate genbank_to_fasta.py @ 0:bcdd1a35e545 draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author | portiahollyoak |
---|---|
date | Fri, 22 Apr 2016 12:09:14 -0400 |
parents | |
children |
rev | line source |
---|---|
0
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
1 #!/usr/bin/env python |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
2 # coding: utf-8 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
3 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
4 import argparse |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
5 import doctest # This will test if the functions are working |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
6 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
7 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
8 def get_id(line): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
9 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
10 This function reads a line and returns the ID name |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
11 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
12 >>> line = 'ID TE standard; DNA; INV; 7411 BP.' |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
13 >>> 'TE'== get_id(line) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
14 True |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
15 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
16 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
17 if line.startswith("ID"): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
18 id = line.split(" ")[1] #split line into 'ID' and rest of line, take rest of line and define as id |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
19 id = id.split(" ")[0] #split id into 'ID name' and rest of line, take ID name and define as id |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
20 return id |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
21 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
22 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
23 def get_seq(line): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
24 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
25 This function reads a sequence line from a genbank file |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
26 and returns a sequence with no spaces or digits |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
27 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
28 >>> line = "AGTGACATAT TCACATACAA AACCACATAA CATAGAGTAA ACATATTGAA AAGCCGCATA 60" |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
29 >>> 'AGTGACATATTCACATACAAAACCACATAACATAGAGTAAACATATTGAAAAGCCGCATA' == get_seq(line) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
30 True |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
31 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
32 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
33 seq = [] |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
34 for char in line: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
35 if not char.isdigit() and not char == " ": # If a character is not a digit or space, |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
36 # it will be added to sequence. |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
37 seq.append(char) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
38 seq = "".join(seq) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
39 return seq |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
40 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
41 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
42 def make_seq_dictionary(input_file_handle): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
43 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
44 This function loops over a multi genbank file and returns |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
45 a collection of ID and corresponding sequence in a dictionary. |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
46 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
47 seq_d = {} # dictionary with id as key and sequence as value |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
48 next_line_is_seq = False |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
49 for line in input_file_handle: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
50 line = line.strip() # strips any leading or trailing whitespace |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
51 if line.startswith("ID"): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
52 id = get_id(line) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
53 seq_d[id]="" # We just create a new key |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
54 if line.startswith("SQ"): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
55 next_line_is_seq = True # If line starts with 'SQ' then state is true |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
56 continue |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
57 if line.startswith("//"): # If line starts with '//' then state is false |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
58 next_line_is_seq = False |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
59 if next_line_is_seq: # Whatever has been read as true, this is copied to file |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
60 seq = get_seq(line) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
61 seq_d[id] += seq |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
62 return seq_d |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
63 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
64 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
65 def write_seq_d_to_file(seq_d, output): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
66 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
67 This function will write the sequence dictionary to an output file |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
68 """ |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
69 for transposon, seq in seq_d.items(): |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
70 output.write(">%s\n" % transposon) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
71 output.write("%s\n" % seq) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
72 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
73 description = ( "This script will extract ID names and sequences from a multigenbank" |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
74 "file and format them into a multifasta file." ) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
75 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
76 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
77 parser = argparse.ArgumentParser(description) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
78 parser.add_argument("input", help="A multi-genbank file.") |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
79 parser.add_argument("output", help="Name of the output fasta file.") |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
80 args = parser.parse_args() |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
81 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
82 try: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
83 with open(args.input, encoding = "utf-8") as input_file_handle: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
84 # This will perform the tasks |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
85 seq_d = make_seq_dictionary(input_file_handle) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
86 except TypeError: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
87 with open(args.input) as input_file_handle: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
88 seq_d = make_seq_dictionary(input_file_handle) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
89 |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
90 with open(args.output, "w") as output: |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
91 write_seq_d_to_file(seq_d, output) |
bcdd1a35e545
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
portiahollyoak
parents:
diff
changeset
|
92 |