Mercurial > repos > portiahollyoak > genbank_to_fasta
diff genbank_to_fasta.py @ 0:bcdd1a35e545 draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
author | portiahollyoak |
---|---|
date | Fri, 22 Apr 2016 12:09:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genbank_to_fasta.py Fri Apr 22 12:09:14 2016 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# coding: utf-8 + +import argparse +import doctest # This will test if the functions are working + + +def get_id(line): + """ + This function reads a line and returns the ID name + + >>> line = 'ID TE standard; DNA; INV; 7411 BP.' + >>> 'TE'== get_id(line) + True + + """ + if line.startswith("ID"): + id = line.split(" ")[1] #split line into 'ID' and rest of line, take rest of line and define as id + id = id.split(" ")[0] #split id into 'ID name' and rest of line, take ID name and define as id + return id + + +def get_seq(line): + """ + This function reads a sequence line from a genbank file + and returns a sequence with no spaces or digits + + >>> line = "AGTGACATAT TCACATACAA AACCACATAA CATAGAGTAA ACATATTGAA AAGCCGCATA 60" + >>> 'AGTGACATATTCACATACAAAACCACATAACATAGAGTAAACATATTGAAAAGCCGCATA' == get_seq(line) + True + + """ + seq = [] + for char in line: + if not char.isdigit() and not char == " ": # If a character is not a digit or space, + # it will be added to sequence. + seq.append(char) + seq = "".join(seq) + return seq + + +def make_seq_dictionary(input_file_handle): + """ + This function loops over a multi genbank file and returns + a collection of ID and corresponding sequence in a dictionary. + """ + seq_d = {} # dictionary with id as key and sequence as value + next_line_is_seq = False + for line in input_file_handle: + line = line.strip() # strips any leading or trailing whitespace + if line.startswith("ID"): + id = get_id(line) + seq_d[id]="" # We just create a new key + if line.startswith("SQ"): + next_line_is_seq = True # If line starts with 'SQ' then state is true + continue + if line.startswith("//"): # If line starts with '//' then state is false + next_line_is_seq = False + if next_line_is_seq: # Whatever has been read as true, this is copied to file + seq = get_seq(line) + seq_d[id] += seq + return seq_d + + +def write_seq_d_to_file(seq_d, output): + """ + This function will write the sequence dictionary to an output file + """ + for transposon, seq in seq_d.items(): + output.write(">%s\n" % transposon) + output.write("%s\n" % seq) + +description = ( "This script will extract ID names and sequences from a multigenbank" + "file and format them into a multifasta file." ) + + +parser = argparse.ArgumentParser(description) +parser.add_argument("input", help="A multi-genbank file.") +parser.add_argument("output", help="Name of the output fasta file.") +args = parser.parse_args() + +try: + with open(args.input, encoding = "utf-8") as input_file_handle: + # This will perform the tasks + seq_d = make_seq_dictionary(input_file_handle) +except TypeError: + with open(args.input) as input_file_handle: + seq_d = make_seq_dictionary(input_file_handle) + +with open(args.output, "w") as output: + write_seq_d_to_file(seq_d, output) +