comparison dante_gff_to_dna.py @ 17:1a766f9f623d draft

Uploaded
author petr-novak
date Mon, 16 Sep 2019 03:54:45 -0400
parents d0431a839606
children
comparison
equal deleted inserted replaced
16:0e820310d4dc 17:1a766f9f623d
5 import os 5 import os
6 import textwrap 6 import textwrap
7 from collections import defaultdict 7 from collections import defaultdict
8 from Bio import SeqIO 8 from Bio import SeqIO
9 import configuration 9 import configuration
10 10 from dante_gff_output_filtering import parse_gff_line
11 t_nt_seqs_extraction = time.time() 11 t_nt_seqs_extraction = time.time()
12 12
13 13
14 def str2bool(v): 14 def str2bool(v):
15 if v.lower() in ('yes', 'true', 't', 'y', '1'): 15 if v.lower() in ('yes', 'true', 't', 'y', '1'):
32 32
33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED): 33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED):
34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq. 34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq.
35 Sequences are saved in fasta files separately for each transposon lineage. 35 Sequences are saved in fasta files separately for each transposon lineage.
36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL. 36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL.
37 The positions can be extended (optional) based on what part of database domain was aligned (Best_Hit_DB_Pos attribute). 37 The positions can be extended (optional) based on what part of database domain was aligned
38 (Best_Hit_DB_Pos attribute).
38 The strand orientation needs to be considered in extending and extracting the sequence itself 39 The strand orientation needs to be considered in extending and extracting the sequence itself
39 ''' 40 '''
40 [count_comment, first_line] = check_file_start(DOM_GFF) 41 [count_comment, first_line] = check_file_start(DOM_GFF)
41 unique_classes = get_unique_classes(CLASS_TBL) 42 unique_classes = get_unique_classes(CLASS_TBL)
42 files_dict = defaultdict(str) 43 files_dict = defaultdict(str)
47 next(domains) 48 next(domains)
48 seq_id_stored = first_line.split("\t")[0] 49 seq_id_stored = first_line.split("\t")[0]
49 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta')) 50 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta'))
50 seq_nt = allSeqs[seq_id_stored] 51 seq_nt = allSeqs[seq_id_stored]
51 for line in domains: 52 for line in domains:
52 seq_id = line.split("\t")[0] 53 gff_line = parse_gff_line(line)
53 dom_type = line.split("\t")[8].split(";")[0].split("=")[1] 54 elem_type = gff_line['attributes']['Final_Classification']
54 elem_type = line.split("\t")[8].split(";")[1].split("=")[1] 55 if elem_type == configuration.AMBIGUOUS_TAG:
55 strand = line.split("\t")[6] 56 continue # skip ambiguous classification
56 align_nt_start = int(line.split("\t")[8].split(";")[3].split(":")[ 57 seq_id = gff_line['seqid']
58 dom_type = gff_line['attributes']['Name']
59 strand = gff_line['strand']
60 align_nt_start = int(gff_line['attributes']['Best_Hit'].split(":")[
57 -1].split("-")[0]) 61 -1].split("-")[0])
58 align_nt_end = int(line.split("\t")[8].split(";")[3].split(":")[ 62 align_nt_end = int(gff_line['attributes']['Best_Hit'].split(":")[
59 -1].split("-")[1].split("[")[0]) 63 -1].split("-")[1].split("[")[0])
60 if seq_id != seq_id_stored: 64 if seq_id != seq_id_stored:
61 seq_id_stored = seq_id 65 seq_id_stored = seq_id
62 seq_nt = allSeqs[seq_id_stored] 66 seq_nt = allSeqs[seq_id_stored]
63 if EXTENDED: 67 if EXTENDED:
64 ## which part of database sequence was aligned 68 ## which part of database sequence was aligned
65 db_part = line.split("\t")[8].split(";")[4].split("=")[1] 69 db_part = gff_line['attributes']['Best_Hit_DB_Pos']
70 ## db_part = line.split("\t")[8].split(";")[4].split("=")[1]
66 ## datatabse seq length 71 ## datatabse seq length
67 dom_len = int(db_part.split("of")[1]) 72 dom_len = int(db_part.split("of")[1])
68 ## start of alignment on database seq 73 ## start of alignment on database seq
69 db_start = int(db_part.split("of")[0].split(":")[0]) 74 db_start = int(db_part.split("of")[0].split(":")[0])
70 ## end of alignment on database seq 75 ## end of alignment on database seq