Mercurial > repos > petr-novak > dante
comparison dante_gff_to_dna.py @ 17:1a766f9f623d draft
Uploaded
author | petr-novak |
---|---|
date | Mon, 16 Sep 2019 03:54:45 -0400 |
parents | d0431a839606 |
children |
comparison
equal
deleted
inserted
replaced
16:0e820310d4dc | 17:1a766f9f623d |
---|---|
5 import os | 5 import os |
6 import textwrap | 6 import textwrap |
7 from collections import defaultdict | 7 from collections import defaultdict |
8 from Bio import SeqIO | 8 from Bio import SeqIO |
9 import configuration | 9 import configuration |
10 | 10 from dante_gff_output_filtering import parse_gff_line |
11 t_nt_seqs_extraction = time.time() | 11 t_nt_seqs_extraction = time.time() |
12 | 12 |
13 | 13 |
14 def str2bool(v): | 14 def str2bool(v): |
15 if v.lower() in ('yes', 'true', 't', 'y', '1'): | 15 if v.lower() in ('yes', 'true', 't', 'y', '1'): |
32 | 32 |
33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED): | 33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED): |
34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq. | 34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq. |
35 Sequences are saved in fasta files separately for each transposon lineage. | 35 Sequences are saved in fasta files separately for each transposon lineage. |
36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL. | 36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL. |
37 The positions can be extended (optional) based on what part of database domain was aligned (Best_Hit_DB_Pos attribute). | 37 The positions can be extended (optional) based on what part of database domain was aligned |
38 (Best_Hit_DB_Pos attribute). | |
38 The strand orientation needs to be considered in extending and extracting the sequence itself | 39 The strand orientation needs to be considered in extending and extracting the sequence itself |
39 ''' | 40 ''' |
40 [count_comment, first_line] = check_file_start(DOM_GFF) | 41 [count_comment, first_line] = check_file_start(DOM_GFF) |
41 unique_classes = get_unique_classes(CLASS_TBL) | 42 unique_classes = get_unique_classes(CLASS_TBL) |
42 files_dict = defaultdict(str) | 43 files_dict = defaultdict(str) |
47 next(domains) | 48 next(domains) |
48 seq_id_stored = first_line.split("\t")[0] | 49 seq_id_stored = first_line.split("\t")[0] |
49 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta')) | 50 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta')) |
50 seq_nt = allSeqs[seq_id_stored] | 51 seq_nt = allSeqs[seq_id_stored] |
51 for line in domains: | 52 for line in domains: |
52 seq_id = line.split("\t")[0] | 53 gff_line = parse_gff_line(line) |
53 dom_type = line.split("\t")[8].split(";")[0].split("=")[1] | 54 elem_type = gff_line['attributes']['Final_Classification'] |
54 elem_type = line.split("\t")[8].split(";")[1].split("=")[1] | 55 if elem_type == configuration.AMBIGUOUS_TAG: |
55 strand = line.split("\t")[6] | 56 continue # skip ambiguous classification |
56 align_nt_start = int(line.split("\t")[8].split(";")[3].split(":")[ | 57 seq_id = gff_line['seqid'] |
58 dom_type = gff_line['attributes']['Name'] | |
59 strand = gff_line['strand'] | |
60 align_nt_start = int(gff_line['attributes']['Best_Hit'].split(":")[ | |
57 -1].split("-")[0]) | 61 -1].split("-")[0]) |
58 align_nt_end = int(line.split("\t")[8].split(";")[3].split(":")[ | 62 align_nt_end = int(gff_line['attributes']['Best_Hit'].split(":")[ |
59 -1].split("-")[1].split("[")[0]) | 63 -1].split("-")[1].split("[")[0]) |
60 if seq_id != seq_id_stored: | 64 if seq_id != seq_id_stored: |
61 seq_id_stored = seq_id | 65 seq_id_stored = seq_id |
62 seq_nt = allSeqs[seq_id_stored] | 66 seq_nt = allSeqs[seq_id_stored] |
63 if EXTENDED: | 67 if EXTENDED: |
64 ## which part of database sequence was aligned | 68 ## which part of database sequence was aligned |
65 db_part = line.split("\t")[8].split(";")[4].split("=")[1] | 69 db_part = gff_line['attributes']['Best_Hit_DB_Pos'] |
70 ## db_part = line.split("\t")[8].split(";")[4].split("=")[1] | |
66 ## datatabse seq length | 71 ## datatabse seq length |
67 dom_len = int(db_part.split("of")[1]) | 72 dom_len = int(db_part.split("of")[1]) |
68 ## start of alignment on database seq | 73 ## start of alignment on database seq |
69 db_start = int(db_part.split("of")[0].split(":")[0]) | 74 db_start = int(db_part.split("of")[0].split(":")[0]) |
70 ## end of alignment on database seq | 75 ## end of alignment on database seq |