Mercurial > repos > petr-novak > dante
comparison dante_gff_to_dna.py @ 17:1a766f9f623d draft
Uploaded
| author | petr-novak |
|---|---|
| date | Mon, 16 Sep 2019 03:54:45 -0400 |
| parents | d0431a839606 |
| children |
comparison
equal
deleted
inserted
replaced
| 16:0e820310d4dc | 17:1a766f9f623d |
|---|---|
| 5 import os | 5 import os |
| 6 import textwrap | 6 import textwrap |
| 7 from collections import defaultdict | 7 from collections import defaultdict |
| 8 from Bio import SeqIO | 8 from Bio import SeqIO |
| 9 import configuration | 9 import configuration |
| 10 | 10 from dante_gff_output_filtering import parse_gff_line |
| 11 t_nt_seqs_extraction = time.time() | 11 t_nt_seqs_extraction = time.time() |
| 12 | 12 |
| 13 | 13 |
| 14 def str2bool(v): | 14 def str2bool(v): |
| 15 if v.lower() in ('yes', 'true', 't', 'y', '1'): | 15 if v.lower() in ('yes', 'true', 't', 'y', '1'): |
| 32 | 32 |
| 33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED): | 33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED): |
| 34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq. | 34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq. |
| 35 Sequences are saved in fasta files separately for each transposon lineage. | 35 Sequences are saved in fasta files separately for each transposon lineage. |
| 36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL. | 36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL. |
| 37 The positions can be extended (optional) based on what part of database domain was aligned (Best_Hit_DB_Pos attribute). | 37 The positions can be extended (optional) based on what part of database domain was aligned |
| 38 (Best_Hit_DB_Pos attribute). | |
| 38 The strand orientation needs to be considered in extending and extracting the sequence itself | 39 The strand orientation needs to be considered in extending and extracting the sequence itself |
| 39 ''' | 40 ''' |
| 40 [count_comment, first_line] = check_file_start(DOM_GFF) | 41 [count_comment, first_line] = check_file_start(DOM_GFF) |
| 41 unique_classes = get_unique_classes(CLASS_TBL) | 42 unique_classes = get_unique_classes(CLASS_TBL) |
| 42 files_dict = defaultdict(str) | 43 files_dict = defaultdict(str) |
| 47 next(domains) | 48 next(domains) |
| 48 seq_id_stored = first_line.split("\t")[0] | 49 seq_id_stored = first_line.split("\t")[0] |
| 49 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta')) | 50 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta')) |
| 50 seq_nt = allSeqs[seq_id_stored] | 51 seq_nt = allSeqs[seq_id_stored] |
| 51 for line in domains: | 52 for line in domains: |
| 52 seq_id = line.split("\t")[0] | 53 gff_line = parse_gff_line(line) |
| 53 dom_type = line.split("\t")[8].split(";")[0].split("=")[1] | 54 elem_type = gff_line['attributes']['Final_Classification'] |
| 54 elem_type = line.split("\t")[8].split(";")[1].split("=")[1] | 55 if elem_type == configuration.AMBIGUOUS_TAG: |
| 55 strand = line.split("\t")[6] | 56 continue # skip ambiguous classification |
| 56 align_nt_start = int(line.split("\t")[8].split(";")[3].split(":")[ | 57 seq_id = gff_line['seqid'] |
| 58 dom_type = gff_line['attributes']['Name'] | |
| 59 strand = gff_line['strand'] | |
| 60 align_nt_start = int(gff_line['attributes']['Best_Hit'].split(":")[ | |
| 57 -1].split("-")[0]) | 61 -1].split("-")[0]) |
| 58 align_nt_end = int(line.split("\t")[8].split(";")[3].split(":")[ | 62 align_nt_end = int(gff_line['attributes']['Best_Hit'].split(":")[ |
| 59 -1].split("-")[1].split("[")[0]) | 63 -1].split("-")[1].split("[")[0]) |
| 60 if seq_id != seq_id_stored: | 64 if seq_id != seq_id_stored: |
| 61 seq_id_stored = seq_id | 65 seq_id_stored = seq_id |
| 62 seq_nt = allSeqs[seq_id_stored] | 66 seq_nt = allSeqs[seq_id_stored] |
| 63 if EXTENDED: | 67 if EXTENDED: |
| 64 ## which part of database sequence was aligned | 68 ## which part of database sequence was aligned |
| 65 db_part = line.split("\t")[8].split(";")[4].split("=")[1] | 69 db_part = gff_line['attributes']['Best_Hit_DB_Pos'] |
| 70 ## db_part = line.split("\t")[8].split(";")[4].split("=")[1] | |
| 66 ## datatabse seq length | 71 ## datatabse seq length |
| 67 dom_len = int(db_part.split("of")[1]) | 72 dom_len = int(db_part.split("of")[1]) |
| 68 ## start of alignment on database seq | 73 ## start of alignment on database seq |
| 69 db_start = int(db_part.split("of")[0].split(":")[0]) | 74 db_start = int(db_part.split("of")[0].split(":")[0]) |
| 70 ## end of alignment on database seq | 75 ## end of alignment on database seq |
