Mercurial > repos > cpt > cpt_putative_usp
annotate cpt.py @ 1:f7afd1480d0f draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt | 
|---|---|
| date | Mon, 05 Jun 2023 02:51:59 +0000 | 
| parents | |
| children | da1be0114755 | 
| rev | line source | 
|---|---|
| 1 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 1 #!/usr/bin/env python | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 2 import regex as re | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 3 from Bio.Seq import Seq, reverse_complement, translate | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 4 from Bio.SeqRecord import SeqRecord | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 5 from Bio import SeqIO | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 6 from Bio.Data import CodonTable | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 7 import logging | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 8 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 9 logging.basicConfig() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 10 log = logging.getLogger() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 11 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 12 PHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*phage (?P<phage>.*)$") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 13 BACTERIOPHAGE_IN_MIDDLE = re.compile("^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 14 STARTS_WITH_PHAGE = re.compile( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 15 "^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 16 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 17 NEW_STYLE_NAMES = re.compile("(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 18 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 19 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 20 def phage_name_parser(name): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 21 host = None | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 22 phage = None | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 23 name = name.replace(", complete genome.", "") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 24 name = name.replace(", complete genome", "") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 25 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 26 m = BACTERIOPHAGE_IN_MIDDLE.match(name) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 27 if m: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 28 host = m.group("host") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 29 phage = m.group("phage") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 30 return (host, phage) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 31 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 32 m = PHAGE_IN_MIDDLE.match(name) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 33 if m: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 34 host = m.group("host") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 35 phage = m.group("phage") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 36 return (host, phage) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 37 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 38 m = STARTS_WITH_PHAGE.match(name) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 39 if m: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 40 phage = m.group("phage") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 41 return (host, phage) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 42 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 43 m = NEW_STYLE_NAMES.match(name) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 44 if m: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 45 phage = m.group("phage") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 46 return (host, phage) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 47 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 48 return (host, phage) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 49 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 50 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 51 class OrfFinder(object): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 52 def __init__(self, table, ftype, ends, min_len, strand): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 53 self.table = table | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 54 self.table_obj = CodonTable.ambiguous_generic_by_id[table] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 55 self.ends = ends | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 56 self.ftype = ftype | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 57 self.min_len = min_len | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 58 self.starts = sorted(self.table_obj.start_codons) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 59 self.stops = sorted(self.table_obj.stop_codons) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 60 self.re_starts = re.compile("|".join(self.starts)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 61 self.re_stops = re.compile("|".join(self.stops)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 62 self.strand = strand | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 63 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 64 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 65 seq_format = "fasta" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 66 log.debug("Genetic code table %i" % self.table) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 67 log.debug("Minimum length %i aa" % self.min_len) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 68 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 69 out_count = 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 70 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 71 out_gff3.write("##gff-version 3\n") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 72 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 73 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 74 for i, (f_start, f_end, f_strand, n, t) in enumerate( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 75 self.get_all_peptides(str(record.seq).upper()) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 76 ): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 77 out_count += 1 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 78 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 79 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 80 len(t), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 81 len(n), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 82 f_start, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 83 f_end, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 84 f_strand, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 85 record.description, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 86 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 87 fid = record.id + "|%s%i" % (self.ftype, i + 1) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 88 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 89 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 90 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 91 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 92 SeqIO.write(r, out_nuc, "fasta") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 93 SeqIO.write(t, out_prot, "fasta") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 94 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 95 nice_strand = "+" if f_strand == +1 else "-" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 96 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 97 out_bed.write( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 98 "\t".join( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 99 map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 100 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 101 + "\n" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 102 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 103 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 104 out_gff3.write( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 105 "\t".join( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 106 map( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 107 str, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 108 [ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 109 record.id, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 110 "getOrfsOrCds", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 111 "CDS", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 112 f_start + 1, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 113 f_end, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 114 ".", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 115 nice_strand, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 116 0, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 117 "ID=%s.%s.%s" % (self.ftype, idx, i + 1), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 118 ], | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 119 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 120 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 121 + "\n" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 122 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 123 log.info("Found %i %ss", out_count, self.ftype) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 124 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 125 def start_chop_and_trans(self, s, strict=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 126 """Returns offset, trimmed nuc, protein.""" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 127 if strict: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 128 assert s[-3:] in self.stops, s | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 129 assert len(s) % 3 == 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 130 for match in self.re_starts.finditer(s, overlapped=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 131 # Must check the start is in frame | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 132 start = match.start() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 133 if start % 3 == 0: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 134 n = s[start:] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 135 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 136 if strict: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 137 t = translate(n, self.table) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 138 else: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 139 # Use when missing stop codon, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 140 t = "M" + translate(n[3:], self.table, to_stop=True) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 141 yield start, n, t # Edited by CPT to be a generator | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 142 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 143 def break_up_frame(self, s): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 144 """Returns offset, nuc, protein.""" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 145 start = 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 146 for match in self.re_stops.finditer(s, overlapped=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 147 index = match.start() + 3 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 148 if index % 3 != 0: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 149 continue | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 150 n = s[start:index] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 151 for (offset, n, t) in self.start_chop_and_trans(n): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 152 if n and len(t) >= self.min_len: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 153 yield start + offset, n, t | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 154 start = index | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 155 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 156 def putative_genes_in_sequence(self, nuc_seq): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 157 """Returns start, end, strand, nucleotides, protein. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 158 Co-ordinates are Python style zero-based. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 159 """ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 160 nuc_seq = nuc_seq.upper() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 161 # TODO - Refactor to use a generator function (in start order) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 162 # rather than making a list and sorting? | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 163 answer = [] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 164 full_len = len(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 165 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 166 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 167 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 168 start = frame + offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 169 answer.append((start, start + len(n), +1, n, t)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 170 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 171 rc = reverse_complement(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 172 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 173 for offset, n, t in self.break_up_frame(rc[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 174 start = full_len - frame - offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 175 answer.append((start, start - len(n), -1, n, t)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 176 answer.sort() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 177 return answer | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 178 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 179 def get_all_peptides(self, nuc_seq): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 180 """Returns start, end, strand, nucleotides, protein. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 181 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 182 Co-ordinates are Python style zero-based. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 183 """ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 184 # Refactored into generator by CPT | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 185 full_len = len(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 186 if self.strand != "reverse": | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 187 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 188 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 189 start = frame + offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 190 yield (start, start + len(n), +1, n, t) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 191 if self.strand != "forward": | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 192 rc = reverse_complement(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 193 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 194 for offset, n, t in self.break_up_frame(rc[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 195 start = full_len - frame - offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 196 yield (start - len(n), start, -1, n, t) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 197 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 198 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 199 class MGAFinder(object): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 200 def __init__(self, table, ftype, ends, min_len): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 201 self.table = table | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 202 self.table_obj = CodonTable.ambiguous_generic_by_id[table] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 203 self.ends = ends | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 204 self.ftype = ftype | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 205 self.min_len = min_len | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 206 self.starts = sorted(self.table_obj.start_codons) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 207 self.stops = sorted(self.table_obj.stop_codons) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 208 self.re_starts = re.compile("|".join(self.starts)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 209 self.re_stops = re.compile("|".join(self.stops)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 210 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 211 def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 212 seq_format = "fasta" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 213 log.debug("Genetic code table %i" % self.table) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 214 log.debug("Minimum length %i aa" % self.min_len) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 215 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 216 out_count = 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 217 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 218 out_gff3.write("##gff-version 3\n") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 219 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 220 for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 221 for i, (f_start, f_end, f_strand, n, t) in enumerate( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 222 self.get_all_peptides(str(record.seq).upper()) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 223 ): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 224 out_count += 1 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 225 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 226 descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 227 len(t), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 228 len(n), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 229 f_start, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 230 f_end, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 231 f_strand, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 232 record.description, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 233 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 234 fid = record.id + "|%s%i" % (self.ftype, i + 1) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 235 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 236 r = SeqRecord(Seq(n), id=fid, name="", description=descr) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 237 t = SeqRecord(Seq(t), id=fid, name="", description=descr) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 238 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 239 SeqIO.write(r, out_nuc, "fasta") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 240 SeqIO.write(t, out_prot, "fasta") | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 241 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 242 nice_strand = "+" if f_strand == +1 else "-" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 243 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 244 out_bed.write( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 245 "\t".join( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 246 map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 247 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 248 + "\n" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 249 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 250 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 251 out_gff3.write( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 252 "\t".join( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 253 map( | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 254 str, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 255 [ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 256 record.id, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 257 "getOrfsOrCds", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 258 "CDS", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 259 f_start + 1, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 260 f_end, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 261 ".", | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 262 nice_strand, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 263 0, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 264 "ID=%s.%s.%s" % (self.ftype, idx, i + 1), | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 265 ], | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 266 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 267 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 268 + "\n" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 269 ) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 270 log.info("Found %i %ss", out_count, self.ftype) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 271 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 272 def start_chop_and_trans(self, s, strict=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 273 """Returns offset, trimmed nuc, protein.""" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 274 if strict: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 275 assert s[-3:] in self.stops, s | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 276 assert len(s) % 3 == 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 277 for match in self.re_starts.finditer(s, overlapped=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 278 # Must check the start is in frame | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 279 start = match.start() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 280 if start % 3 == 0: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 281 n = s[start:] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 282 assert len(n) % 3 == 0, "%s is len %i" % (n, len(n)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 283 if strict: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 284 t = translate(n, self.table) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 285 else: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 286 # Use when missing stop codon, | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 287 t = "M" + translate(n[3:], self.table, to_stop=True) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 288 yield start, n, t | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 289 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 290 def break_up_frame(self, s): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 291 """Returns offset, nuc, protein.""" | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 292 start = 0 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 293 for match in self.re_stops.finditer(s, overlapped=True): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 294 index = match.start() + 3 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 295 if index % 3 != 0: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 296 continue | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 297 n = s[start:index] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 298 for (offset, n, t) in self.start_chop_and_trans(n): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 299 if n and len(t) >= self.min_len: | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 300 yield start + offset, n, t | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 301 start = index | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 302 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 303 def putative_genes_in_sequence(self, nuc_seq): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 304 """Returns start, end, strand, nucleotides, protein. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 305 Co-ordinates are Python style zero-based. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 306 """ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 307 nuc_seq = nuc_seq.upper() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 308 # TODO - Refactor to use a generator function (in start order) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 309 # rather than making a list and sorting? | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 310 answer = [] | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 311 full_len = len(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 312 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 313 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 314 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 315 start = frame + offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 316 answer.append((start, start + len(n), +1, n, t)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 317 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 318 rc = reverse_complement(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 319 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 320 for offset, n, t in self.break_up_frame(rc[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 321 start = full_len - frame - offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 322 answer.append((start, start - len(n), -1, n, t)) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 323 answer.sort() | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 324 return answer | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 325 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 326 def get_all_peptides(self, nuc_seq): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 327 """Returns start, end, strand, nucleotides, protein. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 328 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 329 Co-ordinates are Python style zero-based. | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 330 """ | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 331 # Refactored into generator by CPT | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 332 | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 333 full_len = len(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 334 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 335 for offset, n, t in self.break_up_frame(nuc_seq[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 336 start = frame + offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 337 yield (start, start + len(n), +1, n, t) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 338 rc = reverse_complement(nuc_seq) | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 339 for frame in range(0, 3): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 340 for offset, n, t in self.break_up_frame(rc[frame:]): | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 341 start = full_len - frame - offset # zero based | 
| 
f7afd1480d0f
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
 cpt parents: diff
changeset | 342 yield (start - len(n), start, -1, n, t) | 
