view aragorn_out_to_gff3.py @ 1:d788d1abe238 draft

Uploaded
author bgruening
date Thu, 22 Jan 2015 13:15:51 -0500
parents
children 358f58401cd6
line wrap: on
line source

#!/usr/bin/env python
import re

def start_pattern(string):
    return re.match(r'^[0-9]+\.$', string) \
        or string.startswith('Number of possible') \
        or string.startswith('Searching for')

def blank_line(string):
    return re.match(r'^\s*$', string)

def blocks(iterable):
    accumulator = []
    run_of_blanklines = 0
    for line in iterable:
        # Count blank lines
        if blank_line(line):
            run_of_blanklines += 1
        else:
            run_of_blanklines = 0

        if start_pattern(line) or run_of_blanklines > 2 or 'Mean G+C' in line:
            if accumulator:
                yield accumulator
                accumulator = [line]
        else:
            accumulator.append(line)
    if accumulator:
        yield accumulator

IMPORTANT_INFO = {
    'trna': re.compile(r'tRNA-(?P<codon>[A-Za-z]{3})\((?P<anticodon>[A-Za-z]{3})\)'),
    'trna-alt': re.compile(r'tRNA-\?\((?P<codon>[^\)]+)\)\((?P<anticodon>[A-Za-z]{2,})\)'),
    'bases': re.compile(r'(?P<bases>[0-9]+) bases, %GC = (?P<gc>[0-9.]+)'),
    'sequence': re.compile(r'Sequence (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
    'possible_pseudogene': re.compile(r'(?P<pseudo>Possible Pseudogene)'),
}
INFO_GROUPS = ('codon', 'anticodon', 'bases', 'gc', 'complement', 'start', 'end', 'pseudo')

def important_info(block):
    info = {}
    for line in block:
        for matcher in IMPORTANT_INFO:
            matches = IMPORTANT_INFO[matcher].search(line)
            if matches:
                for group in INFO_GROUPS:
                    try:
                        info[group] = matches.group(group)
                    except:
                        pass
    return info

IMPORTANT_INFO_TMRNA = {
    'tag_peptide': re.compile(r'Tag peptide:\s+(?P<pep>[A-Z*]*)'),
    'location': re.compile(r'Location (?P<complement>[c]{0,1})\[(?P<start>\d+),(?P<end>\d+)\]'),
}
INFO_GROUPS_TMRNA = ('start', 'end', 'pep')

def important_info_tmrna(block):
    info = {}
    for line in block:
        for matcher in IMPORTANT_INFO_TMRNA:
            matches = IMPORTANT_INFO_TMRNA[matcher].search(line)
            if matches:
                for group in INFO_GROUPS_TMRNA:
                    try:
                        info[group] = matches.group(group)
                    except:
                        pass
    return info

import fileinput
stdin_data = []
for line in fileinput.input():
    stdin_data.append(line)

possible_blocks = [line for line in blocks(stdin_data)]

seqid = None
print '##gff-version-3'
# We're off to a GREAT start, if I'm accessing by index you just know that I'm going to do terrible
# awful things
for block_idx in range(len(possible_blocks)):
    block = possible_blocks[block_idx]
    data = None
    fasta_defline = None

    if block[0].startswith('Searching for') or 'nucleotides in sequence' in block[-1]:
        # Try and get a sequence ID out of it
        try:
            fasta_defline = block[-2].strip()
        except:
            # Failing that, ignore it.
            pass
    else:
        # They DUPLICATE results in multiple places, including a fasta version
        # in the 'full report'.
        possible_ugliness = [x for x in block if x.startswith('>t')]
        if len(possible_ugliness) > 0:
            continue

        # However, if it didn't have one of those all important pieces of
        # information, then it's either a different important piece of
        # information, or complete junk
        data = important_info(block)

        # I am not proud of any of this. We essentially say "if that block
        # didn't come up with useful info, then try making it a tmrna"
        if len(data.keys()) == 0:
            data = important_info_tmrna(block)
            # And if that fails, just none it.
            if len(data.keys()) == 0:
                data = None
            else:
                # But if it didn't, confirm that we're a tmRNA
                data['type'] = 'tmRNA'
        else:
            # If we did have keys, and didn't pass through any of the tmRNA
            # checks, we're tRNA
            data['type'] = 'tRNA'

    # If we got a sequence ID in this block, set the defline
    if 'nucleotides in sequence' in block[-1]:
        try:
            fasta_defline = block[-2].strip()
        except:
            pass

    # if a defline is available, try and extract the fasta header ID
    if fasta_defline is not None:
        try:
            seqid = fasta_defline[0:fasta_defline.index(' ')]
        except:
            seqid = fasta_defline

    # If there's data
    if data is not None and len(data.keys()) > 1:

        # Deal with our flags/notes.
        if data['type'] == 'tRNA':
            # Are these acceptable GFF3 tags?
            notes = {
                'Codon': data['codon'],
                'Anticodon': data['anticodon'],
            }
            if 'pseudo' in data:
                notes['Note'] = 'Possible pseudogene'
        else:
            notes = {
                'Note': 'Tag peptide: ' + data['pep'] + ''
            }

        notestr = ';'.join(['%s="%s"' % (k,v) for k,v in notes.iteritems()])

        print '\t'.join([
            seqid,
            'aragorn',
            data['type'],
            data['start'],
            data['end'],
            '.',
            '.',
            '.',
            notestr
        ])