view profrep_refining.py @ 4:e27e86406f56 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 10:23:50 -0400
parents a5f1638b73be
children
line wrap: on
line source

#!/usr/bin/env python3

import argparse
import numpy as np
import os
from tempfile import NamedTemporaryFile
from collections import defaultdict
import configuration
import sys


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_dom_gff(DOM_GFF):
    ''' Check if the GFF file of protein domains was given '''
    with open(DOM_GFF) as domains_f:
        line = domains_f.readline()
        while line.startswith("#"):
            line = domains_f.readline()
        if len(line.split("\t")) == 9 and "Final_Classification" in line:
            pass
        else:
            raise IOError(
                "There was detected an input GFF file that does not contain domains. Please check it and choose the domains GFF file")


def create_dom_dict(DOM_GFF):
    ''' Create hash table of protein domains for individual sequences '''
    check_dom_gff(DOM_GFF)
    dict_domains = defaultdict(list)
    count_comment = check_file_start(DOM_GFF)
    with open(DOM_GFF, "r") as domains:
        for comment_idx in range(count_comment):
            next(domains)
        for line in domains:
            seq_id = line.split("\t")[0]
            ann_dom_lineage = line.split("\t")[8].split(";")[1].split("=")[-1]
            start_dom = int(line.split("\t")[3])
            end_dom = int(line.split("\t")[4])
            strand_dom = line.split("\t")[6]
            dict_domains[seq_id].append((start_dom, end_dom, ann_dom_lineage,
                                         strand_dom))
    for seq_id in dict_domains.keys():
        dict_domains[seq_id] = sorted(dict_domains[seq_id], key=lambda x: x[0])
    return dict_domains


def check_file_start(gff_file):
    count_comment = 0
    with open(gff_file, "r") as gff_all:
        line = gff_all.readline()
        while line.startswith("#"):
            line = gff_all.readline()
            count_comment += 1
    return count_comment


def interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM,
                           dict_domains, domains_classes):
    ''' Second step of refining - INTERVALS INTERCONNECTING:
	Gradually checking regions of GFF which are sorted by starting point.
	Adding regions to be interconnected if the gap between the new and previous one is lower than threshold and theirclassification is the same as the one set by the first region.
	If the conditions are not fullfilled the adding is stopped and this whole expanded region is further evaluated:
	1. if domains are not included in joining (A. choosed as parameter or B. based on the classification the element does not belong to mobile elements):
		-> the fragments are joined reported as one region in refined gff
	2. if domains should be included:
		-> the fragments are joined if domains within the expanded region meets the criteria:
			1. they are at least certain number of domains 
			2. they have equal strand orientation 
			3. they are classified to the last classification level (checked from the class. table of the database) and this matches the classification of the expanded region
		-> otherwise the region is refragmented to previous parts, which are reported as they were in the original repeats gff
	'''
    with open(OUT_REFINED, "w") as joined_intervals:
        start_line = check_file_start(gff_removed)
        with open(gff_removed, "r") as repeats:
            for comment_idx in range(start_line):
                joined_intervals.write(repeats.readline())
            joined = False
            initial = repeats.readline()
            ## if there are repeats in GFF, initialize
            if initial is not "":
                seq_id_ini = initial.rstrip().split("\t")[0]
                start_ini = int(initial.rstrip().split("\t")[3])
                end_ini = int(initial.rstrip().split("\t")[4])
                ann_ini = initial.rstrip().split("\t")[8].split(";")[0].split(
                    "=")[-1]
                starts_part = [start_ini]
                ends_part = [end_ini]
                for line in repeats:
                    seq_id = line.rstrip().split("\t")[0]
                    start = int(line.rstrip().split("\t")[3])
                    end = int(line.rstrip().split("\t")[4])
                    ann = line.rstrip().split("\t")[8].split(";")[0].split(
                        "=")[-1]
                    ## if new sequence is detected the joining process is reset
                    if seq_id != seq_id_ini:
                        ## handle the last expanded region from the previous sequence
                        if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                            # check the domains
                            dict_domains = dom_refining(
                                joined_intervals, dict_domains, seq_id_ini,
                                start_ini, end_ini, ann_ini, starts_part,
                                ends_part, DOM_NUM, domains_classes)
                        else:
                            # write the joined region if domains should not be considered
                            joined_intervals.write(
                                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                                    seq_id_ini, configuration.SOURCE_PROFREP,
                                    configuration.REPEATS_FEATURE, start_ini,
                                    end_ini, configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY, ann_ini))
                        # initialize the search for expanded intervals
                        starts_part = [start]
                        ends_part = [end]
                        joined = False
                        seq_id_ini = seq_id
                        start_ini = start
                        end_ini = end
                        ann_ini = ann
                    ## prolonging the potential region to be expanded
                    elif start - end_ini <= GAP_TH and ann == ann_ini:
                        starts_part.append(start)
                        ends_part.append(end)
                        end_ini = end
                        joined = True
                    ## end of expanding the interval
                    else:
                        if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                            dict_domains = dom_refining(
                                joined_intervals, dict_domains, seq_id,
                                start_ini, end_ini, ann_ini, starts_part,
                                ends_part, DOM_NUM, domains_classes)
                        else:
                            joined_intervals.write(
                                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                                    seq_id, configuration.SOURCE_PROFREP,
                                    configuration.REPEATS_FEATURE, start_ini,
                                    end_ini, configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY, ann_ini))
                        starts_part = [start]
                        ends_part = [end]
                        # after handling the expanded interval,  set the "joined" indicator to be false, so the next region expanding and potential joining can begin
                        joined = False
                        start_ini = start
                        end_ini = end
                        ann_ini = ann
                ## handle the last potentially expanded region
                if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                    # check the domains
                    dict_domains = dom_refining(
                        joined_intervals, dict_domains, seq_id, start_ini,
                        end_ini, ann_ini, starts_part, ends_part, DOM_NUM,
                        domains_classes)
                else:
                    # write the joined region
                    joined_intervals.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                            seq_id, configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, start_ini, end_ini,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, ann_ini))
                del (starts_part)
                del (ends_part)


def cluster_regions_for_quality_check(REPEATS_GFF, BORDERS):
    ''' First step of refining - REMOVING LOW CONFIDENCE REPEATS REGIONS
	Create clusters of overlapping regions from REPEATS_GFF. 
	These clusters subsequently serves for removing nested regions of different classification which have significantly lower quality comparing to the region they are nested in. 
	Overhang tolerance of couple of bases on the both sides is tolerated when evaluting the nested regions.	
	'''
    start_line = check_file_start(REPEATS_GFF)
    cluster = []
    end_cluster = 0
    seq_id_ini = None
    gff_removed_parts = NamedTemporaryFile(delete=False)
    with open(REPEATS_GFF, "r") as repeats:
        for comment_idx in range(start_line):
            gff_removed_parts.write(repeats.readline().encode("utf-8"))
        for interval in repeats:
            seq_id = interval.split("\t")[0]
            start = int(interval.split("\t")[3])
            end = int(interval.split("\t")[4])
            ann = interval.split("\t")[8].split(";")[0].split("=")[-1]
            pid = int(interval.rstrip().split("\t")[8].split(";")[1].split(
                "=")[-1])
            if seq_id != seq_id_ini:
                if len(cluster) > 1:
                    remove_low_qual(cluster, BORDERS, gff_removed_parts)
                elif cluster:
                    gff_removed_parts.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                            cluster[0][0], configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, cluster[0][
                                1], cluster[0][2], configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            cluster[0][3], cluster[0][4]).encode("utf-8"))
                seq_id_ini = seq_id
                cluster = []
            ## expanding the cluster
            if start <= end_cluster or end_cluster == 0:
                cluster.append([seq_id, start, end, ann, pid])
                if end > end_cluster:
                    end_cluster = end
            ## if the next region is not overlapping, evaluate the regions in cluster or write down the single region
            else:
                if len(cluster) > 1:
                    remove_low_qual(cluster, BORDERS, gff_removed_parts)
                elif cluster:
                    gff_removed_parts.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                            cluster[0][0], configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, cluster[0][
                                1], cluster[0][2], configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            cluster[0][3], cluster[0][4]).encode("utf-8"))
                end_cluster = end
                cluster = [[seq_id, start, end, ann, pid]]
    if cluster:
        remove_low_qual(cluster, BORDERS, gff_removed_parts)
    gff_removed_parts.close()
    return gff_removed_parts.name


def remove_low_qual(cluster, BORDERS, gff_removed_parts):
    ''' Loop over regions in the cluster which are sorted based on descending quality (PID).
	Regions nested in the currently checked region (with some exceeding borders tolerance) are tested for quality. 
	If the quality difference between the currently highest quality and the nested region is more than a threshold nested one will be removed.
	Already checked region is removed from the list and procedure continues with the next best quality region in the cluster.
	'''
    score_sorted = sorted(cluster, key=lambda x: int(x[4]), reverse=True)
    for interval in score_sorted:
        start_toler = interval[1] - BORDERS
        end_toler = interval[2] + BORDERS
        score_best = interval[4]
        ## difference
        for item in score_sorted[:]:
            if item[1] >= start_toler and item[2] <= end_toler:
                if item[4] <= (1 - configuration.QUALITY_DIFF_TO_REMOVE
                               ) * score_best:  ### 5% difference tolerance
                    score_sorted.remove(item)
    position_sorted = sorted(score_sorted, key=lambda x: int(x[1]))
    for interval_refined in position_sorted:
        gff_removed_parts.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                interval_refined[0], configuration.SOURCE_PROFREP,
                configuration.REPEATS_FEATURE, interval_refined[
                    1], interval_refined[2], configuration.GFF_EMPTY,
                configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                interval_refined[3], interval_refined[4]).encode("utf-8"))


def dom_refining(joined_intervals, dict_domains, seq_id, start_ini, end_ini,
                 ann_ini, starts_part, ends_part, DOM_NUM, domains_classes):
    ''' Check the protein domains within the potential interval to be joined:
	1. appropriate domain has the same classification as interval of repeats to be joined and is classified to the last level 
	2. strands of appropriate domains are  uniform
	3. the count of such appropriate domains is more than a threshold
	'''
    count_dom = 0
    index_dom = 0
    strands = []
    indices_del = []
    for dom_attributes in dict_domains[seq_id]:
        ann_dom = dom_attributes[2]
        if dom_attributes[0] >= start_ini and dom_attributes[1] <= end_ini:
            repeat_class = "|".join(ann_ini.split("|")[2:])
            if ann_dom in domains_classes and ann_dom == repeat_class:
                strands.append(dom_attributes[3])
                count_dom += 1
            index_dom += 1
    ## if criteria for domains are met, write the expanded interval
    if len(set(strands)) <= 1 and count_dom >= DOM_NUM:
        joined_intervals.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                seq_id, configuration.SOURCE_PROFREP, configuration.
                REPEATS_FEATURE, start_ini, end_ini, configuration.GFF_EMPTY,
                configuration.GFF_EMPTY, configuration.GFF_EMPTY, ann_ini))
    ## if criteria are not met, refragment the expanded interval on original regions
    else:
        for part_start, part_end in zip(starts_part, ends_part):
            joined_intervals.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                    seq_id, configuration.SOURCE_PROFREP,
                    configuration.REPEATS_FEATURE, part_start, part_end,
                    configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                    configuration.GFF_EMPTY, ann_ini))
    ## delete already checked domains from the dict
    del (dict_domains[seq_id][0:index_dom])
    return dict_domains


def get_unique_classes(CLASS_TBL):
    ''' Get all the lineages of current domains database classification table to subsequently check the protein domains if they are classified up to the last level.
	Only these domains will be considered as valid for interval joining. 
	If their classification is be finite (based on comparing to this list of unique classes) they will not be counted for minimum number of domains criterion within the segment to be joined
	'''
    unique_classes = []
    with open(CLASS_TBL, "r") as class_tbl:
        for line in class_tbl:
            line_class = "|".join(line.rstrip().split("\t")[1:])
            if line_class not in unique_classes:
                unique_classes.append(line_class)
    return unique_classes


def main(args):
    # Command line arguments
    REPEATS_GFF = args.repeats_gff
    DOM_GFF = args.domains_gff
    GAP_TH = args.gap_threshold
    DOM_NUM = args.dom_number
    OUT_REFINED = args.out_refined
    INCLUDE_DOM = args.include_dom
    CLASS_TBL = args.class_tbl
    BORDERS = args.borders

    # first step of refining - removing low confidence repeats regions
    gff_removed = cluster_regions_for_quality_check(REPEATS_GFF, BORDERS)

    # second step of refining - interconnecting repeats regions
    if INCLUDE_DOM:
        unique_classes = get_unique_classes(CLASS_TBL)
        dict_domains = create_dom_dict(DOM_GFF)
        joined_intervals = interconnect_intervals(
            OUT_REFINED, gff_removed, GAP_TH, DOM_NUM, dict_domains,
            unique_classes)
    else:
        joined_intervals = interconnect_intervals(OUT_REFINED, gff_removed,
                                                  GAP_TH, DOM_NUM, None, None)
    os.unlink(gff_removed)


if __name__ == "__main__":

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-rep_gff',
                        '--repeats_gff',
                        type=str,
                        required=True,
                        help='original repeats regions GFF from PROFREP')
    parser.add_argument(
        '-dom_gff',
        '--domains_gff',
        type=str,
        help=
        'protein domains GFF if you want to support repeats joining by domains information')
    parser.add_argument(
        '-gth',
        '--gap_threshold',
        type=int,
        default=250,
        help='gap tolerance between consecutive repeats to be interconnected')
    parser.add_argument('-our',
                        '--out_refined',
                        type=str,
                        default="output_refined.gff",
                        help='query sequence to be processed')
    parser.add_argument(
        '-dn',
        '--dom_number',
        type=int,
        default=2,
        help='min number of domains present to confirm the region joining')
    parser.add_argument(
        '-id',
        '--include_dom',
        type=str2bool,
        default=False,
        help='include domains information to refine the repeats regions')
    parser.add_argument(
        '-ct',
        '--class_tbl',
        help=
        'classification table of protein domain database to check the level of classification')
    parser.add_argument(
        '-br',
        '--borders',
        type=int,
        default=10,
        help=
        'number of bp tolerated from one or the other side of two overlaping regions when evaluating quality')
    args = parser.parse_args()
    main(args)