Mercurial > repos > rnateam > graphprot_predict_profile

#!/usr/bin/env python3

import argparse as ap
import os
import subprocess
import sys

import gplib

"""

TOOL DEPENDENCIES
=================

GraphProt 1.1.7
Best install via:
https://anaconda.org/bioconda/graphprot
Tested with: miniconda3, conda 4.7.12


Script: What's my job this time, master?
Author: It'll be a though one.
Script: I take this as a given.
Author: Oh yeah?
Script: ... I'm ready.


OUTPUT FILES
============

    data_id.avg_profile
    data_id.avg_profile.peaks.bed
--conf-out
    data_id.avg_profile.p50.peaks.bed
--gen-site-bed
    data_id.avg_profile.genomic_peaks.bed
--conf-out --gen-site-bed
    data_id.avg_profile.p50.genomic_peaks.bed
--ws-pred
    data_id.predictions
--ws-pred --conf-out
    data_id.predictions
    data_id.p50.predictions


EXAMPLE CALLS
=============

flake8 coming out of hotel room. FB enters.
FB: Who is this f*** ???


python graphprot_predict_wrapper.py --model test2.model --params test2.params
--fasta gp_data/test10_predict.fa --data-id test2pred --gp-output
python graphprot_predict_wrapper.py --model test2.model --params test2.params
--fasta gp_data/test10_predict.fa --data-id test2pred
--gen-site-bed gp_data/test10_predict.bed

python graphprot_predict_wrapper.py --model test2.model --params test2.params
--fasta gp_data/test10_predict.fa --data-id test2pred
--gen-site-bed gp_data/test10_predict.bed --conf-out

python graphprot_predict_wrapper.py --model test2.model --params test2.params
--fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred

python graphprot_predict_wrapper.py --model test-data/test.model
--params test-data/test.params --fasta test-data/test_predict.fa
--data-id predtest

python graphprot_predict_wrapper.py --model test-data/test.model
--params test-data/test.params --fasta test-data/test_predict.fa
--data-id predtest --gen-site-bed test-data/test_predict.bed
--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5

python graphprot_predict_wrapper.py --data-id GraphProt
--fasta test-data/test_predict.fa --model test-data/test.model
--params test-data/test.params --gen-site-bed test-data/test_predict.bed
--sc-thr 0.0 --max-merge-dist 0 --conf-out  --ap-extlr 5

"""


#######################################################################


def setup_argument_parser():
    """Setup argparse parser."""
    help_description = """
    Galaxy wrapper script for GraphProt (-action predict and -action
    predict_profile) to compute whole site or position-wise scores for input
    FASTA sequences.
    By default, profile predictions are calculated, followed by average
    profiles computions and peak regions extraction from average profiles.
    If --ws-pred is set, whole site score predictions on input sequences
    will be run instead.
    If --conf-out is set, sites or peak regions with a score >= the median
    score of positive training sites will be output.
    If --gen-site-bed .bed file is provided, peak regions will be output
    with genomic coordinates too.

    """
    # Define argument parser.
    p = ap.ArgumentParser(
        add_help=False,
        prog="graphprot_predict_wrapper.py",
        description=help_description,
        formatter_class=ap.MetavarTypeHelpFormatter,
    )

    # Argument groups.
    p_man = p.add_argument_group("REQUIRED ARGUMENTS")
    p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")

    # Required arguments.
    p_opt.add_argument("-h", "--help", action="help", help="Print help message")
    p_man.add_argument(
        "--fasta",
        dest="in_fa",
        type=str,
        required=True,
        help="Sequences .fa file to predict" " on (option -fasta)",
    )
    p_man.add_argument(
        "--model",
        dest="in_model",
        type=str,
        required=True,
        help="GraphProt model file to use for predictions" " (option -model)",
    )
    p_man.add_argument(
        "--params",
        dest="in_params",
        type=str,
        required=True,
        help="Parameter file for given model",
    )
    p_man.add_argument(
        "--data-id",
        dest="data_id",
        type=str,
        required=True,
        help="Data ID (option -prefix)",
    )
    # ---> I'm  a conditional argument <---
    p_opt.add_argument(
        "--ws-pred",
        dest="ws_pred",
        default=False,
        action="store_true",
        help="Run a whole site prediction instead "
        "of calculating profiles (default: false)",
    )
    # Additional arguments.
    p_opt.add_argument(
        "--sc-thr",
        dest="score_thr",
        type=float,
        default=0,
        help="Score threshold for extracting "
        "average profile peak regions (default: 0)",
    )
    p_opt.add_argument(
        "--max-merge-dist",
        dest="max_merge_dist",
        type=int,
        default=0,
        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        help="Maximum merge distance for nearby peak regions"
        " (default: report all non-overlapping regions)",
    )
    p_opt.add_argument(
        "--gen-site-bed",
        dest="genomic_sites_bed",
        type=str,
        help=".bed file specifying the genomic regions of "
        "the input .fa sequences. Corrupt .bed "
        "information will be punished (default: false)",
    )
    p_opt.add_argument(
        "--conf-out",
        dest="conf_out",
        default=False,
        action="store_true",
        help="Output filtered peak regions BED file or "
        "predictions file (if --ws-pred) using the median "
        "positive training site score for filtering "
        "(default: false)",
    )
    p_opt.add_argument(
        "--gp-output",
        dest="gp_output",
        default=False,
        action="store_true",
        help="Print output produced by GraphProt " "(default: false)",
    )
    p_opt.add_argument(
        "--ap-extlr",
        dest="ap_extlr",
        type=int,
        default=5,
        choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        help="Define average profile up- and downstream "
        "extension to produce the average profile. "
        "The mean over small sequence windows "
        "(window length = --ap-extlr*2 + 1) is used to "
        "get position scores, thus the average profile "
        "is more smooth than the initial profile output "
        "by GraphProt (default: 5)",
    )
    return p


#######################################################################

if __name__ == "__main__":

    # Setup argparse.
    parser = setup_argument_parser()
    # Read in command line arguments.
    args = parser.parse_args()

    """
    Do all sorts of sanity checking.

    """
    # Check for Linux.
    assert "linux" in sys.platform, "please use Linux"
    # Check tool availability.
    assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
    # Check file inputs.
    assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa)
    assert os.path.exists(args.in_model), 'input .model file "%s" not found' % (
        args.in_model
    )
    assert os.path.exists(args.in_params), 'input .params file "%s" not found' % (
        args.in_params
    )
    # Count .fa entries.
    c_in_fa = gplib.count_fasta_headers(args.in_fa)
    assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa)
    print("# input .fa sequences:   %i" % (c_in_fa))
    # Read in FASTA sequences to check for uppercase sequences.
    seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
    # Check for lowercase only sequences, which cause GP to crash.
    error_mess = (
        "input sequences encountered containing "
        "only lowercase characters or lowercase characters in between "
        "uppercase characters. Please provide either all uppercase "
        "sequences or sequences containing uppercase regions surrounded "
        "by lowercase context regions for structure calculation (see "
        "viewpoint concept in original GraphProt publication "
        "for more details)"
    )
    if args.ws_pred:
        bad_ids = gplib.check_seqs_dic_format(seqs_dic)
        assert not bad_ids, "%s" % (error_mess)

    c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
    assert c_uc_nt, (
        "no uppercase nucleotides in input .fa sequences. "
        "Please change sequences to uppercase (keep in mind "
        "GraphProt only scores uppercase regions (according "
        "to its viewpoint concept)"
    )
    if not args.ws_pred:
        # Check for lowercase sequences.
        c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
        assert not c_lc_nt, (
            "lowercase nucleotides in input .fa not allowed"
            " in profile predictions, since GraphProt only scores "
            "uppercase regions (according to its viewpoint concept)"
        )
    # Check .bed.
    if args.genomic_sites_bed:
        # An array of checks, marvelous.
        assert os.path.exists(
            args.genomic_sites_bed
        ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed)
        # Check .bed for content.
        assert gplib.count_file_rows(
            args.genomic_sites_bed
        ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed)
        # Check .bed for 6-column format.
        assert gplib.bed_check_six_col_format(
            args.genomic_sites_bed
        ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % (
            args.genomic_sites_bed
        )
        # Check for unique column 4 IDs.
        assert gplib.bed_check_unique_ids(
            args.genomic_sites_bed
        ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed)
        # Read in .bed regions, compare to FASTA sequences.
        seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
        reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
        for seq_id in seq_len_dic:
            seq_l = seq_len_dic[seq_id]
            assert (
                seq_id in reg_len_dic
            ), 'sequence ID "%s" missing in input .bed "%s"' % (
                seq_id,
                args.genomic_sites_bed,
            )
            reg_l = reg_len_dic[seq_id]
            assert (
                seq_l == reg_l
            ), "sequence length differs from .bed region length (%i != %i)" % (
                seq_l,
                reg_l,
            )
    # Read in model parameters.
    param_dic = gplib.graphprot_get_param_dic(args.in_params)
    # Create GraphProt parameter string.
    param_string = gplib.graphprot_get_param_string(args.in_params)

    """
    Run predictions.

    """
    if args.ws_pred:
        # Do whole site prediction.
        print(
            "Starting whole site predictions on input .fa file"
            " (-action predict) ... "
        )
        check_cmd = (
            "GraphProt.pl -action predict -prefix "
            + args.data_id
            + " -fasta "
            + args.in_fa
            + " "
            + param_string
            + " -model "
            + args.in_model
        )
        output = subprocess.getoutput(check_cmd)
        assert (
            output
        ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        ws_predictions_file = args.data_id + ".predictions"
        assert os.path.exists(
            ws_predictions_file
        ), 'Whole site prediction output .predictions file "%s" not found' % (
            ws_predictions_file
        )
        if args.conf_out:
            # Filter by pos_train_ws_pred_median median.
            assert "pos_train_ws_pred_median" in param_dic, (
                "whole site top scores median information " "missing in .params file"
            )
            pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"])
            # Filtered file.
            filt_ws_predictions_file = args.data_id + ".p50.predictions"
            print(
                "Extracting p50 sites from whole site predictions"
                " (score threshold = %f) ... " % (pos_tr_ws_pred_med)
            )
            gplib.graphprot_filter_predictions_file(
                ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med
            )
    else:
        # Do profile prediction.
        print(
            "Starting profile predictions on on input .fa file "
            "(-action predict_profile) ... "
        )
        check_cmd = (
            "GraphProt.pl -action predict_profile -prefix "
            + args.data_id
            + " -fasta "
            + args.in_fa
            + " "
            + param_string
            + " -model "
            + args.in_model
        )
        output = subprocess.getoutput(check_cmd)
        assert (
            output
        ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        profile_predictions_file = args.data_id + ".profile"
        assert os.path.exists(
            profile_predictions_file
        ), 'Profile prediction output .profile file "%s" not found' % (
            profile_predictions_file
        )

        # Profile prediction output files.
        avg_prof_file = args.data_id + ".avg_profile"
        avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
        avg_prof_gen_peaks_p50_file = (
            args.data_id + ".avg_profile.p50.genomic_peaks.bed"
        )

        # Get sequence IDs in order from input .fa file.
        seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
        # Calculate average profiles.
        print(
            "Getting average profile from profile "
            "(extlr for smoothing: %i) ... " % (args.ap_extlr)
        )
        gplib.graphprot_profile_calc_avg_profile(
            profile_predictions_file,
            avg_prof_file,
            ap_extlr=args.ap_extlr,
            seq_ids_list=seq_ids_list,
            method=2,
        )
        # Extract peak regions on sequences with threshold score 0.
        print(
            "Extracting peak regions from average profile " "(score threshold = 0) ... "
        )
        killpep8 = args.max_merge_dist
        gplib.graphprot_profile_extract_peak_regions(
            avg_prof_file,
            avg_prof_peaks_file,
            max_merge_dist=killpep8,
            sc_thr=args.score_thr,
        )
        # Convert peaks to genomic coordinates.
        if args.genomic_sites_bed:
            print("Converting peak regions to genomic coordinates ... ")
            killit = args.genomic_sites_bed
            gplib.bed_peaks_to_genomic_peaks(
                avg_prof_peaks_file,
                avg_prof_gen_peaks_file,
                print_rows=False,
                genomic_sites_bed=killit,
            )
        # Extract peak regions with threshold score p50.
        if args.conf_out:
            sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
            # Filter by pos_tr_ws_pred_med median.
            assert sc_id in param_dic, (
                "average profile extlr %i median "
                "information missing in .params file" % (args.ap_extlr)
            )
            p50_sc_thr = float(param_dic[sc_id])
            print(
                "Extracting p50 peak regions from average profile "
                "(score threshold = %f) ... " % (p50_sc_thr)
            )
            despair = avg_prof_peaks_p50_file
            pain = args.max_merge_dist
            gplib.graphprot_profile_extract_peak_regions(
                avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr
            )
            # Convert peaks to genomic coordinates.
            if args.genomic_sites_bed:
                print("Converting p50 peak regions to " "genomic coordinates ... ")
                madness = args.genomic_sites_bed
                gplib.bed_peaks_to_genomic_peaks(
                    avg_prof_peaks_p50_file,
                    avg_prof_gen_peaks_p50_file,
                    genomic_sites_bed=madness,
                )
    # Done.
    print("Script: I'm done.")
    print("Author: ... ")
author	bgruening
date	Tue, 06 Aug 2024 14:55:18 +0000
parents	ddcf35a868b8
children