Mercurial > repos > mmonot > phageterm
diff PhageTerm.py @ 24:c8f88ae512f3 draft default tip
Uploaded
author | mmonot |
---|---|
date | Tue, 17 Sep 2024 13:35:16 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PhageTerm.py Tue Sep 17 13:35:16 2024 +0000 @@ -0,0 +1,427 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +# This file is a part of PhageTerm software +# A tool to determine phage termini and packaging strategy +# and other useful informations using raw sequencing reads. +# (This programs works with sequencing reads from a randomly +# sheared DNA library preparations as Illumina TruSeq paired-end or similar) +# +# ---------------------------------------------------------------------- +# Copyright (C) 2017 Julian Garneau +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# <http://www.gnu.org/licenses/gpl-3.0.html> +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# ---------------------------------------------------------------------- +# +# @author Julian Garneau <julian.garneau@usherbrooke.ca> +# @author Marc Monot <marc.monot@pasteur.fr> +# @author David Bikard <david.bikard@pasteur.fr> + + +### PYTHON Module +# Base +import os +import sys +from optparse import OptionParser, OptionGroup + +# Multiprocessing +import multiprocessing +from multiprocessing import Manager +import numpy as np + +# Project +from _modules.functions_PhageTerm import * + +### MAIN +# Option +usage = """\n\nUsage: %prog -f reads.fastq -r phage_sequence.fasta [-n phage_name -p reads_paired -s seed_lenght -d surrounding -t installation_test -c nbr_core -g host.fasta (warning increase process time)] + + Program: PhageTerm - Analyze phage termini and packaging mode using reads from high-throughput sequenced phage data + Version: 1.0.11 + Contact: Julian Garneau <julian.garneau@usherbrooke.ca> + Contact: David Bikard <david.bikard@pasteur.fr> + Contact: Marc Monot <marc.monot@pasteur.fr> + + You can perform a program test run upon installation using the "-t " option. + Arguments for the -t option can be : 5, 3, DS, DL, M or H. + + Example of test commands : + PhageTerm.py.py -t C5 -> Test run for a 5\' cohesive end (e.g. Lambda) + PhageTerm.py.py -t C3 -> Test run for a 3\' cohesive end (e.g. HK97) + PhageTerm.py.py -t DS -> Test run for a Direct Terminal Repeats end short (e.g. T7) + PhageTerm.py.py -t DL -> Test run for a Direct Terminal Repeats end long (e.g. T5) + PhageTerm.py.py -t H -> Test run for a Headful packaging (e.g. P1) + PhageTerm.py.py -t M -> Test run for a Mu-like packaging (e.g. Mu) + """ + +getopt = OptionParser(usage=usage) + +optreads = OptionGroup(getopt, 'Raw reads file in fastq format') +optreads.add_option('-f', '--fastq', dest='fastq', metavar='FILE', help='Fastq reads from Illumina TruSeq') +getopt.add_option_group(optreads) + +optref = OptionGroup(getopt, 'Phage genome in fasta format') +optref.add_option('-r', '--ref', dest='reference', metavar='FILE', help='Reference phage genome as unique contig in fasta format') +getopt.add_option_group(optref) + +optname = OptionGroup(getopt, 'Name of the phage being analyzed by the user') +optname.add_option('-n', '--phagename', dest='phagename', metavar='STRING', help='Manually enter the name of the phage being analyzed. Used as prefix for output files.') +getopt.add_option_group(optname) + +optseed = OptionGroup(getopt, 'Lenght of the seed used for reads in the mapping process') +optseed.add_option('-s', '--seed', dest='seed', metavar='INT', type="int", help='Manually enter the lenght of the seed used for reads in the mapping process.') +getopt.add_option_group(optseed) + +optsurround = OptionGroup(getopt, 'Lenght of the surrounding region considered for peak value cumulation') +optsurround.add_option('-d', '--surrounding', dest='surround', type="int", metavar='INT', help='Manually enter the lenght of the surrounding used to merge very close peaks in the analysis process.') +getopt.add_option_group(optsurround) + +optcore = OptionGroup(getopt, 'Number of core processors to use (Default: 1)') +optcore.add_option('-c', '--core', dest='core', metavar='INT', type="int", help='Manually enter the number of core you want to use.') +getopt.add_option_group(optcore) + +opthost = OptionGroup(getopt, 'Host genome in fasta format') +opthost.add_option('-g', '--host', dest='host', metavar='FILE', help='Reference host genome as unique contig in fasta format') +getopt.add_option_group(opthost) + +optpaired = OptionGroup(getopt, 'Use paired-end reads') +optpaired.add_option('-p', '--paired', dest='paired', metavar='FILE', help='Use paired-end reads to calculate real insert coverage') +getopt.add_option_group(optpaired) + +optmean = OptionGroup(getopt, 'Defined phage mean coverage') +optmean.add_option('-m', '--mean', dest='mean', metavar='INT', type="int", help='Defined phage mean coverage') +getopt.add_option_group(optmean) + +opttest = OptionGroup(getopt, 'Perform a program test run upon installation') +opttest.add_option('-t', '--test', dest='test', metavar='STRING', help='Perform a program test run upon installation. If you want to perform a test run, use the "-t " option. Arguments for the -t option can be : C5, C3, DS, DL, H or M. C5 -> Test run for a 5\' cohesive end (e.g. Lambda); C3 -> Test run for a 3\' cohesive end (e.g. HK97); DS -> Test run for a short Direct Terminal Repeats end (e.g. T7); DL -> Test run for a long Direct Terminal Repeats end (e.g. T5); H -> Test run for a Headful packaging (e.g. P1); M -> Test run for a Mu-like packaging (e.g. Mu)') +getopt.add_option_group(opttest) + + +###### + +options, arguments = getopt.parse_args() +fastq = options.fastq +reference = options.reference +phagename = options.phagename +seed = options.seed +surrounding = options.surround +core = options.core +host = options.host +paired = options.paired +mean = options.mean +test = options.test + +###### + +if options.fastq == None and options.test == None: + getopt.error('\tNo reads file provided.\n\t\t\tUse -h or --help for more details\n') + +if options.reference == None and options.test == None: + getopt.error('\tNo fasta reference file provided.\n\t\t\tUse -h or --help for more details\n') + +if options.phagename == None and options.test == None: + phagename = "Phagename" + +if options.seed == None: + seed = 20 + +if options.surround == None: + surrounding = 20 + +if options.core == None: + core = 1 + +if options.host == None: + host = "" + +if options.paired == None: + paired = "" + +if options.mean == None: + mean = 250 + +###### + +if options.test == None: + test_run = 0 +else: + test_run = 1 + + +if options.test == "C5": + print "\nPerforming a test run using test phage sequence with 5 prime cohesive overhang :" + print "\npython PhageTerm.py -f test-data/COS-5.fastq -r test-data/COS-5.fasta -n TEST_cohesive_5_prime" + fastq = "test-data/COS-5.fastq" + reference = "test-data/COS-5.fasta" + phagename = "Test-cohesive-5'" + + +elif options.test == "C3": + print "\nPerforming a test run using test phage sequence with 3 prime cohesive overhang:" + print "\npython PhageTerm.py -f test-data/COS-3.fastq -r test-data/COS-3.fasta -n TEST_cohesive_3_prime" + fastq = "test-data/COS-3.fastq" + reference = "test-data/COS-3.fasta" + phagename = "Test-cohesive-3'" + +elif options.test == "DS": + print "\nPerforming a test run using test phage sequence with short direct terminal repeats (DTR-short) :" + print "\npython PhageTerm.py -f test-data/DTR-short.fastq -r test-data/DTR-short.fasta -n TEST_short_direct_terminal_repeats" + fastq = "test-data/DTR-short.fastq" + reference = "test-data/DTR-short.fasta" + phagename = "Test-short-direct-terminal-repeats" + +elif options.test == "DL": + print "\nPerforming a test run using test phage sequence with long direct terminal repeats (DTR-long) :" + print "\npython PhageTerm.py -f test-data/DTR-long.fastq -r test-data/DTR-long.fasta -n TEST_long_direct_terminal_repeats" + fastq = "test-data/DTR-long.fastq" + reference = "test-data/DTR-long.fasta" + phagename = "Test-long-direct-terminal-repeats" + +elif options.test == "H": + print "\nPerforming a test run using test phage sequence with headful packaging" + print "\npython PhageTerm.py -f test-data/Headful.fastq -r test-data/Headful.fasta -n TEST_headful" + fastq = "test-data/Headful.fastq" + reference = "test-data/Headful.fasta" + phagename = "Test-Headful" + surrounding = 0 + +elif options.test == "M": + print "\nPerforming a test run using test phage sequence with Mu-like packaging" + print "\npython PhageTerm.py -f test-data/Mu-like_R1.fastq -p test-data/Mu-like_R2.fastq -r test-data/Mu-like.fasta -n TEST_Mu-like -g test-data/Mu-like_host.fasta" + fastq = "test-data/Mu-like_R1.fastq" + paired = "test-data/Mu-like_R2.fastq" + reference = "test-data/Mu-like.fasta" + host = "test-data/Mu-like_host.fasta" + phagename = "Test-Mu-like" + surrounding = 0 + +###### + + +# CHECK inputs +phagename = checkPhageName(phagename) + +if checkFastaFile(reference): + exit("ERROR in reference file") + +if host != "": + if checkFastaFile(host): + exit("ERROR in reference file") + +# VARIABLE +edge = 500 +insert_max = 1000 +limit_fixed = 35 +limit_preferred = 11 +limit_coverage = max(50,mean*2)/core +Mu_threshold = 0.5 +draw = 0 +if seed < 15: + seed = 15 + +# READS Number +tot_reads = totReads(fastq) +if paired != "": + tot_reads_paired = totReads(paired) + if (tot_reads != tot_reads_paired): + print "\nWARNING: Number of reads between the two reads files differ, using single reads only\n" + paired = "" + +# REFERENCE sequence recovery and edge adds +refseq = genomeFastaRecovery(reference) +refseq = refseq[-edge:] + refseq + refseq[:edge] + +# HOST sequence recovery +hostseq = genomeFastaRecovery(host) +if len(hostseq) != 0 and len(hostseq) < len(refseq): + print "\nHost length < Phage length : removing host sequence." + hostseq = "" +if hostseq != "": + hostseq = hostseq[-edge:] + hostseq + hostseq[:edge] + + +### COVERAGE +print "\nCalculating coverage values, please wait (may take a while)...\n" + +if not test_run and core == 1: + print "If your computer has more than 1 processor, you can use the -c or --core option to speed up the process.\n\n" + +jobs = [] +manager = Manager() +return_dict = manager.dict() + +# Position in core split +file_split = int(tot_reads/core) +position = [] + +l = range(int(tot_reads)) +part = chunks(l, core) +for i in range(core): + position.append(part.next()[0]) + +position = position + [int(tot_reads)] + +for i in range(0, core): + process = multiprocessing.Process(target=readsCoverage, args=(fastq, refseq, hostseq, tot_reads, seed, edge, paired, insert_max, core, i, return_dict, position[i], position[i+1], limit_coverage)) + jobs.append(process) + +for j in jobs: + j.start() + +for j in jobs: + j.join() + +print "\n\nFinished calculating coverage values, the remainder should be completed rapidly\n" + +# merging results +for core_id in range(core): + if core_id == 0: + termini_coverage = return_dict[core_id][0] + whole_coverage = return_dict[core_id][1] + paired_whole_coverage = return_dict[core_id][2] + phage_hybrid_coverage = return_dict[core_id][3] + host_hybrid_coverage = return_dict[core_id][4] + host_whole_coverage = return_dict[core_id][5] + list_hybrid = return_dict[core_id][6] + insert = return_dict[core_id][7].tolist() + paired_missmatch = return_dict[core_id][8] + reads_tested = return_dict[core_id][9] + else: + termini_coverage += return_dict[core_id][0] + whole_coverage += return_dict[core_id][1] + paired_whole_coverage += return_dict[core_id][2] + phage_hybrid_coverage += return_dict[core_id][3] + host_hybrid_coverage += return_dict[core_id][4] + host_whole_coverage += return_dict[core_id][5] + list_hybrid += return_dict[core_id][6] + insert += return_dict[core_id][7].tolist() + paired_missmatch += return_dict[core_id][8] + reads_tested += return_dict[core_id][9] + +termini_coverage = termini_coverage.tolist() +whole_coverage = whole_coverage.tolist() +paired_whole_coverage = paired_whole_coverage.tolist() +phage_hybrid_coverage = phage_hybrid_coverage.tolist() +host_hybrid_coverage = host_hybrid_coverage.tolist() +host_whole_coverage = host_whole_coverage.tolist() +list_hybrid = list_hybrid.tolist() + + +# WHOLE Coverage : Average, Maximum and Minimum +added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq)) +added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq)) +added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq)) + +drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run) + +# NORM pic by whole coverage (1 base) +if paired != "": + paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage) + termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, ave_whole_cov/1.5, edge) +else: + termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, ave_whole_cov/1.5, edge) + +# REMOVE edge +termini_coverage[0] = RemoveEdge(termini_coverage[0],edge) +termini_coverage[1] = RemoveEdge(termini_coverage[1],edge) +termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0],edge) +termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1],edge) +whole_coverage[0] = RemoveEdge(whole_coverage[0],edge) +whole_coverage[1] = RemoveEdge(whole_coverage[1],edge) +paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0],edge) +paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1],edge) +added_whole_coverage = RemoveEdge(added_whole_coverage,edge) +added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage,edge) +added_host_whole_coverage = RemoveEdge(added_host_whole_coverage,edge) +phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0],edge) +phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1],edge) +host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0],edge) +host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1],edge) +host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0],edge) +host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1],edge) +refseq = RemoveEdge(refseq,edge) +if host != "": + hostseq = RemoveEdge(hostseq,edge) +gen_len = len(refseq) +host_len = len(hostseq) +if options.test == "DL": + gen_len = 100000 + + +# READS Total, Used and Lost +used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested) + +# PIC Max +picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5) +picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5) +picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5) + +### ANALYSIS + +## Close Peaks +picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding) +picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding) +picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding) +picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding) + +termini_coverage_close = termini_coverage[:] +termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw) +termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev) + +termini_coverage_norm_close = termini_coverage_norm[:] +termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1) +termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1) + + +## Statistical Analysis +picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5) + +if paired != "": + phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) +else: + phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) + + +## LI Analysis +picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5) + +R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len) +R2 = ratioR(picMaxPlus_close) +R3 = ratioR(picMaxMinus_close) + +ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3) +ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close) +ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close, gen_len/2) + + +### DECISION Process + +# PEAKS Significativity +plus_significant = selectSignificant(phage_plus_norm, 1.0/gen_len, limit_preferred) +minus_significant = selectSignificant(phage_minus_norm, 1.0/gen_len, limit_preferred) + +# DECISION +Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq) + + +### EXPORT Data + +## Statistics +ExportStatistics(phagename, whole_coverage, paired_whole_coverage, termini_coverage, phage_plus_norm, phage_minus_norm, paired, test_run) + +# Sequence +ExportCohesiveSeq(phagename, ArtcohesiveSeq, P_seqcoh, test_run) +ExportPhageSequence(phagename, P_left, P_right, refseq, P_orient, Redundant, Mu_like, P_class, P_seqcoh, test_run) + +# Report +CreateReport(phagename, seed, added_whole_coverage, draw, Redundant, P_left, P_right, Permuted, P_orient, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, tot_reads, P_seqcoh, phage_plus_norm, phage_minus_norm, ArtPackmode, termini, forward, reverse, ArtOrient, ArtcohesiveSeq, termini_coverage_close, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, lost_perc, ave_whole_cov, R1, R2, R3, host, host_len, host_whole_coverage, picMaxPlus_host, picMaxMinus_host, surrounding, drop_cov, paired, insert, phage_hybrid_coverage, host_hybrid_coverage, added_paired_whole_coverage, Mu_like, test_run, P_class, P_type, P_concat) +