Mercurial > repos > mmonot > phageterm
view phageterm/PhageTerm.py @ 23:26ceb3225190 draft
Deleted selected files
author | mmonot |
---|---|
date | Fri, 26 Jan 2018 04:46:34 -0500 |
parents | 9f98d8d4af75 |
children |
line wrap: on
line source
#! /usr/bin/env python # -*- coding: utf-8 -*- # This file is a part of PhageTerm software # A tool to determine phage termini and packaging strategy # and other useful informations using raw sequencing reads. # (This programs works with sequencing reads from a randomly # sheared DNA library preparations as Illumina TruSeq paired-end or similar) # # ---------------------------------------------------------------------- # Copyright (C) 2017 Julian Garneau # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # <http://www.gnu.org/licenses/gpl-3.0.html> # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # ---------------------------------------------------------------------- # # @author Julian Garneau <julian.garneau@usherbrooke.ca> # @author Marc Monot <marc.monot@pasteur.fr> # @author David Bikard <david.bikard@pasteur.fr> ### PYTHON Module # Base import os import sys from optparse import OptionParser, OptionGroup # Multiprocessing import multiprocessing from multiprocessing import Manager import numpy as np # Project from _modules.functions_PhageTerm import * ### MAIN # Option usage = """\n\nUsage: %prog -f reads.fastq -r phage_sequence.fasta [-n phage_name -p reads_paired -s seed_lenght -d surrounding -t installation_test -c nbr_core -g host.fasta (warning increase process time)] Program: PhageTerm - Analyze phage termini and packaging mode using reads from high-throughput sequenced phage data Version: 1.0.11 Contact: Julian Garneau <julian.garneau@usherbrooke.ca> Contact: David Bikard <david.bikard@pasteur.fr> Contact: Marc Monot <marc.monot@pasteur.fr> You can perform a program test run upon installation using the "-t " option. Arguments for the -t option can be : 5, 3, DS, DL, M or H. Example of test commands : PhageTerm.py.py -t C5 -> Test run for a 5\' cohesive end (e.g. Lambda) PhageTerm.py.py -t C3 -> Test run for a 3\' cohesive end (e.g. HK97) PhageTerm.py.py -t DS -> Test run for a Direct Terminal Repeats end short (e.g. T7) PhageTerm.py.py -t DL -> Test run for a Direct Terminal Repeats end long (e.g. T5) PhageTerm.py.py -t H -> Test run for a Headful packaging (e.g. P1) PhageTerm.py.py -t M -> Test run for a Mu-like packaging (e.g. Mu) """ getopt = OptionParser(usage=usage) optreads = OptionGroup(getopt, 'Raw reads file in fastq format') optreads.add_option('-f', '--fastq', dest='fastq', metavar='FILE', help='Fastq reads from Illumina TruSeq') getopt.add_option_group(optreads) optref = OptionGroup(getopt, 'Phage genome in fasta format') optref.add_option('-r', '--ref', dest='reference', metavar='FILE', help='Reference phage genome as unique contig in fasta format') getopt.add_option_group(optref) optname = OptionGroup(getopt, 'Name of the phage being analyzed by the user') optname.add_option('-n', '--phagename', dest='phagename', metavar='STRING', help='Manually enter the name of the phage being analyzed. Used as prefix for output files.') getopt.add_option_group(optname) optseed = OptionGroup(getopt, 'Lenght of the seed used for reads in the mapping process') optseed.add_option('-s', '--seed', dest='seed', metavar='INT', type="int", help='Manually enter the lenght of the seed used for reads in the mapping process.') getopt.add_option_group(optseed) optsurround = OptionGroup(getopt, 'Lenght of the surrounding region considered for peak value cumulation') optsurround.add_option('-d', '--surrounding', dest='surround', type="int", metavar='INT', help='Manually enter the lenght of the surrounding used to merge very close peaks in the analysis process.') getopt.add_option_group(optsurround) optcore = OptionGroup(getopt, 'Number of core processors to use (Default: 1)') optcore.add_option('-c', '--core', dest='core', metavar='INT', type="int", help='Manually enter the number of core you want to use.') getopt.add_option_group(optcore) opthost = OptionGroup(getopt, 'Host genome in fasta format') opthost.add_option('-g', '--host', dest='host', metavar='FILE', help='Reference host genome as unique contig in fasta format') getopt.add_option_group(opthost) optpaired = OptionGroup(getopt, 'Use paired-end reads') optpaired.add_option('-p', '--paired', dest='paired', metavar='FILE', help='Use paired-end reads to calculate real insert coverage') getopt.add_option_group(optpaired) optmean = OptionGroup(getopt, 'Defined phage mean coverage') optmean.add_option('-m', '--mean', dest='mean', metavar='INT', type="int", help='Defined phage mean coverage') getopt.add_option_group(optmean) opttest = OptionGroup(getopt, 'Perform a program test run upon installation') opttest.add_option('-t', '--test', dest='test', metavar='STRING', help='Perform a program test run upon installation. If you want to perform a test run, use the "-t " option. Arguments for the -t option can be : C5, C3, DS, DL, H or M. C5 -> Test run for a 5\' cohesive end (e.g. Lambda); C3 -> Test run for a 3\' cohesive end (e.g. HK97); DS -> Test run for a short Direct Terminal Repeats end (e.g. T7); DL -> Test run for a long Direct Terminal Repeats end (e.g. T5); H -> Test run for a Headful packaging (e.g. P1); M -> Test run for a Mu-like packaging (e.g. Mu)') getopt.add_option_group(opttest) ###### options, arguments = getopt.parse_args() fastq = options.fastq reference = options.reference phagename = options.phagename seed = options.seed surrounding = options.surround core = options.core host = options.host paired = options.paired mean = options.mean test = options.test ###### if options.fastq == None and options.test == None: getopt.error('\tNo reads file provided.\n\t\t\tUse -h or --help for more details\n') if options.reference == None and options.test == None: getopt.error('\tNo fasta reference file provided.\n\t\t\tUse -h or --help for more details\n') if options.phagename == None and options.test == None: phagename = "Phagename" if options.seed == None: seed = 20 if options.surround == None: surrounding = 20 if options.core == None: core = 1 if options.host == None: host = "" if options.paired == None: paired = "" if options.mean == None: mean = 250 ###### if options.test == None: test_run = 0 else: test_run = 1 if options.test == "C5": print "\nPerforming a test run using test phage sequence with 5 prime cohesive overhang :" print "\npython PhageTerm.py -f test-data/COS-5.fastq -r test-data/COS-5.fasta -n TEST_cohesive_5_prime" fastq = "test-data/COS-5.fastq" reference = "test-data/COS-5.fasta" phagename = "Test-cohesive-5'" elif options.test == "C3": print "\nPerforming a test run using test phage sequence with 3 prime cohesive overhang:" print "\npython PhageTerm.py -f test-data/COS-3.fastq -r test-data/COS-3.fasta -n TEST_cohesive_3_prime" fastq = "test-data/COS-3.fastq" reference = "test-data/COS-3.fasta" phagename = "Test-cohesive-3'" elif options.test == "DS": print "\nPerforming a test run using test phage sequence with short direct terminal repeats (DTR-short) :" print "\npython PhageTerm.py -f test-data/DTR-short.fastq -r test-data/DTR-short.fasta -n TEST_short_direct_terminal_repeats" fastq = "test-data/DTR-short.fastq" reference = "test-data/DTR-short.fasta" phagename = "Test-short-direct-terminal-repeats" elif options.test == "DL": print "\nPerforming a test run using test phage sequence with long direct terminal repeats (DTR-long) :" print "\npython PhageTerm.py -f test-data/DTR-long.fastq -r test-data/DTR-long.fasta -n TEST_long_direct_terminal_repeats" fastq = "test-data/DTR-long.fastq" reference = "test-data/DTR-long.fasta" phagename = "Test-long-direct-terminal-repeats" elif options.test == "H": print "\nPerforming a test run using test phage sequence with headful packaging" print "\npython PhageTerm.py -f test-data/Headful.fastq -r test-data/Headful.fasta -n TEST_headful" fastq = "test-data/Headful.fastq" reference = "test-data/Headful.fasta" phagename = "Test-Headful" surrounding = 0 elif options.test == "M": print "\nPerforming a test run using test phage sequence with Mu-like packaging" print "\npython PhageTerm.py -f test-data/Mu-like_R1.fastq -p test-data/Mu-like_R2.fastq -r test-data/Mu-like.fasta -n TEST_Mu-like -g test-data/Mu-like_host.fasta" fastq = "test-data/Mu-like_R1.fastq" paired = "test-data/Mu-like_R2.fastq" reference = "test-data/Mu-like.fasta" host = "test-data/Mu-like_host.fasta" phagename = "Test-Mu-like" surrounding = 0 ###### # CHECK inputs phagename = checkPhageName(phagename) if checkFastaFile(reference): exit("ERROR in reference file") if host != "": if checkFastaFile(host): exit("ERROR in reference file") # VARIABLE edge = 500 insert_max = 1000 limit_fixed = 35 limit_preferred = 11 limit_coverage = max(50,mean*2)/core Mu_threshold = 0.5 draw = 0 if seed < 15: seed = 15 # READS Number tot_reads = totReads(fastq) if paired != "": tot_reads_paired = totReads(paired) if (tot_reads != tot_reads_paired): print "\nWARNING: Number of reads between the two reads files differ, using single reads only\n" paired = "" # REFERENCE sequence recovery and edge adds refseq = genomeFastaRecovery(reference) refseq = refseq[-edge:] + refseq + refseq[:edge] # HOST sequence recovery hostseq = genomeFastaRecovery(host) if len(hostseq) != 0 and len(hostseq) < len(refseq): print "\nHost length < Phage length : removing host sequence." hostseq = "" if hostseq != "": hostseq = hostseq[-edge:] + hostseq + hostseq[:edge] ### COVERAGE print "\nCalculating coverage values, please wait (may take a while)...\n" if not test_run and core == 1: print "If your computer has more than 1 processor, you can use the -c or --core option to speed up the process.\n\n" jobs = [] manager = Manager() return_dict = manager.dict() # Position in core split file_split = int(tot_reads/core) position = [] l = range(int(tot_reads)) part = chunks(l, core) for i in range(core): position.append(part.next()[0]) position = position + [int(tot_reads)] for i in range(0, core): process = multiprocessing.Process(target=readsCoverage, args=(fastq, refseq, hostseq, tot_reads, seed, edge, paired, insert_max, core, i, return_dict, position[i], position[i+1], limit_coverage)) jobs.append(process) for j in jobs: j.start() for j in jobs: j.join() print "\n\nFinished calculating coverage values, the remainder should be completed rapidly\n" # merging results for core_id in range(core): if core_id == 0: termini_coverage = return_dict[core_id][0] whole_coverage = return_dict[core_id][1] paired_whole_coverage = return_dict[core_id][2] phage_hybrid_coverage = return_dict[core_id][3] host_hybrid_coverage = return_dict[core_id][4] host_whole_coverage = return_dict[core_id][5] list_hybrid = return_dict[core_id][6] insert = return_dict[core_id][7].tolist() paired_missmatch = return_dict[core_id][8] reads_tested = return_dict[core_id][9] else: termini_coverage += return_dict[core_id][0] whole_coverage += return_dict[core_id][1] paired_whole_coverage += return_dict[core_id][2] phage_hybrid_coverage += return_dict[core_id][3] host_hybrid_coverage += return_dict[core_id][4] host_whole_coverage += return_dict[core_id][5] list_hybrid += return_dict[core_id][6] insert += return_dict[core_id][7].tolist() paired_missmatch += return_dict[core_id][8] reads_tested += return_dict[core_id][9] termini_coverage = termini_coverage.tolist() whole_coverage = whole_coverage.tolist() paired_whole_coverage = paired_whole_coverage.tolist() phage_hybrid_coverage = phage_hybrid_coverage.tolist() host_hybrid_coverage = host_hybrid_coverage.tolist() host_whole_coverage = host_whole_coverage.tolist() list_hybrid = list_hybrid.tolist() # WHOLE Coverage : Average, Maximum and Minimum added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq)) added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq)) added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq)) drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run) # NORM pic by whole coverage (1 base) if paired != "": paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage) termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, ave_whole_cov/1.5, edge) else: termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, ave_whole_cov/1.5, edge) # REMOVE edge termini_coverage[0] = RemoveEdge(termini_coverage[0],edge) termini_coverage[1] = RemoveEdge(termini_coverage[1],edge) termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0],edge) termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1],edge) whole_coverage[0] = RemoveEdge(whole_coverage[0],edge) whole_coverage[1] = RemoveEdge(whole_coverage[1],edge) paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0],edge) paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1],edge) added_whole_coverage = RemoveEdge(added_whole_coverage,edge) added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage,edge) added_host_whole_coverage = RemoveEdge(added_host_whole_coverage,edge) phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0],edge) phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1],edge) host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0],edge) host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1],edge) host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0],edge) host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1],edge) refseq = RemoveEdge(refseq,edge) if host != "": hostseq = RemoveEdge(hostseq,edge) gen_len = len(refseq) host_len = len(hostseq) if options.test == "DL": gen_len = 100000 # READS Total, Used and Lost used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested) # PIC Max picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5) picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5) picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5) ### ANALYSIS ## Close Peaks picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding) picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding) picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding) picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding) termini_coverage_close = termini_coverage[:] termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw) termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev) termini_coverage_norm_close = termini_coverage_norm[:] termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1) termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1) ## Statistical Analysis picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5) if paired != "": phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) else: phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close) ## LI Analysis picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5) R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len) R2 = ratioR(picMaxPlus_close) R3 = ratioR(picMaxMinus_close) ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3) ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close) ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close, gen_len/2) ### DECISION Process # PEAKS Significativity plus_significant = selectSignificant(phage_plus_norm, 1.0/gen_len, limit_preferred) minus_significant = selectSignificant(phage_minus_norm, 1.0/gen_len, limit_preferred) # DECISION Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq) ### EXPORT Data ## Statistics ExportStatistics(phagename, whole_coverage, paired_whole_coverage, termini_coverage, phage_plus_norm, phage_minus_norm, paired, test_run) # Sequence ExportCohesiveSeq(phagename, ArtcohesiveSeq, P_seqcoh, test_run) ExportPhageSequence(phagename, P_left, P_right, refseq, P_orient, Redundant, Mu_like, P_class, P_seqcoh, test_run) # Report CreateReport(phagename, seed, added_whole_coverage, draw, Redundant, P_left, P_right, Permuted, P_orient, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, tot_reads, P_seqcoh, phage_plus_norm, phage_minus_norm, ArtPackmode, termini, forward, reverse, ArtOrient, ArtcohesiveSeq, termini_coverage_close, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, lost_perc, ave_whole_cov, R1, R2, R3, host, host_len, host_whole_coverage, picMaxPlus_host, picMaxMinus_host, surrounding, drop_cov, paired, insert, phage_hybrid_coverage, host_hybrid_coverage, added_paired_whole_coverage, Mu_like, test_run, P_class, P_type, P_concat)