Mercurial > repos > vipints > deseq_hts
changeset 11:cec4b4fb30be draft default tip
DEXSeq version 1.6 added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:22:45 -0400 |
parents | 2fe512c7bfdf |
children | |
files | dexseq-hts_1.0/README dexseq-hts_1.0/bin/dexseq_config.sh dexseq-hts_1.0/bin/dexseq_config.sh.sample dexseq-hts_1.0/galaxy/DEXSeq.xml dexseq-hts_1.0/setup_dexseq-hts.sh dexseq-hts_1.0/src/DEXseq-hts.sh dexseq-hts_1.0/src/dexseq_count.py dexseq-hts_1.0/src/dexseq_prepare_annotation.py dexseq-hts_1.0/src/run_DEXseq.R |
diffstat | 9 files changed, 729 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/README Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,52 @@ +----------------------------------------- +A Galaxy wrapper for DEXSeq version 1.6.0 +----------------------------------------- + +DEXSeq is focused on finding differential exon usage using RNA-seq exon counts between samples with different experimental designs. +DEXSeq: http://www.bioconductor.org/packages/release/bioc/html/DEXSeq.html + +Requirements: +------------- + Python, HTSeq package :- Preprocessing of sequencing reads and GFF files + R, Bio-conductor package :- Required for DEXSeq + SAMTOOLS :- Sequencing read processing + +Contents: +--------- + +./bin + Contains a config file for running DEXSeq in different settings, setup_dexseq-hts.sh + will help to create the config file according to your local work station. + +./setup_dexseq-hts.sh + Setup script, which helps to set the right path to the config file present in the bin + folder. + +./README + Basic information about the wrapper + +./galaxy + Galaxy tool configuration file can be found here. + +./src + All relevant scripts for DEXSeq execution, Please follow the shell script to + understand the details. + +Licenses: +--------- + + If DEXSeq is used to obtain results for scientific publications it should be cited as [1]. + + This wrapper program is free software; you can redistribute it and/or modify it under the + terms of the GNU General Public License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + Copyright(C) 2013 cBio Memorial Sloan Kettering Cancer Center, New York City, USA. + +References +---------- + [1] Simon Anders and Alejandro Reyes and Wolfgang Huber (2012): Detecting differential usage of exons from RNA-seq data. + +Contact: +-------- + support [at] oqtans.org
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/bin/dexseq_config.sh Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,6 @@ +#!/bin/bash +export DEXSEQ_VERSION=1.6.0 +export SAMTOOLS_DIR= +export PYTHON_PATH=/usr/bin/python +export PYTHONPATH= +export R_PATH=
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/bin/dexseq_config.sh.sample Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,6 @@ +#!/bin/bash +export DEXSEQ_VERSION=1.6.0 +export SAMTOOLS_DIR=/home/vipin/samtools-0.18-beta +export PYTHON_PATH=/usr/bin/python +export PYTHONPATH=/home/vipin/lib/python2.7/site-packages/:$PYTHONPATH +export R_PATH=/home/vipin/bin/R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/galaxy/DEXSeq.xml Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,117 @@ +<tool id="dexseq-hts" name="DEXSeq" version="1.6.0"> + <description> Inference of differential exon usage in RNA-Seq</description> + <command interpreter="bash"> + ./../src/DEXseq-hts.sh + $anno_input + $sPaired + $library_type + $minQUAL + $dexseq_out $dexseq_out.extra_files_path +#for $i in $replicate_groups +#for $j in $i.replicates +$j.bam_alignment:#slurp +#end for + +#end for + >> $Log_File </command> + <inputs> + + <!-- Genome annotation in GFF/GTF/GFF3 file --> + <param format="gff,gtf,gff3" name="anno_input" type="data" label="Reference genome annotation file" help="A tab delimited format for storing sequence features and annotations (GFF/GTF/GFF3)."/> + + <!-- RNA-Seq read alignment files--> + <repeat name="replicate_groups" title="Replicate group" min="2"> + <repeat name="replicates" title="Replicate"> + <param format="bam" name="bam_alignment" type="data" label="BAM file of aligned RNA-Seq reads" help="Can be generated from SAM file using the SAMTools."/> + </repeat> + </repeat> + + <!--Paired --> + <param name="sPaired" type="select" display="radio" label="Is this library mate-paired?" help="Indicates whether the data is paired-end (default: single-end)"> + <option value="no" selected="true">Single-end</option> + <option value="yes">Paired-end</option> + </param> + + <!-- RNA-Seq library type--> + <param name="library_type" type="select" display="radio" label="Strand-specific library type" help="Indicates RNA-Seq library preparation protocol (default: strand-specific assay)"> + <option value="yes">Stranded</option> + <option value="no">Non stranded</option> + <option value="reverse">Reverse</option> + </param> + + <!-- Read alignment quality--> + <param name="minQUAL" type="integer" min="0" value="10" label="Minimum alignment quality" help="Skip all reads with alignment quality lower than the given minimum value (default: 10)"/> + </inputs> + + <outputs> + <data format="txt" name="dexseq_out" label="${tool.name} on ${on_string}: Differential Expression"/> + <data format="txt" name="Log_File" label="${tool.name} on ${on_string}: Log"/> + </outputs> + + <tests> + <test> + </test> + </tests> + + <help> + +.. class:: infomark + +**What it does** + +DEXSeq_ is focused on finding differential exon usage using RNA-seq exon counts between samples with different experimental designs. + +.. _DEXSeq: http://www.bioconductor.org/packages/release/bioc/html/DEXSeq.html + +`DEXSeq` requires: + +Genome annotation in GFF file type, containing the necessary information about the transcripts that are to be quantified. + +The BAM alignment files grouped into replicate groups, each containing several replicates. BAM files store the read alignments, The program will also work with only two groups containing only a single replicate each. However, this analysis has less statistical power and is therefore not recommended! + +------ + +**Licenses** + +If **DEXSeq** is used to obtain results for scientific publications it +should be cited as [1]_. + +**References** + +.. [1] Simon Anders and Alejandro Reyes and Wolfgang Huber (2012): `Detecting differential usage of exons from RNA-seq data`_. + +.. _Detecting differential usage of exons from RNA-seq data: http://genome.cshlp.org/content/early/2012/06/21/gr.133744.111.full.pdf+html + +------ + +.. class:: infomark + +**About formats** + +**GFF/GTF format** General Feature Format/Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and protein sequences. GFF3 lines have nine tab-separated fields: + +1. seqid - The name of a chromosome or scaffold. +2. source - The program that generated this feature. +3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon". +4. start - The starting position of the feature in the sequence. The first base is numbered 1. +5. stop - The ending position of the feature (inclusive). +6. score - A score between 0 and 1000. If there is no score value, enter ".". +7. strand - Valid entries include '+', '-', or '.' (for don't know/care). +8. phase - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. +9. attributes - All lines with the same group are linked together into a single item. + +For more information see http://www.sequenceontology.org/gff3.shtml + +**BAM format** The Sequence Alignment/Map (SAM) format is a +tab-limited text format that stores large nucleotide sequence +alignments. BAM is the binary version of a SAM file that allows for +fast and intensive data processing. The format specification and the +description of SAMtools can be found on +http://samtools.sourceforge.net/. + +------ + +DEXSeq-hts Wrapper Version 0.1 (Sept. 2013) + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/setup_dexseq-hts.sh Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,73 @@ +#!/bin/bash +set -e + +DIR=`dirname $0` +. ${DIR}/./bin/dexseq_config.sh + +echo ========================================== +echo DEXSeq-hts setup script \(DEXSeq version $DEXSEQ_VERSION\) +echo ========================================== +echo +echo SAMTools directory \(currently set to \"$SAMTOOLS_DIR\", system version used if left empty\) +read SAMTOOLS_DIR +if [ "$SAMTOOLS_DIR" == "" ]; +then + if [ "$(which samtools)" != "" ] ; + then + SAMTOOLS_DIR=$(dirname $(which samtools)) + else + echo samtools not found + exit -1 ; + fi +fi +echo '=>' Setting SAMTools directory to \"$SAMTOOLS_DIR\" +echo + +echo Path to the python binary \(currently set to \"$PYTHON_PATH\", system version used, if left empty\) +read PYTHON_PATH +if [ "$PYTHON_PATH" == "" ]; +then + PYTHON_PATH=`which python` + if [ "$PYTHON_PATH" == "" ]; + then + echo python not found + exit -1 + fi +fi +echo '=>' Setting Python path to \"$PYTHON_PATH\" +echo + +echo Path to HTSeq library files \(currently set to \"$PYTHONPATH\", system version is used if left empty\) +read PYTHONPATH +echo '=>' Setting HTSeq path to \"$PYTHONPATH\" +echo + +echo Path to the R binary \(currently set to \"$R_PATH\", system version used, if left empty\) +read R_PATH +if [ "$R_PATH" == "" ]; +then + R_PATH=`which R` + if [ "$R_PATH" == "" ]; + then + echo R not found + exit -1 + fi +fi +echo '=>' Setting R path to \"$R_PATH\" +echo + +cp -p bin/dexseq_config.sh bin/dexseq_config.sh.bk +grep -v -e SAMTOOLS_DIR -e PYTHON_PATH -e PYTHONPATH -e R_PATH -e $DEXSEQ_VERSION bin/dexseq_config.sh.bk > bin/dexseq_config.sh +echo +echo +echo generating config file + +echo export DEXSEQ_VERSION=$DEXSEQ_VERSION >> bin/dexseq_config.sh +echo export SAMTOOLS_DIR=$SAMTOOLS_DIR >> bin/dexseq_config.sh +echo export PYTHON_PATH=$PYTHON_PATH >> bin/dexseq_config.sh +echo export PYTHONPATH=$PYTHONPATH >> bin/dexseq_config.sh +echo export R_PATH=$R_PATH >> bin/dexseq_config.sh + +echo +echo Done. +echo
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/DEXseq-hts.sh Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,98 @@ +#/bin/bash +## +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# Copyright (C) 2013 Memorial Sloan-Kettering Cancer Center +## + +set -e + +PROG=`basename $0` +DIR=`dirname $0` + +. ${DIR}/../bin/dexseq_config.sh + +echo +echo ${PROG}: Oqtans http://galaxy.cbio.mskcc.org Galaxy wrapper for the DEXSeq version ${DEXSEQ_VERSION}. +echo +echo DEXSeq: Detecting differential usage of exons from RNA-seq data. +echo + +## input arguments from the interface +GFF_IN=${1} +shift +MATE_PAIR=${1} +shift +LIBTP=${1} +shift +minQL=${1} +shift +RES_FILE=${1} +shift +RES_WD=${1} +shift + +## associated array with sequencing type. +declare -A SEQ_TYPE=( [no]=SE [yes]=PE ) + +echo %%%%%%%%%%%%%%%%%%%%%%% +echo % 1. Data preparation % +echo %%%%%%%%%%%%%%%%%%%%%%% +echo + +mkdir -p ${RES_WD} +echo extra file path $RES_WD +tmpGTF=`mktemp --tmpdir=/tmp` + +echo load the genome annotation in GFF file + +${PYTHON_PATH} ${DIR}/dexseq_prepare_annotation.py ${GFF_IN} ${tmpGTF} +echo genome annotation stored in ${tmpGTF} +echo + +echo %%%%%%%%%%%%%%%%%%%% +echo % 2. Read counting % +echo %%%%%%%%%%%%%%%%%%%% +echo + +tmpFILE=`mktemp --tmpdir=/tmp` +echo $tmpFILE +echo -e '\t'condition'\t'libType > ${tmpFILE}_CONDITIONS.tab + +COND=0 +for REPLICATE_GROUP in $@ +do + IFS=':' + COND=$((COND+1)) + for BAM_FILE in ${REPLICATE_GROUP} + do + ## different group information + REPNAME=$(basename ${BAM_FILE%.dat}) + echo -e ${REPNAME}"\t"$COND"\t"${SEQ_TYPE[$MATE_PAIR]} >> ${tmpFILE}_CONDITIONS.tab + + ## counting the reads + ${SAMTOOLS_DIR}/samtools view -h $BAM_FILE | ${PYTHON_PATH} ${DIR}/dexseq_count.py -p ${MATE_PAIR} -s ${LIBTP} -a ${minQL} ${tmpGTF} - ${RES_WD}/${REPNAME} + + echo + done + echo conuted condition ${COND} +done +echo counted reads map to each exon. +echo + +echo %%%%%%%%%%%%%%%%%%%%%%%%%%% +echo % 3. Differential testing % +echo %%%%%%%%%%%%%%%%%%%%%%%%%%% +echo + +echo "cat ${DIR}/run_DEXseq.R | $R_PATH --slave --args $tmpFILE $RES_WD $tmpGTF ${RES_FILE} $#" +cat ${DIR}/run_DEXseq.R | $R_PATH --slave --args $tmpFILE $RES_WD $tmpGTF ${RES_FILE} + +## clean up +rm -fr ${RES_WD} ${tmpGTF} ${tmpFILE} +echo %%%%%%%% +echo % Done % +echo %%%%%%%%
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/dexseq_count.py Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,173 @@ +import sys, itertools, optparse + +optParser = optparse.OptionParser( + + usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>", + + description= + "This script counts how many reads in <sam_file> fall onto each exonic " + + "part given in <flattened_gff_file> and outputs a list of counts in " + + "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + + "(Notes: The <flattened_gff_file> should be produced with the script " + + "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.", + + epilog = + "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + + "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + + "Public License v3. Part of the 'DEXSeq' package." ) + +optParser.add_option( "-p", "--paired", type="choice", dest="paired", + choices = ( "no", "yes" ), default = "no", + help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) + +optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", + choices = ( "yes", "no", "reverse" ), default = "yes", + help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + + "from a strand-specific assay (default: yes ). " + + "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + + "preparation protocol. 'reverse' inverts strands and is neede for certain " + + "protocols, e.g. paired-end with circularization." ) + +optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", + default = 10, + help = "skip all reads with alignment quality lower than the given " + + "minimum value (default: 10)" ) + +if len( sys.argv ) == 1: + optParser.print_help() + sys.exit(1) + +(opts, args) = optParser.parse_args() + +if len( args ) != 3: + sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) + sys.stderr.write( " Call with '-h' to get usage information.\n" ) + sys.exit( 1 ) + +try: + import HTSeq +except ImportError: + sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) + sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) + sys.exit(1) + +gff_file = args[0] +sam_file = args[1] +out_file = args[2] +stranded = opts.stranded == "yes" or opts.stranded == "reverse" +reverse = opts.stranded == "reverse" +is_PE = opts.paired == "yes" +minaqual = opts.minaqual + +if sam_file == "-": + sam_file = sys.stdin + +# Step 1: Read in the GFF file as generated by aggregate_genes.py +# and put everything into a GenomicArrayOfSets + +features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) +for f in HTSeq.GFF_Reader( gff_file ): + if f.type == "exonic_part": + f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] + features[f.iv] += f + +# initialise counters +num_reads = 0 +counts = {} +counts[ '_empty' ] = 0 +counts[ '_ambiguous' ] = 0 +counts[ '_lowaqual' ] = 0 +counts[ '_notaligned' ] = 0 + +# put a zero for each feature ID +for iv, s in features.steps(): + for f in s: + counts[ f.name ] = 0 + +#We need this little helper below: +def reverse_strand( s ): + if s == "+": + return "-" + elif s == "-": + return "+" + else: + raise SystemError, "illegal strand" + +# Now go through the aligned reads + +if not is_PE: + + num_reads = 0 + for a in HTSeq.SAM_Reader( sam_file ): + if not a.aligned: + counts[ '_notaligned' ] += 1 + continue + if a.aQual < minaqual: + counts[ '_lowaqual' ] += 1 + continue + rs = set() + for cigop in a.cigar: + if cigop.type != "M": + continue + if reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps( ): + rs = rs.union( s ) + set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) + if len( set_of_gene_names ) == 0: + counts[ '_empty' ] += 1 + elif len( set_of_gene_names ) > 1: + counts[ '_ambiguous' ] +=1 + else: + for f in rs: + counts[ f.name ] += 1 + num_reads += 1 + if num_reads % 100000 == 0: + sys.stdout.write( "%d reads processed.\n" % num_reads ) + +else: # paired-end + + num_reads = 0 + for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): + rs = set() + if af and ar and not af.aligned and not ar.aligned: + counts[ '_notaligned' ] += 1 + continue + if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: + counts[ '_lowaqual' ] += 1 + continue + if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): + for cigop in af.cigar: + if cigop.type != "M": + continue + if reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps(): + rs = rs.union( s ) + if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys(): + for cigop in ar.cigar: + if cigop.type != "M": + continue + if not reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps(): + rs = rs.union( s ) + set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) + if len( set_of_gene_names ) == 0: + counts[ '_empty' ] += 1 + elif len( set_of_gene_names ) > 1: + counts[ '_ambiguous' ] = 0 + else: + for f in rs: + counts[ f.name ] += 1 + num_reads += 1 + if num_reads % 100000 == 0: + sys.stderr.write( "%d reads processed.\n" % num_reads ) + + +# Step 3: Write out the results + +fout = open( out_file, "w" ) +for fn in sorted( counts.keys() ): + fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) +fout.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/dexseq_prepare_annotation.py Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,131 @@ +import sys, collections, itertools, os.path + +if len( sys.argv ) != 3: + sys.stderr.write( "Script to prepare annotation for DEXSeq.\n\n" ) + sys.stderr.write( "Usage: python %s <in.gtf> <out.gff>\n\n" % os.path.basename(sys.argv[0]) ) + sys.stderr.write( "This script takes an annotation file in Ensembl GTF format\n" ) + sys.stderr.write( "and outputs a 'flattened' annotation file suitable for use\n" ) + sys.stderr.write( "with the count_in_exons.py script.\n" ) + sys.exit(1) + +try: + import HTSeq +except ImportError: + sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) + sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) + sys.exit(1) + +gtf_file = sys.argv[1] +out_file = sys.argv[2] + +## make sure that it can handle GFF files. +parent_child_map = dict() +for feature in HTSeq.GFF_Reader( gtf_file ): + if feature.type in ['mRNA', + 'transcript', + 'ncRNA', + 'miRNA', + 'pseudogenic_transcript', + 'rRNA', + 'snoRNA', + 'snRNA', + 'tRNA', + 'scRNA']: + parent_child_map[feature.attr['ID']] = feature.attr['Parent'] + +# Step 1: Store all exons with their gene and transcript ID +# in a GenomicArrayOfSets + +exons = HTSeq.GenomicArrayOfSets( "auto", stranded=True ) +for f in HTSeq.GFF_Reader( gtf_file ): + if not f.type in ["exon", "pseudogenic_exon"]: + continue + if not f.attr.get('gene_id'): + f.attr['gene_id'] = parent_child_map[f.attr['Parent']] + f.attr['transcript_id'] = f.attr['Parent'] + f.attr['gene_id'] = f.attr['gene_id'].replace( ":", "_" ) + exons[f.iv] += ( f.attr['gene_id'], f.attr['transcript_id'] ) + +# Step 2: Form sets of overlapping genes + +# We produce the dict 'gene_sets', whose values are sets of gene IDs. Each set +# contains IDs of genes that overlap, i.e., share bases (on the same strand). +# The keys of 'gene_sets' are the IDs of all genes, and each key refers to +# the set that contains the gene. +# Each gene set forms an 'aggregate gene'. + +gene_sets = collections.defaultdict( lambda: set() ) +for iv, s in exons.steps(): + # For each step, make a set, 'full_set' of all the gene IDs occuring + # in the present step, and also add all those gene IDs, whch have been + # seen earlier to co-occur with each of the currently present gene IDs. + full_set = set() + for gene_id, transcript_id in s: + full_set.add( gene_id ) + full_set |= gene_sets[ gene_id ] + # Make sure that all genes that are now in full_set get associated + # with full_set, i.e., get to know about their new partners + for gene_id in full_set: + assert gene_sets[ gene_id ] <= full_set + gene_sets[ gene_id ] = full_set + + +# Step 3: Go through the steps again to get the exonic sections. Each step +# becomes an 'exonic part'. The exonic part is associated with an +# aggregate gene, i.e., a gene set as determined in the previous step, +# and a transcript set, containing all transcripts that occur in the step. +# The results are stored in the dict 'aggregates', which contains, for each +# aggregate ID, a list of all its exonic_part features. + +aggregates = collections.defaultdict( lambda: list() ) +for iv, s in exons.steps( ): + # Skip empty steps + if len(s) == 0: + continue + # Take one of the gene IDs, find the others via gene sets, and + # form the aggregate ID from all of them + gene_id = list(s)[0][0] + assert set( gene_id for gene_id, transcript_id in s ) <= gene_sets[ gene_id ] + aggregate_id = '+'.join( gene_sets[ gene_id ] ) + # Make the feature and store it in 'aggregates' + f = HTSeq.GenomicFeature( aggregate_id, "exonic_part", iv ) + f.source = os.path.basename( sys.argv[1] ) + f.attr = {} + f.attr[ 'gene_id' ] = aggregate_id + transcript_set = set( ( transcript_id for gene_id, transcript_id in s ) ) + f.attr[ 'transcripts' ] = '+'.join( transcript_set ) + aggregates[ aggregate_id ].append( f ) + + +# Step 4: For each aggregate, number the exonic parts + +aggregate_features = [] +for l in aggregates.values(): + for i in xrange( len(l)-1 ): + assert l[i].name == l[i+1].name, str(l[i+1]) + " has wrong name" + assert l[i].iv.end <= l[i+1].iv.start, str(l[i+1]) + " starts too early" + if l[i].iv.chrom != l[i+1].iv.chrom: + raise ValueError, "Same name found on two chromosomes: %s, %s" % ( str(l[i]), str(l[i+1]) ) + if l[i].iv.strand != l[i+1].iv.strand: + raise ValueError, "Same name found on two strands: %s, %s" % ( str(l[i]), str(l[i+1]) ) + aggr_feat = HTSeq.GenomicFeature( l[0].name, "aggregate_gene", + HTSeq.GenomicInterval( l[0].iv.chrom, l[0].iv.start, + l[-1].iv.end, l[0].iv.strand ) ) + aggr_feat.source = os.path.basename( sys.argv[1] ) + aggr_feat.attr = { 'gene_id': aggr_feat.name } + for i in xrange( len(l) ): + l[i].attr['exonic_part_number'] = "%03d" % ( i+1 ) + aggregate_features.append( aggr_feat ) + + +# Step 5: Sort the aggregates, then write everything out + +aggregate_features.sort( key = lambda f: ( f.iv.chrom, f.iv.start ) ) + +fout = open( out_file, "w" ) +for aggr_feat in aggregate_features: + fout.write( aggr_feat.get_gff_line() ) + for f in aggregates[ aggr_feat.name ]: + fout.write( f.get_gff_line() ) + +fout.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/run_DEXseq.R Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,73 @@ +### load DEXSeq package +suppressMessages(require("DEXSeq")) + +### get arguments 1: INFILE, 2: OUTFILE 3:SIZE +args <- commandArgs() +INFILE<-args[4] +EXTRAPATH<-args[5] +annodb<-args[6] +OUTFILE<-args[7] + +INFILE_CONDS=c(paste(INFILE, "_CONDITIONS.tab", sep="")) + +### read count data from file +condsTable <- read.delim( INFILE_CONDS, header=TRUE, stringsAsFactors=TRUE, row.names=1 ) +#head(condsTable) + +conditions<-factor( condsTable[ , 1] ) +#print(conditions) + +## unique condition to define the pair of tests +uniq_conds <- unique(conditions) +#print(uniq_conds) + +## all possible pairs of conditions +pw_tests <- list() +for(i in 1:(length(uniq_conds)-1)) { + for(j in (i+1):length(uniq_conds)) { + pw_tests[[length(pw_tests)+1]] <- c(uniq_conds[i], uniq_conds[j]) + } +} +#print(pw_tests) + +tab <- NULL +## testing all possible pairs of conditions +for(i in 1:length(pw_tests)) { + ## header name + test_pair_name <- c(paste(pw_tests[[i]][1], "__vs__", pw_tests[[i]][2], sep="")) + print(test_pair_name) + + sub.data <- subset(condsTable, (conditions %in% c(pw_tests[[i]][1],pw_tests[[i]][2]))) + sub.data[[1]]<-as.factor(sub.data[[1]]) + + ecs = read.HTSeqCounts(countfiles=file.path(EXTRAPATH, row.names(sub.data)), design=sub.data, flattenedfile=annodb) + + ## Normalisation + ecs <- estimateSizeFactors(ecs) + print(sizeFactors(ecs)) + + suppressMessages(require("parallel")) + ## Dispersion estimation + ecs <- estimateDispersions(ecs, nCores=2) + ecs <- fitDispersionFunction(ecs) + + ## Testing for differential exon usage + ecs <- testForDEU(ecs, nCores=2) + + ## estimate the fold change + ecs <- estimatelog2FoldChanges(ecs, nCores=2) + + ## Result table + resultTable <- DEUresultTable(ecs) + print(head(resultTable)) + + colnames(resultTable) <- paste(test_pair_name, colnames(resultTable), sep=":") + #print(colnames(resultTable)) + + if(is.null(tab)) { + tab<- resultTable + } + else tab<- cbind(tab, resultTable) +} +## printing the result to out file +write.table(tab, OUTFILE, quote=F, sep="\t", row.names=F)