Mercurial > repos > vipints > deseq_hts
diff dexseq-hts_1.0/src/dexseq_count.py @ 11:cec4b4fb30be draft default tip
DEXSeq version 1.6 added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:22:45 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dexseq-hts_1.0/src/dexseq_count.py Tue Oct 08 08:22:45 2013 -0400 @@ -0,0 +1,173 @@ +import sys, itertools, optparse + +optParser = optparse.OptionParser( + + usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>", + + description= + "This script counts how many reads in <sam_file> fall onto each exonic " + + "part given in <flattened_gff_file> and outputs a list of counts in " + + "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + + "(Notes: The <flattened_gff_file> should be produced with the script " + + "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.", + + epilog = + "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + + "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + + "Public License v3. Part of the 'DEXSeq' package." ) + +optParser.add_option( "-p", "--paired", type="choice", dest="paired", + choices = ( "no", "yes" ), default = "no", + help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) + +optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", + choices = ( "yes", "no", "reverse" ), default = "yes", + help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + + "from a strand-specific assay (default: yes ). " + + "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + + "preparation protocol. 'reverse' inverts strands and is neede for certain " + + "protocols, e.g. paired-end with circularization." ) + +optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", + default = 10, + help = "skip all reads with alignment quality lower than the given " + + "minimum value (default: 10)" ) + +if len( sys.argv ) == 1: + optParser.print_help() + sys.exit(1) + +(opts, args) = optParser.parse_args() + +if len( args ) != 3: + sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) + sys.stderr.write( " Call with '-h' to get usage information.\n" ) + sys.exit( 1 ) + +try: + import HTSeq +except ImportError: + sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) + sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) + sys.exit(1) + +gff_file = args[0] +sam_file = args[1] +out_file = args[2] +stranded = opts.stranded == "yes" or opts.stranded == "reverse" +reverse = opts.stranded == "reverse" +is_PE = opts.paired == "yes" +minaqual = opts.minaqual + +if sam_file == "-": + sam_file = sys.stdin + +# Step 1: Read in the GFF file as generated by aggregate_genes.py +# and put everything into a GenomicArrayOfSets + +features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) +for f in HTSeq.GFF_Reader( gff_file ): + if f.type == "exonic_part": + f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] + features[f.iv] += f + +# initialise counters +num_reads = 0 +counts = {} +counts[ '_empty' ] = 0 +counts[ '_ambiguous' ] = 0 +counts[ '_lowaqual' ] = 0 +counts[ '_notaligned' ] = 0 + +# put a zero for each feature ID +for iv, s in features.steps(): + for f in s: + counts[ f.name ] = 0 + +#We need this little helper below: +def reverse_strand( s ): + if s == "+": + return "-" + elif s == "-": + return "+" + else: + raise SystemError, "illegal strand" + +# Now go through the aligned reads + +if not is_PE: + + num_reads = 0 + for a in HTSeq.SAM_Reader( sam_file ): + if not a.aligned: + counts[ '_notaligned' ] += 1 + continue + if a.aQual < minaqual: + counts[ '_lowaqual' ] += 1 + continue + rs = set() + for cigop in a.cigar: + if cigop.type != "M": + continue + if reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps( ): + rs = rs.union( s ) + set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) + if len( set_of_gene_names ) == 0: + counts[ '_empty' ] += 1 + elif len( set_of_gene_names ) > 1: + counts[ '_ambiguous' ] +=1 + else: + for f in rs: + counts[ f.name ] += 1 + num_reads += 1 + if num_reads % 100000 == 0: + sys.stdout.write( "%d reads processed.\n" % num_reads ) + +else: # paired-end + + num_reads = 0 + for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): + rs = set() + if af and ar and not af.aligned and not ar.aligned: + counts[ '_notaligned' ] += 1 + continue + if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: + counts[ '_lowaqual' ] += 1 + continue + if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): + for cigop in af.cigar: + if cigop.type != "M": + continue + if reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps(): + rs = rs.union( s ) + if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys(): + for cigop in ar.cigar: + if cigop.type != "M": + continue + if not reverse: + cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) + for iv, s in features[cigop.ref_iv].steps(): + rs = rs.union( s ) + set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) + if len( set_of_gene_names ) == 0: + counts[ '_empty' ] += 1 + elif len( set_of_gene_names ) > 1: + counts[ '_ambiguous' ] = 0 + else: + for f in rs: + counts[ f.name ] += 1 + num_reads += 1 + if num_reads % 100000 == 0: + sys.stderr.write( "%d reads processed.\n" % num_reads ) + + +# Step 3: Write out the results + +fout = open( out_file, "w" ) +for fn in sorted( counts.keys() ): + fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) +fout.close()