Mercurial > repos > vipints > deseq_hts
comparison dexseq-hts_1.0/src/dexseq_count.py @ 11:cec4b4fb30be draft default tip
DEXSeq version 1.6 added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:22:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
10:2fe512c7bfdf | 11:cec4b4fb30be |
---|---|
1 import sys, itertools, optparse | |
2 | |
3 optParser = optparse.OptionParser( | |
4 | |
5 usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>", | |
6 | |
7 description= | |
8 "This script counts how many reads in <sam_file> fall onto each exonic " + | |
9 "part given in <flattened_gff_file> and outputs a list of counts in " + | |
10 "<output_file>, for further analysis with the DEXSeq Bioconductor package. " + | |
11 "(Notes: The <flattened_gff_file> should be produced with the script " + | |
12 "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.", | |
13 | |
14 epilog = | |
15 "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " + | |
16 "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " + | |
17 "Public License v3. Part of the 'DEXSeq' package." ) | |
18 | |
19 optParser.add_option( "-p", "--paired", type="choice", dest="paired", | |
20 choices = ( "no", "yes" ), default = "no", | |
21 help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" ) | |
22 | |
23 optParser.add_option( "-s", "--stranded", type="choice", dest="stranded", | |
24 choices = ( "yes", "no", "reverse" ), default = "yes", | |
25 help = "'yes', 'no', or 'reverse'. Indicates whether the data is " + | |
26 "from a strand-specific assay (default: yes ). " + | |
27 "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " + | |
28 "preparation protocol. 'reverse' inverts strands and is neede for certain " + | |
29 "protocols, e.g. paired-end with circularization." ) | |
30 | |
31 optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual", | |
32 default = 10, | |
33 help = "skip all reads with alignment quality lower than the given " + | |
34 "minimum value (default: 10)" ) | |
35 | |
36 if len( sys.argv ) == 1: | |
37 optParser.print_help() | |
38 sys.exit(1) | |
39 | |
40 (opts, args) = optParser.parse_args() | |
41 | |
42 if len( args ) != 3: | |
43 sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" ) | |
44 sys.stderr.write( " Call with '-h' to get usage information.\n" ) | |
45 sys.exit( 1 ) | |
46 | |
47 try: | |
48 import HTSeq | |
49 except ImportError: | |
50 sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" ) | |
51 sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" ) | |
52 sys.exit(1) | |
53 | |
54 gff_file = args[0] | |
55 sam_file = args[1] | |
56 out_file = args[2] | |
57 stranded = opts.stranded == "yes" or opts.stranded == "reverse" | |
58 reverse = opts.stranded == "reverse" | |
59 is_PE = opts.paired == "yes" | |
60 minaqual = opts.minaqual | |
61 | |
62 if sam_file == "-": | |
63 sam_file = sys.stdin | |
64 | |
65 # Step 1: Read in the GFF file as generated by aggregate_genes.py | |
66 # and put everything into a GenomicArrayOfSets | |
67 | |
68 features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded ) | |
69 for f in HTSeq.GFF_Reader( gff_file ): | |
70 if f.type == "exonic_part": | |
71 f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number'] | |
72 features[f.iv] += f | |
73 | |
74 # initialise counters | |
75 num_reads = 0 | |
76 counts = {} | |
77 counts[ '_empty' ] = 0 | |
78 counts[ '_ambiguous' ] = 0 | |
79 counts[ '_lowaqual' ] = 0 | |
80 counts[ '_notaligned' ] = 0 | |
81 | |
82 # put a zero for each feature ID | |
83 for iv, s in features.steps(): | |
84 for f in s: | |
85 counts[ f.name ] = 0 | |
86 | |
87 #We need this little helper below: | |
88 def reverse_strand( s ): | |
89 if s == "+": | |
90 return "-" | |
91 elif s == "-": | |
92 return "+" | |
93 else: | |
94 raise SystemError, "illegal strand" | |
95 | |
96 # Now go through the aligned reads | |
97 | |
98 if not is_PE: | |
99 | |
100 num_reads = 0 | |
101 for a in HTSeq.SAM_Reader( sam_file ): | |
102 if not a.aligned: | |
103 counts[ '_notaligned' ] += 1 | |
104 continue | |
105 if a.aQual < minaqual: | |
106 counts[ '_lowaqual' ] += 1 | |
107 continue | |
108 rs = set() | |
109 for cigop in a.cigar: | |
110 if cigop.type != "M": | |
111 continue | |
112 if reverse: | |
113 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
114 for iv, s in features[cigop.ref_iv].steps( ): | |
115 rs = rs.union( s ) | |
116 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) | |
117 if len( set_of_gene_names ) == 0: | |
118 counts[ '_empty' ] += 1 | |
119 elif len( set_of_gene_names ) > 1: | |
120 counts[ '_ambiguous' ] +=1 | |
121 else: | |
122 for f in rs: | |
123 counts[ f.name ] += 1 | |
124 num_reads += 1 | |
125 if num_reads % 100000 == 0: | |
126 sys.stdout.write( "%d reads processed.\n" % num_reads ) | |
127 | |
128 else: # paired-end | |
129 | |
130 num_reads = 0 | |
131 for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ): | |
132 rs = set() | |
133 if af and ar and not af.aligned and not ar.aligned: | |
134 counts[ '_notaligned' ] += 1 | |
135 continue | |
136 if af and ar and not af.aQual < minaqual and ar.aQual < minaqual: | |
137 counts[ '_lowaqual' ] += 1 | |
138 continue | |
139 if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys(): | |
140 for cigop in af.cigar: | |
141 if cigop.type != "M": | |
142 continue | |
143 if reverse: | |
144 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
145 for iv, s in features[cigop.ref_iv].steps(): | |
146 rs = rs.union( s ) | |
147 if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys(): | |
148 for cigop in ar.cigar: | |
149 if cigop.type != "M": | |
150 continue | |
151 if not reverse: | |
152 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand ) | |
153 for iv, s in features[cigop.ref_iv].steps(): | |
154 rs = rs.union( s ) | |
155 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] ) | |
156 if len( set_of_gene_names ) == 0: | |
157 counts[ '_empty' ] += 1 | |
158 elif len( set_of_gene_names ) > 1: | |
159 counts[ '_ambiguous' ] = 0 | |
160 else: | |
161 for f in rs: | |
162 counts[ f.name ] += 1 | |
163 num_reads += 1 | |
164 if num_reads % 100000 == 0: | |
165 sys.stderr.write( "%d reads processed.\n" % num_reads ) | |
166 | |
167 | |
168 # Step 3: Write out the results | |
169 | |
170 fout = open( out_file, "w" ) | |
171 for fn in sorted( counts.keys() ): | |
172 fout.write( "%s\t%d\n" % ( fn, counts[fn] ) ) | |
173 fout.close() |