11
|
1 import sys, itertools, optparse
|
|
2
|
|
3 optParser = optparse.OptionParser(
|
|
4
|
|
5 usage = "python %prog [options] <flattened_gff_file> <sam_file> <output_file>",
|
|
6
|
|
7 description=
|
|
8 "This script counts how many reads in <sam_file> fall onto each exonic " +
|
|
9 "part given in <flattened_gff_file> and outputs a list of counts in " +
|
|
10 "<output_file>, for further analysis with the DEXSeq Bioconductor package. " +
|
|
11 "(Notes: The <flattened_gff_file> should be produced with the script " +
|
|
12 "dexseq_prepare_annotation.py). <sam_file> may be '-' to indicate standard input.",
|
|
13
|
|
14 epilog =
|
|
15 "Written by Simon Anders (sanders@fs.tum.de), European Molecular Biology " +
|
|
16 "Laboratory (EMBL). (c) 2010. Released under the terms of the GNU General " +
|
|
17 "Public License v3. Part of the 'DEXSeq' package." )
|
|
18
|
|
19 optParser.add_option( "-p", "--paired", type="choice", dest="paired",
|
|
20 choices = ( "no", "yes" ), default = "no",
|
|
21 help = "'yes' or 'no'. Indicates whether the data is paired-end (default: no)" )
|
|
22
|
|
23 optParser.add_option( "-s", "--stranded", type="choice", dest="stranded",
|
|
24 choices = ( "yes", "no", "reverse" ), default = "yes",
|
|
25 help = "'yes', 'no', or 'reverse'. Indicates whether the data is " +
|
|
26 "from a strand-specific assay (default: yes ). " +
|
|
27 "Be sure to switch to 'no' if you use a non strand-specific RNA-Seq library " +
|
|
28 "preparation protocol. 'reverse' inverts strands and is neede for certain " +
|
|
29 "protocols, e.g. paired-end with circularization." )
|
|
30
|
|
31 optParser.add_option( "-a", "--minaqual", type="int", dest="minaqual",
|
|
32 default = 10,
|
|
33 help = "skip all reads with alignment quality lower than the given " +
|
|
34 "minimum value (default: 10)" )
|
|
35
|
|
36 if len( sys.argv ) == 1:
|
|
37 optParser.print_help()
|
|
38 sys.exit(1)
|
|
39
|
|
40 (opts, args) = optParser.parse_args()
|
|
41
|
|
42 if len( args ) != 3:
|
|
43 sys.stderr.write( sys.argv[0] + ": Error: Please provide three arguments.\n" )
|
|
44 sys.stderr.write( " Call with '-h' to get usage information.\n" )
|
|
45 sys.exit( 1 )
|
|
46
|
|
47 try:
|
|
48 import HTSeq
|
|
49 except ImportError:
|
|
50 sys.stderr.write( "Could not import HTSeq. Please install the HTSeq Python framework\n" )
|
|
51 sys.stderr.write( "available from http://www-huber.embl.de/users/anders/HTSeq\n" )
|
|
52 sys.exit(1)
|
|
53
|
|
54 gff_file = args[0]
|
|
55 sam_file = args[1]
|
|
56 out_file = args[2]
|
|
57 stranded = opts.stranded == "yes" or opts.stranded == "reverse"
|
|
58 reverse = opts.stranded == "reverse"
|
|
59 is_PE = opts.paired == "yes"
|
|
60 minaqual = opts.minaqual
|
|
61
|
|
62 if sam_file == "-":
|
|
63 sam_file = sys.stdin
|
|
64
|
|
65 # Step 1: Read in the GFF file as generated by aggregate_genes.py
|
|
66 # and put everything into a GenomicArrayOfSets
|
|
67
|
|
68 features = HTSeq.GenomicArrayOfSets( "auto", stranded=stranded )
|
|
69 for f in HTSeq.GFF_Reader( gff_file ):
|
|
70 if f.type == "exonic_part":
|
|
71 f.name = f.attr['gene_id'] + ":" + f.attr['exonic_part_number']
|
|
72 features[f.iv] += f
|
|
73
|
|
74 # initialise counters
|
|
75 num_reads = 0
|
|
76 counts = {}
|
|
77 counts[ '_empty' ] = 0
|
|
78 counts[ '_ambiguous' ] = 0
|
|
79 counts[ '_lowaqual' ] = 0
|
|
80 counts[ '_notaligned' ] = 0
|
|
81
|
|
82 # put a zero for each feature ID
|
|
83 for iv, s in features.steps():
|
|
84 for f in s:
|
|
85 counts[ f.name ] = 0
|
|
86
|
|
87 #We need this little helper below:
|
|
88 def reverse_strand( s ):
|
|
89 if s == "+":
|
|
90 return "-"
|
|
91 elif s == "-":
|
|
92 return "+"
|
|
93 else:
|
|
94 raise SystemError, "illegal strand"
|
|
95
|
|
96 # Now go through the aligned reads
|
|
97
|
|
98 if not is_PE:
|
|
99
|
|
100 num_reads = 0
|
|
101 for a in HTSeq.SAM_Reader( sam_file ):
|
|
102 if not a.aligned:
|
|
103 counts[ '_notaligned' ] += 1
|
|
104 continue
|
|
105 if a.aQual < minaqual:
|
|
106 counts[ '_lowaqual' ] += 1
|
|
107 continue
|
|
108 rs = set()
|
|
109 for cigop in a.cigar:
|
|
110 if cigop.type != "M":
|
|
111 continue
|
|
112 if reverse:
|
|
113 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
|
|
114 for iv, s in features[cigop.ref_iv].steps( ):
|
|
115 rs = rs.union( s )
|
|
116 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
|
|
117 if len( set_of_gene_names ) == 0:
|
|
118 counts[ '_empty' ] += 1
|
|
119 elif len( set_of_gene_names ) > 1:
|
|
120 counts[ '_ambiguous' ] +=1
|
|
121 else:
|
|
122 for f in rs:
|
|
123 counts[ f.name ] += 1
|
|
124 num_reads += 1
|
|
125 if num_reads % 100000 == 0:
|
|
126 sys.stdout.write( "%d reads processed.\n" % num_reads )
|
|
127
|
|
128 else: # paired-end
|
|
129
|
|
130 num_reads = 0
|
|
131 for af, ar in HTSeq.pair_SAM_alignments( HTSeq.SAM_Reader( sam_file ) ):
|
|
132 rs = set()
|
|
133 if af and ar and not af.aligned and not ar.aligned:
|
|
134 counts[ '_notaligned' ] += 1
|
|
135 continue
|
|
136 if af and ar and not af.aQual < minaqual and ar.aQual < minaqual:
|
|
137 counts[ '_lowaqual' ] += 1
|
|
138 continue
|
|
139 if af and af.aligned and af.aQual >= minaqual and af.iv.chrom in features.chrom_vectors.keys():
|
|
140 for cigop in af.cigar:
|
|
141 if cigop.type != "M":
|
|
142 continue
|
|
143 if reverse:
|
|
144 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
|
|
145 for iv, s in features[cigop.ref_iv].steps():
|
|
146 rs = rs.union( s )
|
|
147 if ar and ar.aligned and ar.aQual >= minaqual and ar.iv.chrom in features.chrom_vectors.keys():
|
|
148 for cigop in ar.cigar:
|
|
149 if cigop.type != "M":
|
|
150 continue
|
|
151 if not reverse:
|
|
152 cigop.ref_iv.strand = reverse_strand( cigop.ref_iv.strand )
|
|
153 for iv, s in features[cigop.ref_iv].steps():
|
|
154 rs = rs.union( s )
|
|
155 set_of_gene_names = set( [ f.name.split(":")[0] for f in rs ] )
|
|
156 if len( set_of_gene_names ) == 0:
|
|
157 counts[ '_empty' ] += 1
|
|
158 elif len( set_of_gene_names ) > 1:
|
|
159 counts[ '_ambiguous' ] = 0
|
|
160 else:
|
|
161 for f in rs:
|
|
162 counts[ f.name ] += 1
|
|
163 num_reads += 1
|
|
164 if num_reads % 100000 == 0:
|
|
165 sys.stderr.write( "%d reads processed.\n" % num_reads )
|
|
166
|
|
167
|
|
168 # Step 3: Write out the results
|
|
169
|
|
170 fout = open( out_file, "w" )
|
|
171 for fn in sorted( counts.keys() ):
|
|
172 fout.write( "%s\t%d\n" % ( fn, counts[fn] ) )
|
|
173 fout.close()
|