comparison gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip

planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author chrisd
date Sun, 21 Feb 2016 23:31:55 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f95150c37d38
1 #ifndef SAM_RATIO_H
2 #define SAM_RATIO_H
3
4 #include <iostream>
5 #include <fstream>
6 #include <algorithm>
7
8 #include "FastaRecord.h"
9 #include "Alignments.h"
10 #include "args.h"
11
12 typedef std::vector<std::pair<int,char>> cigar_str;
13
14 /**
15 *
16 */
17 struct header {
18 std::string level = "Level";
19 std::string iteration = "Iteration";
20 std::string gene_id = "Gene id";
21 std::string gene_fraction = "Gene Fraction";
22 std::string hits = "Hits";
23 };
24
25 /**
26 * Reports the total number of bases that were touched for each
27 * gene. This is largely calculated using the positional and seq
28 * information found in fields four and ten of each alignment
29 */
30 void analyze_coverage(FastaRecord &record, Alignments &alignment) {
31 record.update_gene_hits();
32
33 cigar_str cigar = alignment.cigar();
34
35 int len;
36 char op;
37
38 int occurrence;
39 int pos_in_gene = alignment.pos();
40
41 int start, stop;
42 int base_hits = record._base_hits.size(); // func this
43 int read_length = alignment.seq().length(); //func this
44
45 if(pos_in_gene == 1) {
46 occurrence = 0;
47 for(int i = 0; i < cigar.size(); i++) {
48 len = cigar[i].first;
49 op = cigar[i].second;
50
51 switch(op) {
52 case 'M':
53 occurrence += len;
54 break;
55 case 'I':
56 occurrence += len;
57 break;
58 default:
59 break;
60 }
61 }
62
63 start = read_length - occurrence;
64 stop = start + read_length;
65
66 for(int i = start; i < base_hits; i++) {
67 if(i == stop) break;
68 record._base_hits[i] = 1;
69 }
70 }
71 else {
72 start = pos_in_gene - 1;
73 stop = start + read_length;
74
75 for(int i = start; i < base_hits; i++) {
76 if(i == stop) break;
77 record._base_hits[i] = 1;
78 }
79 }
80 }
81
82 /**
83 * Returns gene fraction of fasta record
84 * Returns -1 if gene fraction is not greater than threshold
85 */
86 double coverage(const FastaRecord &record, const int &threshold) {
87 double gene_coverage;
88
89 int base_hits, gene_size;
90
91 base_hits = record.get_base_hits();
92 gene_size = record.gene().length();
93
94 gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100;
95
96 if(gene_coverage > threshold)
97 return gene_coverage;
98 return -1;
99 }
100
101 /**
102 * Writes header to output file when
103 * reading from stdin
104 */
105 void bam_stream_header() {
106 header h;
107 char sep = ',';
108
109 std::cout << h.level << sep << h.iteration << sep
110 << h.gene_id << sep << h.gene_fraction << sep
111 << h.hits << '\n';
112 }
113
114 /**
115 * Writes header to output file when
116 * reading from sam file
117 */
118 void file_header(const std::string &out_fp, const std::string &sam_fp) {
119 header h;
120 std::ofstream out(out_fp.c_str(), std::ofstream::app );
121 char sep = ',';
122
123 //out << "@" << sam_fp << '\n';
124 out << h.level << sep << h.iteration << sep
125 << h.gene_id << sep << h.gene_fraction << sep
126 << h.hits << '\n';
127 out.close();
128 }
129
130 /**
131 *
132 */
133 void create_header(cmd_args &args) {
134 if(args.bam_stream) {
135 bam_stream_header();
136 }
137 else {
138 file_header(args.out_fp, args.sam_fp);
139 }
140 }
141
142 /**
143 * Writes results to output file when reading from
144 * stdin
145 */
146 void bam_stream_results(std::vector<FastaRecord> &records,
147 const int &level, const int &iteration,
148 cmd_args &args) {
149
150 double gene_fraction;
151 int hits_seen;
152 std::string id;
153 char sep = ',';
154
155 for(auto &rec : records) {
156 gene_fraction = coverage(rec, args.threshold);
157 hits_seen = rec.gene_hits();
158 id = rec.gene_id();
159
160 if(gene_fraction > 0) {
161 std::cout << level << sep << iteration << sep
162 << id << sep << gene_fraction << sep
163 << hits_seen << '\n';
164 }
165 }
166 }
167
168 /**
169 * Write results when reading sam file from
170 * path
171 */
172 void file_results(std::vector<FastaRecord> &records,
173 const int level, const int &iteration,
174 cmd_args &args) {
175
176 std::ofstream out(args.out_fp.c_str(), std::ofstream::app);
177
178 double gene_fraction;
179 int hits_seen;
180 std::string id;
181 char sep = ',';
182
183 for(auto &rec : records) {
184 gene_fraction = coverage(rec, args.threshold);
185 hits_seen = rec.gene_hits();
186 id = rec.gene_id();
187
188 if(gene_fraction > 0) {
189 out << level << sep << iteration << sep
190 << id << sep << gene_fraction << sep
191 << hits_seen << '\n';
192 }
193 }
194 out.close();
195 }
196
197 /**
198 *
199 */
200 void report_results(std::vector<FastaRecord> &records,
201 const int level, const int &iteration,
202 cmd_args &args) {
203
204 if(args.bam_stream) {
205 bam_stream_results(records,level,iteration,args);
206 }
207 else {
208 file_results(records,level,iteration,args);
209 }
210 }
211
212 /**
213 * Generates a sequence of samples from sam file specified
214 * by the starting level, max level, and skip pattern
215 */
216 void generate_samples(std::vector<FastaRecord> &records,
217 std::vector<Alignments> &alignments,
218 cmd_args &args) {
219
220 int read_count = alignments.size();
221 int sample_size;
222
223 srand(unsigned(time(0)));
224
225 std::vector<int> sequence(read_count);
226 iota(sequence.begin(), sequence.end(), 0);
227
228 create_header(args);
229
230 for(int level = args.min; level <= args.max; level += args.skip) {
231 for(int sample = 0; sample < args.s_per_lev; sample++) {
232 random_shuffle(sequence.begin(), sequence.end(), randomize);
233 sample_size = round(((static_cast<double>(level)/100)*read_count));
234 std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size);
235
236 for(int a_idx = 0; a_idx < chosen.size(); a_idx++) {
237 std::string rname = alignments[chosen[a_idx]].rname();
238 int gene_idx = FastaRecord::find_gene(records, rname);
239 analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]);
240 }
241 report_results(records,level,sample+1,args);
242 FastaRecord::reset_base_hits(records);
243 FastaRecord::reset_gene_hits(records);
244 }
245 }
246 }
247
248 #endif /* SAM_RATIO_H */