diff gene_fraction/src/SamRatio.h @ 0:f95150c37d38 draft default tip

planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author chrisd
date Sun, 21 Feb 2016 23:31:55 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/SamRatio.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,248 @@
+#ifndef SAM_RATIO_H
+#define SAM_RATIO_H
+
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+#include "FastaRecord.h"
+#include "Alignments.h"
+#include "args.h"
+
+typedef std::vector<std::pair<int,char>> cigar_str;
+
+/**
+ *
+ */
+struct header {
+	std::string level =             "Level";
+        std::string iteration =         "Iteration";
+        std::string gene_id =           "Gene id";
+        std::string gene_fraction =     "Gene Fraction";
+        std::string hits =              "Hits";
+};
+
+/**
+ * Reports the total number of bases that were touched for each
+ * gene. This is largely calculated using the positional and seq
+ * information found in fields four and ten of each alignment
+ */
+void analyze_coverage(FastaRecord &record, Alignments &alignment) {
+	record.update_gene_hits();
+
+	cigar_str cigar = alignment.cigar();
+
+	int len;
+	char op;
+
+	int occurrence;
+	int pos_in_gene = alignment.pos();
+
+	int start, stop;
+	int base_hits = record._base_hits.size(); // func this
+	int read_length = alignment.seq().length(); //func this
+
+	if(pos_in_gene == 1) {
+		occurrence = 0;
+		for(int i = 0; i < cigar.size(); i++) {
+			len = cigar[i].first;
+			op = cigar[i].second;
+
+			switch(op) {
+				case 'M':
+					occurrence += len;
+					break;
+				case 'I':
+					occurrence += len;
+					break;
+				default:
+					break;
+			}
+		}
+
+		start = read_length - occurrence;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+	else {
+		start = pos_in_gene - 1;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+}
+
+/**
+ * Returns gene fraction of fasta record
+ * Returns -1 if gene fraction is not greater than threshold
+ */
+double coverage(const FastaRecord &record, const int &threshold) {
+        double gene_coverage;
+
+        int base_hits, gene_size;
+
+        base_hits = record.get_base_hits();
+        gene_size = record.gene().length();
+
+        gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100;
+
+        if(gene_coverage > threshold)
+                return gene_coverage;
+        return -1;
+}
+
+/**
+ * Writes header to output file when
+ * reading from stdin
+ */
+void bam_stream_header() {
+	header h;
+	char sep = ',';
+
+	std::cout << h.level << sep << h.iteration << sep 
+                  << h.gene_id << sep << h.gene_fraction << sep 
+                  << h.hits << '\n';
+}
+
+/**
+ * Writes header to output file when
+ * reading from sam file
+ */
+void file_header(const std::string &out_fp, const std::string &sam_fp) {
+	header h;
+	std::ofstream out(out_fp.c_str(), std::ofstream::app );
+	char sep = ',';
+
+	//out << "@" << sam_fp << '\n';
+	out << h.level << sep << h.iteration << sep 
+            << h.gene_id << sep << h.gene_fraction << sep 
+            << h.hits << '\n';
+	out.close();
+}
+
+/**
+ *
+ */
+void create_header(cmd_args &args) {
+	if(args.bam_stream) {
+		bam_stream_header();
+	}
+	else {
+		file_header(args.out_fp, args.sam_fp);
+	}
+}
+
+/**
+ * Writes results to output file when reading from
+ * stdin
+ */
+void bam_stream_results(std::vector<FastaRecord> &records,
+                        const int &level, const int &iteration,
+                        cmd_args &args) {
+
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			std::cout << level << sep << iteration << sep
+			          << id << sep << gene_fraction << sep
+			          << hits_seen << '\n';			
+		}
+	}
+}
+
+/**
+ * Write results when reading sam file from
+ * path
+ */
+void file_results(std::vector<FastaRecord> &records,
+                  const int level, const int &iteration,
+                  cmd_args &args) {
+
+	std::ofstream out(args.out_fp.c_str(), std::ofstream::app);
+	
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			out << level << sep << iteration << sep
+			    << id << sep << gene_fraction << sep
+                            << hits_seen << '\n';
+		}
+	}
+	out.close();
+}
+
+/**
+ *
+ */
+void report_results(std::vector<FastaRecord> &records,
+		    const int level, const int &iteration,
+		    cmd_args &args) {
+
+	if(args.bam_stream) {
+		bam_stream_results(records,level,iteration,args);
+	}
+	else {
+		file_results(records,level,iteration,args);
+	}
+}
+
+/**
+ * Generates a sequence of samples from sam file specified
+ * by the starting level, max level, and skip pattern
+ */
+void generate_samples(std::vector<FastaRecord> &records,
+                      std::vector<Alignments> &alignments,
+		      cmd_args &args) {
+
+	int read_count = alignments.size();
+	int sample_size;
+
+	srand(unsigned(time(0)));
+
+	std::vector<int> sequence(read_count);
+	iota(sequence.begin(), sequence.end(), 0);
+
+	create_header(args);	
+
+	for(int level = args.min; level <= args.max; level += args.skip) {
+		for(int sample = 0; sample < args.s_per_lev; sample++) {
+			random_shuffle(sequence.begin(), sequence.end(), randomize);
+			sample_size = round(((static_cast<double>(level)/100)*read_count));
+			std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size);
+
+			for(int a_idx = 0; a_idx < chosen.size(); a_idx++) {
+				std::string rname = alignments[chosen[a_idx]].rname();
+				int gene_idx = FastaRecord::find_gene(records, rname);	
+				analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]);
+			}
+			report_results(records,level,sample+1,args);
+			FastaRecord::reset_base_hits(records);
+			FastaRecord::reset_gene_hits(records);
+		}
+	}
+}
+
+#endif /* SAM_RATIO_H */