changeset 0:f95150c37d38 draft default tip

planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
author chrisd
date Sun, 21 Feb 2016 23:31:55 -0500
parents
children
files README csa gene_fraction.xml gene_fraction/src/Alignments.cpp gene_fraction/src/Alignments.h gene_fraction/src/Alignments.o gene_fraction/src/Fasta.cpp gene_fraction/src/Fasta.h gene_fraction/src/Fasta.o gene_fraction/src/FastaRecord.cpp gene_fraction/src/FastaRecord.h gene_fraction/src/FastaRecord.o gene_fraction/src/Makefile gene_fraction/src/Sam.cpp gene_fraction/src/Sam.h gene_fraction/src/Sam.o gene_fraction/src/SamRatio.h gene_fraction/src/alignment_util.h gene_fraction/src/args.h gene_fraction/src/dir_util.h gene_fraction/src/gene_fraction.xml gene_fraction/src/int_util.h gene_fraction/src/main.cpp gene_fraction/src/main.o gene_fraction/src/ref.fa gene_fraction/src/res gene_fraction/src/test.sam
diffstat 26 files changed, 1091 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file csa has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction.xml	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,64 @@
+<tool id="gene_fraction" name="Coverage sampler" version="0.1.0">
+    <requirements>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+	$__tool_directory__/csa
+	  -amr_fp $input1
+	  -sam_fp $input2
+	  -min $min
+	  -max $max
+	  -t $threshold
+	  -skip $skip
+	  -samples $samples
+	  -out_fp $output1
+    ]]></command>
+    <inputs>
+	<param type="data" name="input1" format="fasta" />
+	<param type="data" name="input2" format="sam" />
+	<param name="min" type="integer" label="Starting sample level"
+	       value="1" min="1" max="100" help="(-min)" />
+	<param name="max" type="integer" label="Ending sample level"
+	       value="1" min="1" max="100" help="(-max)" />
+	<param name="threshold" type="integer" label="Gene threshold"
+	       value="1" min="1" max="100" help="(-t)" />
+	<param name="skip" type="integer" label="Amount of levels to skip"
+	       value="1" min="1" max="100" help="(-skip)" />
+	<param name="samples" type="integer" label="Amount of samples per level"
+	       value="1" min="1" max="100" help="(-samples)" />
+    </inputs>
+    <outputs>
+	<data name="output1" format="tabular" />
+    </outputs>
+    <tests>
+      <test>
+      <param name="input1" value="ref.fa"/>
+      <param name="input2" value="test.sam"/>
+      <output name="outfil1" file="out.txt"/>
+      </test>
+    </tests>
+    <help><![CDATA[
+        
+Program: Coverage Sampler 
+Contact: Chris Dean <cdean11@rams.colostate.edu
+
+Usage: csa [options]
+
+Options:
+
+    -amr_fp    amr database path
+    -sam_fp    sam file path
+    -min       starting level
+    -max       ending level
+    -skip      amount of levels to skip
+    -t         gene fraction threshold
+    -samples   amount of samples per level
+    -d         directory parsing
+    -bam       bam file parsing
+    -out_fp    output file path
+
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Alignments.cpp	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,35 @@
+#include "Alignments.h"
+#include "int_util.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <iostream>
+#include <sstream>
+
+Alignments::Alignments(std::string alignment) : _alignment(alignment) {
+	fill_alignment_fields(alignment);
+}
+
+void Alignments::fill_alignment_fields(const std::string &alignment) {
+	std::istringstream ss(alignment);
+	ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >>
+	      field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >>
+	      field.TLEN >> field.SEQ >> field.QUAL;
+}
+
+std::vector<std::pair<int,char>> Alignments::cigar() {
+	return get_cigar_operations(field.CIGAR);
+}
+
+std::vector<std::pair<int,char>> Alignments::get_cigar_operations(const std::string &cigar) {
+	std::vector<std::pair<int,char>> p;
+    	int count;
+    	char operation;
+
+    	std::istringstream ss(cigar);
+    	while(ss >> count >> operation) {
+        	p.push_back(std::make_pair(count, operation));
+    	}
+
+    	return p;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Alignments.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,68 @@
+#ifndef ALIGNMENTS_H
+#define ALIGNMENTS_H
+
+#include <string>
+#include <vector>
+
+/**
+ * Stores information about an alignment
+ */
+struct alignment_fields {
+    std::string QNAME;
+    int FLAG;
+    std::string RNAME;
+    int POS;
+    int MAPQ;
+    std::string CIGAR;
+    std::string RNEXT;
+    int PNEXT;
+    int TLEN;
+    std::string SEQ;
+    std::string QUAL;
+};
+
+/**
+ * Class for dealing with alignments
+ */
+class Alignments {
+public:
+	/**
+ 	 * Ctor that initializes alignment
+ 	 */ 
+	Alignments(std::string alignment);
+
+	/**
+ 	 * Stores information about each of the eleven
+ 	 * required alignment fields
+ 	 */ 
+    	void fill_alignment_fields(const std::string &alignment);
+
+    	std::vector<std::pair<int,char>> cigar();
+
+	inline std::string alignment() const	    { return _alignment;  };
+
+    	inline std::string qname() const            { return field.QNAME; };
+    	inline std::string rname() const            { return field.RNAME; };
+    	inline std::string cigar() const            { return field.CIGAR; };
+    	inline std::string rnext() const            { return field.RNEXT; };
+    	inline std::string seq() const              { return field.SEQ;   };
+    	inline std::string qual() const             { return field.QUAL;  };
+
+    	inline int flag() const                     { return field.FLAG;  };
+    	inline int pos() const                      { return field.POS;   };
+    	inline int mapq() const                     { return field.MAPQ;  };
+    	inline int pnext() const                    { return field.PNEXT; };
+    	inline int tlen() const			    { return field.TLEN;  };
+
+private:
+	/**
+         * Returns a pair of cigar operations as (occurrence, operation)
+         * Ex: 10M5I -> (10, M), (5, I)
+         */
+    	std::vector<std::pair<int,char>> get_cigar_operations(const std::string &cigar);
+
+    	std::string _alignment;
+    	alignment_fields field;
+};
+
+#endif /* ALIGNMENTS_H */
Binary file gene_fraction/src/Alignments.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Fasta.cpp	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,39 @@
+#include "Fasta.h"
+#include "args.h"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+
+Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {}
+
+void Fasta::read_fasta(const std::string &amr_fp) {
+	std::ifstream in(amr_fp.c_str());
+	if(!in) {
+		usage();
+		exit(EXIT_FAILURE);
+	}
+
+	std::string gene_id, gene, line;
+	while(std::getline(in, line)) {
+		std::size_t gene_idx = line.find(" ");
+		
+		if(gene_idx != std::string::npos)
+			gene_id = line.substr(1, gene_idx-1);
+		else
+			gene_id = line.substr(1, line.length());
+
+		std::getline(in, gene);
+		records.push_back(FastaRecord(gene_id, gene));
+	}
+	in.close();
+	
+	FastaRecord::sort_by_gene_id(records);
+}
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Fasta.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,28 @@
+#ifndef FASTA_H
+#define FASTA_H
+
+#include <string>
+#include <vector>
+#include "FastaRecord.h"
+
+/**
+ * Class for dealing with fasta files
+ */
+class Fasta {
+public:
+	/** 
+ 	 * Constructor that sets amr file path 
+ 	 */
+	Fasta(std::string amr_fp);
+
+	/**
+ 	 * Reads fasta file from file path 
+ 	 */
+	void read_fasta(const std::string &amr_fp);
+	
+	std::vector<FastaRecord> records;
+private:
+	std::string _amr_fp;
+};
+
+#endif /* FASTA_H */
Binary file gene_fraction/src/Fasta.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/FastaRecord.cpp	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,62 @@
+#include "FastaRecord.h"
+
+#include <algorithm>
+
+FastaRecord::FastaRecord(std::string gene_id, std::string gene) :
+	_gene_id(gene_id), _gene(gene), _base_hits(_gene.length(), 0),
+	_gene_hits(0) {}
+
+std::string FastaRecord::gene_id() const { return _gene_id; }
+
+std::string FastaRecord::gene() const { return _gene; }
+
+void FastaRecord::update_gene_hits() {
+	_gene_hits++;
+}
+
+int FastaRecord::gene_hits() const {
+	return _gene_hits;
+}
+
+int FastaRecord::get_base_hits() const {
+	return static_cast<int>(count(_base_hits.begin(), _base_hits.end(), 1));
+}
+
+int FastaRecord::find_gene(const std::vector<FastaRecord> &records,
+			   const std::string &gene_id, std::string seq) {
+	int gene_index;
+
+	std::vector<FastaRecord>::const_iterator low;
+	// binary search for fasta record index
+	low = std::lower_bound(records.begin(), records.end(), FastaRecord(gene_id, seq),
+                                   [](const FastaRecord &a, const FastaRecord &b)
+                                   { return a.gene_id() < b.gene_id(); });
+	gene_index = (low - records.begin());
+
+	return gene_index;
+}
+
+void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) {
+	// sort records by gene id
+	sort(records.begin(), records.end(), [](const FastaRecord &a, const FastaRecord &b) { return a.gene_id() < b.gene_id(); });
+}
+
+void FastaRecord::reset_base_hits(std::vector<FastaRecord> &records) {
+	for_each(records.begin(), records.end(), [](FastaRecord &a) { std::fill(a.base_hits().begin(), a.base_hits().end(), 0); });
+}
+
+void FastaRecord::reset_gene_hits(std::vector<FastaRecord> &records) {
+	for_each(records.begin(), records.end(), [](FastaRecord &a) { a._gene_hits = 0; });
+}
+
+std::vector<int> &FastaRecord::base_hits() {
+        return _base_hits;
+}
+
+
+
+
+
+
+	
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/FastaRecord.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,86 @@
+#ifndef FASTA_RECORD_H
+#define FASTA_RECORD_H
+
+#include <string>
+#include <vector>
+
+/**
+ * Class for dealing with fasta records
+ */
+class FastaRecord {
+public:
+	/**
+ 	 * Ctor that initializes gene id and gene
+ 	 */ 
+	FastaRecord(std::string gene_id, std::string gene);
+
+	/**
+ 	 * Returns a string gene id
+ 	 */
+	std::string gene_id() const;
+
+	/**
+ 	 * Returns the gene associated with gene id
+ 	 */
+	std::string gene() const;
+
+	/**
+ 	 * Returns the total base hits for a gene
+ 	 */ 
+	int get_base_hits() const;
+
+	/**
+ 	 * Returns the amount of genes that were hit
+ 	 * during the gene fraction calculation
+ 	 */ 
+	int gene_hits() const;
+	
+	/**
+ 	 *
+ 	 */ 
+	void update_base_hits(const int &index);
+
+	/**
+ 	 *
+ 	 */ 
+	void update_gene_hits();
+
+	/**
+ 	 * Searches for a fasta record corresponding
+ 	 * to gene id
+ 	 */ 
+	static int find_gene(const std::vector<FastaRecord> &records, 
+			     const std::string &gene_id, 
+                             std::string seq = "");
+
+	/**
+ 	 * Sorts fasta records by gene id
+ 	 */ 
+	static void sort_by_gene_id(std::vector<FastaRecord> &records);
+
+	/**
+ 	 * Resets base hits vector to 0's. 
+ 	 * This occurs after each sample is processed
+ 	 */ 
+	static void reset_base_hits(std::vector<FastaRecord> &records);
+
+	/**
+ 	 * Resets gene hits primitive to 0.
+ 	 * This happens after each sample is processed 
+ 	 */
+	static void reset_gene_hits(std::vector<FastaRecord> &records);
+
+	std::vector<int> &base_hits();
+
+	std::string _gene_id;
+	std::string _gene;
+	std::vector<int> _base_hits;
+
+private:
+	int _gene_hits;
+};
+
+
+
+
+#endif /* FASTA_RECORD_H */
Binary file gene_fraction/src/FastaRecord.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Makefile	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,14 @@
+output: main.o Sam.o Alignments.o Fasta.o FastaRecord.o
+	g++ -std=c++11 main.o Sam.o Alignments.o Fasta.o FastaRecord.o -o csa
+main.o: main.cpp
+	g++ -c -std=c++11 main.cpp
+Sam.o: Sam.cpp
+	g++ -c -std=c++11 Sam.cpp
+Alignments.o: Alignments.cpp
+	g++ -c -std=c++11 Alignments.cpp
+Fasta.o: Fasta.cpp
+	g++ -c -std=c++11 Fasta.cpp
+FastaRecord.o: FastaRecord.cpp
+	g++ -c -std=c++11 FastaRecord.cpp
+clean:
+	rm *.o csa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Sam.cpp	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,41 @@
+#include "Sam.h"
+#include "args.h"
+#include "dir_util.h"
+#include "alignment_util.h"
+
+#include <iostream>
+#include <fstream>
+
+Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {}
+
+void Sam::read_sam(cmd_args args) {
+	if(args.bam_stream) read_from_stdin();
+	else read_from_file(args.sam_fp);
+}
+
+void Sam::read_from_stdin() {
+	std::string line;
+	while(std::getline(std::cin, line)) {
+		if(line[0] == '@') continue;
+		alignment.push_back(line);
+	}
+}
+
+void Sam::read_from_file(const std::string &sam_fp) {
+	std::ifstream in(sam_fp.c_str());
+	if(!in) {
+		usage();
+		exit(EXIT_FAILURE);
+	}
+
+	std::string line;
+	while(getline(in, line)) {
+		if(line[0] == '@') continue;
+		if(is_good_alignment(line))
+			alignment.push_back(line);
+	}
+
+	in.close();
+}
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/Sam.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,48 @@
+#ifndef SAM_H
+#define SAM_H
+
+#include <string>
+#include <vector>
+
+#include "args.h"
+#include "Alignments.h"
+
+/**
+ * Class for dealing with sam files
+ */
+
+class Sam {
+public:
+	/**
+ 	 * Ctor initializes sam file path
+ 	 */ 
+	Sam(std::string sam_fp);
+	void read_sam(cmd_args args);
+
+	/**
+ 	 * Reads sam file from stdin 
+ 	 */ 
+	void read_from_stdin();
+
+	/**
+         * Reads sam file from directory or file path
+         */	 
+	void read_from_file(const std::string &sam_fp);
+
+	/**
+ 	 *
+ 	 */	 
+	void read_from_dir(const std::string &sam_dir_fp);
+
+	std::vector<Alignments> alignment;
+
+private:
+	std::string _sam_fp;
+};
+
+
+
+
+
+#endif /* SAM_H */
+
Binary file gene_fraction/src/Sam.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/SamRatio.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,248 @@
+#ifndef SAM_RATIO_H
+#define SAM_RATIO_H
+
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+
+#include "FastaRecord.h"
+#include "Alignments.h"
+#include "args.h"
+
+typedef std::vector<std::pair<int,char>> cigar_str;
+
+/**
+ *
+ */
+struct header {
+	std::string level =             "Level";
+        std::string iteration =         "Iteration";
+        std::string gene_id =           "Gene id";
+        std::string gene_fraction =     "Gene Fraction";
+        std::string hits =              "Hits";
+};
+
+/**
+ * Reports the total number of bases that were touched for each
+ * gene. This is largely calculated using the positional and seq
+ * information found in fields four and ten of each alignment
+ */
+void analyze_coverage(FastaRecord &record, Alignments &alignment) {
+	record.update_gene_hits();
+
+	cigar_str cigar = alignment.cigar();
+
+	int len;
+	char op;
+
+	int occurrence;
+	int pos_in_gene = alignment.pos();
+
+	int start, stop;
+	int base_hits = record._base_hits.size(); // func this
+	int read_length = alignment.seq().length(); //func this
+
+	if(pos_in_gene == 1) {
+		occurrence = 0;
+		for(int i = 0; i < cigar.size(); i++) {
+			len = cigar[i].first;
+			op = cigar[i].second;
+
+			switch(op) {
+				case 'M':
+					occurrence += len;
+					break;
+				case 'I':
+					occurrence += len;
+					break;
+				default:
+					break;
+			}
+		}
+
+		start = read_length - occurrence;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+	else {
+		start = pos_in_gene - 1;
+		stop = start + read_length;
+
+		for(int i = start; i < base_hits; i++) {
+			if(i == stop) break;
+			record._base_hits[i] = 1;
+		}
+	}
+}
+
+/**
+ * Returns gene fraction of fasta record
+ * Returns -1 if gene fraction is not greater than threshold
+ */
+double coverage(const FastaRecord &record, const int &threshold) {
+        double gene_coverage;
+
+        int base_hits, gene_size;
+
+        base_hits = record.get_base_hits();
+        gene_size = record.gene().length();
+
+        gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100;
+
+        if(gene_coverage > threshold)
+                return gene_coverage;
+        return -1;
+}
+
+/**
+ * Writes header to output file when
+ * reading from stdin
+ */
+void bam_stream_header() {
+	header h;
+	char sep = ',';
+
+	std::cout << h.level << sep << h.iteration << sep 
+                  << h.gene_id << sep << h.gene_fraction << sep 
+                  << h.hits << '\n';
+}
+
+/**
+ * Writes header to output file when
+ * reading from sam file
+ */
+void file_header(const std::string &out_fp, const std::string &sam_fp) {
+	header h;
+	std::ofstream out(out_fp.c_str(), std::ofstream::app );
+	char sep = ',';
+
+	//out << "@" << sam_fp << '\n';
+	out << h.level << sep << h.iteration << sep 
+            << h.gene_id << sep << h.gene_fraction << sep 
+            << h.hits << '\n';
+	out.close();
+}
+
+/**
+ *
+ */
+void create_header(cmd_args &args) {
+	if(args.bam_stream) {
+		bam_stream_header();
+	}
+	else {
+		file_header(args.out_fp, args.sam_fp);
+	}
+}
+
+/**
+ * Writes results to output file when reading from
+ * stdin
+ */
+void bam_stream_results(std::vector<FastaRecord> &records,
+                        const int &level, const int &iteration,
+                        cmd_args &args) {
+
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			std::cout << level << sep << iteration << sep
+			          << id << sep << gene_fraction << sep
+			          << hits_seen << '\n';			
+		}
+	}
+}
+
+/**
+ * Write results when reading sam file from
+ * path
+ */
+void file_results(std::vector<FastaRecord> &records,
+                  const int level, const int &iteration,
+                  cmd_args &args) {
+
+	std::ofstream out(args.out_fp.c_str(), std::ofstream::app);
+	
+	double gene_fraction;
+	int hits_seen;
+	std::string id;
+	char sep = ',';
+
+	for(auto &rec : records) {
+		gene_fraction = coverage(rec, args.threshold);
+		hits_seen = rec.gene_hits();
+		id = rec.gene_id();
+
+		if(gene_fraction > 0) {
+			out << level << sep << iteration << sep
+			    << id << sep << gene_fraction << sep
+                            << hits_seen << '\n';
+		}
+	}
+	out.close();
+}
+
+/**
+ *
+ */
+void report_results(std::vector<FastaRecord> &records,
+		    const int level, const int &iteration,
+		    cmd_args &args) {
+
+	if(args.bam_stream) {
+		bam_stream_results(records,level,iteration,args);
+	}
+	else {
+		file_results(records,level,iteration,args);
+	}
+}
+
+/**
+ * Generates a sequence of samples from sam file specified
+ * by the starting level, max level, and skip pattern
+ */
+void generate_samples(std::vector<FastaRecord> &records,
+                      std::vector<Alignments> &alignments,
+		      cmd_args &args) {
+
+	int read_count = alignments.size();
+	int sample_size;
+
+	srand(unsigned(time(0)));
+
+	std::vector<int> sequence(read_count);
+	iota(sequence.begin(), sequence.end(), 0);
+
+	create_header(args);	
+
+	for(int level = args.min; level <= args.max; level += args.skip) {
+		for(int sample = 0; sample < args.s_per_lev; sample++) {
+			random_shuffle(sequence.begin(), sequence.end(), randomize);
+			sample_size = round(((static_cast<double>(level)/100)*read_count));
+			std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size);
+
+			for(int a_idx = 0; a_idx < chosen.size(); a_idx++) {
+				std::string rname = alignments[chosen[a_idx]].rname();
+				int gene_idx = FastaRecord::find_gene(records, rname);	
+				analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]);
+			}
+			report_results(records,level,sample+1,args);
+			FastaRecord::reset_base_hits(records);
+			FastaRecord::reset_gene_hits(records);
+		}
+	}
+}
+
+#endif /* SAM_RATIO_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/alignment_util.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,96 @@
+#ifndef ALIGNMENT_UTIL_H
+#define ALIGNMENT_UTIL_H
+
+#include <string>
+#include <vector>
+
+#include <boost/algorithm/string.hpp>
+
+#include "int_util.h"
+
+// macro to check if read mapped
+#define READ_UNMAPPED 4
+
+/**
+ * Splits alignment into separate parts
+ */
+std::vector<std::string> split_alignment(std::string &alignment) {
+    	std::vector<std::string> parts;
+
+    	boost::trim_if(alignment, boost::is_any_of("\t "));
+	// split on tab delimeter
+    	boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on);
+	
+    	return parts;
+}
+
+/**
+ * Validates bit flag
+ */
+bool is_good_flag(const int &bit_flag) {
+    	if( (bit_flag & READ_UNMAPPED) > 0) return false;
+    	return true;
+}
+
+/**
+ * Validates rname
+ */
+bool is_good_rname(const std::string &rname) {
+    	return rname.compare("*") != 0;
+}
+
+/**
+ * Validates pos
+ */
+bool is_good_pos(const int &pos) {
+    	return pos > 0;
+}
+
+/**
+ * Validates cigar
+ */
+bool is_good_cigar(const std::string &cigar) {
+    	return cigar.compare("*") != 0;
+}
+
+/**
+ * Validates seq
+ */
+bool is_good_seq(const std::string &seq) {
+    	return seq.compare("*") != 0;
+}
+
+/**
+ * Validates alignment fields
+ */
+bool fields_are_good(std::vector<std::string> &parts) {
+    	int bit_flag = s_to_i(parts[1]);
+    	int pos = s_to_i(parts[3]);
+
+    	std::string rname = parts[2];
+    	std::string cigar = parts[5];
+    	std::string seq = parts[9];
+
+    	if(!(is_good_flag(bit_flag))) return false;
+    	if(!(is_good_pos(pos))) return false;
+    	if(!(is_good_rname(rname))) return false;
+    	if(!(is_good_cigar(cigar))) return false;
+    	if(!(is_good_seq(seq))) return false;
+
+    	return true;
+}
+
+/**
+ * Stores alignments that pass validity checks
+ */
+bool is_good_alignment(std::string &alignment) {
+    	std::vector<std::string> alignment_parts;
+
+    	alignment_parts = split_alignment(alignment);
+
+    	if(!(fields_are_good(alignment_parts)))
+        	return false;
+    	return true;
+}
+
+#endif /* ALIGNMENT_UTIL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/args.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,90 @@
+#ifndef ARGS_H
+#define ARGS_H
+
+#include "int_util.h"
+
+#include <string>
+#include <vector>
+#include <list>
+
+static void usage() {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Program: Coverage Sampler \n");
+        fprintf(stderr, "Contact: Chris Dean <cdean11@rams.colostate.edu\n\n");
+        fprintf(stderr, "Usage: csa [options]\n\n");
+        fprintf(stderr, "Options:\n\n");
+        fprintf(stderr, "    -amr_fp    amr database path\n");
+        fprintf(stderr, "    -sam_fp    sam file path\n");
+        fprintf(stderr, "    -min       starting level\n");
+        fprintf(stderr, "    -max       ending level\n");
+        fprintf(stderr, "    -skip      amount of levels to skip\n");
+        fprintf(stderr, "    -t         gene fraction threshold\n");
+        fprintf(stderr, "    -samples   amount of samples per level\n");
+        fprintf(stderr, "    -d         directory parsing\n");
+        fprintf(stderr, "    -bam       bam file parsing\n");
+        fprintf(stderr, "    -out_fp    output file path\n\n");
+}
+
+/**
+ * Encapsulates information input
+ * from the command line.
+ */
+struct cmd_args {
+	std::string amr_fp;
+	std::string sam_fp;
+	std::list<std::string> sam_dir_fp;
+	std::string out_fp;
+	
+	int threshold;
+	int min;
+	int max;
+	int skip;
+	int s_per_lev;
+
+	bool sam_dir = false;		/* This will be set to true when parsing a
+                                           directory of sam files. */
+	bool bam_stream = false;	/* This will be set to true when executing
+                                           samtools view -h example.bam | csa > output
+                                           from the command line. */
+};
+
+/**
+ * Returns a struct of command line arguments.
+ */
+static inline cmd_args
+parse_command_line(int argc, char *argv[]) {
+	std::vector<std::string> args(argv, argv + argc);
+
+	cmd_args arg;
+
+	for(int i = 1; i < argc; i++) {
+		if(args[i].compare("-amr_fp") == 0)
+			arg.amr_fp = args[++i];
+		else if(args[i].compare("-sam_fp") == 0) 
+			arg.sam_fp = args[++i];
+		else if(args[i].compare("-out_fp") == 0)
+			arg.out_fp = args[++i];
+		else if(args[i].compare("-t") == 0)
+			arg.threshold = s_to_i(args[++i]);
+		else if(args[i].compare("-min") == 0)
+			arg.min = s_to_i(args[++i]);
+		else if(args[i].compare("-max") == 0)
+			arg.max = s_to_i(args[++i]);
+		else if(args[i].compare("-skip") == 0)
+			arg.skip = s_to_i(args[++i]);
+		else if(args[i].compare("-samples") == 0)
+			arg.s_per_lev = s_to_i(args[++i]);
+		else if(args[i].compare("-d") == 0)
+			arg.sam_dir = true;
+		else if(args[i].compare("-bam") == 0)
+			arg.bam_stream = true;
+		else {
+			usage();
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	return arg;
+}
+
+#endif /* ARGS_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/dir_util.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,43 @@
+#ifndef DIR_UTIL_H
+#define DIR_UTIL_H
+
+#include <list>
+#include <iostream>
+#include <string>
+#include <dirent.h>
+
+/*
+ * Parses a directory of sam files
+ */
+static inline std::list<std::string>
+parse_sam_dir(const std::string &directory) {
+        DIR *dir;
+        struct dirent *ent;
+        std::string dir_path = directory;
+
+        dir = opendir(dir_path.c_str());
+	// is dir open/valid?
+        if(dir == NULL) {
+                std::cout << "Not a valid directory" << std::endl;
+                exit(EXIT_FAILURE);
+        }
+
+        std::list<std::string> sam_files;
+        std::string fn;
+        std::string ext;
+        std::string file_type = ".sam";
+
+	// get all files with a .sam file extension
+        while((ent = readdir(dir)) != NULL) {
+                fn = dir_path + std::string(ent->d_name);
+                ext = fn.substr(fn.length()-4, fn.length());
+                if(ext.compare(file_type) == 0) {
+                        sam_files.push_back(fn);
+                }
+        }
+        closedir(dir);
+
+        return sam_files;
+}
+
+#endif /* DIR_UTIL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/gene_fraction.xml	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,57 @@
+<tool id="gene_fraction" name="Coverage sampler" version="0.1.0">
+    <requirements>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+    <command><![CDATA[
+	csa
+	    -amr_fp $input1
+            -sam_fp $input2
+	    -min $min
+	    -max $max
+	    -skip $skip
+	    -t $threshold
+	    -samples $samples
+	    -out_fp $output1
+    ]]></command>
+    <inputs>
+	<param type="data" name="input1" format="fasta" />
+	<param type="data" name="input2" format="sam" />
+	<param name="min" type="integer" label="Minimum sample value"
+	       value="1" min="1" max="100" help="(-min)" />
+	<param name="max" type="integer" label="Maximum sample value"
+		value="1" min="1" max="100" help="(-max)" />
+	<param name="skip" type="integer" label="Amount of levels to skip"
+		value="1" min="1" max="100" help="(-skip)" />
+	<param name="threshold" type="integer" label="Gene fraction threshold"
+		value="1" min="1" max="100" help="(-t)" />
+	<param name="samples" type="integer" label="Amount of samples per level"
+		value="1" min="1" max="100" help="(-samples)" />
+    </inputs>
+    <outputs>
+	<data name="output1" format="tabular" />
+    </outputs>
+    <help><![CDATA[
+        
+Program: Coverage Sampler 
+Contact: Chris Dean <cdean11@rams.colostate.edu
+
+Usage: csa [options]
+
+Options:
+
+    -amr_fp    amr database path
+    -sam_fp    sam file path
+    -min       starting level
+    -max       ending level
+    -skip      amount of levels to skip
+    -t         gene fraction threshold
+    -samples   amount of samples per level
+    -d         directory parsing
+    -bam       bam file parsing
+    -out_fp    output file path
+
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/int_util.h	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,29 @@
+#ifndef INT_UTIL_H
+#define INT_UTIL_H
+
+#include <string>
+#include <sstream>
+
+/**
+ * Given a string, return its integer.
+ */
+static inline int
+s_to_i(const std::string &s) {
+	std::istringstream ss(s);
+	int i;
+	ss >> i;
+	return i;
+}
+
+/**
+ * Given an integer, return a random number
+ * between 0 and i.
+ */
+static inline int
+randomize(const int &i) {
+	return rand() % i;
+}
+
+#endif /*INT_UTIL_H */
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/main.cpp	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,37 @@
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "int_util.h"
+#include "dir_util.h"
+#include "args.h"
+#include "Fasta.h"
+#include "Sam.h"
+#include "SamRatio.h"
+
+using namespace std;
+
+int main(int argc, char *argv[]) {
+	cmd_args args;
+	args = parse_command_line(argc, argv);
+
+	Fasta f(args.amr_fp);
+	f.read_fasta(args.amr_fp);	
+
+	if(args.sam_dir) {
+		list<string> sam_files = parse_sam_dir(args.sam_fp);
+		for(auto &fn : sam_files) {
+			args.sam_fp = fn;
+			Sam s(args.sam_fp);
+			s.read_sam(args);
+			generate_samples(f.records, s.alignment, args);
+		}
+	}
+	else {
+		Sam s(args.sam_fp);
+		s.read_sam(args);
+		generate_samples(f.records, s.alignment, args);
+	}
+
+	return 0;
+}
Binary file gene_fraction/src/main.o has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/ref.fa	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,2 @@
+>seq1
+AAAAAAAAAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/res	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,3 @@
+@test.sam
+Level,Iteration,Gene id,Gene Fraction,Hits
+100,1,seq1,50,1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_fraction/src/test.sam	Sun Feb 21 23:31:55 2016 -0500
@@ -0,0 +1,1 @@
+QNAME	0	seq1	3	0	5M	0	0	0	AAAAA	;;;;;