Mercurial > repos > chrisd > testing
changeset 0:f95150c37d38 draft default tip
planemo upload for repository https://github.com/ChrisD11/Tools commit ddc95e5d6b5f2c0a5340c0bc384aa822db8856d5
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction.xml Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,64 @@ +<tool id="gene_fraction" name="Coverage sampler" version="0.1.0"> + <requirements> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + $__tool_directory__/csa + -amr_fp $input1 + -sam_fp $input2 + -min $min + -max $max + -t $threshold + -skip $skip + -samples $samples + -out_fp $output1 + ]]></command> + <inputs> + <param type="data" name="input1" format="fasta" /> + <param type="data" name="input2" format="sam" /> + <param name="min" type="integer" label="Starting sample level" + value="1" min="1" max="100" help="(-min)" /> + <param name="max" type="integer" label="Ending sample level" + value="1" min="1" max="100" help="(-max)" /> + <param name="threshold" type="integer" label="Gene threshold" + value="1" min="1" max="100" help="(-t)" /> + <param name="skip" type="integer" label="Amount of levels to skip" + value="1" min="1" max="100" help="(-skip)" /> + <param name="samples" type="integer" label="Amount of samples per level" + value="1" min="1" max="100" help="(-samples)" /> + </inputs> + <outputs> + <data name="output1" format="tabular" /> + </outputs> + <tests> + <test> + <param name="input1" value="ref.fa"/> + <param name="input2" value="test.sam"/> + <output name="outfil1" file="out.txt"/> + </test> + </tests> + <help><![CDATA[ + +Program: Coverage Sampler +Contact: Chris Dean <cdean11@rams.colostate.edu + +Usage: csa [options] + +Options: + + -amr_fp amr database path + -sam_fp sam file path + -min starting level + -max ending level + -skip amount of levels to skip + -t gene fraction threshold + -samples amount of samples per level + -d directory parsing + -bam bam file parsing + -out_fp output file path + + + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Alignments.cpp Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,35 @@ +#include "Alignments.h" +#include "int_util.h" + +#include <boost/algorithm/string.hpp> + +#include <iostream> +#include <sstream> + +Alignments::Alignments(std::string alignment) : _alignment(alignment) { + fill_alignment_fields(alignment); +} + +void Alignments::fill_alignment_fields(const std::string &alignment) { + std::istringstream ss(alignment); + ss >> field.QNAME >> field.FLAG >> field.RNAME >> field.POS >> + field.MAPQ >> field.CIGAR >> field.RNEXT >> field.PNEXT >> + field.TLEN >> field.SEQ >> field.QUAL; +} + +std::vector<std::pair<int,char>> Alignments::cigar() { + return get_cigar_operations(field.CIGAR); +} + +std::vector<std::pair<int,char>> Alignments::get_cigar_operations(const std::string &cigar) { + std::vector<std::pair<int,char>> p; + int count; + char operation; + + std::istringstream ss(cigar); + while(ss >> count >> operation) { + p.push_back(std::make_pair(count, operation)); + } + + return p; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Alignments.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,68 @@ +#ifndef ALIGNMENTS_H +#define ALIGNMENTS_H + +#include <string> +#include <vector> + +/** + * Stores information about an alignment + */ +struct alignment_fields { + std::string QNAME; + int FLAG; + std::string RNAME; + int POS; + int MAPQ; + std::string CIGAR; + std::string RNEXT; + int PNEXT; + int TLEN; + std::string SEQ; + std::string QUAL; +}; + +/** + * Class for dealing with alignments + */ +class Alignments { +public: + /** + * Ctor that initializes alignment + */ + Alignments(std::string alignment); + + /** + * Stores information about each of the eleven + * required alignment fields + */ + void fill_alignment_fields(const std::string &alignment); + + std::vector<std::pair<int,char>> cigar(); + + inline std::string alignment() const { return _alignment; }; + + inline std::string qname() const { return field.QNAME; }; + inline std::string rname() const { return field.RNAME; }; + inline std::string cigar() const { return field.CIGAR; }; + inline std::string rnext() const { return field.RNEXT; }; + inline std::string seq() const { return field.SEQ; }; + inline std::string qual() const { return field.QUAL; }; + + inline int flag() const { return field.FLAG; }; + inline int pos() const { return field.POS; }; + inline int mapq() const { return field.MAPQ; }; + inline int pnext() const { return field.PNEXT; }; + inline int tlen() const { return field.TLEN; }; + +private: + /** + * Returns a pair of cigar operations as (occurrence, operation) + * Ex: 10M5I -> (10, M), (5, I) + */ + std::vector<std::pair<int,char>> get_cigar_operations(const std::string &cigar); + + std::string _alignment; + alignment_fields field; +}; + +#endif /* ALIGNMENTS_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Fasta.cpp Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,39 @@ +#include "Fasta.h" +#include "args.h" + +#include <iostream> +#include <fstream> +#include <vector> +#include <string> + +Fasta::Fasta(std::string amr_fp) : _amr_fp(amr_fp) {} + +void Fasta::read_fasta(const std::string &amr_fp) { + std::ifstream in(amr_fp.c_str()); + if(!in) { + usage(); + exit(EXIT_FAILURE); + } + + std::string gene_id, gene, line; + while(std::getline(in, line)) { + std::size_t gene_idx = line.find(" "); + + if(gene_idx != std::string::npos) + gene_id = line.substr(1, gene_idx-1); + else + gene_id = line.substr(1, line.length()); + + std::getline(in, gene); + records.push_back(FastaRecord(gene_id, gene)); + } + in.close(); + + FastaRecord::sort_by_gene_id(records); +} + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Fasta.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,28 @@ +#ifndef FASTA_H +#define FASTA_H + +#include <string> +#include <vector> +#include "FastaRecord.h" + +/** + * Class for dealing with fasta files + */ +class Fasta { +public: + /** + * Constructor that sets amr file path + */ + Fasta(std::string amr_fp); + + /** + * Reads fasta file from file path + */ + void read_fasta(const std::string &amr_fp); + + std::vector<FastaRecord> records; +private: + std::string _amr_fp; +}; + +#endif /* FASTA_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/FastaRecord.cpp Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,62 @@ +#include "FastaRecord.h" + +#include <algorithm> + +FastaRecord::FastaRecord(std::string gene_id, std::string gene) : + _gene_id(gene_id), _gene(gene), _base_hits(_gene.length(), 0), + _gene_hits(0) {} + +std::string FastaRecord::gene_id() const { return _gene_id; } + +std::string FastaRecord::gene() const { return _gene; } + +void FastaRecord::update_gene_hits() { + _gene_hits++; +} + +int FastaRecord::gene_hits() const { + return _gene_hits; +} + +int FastaRecord::get_base_hits() const { + return static_cast<int>(count(_base_hits.begin(), _base_hits.end(), 1)); +} + +int FastaRecord::find_gene(const std::vector<FastaRecord> &records, + const std::string &gene_id, std::string seq) { + int gene_index; + + std::vector<FastaRecord>::const_iterator low; + // binary search for fasta record index + low = std::lower_bound(records.begin(), records.end(), FastaRecord(gene_id, seq), + [](const FastaRecord &a, const FastaRecord &b) + { return a.gene_id() < b.gene_id(); }); + gene_index = (low - records.begin()); + + return gene_index; +} + +void FastaRecord::sort_by_gene_id(std::vector<FastaRecord> &records) { + // sort records by gene id + sort(records.begin(), records.end(), [](const FastaRecord &a, const FastaRecord &b) { return a.gene_id() < b.gene_id(); }); +} + +void FastaRecord::reset_base_hits(std::vector<FastaRecord> &records) { + for_each(records.begin(), records.end(), [](FastaRecord &a) { std::fill(a.base_hits().begin(), a.base_hits().end(), 0); }); +} + +void FastaRecord::reset_gene_hits(std::vector<FastaRecord> &records) { + for_each(records.begin(), records.end(), [](FastaRecord &a) { a._gene_hits = 0; }); +} + +std::vector<int> &FastaRecord::base_hits() { + return _base_hits; +} + + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/FastaRecord.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,86 @@ +#ifndef FASTA_RECORD_H +#define FASTA_RECORD_H + +#include <string> +#include <vector> + +/** + * Class for dealing with fasta records + */ +class FastaRecord { +public: + /** + * Ctor that initializes gene id and gene + */ + FastaRecord(std::string gene_id, std::string gene); + + /** + * Returns a string gene id + */ + std::string gene_id() const; + + /** + * Returns the gene associated with gene id + */ + std::string gene() const; + + /** + * Returns the total base hits for a gene + */ + int get_base_hits() const; + + /** + * Returns the amount of genes that were hit + * during the gene fraction calculation + */ + int gene_hits() const; + + /** + * + */ + void update_base_hits(const int &index); + + /** + * + */ + void update_gene_hits(); + + /** + * Searches for a fasta record corresponding + * to gene id + */ + static int find_gene(const std::vector<FastaRecord> &records, + const std::string &gene_id, + std::string seq = ""); + + /** + * Sorts fasta records by gene id + */ + static void sort_by_gene_id(std::vector<FastaRecord> &records); + + /** + * Resets base hits vector to 0's. + * This occurs after each sample is processed + */ + static void reset_base_hits(std::vector<FastaRecord> &records); + + /** + * Resets gene hits primitive to 0. + * This happens after each sample is processed + */ + static void reset_gene_hits(std::vector<FastaRecord> &records); + + std::vector<int> &base_hits(); + + std::string _gene_id; + std::string _gene; + std::vector<int> _base_hits; + +private: + int _gene_hits; +}; + + + + +#endif /* FASTA_RECORD_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Makefile Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,14 @@ +output: main.o Sam.o Alignments.o Fasta.o FastaRecord.o + g++ -std=c++11 main.o Sam.o Alignments.o Fasta.o FastaRecord.o -o csa +main.o: main.cpp + g++ -c -std=c++11 main.cpp +Sam.o: Sam.cpp + g++ -c -std=c++11 Sam.cpp +Alignments.o: Alignments.cpp + g++ -c -std=c++11 Alignments.cpp +Fasta.o: Fasta.cpp + g++ -c -std=c++11 Fasta.cpp +FastaRecord.o: FastaRecord.cpp + g++ -c -std=c++11 FastaRecord.cpp +clean: + rm *.o csa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Sam.cpp Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,41 @@ +#include "Sam.h" +#include "args.h" +#include "dir_util.h" +#include "alignment_util.h" + +#include <iostream> +#include <fstream> + +Sam::Sam(std::string sam_fp) : _sam_fp(sam_fp) {} + +void Sam::read_sam(cmd_args args) { + if(args.bam_stream) read_from_stdin(); + else read_from_file(args.sam_fp); +} + +void Sam::read_from_stdin() { + std::string line; + while(std::getline(std::cin, line)) { + if(line[0] == '@') continue; + alignment.push_back(line); + } +} + +void Sam::read_from_file(const std::string &sam_fp) { + std::ifstream in(sam_fp.c_str()); + if(!in) { + usage(); + exit(EXIT_FAILURE); + } + + std::string line; + while(getline(in, line)) { + if(line[0] == '@') continue; + if(is_good_alignment(line)) + alignment.push_back(line); + } + + in.close(); +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/Sam.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,48 @@ +#ifndef SAM_H +#define SAM_H + +#include <string> +#include <vector> + +#include "args.h" +#include "Alignments.h" + +/** + * Class for dealing with sam files + */ + +class Sam { +public: + /** + * Ctor initializes sam file path + */ + Sam(std::string sam_fp); + void read_sam(cmd_args args); + + /** + * Reads sam file from stdin + */ + void read_from_stdin(); + + /** + * Reads sam file from directory or file path + */ + void read_from_file(const std::string &sam_fp); + + /** + * + */ + void read_from_dir(const std::string &sam_dir_fp); + + std::vector<Alignments> alignment; + +private: + std::string _sam_fp; +}; + + + + + +#endif /* SAM_H */ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/SamRatio.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,248 @@ +#ifndef SAM_RATIO_H +#define SAM_RATIO_H + +#include <iostream> +#include <fstream> +#include <algorithm> + +#include "FastaRecord.h" +#include "Alignments.h" +#include "args.h" + +typedef std::vector<std::pair<int,char>> cigar_str; + +/** + * + */ +struct header { + std::string level = "Level"; + std::string iteration = "Iteration"; + std::string gene_id = "Gene id"; + std::string gene_fraction = "Gene Fraction"; + std::string hits = "Hits"; +}; + +/** + * Reports the total number of bases that were touched for each + * gene. This is largely calculated using the positional and seq + * information found in fields four and ten of each alignment + */ +void analyze_coverage(FastaRecord &record, Alignments &alignment) { + record.update_gene_hits(); + + cigar_str cigar = alignment.cigar(); + + int len; + char op; + + int occurrence; + int pos_in_gene = alignment.pos(); + + int start, stop; + int base_hits = record._base_hits.size(); // func this + int read_length = alignment.seq().length(); //func this + + if(pos_in_gene == 1) { + occurrence = 0; + for(int i = 0; i < cigar.size(); i++) { + len = cigar[i].first; + op = cigar[i].second; + + switch(op) { + case 'M': + occurrence += len; + break; + case 'I': + occurrence += len; + break; + default: + break; + } + } + + start = read_length - occurrence; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } + else { + start = pos_in_gene - 1; + stop = start + read_length; + + for(int i = start; i < base_hits; i++) { + if(i == stop) break; + record._base_hits[i] = 1; + } + } +} + +/** + * Returns gene fraction of fasta record + * Returns -1 if gene fraction is not greater than threshold + */ +double coverage(const FastaRecord &record, const int &threshold) { + double gene_coverage; + + int base_hits, gene_size; + + base_hits = record.get_base_hits(); + gene_size = record.gene().length(); + + gene_coverage = (static_cast<double>(base_hits)/static_cast<double>(gene_size))*100; + + if(gene_coverage > threshold) + return gene_coverage; + return -1; +} + +/** + * Writes header to output file when + * reading from stdin + */ +void bam_stream_header() { + header h; + char sep = ','; + + std::cout << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; +} + +/** + * Writes header to output file when + * reading from sam file + */ +void file_header(const std::string &out_fp, const std::string &sam_fp) { + header h; + std::ofstream out(out_fp.c_str(), std::ofstream::app ); + char sep = ','; + + //out << "@" << sam_fp << '\n'; + out << h.level << sep << h.iteration << sep + << h.gene_id << sep << h.gene_fraction << sep + << h.hits << '\n'; + out.close(); +} + +/** + * + */ +void create_header(cmd_args &args) { + if(args.bam_stream) { + bam_stream_header(); + } + else { + file_header(args.out_fp, args.sam_fp); + } +} + +/** + * Writes results to output file when reading from + * stdin + */ +void bam_stream_results(std::vector<FastaRecord> &records, + const int &level, const int &iteration, + cmd_args &args) { + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + std::cout << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } +} + +/** + * Write results when reading sam file from + * path + */ +void file_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + std::ofstream out(args.out_fp.c_str(), std::ofstream::app); + + double gene_fraction; + int hits_seen; + std::string id; + char sep = ','; + + for(auto &rec : records) { + gene_fraction = coverage(rec, args.threshold); + hits_seen = rec.gene_hits(); + id = rec.gene_id(); + + if(gene_fraction > 0) { + out << level << sep << iteration << sep + << id << sep << gene_fraction << sep + << hits_seen << '\n'; + } + } + out.close(); +} + +/** + * + */ +void report_results(std::vector<FastaRecord> &records, + const int level, const int &iteration, + cmd_args &args) { + + if(args.bam_stream) { + bam_stream_results(records,level,iteration,args); + } + else { + file_results(records,level,iteration,args); + } +} + +/** + * Generates a sequence of samples from sam file specified + * by the starting level, max level, and skip pattern + */ +void generate_samples(std::vector<FastaRecord> &records, + std::vector<Alignments> &alignments, + cmd_args &args) { + + int read_count = alignments.size(); + int sample_size; + + srand(unsigned(time(0))); + + std::vector<int> sequence(read_count); + iota(sequence.begin(), sequence.end(), 0); + + create_header(args); + + for(int level = args.min; level <= args.max; level += args.skip) { + for(int sample = 0; sample < args.s_per_lev; sample++) { + random_shuffle(sequence.begin(), sequence.end(), randomize); + sample_size = round(((static_cast<double>(level)/100)*read_count)); + std::vector<int> chosen(sequence.begin(), sequence.begin()+sample_size); + + for(int a_idx = 0; a_idx < chosen.size(); a_idx++) { + std::string rname = alignments[chosen[a_idx]].rname(); + int gene_idx = FastaRecord::find_gene(records, rname); + analyze_coverage(records[gene_idx], alignments[chosen[a_idx]]); + } + report_results(records,level,sample+1,args); + FastaRecord::reset_base_hits(records); + FastaRecord::reset_gene_hits(records); + } + } +} + +#endif /* SAM_RATIO_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/alignment_util.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,96 @@ +#ifndef ALIGNMENT_UTIL_H +#define ALIGNMENT_UTIL_H + +#include <string> +#include <vector> + +#include <boost/algorithm/string.hpp> + +#include "int_util.h" + +// macro to check if read mapped +#define READ_UNMAPPED 4 + +/** + * Splits alignment into separate parts + */ +std::vector<std::string> split_alignment(std::string &alignment) { + std::vector<std::string> parts; + + boost::trim_if(alignment, boost::is_any_of("\t ")); + // split on tab delimeter + boost::split(parts, alignment, boost::is_any_of("\t "), boost::token_compress_on); + + return parts; +} + +/** + * Validates bit flag + */ +bool is_good_flag(const int &bit_flag) { + if( (bit_flag & READ_UNMAPPED) > 0) return false; + return true; +} + +/** + * Validates rname + */ +bool is_good_rname(const std::string &rname) { + return rname.compare("*") != 0; +} + +/** + * Validates pos + */ +bool is_good_pos(const int &pos) { + return pos > 0; +} + +/** + * Validates cigar + */ +bool is_good_cigar(const std::string &cigar) { + return cigar.compare("*") != 0; +} + +/** + * Validates seq + */ +bool is_good_seq(const std::string &seq) { + return seq.compare("*") != 0; +} + +/** + * Validates alignment fields + */ +bool fields_are_good(std::vector<std::string> &parts) { + int bit_flag = s_to_i(parts[1]); + int pos = s_to_i(parts[3]); + + std::string rname = parts[2]; + std::string cigar = parts[5]; + std::string seq = parts[9]; + + if(!(is_good_flag(bit_flag))) return false; + if(!(is_good_pos(pos))) return false; + if(!(is_good_rname(rname))) return false; + if(!(is_good_cigar(cigar))) return false; + if(!(is_good_seq(seq))) return false; + + return true; +} + +/** + * Stores alignments that pass validity checks + */ +bool is_good_alignment(std::string &alignment) { + std::vector<std::string> alignment_parts; + + alignment_parts = split_alignment(alignment); + + if(!(fields_are_good(alignment_parts))) + return false; + return true; +} + +#endif /* ALIGNMENT_UTIL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/args.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,90 @@ +#ifndef ARGS_H +#define ARGS_H + +#include "int_util.h" + +#include <string> +#include <vector> +#include <list> + +static void usage() { + fprintf(stderr, "\n"); + fprintf(stderr, "Program: Coverage Sampler \n"); + fprintf(stderr, "Contact: Chris Dean <cdean11@rams.colostate.edu\n\n"); + fprintf(stderr, "Usage: csa [options]\n\n"); + fprintf(stderr, "Options:\n\n"); + fprintf(stderr, " -amr_fp amr database path\n"); + fprintf(stderr, " -sam_fp sam file path\n"); + fprintf(stderr, " -min starting level\n"); + fprintf(stderr, " -max ending level\n"); + fprintf(stderr, " -skip amount of levels to skip\n"); + fprintf(stderr, " -t gene fraction threshold\n"); + fprintf(stderr, " -samples amount of samples per level\n"); + fprintf(stderr, " -d directory parsing\n"); + fprintf(stderr, " -bam bam file parsing\n"); + fprintf(stderr, " -out_fp output file path\n\n"); +} + +/** + * Encapsulates information input + * from the command line. + */ +struct cmd_args { + std::string amr_fp; + std::string sam_fp; + std::list<std::string> sam_dir_fp; + std::string out_fp; + + int threshold; + int min; + int max; + int skip; + int s_per_lev; + + bool sam_dir = false; /* This will be set to true when parsing a + directory of sam files. */ + bool bam_stream = false; /* This will be set to true when executing + samtools view -h example.bam | csa > output + from the command line. */ +}; + +/** + * Returns a struct of command line arguments. + */ +static inline cmd_args +parse_command_line(int argc, char *argv[]) { + std::vector<std::string> args(argv, argv + argc); + + cmd_args arg; + + for(int i = 1; i < argc; i++) { + if(args[i].compare("-amr_fp") == 0) + arg.amr_fp = args[++i]; + else if(args[i].compare("-sam_fp") == 0) + arg.sam_fp = args[++i]; + else if(args[i].compare("-out_fp") == 0) + arg.out_fp = args[++i]; + else if(args[i].compare("-t") == 0) + arg.threshold = s_to_i(args[++i]); + else if(args[i].compare("-min") == 0) + arg.min = s_to_i(args[++i]); + else if(args[i].compare("-max") == 0) + arg.max = s_to_i(args[++i]); + else if(args[i].compare("-skip") == 0) + arg.skip = s_to_i(args[++i]); + else if(args[i].compare("-samples") == 0) + arg.s_per_lev = s_to_i(args[++i]); + else if(args[i].compare("-d") == 0) + arg.sam_dir = true; + else if(args[i].compare("-bam") == 0) + arg.bam_stream = true; + else { + usage(); + exit(EXIT_FAILURE); + } + } + + return arg; +} + +#endif /* ARGS_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/dir_util.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,43 @@ +#ifndef DIR_UTIL_H +#define DIR_UTIL_H + +#include <list> +#include <iostream> +#include <string> +#include <dirent.h> + +/* + * Parses a directory of sam files + */ +static inline std::list<std::string> +parse_sam_dir(const std::string &directory) { + DIR *dir; + struct dirent *ent; + std::string dir_path = directory; + + dir = opendir(dir_path.c_str()); + // is dir open/valid? + if(dir == NULL) { + std::cout << "Not a valid directory" << std::endl; + exit(EXIT_FAILURE); + } + + std::list<std::string> sam_files; + std::string fn; + std::string ext; + std::string file_type = ".sam"; + + // get all files with a .sam file extension + while((ent = readdir(dir)) != NULL) { + fn = dir_path + std::string(ent->d_name); + ext = fn.substr(fn.length()-4, fn.length()); + if(ext.compare(file_type) == 0) { + sam_files.push_back(fn); + } + } + closedir(dir); + + return sam_files; +} + +#endif /* DIR_UTIL_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/gene_fraction.xml Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,57 @@ +<tool id="gene_fraction" name="Coverage sampler" version="0.1.0"> + <requirements> + </requirements> + <stdio> + <exit_code range="1:" /> + </stdio> + <command><![CDATA[ + csa + -amr_fp $input1 + -sam_fp $input2 + -min $min + -max $max + -skip $skip + -t $threshold + -samples $samples + -out_fp $output1 + ]]></command> + <inputs> + <param type="data" name="input1" format="fasta" /> + <param type="data" name="input2" format="sam" /> + <param name="min" type="integer" label="Minimum sample value" + value="1" min="1" max="100" help="(-min)" /> + <param name="max" type="integer" label="Maximum sample value" + value="1" min="1" max="100" help="(-max)" /> + <param name="skip" type="integer" label="Amount of levels to skip" + value="1" min="1" max="100" help="(-skip)" /> + <param name="threshold" type="integer" label="Gene fraction threshold" + value="1" min="1" max="100" help="(-t)" /> + <param name="samples" type="integer" label="Amount of samples per level" + value="1" min="1" max="100" help="(-samples)" /> + </inputs> + <outputs> + <data name="output1" format="tabular" /> + </outputs> + <help><![CDATA[ + +Program: Coverage Sampler +Contact: Chris Dean <cdean11@rams.colostate.edu + +Usage: csa [options] + +Options: + + -amr_fp amr database path + -sam_fp sam file path + -min starting level + -max ending level + -skip amount of levels to skip + -t gene fraction threshold + -samples amount of samples per level + -d directory parsing + -bam bam file parsing + -out_fp output file path + + + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/int_util.h Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,29 @@ +#ifndef INT_UTIL_H +#define INT_UTIL_H + +#include <string> +#include <sstream> + +/** + * Given a string, return its integer. + */ +static inline int +s_to_i(const std::string &s) { + std::istringstream ss(s); + int i; + ss >> i; + return i; +} + +/** + * Given an integer, return a random number + * between 0 and i. + */ +static inline int +randomize(const int &i) { + return rand() % i; +} + +#endif /*INT_UTIL_H */ + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/main.cpp Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,37 @@ +#include <string> +#include <iostream> +#include <vector> + +#include "int_util.h" +#include "dir_util.h" +#include "args.h" +#include "Fasta.h" +#include "Sam.h" +#include "SamRatio.h" + +using namespace std; + +int main(int argc, char *argv[]) { + cmd_args args; + args = parse_command_line(argc, argv); + + Fasta f(args.amr_fp); + f.read_fasta(args.amr_fp); + + if(args.sam_dir) { + list<string> sam_files = parse_sam_dir(args.sam_fp); + for(auto &fn : sam_files) { + args.sam_fp = fn; + Sam s(args.sam_fp); + s.read_sam(args); + generate_samples(f.records, s.alignment, args); + } + } + else { + Sam s(args.sam_fp); + s.read_sam(args); + generate_samples(f.records, s.alignment, args); + } + + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_fraction/src/ref.fa Sun Feb 21 23:31:55 2016 -0500 @@ -0,0 +1,2 @@ +>seq1 +AAAAAAAAAA