Mercurial > repos > mrvollger > trtr
changeset 30:c92ea5a8dec6
planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/trtr commit 6f76316d57fa4ec6b16aeed61e197b2c117f006d-dirty
author | lparsons |
---|---|
date | Mon, 14 Mar 2016 16:18:55 -0400 |
parents | 4e6b9640989f |
children | 13cad5d36301 |
files | tool-dependencies/trtrR-1.0.0/Makefile tool-dependencies/trtrR-1.0.0/trtr tool-dependencies/trtrR-1.0.0/trtr.c tool-dependencies/v1.0.0.tar.gz trtr-4e6b9640989f.tar.gz trtr.xml |
diffstat | 6 files changed, 266 insertions(+), 35 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-dependencies/trtrR-1.0.0/Makefile Mon Mar 14 16:18:55 2016 -0400 @@ -0,0 +1,10 @@ +trtr : trtr.c + cc trtr.c -o trtr + +clean : + rm trtr + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-dependencies/trtrR-1.0.0/trtr.c Mon Mar 14 16:18:55 2016 -0400 @@ -0,0 +1,221 @@ +/********************************************************************** +* Author: Jonathan Richards +* Date: 12/11/2014 +* +* This tool removes tandem repeats from ends of unaligned sequencing +* reads (leaving one copy). This prevents reads that don't span the +* repeated region from overlapping and leading to innaccurate SNPs +* calls. +* +* The maximimum repeat length is adjustable (use 1 to trim only +* homopolymers). +* +* The "aggressive" option should not be touched in general. Setting to +* 0 will prevent the program from trimming to exactly 1 copy of the +* repeat, instead leaving between 1 and 2 copies. Why this would be +* useful, I don't know. +* +* This program could also be a useful first step before assembly. More +* testing needs to be done. +* +* Special thanks to my advisor, Professor Alison Gammie, for bringing +* this problem to my attention and to my reseach partner, Mitchell +* Vollger, for help testing and finalizing. +**********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <sys/types.h> +#include <assert.h> +#include <errno.h> + +//use my getline for portability +//adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu +//http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c +ssize_t getline(char** lineptr, size_t* n, FILE* stream) { + size_t nchars_avail; + char* read_pos; + int save_errno; + ssize_t ret; + register int c; + + if (!lineptr || !n || !stream) { + errno = EINVAL; + return -1; + } + if (!*lineptr) { + *n = 128; + *lineptr = malloc(*n); + if (!*lineptr) { + errno = ENOMEM; + return -1; + } + } + nchars_avail = *n; + read_pos = *lineptr; + for (;;) { + c = getc(stream); + save_errno = errno; + if (c != '\r') { //for portability... + assert((*lineptr+*n)==(read_pos+nchars_avail)); + if (nchars_avail < 2) { + *n *= 2; + nchars_avail = *n + *lineptr - read_pos; + *lineptr = realloc(*lineptr, *n); + if (!*lineptr) { + errno = ENOMEM; + return -1; + } + read_pos = *n - nchars_avail + *lineptr; + assert((*lineptr+*n) == (read_pos+nchars_avail)); + } + if (ferror(stream)) { + errno = save_errno; + return -1; + } + if (c == EOF) { + if (read_pos == *lineptr) + return -1; + else + break; + } + *read_pos++ = c; + nchars_avail--; + if (c == '\n') + break; + } + } + *read_pos = '\0'; + ret = read_pos - *lineptr; + return ret; +} + +int main(int argc, char *argv[]) { + char *line = NULL; + size_t len = 0; + ssize_t line_length; + + int count = 0; + size_t leftTrim = 0; + size_t rightTrim = 0; + size_t i; + size_t i_max = 10; + size_t j; + size_t r; + size_t length; + size_t longest_region; + char *ptr; + bool matched = false; + bool aggressive_trim = true; + + FILE *file = fopen(argv[1], "r"); + + if (argc >= 3) { + i_max = strtol(argv[2], &ptr, 10); + if (argc >= 4) { + aggressive_trim = strtol(argv[3], &ptr, 10); + } + } + if (file != NULL) { + while ((line_length = getline(&line, &len, file)) != -1) { + count++; + switch (count) { + + //read name + case 1: + fputs(line, stdout); + break; + + //read sequence + case 2: + //find leftTrim + longest_region = 0; + for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat + if (line[0] == line[i]) { + matched = true; + j=1; + r=0; + while (matched == true) { + if (j == i) { + r++; + j=0; + } else if (line[j] != line[(r+1)*i+j]) { + //no length comparison needed because of \n at end + matched = false; + if (aggressive_trim) { + length = r*i+j; + } else { + length = r*i; + } + if (length > longest_region && r>0) { + longest_region = length; + } + } else { + j++; + } + } + + } + } + leftTrim = longest_region; + + //find rightTrim + longest_region = 0; + for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat + if (line[line_length-2] == line[line_length-2-i]) { + matched = true; + j=1; + r=0; + while (matched == true) { + if (j == i) { + r++; + j=0; + } else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j]) + || line_length-2-(r+1)*i-j == leftTrim) { + matched = false; + if (aggressive_trim) { + length = r*i+j; + } else { + length = r*i; + } + if (length > longest_region && r>0) { + longest_region = length; + } + } else { + j++; + } + } + } + } + rightTrim = line_length-longest_region-1; + + //print trimmed line + line[rightTrim] = '\n'; + line[rightTrim+1] = '\0'; + fputs(line+leftTrim, stdout); + break; + + //+ + case 3: + fputs(line, stdout); + break; + + //read qualities + case 4: + count = 0; //reset to read title + line[rightTrim] = '\n'; + line[rightTrim+1] = '\0'; + fputs(line+leftTrim, stdout); + break; + default: + break; + } + } + free(line); + fclose(file); + } else { + perror(argv[1]); + } + return 0; +} \ No newline at end of file
--- a/trtr.xml Sat Mar 12 14:50:15 2016 -0500 +++ b/trtr.xml Mon Mar 14 16:18:55 2016 -0400 @@ -1,43 +1,43 @@ -<tool id="trtr" name="TRTR"> - <requirements> - <requirement type="package" version="1.0">trtr</requirement> - </requirements> +<tool id="trtr" name="TRTR" version="1.0galaxy1"> + <description>Trim Reads of Tandem Repeat in a fastq file. </description> + + <requirements> + <requirement type="package" version="1.0">trtr</requirement> + </requirements> - <description>Trim Reads of Tandem Repeat in a fastq file. </description> - - <command> - trtr $input $max_repeat $aggressive > $output - </command> - - <inputs> - <param format="fastq" name="input" type="data" label="Source file"/> - <param name="max_repeat" type="integer" value="10" label="Maximum repeat length" /> - <param name="aggressive" type="integer" value="1" label="Aggressive? See description."/> - </inputs> - - <outputs> - <data format_source="input" name="output" /> - </outputs> + <command> + trtr $input $max_repeat $aggressive > $output + </command> + + <inputs> + <param format="fastq" name="input" type="data" label="Source file"/> + <param name="max_repeat" type="integer" value="10" label="Maximum repeat length" /> + <param name="aggressive" type="integer" value="1" label="Aggressive? See description."/> + </inputs> + + <outputs> + <data format_source="input" name="output" /> + </outputs> - <tests> - <test> - <param name="input" value="small.fastq"/> - <output name="output" file="smallTrimmed.fastq"/> - </test> - <test> - <param name="input" value="medium.fastq"/> - <output name="output" file="mediumTrimmed.fastq"/> - </test> - </tests> + <tests> + <test> + <param name="input" value="small.fastq"/> + <output name="output" file="smallTrimmed.fastq"/> + </test> + <test> + <param name="input" value="medium.fastq"/> + <output name="output" file="mediumTrimmed.fastq"/> + </test> + </tests> - <help> -This tool removes tandem repeats from ends of unaligned sequencing reads (leaving one copy). This prevents reads that don't span the repeated region from overlapping and leading to innaccurate SNPs calls. + <help> + This tool removes tandem repeats from ends of unaligned sequencing reads (leaving one copy). This prevents reads that don't span the repeated region from overlapping and leading to innaccurate SNPs calls. -The maximimum repeat length is adjustable (use 1 to trim only homopolymers). + The maximimum repeat length is adjustable (use 1 to trim only homopolymers). -The "aggressive" option should not be touched in general. Setting to 0 will prevent the program from trimming to exactly 1 copy of the repeat, instead leaving between 1 and 2 copies. + The "aggressive" option should not be touched in general. Setting to 0 will prevent the program from trimming to exactly 1 copy of the repeat, instead leaving between 1 and 2 copies. -This could also be a useful first step before assembly. More testing needs to be done. - </help> + This could also be a useful first step before assembly. More testing needs to be done. + </help> </tool>