Mercurial > repos > saskia-hiltemann > file_manipulation
changeset 0:e77c9484b2d0 draft default tip
Uploaded
author | saskia-hiltemann |
---|---|
date | Thu, 22 Oct 2015 09:18:30 -0400 |
parents | |
children | |
files | chrprefix.sh chrprefix.xml column_extract.sh column_extract.xml concatenate.sh concatenate.xml filter_columns.sh filter_columns.xml getcolumnnames.py sort_chromosomal_position.sh sort_chromosomal_position.xml strip_header.sh strip_header.xml |
diffstat | 13 files changed, 550 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
#
# chrprefix.sh - add or remove the "chr" prefix in one column of a
# tab-delimited file that starts with a 1-line header.
#
# Usage: chrprefix.sh <inputfile> <column> <add|remove> <outputfile>
#
#   inputfile   tab-delimited input file
#   column      column label; its leading digits are the 1-based column
#               number (e.g. "3: chromosome" or plain "3")
#   add|remove  "add" prepends "chr"; any other value removes the prefix
#   outputfile  file the rewritten table is written to
#
# The first line, empty lines and lines starting with "#" are copied
# unchanged. Progress messages and a 5-line preview of both files are
# written to stdout.
#
# Functions report failure through their return status; the status of
# main is the exit status of the script.

#######################################
# Print the leading digits of a column label.
# Arguments: $1 - column label, e.g. "3: chromosome"
# Outputs:   the column number on stdout
# Returns:   1 if the label does not start with a digit
#######################################
column_number() {
  local label=$1
  [[ "$label" =~ ^[0-9]+ ]] || return 1
  printf '%s\n' "${BASH_REMATCH[0]}"
}

#######################################
# Rewrite one column of a tab-delimited file, adding or removing "chr".
# Arguments: $1 - mode: "add" to prepend, anything else to remove
#            $2 - 1-based column number
#            $3 - input file
# Outputs:   the rewritten table on stdout
# Returns:   the exit status of awk
#######################################
rewrite_chr_prefix() {
  local mode=$1 colnum=$2 infile=$3
  # Values are handed to awk with -v instead of being spliced into the
  # program text, so they can never be parsed as awk code.
  awk -v mode="$mode" -v c="$colnum" '
    BEGIN { FS = "\t"; OFS = "\t" }
    # Data rows only: skip the header line, comment lines and empty lines.
    FNR > 1 && NF > 0 && index($0, "#") != 1 {
      if (mode == "add") {
        $c = "chr" $c
      } else if (index($c, "chr") == 1) {
        # Strip only when the prefix is really there.
        $c = substr($c, 4)
      }
    }
    { print }
  ' "$infile"
}

#######################################
# Entry point: parse arguments, rewrite the file, print previews.
# Arguments: inputfile, column label, add|remove, outputfile
# Returns:   0 on success, 2 on bad arguments, 1 on processing failure
#######################################
main() {
  if (( $# != 4 )); then
    printf 'Usage: %s <inputfile> <column> <add|remove> <outputfile>\n' \
      "${0##*/}" >&2
    return 2
  fi

  local inputfile=$1 col=$2 addremove=$3 outputfile=$4
  local column

  echo "args: $*"
  echo "inputfile: $inputfile"
  echo "column: $col"
  echo "addremove: $addremove"
  echo "outputfile: $outputfile"

  if ! column=$(column_number "$col"); then
    printf 'chrprefix.sh: no column number in "%s"\n' "$col" >&2
    return 2
  fi
  echo "colnumber: $column"

  if [[ "$addremove" == "add" ]]; then
    echo "adding prefix to column $column"
  else
    echo "removing prefix from column $column"
  fi

  rewrite_chr_prefix "$addremove" "$column" "$inputfile" > "$outputfile" \
    || return 1

  echo "inputfile: "
  head -n 5 -- "$inputfile"

  echo "outputfile: "
  head -n 5 -- "$outputfile"
}

main "$@"
<!--
  chrprefix.xml: Galaxy tool wrapper for chrprefix.sh.
  Lets the user pick one column of a tab-delimited dataset and add or
  remove the "chr" prefix in it.
-->
<tool id="chr_prefix" name="Add/Remove chr prefix" version="1">

  <description> add or remove chr prefix from a column </description>

  <!-- supplies get_headers(), which fills the column select list below -->
  <code file="getcolumnnames.py"/>

  <!--
    Positional arguments of chrprefix.sh: input file, selected column
    label, add|remove, output file. go.column is only defined in the
    "yes" branch of the "go" conditional.
  -->
  <command interpreter="bash">
    chrprefix.sh
    $infile
    "${go.column}"
    $addremove
    $outputfile
  </command>

  <outputs>
    <data format="tabular" name="outputfile" label="${sample} - chr-prefix on ${on_string}"/>
  </outputs>

  <inputs>

    <param name="infile" type="data" label="Select inputfile" help="must be a tab-delimited file with a 1-line header" />
    <!-- the column list is built from the input file once "ready" is set to yes -->
    <conditional name="go">
      <param name="ready" type="select" label="Ready to select column?" help="Set to NO and back to YES after changing input file">
        <option value="no" selected="True">no</option>
        <option value="yes">yes</option>
      </param>

      <when value="yes">
        <param name="column" type="select" dynamic_options="get_headers(infile)" multiple="False" label="Select Column" />
      </when>
    </conditional>

    <param name="addremove" type="select" label="Add or remove the chr prefix?">
      <option value="add">Add</option>
      <option value="remove">Remove</option>
    </param>
    <!-- only used in the output dataset label -->
    <param name="sample" type="text" value="" label="Output File Name" help="Optional."/>

  </inputs>

  <help>

**What it does**
removes or adds the "chr" prefix in a column of a file. Some tools expect you to indicate chromosomes as "chr1,chr2,chrX", while others expect only 1,2,X as input. This tool allows you to easily switch notations

  </help>
</tool>
#!/bin/bash
#
# column_extract.sh - extract/rearrange columns of a tab-delimited file.
#
# Usage: column_extract.sh <inputfile> <outputfile> <removeheader> <column>...
#
#   inputfile     tab-delimited input file
#   outputfile    file the selected columns are written to
#   removeheader  "N" copies lines starting with "#" and blank lines to the
#                 output unchanged; any other value drops them
#   column        column labels of the form "N: name". A label may arrive as
#                 one argument ("3: start pos") or split into words
#                 ("3:" "start" "pos"); only the "N:" part is used.
#
# Columns are written in the order given. Functions report failure through
# their return status; the status of main is the exit status of the script.

#######################################
# Collect the column numbers from a list of label words.
# A word counts as a column number when it is digits followed by ":" and
# then either the end of the word or whitespace.
# Arguments: label words
# Outputs:   comma-separated column numbers on stdout (empty if none)
#######################################
parse_column_numbers() {
  local re='^([0-9]+):($|[[:space:]])'
  local word list=""
  for word in "$@"; do
    if [[ "$word" =~ $re ]]; then
      list+="${list:+,}${BASH_REMATCH[1]}"
    fi
  done
  printf '%s\n' "$list"
}

#######################################
# Print the selected columns of a tab-delimited file.
# Arguments: $1 - comma-separated 1-based column numbers
#            $2 - removeheader flag ("N" keeps comment/blank lines)
#            $3 - input file
# Outputs:   the extracted table on stdout
# Returns:   the exit status of awk
#######################################
extract_columns() {
  local cols=$1 removeheader=$2 infile=$3
  awk -v columns="$cols" -v removeheader="$removeheader" '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      len = split(columns, arr, ",")
    }
    # Comment lines and blank lines are passed through whole or dropped.
    index($0, "#") == 1 || $0 ~ /^[ \t]*$/ {
      if (removeheader == "N") {
        print
      }
      next
    }
    {
      # Build the row with plain concatenation so that "%" in the data is
      # never interpreted as a printf format.
      line = $(arr[1] + 0)
      for (i = 2; i <= len; i++) {
        line = line OFS $(arr[i] + 0)
      }
      print line
    }
  ' "$infile"
}

#######################################
# Entry point.
# Arguments: inputfile, outputfile, removeheader, column labels...
# Returns:   0 on success, 2 on bad arguments, otherwise awk status
#######################################
main() {
  if (( $# < 4 )); then
    printf 'Usage: %s <inputfile> <outputfile> <removeheader> <column>...\n' \
      "${0##*/}" >&2
    return 2
  fi

  local inputfile=$1 outputfile=$2 removeheader=$3
  shift 3

  local mycols
  mycols=$(parse_column_numbers "$@")
  if [[ -z "$mycols" ]]; then
    printf 'column_extract.sh: no column numbers found in: %s\n' "$*" >&2
    return 2
  fi
  echo "mycols: $mycols"

  extract_columns "$mycols" "$removeheader" "$inputfile" > "$outputfile"
}

main "$@"
<!--
  column_extract.xml: Galaxy tool wrapper for column_extract.sh.
  Outputs the columns picked by the user, in the order they were picked.
-->
<tool id="column_select" name="Column Select" version="1">

  <description> extract/rearrange columns from a tab-delimited file </description>
  <!-- supplies get_headers(), which fills the column select lists below -->
  <code file="getcolumnnames.py"/>

  <!--
    Arguments of column_extract.sh: input file, output file, Y|N for header
    removal, then one "N: name" label per selected column (the label format
    produced by get_headers()).
  -->
  <command interpreter="bash">
    column_extract.sh
    $infile
    $outputfile
    $removeheader
    #for $c in $go.columns
      ${c.column}
    #end for
  </command>

  <outputs>
    <data format="tabular" name="outputfile" label="${sample} Column extract on ${on_string}"/>
  </outputs>

  <inputs>

    <param name="infile" type="data" label="Select file to extract columns from" help="Must be a tab-separated file with a 1-line header"/>
    <!-- boolean parameters take their default from "checked" -->
    <param name="removeheader" type="boolean" truevalue="Y" falsevalue="N" checked="false" label="Remove header?" help="If selected, any lines starting with # and any empty lines will not be output."/>

    <!-- the column lists are built from the input file once "ready" is set to yes -->
    <conditional name="go">
      <param name="ready" type="select" label="Ready to select columns?" help="Set to NO and back to YES after changing input file">
        <option value="no" selected="True">no</option>
        <option value="yes">yes</option>
      </param>

      <when value="yes">
        <repeat name="columns" title="Column" min="1">
          <!-- <param name="col" type="data_column" data_ref="infile" multiple="False" label="Column from file to output" /> -->
          <param name="column" type="select" dynamic_options="get_headers(infile)" multiple="False" label="Select Column" />
        </repeat>
      </when>
    </conditional>

    <!-- only used in the output dataset label -->
    <param name="sample" type="text" value="" label="Sample Name" help="Optional. For file naming purposes only"/>

  </inputs>

  <help>


**What it does**
Outputs columns of input file in order specified by user. Columns not selected will not be output.

  </help>
</tool>
#!/bin/bash
#
# concatenate.sh - write the contents of two files, in order, to a third.
#
# Usage: concatenate.sh <first> <second> <outputfile>
#
# The status of main is the exit status of the script.

#######################################
# Concatenate two files into an output file.
# Arguments: $1 - first input, $2 - second input, $3 - output file
# Returns:   the exit status of cat
#######################################
concatenate() {
  local first=$1 second=$2 outputfile=$3
  # Quoted so paths with spaces survive; "--" so a leading "-" is not an option.
  cat -- "$first" "$second" > "$outputfile"
}

#######################################
# Entry point.
# Returns: 0 on success, 2 on wrong argument count, otherwise cat status
#######################################
main() {
  if (( $# != 3 )); then
    printf 'Usage: %s <first> <second> <outputfile>\n' "${0##*/}" >&2
    return 2
  fi
  concatenate "$1" "$2" "$3"
}

main "$@"
<!--
  concatenate.xml: Galaxy tool wrapper for concatenate.sh.
  Writes the first selected dataset followed by the second into one output.
-->
<tool id="concatenate" name="File Concatenation" version="1">

  <description> concatenate 2 files </description>

  <!-- positional arguments of concatenate.sh: first file, second file, output file -->
  <command interpreter="bash">
    concatenate.sh
    $infile
    $infile2
    $outputfile
  </command>

  <outputs>
    <data format="tabular" name="outputfile" label="${sample} - File Concatenation on ${on_string}"/>
  </outputs>

  <inputs>

    <param name="infile" type="data" label="Select first file for concatenation" />
    <param name="infile2" type="data" label="Select second file for concatenation" />

    <!-- only used in the output dataset label -->
    <param name="sample" type="text" value="" label="Output File Name" help="Optional."/>

  </inputs>

  <help>

**What it does**
Concatenates 2 files

  </help>
</tool>
#!/bin/bash
#
# filter_columns.sh - split a tab-delimited file into removed and retained
# lines according to one or more column filters.
#
# Usage: filter_columns.sh --infile FILE --filter COL,OP,VALUE
#                          [--filter COL,OP,VALUE ...]
#                          --outfile_rem FILE --outfile_ret FILE
#
#   COL    1-based column number
#   OP     equals | nequals | contains | ncontains | empty | nonempty |
#          lt | le | gt | ge
#   VALUE  comparison value (everything after the second comma; may be
#          empty and may itself contain commas or spaces)
#
# A line is removed when it matches ANY filter. Header lines (the first
# line, lines starting with "#", "<" or ">", and empty lines) are never
# filtered and are copied once to both output files. Both outputs keep the
# order of the input, and duplicate data rows are preserved.
#
# Functions report failure through their return status; the status of main
# is the exit status of the script.

#######################################
# Print the command-line synopsis to stderr.
#######################################
usage() {
  printf 'Usage: %s --infile FILE --filter COL,OP,VALUE [--filter ...] %s\n' \
    "${0##*/}" '--outfile_rem FILE --outfile_ret FILE' >&2
}

#######################################
# Run all filters over the input in a single pass.
# Arguments: $1 - input file
#            $2 - output file for removed lines
#            $3 - output file for retained lines
#            $4... - filter specs "COL,OP,VALUE"
# Returns:   the exit status of awk, or 1 if an output cannot be created
#######################################
apply_filters() {
  local infile=$1 removed=$2 retained=$3
  shift 3

  : > "$removed" || return 1
  : > "$retained" || return 1

  # Output paths travel through the environment and filter specs through
  # ARGV, so none of them is parsed as awk source or escape-processed.
  FILTER_REMOVED=$removed FILTER_RETAINED=$retained awk '
    # Return 1 when the field satisfies the operator, else 0.
    # Unknown operators never match, so such lines are retained.
    function matches(field, op, value) {
      if (op == "equals")    return ((field "") == (value ""))
      if (op == "nequals")   return ((field "") != (value ""))
      if (op == "contains")  return (index(field, value) != 0)
      if (op == "ncontains") return (index(field, value) == 0)
      if (op == "empty")     return (field == "")
      if (op == "nonempty")  return (field != "")
      if (op == "lt")        return (field + 0 <  value + 0)
      if (op == "le")        return (field + 0 <= value + 0)
      if (op == "gt")        return (field + 0 >  value + 0)
      if (op == "ge")        return (field + 0 >= value + 0)
      return 0
    }
    BEGIN {
      FS = "\t"
      OFS = "\t"
      removed = ENVIRON["FILTER_REMOVED"]
      retained = ENVIRON["FILTER_RETAINED"]

      # Every argument except the last one is a filter spec; take it out
      # of ARGV so awk does not try to read it as an input file.
      nfilters = ARGC - 2
      for (i = 1; i <= nfilters; i++) {
        spec = ARGV[i]
        delete ARGV[i]
        first = index(spec, ",")
        col[i] = substr(spec, 1, first - 1) + 0
        rest = substr(spec, first + 1)
        second = index(rest, ",")
        if (second == 0) {
          op[i] = rest
          val[i] = ""
        } else {
          op[i] = substr(rest, 1, second - 1)
          val[i] = substr(rest, second + 1)
        }
      }
    }
    # Header lines go to both outputs and are not filtered.
    FNR == 1 || index($0, "#") == 1 || index($0, "<") == 1 ||
    index($0, ">") == 1 || NF == 0 {
      print $0 > removed
      print $0 > retained
      next
    }
    {
      for (i = 1; i <= nfilters; i++) {
        if (matches($(col[i]), op[i], val[i])) {
          print $0 > removed
          next
        }
      }
      print $0 > retained
    }
  ' "$@" "$infile"
}

#######################################
# Entry point: parse options, validate filters, run them.
# Returns: 0 on success, 2 on bad arguments, otherwise apply_filters status
#######################################
main() {
  local infile="" outfile_rem="" outfile_ret=""
  local -a filters=()
  local spec_re='^[0-9]+,[^,]+(,.*)?$'
  local spec count

  while (( $# > 0 )); do
    case "$1" in
      --infile | --outfile_rem | --outfile_ret | --filter)
        if (( $# < 2 )); then
          usage
          return 2
        fi
        case "$1" in
          --infile) infile=$2 ;;
          --outfile_rem) outfile_rem=$2 ;;
          --outfile_ret) outfile_ret=$2 ;;
          --filter) filters+=("$2") ;;
        esac
        shift 2
        ;;
      -h)
        usage
        return 0
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        return 2
        ;;
    esac
  done

  if [[ -z "$infile" || -z "$outfile_rem" || -z "$outfile_ret" ]] \
    || (( ${#filters[@]} == 0 )); then
    usage
    return 2
  fi

  count=1
  for spec in "${filters[@]}"; do
    if ! [[ "$spec" =~ $spec_re ]]; then
      printf 'filter_columns.sh: malformed filter "%s"\n' "$spec" >&2
      return 2
    fi
    echo "filter $count: $spec"
    count=$(( count + 1 ))
  done

  apply_filters "$infile" "$outfile_rem" "$outfile_ret" "${filters[@]}"
}

main "$@"
<!--
  filter_columns.xml: Galaxy tool wrapper for filter_columns.sh.
  Splits a dataset into removed and retained lines based on column filters.
-->
<tool id="filter_columns" name="Filter Columns" version="1.2">

  <description> filter file based on column values </description>

  <!--
    One "filter" option is emitted per entry of the Filter repeat, as
    column,condition,value. The argument is quoted so that a value
    containing spaces stays a single argument.
  -->
  <command interpreter="bash">
    filter_columns.sh
    --infile $infile
    #for $f in $filters
      --filter "${f.column},${f.condition},${f.value}"
    #end for
    --outfile_rem $outputfile_removed
    --outfile_ret $outputfile_retained
  </command>

  <outputs>
    <data format="input" name="outputfile_removed" label="${sample} - Removed lines Column filter on ${on_string}" metadata_source="infile"/>
    <data format="input" name="outputfile_retained" label="${sample} - Retained lines Column filter on ${on_string}" metadata_source="infile"/>
  </outputs>

  <inputs>

    <param name="infile" type="data" label="Select input file" help="expects header lines to be indicated by # or > symbols. First line is always considered header line"/>

    <repeat name="filters" title="Filter" min="1" help="Multiple filters: lines removed if match any of the filters, not all (rules are OR'd, not AND'd)">
      <param name="column" type="data_column" data_ref="infile" multiple="False" label="Select Column to filter on" />
      <param name="condition" type="select" label="Remove line if column value.." help="you will receive both file with retained lines and file with removed lines">
        <option value="nonempty"> non-empty </option>
        <option value="empty" > empty </option>
        <option value="equals" > equals (string or number) </option>
        <option value="nequals" > not equals (string or number)</option>
        <option value="contains"> contains (substring) </option>
        <option value="ncontains"> does not contain (substring) </option>
        <option value="gt" > greater than </option>
        <option value="ge" > greater than or equals </option>
        <option value="lt" > less than </option>
        <option value="le" > less than or equals </option>
      </param>


      <param name="value" type="text" value="" label="enter number or string unless above option was set to empty or nonempty"/>
    </repeat>

    <!-- only used in the output dataset labels -->
    <param name="sample" type="text" value="" label="Output File Name" help="Optional."/>

  </inputs>

  <help>

**What it does**
Splits a tab-delimited file into two files based on the values in selected columns. Lines for which the column value matches a filter are written to the "Removed lines" file, all other lines are written to the "Retained lines" file.

When several filters are given, a line is removed if it matches any one of them.

The first line of the file, lines starting with #, &lt; or &gt;, and empty lines are treated as header lines and are written to both output files.

  </help>
</tool>
import os, sys
import fnmatch
import csv


def get_headers(inputfile):
    """Build the option list for a column select box.

    Reads the file behind ``inputfile`` and uses its first line that is
    neither empty nor starting with ``#`` as the header line. That line is
    split on tabs and every column becomes one option.

    :param inputfile: object with a ``get_file_name()`` method returning
        the path of a tab-delimited text file.
    :returns: a list of ``[label, label, False]`` entries, one per column,
        where ``label`` is ``"<1-based column number>: <column name>"``.
        The list is empty when the file cannot be read or contains no
        header line.
    """
    columnList = []
    filename = inputfile.get_file_name()
    try:
        # "with" closes the file on every path, including early exit.
        with open(filename) as f:
            for line in f:
                # Skip comment lines and blank lines. Iterating the file
                # simply ends at EOF, so an empty or all-comment file
                # yields no options.
                if line.startswith('#') or not line.strip():
                    continue
                # Strip only trailing whitespace: a leading tab is an empty
                # first column and must keep its number.
                line = line.rstrip()
                for i, col in enumerate(line.split("\t"), 1):
                    label = str(i) + ': ' + str(col)
                    columnList.append([label, label, False])
                break
    except IOError:
        # Best effort: an unreadable file just gives an empty list.
        pass

    return columnList
#!/bin/bash
#
# sort_chromosomal_position.sh - sort a tab-delimited file by chromosome
# (version sort) and then numerically by start position, leaving the
# leading header lines in place.
#
# Usage: sort_chromosomal_position.sh <infile> <chrcol> <startcol> <endcol>
#                                     <num_headerlines> <outfile>
#
#   chrcol, startcol   1-based column numbers used as sort keys
#   endcol             accepted for interface compatibility; not used
#   num_headerlines    number of leading lines copied unsorted (may be 0)
#
# Functions report failure through their return status; the status of main
# is the exit status of the script.

#######################################
# Print the header lines followed by the sorted body.
# Arguments: $1 - input file
#            $2 - chromosome column
#            $3 - start column
#            $4 - number of header lines
# Outputs:   header plus sorted rows on stdout
# Returns:   non-zero if head or sort fails
#######################################
sort_by_position() {
  local infile=$1 chrcol=$2 startcol=$3 num_headerlines=$4
  # 10# forces base 10 so a count such as "08" is not read as octal.
  local first_data_line=$(( 10#$num_headerlines + 1 ))

  head -n "$(( 10#$num_headerlines ))" -- "$infile" || return 1

  # -t sets the field separator to a tab so that cells containing spaces
  # or empty cells do not shift the key columns.
  tail -n "+${first_data_line}" -- "$infile" \
    | sort -t $'\t' -k "${chrcol},${chrcol}V" -k "${startcol},${startcol}n"
}

#######################################
# Entry point.
# Returns: 0 on success, 2 on bad arguments, 1 on processing failure
#######################################
main() {
  if (( $# != 6 )); then
    printf 'Usage: %s <infile> <chrcol> <startcol> <endcol> %s\n' \
      "${0##*/}" '<num_headerlines> <outfile>' >&2
    return 2
  fi

  local infile=$1 chrcol=$2 startcol=$3 num_headerlines=$5 outfile=$6
  local value tmp status

  for value in "$chrcol" "$startcol" "$num_headerlines"; do
    if ! [[ "$value" =~ ^[0-9]+$ ]]; then
      printf 'sort_chromosomal_position.sh: not a number: "%s"\n' \
        "$value" >&2
      return 2
    fi
  done

  # Sort into a private scratch file first so the input is fully read
  # before the output is written, then clean the scratch file up.
  tmp=$(mktemp) || return 1
  if sort_by_position "$infile" "$chrcol" "$startcol" "$num_headerlines" \
    > "$tmp"; then
    cat -- "$tmp" > "$outfile"
    status=$?
  else
    status=1
  fi
  rm -f -- "$tmp"
  return "$status"
}

main "$@"
<!--
  sort_chromosomal_position.xml: Galaxy tool wrapper for
  sort_chromosomal_position.sh. Sorts a tab-separated dataset by chromosome
  and start position while keeping its header lines on top.
-->
<tool id="sort_chrom_pos" name="Sort Chromosomal Position" version="1" >
  <description> sort file by chromosome, then by position </description>
  <!-- positional arguments: input, chromosome column, start column, end column, header line count, output -->
  <command interpreter="bash">
    sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $sorted_file
  </command>


  <inputs>

    <param name="infile" type="data" label="select file to be sorted" help="must be a tab-separated file with a 1 line header"/>
    <param name="chrcol" type="data_column" data_ref="infile" multiple="False" label="Chromosome Column" />
    <param name="startcol" type="data_column" data_ref="infile" multiple="False" label="Start Column" />
    <param name="endcol" type="data_column" data_ref="infile" multiple="False" label="End Column" />
    <!-- an integer parameter rejects non-numeric input before the job is run -->
    <param name="num_headerlines" type="integer" value="1" min="0" label="number of headerlines in your file" help="these lines will not be sorted"/>

    <!-- only used in the output dataset label -->
    <param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>
  </inputs>

  <outputs>
    <data format="tabular" name="sorted_file" label="$fname ${tool.name} on ${on_string}"/>

  </outputs>

  <help>

**What it does**
Sorts a tab-separated file by the chromosome column and then numerically by the start column. Chromosome names are sorted in natural (version) order, so chr2 comes before chr10.

The given number of header lines at the top of the file is copied to the output unchanged and is not sorted. The end column is passed to the script but is currently not used as a sort key.

  </help>

</tool>
#!/bin/bash
#
# strip_header.sh - split a file into its comment header and its body.
#
# Usage: strip_header.sh <inputfile> <outputfile> <header> <commentchar>
#
#   inputfile    file to read
#   outputfile   receives every line that is neither empty nor starts with
#                commentchar
#   header       receives every line that starts with commentchar
#   commentchar  literal prefix marking a header line; when empty, no line
#                counts as a header line
#
# Functions report failure through their return status; the status of main
# is the exit status of the script.

#######################################
# Split the input into header lines and body lines in one pass.
# Arguments: $1 - input file, $2 - body output, $3 - header output,
#            $4 - comment prefix (literal text, not a pattern)
# Returns:   the exit status of awk, or 1 if an output cannot be created
#######################################
split_header() {
  local inputfile=$1 outputfile=$2 header=$3 commentchar=$4

  : > "$outputfile" || return 1
  : > "$header" || return 1

  # The prefix is compared with index(), i.e. as plain text, so characters
  # such as ".", "*" or "/" have no special meaning. Values travel through
  # the environment so they are never parsed as awk source.
  STRIP_PREFIX=$commentchar STRIP_BODY=$outputfile STRIP_HEADER=$header awk '
    BEGIN {
      prefix = ENVIRON["STRIP_PREFIX"]
      body = ENVIRON["STRIP_BODY"]
      header = ENVIRON["STRIP_HEADER"]
    }
    prefix != "" && index($0, prefix) == 1 {
      print $0 > header
      next
    }
    $0 != "" {
      print $0 > body
    }
  ' "$inputfile"
}

#######################################
# Entry point.
# Returns: 0 on success, 2 on wrong argument count, otherwise awk status
#######################################
main() {
  if (( $# < 3 || $# > 4 )); then
    printf 'Usage: %s <inputfile> <outputfile> <header> <commentchar>\n' \
      "${0##*/}" >&2
    return 2
  fi

  local inputfile=$1 outputfile=$2 header=$3 commentchar=${4-}

  echo "commentchar: -${commentchar}-"

  split_header "$inputfile" "$outputfile" "$header" "$commentchar"
}

main "$@"
<!--
  strip_header.xml: Galaxy tool wrapper for strip_header.sh.
  Separates the comment header of a dataset from its body.
-->
<tool id="header_remove" name="Strip Header" version="1">

  <description> remove header from a file </description>

  <!-- positional arguments of strip_header.sh: input, body output, header output, comment character -->
  <command interpreter="bash">
    strip_header.sh
    $infile
    $outputfile
    $header
    "$commentchar"
  </command>

  <outputs>
    <data format="tabular" name="outputfile" label="${sample} - Header Stripped on ${on_string}"/>
    <data format="tabular" name="header" label="${sample} - Stripped Header on ${on_string}"/>
  </outputs>

  <inputs>

    <param name="infile" type="data" label="Select file to strip the header from" />
    <param name="commentchar" type="text" value="#" label="Comment character" help="Any lines starting with this character will be removed (as well as any empty lines)">
      <!--
        The value is placed inside a double-quoted shell argument, so the
        characters that stay special inside double quotes are not accepted.
        Everything else printable, including the default "#", passes through.
      -->
      <sanitizer>
        <valid initial="string.printable">
          <remove value="&quot;"/>
          <remove value="$"/>
          <remove value="`"/>
          <remove value="\"/>
        </valid>
      </sanitizer>
    </param>
    <!-- only used in the output dataset labels -->
    <param name="sample" type="text" value="" label="Sample Name" help="Optional. For file naming purposes only"/>

  </inputs>

  <help>


**What it does**
Removes header from a file. Any lines starting with the comment character specified by user, as well as any empty lines will be removed.

  </help>
</tool>