# HG changeset patch # User saskia-hiltemann # Date 1445519910 14400 # Node ID e77c9484b2d06563f90b307a3ff2de15f1a3fe4c Uploaded diff -r 000000000000 -r e77c9484b2d0 chrprefix.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chrprefix.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,52 @@ +#!/bin/bash +# usage: chrprefix.sh INPUTFILE COLUMN ADDREMOVE OUTPUTFILE - adds ("add") or removes (any other value) the "chr" prefix in one column of a tab-separated file; lines starting with "#" are copied unchanged +inputfile=$1 +col=$2 +addremove=$3 +outputfile=$4 + +echo "args: $@" +echo "inputfile: $inputfile" +echo "column: $col" +echo "addremove: $addremove" +echo "outputfile: $outputfile" + +#get column number (the leading digits of the column argument) +column=`expr match "$col" '\([0-9]*\)'` +echo "colnumber: $column" + +if [ "$addremove" == "add" ] +then + echo "adding prefix to column $column" + awk 'BEGIN{ + FS="\t" + OFS="\t" + c="'"$column"'" + }{ + if (index($0,"#")!=1){ + if (index($c,"chr")!=1) $c="chr"$c #do not prefix a value twice + } + print $0 + + }END{}' "$inputfile" > "$outputfile" + +else #remove prefix + echo "removing prefix from column $column" + awk 'BEGIN{ + FS="\t" + OFS="\t" + c="'"$column"'" + }{ + if (FNR>1 && index($0,"#")!=1){ #the first line is never modified + if (index($c,"chr")==1) $c=substr($c,4) #only strip a prefix that is really there + } + print $0 + + }END{}' "$inputfile" > "$outputfile" +fi + +echo "inputfile: " +head -5 "$inputfile" + +echo "outputfile: " +head -5 "$outputfile" diff -r 000000000000 -r e77c9484b2d0 chrprefix.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/chrprefix.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,47 @@ + + + add or remove chr prefix from a column + + + + + chrprefix.sh + $infile + "${go.column}" + $addremove + $outputfile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** +removes or adds the "chr" prefix in a column of a file. Some tools expect you to indicate chromosomes as "chr1,chr2,chrX", while others expect only 1,2,X as input. 
This tool allows you to easily switch notations + + + diff -r 000000000000 -r e77c9484b2d0 column_extract.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/column_extract.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,65 @@ +#!/bin/bash + +inputfile=$1 +outputfile=$2 +removeheader=$3 +columns="$@" + +cols="${columns// /,}" #make comma-separated + +#skip first three arguments (inputfile, outputfile, removeheader) +firstcomma=`expr index "$cols" ,` +cols="${cols:$firstcomma}" +secondcomma=`expr index "$cols" ,` +cols="${cols:$secondcomma}" +thirdcomma=`expr index "$cols" ,` +cols="${cols:$thirdcomma}" +cols="${cols//:/}" #remove colons +echo "colums to print: $cols" + +arr=$(echo $cols | tr "," "\n") #one entry per line, only used for the debug listing below + +for x in $arr +do + echo $x +done + +myArray=($columns) #word-split all arguments into an array +i=3 #index 3 is the first argument after inputfile, outputfile, removeheader +len=${#myArray[@]} +mycols="" +echo "len: $len" +while [ $i -le $len ] +do + echo "myarray: ${myArray[$i]}" + mycols+=${myArray[$i]} + i=$[$i+2] #take every second word only; presumably each column arrives as two words, "N:" and a name - TODO confirm against the caller +done +mycols="${mycols//:/,}" #turn the colons into commas +mycols="${mycols%?}" #drop the last character (expected to be a trailing comma) +echo "mycols: $mycols" + +awk 'BEGIN{ + FS="\t"; + OFS="\t"; + columns="'"$mycols"'"; + len=split(columns,arr,",") + }{ + if (index($1,"#")==1 || $1==""){ #lines starting with "#" and lines with an empty first field are treated as header: print as-is + if("'"$removeheader"'"=="N"){ #header is kept only when removeheader is "N" + print $0 + } + } + else{ #NOTE(review): the column-printing loop below is cut off in this copy of the patch - restore it from the original changeset before applying + for (i=1;i $outputfile + + diff -r 000000000000 -r e77c9484b2d0 column_extract.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/column_extract.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,50 @@ + + + extract/rearrange columns from a tab-delimited file + + + + column_extract.sh + $infile + $outputfile + $removeheader + #for $c in $go.columns + ${c.column} + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** +Outputs columns of input file in order specified by user. Columns not selected will not be output. 
+ + + diff -r 000000000000 -r e77c9484b2d0 concatenate.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/concatenate.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,3 @@ +#!/bin/bash +# usage: concatenate.sh FILE1 FILE2 OUTPUTFILE - writes FILE1 followed by FILE2 to OUTPUTFILE +cat "$1" "$2" > "$3" diff -r 000000000000 -r e77c9484b2d0 concatenate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/concatenate.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,31 @@ + + + concatenate 2 files + + + concatenate.sh + $infile + $infile2 + $outputfile + + + + + + + + + + + + + + + + + +**What it does** +Concatenates 2 files + + + diff -r 000000000000 -r e77c9484b2d0 filter_columns.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_columns.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,116 @@ +#!/bin/bash +# filter_columns.sh: applies each --filter COLUMN,CONDITION,VALUE in turn to the tab-separated --infile; lines matching a filter are written to --outfile_rem, all other lines to --outfile_ret +echo "$@" +count=1 +usage() { echo "usage: $0 --infile FILE --filter COLUMN,CONDITION,VALUE [--filter ...] --outfile_rem FILE --outfile_ret FILE" >&2; exit 1; } +set -- `getopt -n$0 -u -a --longoptions="infile: outfile_rem: outfile_ret: filter: " "h:" "$@"` || usage +[ $# -eq 0 ] && usage + + +while [ $# -gt 0 ] +do + case "$1" in + --infile) infile=$2;shift;; + --outfile_rem) outfile_rem=$2;shift;; + --outfile_ret) outfile_ret=$2;shift;; + --filter) filter[$count]=$2 + count=$[$count+1] + shift;; + -h) shift;; + --) shift;break;; + -*) usage;; + *) break;; + esac + shift +done + + + +rm -f outfile_removed outfile_retained #awk below appends to these files, so start clean +# remove but remember header (debug preview only: inputfile is copied again from the original below) +cp "$infile" inputfile +sed -i -e "1d" inputfile + +head inputfile + +count=1 +cp "$infile" inputfile + +for f in "${filter[@]}" +do + echo "filter $count: $f" + count=$[$count+1] + + IFS=',' read -a filt <<< "$f" + + #filt=split(${f//,/ }) + column=${filt[0]} + op=${filt[1]} + value=${filt[2]} + + echo "column: $column" + echo "op : $op" + echo "value: $value" + + #perform filtering + + awk 'BEGIN{ + FS="\t" + OFS="\t" + op="'"$op"'" + pass="'"$count"'"-1 #number of the current filter (count was already advanced above) + + }{ + # keep header in both output files; outfile_removed is appended to by every filter, so the header is written there on the first pass only + if(FNR==1 || index($0,"#")==1 || index($0,"<")==1 || index($0,">")==1 || NF==0){ + if (pass==1) print $0 >> "outfile_removed" + print $0 >> "outfile_retained" + next #header lines are not data: do not run them through the filter + } + if ( "'"$op"'"== "equals" && $"'"$column"'" == "'"$value"'"){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "nequals" 
&& $"'"$column"'" != "'"$value"'"){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "contains" && index($"'"$column"'", "'"$value"'") != 0){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "ncontains" && index($"'"$column"'", "'"$value"'") == 0){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "empty" && $"'"$column"'" == ""){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "nonempty" && $"'"$column"'" != ""){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "lt" && $"'"$column"'"+0 < "'"$value"'"+0){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "le" && $"'"$column"'"+0 <= "'"$value"'"+0){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "gt" && $"'"$column"'"+0 > "'"$value"'"+0){ + print $0 >> "outfile_removed" + } + else if ( "'"$op"'"== "ge" && $"'"$column"'"+0 >= "'"$value"'"+0){ + print $0 >> "outfile_removed" + } + else + print $0 >> "outfile_retained" + + }END{}' inputfile + + #next iteration only run on retained lines + cp outfile_retained inputfile + rm outfile_retained +done + +#remove adjacent duplicate lines in outputfiles +uniq outfile_removed > "$outfile_rem" +uniq inputfile > "$outfile_ret" + +#awk ' !x[$0]++' outfile_removed > $outfile_rem +#awk ' !x[$0]++' outfile_retained > $outfile_ret + diff -r 000000000000 -r e77c9484b2d0 filter_columns.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_columns.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,53 @@ + + + filter file based on column values + + + filter_columns.sh + --infile $infile + #for $f in $filters + --filter ${f.column},${f.condition},${f.value} + #end for + --outfile_rem $outputfile_removed + --outfile_ret $outputfile_retained + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** +Filters the lines of a file based on column values. Lines that match one of the filters are written to the "removed" output, all other lines to the "retained" output. + + + diff -r 000000000000 -r e77c9484b2d0 getcolumnnames.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getcolumnnames.py Thu Oct 22 09:18:30 2015 -0400 @@ 
-0,0 +1,30 @@ +import os, sys +import fnmatch +import csv +# get_headers: returns one [label, label, False] entry per tab-separated column of the first line that is neither empty nor a "#" comment; labels look like "1: name". Returns an empty list when the file has no such line or cannot be opened. +def get_headers(inputfile): + columnList=[] + #line=inputfile.readlines()[0] + filename=inputfile.get_file_name() + try: + f = open(filename) + line=f.readline() + while line and (line[0]=='#' or not line.strip()): #skip comment lines (starting with a hash sign) and empty lines to get to the header line; readline() returns "" at end of file, which ends the loop + line=f.readline() + line = line.strip() + i=1; + for col in (line.split("\t") if line else []): + label=str(i)+': '+str(col) + columnList.append([label,label,False]) + i+=1 + + except IOError as e: + pass + + return columnList + + + + + + diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sort_chromosomal_position.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,26 @@ +#!/bin/bash + +#sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $outfile + + +cp "$1" inputfile.tsv +chrcol=$2 +startcol=$3 +endcol=$4 +num_headerlines=$5 +outfile=$6 + +#remember header +head -n "$num_headerlines" inputfile.tsv > header.tsv + +#remove header (skipped when there are no header lines: sed treats the range "1,0" as line 1 and would drop the first data row) +if [ "$num_headerlines" -gt 0 ]; then sed -i "1,${num_headerlines}d" inputfile.tsv; fi + +#sort file (version sort on the chromosome column, then numeric sort on the start column) +sort -k "${chrcol},${chrcol}V" -k "${startcol},${startcol}n" inputfile.tsv > tmpout.txt + +cat header.tsv tmpout.txt > "$outfile" + + + + diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sort_chromosomal_position.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,31 @@ + + sort file by chromosome, then by position + + sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $sorted_file + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r e77c9484b2d0 strip_header.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/strip_header.sh Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,11 @@ +#!/bin/bash +# usage: strip_header.sh INPUTFILE OUTPUTFILE HEADERFILE COMMENTCHAR - writes lines starting with COMMENTCHAR to HEADERFILE and all other non-empty lines to OUTPUTFILE +inputfile=$1 +outputfile=$2 +header=$3 +commentchar=$4 + +echo "commentchar: -${commentchar}-" + +sed -e "/^${commentchar}/d" -e '/^$/d' "$inputfile" > "$outputfile" +sed -n "/^${commentchar}/p" 
"$inputfile" > "$header" diff -r 000000000000 -r e77c9484b2d0 strip_header.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/strip_header.xml Thu Oct 22 09:18:30 2015 -0400 @@ -0,0 +1,35 @@ + + + remove header from a file + + + strip_header.sh + $infile + $outputfile + $header + "$commentchar" + + + + + + + + + + + + + + + + + + + + +**What it does** +Removes header from a file. Any lines starting with the comment character specified by user, as well as any empty lines will be removed. + + +