Mercurial > repos > saskia-hiltemann > file_manipulation

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# chrprefix.sh - add or remove the "chr" prefix in one column of a
+# tab-delimited file.
+#
+# Usage: chrprefix.sh <inputfile> <column> <add|remove> <outputfile>
+#   inputfile   tab-delimited file whose first line is a header
+#   column      selector starting with the 1-based column number,
+#               e.g. "3" or "3: chromosome"
+#   addremove   "add" prepends "chr", "remove" strips it
+#   outputfile  file the converted table is written to
+#
+# The first line and every line starting with "#" are copied unchanged.
+# A value is only modified when needed: "add" skips values that already
+# start with "chr", and "remove" only strips an actual "chr" prefix.
+
+inputfile=$1
+col=$2
+addremove=$3
+outputfile=$4
+
+echo "args: $*"
+echo "inputfile: $inputfile"
+echo "column: $col"
+echo "addremove: $addremove"
+echo "outputfile: $outputfile"
+
+# The selector may carry a label after the number; keep the number only.
+if [[ $col =~ ^0*([1-9][0-9]*) ]]; then
+  column=${BASH_REMATCH[1]}
+else
+  echo "error: column '$col' does not start with a column number" >&2
+  exit 1
+fi
+echo "colnumber: $column"
+
+case "$addremove" in
+  add)
+    echo "adding prefix to column $column"
+    ;;
+  remove)
+    echo "removing prefix from column $column"
+    ;;
+  *)
+    echo "error: expected 'add' or 'remove', got '$addremove'" >&2
+    exit 1
+    ;;
+esac
+
+# Values are handed to awk with -v rather than spliced into the program
+# text, so they cannot alter the awk code.
+awk -v c="$column" -v mode="$addremove" '
+  BEGIN { FS = OFS = "\t" }
+  # Only touch data lines that actually have the requested column.
+  FNR > 1 && index($0, "#") != 1 && c <= NF {
+    prefixed = (index($c, "chr") == 1)
+    if (mode == "add" && !prefixed) {
+      $c = "chr" $c
+    } else if (mode == "remove" && prefixed) {
+      $c = substr($c, 4)
+    }
+  }
+  { print }
+' "$inputfile" > "$outputfile" || exit 1
+
+echo "inputfile: "
+head -n 5 -- "$inputfile"
+
+echo "outputfile: "
+head -n 5 -- "$outputfile"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,47 @@
+ <tool id="chr_prefix" name="Add/Remove chr prefix" version="1">
+
+  <description> add or remove chr prefix from a column </description>
+
+   <code file="getcolumnnames.py"/>
+
+  <command interpreter="bash">
+  	chrprefix.sh
+  		$infile
+  		"${go.column}"
+  		$addremove
+		$outputfile
+  </command>
+
+  <outputs>
+  	<data format="tabular" name="outputfile" label="${sample} - chr-prefix on ${on_string}"/>
+  </outputs>
+
+  <inputs>
+
+  	<param name="infile" type="data" label="Select inputfile" help="must be a tab-delimited file with a 1-line header" />
+  	<conditional name="go">
+		<param name="ready" type="select"  label="Ready to select column?"  help="Set to NO and back to YES after changing input file">
+			<option value="no" selected="True">no</option>
+			<option value="yes">yes</option>
+		</param>
+
+		<when value="yes">
+  			<param name="column"   type="select" dynamic_options="get_headers(infile)"  multiple="False" label="Select Column"   />
+  		</when>
+  	</conditional>
+
+  	<param name="addremove" type="select" label="Add or remove the chr prefix?">
+  		<option value="add">Add</option>
+  		<option value="remove">Remove</option>
+  	</param>
+  	<param name="sample"	type="text" value="" label="Output File Name" help="Optional."/>
+
+  </inputs>
+
+  <help>
+
+**What it does**
+Removes or adds the "chr" prefix in a column of a file. Some tools expect chromosomes to be written as "chr1, chr2, chrX", while others expect only "1, 2, X". This tool lets you easily switch between the two notations.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# column_extract.sh - write selected columns of a tab-delimited file, in
+# the order given, to a new file.
+#
+# Usage: column_extract.sh <inputfile> <outputfile> <removeheader> <selector...>
+#   removeheader  "N" copies lines starting with "#" and empty lines
+#                 unchanged; any other value drops them
+#   selector      "<number>: <name>" column labels; <number> is 1-based
+#
+# Selectors may arrive quoted or word-split. Every word of the form
+# "<number>:" selects a column; all other words (the column names) are
+# ignored, so names containing spaces do not shift the selection.
+
+inputfile=$1
+outputfile=$2
+removeheader=$3
+
+# Re-split the selector arguments on whitespace. read -a is used instead
+# of an unquoted expansion so that words are never glob-expanded.
+read -r -a words <<< "${*:4}"
+
+selected=()
+for word in "${words[@]}"; do
+  if [[ $word =~ ^0*([1-9][0-9]*):$ ]]; then
+    selected+=("${BASH_REMATCH[1]}")
+  fi
+done
+
+if (( ${#selected[@]} == 0 )); then
+  echo "error: no column selector of the form '<number>:' was given" >&2
+  exit 1
+fi
+
+mycols=$(IFS=,; echo "${selected[*]}")
+echo "mycols: $mycols"
+
+awk -v columns="$mycols" -v removeheader="$removeheader" '
+  BEGIN {
+    FS = OFS = "\t"
+    len = split(columns, arr, ",")
+  }
+  # Comment and empty lines are passed through whole or dropped.
+  index($0, "#") == 1 || $0 == "" {
+    if (removeheader == "N") {
+      print
+    }
+    next
+  }
+  {
+    # Build the row with string concatenation so that "%" or "\" in the
+    # data is never interpreted as a printf format.
+    row = $(arr[1])
+    for (i = 2; i <= len; i++) {
+      row = row OFS $(arr[i])
+    }
+    print row
+  }
+' "$inputfile" > "$outputfile"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,50 @@
+<tool id="column_select" name="Column Select" version="1">
+
+  <description> extract/rearrange columns from a tab-delimited file </description>
+  <code file="getcolumnnames.py"/>
+
+  <command interpreter="bash">
+  	column_extract.sh
+		$infile
+		$outputfile
+		$removeheader
+		#for $c in $go.columns
+			${c.column}
+		#end for
+  </command>
+
+  <outputs>
+  	<data format="tabular" name="outputfile" label="${sample} Column extract on ${on_string}"/>
+  </outputs>
+
+  <inputs>
+
+   <param name="infile" type="data" label="Select file to extract columns from" help="Must be a tab-separated file with a 1-line header"/>
+   <param name="removeheader" type="boolean" truevalue="Y" falsevalue="N" selected="false" label="Remove header?" help="If selected, any lines starting with # and any empty lines will not be output."/>
+
+   <conditional name="go">
+		<param name="ready" type="select"  label="Ready to select columns?"  help="Set to NO and back to YES after changing input file">
+			<option value="no" selected="True">no</option>
+			<option value="yes">yes</option>
+		</param>
+
+		<when value="yes">
+		   <repeat name="columns" title="Column" min="1">
+		   		<!-- <param name="col"     type="data_column"   data_ref="infile" multiple="False" label="Column from file to output"  /> -->
+				<param name="column"   type="select" dynamic_options="get_headers(infile)"  multiple="False" label="Select Column"   />
+		   </repeat>
+	   	</when>
+   </conditional>
+
+  <param name="sample"	type="text" value="" label="Sample Name" help="Optional. For file naming purposes only"/>
+
+  </inputs>
+
+  <help>
+
+
+**What it does**
+Outputs columns of input file in order specified by user. Columns not selected will not be output.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,12 @@
+#!/bin/bash
+#
+# concatenate.sh - write the contents of two files, in order, to a third.
+#
+# Usage: concatenate.sh <first> <second> <output>
+
+if (( $# != 3 )); then
+  echo "usage: ${0##*/} <first> <second> <output>" >&2
+  exit 2
+fi
+
+cat -- "$1" "$2" > "$3"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+ <tool id="concatenate" name="File Concatenation" version="1">
+
+  <description> concatenate 2 files </description>
+
+  <command interpreter="bash">
+  	concatenate.sh
+  		$infile
+  		$infile2
+		$outputfile
+  </command>
+
+  <outputs>
+  	<data format="tabular" name="outputfile" label="${sample} - File Concatenation on ${on_string}"/>
+  </outputs>
+
+  <inputs>
+
+  	<param name="infile" type="data" label="Select first file for concatenation" />
+  	<param name="infile2" type="data" label="Select second file for concatenation" />
+
+  	<param name="sample"	type="text" value="" label="Output File Name" help="Optional."/>
+
+  </inputs>
+
+  <help>
+
+**What it does**
+Concatenates 2 files
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,164 @@
+#!/bin/bash
+#
+# filter_columns.sh - split a tab-delimited file into removed and
+# retained lines based on column filters.
+#
+# Usage: filter_columns.sh --infile FILE --filter COL,OP,VALUE [--filter ...]
+#                          --outfile_rem FILE --outfile_ret FILE
+#   COL    1-based column number
+#   OP     equals | nequals | contains | ncontains | empty | nonempty |
+#          lt | le | gt | ge
+#   VALUE  comparison value (may be empty and may itself contain commas)
+#
+# A data line is removed when it matches ANY filter. Header lines (the
+# first line, empty lines, and lines starting with "#", "<" or ">") are
+# copied once to both outputs and are never filtered. Both outputs keep
+# the line order of the input, and repeated data lines are preserved.
+
+usage() {
+  echo "usage: ${0##*/} --infile FILE --filter COL,OP,VALUE [--filter ...]" \
+    "--outfile_rem FILE --outfile_ret FILE" >&2
+  exit 2
+}
+
+echo "$@"
+(( $# > 0 )) || usage
+
+# getopt output is shell-quoted; eval is the documented way to load it
+# back into the positional parameters without re-splitting values.
+parsed=$(getopt -n "$0" -a -o "h:" \
+  --longoptions "infile:,outfile_rem:,outfile_ret:,filter:" -- "$@") || usage
+eval set -- "$parsed"
+
+infile=
+outfile_rem=
+outfile_ret=
+filter=()
+while (( $# > 0 )); do
+  case "$1" in
+    --infile)
+      infile=$2
+      shift 2
+      ;;
+    --outfile_rem)
+      outfile_rem=$2
+      shift 2
+      ;;
+    --outfile_ret)
+      outfile_ret=$2
+      shift 2
+      ;;
+    --filter)
+      filter+=("$2")
+      shift 2
+      ;;
+    -h)
+      # Accepted for compatibility; its argument is ignored.
+      shift 2
+      ;;
+    --)
+      shift
+      break
+      ;;
+    *)
+      usage
+      ;;
+  esac
+done
+
+[[ -n $infile && -n $outfile_rem && -n $outfile_ret ]] || usage
+
+count=1
+for f in "${filter[@]}"; do
+  echo "filter $count: $f"
+  count=$((count + 1))
+done
+
+# Make sure the removed-lines file exists even when the input is empty.
+: > "$outfile_rem" || exit 1
+
+# Filters are passed as leading awk operands and the removed-lines path
+# through the environment: neither undergoes escape processing, and no
+# user value is spliced into the awk program text.
+FILTER_REMOVED_OUT=$outfile_rem awk '
+  function fail(msg) {
+    print "error: " msg > "/dev/stderr"
+    bad = 1
+  }
+
+  # True when the current line matches filter number i.
+  function line_matches(i,    field) {
+    field = $(col[i])
+    if (op[i] == "equals")    return ((field "") == val[i])
+    if (op[i] == "nequals")   return ((field "") != val[i])
+    if (op[i] == "contains")  return (index(field, val[i]) != 0)
+    if (op[i] == "ncontains") return (index(field, val[i]) == 0)
+    if (op[i] == "empty")     return (field == "")
+    if (op[i] == "nonempty")  return (field != "")
+    if (op[i] == "lt")        return (field + 0 <  val[i] + 0)
+    if (op[i] == "le")        return (field + 0 <= val[i] + 0)
+    if (op[i] == "gt")        return (field + 0 >  val[i] + 0)
+    return (field + 0 >= val[i] + 0)
+  }
+
+  BEGIN {
+    FS = OFS = "\t"
+    removed = ENVIRON["FILTER_REMOVED_OUT"]
+    ops = "equals nequals contains ncontains empty nonempty lt le gt ge"
+    nops = split(ops, names, " ")
+    for (k = 1; k <= nops; k++) {
+      known[names[k]] = 1
+    }
+
+    # Every operand except the last one (the input file) is a filter.
+    nfilters = ARGC - 2
+    for (i = 1; i <= nfilters; i++) {
+      spec = ARGV[i]
+      delete ARGV[i]
+
+      # Split on the first two commas only; the rest is the value.
+      p = index(spec, ",")
+      if (p == 0) {
+        fail("filter \"" spec "\" is not of the form COL,OP,VALUE")
+        continue
+      }
+      col[i] = substr(spec, 1, p - 1) + 0
+      rest = substr(spec, p + 1)
+      p = index(rest, ",")
+      if (p == 0) {
+        op[i] = rest
+        val[i] = ""
+      } else {
+        op[i] = substr(rest, 1, p - 1)
+        val[i] = substr(rest, p + 1)
+      }
+
+      if (col[i] < 1) {
+        fail("filter \"" spec "\" has no valid column number")
+      }
+      if (!(op[i] in known)) {
+        fail("filter \"" spec "\" has an unknown operator")
+      }
+    }
+    if (bad) {
+      exit 1
+    }
+  }
+
+  FNR == 1 || index($0, "#") == 1 || index($0, "<") == 1 ||
+  index($0, ">") == 1 || NF == 0 {
+    print > removed
+    print
+    next
+  }
+
+  {
+    for (i = 1; i <= nfilters; i++) {
+      if (line_matches(i)) {
+        print > removed
+        next
+      }
+    }
+    print
+  }
+' "${filter[@]}" "$infile" > "$outfile_ret"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,53 @@
+ <tool id="filter_columns" name="Filter Columns" version="1.2">
+
+  <description> filter file based on column values </description>
+
+  <command interpreter="bash">
+  	filter_columns.sh
+  		--infile $infile
+  		#for $f in $filters <!--emit one filter option per configured filter-->
+			--filter ${f.column},${f.condition},${f.value}
+		#end for
+		--outfile_rem $outputfile_removed
+		--outfile_ret $outputfile_retained
+  </command>
+
+  <outputs>
+	  <data format="input" name="outputfile_removed"  label="${sample} - Removed lines Column filter on ${on_string}" metadata_source="infile"/>
+	  <data format="input" name="outputfile_retained" label="${sample} - Retained lines Column filter on ${on_string}" metadata_source="infile"/>
+  </outputs>
+
+  <inputs>
+
+	<param name="infile" type="data" label="Select input file" help="expects header lines to be indicated by # or > symbols. First line is always considered header line"/>
+
+  	<repeat name="filters" title="Filter" min="1" help="Multiple filters: lines removed if match any of the filters, not all (rules are OR'd, not AND'd)">
+		<param name="column"    type="data_column"   data_ref="infile" multiple="False" label="Select Column to filter on"  />
+		<param name="condition" type="select" label="Remove line if column value.." help="you will receive both file with retained lines and file with removed lines">
+			<option value="nonempty">  	non-empty					</option>
+			<option value="empty"	>	empty						</option>
+			<option value="equals"	>	equals (string or number)	</option>
+			<option value="nequals"	>	not equals (string or number)</option>
+			<option value="contains"> 	contains (substring) 		</option>
+			<option value="ncontains"> 	does not contain (substring) </option>
+			<option value="gt"		>	greater than				</option>
+			<option value="ge"		>	greater than or equals		</option>
+			<option value="lt"		>	less than					</option>
+			<option value="le"		>	less than or equals			</option>
+		</param>
+
+
+		<param name="value" type="text" value="" label="enter number or string unless above option was set to empty or nonempty"/>
+	</repeat>
+
+  	<param name="sample"	type="text" value="" label="Output File Name" help="Optional."/>
+
+  </inputs>
+
+  <help>
+
+**What it does**
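+Splits a tab-delimited file into removed and retained lines: a line is removed when any of the configured column filters matches it. Header lines are kept in both outputs.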
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getcolumnnames.py	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,37 @@
+import os, sys
+import fnmatch
+import csv
+
+def get_headers(inputfile):
+    """Build the select options for the columns of a tab-delimited dataset.
+
+    The header is the first line that is not blank and does not start
+    with '#'.
+
+    Args:
+        inputfile: object whose get_file_name() returns the dataset path.
+
+    Returns:
+        A list of [label, value, selected] entries, one per header column,
+        where label and value are both "<1-based index>: <column name>".
+        The list is empty when the file cannot be read or has no header.
+    """
+    columnList = []
+    filename = inputfile.get_file_name()
+    try:
+        with open(filename) as f:
+            for line in f:
+                if line.startswith('#') or not line.strip():
+                    continue
+                # Strip only the line ending: leading or trailing tabs
+                # delimit empty columns and must keep their position.
+                names = line.rstrip('\r\n').split('\t')
+                for i, col in enumerate(names, 1):
+                    label = str(i) + ': ' + str(col)
+                    columnList.append([label, label, False])
+                break
+    except IOError:
+        # Best effort: an unreadable dataset simply offers no columns.
+        pass
+
+    return columnList
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,37 @@
+#!/bin/bash
+#
+# sort_chromosomal_position.sh - sort a tab-delimited file by chromosome
+# (version sort, so chr2 comes before chr10), then numerically by start
+# position. The leading header lines are copied to the output unsorted.
+#
+# Usage: sort_chromosomal_position.sh <infile> <chrcol> <startcol> <endcol>
+#                                     <num_headerlines> <outfile>
+#   chrcol, startcol  1-based column numbers used as sort keys
+#   endcol            accepted for compatibility; not used as a sort key
+#   num_headerlines   number of leading lines to keep in place (0 or more)
+
+set -o pipefail
+
+infile=$1
+chrcol=$2
+startcol=$3
+num_headerlines=$5
+outfile=$6
+
+for number in "$chrcol" "$startcol" "$num_headerlines"; do
+  if ! [[ $number =~ ^[0-9]+$ ]]; then
+    echo "error: expected a whole number, got '$number'" >&2
+    exit 1
+  fi
+done
+# Force base 10 so that a value such as "08" is not read as octal.
+num_headerlines=$((10#$num_headerlines))
+
+# tail -n +K starts at line K, so zero header lines keeps every data line.
+# sort -t sets tab as the field separator; without it fields are split on
+# blanks and empty or space-containing columns would shift the sort keys.
+{
+  head -n "$num_headerlines" -- "$infile"
+  tail -n "+$((num_headerlines + 1))" -- "$infile" \
+    | sort -t $'\t' -k "${chrcol},${chrcol}V" -k "${startcol},${startcol}n"
+} > "$outfile"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+<tool id="sort_chrom_pos" name="Sort Chromosomal Position" version="1" >
+	<description> sort file by chromosome, then by position </description>
+	<command interpreter="bash">
+		sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $sorted_file
+	</command>
+
+
+	<inputs>
+
+		<param name="infile" 	  type="data" 			label="select file to be sorted" help="must be a tab-separated file with a 1 line header"/>
+		<param name="chrcol"   type="data_column"    data_ref="infile" multiple="False" label="Chromosome Column" />
+		<param name="startcol" type="data_column"    data_ref="infile" multiple="False" label="Start Column"  />
+		<param name="endcol"   type="data_column"    data_ref="infile" multiple="False" label="End Column"  />
+		<param name="num_headerlines" type="text" value="1" label="number of headerlines in your file" help="these lines will not be sorted"/>
+
+		<param name="fname" type="text" value="" label="Prefix for your output file" help="Optional"/>
+	</inputs>
+
+  <outputs>
+      <data format="tabular" name="sorted_file"  label="$fname ${tool.name} Annotated region on file ${on_string}"/>
+
+  </outputs>
+
+	<help>
+
+
+</help>
+
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# strip_header.sh - split a file into its comment lines and its content.
+#
+# Usage: strip_header.sh <inputfile> <outputfile> <header> <commentchar>
+#   outputfile   receives every line that is neither empty nor a comment
+#   header       receives every comment line
+#   commentchar  literal prefix that marks a comment line; when empty, no
+#                line counts as a comment
+
+inputfile=$1
+outputfile=$2
+header=$3
+commentchar=$4
+
+echo "commentchar: -${commentchar}-"
+
+# Make sure the header file exists even when no comment line is found.
+: > "$header" || exit 1
+
+# The prefix and header path travel through the environment so they are
+# compared and used literally: characters such as "/", "." or "*" in the
+# prefix are not treated as pattern syntax.
+COMMENT_PREFIX=$commentchar HEADER_OUT=$header awk '
+  BEGIN {
+    prefix = ENVIRON["COMMENT_PREFIX"]
+    header = ENVIRON["HEADER_OUT"]
+  }
+  prefix != "" && index($0, prefix) == 1 {
+    print > header
+    next
+  }
+  $0 != "" { print }
+' "$inputfile" > "$outputfile"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,35 @@
+ <tool id="header_remove" name="Strip Header" version="1">
+
+  <description> remove header from a file </description>
+
+  <command interpreter="bash">
+  	strip_header.sh
+  		$infile
+		$outputfile
+		$header
+		"$commentchar"
+  </command>
+
+  <outputs>
+  	<data format="tabular" name="outputfile" label="${sample} - Header Stripped on ${on_string}"/>
+  	<data format="tabular" name="header" label="${sample} - Stripped Header on ${on_string}"/>
+  </outputs>
+
+  <inputs>
+
+  	<param name="infile" type="data" label="Select file to extract columns from" />
+  	<param name="commentchar"	type="text" value="#" label="Comment character" help="Any lines starting with this character will be removed (as well as any empty lines)">
+  		<sanitizer sanitize="False"/>
+  	</param>
+  	<param name="sample"	type="text" value="" label="Sample Name" help="Optional. For file naming purposes only"/>
+
+  </inputs>
+
+  <help>
+
+
+**What it does**
+Removes header from a file. Any lines starting with the comment character specified by user, as well as any empty lines will be removed.
+
+  </help>
+</tool>