# HG changeset patch
# User saskia-hiltemann
# Date 1445519910 14400
# Node ID e77c9484b2d06563f90b307a3ff2de15f1a3fe4c
Uploaded

diff -r 000000000000 -r e77c9484b2d0 chrprefix.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,67 @@
+#!/bin/bash
+#
+# chrprefix.sh - add or remove the "chr" prefix in one column of a
+# tab-delimited file.
+#
+# Usage: chrprefix.sh INPUTFILE COLUMN ADDREMOVE OUTPUTFILE
+#   COLUMN     column label that starts with the column number, counted
+#              from 1. Anything after the leading digits is ignored.
+#   ADDREMOVE  "add" prepends "chr" to the column, any other value
+#              strips a leading "chr" from it
+#
+# Lines starting with "#" are copied unchanged. When removing, the first
+# line of the file is copied unchanged as well.
+
+inputfile=$1
+col=$2
+addremove=$3
+outputfile=$4
+
+echo "args: $*"
+echo "inputfile: $inputfile"
+echo "column: $col"
+echo "addremove: $addremove"
+echo "outputfile: $outputfile"
+
+#get column number: keep the leading digits of the label
+column=${col%%[!0-9]*}
+echo "colnumber: $column"
+
+if [ -z "$column" ]
+then
+    echo "error: column argument does not start with a number: $col" >&2
+    exit 1
+fi
+
+if [ "$addremove" = "add" ]
+then
+    echo "adding prefix to column $column"
+    awk -v c="$column" 'BEGIN{
+        FS="\t"
+        OFS="\t"
+    }{
+        if (index($0,"#")!=1){
+            $c="chr"$c
+        }
+        print $0
+    }' "$inputfile" > "$outputfile"
+
+else #remove prefix
+    echo "removing prefix from column $column"
+    awk -v c="$column" 'BEGIN{
+        FS="\t"
+        OFS="\t"
+    }{
+        # only strip values that really start with the prefix
+        if (FNR>1 && index($0,"#")!=1 && index($c,"chr")==1){
+            $c=substr($c,4)
+        }
+        print $0
+    }' "$inputfile" > "$outputfile"
+fi
+
+echo "inputfile: "
+head -5 "$inputfile"
+
+echo "outputfile: "
+head -5 "$outputfile"
diff -r 000000000000 -r e77c9484b2d0 chrprefix.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,47 @@
+
+
+   add or remove chr prefix from a column
+
+


+   
+   
+  	chrprefix.sh 
+  		$infile
+  		"${go.column}"
+  		$addremove
+		$outputfile		
+  
+
+  
+  	
+  
+  
+  
+
+  	   
+  	
+		
+			
+				
+		
+			
+				
+  			
+  		
+  			
+  		
+  	
+  		
+  		 
+  	 
+  		
+
+  
+
+  
+  
+**What it does**
+Removes or adds the "chr" prefix in a column of a file. Some tools expect chromosomes to be written as "chr1,chr2,chrX", while others expect only 1,2,X as input. This tool allows you to easily switch between the two notations.
+
+  
+


diff -r 000000000000 -r e77c9484b2d0 column_extract.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Extract user-selected columns from a tab-delimited input file.
+inputfile=$1
+outputfile=$2
+removeheader=$3	# header lines are printed only when this is "N"
+columns="$@"	# every argument joined with spaces, including the three above
+
+cols="${columns// /,}" #replace each space with a comma
+
+#skip first three arguments by cutting up to the first comma three times
+firstcomma=`expr index "$cols" ,`
+cols="${cols:$firstcomma}"
+secondcomma=`expr index "$cols" ,`
+cols="${cols:$secondcomma}"
+thirdcomma=`expr index "$cols" ,`
+cols="${cols:$thirdcomma}"
+cols="${cols//:/}" #remove colons
+echo "colums to print: $cols"
+
+arr=$(echo $cols | tr "," "\n")	# one item per line, used only by the echo loop below
+
+for x in $arr
+do
+    echo $x
+done
+
+myArray=($columns)	# split the joined arguments on whitespace
+i=3	# index of the first word after the three fixed arguments
+len=${#myArray[@]}
+mycols=""
+echo "len: $len"
+while [ $i -le $len ]
+do	
+	echo "myarray: ${myArray[$i]}"
+	mycols+=${myArray[$i]}
+	i=$[$i+2]	# take every second word - presumably each column arrives as "N:" followed by its name, TODO confirm
+done
+mycols="${mycols//:/,}" #turn each colon into a comma
+mycols="${mycols%?}"	#drop the last character, presumably the trailing comma
+echo "mycols: $mycols"
+
+awk 'BEGIN{
+		FS="\t";
+		OFS="\t";
+		columns="'"$mycols"'";
+		len=split(columns,arr,",")
+	}{
+	 if (index($1,"#")==1 || $1==""){ #header line: first field starts with # or is empty
+		if("'"$removeheader"'"=="N"){
+            print $0
+		}
+	 }	
+	 else{	
+		 for (i=1;i $outputfile
+# NOTE(review): the awk program above is cut off inside the for loop in this copy. Restore the missing lines from the original script.
+
diff -r 000000000000 -r e77c9484b2d0 column_extract.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,50 @@
+
+
+   extract/rearrange columns from a tab-delimited file 
+  
+
+   
+  	column_extract.sh 
+		$infile
+		$outputfile
+		$removeheader
+		#for $c in $go.columns
+			${c.column}
+		#end for
+  
+
+  
+  	
+  
+  
+  
+
+      	
+   
+   
+   
+		
+			
+				
+		
+			
+				
+		   
+		   		
+				
+		   	
+	   	
+   
+   
+  	
+
+  
+
+  
+  
+
+**What it does**
+Outputs columns of input file in order specified by user. Columns not selected will not be output. 
+
+  
+
diff -r 000000000000 -r e77c9484b2d0 concatenate.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+# concatenate.sh - write the contents of two files, in order, to a third.
+#
+# Usage: concatenate.sh FILE1 FILE2 OUTFILE
+
+# quoted so that paths containing whitespace stay single arguments
+cat -- "$1" "$2" > "$3"
diff -r 000000000000 -r e77c9484b2d0 concatenate.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+ 
+
+   concatenate 2 files 
+  
+   
+  	concatenate.sh 
+  		$infile
+  		$infile2
+		$outputfile		
+  
+
+  
+  	
+  
+  
+  
+
+  	   	
+  	  	
+  	 
+  		
+
+  
+
+  
+  
+**What it does**
+Concatenates 2 files
+
+  
+
diff -r 000000000000 -r e77c9484b2d0 filter_columns.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+echo $@ 
+count=1
+
+set -- `getopt -n$0 -u -a --longoptions="infile: outfile_rem: outfile_ret: filter:  " "h:" "$@"` || usage
+[ $# -eq 0 ] && usage
+
+
+while [ $# -gt 0 ]
+do
+    case "$1" in
+    --infile)      	infile=$2;shift;;  
+	--outfile_rem)	outfile_rem=$2;shift;; 
+	--outfile_ret)	outfile_ret=$2;shift;;
+	--filter)		filter[$count]=$2 
+					count=$[$count+1] 
+					shift;;  	
+    -h)   	     	shift;;
+	--)   	     	shift;break;;
+    -*)   	     	usage;;
+    *)    	     	break;;            
+    esac
+    shift
+done
+
+
+
+
+# remove but remember header
+cp $infile inputfile
+sed -i -e "1d" inputfile
+
+head inputfile
+
+count=1
+cp $infile inputfile
+
+for f in ${filter[@]}
+do
+	echo "filter $count: $f" 
+	count=$[$count+1] 
+	
+	IFS=',' read -a filt <<< "$f"
+	
+	#filt=split(${f//,/ })
+	column=${filt[0]}
+	op=${filt[1]}
+	value=${filt[2]}
+	
+	echo "column: $column"
+	echo "op : $op"
+	echo "value: $value"
+	
+	#perform filtering
+	
+	awk 'BEGIN{
+			FS="\t"
+			OFS="\t"
+			op="'"$op"'"
+			numeric_value="'"$value"'"+0
+			
+		}{
+			# keep header in both output files
+			if(FNR==1 || index($0,"#")==1 || index($0,"<")==1 || index($0,">")==1 || NF==0){
+				print $0 >> "outfile_removed"
+				print $0 >> "outfile_retained"
+			}
+			
+			if ( "'"$op"'"== "equals" && $"'"$column"'" == "'"$value"'"){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "nequals" && $"'"$column"'" != "'"$value"'"){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "contains" && index($"'"$column"'", "'"$value"'") != 0){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "ncontains" && index($"'"$column"'", "'"$value"'") == 0){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "empty" && $"'"$column"'" == ""){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "nonempty" && $"'"$column"'" != ""){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "lt" && $"'"$column"'"+0 < "'"$value"'"+0){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "le" && $"'"$column"'"+0 <= "'"$value"'"+0){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "gt" && $"'"$column"'"+0 > "'"$value"'"+0){
+				print $0 >> "outfile_removed"
+			}
+			else if ( "'"$op"'"== "ge" && $"'"$column"'"+0 >= "'"$value"'"+0){
+				print $0 >> "outfile_removed"
+			}
+			else
+			  print $0 >> "outfile_retained"
+		
+		}END{}' inputfile	
+	
+		#next iteration only run on retained lines
+		cp outfile_retained inputfile
+		rm outfile_retained
+done
+
+#remove duplicate lines in outputfiles
+cat  outfile_removed | uniq > $outfile_rem
+cat  inputfile | uniq > $outfile_ret
+
+#awk ' !x[$0]++' outfile_removed > $outfile_rem 
+#awk ' !x[$0]++' outfile_retained > $outfile_ret 
+
diff -r 000000000000 -r e77c9484b2d0 filter_columns.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,53 @@
+ 
+
+   filter file based on column values 
+  
+   
+  	filter_columns.sh 
+  		--infile $infile
+  		#for $f in $filters 
+			--filter ${f.column},${f.condition},${f.value}
+		#end for
+		--outfile_rem $outputfile_removed
+		--outfile_ret $outputfile_retained		
+  
+
+  
+	  
+	  
+  
+  
+  
+
+	
+  	
+  	
+		 
+		
+			
+			
+			
+			
+			
+			
+			
+			
+			
+			
+		
+		
+		
+				
+					
+  	 
+  		
+
+  
+
+  
+  
+**What it does**
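+Splits a file into removed and retained lines based on column values. Lines matching any of the filters are written to the removed output; all other lines are written to the retained output.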
+
+  
+
diff -r 000000000000 -r e77c9484b2d0 getcolumnnames.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getcolumnnames.py	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,30 @@
+import os, sys
+import fnmatch
+import csv
+	
+def get_headers(inputfile):	
+	columnList=[]
+	#line=inputfile.readlines()[0]
+	filename=inputfile.get_file_name()
+	try:
+		f = open(filename)
+		line=f.readline()	
+		while(line[0]=='#' or (not line.strip())):	#remove header (starting with hash sign and empty lines to get to headerline
+			line=f.readline()
+		line = line.strip()		
+		i=1;
+		for col in line.split("\t"):
+			label=str(i)+': '+str(col)
+			columnList.append([label,label,False])		
+			i+=1		
+		
+	except IOError as e:	
+		pass	
+	
+	return columnList
+	
+	
+
+
+
+
diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# sort_chromosomal_position.sh - sort a file by chromosome, then by
+# start position, keeping the leading header lines in place.
+#
+# Usage: sort_chromosomal_position.sh INFILE CHRCOL STARTCOL ENDCOL NUM_HEADERLINES OUTFILE
+#   CHRCOL, STARTCOL  column numbers used as sort keys
+#   ENDCOL            accepted but not used by the sort
+#   NUM_HEADERLINES   number of leading lines copied unsorted
+#
+# header.tsv and tmpout.txt are written to the current directory.
+
+infile=$1
+chrcol=$2
+startcol=$3
+endcol=$4
+num_headerlines=$5
+outfile=$6
+
+#remember header
+head -n "$num_headerlines" -- "$infile" > header.tsv
+
+#sort everything after the header. tail is used instead of deleting
+#lines 1 to N with sed because that also deletes line 1 when N is 0
+tail -n "+$((num_headerlines + 1))" -- "$infile" | sort -k "${chrcol},${chrcol}V" -k "${startcol},${startcol}n" > tmpout.txt
+
+cat header.tsv tmpout.txt > "$outfile"
diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+
+	 sort file by chromosome, then by position 
+	
+		sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $sorted_file
+	
+	
+	
+	
+										
+		
+		 
+		 		
+		 
+				
+	
+				
+	
+
+  
+      	 	
+     
+  
+
+	 
+
+ 
+
+
+
+
+
diff -r 000000000000 -r e77c9484b2d0 strip_header.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.sh	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# strip_header.sh - split a file into header lines and remaining lines.
+#
+# Usage: strip_header.sh INPUTFILE OUTPUTFILE HEADERFILE COMMENTCHAR
+#   OUTPUTFILE  receives the input minus lines starting with COMMENTCHAR
+#               and minus empty lines
+#   HEADERFILE  receives only the lines starting with COMMENTCHAR
+#
+# COMMENTCHAR is inserted into a sed address as-is, so it is interpreted
+# as a regular expression and must not contain a slash.
+
+inputfile=$1
+outputfile=$2
+header=$3
+commentchar=$4
+
+echo "commentchar: -${commentchar}-"
+
+sed -e "/^${commentchar}/d" -e '/^$/d' -- "$inputfile" > "$outputfile"
+sed -n "/^${commentchar}/p" -- "$inputfile" > "$header"
diff -r 000000000000 -r e77c9484b2d0 strip_header.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.xml	Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,35 @@
+ 
+
+   remove header from a file 
+  
+   
+  	strip_header.sh 
+  		$infile
+		$outputfile
+		$header
+		"$commentchar"		
+  
+
+  
+  	
+  	
+  
+  
+  
+
+  	   	
+  	
+  		
+  			   
+  		
+
+  
+
+  
+  
+
+**What it does**
+Removes header from a file. Any lines starting with the comment character specified by user, as well as any empty lines will be removed.
+
+  
+