diff delete_overlapping_indels.pl @ 0:f16000dc644b draft default tip

Uploaded tool tarball.
author devteam
date Wed, 25 Sep 2013 10:24:36 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/delete_overlapping_indels.pl	Wed Sep 25 10:24:36 2013 -0400
@@ -0,0 +1,91 @@
+#!/usr/bin/perl -w
+
+# This program removes overlapping indels from a TABULAR format file containing
+# coordinates of indels in blocks extracted from multi-alignment.
+#
+# It requires four arguments:
+#   1. The input TABULAR format file, one indel per line.
+#   2. The number of the column where indel start coordinates are stored.
+#   3. The number of the column where indel end coordinates are stored.
+#   4. The output TABULAR format file.
+# Note: The number of the first column is 1.
+#
+# Each indel is treated as the closed interval [start, end]; the comparison
+# assumes start <= end. An input line is written to the output, unchanged and
+# in input order, unless some LATER input line overlaps it. Indels that overlap
+# nothing are therefore all kept, and of two overlapping indels only the one
+# that appears later in the file is kept.
+#
+# NOTE(review): the original header comment said the FIRST encountered indel of
+# an overlapping group is kept, but the original loop skipped the earlier line
+# instead. That selection rule is preserved here so existing outputs do not
+# change; confirm which rule is intended before altering it.
+
+use strict;
+use warnings;
+use Scalar::Util qw(looks_like_number);
+
+# check to make sure we were given the expected arguments
+my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
+die $usage unless @ARGV == 4;
+
+my ($inputFile, $indelStartArgument, $indelEndArgument, $outputFile) = @ARGV;
+
+# verify column numbers: each must be a positive integer. Checking the text
+# first avoids "isn't numeric" warnings and silently truncated values like 2.5.
+if ($indelStartArgument !~ /\A[0-9]+\z/ || $indelStartArgument < 1) {
+    die ("The indel start column number is invalid \n");
+}
+if ($indelEndArgument !~ /\A[0-9]+\z/ || $indelEndArgument < 1) {
+    die ("The indel end column number is invalid \n");
+}
+
+# convert the 1-based column numbers to 0-based array indices
+my $indelStartColumn = $indelStartArgument - 1;
+my $indelEndColumn = $indelEndArgument - 1;
+
+# Returns true when two parsed indels share at least one position.
+# Two closed intervals overlap exactly when each one starts no later than the
+# other one ends. Unlike testing whether an endpoint of the second indel lies
+# inside the first, this also catches a second indel that completely contains
+# the first one.
+sub indelsOverlap {
+    my ($indel1, $indel2) = @_;
+    return $indel2->{start} <= $indel1->{end} && $indel1->{start} <= $indel2->{end};
+}
+
+# read the input file, splitting every line only once; the whole file is parsed
+# and validated before the output file is opened
+open (my $input, "<", $inputFile) or die ("Could not open file $inputFile: $!\n");
+my @indels = ();
+while (my $line = <$input>) {
+    chomp ($line);
+    my @fields = split(/\t/, $line);
+    my ($start, $end) = @fields[$indelStartColumn, $indelEndColumn];
+
+    # a missing or non-numeric coordinate cannot be compared meaningfully
+    foreach my $coordinate ($start, $end) {
+        if (!defined($coordinate) || !looks_like_number($coordinate)) {
+            die ("Line " . $. . " of file $inputFile has a missing or non-numeric indel coordinate \n");
+        }
+    }
+    push (@indels, { line => $line, start => $start, end => $end });
+}
+close ($input);
+
+open (my $output, ">", $outputFile) or die ("Could not open file $outputFile: $!\n");
+
+# iterate through the indels of the input file
+INDEL1:
+foreach my $index1 (0 .. $#indels) {
+    # compare only with the indels on later lines
+    foreach my $index2 ($index1 + 1 .. $#indels) {
+        # an overlap was found: skip this indel and go back to the outer loop
+        next INDEL1 if indelsOverlap($indels[$index1], $indels[$index2]);
+    }
+
+    print {$output} $indels[$index1]{line} . "\n";
+}
+
+# buffered write errors only surface when the output file is closed
+close ($output) or die ("Could not write file $outputFile: $!\n");
\ No newline at end of file