view tools/regVariation/delete_overlapping_indels.pl @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/perl -w

# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, 
# the first encountered one is kept and all others are removed. It requires three inputs: 
# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment.
# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file.
# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file.
# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones.
# Note: The number of the first column is 1.
 
use strict;
use warnings;

#varaibles to handle information related to indels
my $indel1 = "";
my $indel2 = "";
my @indelArray1 = ();
my @indelArray2 = ();
my $lineCounter1 = 0;
my $lineCounter2 = 0;
my $totalNumberofNonOverlappingIndels = 0;

# check to make sure having correct files
my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
die $usage unless @ARGV == 4;

my $inputFile = $ARGV[0];
my $indelStartColumn = $ARGV[1] - 1;
my $indelEndColumn = $ARGV[2] - 1;
my $outputFile = $ARGV[3];

#verifie column numbers
if ($indelStartColumn < 0 ){
	die ("The indel start column number is invalid \n"); 
}
if ($indelEndColumn < 0 ){
	die ("The indel end column number is invalid \n"); 
}

#open the input and output files
open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); 
open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); 

#store the input file in the array @rawData
my @indelsRawData = <INPUT>;

#iterated through the indels of the input file
INDEL1:
foreach $indel1 (@indelsRawData){
	chomp ($indel1);
	$lineCounter1++;
	
	#get the first indel
	@indelArray1 = split(/\t/, $indel1);
	 
	#our purpose is to detect overlapping indels and to store one copy of them only in the output file
	#all other non-overlapping indels will stored in the output file also
			 
	$lineCounter2 = 0;
		 
	#iterated through the indels of the input file
	INDEL2:
	foreach $indel2 (@indelsRawData){
		chomp ($indel2);
		$lineCounter2++;
				
		if ($lineCounter2 > $lineCounter1){
			#get the second indel
			@indelArray2 = split(/\t/, $indel2);
		 				
 			#check if the two indels are overlapping
 			if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){
 				#print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n");
 				#print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n");
 				
 				#break out of the loop and go back to the outerloop
 				next INDEL1;
 			}
 			else{
 				#print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n");
 			}
		}
	}
		 
	print OUTPUT $indel1 . "\n";
	$totalNumberofNonOverlappingIndels++;
}

#print("The total number of indels is: " . $lineCounter1 . "\n");
#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n");

#close the input and output files
close(OUTPUT);
close(INPUT);