annotate delete_overlapping_indels.pl @ 0:f16000dc644b draft default tip

Uploaded tool tarball.
author devteam
date Wed, 25 Sep 2013 10:24:36 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f16000dc644b Uploaded tool tarball.
devteam
parents:
diff changeset
#!/usr/bin/perl -w

# delete_overlapping_indels.pl
#
# Detects overlapping indels in a chromosome and keeps all non-overlapping
# indels. Within a group of overlapping indels, the first encountered one is
# kept and all later ones that overlap an already-kept indel are removed.
#
# Inputs (command-line arguments, in order):
#   1. A TABULAR format file containing coordinates of indels in blocks
#      extracted from multi-alignment.
#   2. The number of the column holding the indel start coordinates.
#   3. The number of the column holding the indel end coordinates.
#   4. The path of the TABULAR output file.
#
# Output: a TABULAR file containing, in input order, every indel that does not
# overlap an indel written before it.
#
# Notes:
#   - The number of the first column is 1.
#   - Coordinates are treated as closed intervals: two indels that merely
#     share an end point count as overlapping.
#   - Lines whose start/end fields are missing or not numeric (blank lines,
#     header lines) cannot be tested for overlap; they are copied to the
#     output unchanged and never cause another line to be dropped.

use strict;
use warnings;

use Scalar::Util qw(looks_like_number);

# Check that we were given the expected arguments.
my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n";
die $usage unless @ARGV == 4;

my ($inputFile, $indelStartArg, $indelEndArg, $outputFile) = @ARGV;

# Verify the column numbers: they are 1-based, so each must be a positive
# integer. Checking the raw text avoids silently coercing values such as "abc".
if ($indelStartArg !~ /\A[0-9]+\z/ || $indelStartArg < 1) {
    die ("The indel start column number is invalid \n");
}
if ($indelEndArg !~ /\A[0-9]+\z/ || $indelEndArg < 1) {
    die ("The indel end column number is invalid \n");
}

# Convert to 0-based indexes into the split fields.
my $indelStartColumn = $indelStartArg - 1;
my $indelEndColumn   = $indelEndArg - 1;

# Open the input and output files.
open(my $input, "<", $inputFile)
    or die ("Could not open file $inputFile: $! \n");
open(my $output, ">", $outputFile)
    or die ("Could not open file $outputFile: $! \n");

# [start, end] of every indel already written to the output. A new indel is
# kept only if it overlaps none of these, which is what makes the first
# encountered indel of an overlapping group the survivor.
my @keptIntervals = ();

# Stream the input one line at a time instead of loading the whole file.
INDEL:
while (my $indel = <$input>) {
    chomp($indel);

    my @indelFields = split(/\t/, $indel);
    my $start = $indelFields[$indelStartColumn];
    my $end   = $indelFields[$indelEndColumn];

    # Lines without usable coordinates are passed through untouched.
    unless (defined $start && defined $end
            && looks_like_number($start) && looks_like_number($end)) {
        print {$output} $indel . "\n";
        next INDEL;
    }

    # Tolerate rows whose start and end are stored in reverse order.
    ($start, $end) = ($end, $start) if $start > $end;

    # Two closed intervals overlap exactly when each one starts no later than
    # the other ends. This also covers one indel fully containing another.
    foreach my $kept (@keptIntervals) {
        next INDEL if $start <= $kept->[1] && $kept->[0] <= $end;
    }

    push @keptIntervals, [$start, $end];
    print {$output} $indel . "\n";
}

# Close the files; a failed close on the output means buffered data was lost.
close($output) or die ("Could not close file $outputFile: $! \n");
close($input);