annotate categorize_elements_satisfying_criteria.pl @ 0:586c1f0e1515 draft default tip

Uploaded tool tarball.
author devteam
date Wed, 25 Sep 2013 10:03:03 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
1 #!/usr/bin/perl -w
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
2
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
3 # The program takes as input a set of categories, such that each category contains many elements.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
4 # It also takes a table relating elements with criteria, such that each element is assigned a number
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
5 # representing the number of times the element satisfies a certain criterion.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
6 # The first input is a TABULAR format file, such that the left column represents the names of categories, and
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
7 # all other columns represent the names of elements.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
8 # The second input is a TABULAR format file relating elements with criteria, such that the first line
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
9 # represents the names of criteria and the left column represents the names of elements.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
10 # The output is a TABULAR format file relating categories with criteria, such that each category is
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
11 # assigned a number representing the total number of times its elements satisfy a certain criterion.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
12 # Each category is assigned as many numbers as criteria.
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
13
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
14 use strict;
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
15 use warnings;
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
16
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# State used while reading the categories input file (first argument).
my @categoryElementsArray;          # tab-separated fields of the category line being parsed
my @categoriesArray;                # category names, in input order
my ($categoryMemberNames, $categoryName);
my %categoryMembersHash;            # element name => name of the category it belongs to
my ($memberNumber, $totalMembersNumber, $totalCategoriesNumber) = (0, 0, 0);
my @categoryCountersTwoDimArray;    # [row][0] holds the category name, [row][1..] the counters
my $lineCounter1 = 0;

# State used while reading the elements/criteria input file (second argument).
my ($elementLine, $elementName);
my @elementDataArray;               # tab-separated fields of the element line being parsed
my @criteriaArray;                  # criteria names taken from the first line of the file
my ($criteriaNumber, $totalCriteriaNumber, $lineCounter2) = (0, 0, 0);

# Row and column indices used to address the two-dimensional counter array.
my ($row, $column) = (0, 0);
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
41
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# Validate the command line: exactly two TABULAR inputs and one TABULAR output
# are required. The usage text names this script.
my $usage = "usage: categorize_elements_satisfying_criteria.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n";
die $usage unless @ARGV == 3;

# Positional arguments, in order: the categories input file, the
# elements/criteria input file, and the output file.
my ($categories_inputFile, $elements_data_inputFile, $categorized_data_outputFile) = @ARGV;

# Open both inputs for reading and the output for writing. The system error
# ($!) is appended so a failure says why the file could not be opened
# (missing file, permissions, ...), not only which file was involved.
open(INPUT1, "<", $categories_inputFile)
    or die("Could not open file $categories_inputFile: $!\n");
open(INPUT2, "<", $elements_data_inputFile)
    or die("Could not open file $elements_data_inputFile: $!\n");
open(OUTPUT, ">", $categorized_data_outputFile)
    or die("Could not open file $categorized_data_outputFile: $!\n");
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
59
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# Read the whole categories file into memory. Each line has the form
# "category<TAB>element<TAB>element...".
my @categoriesData = <INPUT1>;

# Index of the next category row to fill.
$lineCounter1 = 0;

# Record every category name (in input order) and remember, for each element,
# the category it belongs to.
foreach $categoryMemberNames (@categoriesData) {
    # Remove the line terminator. CR is stripped as well so that a file with
    # Windows line endings does not leave a "\r" glued to the last element
    # name, which would stop it from matching the element names read later.
    $categoryMemberNames =~ s/[\r\n]+\z//;

    @categoryElementsArray = split(/\t/, $categoryMemberNames);

    # split() drops trailing empty fields, so a blank or tab-only line yields
    # no fields at all. Skip it instead of recording a category whose name is
    # undefined (which would produce a bogus row in the output).
    next unless @categoryElementsArray;

    my $currentCategory = $categoryElementsArray[0];

    # Store the category name in the list of categories and in column 0 of
    # its row in the two-dimensional counter array.
    $categoriesArray[$lineCounter1]                = $currentCategory;
    $categoryCountersTwoDimArray[$lineCounter1][0] = $currentCategory;

    # Total number of fields on the line (category name + its elements).
    $totalMembersNumber = @categoryElementsArray;

    # Fields 1 .. last are the elements of this category. An element listed
    # under several categories ends up mapped to the last one read.
    foreach my $member (@categoryElementsArray[1 .. $totalMembersNumber - 1]) {
        $categoryMembersHash{$member} = $currentCategory;
    }

    $lineCounter1++;
}
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
89
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# Read the whole elements/criteria file into memory. Its first line lists the
# criteria names only (no label for the element column: criteria are taken
# from field 0 onward, while counts on data lines are taken from field 1
# onward). Every following line is "element<TAB>count<TAB>count...".
my @elementsData = <INPUT2>;

# Number of lines read so far from the second input file.
$lineCounter2 = 0;

# Map each category name to its row in the counter array. Building it once
# replaces a linear scan of @categoriesArray for every counted cell. If a
# category name occurs more than once, the first row wins.
my %categoryRowOf;
foreach my $categoryRow (0 .. $#categoriesArray) {
    my $name = $categoriesArray[$categoryRow];
    next unless defined $name;
    $categoryRowOf{$name} = $categoryRow unless exists $categoryRowOf{$name};
}

# Accumulate, per category and criterion, the counts of the category's elements.
foreach $elementLine (@elementsData) {
    # Remove the line terminator (LF or CRLF).
    $elementLine =~ s/[\r\n]+\z//;

    $lineCounter2++;

    @elementDataArray = split(/\t/, $elementLine);

    # First line: record the criteria names, the number of criteria and the
    # number of categories, then zero every counter cell (columns 1..N of
    # each category row; column 0 already holds the category name).
    if ($lineCounter2 == 1) {
        @criteriaArray         = @elementDataArray;
        $totalCriteriaNumber   = @criteriaArray;
        $totalCategoriesNumber = @categoriesArray;

        foreach my $categoryRow (0 .. $totalCategoriesNumber - 1) {
            foreach my $counterColumn (1 .. $totalCriteriaNumber) {
                $categoryCountersTwoDimArray[$categoryRow][$counterColumn] = 0;
            }
        }
        next;
    }

    # Blank data lines carry no element.
    next unless @elementDataArray;

    $elementName = $elementDataArray[0];

    # An element that belongs to no category must not be counted. Without
    # this check the row index would be undefined, Perl would treat it as 0,
    # and the counts would be silently added to the first category.
    $categoryName = $categoryMembersHash{$elementName};
    next unless defined $categoryName;

    my $categoryIndex = $categoryRowOf{$categoryName};
    next unless defined $categoryIndex;

    # Add every positive count to the matching cell of the category's row.
    foreach my $criterion (0 .. $totalCriteriaNumber - 1) {
        my $count = $elementDataArray[$criterion + 1];

        # A line shorter than the header has no value for this criterion.
        next unless defined $count && length $count;
        next unless $count > 0;

        $categoryCountersTwoDimArray[$categoryIndex][$criterion + 1] += $count;
    }
}
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
140
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# Header row: an empty leading cell (the column above the category names)
# followed by the criteria names separated by tabs. The row is terminated
# with a newline only when there is at least one criterion.
print OUTPUT "\t";
if ($totalCriteriaNumber > 0) {
    print OUTPUT join("\t", @criteriaArray[0 .. $totalCriteriaNumber - 1]) . "\n";
}

# One row per category: the category name (column 0) followed by its counter
# for every criterion (columns 1 .. number of criteria), tab-separated.
foreach my $categoryCells (@categoryCountersTwoDimArray[0 .. $totalCategoriesNumber - 1]) {
    print OUTPUT join("\t", @{$categoryCells}[0 .. $totalCriteriaNumber]) . "\n";
}
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
167
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
# Close the output and input files. The result of closing the output handle
# is checked because buffered write errors (for example a full disk) are only
# reported when the handle is closed; ignoring them could leave a truncated
# output file behind a successful exit status.
close(OUTPUT) or die("Could not close file $categorized_data_outputFile: $!\n");
close(INPUT2);
close(INPUT1);
586c1f0e1515 Uploaded tool tarball.
devteam
parents:
diff changeset
172