Mercurial > repos > xuebing > sharplabtool
comparison tools/regVariation/compute_motif_frequencies_for_all_motifs.pl @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
#!/usr/bin/perl -w

# A program to compute the frequency of each motif in a user-chosen number of
# windows of both the upstream and the downstream sequences flanking indels.
#
# Arguments (all four are required):
#   1. A TABULAR file of motifs, one motif per line, made of two tab-separated
#      columns: the motif name (left) and the motif sequence (right).
#   2. A TABULAR file containing the windows into which the upstream and the
#      downstream sequences flanking each indel have been divided, one indel
#      per line. The middle column of a line is treated as the indel position;
#      columns before it are upstream windows, columns after it downstream.
#   3. An integer: the number of windows to consider on each side of the
#      indel. Anything that is not a positive integer is replaced by 1.
#   4. The path of the TABULAR output file.
#
# The output has three tab-separated columns: the motif name, the number of
# considered upstream windows in which the motif occurs, and the number of
# considered downstream windows in which it occurs. A window counts once no
# matter how many times the motif occurs inside it.
# Lines are written motif by motif, one line per indel, so the total number
# of output lines = number of motifs x number of indels.

use strict;
use warnings;

# Read a tab-separated file and return one array reference of fields per
# non-blank line. Line endings (LF or CRLF) are removed. Trailing empty
# fields are dropped, which is the default behaviour of split().
sub read_tabular_rows {
    my ($path) = @_;

    open(my $fh, "<", $path)
        or die("Could not open file $path: $!\n");

    my @rows;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;

        # A blank line is neither a motif nor an indel.
        next if $line =~ /\A\s*\z/;

        push @rows, [ split(/\t/, $line) ];
    }

    close($fh);
    return @rows;
}

# Given the windows of one indel and the number of windows to consider on
# each side, return two array references: the considered upstream windows
# and the considered downstream windows.
sub split_flanking_windows {
    my ($windows, $numberOfWindows) = @_;

    my $lastIndex = $#{$windows};

    # The indel sits in the middle column. int() keeps the index integral
    # when a row has an even number of columns.
    my $indelIndex = int($lastIndex / 2);

    # Clamp both ranges to the row. A negative subscript would silently wrap
    # around to the end of the array and count downstream windows as
    # upstream ones; a subscript past the end would yield undef.
    my $upstreamFirst = $indelIndex - $numberOfWindows;
    $upstreamFirst = 0 if $upstreamFirst < 0;

    my $downstreamLast = $indelIndex + $numberOfWindows;
    $downstreamLast = $lastIndex if $downstreamLast > $lastIndex;

    my @upstream   = @{$windows}[ $upstreamFirst .. $indelIndex - 1 ];
    my @downstream = @{$windows}[ $indelIndex + 1 .. $downstreamLast ];

    return (\@upstream, \@downstream);
}

# Count how many of the given windows match the compiled motif pattern.
sub count_windows_with_motif {
    my ($motifRegex, $windows) = @_;

    return scalar grep { $_ =~ $motifRegex } @{$windows};
}

# check to make sure having correct arguments
my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [numberOfWindows] [TABULAR.out] \n";
die $usage unless @ARGV == 4;

# get the input arguments
my ($motifsInputFile,
    $indelFlankingSequencesWindowsInputFile,
    $numberOfConsideredWindows,
    $motifFrequenciesOutputFile) = @ARGV;

# if the number of considered windows is not a positive integer, make it 1
if ($numberOfConsideredWindows !~ /\A[+-]?\d+\z/ || $numberOfConsideredWindows <= 0) {
    $numberOfConsideredWindows = 1;
}

# load the motifs, keeping only the lines that carry a name and a sequence
my @motifs;
foreach my $row (read_tabular_rows($motifsInputFile)) {
    my ($motifName, $motifSequence) = @{$row};

    # An empty pattern would make the match operator reuse the last
    # successful pattern, so a motif without a sequence cannot be counted.
    if (!defined $motifSequence || $motifSequence eq "") {
        warn "Skipping motif line without a sequence: " . join("\t", @{$row}) . "\n";
        next;
    }

    # The sequence is used as a regular expression, matched without regard
    # to case. Compile it once instead of once per window.
    push @motifs, [ $motifName, qr/$motifSequence/i ];
}

# load the windows of every indel and keep only the considered ones; this
# does not depend on the motif, so it is done once up front
my @flankingWindows;
foreach my $windows (read_tabular_rows($indelFlankingSequencesWindowsInputFile)) {
    push @flankingWindows, [ split_flanking_windows($windows, $numberOfConsideredWindows) ];
}

# open the output file only after both inputs have been read successfully
open(my $output, ">", $motifFrequenciesOutputFile)
    or die("Could not open file $motifFrequenciesOutputFile: $!\n");

# for every motif, count its occurrences in the considered windows of every
# indel and store the counts in the output file
foreach my $motif (@motifs) {
    my ($motifName, $motifRegex) = @{$motif};

    foreach my $flanks (@flankingWindows) {
        my ($upstreamWindows, $downstreamWindows) = @{$flanks};

        my $upstreamMotifFrequencyCounter   = count_windows_with_motif($motifRegex, $upstreamWindows);
        my $downstreamMotifFrequencyCounter = count_windows_with_motif($motifRegex, $downstreamWindows);

        print {$output} join("\t", $motifName, $upstreamMotifFrequencyCounter, $downstreamMotifFrequencyCounter), "\n";
    }
}

# buffered write errors only surface when the file is closed
close($output)
    or die("Could not write file $motifFrequenciesOutputFile: $!\n");