#!/usr/bin/perl -w

# A program to compute the frequencies of each motif at a window size, determined by the user, in both
# upstream and downstream sequences flanking indels in all chromosomes.
# The first input is a TABULAR format file containing the motif names and sequences, such that the file
# consists of two columns: the left column represents the motif names and the right column represents
# the motif sequence, one line per motif.
# The second input is a TABULAR format file containing the windows into which both upstream and downstream
# sequences flanking indels have been divided.
# The third input is an integer number representing the number of windows to be considered in both
# upstream and downstream flanking sequences.
# The output is a TABULAR format file consisting of three columns: the left column represents the motif
# name, the middle column represents the motif frequency in the window of the upstream sequence flanking
# an indel, and the right column represents the motif frequency in the window of the downstream
# sequence flanking an indel, one line per indel.
# The total number of lines in the output file = number of motifs x number of indels.

use strict;
use warnings;

# Count how many of the windows $windows->[$first .. $last] contain a match
# for a compiled motif pattern.
#   $pattern - compiled regular expression (qr//) of one motif
#   $windows - reference to the array of windows of one input line
#   $first   - index of the first window to inspect
#   $last    - index of the last window to inspect
# The index range is clamped to the bounds of the array. Without the clamp a
# negative index would wrap around to the end of the array (so downstream
# windows would be counted as upstream ones), and an index past the end would
# yield undef and an "uninitialized value" warning.
# Returns the number of inspected windows in which the motif was found; a
# window is counted once however many times the motif occurs in it.
sub countWindowsContainingMotif {
    my ($pattern, $windows, $first, $last) = @_;

    $first = 0            if $first < 0;
    $last  = $#{$windows} if $last > $#{$windows};

    my $count = 0;
    for my $windowNumber ($first .. $last) {
        $count++ if $windows->[$windowNumber] =~ $pattern;
    }
    return $count;
}

# check that the expected number of arguments was given
my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n";
die $usage unless @ARGV == 4;

# get the input arguments
my ($motifsInputFile,
    $indelFlankingSequencesWindowsInputFile,
    $requestedNumberOfWindows,
    $motifFrequenciesOutputFile) = @ARGV;

# the number of considered windows must be a positive integer; anything else
# (zero, negative, non-numeric, non-integer) falls back to 1. The value is
# validated with a pattern so that a non-numeric argument does not trigger a
# numeric warning.
my $numberOfConsideredWindows = 1;
if ($requestedNumberOfWindows =~ /\A\s*\+?(\d+)\s*\z/ && $1 > 0) {
    $numberOfConsideredWindows = $1 + 0;
}

# open the input and output files
open(my $motifsHandle, "<", $motifsInputFile)
    or die("Could not open file $motifsInputFile: $!\n");
open(my $windowsHandle, "<", $indelFlankingSequencesWindowsInputFile)
    or die("Could not open file $indelFlankingSequencesWindowsInputFile: $!\n");
open(my $outputHandle, ">", $motifFrequenciesOutputFile)
    or die("Could not open file $motifFrequenciesOutputFile: $!\n");

# read the motifs: each entry of @motifs is [ motif name, compiled pattern ]
my @motifs = ();
while (my $motif = <$motifsHandle>) {

    # strip the line ending, including the "\r" of a CRLF file, which would
    # otherwise become part of the motif sequence and prevent any match
    $motif =~ s/\r?\n?\z//;

    # split the motif data into its name and its sequence
    my ($motifName, $motifSequence) = split(/\t/, $motif);

    # skip blank lines and lines without a sequence column: an empty pattern
    # makes m// reuse the last successfully matched pattern, which would
    # produce meaningless counts
    next unless defined $motifSequence && length $motifSequence;

    # The motif sequence is used as a case-insensitive regular expression,
    # exactly as the original match did. It is compiled once here instead of
    # once per window.
    my $motifPattern = eval { qr/$motifSequence/i };
    die("Invalid motif sequence '$motifSequence' for motif '$motifName' in file $motifsInputFile: $@")
        unless defined $motifPattern;

    push @motifs, [$motifName, $motifPattern];
}
close($motifsHandle);

# read the lines of the second input file; each line holds the windows of the
# upstream and downstream flanking sequences of one indel
my @windowsData = <$windowsHandle>;
close($windowsHandle);
s/\r?\n?\z// for @windowsData;

# The index of the indel window (the middle column) is derived from the first
# line only and reused for every line. int() makes the truncation explicit
# when the number of columns is even.
my $indelIndex = 0;
if (@windowsData) {
    my @firstLineWindows = split(/\t/, $windowsData[0]);
    $indelIndex = int((@firstLineWindows - 1) / 2) if @firstLineWindows;
}

# for every motif, and for every indel line, count the considered windows that
# contain the motif and write one output line: name, upstream count,
# downstream count
for my $motif (@motifs) {
    my ($motifName, $motifPattern) = @{$motif};

    for my $upstreamAndDownstreamFlankingSequencesWindows (@windowsData) {

        # split both upstream and downstream flanking sequences into their windows
        my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows);

        # the considered windows immediately before the indel
        my $upstreamMotifFrequencyCounter = countWindowsContainingMotif(
            $motifPattern, \@windowsArray,
            $indelIndex - $numberOfConsideredWindows, $indelIndex - 1);

        # the considered windows immediately after the indel
        my $downstreamMotifFrequencyCounter = countWindowsContainingMotif(
            $motifPattern, \@windowsArray,
            $indelIndex + 1, $indelIndex + $numberOfConsideredWindows);

        # store the result into the output file
        print {$outputHandle} $motifName . "\t" . $upstreamMotifFrequencyCounter . "\t" . $downstreamMotifFrequencyCounter . "\n"
            or die("Could not write to file $motifFrequenciesOutputFile: $!\n");
    }
}

# buffered write errors only surface when the handle is closed, so check it
close($outputHandle)
    or die("Could not write to file $motifFrequenciesOutputFile: $!\n");