annotate compute_motif_frequencies_for_all_motifs.pl @ 1:5319efa51514 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/compute_motif_frequencies_for_all_motifs commit a1517c9d22029095120643bbe2c8fa53754dd2b7
author devteam
date Wed, 11 Nov 2015 12:07:34 -0500
parents acc3dc5d26e3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc3dc5d26e3 Uploaded tool tarball.
devteam
parents:
diff changeset
#!/usr/bin/perl -w

# A program to count, for every motif, how many of the considered windows
# contain that motif in the upstream and in the downstream sequences flanking
# indels in all chromosomes.
#
# Command-line arguments:
#   1. a TABULAR format file containing the motif names and sequences: two
#      columns, the left column is the motif name and the right column is the
#      motif sequence, one line per motif.
#   2. a TABULAR format file containing the windows into which both upstream
#      and downstream sequences flanking indels have been divided, one line
#      per indel. The middle column (index (N - 1) / 2, computed from the first
#      line) is never searched; presumably it holds the indel itself - confirm
#      against the tool that produces this file.
#   3. an integer number representing the number of windows to be considered
#      in both upstream and downstream flanking sequences. Zero, negative or
#      non-numeric values are replaced by 1.
#   4. the path of the TABULAR format output file.
#
# The output consists of three columns: the left column is the motif name, the
# middle column is the number of considered upstream windows containing the
# motif, and the right column is the number of considered downstream windows
# containing the motif, one line per indel.
# The total number of lines in the output file = number of motifs x number of indels.

use strict;
use warnings;

# Run the program only when executed as a script, so that the helper
# subroutines below can be loaded and exercised on their own.
main(@ARGV) unless caller;

# Entry point: validates the arguments, loads both input files and writes one
# output line per (motif, indel) pair, motifs in input order.
sub main {
    my @arguments = @_;

    # check to make sure having correct files
    my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n";
    die $usage unless @arguments == 4;

    my ($motifsInputFile,
        $indelFlankingSequencesWindowsInputFile,
        $numberOfConsideredWindows,
        $motifFrequenciesOutputFile) = @arguments;

    $numberOfConsideredWindows = _normalize_window_count($numberOfConsideredWindows);

    my @motifs     = _read_motifs($motifsInputFile);
    my @windowRows = _read_windows($indelFlankingSequencesWindowsInputFile);

    open(my $outputHandle, ">", $motifFrequenciesOutputFile)
        or die("Could not open file $motifFrequenciesOutputFile: $!\n");

    # The position of the middle column is derived from the first line only and
    # reused for every line, as all lines are expected to have the same layout.
    my $indelIndex = 0;
    if (@windowRows) {
        my $totalWindowsNumber = scalar @{ $windowRows[0] };
        $indelIndex = int(($totalWindowsNumber - 1) / 2);
        $indelIndex = 0 if $indelIndex < 0;
    }

    for my $motif (@motifs) {
        my ($motifName, $motifSequence) = @{$motif};

        # The motif text is used as a (case-insensitive) regular expression,
        # so motif files may contain pattern syntax such as character classes.
        # It is compiled once per motif instead of once per window.
        my $motifRegex = qr/$motifSequence/i;

        for my $windows (@windowRows) {
            my ($upstreamCount, $downstreamCount) = _count_flanking_motif_windows(
                $windows, $motifRegex, $indelIndex, $numberOfConsideredWindows);

            print {$outputHandle} join("\t", $motifName, $upstreamCount, $downstreamCount), "\n";
        }
    }

    # buffered write errors only surface when the handle is closed
    close($outputHandle)
        or die("Could not close file $motifFrequenciesOutputFile: $!\n");

    return;
}

# Returns the number of windows to consider on each side of the indel:
# the leading integer of the requested value, or 1 when that value is
# missing, non-numeric, zero or negative.
sub _normalize_window_count {
    my ($requested) = @_;

    return 1 unless defined $requested && $requested =~ /\A\s*\+?(\d+)/;
    return $1 > 0 ? $1 + 0 : 1;
}

# Reads the motifs file and returns a list of [name, sequence] pairs in file
# order. Blank lines are ignored. Lines without a motif sequence are skipped
# with a warning, because an empty pattern would not search for the motif at
# all (Perl reuses the last successfully matched pattern instead).
sub _read_motifs {
    my ($path) = @_;

    open(my $handle, "<", $path) or die("Could not open file $path: $!\n");

    my @motifs = ();
    while (my $line = <$handle>) {
        $line =~ s/\r?\n\z//;       # accept both LF and CRLF line endings
        next if $line !~ /\S/;

        my ($name, $sequence) = split(/\t/, $line);
        if (!defined $sequence || $sequence eq "") {
            warn "Skipping line $. of $path: no motif sequence found\n";
            next;
        }

        push @motifs, [$name, $sequence];
    }
    close($handle);

    return @motifs;
}

# Reads the windows file and returns one array reference of windows per line,
# in file order. Each line is split on tabs exactly once, rather than once per
# motif.
sub _read_windows {
    my ($path) = @_;

    open(my $handle, "<", $path) or die("Could not open file $path: $!\n");

    my @windowRows = ();
    while (my $line = <$handle>) {
        $line =~ s/\r?\n\z//;       # accept both LF and CRLF line endings
        push @windowRows, [ split(/\t/, $line) ];
    }
    close($handle);

    return @windowRows;
}

# Counts the windows matching $motifRegex among the $numberOfConsideredWindows
# windows immediately before (upstream) and immediately after (downstream) the
# column at $indelIndex. Returns the list (upstream count, downstream count).
#
# Both index ranges are clamped to the bounds of the row: a negative index
# would otherwise wrap around to the end of the array and count downstream
# windows as upstream ones, and an index past the end would yield undef.
sub _count_flanking_motif_windows {
    my ($windows, $motifRegex, $indelIndex, $numberOfConsideredWindows) = @_;

    my $lastIndex = $#{$windows};

    my $upstreamFirst = $indelIndex - $numberOfConsideredWindows;
    $upstreamFirst = 0 if $upstreamFirst < 0;
    my $upstreamLast = $indelIndex - 1;
    $upstreamLast = $lastIndex if $upstreamLast > $lastIndex;

    my $downstreamFirst = $indelIndex + 1;
    my $downstreamLast  = $indelIndex + $numberOfConsideredWindows;
    $downstreamLast = $lastIndex if $downstreamLast > $lastIndex;

    # an empty range (first > last) produces an empty slice and a count of 0
    my $upstreamMotifFrequencyCounter =
        grep { $_ =~ $motifRegex } @{$windows}[$upstreamFirst .. $upstreamLast];
    my $downstreamMotifFrequencyCounter =
        grep { $_ =~ $motifRegex } @{$windows}[$downstreamFirst .. $downstreamLast];

    return ($upstreamMotifFrequencyCounter, $downstreamMotifFrequencyCounter);
}