annotate csem/csem_wrapper.pl @ 0:a8f2c2a5f11b

Uploaded
author dongjun
date Mon, 12 Sep 2011 09:54:50 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
1 # Wrapper for CSEM with Bowtie
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
2 # Written by Dongjun Chung, Sep. 8, 2011
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
3
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
4 #!/usr/bin/env perl;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
5 #use warnings;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
6 #use strict;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
7 use File::Temp qw/tempfile/;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
8 use File::Temp qw/tmpnam/;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
9
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
10 # parse command arguments
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
11
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
12 die "Usage: perl csem_wrapper.pl [infile_name] [infile_format] [outfile_name] [outfile_format] [ref_genome] [pseudo_tags] [n_mismatch] [maxpos] [window_size] [n_iter] [n_core]" unless @ARGV == 11;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
13
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
14 my ( $infile_name, $infile_format, $outfile_name, $outfile_format, $ref_genome, $pseudo_tags, $n_mismatch, $maxpos, $window_size, $n_iter, $n_core ) = @ARGV;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
15
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
16 # construct ref genome file (adapted from "genRef.pl")
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
17
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
18 open ( IN, "bowtie-inspect -s $ref_genome |" ) or die "Cannot run bowtie-inspect!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
19
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
20 my $line;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
21
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
22 my $size = 0;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
23 my (@names, @lens) = ();
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
24 # $size: # of chromosomes
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
25 # @lens: chromosome size
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
26 # @names: chromosome name
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
27
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
28 for (my $i = 0; $i < 3; $i++) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
29 # skip unnecessary lines
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
30 $line = <IN>;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
31 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
32
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
33 while ( $line = <IN> ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
34 ++$size;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
35 chomp($line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
36 my ($seqn, $name, $len) = split(/[ \t]+/, $line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
37 push(@names, $name);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
38 push(@lens, $len);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
39 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
40 close(IN);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
41
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
42 my ($fh, $temp_reffile) = tempfile();
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
43 print $fh "$size\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
44 print $fh "@lens\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
45 print $fh "@names\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
46 close($fh);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
47
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
48 # extract read length from FASTA/FASTQ files
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
49
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
50 open( IN, $infile_name ) or die "Cannot open tag file!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
51
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
52 $line = <IN>;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
53 if ( $infile_format eq "fasta" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
54 while ( $line =~ /^>/ ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
55 $line = <IN>;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
56 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
57 } elsif ( $infile_format eq "fastq" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
58 while ( $line =~ /^@/ ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
59 $line = <IN>;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
60 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
61 } else {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
62 print "Inappropriate aligned read file format!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
63 exit 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
64 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
65 chomp($line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
66 my $read_length = length $line;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
67
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
68 close( IN );
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
69
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
70 # extract read ID
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
71
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
72 open( IN, $infile_name ) or die "Cannot open tag file!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
73
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
74 my @readID = ();
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
75 if ( $infile_format eq "fasta" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
76 foreach $line (<IN>) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
77 chomp($line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
78 if ( $line =~ /^>(\S+)/ ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
79 push @readID, $1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
80 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
81 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
82 } elsif ( $infile_format eq "fastq" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
83 foreach $line (<IN>) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
84 chomp($line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
85 if ( $line =~ /^@(\S+)/ ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
86 push @readID, $1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
87 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
88 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
89 } else {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
90 print "Inappropriate aligned read file format!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
91 exit 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
92 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
93
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
94 close( IN );
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
95
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
96 # run bowtie & csem
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
97
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
98 my $outfile_temp = tmpnam();
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
99
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
100 if ( $infile_format eq "fasta" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
101 system( "bowtie -f -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
102 } elsif ( $infile_format eq "fastq" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
103 system( "bowtie -q -v $n_mismatch -a -m $maxpos -p $n_core --quiet --concise $ref_genome $infile_name | csem $temp_reffile $window_size $n_iter $outfile_temp > /dev/null" ) == 0 or die "Error occurs while running either bowtie or csem!"
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
104 } else {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
105 print "Inappropriate aligned read file format!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
106 exit 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
107 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
108
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
109 # post-process chromosome & position
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
110
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
111 open( IN, $outfile_temp ) or die "Cannot open csem file!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
112 open( OUT, ">", $outfile_name ) or die "Cannot open output file!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
113
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
114 foreach $line (<IN>) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
115 chomp($line);
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
116 my @element = split( /\s/, $line );
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
117 # assume columns are separated by some white space
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
118
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
119 # check for invalid line: may cause error in exporting step
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
120
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
121 if ( scalar(@element)<5 ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
122 next;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
123 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
124
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
125 # post-process lines
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
126
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
127 my ($id, $chr, $str, $loc, $prob) = @element;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
128 if ( $outfile_format ne "bed" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
129 # first base is 0 in bowtie or BED
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
130 # first base is 1 in table or GFF
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
131 $loc++;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
132 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
133 my $chrname = $names[$chr]; # translate chromosome
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
134
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
135 # write down processed lines
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
136 # - generate pseudo-tags, if necessary (adapted from "round_tag_to_integer.pl")
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
137
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
138 if ( $pseudo_tags eq "Y" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
139 # if we want to generate pseudo-tags,
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
140 # then threshold prob at 0.5 & round prob to integer (i.e., set to one)
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
141 # (exclude prob = 0.5 as well in order to avoid a read appears more than once)
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
142
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
143 if ( $prob <= 0.5 ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
144 next;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
145 } else {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
146 $prob = 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
147 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
148 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
149
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
150 # write down results
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
151
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
152 my $start;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
153 my $end;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
154 my $score;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
155
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
156 my $id_final = $readID[$id];
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
157 #my $id_final = $id;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
158
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
159 if ( $outfile_format eq "table" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
160 print OUT "$id_final\t$chrname $str $loc $prob\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
161 } elsif ( $outfile_format eq "bed" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
162 # BED
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
163 # - name: read ID
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
164 # - score = prob * 1000
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
165
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
166 $start = $loc;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
167 $end = $start + $read_length - 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
168 my $name = $id_final;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
169 $score = $prob * 1000;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
170
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
171 print OUT "$chrname\t$start\t$end\t$name\t$score\t$str\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
172 } elsif ( $outfile_format eq "gff" ) {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
173 # GFF
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
174 # - source: "CSEM"
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
175 # - feature: read ID
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
176 # - score = prob * 1000
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
177
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
178 $start = $loc;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
179 $end = $start + $read_length - 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
180 my $source = "CSEM";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
181 my $feature = $id_final;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
182 $score = $prob * 1000;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
183
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
184 print OUT "$chrname\t$source\t$feature\t$start\t$end\t$score\t$str\t.\t.\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
185 } else {
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
186 print "Inappropriate output file format!\n";
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
187 exit 1;
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
188 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
189 }
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
190
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
191 close( IN );
a8f2c2a5f11b Uploaded
dongjun
parents:
diff changeset
192 close( OUT );