Mercurial > repos > yutaka-saito > commet
comparison bin/Bsf2ComMetIn.pl @ 0:dfdfbdd47b32 default tip
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:55:17 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfdfbdd47b32 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # Bisulfighter http://epigenome.cbrc.jp/bisulfighter | |
4 # by National Institute of Advanced Industrial Science and Technology (AIST) | |
5 # is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. | |
6 # http://creativecommons.org/licenses/by-nc-sa/3.0/ | |
7 | |
8 use strict; | |
9 use warnings; | |
10 | |
11 # format Bisulfighter's input to ComMet's input | |
12 # (data from both strands are integrated) | |
13 # | |
14 # usage: | |
15 # ./this_program sample1.tsv sample2.tsv > ComMetInput | |
16 | |
17 # input format | |
18 =pod | |
19 # sample1 | |
20 chr1 123 + CG 0.8 10 | |
21 chr1 128 + CHG 0.7 8 | |
22 chr1 1000000 + CG 0.234 9 | |
23 chr2 987 + CG 0.654 12 | |
24 chr2 1000 + CHG 0.788 10 | |
25 chr2 1057 + CG 0.826 50 | |
26 =cut | |
27 | |
28 =pod | |
29 # sample2 | |
30 chr1 123 + CG 0.3 30 | |
31 chr1 128 + CHG 0.24 20 | |
32 chr1 1000000 + CG 0.36 8 | |
33 chr2 987 + CG 0.64 12 | |
34 chr2 1000 + CHG 0.820 9 | |
35 chr2 1057 + CG 0.0 32 | |
36 =cut | |
37 | |
38 # output format | |
39 =pod | |
40 chr1 123 10*(0.8) 10*(1-0.8) 30*(0.3) 30*(1-0.3) | |
41 chr1 1000000 9*(0.234) 9*(1-0.234) 8*(0.36) 8*(1-0.36) | |
42 chr2 987 12*(0.654) 12*(1-0.654) 12*(0.64) 12*(1-0.64) | |
43 chr2 1057 50*(0.826) 50*(1-0.826) 32*(0.0) 32*(1-0.0) | |
44 =cut | |
45 | |
46 my $command = ""; | |
47 my $tmpout1 = "Bsf2ComMetIn.pl.tmp.$$.1"; | |
48 my $tmpout2 = "Bsf2ComMetIn.pl.tmp.$$.2"; | |
49 | |
50 &uniq_add($ARGV[0], $tmpout1); | |
51 &uniq_add($ARGV[1], $tmpout2); | |
52 | |
53 my ($nm1, $pos1, $m1, $u1) = ("", -1, -1, -1); | |
54 my ($nm2, $pos2, $m2, $u2) = ("", -1, -1, -1); | |
55 open(IN1, $tmpout1) or die "couldn't open input file $tmpout1\n"; | |
56 open(IN2, $tmpout2) or die "couldn't open input file $tmpout2\n"; | |
57 while (1) { | |
58 my ($line1, $line2); | |
59 | |
60 if (&chreq($nm1,$pos1,$nm2,$pos2)) { | |
61 (defined($line1=<IN1>) and defined($line2=<IN2>)) or last; | |
62 chomp $line1; | |
63 ($nm1,$pos1,$m1,$u1) = split(/\s+/, $line1); | |
64 chomp $line2; | |
65 ($nm2,$pos2,$m2,$u2) = split(/\s+/, $line2); | |
66 } | |
67 elsif (&chrlt($nm1,$pos1,$nm2,$pos2)) { | |
68 defined($line1=<IN1>) or last; | |
69 chomp $line1; | |
70 ($nm1,$pos1,$m1,$u1) = split(/\s+/, $line1); | |
71 } | |
72 elsif (&chrgt($nm1,$pos1,$nm2,$pos2)) { | |
73 defined($line2=<IN2>) or last; | |
74 chomp $line2; | |
75 ($nm2,$pos2,$m2,$u2) = split(/\s+/, $line2); | |
76 } | |
77 else { | |
78 die "wrong position\n"; | |
79 } | |
80 | |
81 if (&chreq($nm1,$pos1,$nm2,$pos2)) { | |
82 print STDOUT join("\t", ($nm1, $pos1, $m1, $u1, $m2, $u2)), "\n"; | |
83 } | |
84 } | |
85 close(IN1); | |
86 close(IN2); | |
87 | |
88 &invoke_command("\\rm -f $tmpout1"); | |
89 &invoke_command("\\rm -f $tmpout2"); | |
90 | |
91 sub invoke_command { | |
92 my ($command) = @_; | |
93 # print STDERR $command, "\n"; | |
94 system $command; | |
95 } | |
96 | |
97 sub chrgt { | |
98 my ($nm1, $pos1, $nm2, $pos2) = @_; | |
99 | |
100 if ($nm1 gt $nm2 || ($nm1 eq $nm2 && $pos1 > $pos2)) { | |
101 return 1; | |
102 } | |
103 else { | |
104 return 0; | |
105 } | |
106 } | |
107 sub chrlt { | |
108 my ($nm1, $pos1, $nm2, $pos2) = @_; | |
109 | |
110 if ($nm1 lt $nm2 || ($nm1 eq $nm2 && $pos1 < $pos2)) { | |
111 return 1; | |
112 } | |
113 else { | |
114 return 0; | |
115 } | |
116 } | |
117 sub chreq { | |
118 my ($nm1, $pos1, $nm2, $pos2) = @_; | |
119 | |
120 if ($nm1 eq $nm2 && $pos1 == $pos2) { | |
121 return 1; | |
122 } | |
123 else { | |
124 return 0; | |
125 } | |
126 } | |
127 | |
128 sub uniq_add { | |
129 my ($infile, $outfile) = @_; | |
130 my ($nm, $str, $pos, $m, $u) = ("", "", -1, -1, -1); | |
131 my ($pre_nm, $pre_str, $pre_pos, $pre_m, $pre_u) = ("", "", -1, -1, -1); | |
132 my $mind = 2; # minimum distance between two CpGs | |
133 my $thsh = 0; | |
134 | |
135 open(IN, $infile) or die "couldn't open input file $infile\n"; | |
136 open(OUT, "> $outfile") or die "couldn't open output file $outfile\n"; | |
137 while (my $line=<IN>) { | |
138 chomp $line; | |
139 $line =~ /^\#/ and next; | |
140 $line =~ /CG/ or next; | |
141 my @cells = split(/\s+/, $line); | |
142 $nm = $cells[0]; | |
143 $str = $cells[2]; | |
144 $pos = $cells[1]; | |
145 $m = $cells[5] * $cells[4]; | |
146 $u = $cells[5] * (1.0-$cells[4]); | |
147 | |
148 if ($str eq "+") { | |
149 } | |
150 elsif ($str eq "-") { | |
151 $str = "+"; | |
152 $pos -= $mind - 1; | |
153 } | |
154 else { | |
155 print STDERR "warning: abnormal strand $str was removed\n"; | |
156 next; | |
157 } | |
158 | |
159 | |
160 if ($nm eq $pre_nm && $str eq $pre_str && $pos==$pre_pos) { | |
161 $pre_m += $m; | |
162 $pre_u += $u; | |
163 } | |
164 else { | |
165 if ($nm eq $pre_nm && $pos-$pre_pos < $mind) { | |
166 print STDERR "warning: abnormal position difference $pre_pos $pos, "; | |
167 if ($m + $u > $pre_m + $pre_u) { | |
168 print STDERR "$pre_pos was removed.\n"; | |
169 $pre_nm = $nm; | |
170 $pre_str = $str; | |
171 $pre_pos = $pos; | |
172 $pre_m = $m; | |
173 $pre_u = $u; | |
174 } | |
175 else { | |
176 print STDERR "$pos was removed.\n" | |
177 } | |
178 next; | |
179 } | |
180 if ($pre_m + $pre_u > $thsh) { | |
181 print OUT join("\t", ($pre_nm, $pre_pos, $pre_m, $pre_u)), "\n"; | |
182 } | |
183 $pre_nm = $nm; | |
184 $pre_str = $str; | |
185 $pre_pos = $pos; | |
186 $pre_m = $m; | |
187 $pre_u = $u; | |
188 } | |
189 } | |
190 if ($pre_m + $pre_u > $thsh) { | |
191 print OUT join("\t", ($pre_nm, $pre_pos, $pre_m, $pre_u)), "\n"; | |
192 } | |
193 close(IN); | |
194 close(OUT); | |
195 } | |
196 |