comparison bin/Bsf2ComMetIn.pl @ 0:dfdfbdd47b32 default tip

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:55:17 +0900
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfdfbdd47b32
1 #!/usr/bin/perl
2
3 # Bisulfighter http://epigenome.cbrc.jp/bisulfighter
4 # by National Institute of Advanced Industrial Science and Technology (AIST)
5 # is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
6 # http://creativecommons.org/licenses/by-nc-sa/3.0/
7
8 use strict;
9 use warnings;
10
11 # format Bisulfighter's input to ComMet's input
12 # (data from both strands are integrated)
13 #
14 # usage:
15 # ./this_program sample1.tsv sample2.tsv > ComMetInput
16
17 # input format
18 =pod
19 # sample1
20 chr1 123 + CG 0.8 10
21 chr1 128 + CHG 0.7 8
22 chr1 1000000 + CG 0.234 9
23 chr2 987 + CG 0.654 12
24 chr2 1000 + CHG 0.788 10
25 chr2 1057 + CG 0.826 50
26 =cut
27
28 =pod
29 # sample2
30 chr1 123 + CG 0.3 30
31 chr1 128 + CHG 0.24 20
32 chr1 1000000 + CG 0.36 8
33 chr2 987 + CG 0.64 12
34 chr2 1000 + CHG 0.820 9
35 chr2 1057 + CG 0.0 32
36 =cut
37
38 # output format
39 =pod
40 chr1 123 10*(0.8) 10*(1-0.8) 30*(0.3) 30*(1-0.3)
41 chr1 1000000 9*(0.234) 9*(1-0.234) 8*(0.36) 8*(1-0.36)
42 chr2 987 12*(0.654) 12*(1-0.654) 12*(0.64) 12*(1-0.64)
43 chr2 1057 50*(0.826) 50*(1-0.826) 32*(0.0) 32*(1-0.0)
44 =cut
45
46 my $command = "";
47 my $tmpout1 = "Bsf2ComMetIn.pl.tmp.$$.1";
48 my $tmpout2 = "Bsf2ComMetIn.pl.tmp.$$.2";
49
50 &uniq_add($ARGV[0], $tmpout1);
51 &uniq_add($ARGV[1], $tmpout2);
52
53 my ($nm1, $pos1, $m1, $u1) = ("", -1, -1, -1);
54 my ($nm2, $pos2, $m2, $u2) = ("", -1, -1, -1);
55 open(IN1, $tmpout1) or die "couldn't open input file $tmpout1\n";
56 open(IN2, $tmpout2) or die "couldn't open input file $tmpout2\n";
57 while (1) {
58 my ($line1, $line2);
59
60 if (&chreq($nm1,$pos1,$nm2,$pos2)) {
61 (defined($line1=<IN1>) and defined($line2=<IN2>)) or last;
62 chomp $line1;
63 ($nm1,$pos1,$m1,$u1) = split(/\s+/, $line1);
64 chomp $line2;
65 ($nm2,$pos2,$m2,$u2) = split(/\s+/, $line2);
66 }
67 elsif (&chrlt($nm1,$pos1,$nm2,$pos2)) {
68 defined($line1=<IN1>) or last;
69 chomp $line1;
70 ($nm1,$pos1,$m1,$u1) = split(/\s+/, $line1);
71 }
72 elsif (&chrgt($nm1,$pos1,$nm2,$pos2)) {
73 defined($line2=<IN2>) or last;
74 chomp $line2;
75 ($nm2,$pos2,$m2,$u2) = split(/\s+/, $line2);
76 }
77 else {
78 die "wrong position\n";
79 }
80
81 if (&chreq($nm1,$pos1,$nm2,$pos2)) {
82 print STDOUT join("\t", ($nm1, $pos1, $m1, $u1, $m2, $u2)), "\n";
83 }
84 }
85 close(IN1);
86 close(IN2);
87
88 &invoke_command("\\rm -f $tmpout1");
89 &invoke_command("\\rm -f $tmpout2");
90
91 sub invoke_command {
92 my ($command) = @_;
93 # print STDERR $command, "\n";
94 system $command;
95 }
96
97 sub chrgt {
98 my ($nm1, $pos1, $nm2, $pos2) = @_;
99
100 if ($nm1 gt $nm2 || ($nm1 eq $nm2 && $pos1 > $pos2)) {
101 return 1;
102 }
103 else {
104 return 0;
105 }
106 }
107 sub chrlt {
108 my ($nm1, $pos1, $nm2, $pos2) = @_;
109
110 if ($nm1 lt $nm2 || ($nm1 eq $nm2 && $pos1 < $pos2)) {
111 return 1;
112 }
113 else {
114 return 0;
115 }
116 }
117 sub chreq {
118 my ($nm1, $pos1, $nm2, $pos2) = @_;
119
120 if ($nm1 eq $nm2 && $pos1 == $pos2) {
121 return 1;
122 }
123 else {
124 return 0;
125 }
126 }
127
128 sub uniq_add {
129 my ($infile, $outfile) = @_;
130 my ($nm, $str, $pos, $m, $u) = ("", "", -1, -1, -1);
131 my ($pre_nm, $pre_str, $pre_pos, $pre_m, $pre_u) = ("", "", -1, -1, -1);
132 my $mind = 2; # minimum distance between two CpGs
133 my $thsh = 0;
134
135 open(IN, $infile) or die "couldn't open input file $infile\n";
136 open(OUT, "> $outfile") or die "couldn't open output file $outfile\n";
137 while (my $line=<IN>) {
138 chomp $line;
139 $line =~ /^\#/ and next;
140 $line =~ /CG/ or next;
141 my @cells = split(/\s+/, $line);
142 $nm = $cells[0];
143 $str = $cells[2];
144 $pos = $cells[1];
145 $m = $cells[5] * $cells[4];
146 $u = $cells[5] * (1.0-$cells[4]);
147
148 if ($str eq "+") {
149 }
150 elsif ($str eq "-") {
151 $str = "+";
152 $pos -= $mind - 1;
153 }
154 else {
155 print STDERR "warning: abnormal strand $str was removed\n";
156 next;
157 }
158
159
160 if ($nm eq $pre_nm && $str eq $pre_str && $pos==$pre_pos) {
161 $pre_m += $m;
162 $pre_u += $u;
163 }
164 else {
165 if ($nm eq $pre_nm && $pos-$pre_pos < $mind) {
166 print STDERR "warning: abnormal position difference $pre_pos $pos, ";
167 if ($m + $u > $pre_m + $pre_u) {
168 print STDERR "$pre_pos was removed.\n";
169 $pre_nm = $nm;
170 $pre_str = $str;
171 $pre_pos = $pos;
172 $pre_m = $m;
173 $pre_u = $u;
174 }
175 else {
176 print STDERR "$pos was removed.\n"
177 }
178 next;
179 }
180 if ($pre_m + $pre_u > $thsh) {
181 print OUT join("\t", ($pre_nm, $pre_pos, $pre_m, $pre_u)), "\n";
182 }
183 $pre_nm = $nm;
184 $pre_str = $str;
185 $pre_pos = $pos;
186 $pre_m = $m;
187 $pre_u = $u;
188 }
189 }
190 if ($pre_m + $pre_u > $thsh) {
191 print OUT join("\t", ($pre_nm, $pre_pos, $pre_m, $pre_u)), "\n";
192 }
193 close(IN);
194 close(OUT);
195 }
196