Mercurial > repos > bgruening > text_processing
comparison multijoin @ 0:5314e5d6f040 draft
Imported from capsule None
author | bgruening |
---|---|
date | Thu, 29 Jan 2015 07:53:17 -0500 |
parents | |
children | 20344ce0c811 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5314e5d6f040 |
---|---|
1 #!/usr/bin/env perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long qw(:config no_ignore_case); | |
5 use Data::Dumper; | |
6 use Carp; | |
7 use File::Basename; | |
8 use Sort::Key::Natural qw(natsort); | |
9 | |
10 my $version = "0.1.1"; | |
11 my $field_sep = "\t"; | |
12 my $key_column; | |
13 my @values_columns; | |
14 my $max_value_column; | |
15 my @input_files; | |
16 my $input_headers ; | |
17 my $output_headers; | |
18 my $filler = "0"; | |
19 my $filler_string ; | |
20 my $ignore_duplicates; | |
21 my $debug = 0 ; | |
22 my %input_headers; | |
23 my $have_file_labels; | |
24 my %file_labels; | |
25 | |
26 sub parse_command_line_parameters(); | |
27 sub show_help(); | |
28 sub read_input_file($); | |
29 sub print_combined_data(); | |
30 sub sanitize_filename($); | |
31 sub print_output_header(); | |
32 sub show_examples(); | |
33 | |
34 ## | |
35 ## Program Start | |
36 ## | |
37 | |
38 parse_command_line_parameters(); | |
39 | |
40 my %data; | |
41 foreach my $file (@input_files) { | |
42 read_input_file($file); | |
43 } | |
44 #print STDERR Dumper(\%input_headers),"\n"; | |
45 #print STDERR Dumper(\%data) if $debug; | |
46 print_output_header() if $output_headers; | |
47 print_combined_data(); | |
48 | |
49 | |
50 ## | |
51 ## Program End | |
52 ## | |
53 sub print_output_header() | |
54 { | |
55 my @output = ("key"); | |
56 foreach my $file ( @input_files ) { | |
57 foreach my $column ( @values_columns ) { | |
58 my $column_name = ( exists $input_headers{$file}->{$column} ) ? | |
59 $input_headers{$file}->{$column} : | |
60 "V$column" ; | |
61 | |
62 push @output, $file_labels{$file} . "_" . $column_name; | |
63 } | |
64 } | |
65 print join($field_sep,@output),"\n" | |
66 or die "Output error: can't write output line: $!\n"; | |
67 } | |
68 | |
69 sub print_combined_data() | |
70 { | |
71 my @keys = natsort keys %data ; | |
72 | |
73 foreach my $key ( @keys ) { | |
74 my @outputs; | |
75 | |
76 foreach my $file (@input_files) { | |
77 push @outputs, | |
78 (exists $data{$key}->{$file}) ? $data{$key}->{$file} : $filler_string; | |
79 } | |
80 | |
81 print join($field_sep,$key,@outputs),"\n" | |
82 or die "Output error: can't write output line: $!\n"; | |
83 } | |
84 } | |
85 | |
86 sub sanitize_filename($) | |
87 { | |
88 my ($filename) = shift or croak "missing file name"; | |
89 my $file_ID = basename($filename); | |
90 $file_ID =~ s/\.\w+$//; # remove extension | |
91 $file_ID =~ s/^[^\w\.\-]+//; | |
92 $file_ID =~ s/[^\w\.\-]+$//; | |
93 $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters | |
94 return $file_ID; | |
95 } | |
96 | |
97 sub read_input_file($) | |
98 { | |
99 my ($filename) = shift or croak "Missing input file name"; | |
100 | |
101 my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns | |
102 | |
103 open FILE, "<", $filename | |
104 or die "Error: can't open file '$filename': $!\n"; | |
105 | |
106 ## Read file's header | |
107 if ($input_headers) { | |
108 my $line = <FILE>; | |
109 chomp $line; | |
110 my @fields = split $field_sep, $line; | |
111 | |
112 my $num_input_fields = scalar(@fields); | |
113 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; | |
114 | |
115 foreach my $col (@values_columns) { | |
116 $input_headers{$filename}->{$col} = $fields[$col-1] ; | |
117 } | |
118 } | |
119 | |
120 | |
121 ## Read file's data | |
122 while ( my $line = <FILE> ) { | |
123 chomp $line; | |
124 my @fields = split $field_sep, $line; | |
125 | |
126 my $num_input_fields = scalar(@fields); | |
127 die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ; | |
128 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; | |
129 | |
130 | |
131 my $key = $fields[$key_column-1]; | |
132 my $value = join($field_sep, @fields[@value_indexes]); | |
133 | |
134 die "Input error: file '$filename' line $. have duplicated key '$key'.\n" | |
135 if (exists $data{$key}->{$filename} && !$ignore_duplicates) ; | |
136 $data{$key}->{$filename} = $value; | |
137 } | |
138 close FILE | |
139 or die "Error: can't write and close file '$filename': $!\n"; | |
140 } | |
141 | |
142 sub parse_command_line_parameters() | |
143 { | |
144 my $values_columns_string; | |
145 | |
146 my $rc = GetOptions("help" => \&show_help, | |
147 "key|k=i" => \$key_column, | |
148 "values|v=s" => \$values_columns_string, | |
149 "t=s" => \$field_sep, | |
150 "in-header" => \$input_headers, | |
151 "out-header|h" => \$output_headers, | |
152 "H" => sub { $input_headers = 1 ; $output_headers = 1 ; }, | |
153 "ignore-dups" => \$ignore_duplicates, | |
154 "filler|f=s" => \$filler, | |
155 "examples" => \&show_examples, | |
156 "labels" => \$have_file_labels, | |
157 ); | |
158 die "Error: inalid command-line parameters.\n" unless $rc; | |
159 | |
160 die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column; | |
161 die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ; | |
162 | |
163 die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string; | |
164 @values_columns = split(/\s*,\s*/, $values_columns_string); | |
165 | |
166 die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0; | |
167 foreach my $v (@values_columns) { | |
168 die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n" | |
169 unless $v =~ /^\d+$/ && $v>=1; | |
170 | |
171 $max_value_column = $v unless defined $max_value_column && $max_value_column>$v; | |
172 } | |
173 | |
174 $filler_string = join($field_sep, map { $filler } @values_columns); | |
175 | |
176 | |
177 if ($have_file_labels) { | |
178 ## have file labels - each pair of parameters is a file/label pair. | |
179 die "Error: missing input files and labels\n" if scalar(@ARGV)==0; | |
180 die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0; | |
181 | |
182 while (@ARGV) { | |
183 my $filename = shift @ARGV; | |
184 my $label = shift @ARGV; | |
185 $label =~ s/^[^\.\w\-]+//; | |
186 $label =~ s/[^\.\w\-]+$//g; | |
187 $label =~ s/[^\.\w\-]+/_/g; | |
188 | |
189 my $file_ID = sanitize_filename($filename); | |
190 $file_labels{$filename} = $label; | |
191 push @input_files, $filename; | |
192 } | |
193 } else { | |
194 ## no file labels - the rest of the arguments are just file names; | |
195 @input_files = @ARGV; | |
196 die "Error: missing input files\n" if scalar(@input_files)==0; | |
197 die "Error: need more than one input file to join.\n" if scalar(@input_files)==1; | |
198 | |
199 foreach my $file (@input_files) { | |
200 my $file_ID = sanitize_filename($file); | |
201 $file_labels{$file} = $file_ID; | |
202 } | |
203 } | |
204 | |
205 } | |
206 | |
207 sub show_help() | |
208 { | |
209 print<<EOF; | |
210 Multi-File join, version $version | |
211 Copyright (C) 2012 - A. Gordon (gordon at cshl dot edu) | |
212 License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html) | |
213 | |
214 Usage: | |
215 multijoin [OPTIONS] -k N -v V1,V2,Vn,.. FILE1 FILE2 ... FILEn | |
216 | |
217 Options: | |
218 | |
219 --help This helpful help screen. | |
220 | |
221 -k N | |
222 --key N Use column N as key column. | |
223 | |
224 -v V1,V2,Vn | |
225 --values V1,V2,Vn | |
226 Use columns V1,V2,Vn as value columns - those will be joined | |
227 According to the Key column. | |
228 Multiple columns can be specified. | |
229 | |
230 -t SEP Use SEP as field separator character (default: tab). | |
231 | |
232 -h | |
233 --out-header Add a header line to the output file. | |
234 | |
235 --in-header The input files have a header line. | |
236 The first line will not be joined. | |
237 if '--out-header' is also used, the output column headers will | |
238 be constructed based on the input header column names. | |
239 | |
240 -H | |
241 --headers Same as '--in-header --out-header' combined. | |
242 | |
243 --ignore-dups Ignore duplicated keys (within a file). | |
244 By default, duplicated keys cause an error. | |
245 | |
246 -f X | |
247 --filler X Fill missing values with X. | |
248 (Default: '$filler'). | |
249 | |
250 --labels When printning output headers with '-h', instead of using the file name, | |
251 use specific labels. | |
252 Each file name must be followed by a name. | |
253 | |
254 example (without labels): | |
255 \$ multijoin -h -k 1 -v 2 A.TXT B.TXT C.TXT | |
256 | |
257 example (with labels): | |
258 \$ multijoin -h --labels -k 1 -v 2 A.TXT Sample1 B.TXT SampleB C.TXT SampleC | |
259 | |
260 --examples Show detailed examples. | |
261 | |
262 EOF | |
263 exit(0); | |
264 } | |
265 | |
266 sub show_examples() | |
267 { | |
268 print<<EOF; | |
269 | |
270 To join three files, based on the 4th column, and keeping the 7th,8th,9th columns: | |
271 | |
272 \$ head *.txt | |
273 ==> AAA.txt <== | |
274 chr4 888449 890171 FBtr0308778 0 + 266 1527 1722 | |
275 chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850 | |
276 chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831 | |
277 chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831 | |
278 chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831 | |
279 chr4 995793 996435 FBtr0111046 0 + 7 166 642 | |
280 chr4 995793 997931 FBtr0111044 0 + 28 683 2138 | |
281 chr4 995793 997931 FBtr0111045 0 + 28 683 2138 | |
282 chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690 | |
283 | |
284 ==> BBB.txt <== | |
285 chr4 90286 134453 FBtr0309803 0 + 657 29084 44167 | |
286 chr4 251355 266499 FBtr0089116 0 + 56 1296 15144 | |
287 chr4 252050 266506 FBtr0308086 0 + 56 1296 14456 | |
288 chr4 252050 266506 FBtr0308087 0 + 56 1296 14456 | |
289 chr4 252053 266528 FBtr0300796 0 + 56 1296 14475 | |
290 chr4 252053 266528 FBtr0300800 0 + 56 1296 14475 | |
291 chr4 252055 266528 FBtr0300798 0 + 56 1296 14473 | |
292 chr4 252055 266528 FBtr0300799 0 + 56 1296 14473 | |
293 chr4 252541 266528 FBtr0300797 0 + 56 1296 13987 | |
294 | |
295 ==> CCC.txt <== | |
296 chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850 | |
297 chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831 | |
298 chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831 | |
299 chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831 | |
300 chr4 995793 996435 FBtr0111046 0 + 5 304 642 | |
301 chr4 995793 997931 FBtr0111044 0 + 17 714 2138 | |
302 chr4 995793 997931 FBtr0111045 0 + 17 714 2138 | |
303 chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690 | |
304 | |
305 \$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10 | |
306 key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9 | |
307 FBtr0089116 0 0 0 56 1296 15144 0 0 0 | |
308 FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690 | |
309 FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831 | |
310 FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831 | |
311 FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831 | |
312 FBtr0111044 28 683 2138 0 0 0 17 714 2138 | |
313 FBtr0111045 28 683 2138 0 0 0 17 714 2138 | |
314 FBtr0111046 7 166 642 0 0 0 5 304 642 | |
315 FBtr0300796 0 0 0 56 1296 14475 0 0 0 | |
316 | |
317 | |
318 | |
319 EOF | |
320 exit(0); | |
321 } |