0
|
1 #!/usr/bin/env perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long qw(:config no_ignore_case);
|
|
5 use Data::Dumper;
|
|
6 use Carp;
|
|
7 use File::Basename;
|
|
8 use Sort::Key::Natural qw(natsort);
|
|
9
|
|
10 my $version = "0.1.1";
|
|
11 my $field_sep = "\t";
|
|
12 my $key_column;
|
|
13 my @values_columns;
|
|
14 my $max_value_column;
|
|
15 my @input_files;
|
|
16 my $input_headers ;
|
|
17 my $output_headers;
|
|
18 my $filler = "0";
|
|
19 my $filler_string ;
|
|
20 my $ignore_duplicates;
|
|
21 my $debug = 0 ;
|
|
22 my %input_headers;
|
|
23 my $have_file_labels;
|
|
24 my %file_labels;
|
|
25
|
|
26 sub parse_command_line_parameters();
|
|
27 sub show_help();
|
|
28 sub read_input_file($);
|
|
29 sub print_combined_data();
|
|
30 sub sanitize_filename($);
|
|
31 sub print_output_header();
|
|
32 sub show_examples();
|
|
33
|
|
34 ##
|
|
35 ## Program Start
|
|
36 ##
|
|
37
|
|
38 parse_command_line_parameters();
|
|
39
|
|
40 my %data;
|
|
41 foreach my $file (@input_files) {
|
|
42 read_input_file($file);
|
|
43 }
|
|
44 #print STDERR Dumper(\%input_headers),"\n";
|
|
45 #print STDERR Dumper(\%data) if $debug;
|
|
46 print_output_header() if $output_headers;
|
|
47 print_combined_data();
|
|
48
|
|
49
|
|
50 ##
|
|
51 ## Program End
|
|
52 ##
|
|
53 sub print_output_header()
|
|
54 {
|
|
55 my @output = ("key");
|
|
56 foreach my $file ( @input_files ) {
|
|
57 foreach my $column ( @values_columns ) {
|
|
58 my $column_name = ( exists $input_headers{$file}->{$column} ) ?
|
|
59 $input_headers{$file}->{$column} :
|
|
60 "V$column" ;
|
|
61
|
|
62 push @output, $file_labels{$file} . "_" . $column_name;
|
|
63 }
|
|
64 }
|
|
65 print join($field_sep,@output),"\n"
|
|
66 or die "Output error: can't write output line: $!\n";
|
|
67 }
|
|
68
|
|
69 sub print_combined_data()
|
|
70 {
|
|
71 my @keys = natsort keys %data ;
|
|
72
|
|
73 foreach my $key ( @keys ) {
|
|
74 my @outputs;
|
|
75
|
|
76 foreach my $file (@input_files) {
|
|
77 push @outputs,
|
|
78 (exists $data{$key}->{$file}) ? $data{$key}->{$file} : $filler_string;
|
|
79 }
|
|
80
|
|
81 print join($field_sep,$key,@outputs),"\n"
|
|
82 or die "Output error: can't write output line: $!\n";
|
|
83 }
|
|
84 }
|
|
85
|
|
86 sub sanitize_filename($)
|
|
87 {
|
|
88 my ($filename) = shift or croak "missing file name";
|
|
89 my $file_ID = basename($filename);
|
|
90 $file_ID =~ s/\.\w+$//; # remove extension
|
|
91 $file_ID =~ s/^[^\w\.\-]+//;
|
|
92 $file_ID =~ s/[^\w\.\-]+$//;
|
|
93 $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters
|
|
94 return $file_ID;
|
|
95 }
|
|
96
|
|
97 sub read_input_file($)
|
|
98 {
|
|
99 my ($filename) = shift or croak "Missing input file name";
|
|
100
|
|
101 my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns
|
|
102
|
|
103 open FILE, "<", $filename
|
|
104 or die "Error: can't open file '$filename': $!\n";
|
|
105
|
|
106 ## Read file's header
|
|
107 if ($input_headers) {
|
|
108 my $line = <FILE>;
|
|
109 chomp $line;
|
|
110 my @fields = split $field_sep, $line;
|
|
111
|
|
112 my $num_input_fields = scalar(@fields);
|
|
113 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
|
|
114
|
|
115 foreach my $col (@values_columns) {
|
|
116 $input_headers{$filename}->{$col} = $fields[$col-1] ;
|
|
117 }
|
|
118 }
|
|
119
|
|
120
|
|
121 ## Read file's data
|
|
122 while ( my $line = <FILE> ) {
|
|
123 chomp $line;
|
|
124 my @fields = split $field_sep, $line;
|
|
125
|
|
126 my $num_input_fields = scalar(@fields);
|
|
127 die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ;
|
|
128 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
|
|
129
|
|
130
|
|
131 my $key = $fields[$key_column-1];
|
|
132 my $value = join($field_sep, @fields[@value_indexes]);
|
|
133
|
|
134 die "Input error: file '$filename' line $. have duplicated key '$key'.\n"
|
|
135 if (exists $data{$key}->{$filename} && !$ignore_duplicates) ;
|
|
136 $data{$key}->{$filename} = $value;
|
|
137 }
|
|
138 close FILE
|
|
139 or die "Error: can't write and close file '$filename': $!\n";
|
|
140 }
|
|
141
|
|
142 sub parse_command_line_parameters()
|
|
143 {
|
|
144 my $values_columns_string;
|
|
145
|
|
146 my $rc = GetOptions("help" => \&show_help,
|
|
147 "key|k=i" => \$key_column,
|
|
148 "values|v=s" => \$values_columns_string,
|
|
149 "t=s" => \$field_sep,
|
|
150 "in-header" => \$input_headers,
|
|
151 "out-header|h" => \$output_headers,
|
|
152 "H" => sub { $input_headers = 1 ; $output_headers = 1 ; },
|
|
153 "ignore-dups" => \$ignore_duplicates,
|
|
154 "filler|f=s" => \$filler,
|
|
155 "examples" => \&show_examples,
|
|
156 "labels" => \$have_file_labels,
|
|
157 );
|
|
158 die "Error: inalid command-line parameters.\n" unless $rc;
|
|
159
|
|
160 die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column;
|
|
161 die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ;
|
|
162
|
|
163 die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string;
|
|
164 @values_columns = split(/\s*,\s*/, $values_columns_string);
|
|
165
|
|
166 die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0;
|
|
167 foreach my $v (@values_columns) {
|
|
168 die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n"
|
|
169 unless $v =~ /^\d+$/ && $v>=1;
|
|
170
|
|
171 $max_value_column = $v unless defined $max_value_column && $max_value_column>$v;
|
|
172 }
|
|
173
|
|
174 $filler_string = join($field_sep, map { $filler } @values_columns);
|
|
175
|
|
176
|
|
177 if ($have_file_labels) {
|
|
178 ## have file labels - each pair of parameters is a file/label pair.
|
|
179 die "Error: missing input files and labels\n" if scalar(@ARGV)==0;
|
|
180 die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0;
|
|
181
|
|
182 while (@ARGV) {
|
|
183 my $filename = shift @ARGV;
|
|
184 my $label = shift @ARGV;
|
|
185 $label =~ s/^[^\.\w\-]+//;
|
|
186 $label =~ s/[^\.\w\-]+$//g;
|
|
187 $label =~ s/[^\.\w\-]+/_/g;
|
|
188
|
|
189 my $file_ID = sanitize_filename($filename);
|
|
190 $file_labels{$filename} = $label;
|
|
191 push @input_files, $filename;
|
|
192 }
|
|
193 } else {
|
|
194 ## no file labels - the rest of the arguments are just file names;
|
|
195 @input_files = @ARGV;
|
|
196 die "Error: missing input files\n" if scalar(@input_files)==0;
|
|
197 die "Error: need more than one input file to join.\n" if scalar(@input_files)==1;
|
|
198
|
|
199 foreach my $file (@input_files) {
|
|
200 my $file_ID = sanitize_filename($file);
|
|
201 $file_labels{$file} = $file_ID;
|
|
202 }
|
|
203 }
|
|
204
|
|
205 }
|
|
206
|
|
207 sub show_help()
|
|
208 {
|
|
209 print<<EOF;
|
|
210 Multi-File join, version $version
|
|
211 Copyright (C) 2012 - A. Gordon (gordon at cshl dot edu)
|
|
212 License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)
|
|
213
|
|
214 Usage:
|
|
215 multijoin [OPTIONS] -k N -v V1,V2,Vn,.. FILE1 FILE2 ... FILEn
|
|
216
|
|
217 Options:
|
|
218
|
|
219 --help This helpful help screen.
|
|
220
|
|
221 -k N
|
|
222 --key N Use column N as key column.
|
|
223
|
|
224 -v V1,V2,Vn
|
|
225 --values V1,V2,Vn
|
|
226 Use columns V1,V2,Vn as value columns - those will be joined
|
|
227 According to the Key column.
|
|
228 Multiple columns can be specified.
|
|
229
|
|
230 -t SEP Use SEP as field separator character (default: tab).
|
|
231
|
|
232 -h
|
|
233 --out-header Add a header line to the output file.
|
|
234
|
|
235 --in-header The input files have a header line.
|
|
236 The first line will not be joined.
|
|
237 if '--out-header' is also used, the output column headers will
|
|
238 be constructed based on the input header column names.
|
|
239
|
|
240 -H
|
|
241 --headers Same as '--in-header --out-header' combined.
|
|
242
|
|
243 --ignore-dups Ignore duplicated keys (within a file).
|
|
244 By default, duplicated keys cause an error.
|
|
245
|
|
246 -f X
|
|
247 --filler X Fill missing values with X.
|
|
248 (Default: '$filler').
|
|
249
|
|
250 --labels When printning output headers with '-h', instead of using the file name,
|
|
251 use specific labels.
|
|
252 Each file name must be followed by a name.
|
|
253
|
|
254 example (without labels):
|
|
255 \$ multijoin -h -k 1 -v 2 A.TXT B.TXT C.TXT
|
|
256
|
|
257 example (with labels):
|
|
258 \$ multijoin -h --labels -k 1 -v 2 A.TXT Sample1 B.TXT SampleB C.TXT SampleC
|
|
259
|
|
260 --examples Show detailed examples.
|
|
261
|
|
262 EOF
|
|
263 exit(0);
|
|
264 }
|
|
265
|
|
266 sub show_examples()
|
|
267 {
|
|
268 print<<EOF;
|
|
269
|
|
270 To join three files, based on the 4th column, and keeping the 7th,8th,9th columns:
|
|
271
|
|
272 \$ head *.txt
|
|
273 ==> AAA.txt <==
|
|
274 chr4 888449 890171 FBtr0308778 0 + 266 1527 1722
|
|
275 chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850
|
|
276 chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831
|
|
277 chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831
|
|
278 chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831
|
|
279 chr4 995793 996435 FBtr0111046 0 + 7 166 642
|
|
280 chr4 995793 997931 FBtr0111044 0 + 28 683 2138
|
|
281 chr4 995793 997931 FBtr0111045 0 + 28 683 2138
|
|
282 chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690
|
|
283
|
|
284 ==> BBB.txt <==
|
|
285 chr4 90286 134453 FBtr0309803 0 + 657 29084 44167
|
|
286 chr4 251355 266499 FBtr0089116 0 + 56 1296 15144
|
|
287 chr4 252050 266506 FBtr0308086 0 + 56 1296 14456
|
|
288 chr4 252050 266506 FBtr0308087 0 + 56 1296 14456
|
|
289 chr4 252053 266528 FBtr0300796 0 + 56 1296 14475
|
|
290 chr4 252053 266528 FBtr0300800 0 + 56 1296 14475
|
|
291 chr4 252055 266528 FBtr0300798 0 + 56 1296 14473
|
|
292 chr4 252055 266528 FBtr0300799 0 + 56 1296 14473
|
|
293 chr4 252541 266528 FBtr0300797 0 + 56 1296 13987
|
|
294
|
|
295 ==> CCC.txt <==
|
|
296 chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850
|
|
297 chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831
|
|
298 chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831
|
|
299 chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831
|
|
300 chr4 995793 996435 FBtr0111046 0 + 5 304 642
|
|
301 chr4 995793 997931 FBtr0111044 0 + 17 714 2138
|
|
302 chr4 995793 997931 FBtr0111045 0 + 17 714 2138
|
|
303 chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690
|
|
304
|
|
305 \$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10
|
|
306 key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9
|
|
307 FBtr0089116 0 0 0 56 1296 15144 0 0 0
|
|
308 FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690
|
|
309 FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831
|
|
310 FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831
|
|
311 FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831
|
|
312 FBtr0111044 28 683 2138 0 0 0 17 714 2138
|
|
313 FBtr0111045 28 683 2138 0 0 0 17 714 2138
|
|
314 FBtr0111046 7 166 642 0 0 0 5 304 642
|
|
315 FBtr0300796 0 0 0 56 1296 14475 0 0 0
|
|
316
|
|
317
|
|
318
|
|
319 EOF
|
|
320 exit(0);
|
|
321 }
|