comparison multijoin @ 0:5314e5d6f040 draft

Imported from capsule None
author bgruening
date Thu, 29 Jan 2015 07:53:17 -0500
parents
children 20344ce0c811
comparison
equal deleted inserted replaced
-1:000000000000 0:5314e5d6f040
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Getopt::Long qw(:config no_ignore_case);
5 use Data::Dumper;
6 use Carp;
7 use File::Basename;
8 use Sort::Key::Natural qw(natsort);
9
10 my $version = "0.1.1";
11 my $field_sep = "\t";
12 my $key_column;
13 my @values_columns;
14 my $max_value_column;
15 my @input_files;
16 my $input_headers ;
17 my $output_headers;
18 my $filler = "0";
19 my $filler_string ;
20 my $ignore_duplicates;
21 my $debug = 0 ;
22 my %input_headers;
23 my $have_file_labels;
24 my %file_labels;
25
26 sub parse_command_line_parameters();
27 sub show_help();
28 sub read_input_file($);
29 sub print_combined_data();
30 sub sanitize_filename($);
31 sub print_output_header();
32 sub show_examples();
33
34 ##
35 ## Program Start
36 ##
37
38 parse_command_line_parameters();
39
40 my %data;
41 foreach my $file (@input_files) {
42 read_input_file($file);
43 }
44 #print STDERR Dumper(\%input_headers),"\n";
45 #print STDERR Dumper(\%data) if $debug;
46 print_output_header() if $output_headers;
47 print_combined_data();
48
49
50 ##
51 ## Program End
52 ##
53 sub print_output_header()
54 {
55 my @output = ("key");
56 foreach my $file ( @input_files ) {
57 foreach my $column ( @values_columns ) {
58 my $column_name = ( exists $input_headers{$file}->{$column} ) ?
59 $input_headers{$file}->{$column} :
60 "V$column" ;
61
62 push @output, $file_labels{$file} . "_" . $column_name;
63 }
64 }
65 print join($field_sep,@output),"\n"
66 or die "Output error: can't write output line: $!\n";
67 }
68
69 sub print_combined_data()
70 {
71 my @keys = natsort keys %data ;
72
73 foreach my $key ( @keys ) {
74 my @outputs;
75
76 foreach my $file (@input_files) {
77 push @outputs,
78 (exists $data{$key}->{$file}) ? $data{$key}->{$file} : $filler_string;
79 }
80
81 print join($field_sep,$key,@outputs),"\n"
82 or die "Output error: can't write output line: $!\n";
83 }
84 }
85
86 sub sanitize_filename($)
87 {
88 my ($filename) = shift or croak "missing file name";
89 my $file_ID = basename($filename);
90 $file_ID =~ s/\.\w+$//; # remove extension
91 $file_ID =~ s/^[^\w\.\-]+//;
92 $file_ID =~ s/[^\w\.\-]+$//;
93 $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters
94 return $file_ID;
95 }
96
97 sub read_input_file($)
98 {
99 my ($filename) = shift or croak "Missing input file name";
100
101 my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns
102
103 open FILE, "<", $filename
104 or die "Error: can't open file '$filename': $!\n";
105
106 ## Read file's header
107 if ($input_headers) {
108 my $line = <FILE>;
109 chomp $line;
110 my @fields = split $field_sep, $line;
111
112 my $num_input_fields = scalar(@fields);
113 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
114
115 foreach my $col (@values_columns) {
116 $input_headers{$filename}->{$col} = $fields[$col-1] ;
117 }
118 }
119
120
121 ## Read file's data
122 while ( my $line = <FILE> ) {
123 chomp $line;
124 my @fields = split $field_sep, $line;
125
126 my $num_input_fields = scalar(@fields);
127 die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ;
128 die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
129
130
131 my $key = $fields[$key_column-1];
132 my $value = join($field_sep, @fields[@value_indexes]);
133
134 die "Input error: file '$filename' line $. have duplicated key '$key'.\n"
135 if (exists $data{$key}->{$filename} && !$ignore_duplicates) ;
136 $data{$key}->{$filename} = $value;
137 }
138 close FILE
139 or die "Error: can't write and close file '$filename': $!\n";
140 }
141
142 sub parse_command_line_parameters()
143 {
144 my $values_columns_string;
145
146 my $rc = GetOptions("help" => \&show_help,
147 "key|k=i" => \$key_column,
148 "values|v=s" => \$values_columns_string,
149 "t=s" => \$field_sep,
150 "in-header" => \$input_headers,
151 "out-header|h" => \$output_headers,
152 "H" => sub { $input_headers = 1 ; $output_headers = 1 ; },
153 "ignore-dups" => \$ignore_duplicates,
154 "filler|f=s" => \$filler,
155 "examples" => \&show_examples,
156 "labels" => \$have_file_labels,
157 );
158 die "Error: inalid command-line parameters.\n" unless $rc;
159
160 die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column;
161 die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ;
162
163 die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string;
164 @values_columns = split(/\s*,\s*/, $values_columns_string);
165
166 die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0;
167 foreach my $v (@values_columns) {
168 die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n"
169 unless $v =~ /^\d+$/ && $v>=1;
170
171 $max_value_column = $v unless defined $max_value_column && $max_value_column>$v;
172 }
173
174 $filler_string = join($field_sep, map { $filler } @values_columns);
175
176
177 if ($have_file_labels) {
178 ## have file labels - each pair of parameters is a file/label pair.
179 die "Error: missing input files and labels\n" if scalar(@ARGV)==0;
180 die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0;
181
182 while (@ARGV) {
183 my $filename = shift @ARGV;
184 my $label = shift @ARGV;
185 $label =~ s/^[^\.\w\-]+//;
186 $label =~ s/[^\.\w\-]+$//g;
187 $label =~ s/[^\.\w\-]+/_/g;
188
189 my $file_ID = sanitize_filename($filename);
190 $file_labels{$filename} = $label;
191 push @input_files, $filename;
192 }
193 } else {
194 ## no file labels - the rest of the arguments are just file names;
195 @input_files = @ARGV;
196 die "Error: missing input files\n" if scalar(@input_files)==0;
197 die "Error: need more than one input file to join.\n" if scalar(@input_files)==1;
198
199 foreach my $file (@input_files) {
200 my $file_ID = sanitize_filename($file);
201 $file_labels{$file} = $file_ID;
202 }
203 }
204
205 }
206
207 sub show_help()
208 {
209 print<<EOF;
210 Multi-File join, version $version
211 Copyright (C) 2012 - A. Gordon (gordon at cshl dot edu)
212 License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)
213
214 Usage:
215 multijoin [OPTIONS] -k N -v V1,V2,Vn,.. FILE1 FILE2 ... FILEn
216
217 Options:
218
219 --help This helpful help screen.
220
221 -k N
222 --key N Use column N as key column.
223
224 -v V1,V2,Vn
225 --values V1,V2,Vn
226 Use columns V1,V2,Vn as value columns - those will be joined
227 According to the Key column.
228 Multiple columns can be specified.
229
230 -t SEP Use SEP as field separator character (default: tab).
231
232 -h
233 --out-header Add a header line to the output file.
234
235 --in-header The input files have a header line.
236 The first line will not be joined.
237 if '--out-header' is also used, the output column headers will
238 be constructed based on the input header column names.
239
240 -H
241 --headers Same as '--in-header --out-header' combined.
242
243 --ignore-dups Ignore duplicated keys (within a file).
244 By default, duplicated keys cause an error.
245
246 -f X
247 --filler X Fill missing values with X.
248 (Default: '$filler').
249
250 --labels When printning output headers with '-h', instead of using the file name,
251 use specific labels.
252 Each file name must be followed by a name.
253
254 example (without labels):
255 \$ multijoin -h -k 1 -v 2 A.TXT B.TXT C.TXT
256
257 example (with labels):
258 \$ multijoin -h --labels -k 1 -v 2 A.TXT Sample1 B.TXT SampleB C.TXT SampleC
259
260 --examples Show detailed examples.
261
262 EOF
263 exit(0);
264 }
265
266 sub show_examples()
267 {
268 print<<EOF;
269
270 To join three files, based on the 4th column, and keeping the 7th,8th,9th columns:
271
272 \$ head *.txt
273 ==> AAA.txt <==
274 chr4 888449 890171 FBtr0308778 0 + 266 1527 1722
275 chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850
276 chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831
277 chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831
278 chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831
279 chr4 995793 996435 FBtr0111046 0 + 7 166 642
280 chr4 995793 997931 FBtr0111044 0 + 28 683 2138
281 chr4 995793 997931 FBtr0111045 0 + 28 683 2138
282 chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690
283
284 ==> BBB.txt <==
285 chr4 90286 134453 FBtr0309803 0 + 657 29084 44167
286 chr4 251355 266499 FBtr0089116 0 + 56 1296 15144
287 chr4 252050 266506 FBtr0308086 0 + 56 1296 14456
288 chr4 252050 266506 FBtr0308087 0 + 56 1296 14456
289 chr4 252053 266528 FBtr0300796 0 + 56 1296 14475
290 chr4 252053 266528 FBtr0300800 0 + 56 1296 14475
291 chr4 252055 266528 FBtr0300798 0 + 56 1296 14473
292 chr4 252055 266528 FBtr0300799 0 + 56 1296 14473
293 chr4 252541 266528 FBtr0300797 0 + 56 1296 13987
294
295 ==> CCC.txt <==
296 chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850
297 chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831
298 chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831
299 chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831
300 chr4 995793 996435 FBtr0111046 0 + 5 304 642
301 chr4 995793 997931 FBtr0111044 0 + 17 714 2138
302 chr4 995793 997931 FBtr0111045 0 + 17 714 2138
303 chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690
304
305 \$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10
306 key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9
307 FBtr0089116 0 0 0 56 1296 15144 0 0 0
308 FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690
309 FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831
310 FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831
311 FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831
312 FBtr0111044 28 683 2138 0 0 0 17 714 2138
313 FBtr0111045 28 683 2138 0 0 0 17 714 2138
314 FBtr0111046 7 166 642 0 0 0 5 304 642
315 FBtr0300796 0 0 0 56 1296 14475 0 0 0
316
317
318
319 EOF
320 exit(0);
321 }