Previous changeset 1:9e0251d2ecb5 (2017-10-26) Next changeset 3:0354759bf0ce (2017-10-26) |
Commit message:
Uploaded |
added:
CreateMatrixMultiple.pl |
b |
diff -r 9e0251d2ecb5 -r cabe6c55780a CreateMatrixMultiple.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CreateMatrixMultiple.pl Thu Oct 26 07:37:17 2017 -0400 |
[ |
@@ -0,0 +1,146 @@ +#!/usr/bin/perl +#V1.0.0 +use strict; +use warnings; +use Getopt::Long; + + +my $input_matrix_files; +GetOptions ( +"input_matrix_files=s" => \$input_matrix_files +) or die("Error in command line arguments\n"); + +my @files = split(/,/,$input_matrix_files); +my @tbl_hash; +my %global_hash; +my @tbl_genotype_names; +my %chromosome_hash; +my %result_by_chr; +my @tbl_chr_name; + +for (my $i=0;$i<=$#files;$i++){ + my $current_file = $files[$i]; + my $current_genotype = "NA"; + my %current_hash; + + open (CF,$current_file) or die ("Can't open file $current_file\n"); + my $header = <CF>; + if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){ + $current_genotype = $1; + } + else { + print STDERR "Unable to recognize header in matrix file\n$header\n"; + exit(0); + } + while (my $line=<CF>){ + if ($line!~/^\s*$/){ + my @fields = split (/\t+/,$line); + my $chr; + my $pos; + my $ref; + my $variant; + + if ($fields[0]=~/^\s*([\w\-]+)\s*$/){ + $chr = $1; + } + else { + print STDERR "Unable to detect chromosome in matrix file\n$line\n"; + exit(0); + } + if ($fields[1]=~/^\s*(\d+)\s*$/){ + $pos = $1; + } + else { + print STDERR "Unable to detect position in matrix file\n$line\n"; + exit(0); + } + + if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){ + $ref = $1; + } + else { + print STDERR "Unable to detect reference base in matrix file\n$line\n"; + exit(0); + } + + if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){ + $variant = $1; + } + else { + print STDERR "Unable to detect variant in matrix file\n$line\n"; + exit(0); + } + $current_hash{"$chr#$pos"} = $variant; + $global_hash{"$chr#$pos"} = $ref; + } + } + close CF; + push(@tbl_genotype_names,$current_genotype); + push(@tbl_hash,\%current_hash); +} + +print "Chrom\tPos\tRef"; +for (my $i=0;$i<=$#tbl_genotype_names;$i++){ + print "\t".$tbl_genotype_names[$i]; +} +print "\n"; + +my @tbl_line_to_display; + +#exit(0); + +foreach my $key (keys %global_hash){ + my @tbl = split (/\#/,$key); + my $chr = $tbl[0]; + my $pos = $tbl[1]; + my $ref = $global_hash{$key}; + my $line = "$chr\t$pos\t$ref"; + my $isvariant = 0; + for (my $i=0;$i<=$#tbl_hash;$i++){ + #my %current_hash = %{$tbl_hash[$i]}; + if ($tbl_hash[$i]->{$key}){ + $line.="\t".$tbl_hash[$i]->{$key}; + if ($tbl_hash[$i]->{$key} ne $ref){ + $isvariant = 1; + } + } + else { + $line.="\t"."NA"; + } + + } + + $line .="\n"; + if ($isvariant == 1){ + push(@tbl_line_to_display,$line); + } +} + + +for (my $i=0;$i<=$#tbl_line_to_display;$i++){ + my @tbl = split (/\s+/,$tbl_line_to_display[$i]); + my $current_chr = $tbl[0]; + my @current_tbl; + if ($result_by_chr{$current_chr}){ + push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]); + } + else { + push (@current_tbl,$tbl_line_to_display[$i]); + $result_by_chr{$current_chr} = \@current_tbl; + } +} + +foreach my $key (sort keys %result_by_chr){ + my @current_tbl = sort mysort @{$result_by_chr{$key}}; + for (my $i=0;$i<=$#current_tbl;$i++){ + print $current_tbl[$i]; + } +} + +sub mysort { + my @tbla = split (/\s+/,$a); + my @tblb = split (/\s+/,$b); + $tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1]; +} + + |