diff rapsodyn/CreateMatrixMultiple.pl @ 15:56d328bce3a7 draft default tip

Uploaded
author mcharles
date Thu, 29 Jan 2015 08:54:06 -0500
parents 0a6c1cfe4dc8
children
line wrap: on
line diff
--- a/rapsodyn/CreateMatrixMultiple.pl	Mon Jan 26 18:10:52 2015 -0500
+++ b/rapsodyn/CreateMatrixMultiple.pl	Thu Jan 29 08:54:06 2015 -0500
@@ -11,7 +11,136 @@
 ) or die("Error in command line arguments\n");
 
 my @files = split(/,/,$input_matrix_files);
+my @tbl_hash;
+my %global_hash;
+my @tbl_genotype_names;
+my %chromosome_hash;
+my %result_by_chr;
+my @tbl_chr_name;
+
 for (my $i=0;$i<=$#files;$i++){
-	print $files[$i],"\n";
+	my $current_file = $files[$i];
+	my $current_genotype = "NA";
+	my %current_hash;
+	
+	open (CF,$current_file) or die ("Can't open file $current_file\n");
+	my $header = <CF>;
+	if ($header =~ /Chrom\s*Pos\s*Ref\s*(.*?)\s*$/){
+		$current_genotype = $1;
+	}
+	else {
+		print STDERR "Unable to recognize header in matrix file\n$header\n";
+		exit(0);
+	}
+	while (my $line=<CF>){
+		if ($line!~/^\s*$/){
+			my @fields = split (/\t+/,$line);
+			my $chr;
+			my $pos;
+			my $ref;
+			my $variant;
+			
+			if ($fields[0]=~/^\s*([\w\-]+)\s*$/){
+				$chr = $1;
+			}
+			else {
+				print STDERR "Unable to detect chromosome in matrix file\n$line\n";
+				exit(0);
+			}
+			if ($fields[1]=~/^\s*(\d+)\s*$/){
+				$pos = $1;
+			}
+			else {
+				print STDERR "Unable to detect position in matrix file\n$line\n";
+				exit(0);
+			}
+			
+			if ($fields[2]=~/^\s*([ATGCNX])\s*$/i){
+				$ref = $1;
+			}
+			else {
+				print STDERR "Unable to detect reference base in matrix file\n$line\n";
+				exit(0);
+			}
+			
+			if ($fields[3]=~/^\s*([\w\/]+)\s*$/i){
+				$variant = $1;
+			}
+			else {
+				print STDERR "Unable to detect variant in matrix file\n$line\n";
+				exit(0);
+			}
+			$current_hash{"$chr#$pos"} = $variant;
+			$global_hash{"$chr#$pos"} = $ref;
+		}
+	}
+	close CF;
+	push(@tbl_genotype_names,$current_genotype);
+	push(@tbl_hash,\%current_hash);
 }
 
+print "Chrom\tPos\tRef";
+for (my $i=0;$i<=$#tbl_genotype_names;$i++){
+	print "\t".$tbl_genotype_names[$i];
+}
+print "\n";
+
+my @tbl_line_to_display;
+
+#exit(0);
+
+foreach my $key (keys %global_hash){
+	my @tbl = split (/\#/,$key);
+	my $chr = $tbl[0];
+	my $pos = $tbl[1];
+	my $ref = $global_hash{$key};
+	my $line = "$chr\t$pos\t$ref";
+	my $isvariant = 0;
+	for (my $i=0;$i<=$#tbl_hash;$i++){
+		#my %current_hash = %{$tbl_hash[$i]};
+		if ($tbl_hash[$i]->{$key}){
+			$line.="\t".$tbl_hash[$i]->{$key};
+			if ($tbl_hash[$i]->{$key} ne $ref){
+				$isvariant = 1;
+			}
+		}
+		else {
+			$line.="\t"."NA";
+		}
+		
+	}
+	
+	$line .="\n";
+	if ($isvariant == 1){
+		push(@tbl_line_to_display,$line);
+	}
+}
+
+
+for (my $i=0;$i<=$#tbl_line_to_display;$i++){
+	my @tbl = split (/\s+/,$tbl_line_to_display[$i]);
+	my $current_chr = $tbl[0];
+	my @current_tbl;
+	if ($result_by_chr{$current_chr}){
+		push (@{$result_by_chr{$current_chr}},$tbl_line_to_display[$i]);
+	}
+	else {
+		push (@current_tbl,$tbl_line_to_display[$i]);
+		$result_by_chr{$current_chr} = \@current_tbl;
+	}
+}
+
+foreach my $key (sort keys %result_by_chr){
+	my @current_tbl = sort mysort @{$result_by_chr{$key}};
+	for (my $i=0;$i<=$#current_tbl;$i++){
+		print $current_tbl[$i];
+	}
+}
+
+sub mysort {
+	my @tbla = split (/\s+/,$a);
+	my @tblb = split (/\s+/,$b);
+	$tbla[0] cmp $tblb[0] || $tbla[1]<=>$tblb[1];
+}
+
+