diff check_gwas_inputs/CheckGWASInputs.pl @ 9:98c37a5d67f4 draft

Uploaded
author dereeper
date Wed, 07 Feb 2018 22:08:47 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/check_gwas_inputs/CheckGWASInputs.pl	Wed Feb 07 22:08:47 2018 -0500
@@ -0,0 +1,184 @@
+#!/usr/bin/perl
+
+use strict;
+
+use Getopt::Long;
+
+my $usage = qq~Usage:$0 <args> [<opts>]
+where <args> are:
+    -h, --hapmap       <Hapmap input file>
+    -t, --trait        <Trait input file>
+    -o, --out          <Output base name>
+~;
+$usage .= "\n";
+
+my ($hapmap,$trait,$out);
+
+
+GetOptions(
+        "trait=s"      => \$trait,
+        "out=s"        => \$out,
+        "hapmap=s"     => \$hapmap
+);
+
+
+die $usage
+  if ( !$trait || !$out || !$hapmap);
+  
+my %inds;
+
+#######################################
+# get individuals in trait file
+#######################################
+my %traits;
+my $head_trait = `head -1 $trait`;
+open(my $T,$trait);
+<$T>;
+while(<$T>)
+{
+	my @infos = split(/\t/,$_);
+	my $ind = $infos[0];
+	$inds{$ind}++;
+	$traits{$ind} = $_;
+}
+close($T);
+my $nb_ind_trait = scalar keys(%traits);
+
+#######################################
+# get individuals in hapmap file
+#######################################
+my $line_ind = `head -1 $hapmap`;
+chomp($line_ind);
+my @infos = split(/\t/,$line_ind);
+for (my $i = 11; $i <= $#infos; $i++)
+{
+	my $ind = $infos[$i];
+	$inds{$ind}++;
+}
+my $nb_ind_hapmap = scalar @infos - 11;
+
+#################################################################
+# create trait output by keeping individuals found in both files
+#################################################################
+open(my $O,">$out.trait");
+print $O $head_trait;
+my $nb_common = 0;
+foreach my $ind(keys(%inds))
+{
+	my $nb_found = $inds{$ind};
+	if ($nb_found == 2)
+	{
+		$nb_common++;
+		print $O $traits{$ind};
+	}
+}
+close($O);
+
+
+#####################################################################
+# create hapmap output after keeping individuals found in both files
+# and removing monomorphic positions
+#####################################################################
+open(my $O2,">$out.hapmap");
+my $numline = 0;
+my %genotypes;
+my %columns_to_keep;
+my $nb_monomorphic = 0;
+my $not_biallelic = 0;
+my $diff_variation = 0;
+open(my $H,$hapmap);
+while(<$H>)
+{
+	$numline++;
+	my $line = $_;
+	$line =~s/\n//g;
+	$line =~s/\r//g;
+	my @infos = split(/\t/,$line);
+	if ($numline == 1)
+	{
+		my @titles;
+		for (my $i = 0; $i <= 10; $i++)
+		{
+			my $title = $infos[$i];
+			push(@titles,$title);
+		}
+		print $O2 join("\t",@titles);
+		for (my $i = 11; $i <= $#infos; $i++)
+		{
+			my $ind = $infos[$i];
+			my $nb_found = $inds{$ind};
+			if ($nb_found == 2)
+			{
+				print $O2 "	$ind";
+				$columns_to_keep{$i} = 1;
+			}
+		}
+		print $O2 "\n";
+	}
+	else
+	{
+		my $to_be_printed = "";
+		my $variation = $infos[1];
+        for (my $i = 0; $i <= 10; $i++)
+		{
+			my $title = $infos[$i];
+			$to_be_printed .= "$title	";
+		}
+		my %letters;
+		for (my $i = 11; $i <= $#infos; $i++)
+		{
+			if ($columns_to_keep{$i})
+			{
+				my $genotype = $infos[$i];
+				if ($genotype ne 'NN')
+				{
+					my ($allele1,$allele2) = split(//,$genotype);
+					$letters{$allele1}=1;
+					$letters{$allele2}=1;
+				}
+				$to_be_printed .= "$genotype	";
+			}
+		}
+		chop($to_be_printed);
+		
+		my $variation_obs = join("/",sort keys(%letters));
+		
+		# print only if polymorphic
+		if (scalar keys(%letters) < 2)
+		{
+			$nb_monomorphic++;
+		}
+		elsif (scalar keys(%letters) > 2)
+		{
+			$not_biallelic++;
+		}
+		else
+		{
+			if ($variation ne $variation_obs)
+			{
+				$to_be_printed =~s/$variation/$variation_obs/;
+				$diff_variation++;
+			}
+			
+			print $O2 $to_be_printed . "\n";
+		}
+	}
+}
+close($H);
+close($O2);
+
+print "==============================================\n";
+print "Individuals\n";
+print "==============================================\n";
+print "Individuals in hapmap file: $nb_ind_hapmap\n";
+print "Individuals in trait file: $nb_ind_trait\n";
+print "Individuals found in both files: $nb_common\n";
+print "==============================================\n";
+print "Markers\n";
+print "==============================================\n";
+print "Discarded markers:\n";
+print "Monomorphic: $nb_monomorphic\n";
+print "Not biallelic: $not_biallelic\n";
+print "Modified markers:\n";
+print "Difference in variation: $diff_variation\n";
+