view scripts/pickUniqMate.pl @ 13:72156de89c7a draft

planemo upload for repository https://github.com/portiahollyoak/Tools commit 65ddf081d2f1a76bc4d6d91f01ab72667b9e1549
author portiahollyoak
date Mon, 23 May 2016 05:42:49 -0400
parents ca36262102d8
children 9672fe07a232
line wrap: on
line source

#!/share/bin/perl
use List::Util qw(sum);
use Bio::Seq;

die "perl $0 <mate sam with header> <uniq bed>\n" if @ARGV<1;

open in,$ARGV[1];
my %uniq;
while(<in>)
{
	chomp;
	my @f=split;
	$uniq{$f[3]}=[@f];
}
close in;

open in,$ARGV[0];
my (%te,@ref,%ref);
while(<in>)
{
	chomp;
	my @f=split/\s+/,$_;
	# headers
	if(/^\@SQ/)
	{
		my ($sn,$ln)=/SN:(.*?)\tLN:(\d+)/;
		push @ref,[$sn,$ln];
		$ref{$sn}=$#ref;
		next;
	}

	# unmapped
	next if $f[2] eq "*";
	
	my $mm=200;
	my $xa="";
	for my $q (11..$#f)
	{
	    if($f[$q]=~/NM:/)
	    {
		$mm=$f[$q];
		$mm =~ s/NM://;
	    }

	    if($f[$q]=~/XA:Z:/)
	    {
		($xa)=$f[$q]=~/XA:Z:(.*);$/;
		last;
	    }
	}

	if ($mm > 5) {next;}

	my ($rnum)=$f[1]=~/(\d)$/;
	# CIGAR
	my (@cigar_m)=$f[5]=~/(\d+)M/g;
	my (@cigar_d)=$f[5]=~/(\d+)D/g;
	my (@cigar_s)=$f[5]=~/(\d+)S/g;
	my (@cigar_i)=$f[5]=~/(\d+)I/g;
	my $aln_ln=sum(@cigar_m,@cigar_d);
	
	my $strand="+";
	if($f[1]=~/r/)
	{
	    my $seq=Bio::Seq->new(-seq=>$f[9], -alphabet => 'dna');
	    $f[9]=$seq->revcom->seq;
	    $strand="-";
	}
	
	# align to the junctions
	if(($f[3]+$aln_ln-1)>${$ref[$ref{$f[2]}]}[1])
	{
	    if(($f[3]+($aln_ln-1)/2)>${$ref[$ref{$f[2]}]}[1])
	    {
		$f[2]=${$ref[$ref{$f[2]}+1]}[0];
		$f[3]=1;
		$aln_ln=$aln_ln-(${$ref[$ref{$f[2]}]}[1]-$f[3]+1);
	    }
	    else
	    {
		$aln_ln=${$ref[$ref{$f[2]}]}[1]-$f[3]+1;
	    }
	}
	
	$pe{$f[0]}{$rnum}=$f[2].",".$strand."$f[3]".";";
	    
	# XA tag
	my @xa=split(";",$xa);
	$pe{$f[0]}{$rnum}.=join(",",(split/,/)[0,1]).";" foreach @xa;

}
close in;

foreach my $id (keys %pe)
{
	next if exists $pe{$id}{1} && exists $pe{$id}{2} && exists $uniq{$id."/1"} && exists $uniq{$id."/2"};
	foreach my $rid (keys %{$pe{$id}})
	{
		my $mate_id=($rid==1)?2:1;
		if(exists $uniq{$id."/".$mate_id})
		{
			${$uniq{$id."/".$mate_id}}[4]=$pe{$id}{$rid};
			print join("\t",@{$uniq{$id."/".$mate_id}}),"\n";
		}
	}
}