view extractSClip.pl @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
use strict;
#use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev';
use Carp;
use Getopt::Long;
use English;
use Pod::Usage;
use Data::Dumper;
use Bio::DB::Sam;
use File::Temp qw/ tempfile tempdir /;
use File::Spec;
use File::Basename;
use Cwd;
use List::MoreUtils qw/ uniq /;
#custom packages
use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP);
use SVUtil qw($use_scratch $work_dir get_input_bam get_work_dir parse_range is_PCR_dup);

use constant FQ_BASE_NUMBER => 33;

my $rmdup = 0;
my $print_read = 1;
$use_scratch = 0;
$min_percent_id = 90;
my ($work_dir, $out_dir);
my $ref_genome;
# input/output
my ($out_prefix, $range, $input_bam );
my $out_suffix = "sclip.txt";
my $paired = 1;
my ( $help, $man, $version, $usage );
my $optionOK = GetOptions(
	'i|in|input=s'	=> \$input_bam,	
	'o|out_dir=s'	=> \$out_dir,
	'ref_genome=s'  => \$ref_genome,
	'p=s'			=> \$out_prefix,
	'scratch!'		=> \$use_scratch,
	'paired!'		=> \$paired,
	'rmdup!'		=> \$rmdup,
	'lq_cutoff=i'	=> \$lowqual_cutoff,
	'min_pct_id=i'	=> \$min_percent_id,
	'min_pct_hq=i'	=> \$min_percent_hq,
	'print_read!'	=> \$print_read,
	'r|range=s'		=> \$range,
	'h|help|?'		=> \$help,
	'man'			=> \$man,
	'usage'			=> \$usage,
	'v|version'		=> \$version,
);

pod2usage(-verbose=>2) if($man or $usage);
pod2usage(1) if($help or $version );

my $start_dir = getcwd;
if($input_bam) {
	croak "The bam file you specified does not exist!\n" unless(-e $input_bam);
	$input_bam = File::Spec->rel2abs($input_bam);
}
else{
	croak "You must specify the input bam file or sample name";
}
croak "You must provide the reference genome in fasta format!" if(!$ref_genome);
croak "The reference genome file you speicified does not exist!\n"
	unless(-e $ref_genome);

my $input_base = fileparse($input_bam);

#setup output dir and workind directory
$out_dir = getcwd if(!$out_dir);
mkdir $out_dir if(!-e $out_dir || ! -d $out_dir);
$work_dir = get_work_dir() if($use_scratch);
$work_dir = $out_dir if(!$work_dir);
$use_scratch = undef if($work_dir eq $out_dir);
chdir($work_dir); 

# figure out output prefix
$out_prefix = $input_base if(!$out_prefix);

my $sam = Bio::DB::Sam->new( -bam => $input_bam, -fasta => $ref_genome);

my $output_file;
my $validator = SCValidator->new();
if(!$paired) {
	$validator->remove_validator("strand_validator");
}
if($range) {
	my ($chr, $start, $end) = parse_range($range);
	my $tmp = $chr;
	$tmp = $tmp . ".$start" if($start);
	$tmp = $tmp . ".$end" if($end);
	$output_file = join('.', $out_prefix, $tmp, $out_suffix);
	$output_file = File::Spec->catfile($out_dir, $output_file);

	my($pcover, $ncover) = extract_range_sclip(
		-SAM => $sam, 
		-RANGE => $range, 
		-WORK_DIR => $work_dir, 
		-OUTPUT => $output_file,  
		-VALIDATOR => $validator);
	$output_file = join('.', $out_prefix, $tmp, "cover");
	$output_file = File::Spec->catfile($out_dir, $output_file);
	warn "$output_file file exists and it will be replaced!" if(-e $output_file);
	$rmdup = 0;
	#$start = 0 unless $start;
    #my ($tid) = $sam->header->parse_region($chr);
    #my $chr_len = $sam->header->target_len->[$tid];
	#$end = $chr_len unless $end;
	
	#my ($coverage) = $sam->features(-type => 'coverage', -seq_id => $chr,
	#	-start => $start, -end => $end);
	#my @cc = $coverage->coverage;

	$rmdup = 0;
	my($fh, $fname) = tempfile( DIR => $work_dir);
	foreach my $p (keys(%{$pcover})) {
		my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
		print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p)), "\n";
	}
	foreach my $p (keys(%{$ncover})) {
		my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
		print $fh join("\t", $chr, $p, "-", $c , count_coverage($sam, $chr, $p)), "\n";
	}
	system("sort -k 2 -n $fname -o $output_file");
	system("rm $fname");
}
else{
	$output_file = join('.', $out_prefix, $out_suffix);
	$output_file = File::Spec->catfile($out_dir, $output_file);
    my $header = $sam->header;
    my $target_names = $header->target_name;
    $output_file = join('.', $out_prefix, "cover");
	my $read_file = join('.', $out_prefix, "sclip.txt");
    $output_file = File::Spec->catfile($out_dir, $output_file);
    $read_file = File::Spec->catfile($out_dir, $read_file);

	if(-e $output_file) {
		warn "$output_file file exists and it will be replaced!";
		system("rm $output_file");
	}
	my @t_names = uniq @{$target_names};
    foreach my $chr (@t_names){
		my($fh, $fname) = tempfile( DIR => $work_dir);
        my($pcover, $ncover) = extract_range_sclip(
			-SAM => $sam,
			-RANGE =>$chr, 
			-WORK_DIR => $work_dir, 
			-OUTPUT => $read_file, 
			-VALIDATOR => $validator);
		foreach my $p (keys(%{$pcover})) {
			my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
			print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p) ), "\n";
		}
		foreach my $p (keys(%{$ncover})) {
			my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
			print $fh join("\t", $chr, $p, "-", $c, count_coverage($sam, $chr, $p)), "\n";
		}
		system("sort -k 2 -n $fname -o $fname.sorted");
		system("cat $fname.sorted >> $output_file");
		system("rm $fname");
		system("rm $fname.sorted");
    }
}
chdir $start_dir;
exit(0);

sub count_coverage {
	my ($sam, $chr, $pos, $clip) = @_;
	if($rmdup) {
		my @pairs;
		my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos);
		my $n = 0;
		my $itr = $seg->features(-iterator => 1);
		while( my $a = $itr->next_seq) {
			my $sclip_len = 0;
			if($clip) {
				my @cigar_array = @{$a->cigar_array};
				$sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP);
				$sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP);
			}
			next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len));
			$n++;
			#return $n if( $n > $max_repetitive_cover);
			if($a->mpos) {
				push @pairs, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
			}
			else {
				push @pairs, [$a->start, $a->end, 0, 0, $sclip_len];
			}

		}
		return $n;
	}
	else{
		my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, 
			-start => $pos, -end => $pos);
		my @c_d = $c->coverage;
		return $c_d[0];
	}
}

sub extract_range_sclip {
	my %arg = @_;
	my $sam = $arg{-SAM} || croak "missing -SAM";
	my $range = $arg{-RANGE} || croak "missing -RANGE";
	my $output_file = $arg{-OUTPUT} || croak "missing -OUTPUT";
	my $validator = $arg{-VALIDATOR} || croak "missing -VALIDATOR";

	my($fh, $fname) = tempfile( DIR => $work_dir);
	my (%plus_cover, %neg_cover);
	$sam->fetch($range, 
		sub {
			my $a = shift;
			my $cigar_str = $a->cigar_str;
#			print STDERR $a->qname, "\t", $cigar_str, "\n";
			my @cigar_array = @{$a->cigar_array};
			return if($a->cigar_str !~ m/S/);
			if($paired && !$a->proper_pair) { #paired but mate is not mapped 
				$validator->remove_validator("strand_validator");
			}
			#return if(!$a->proper_pair && $paired);
			#return if($paired && !$a->mpos); 
			my ($sclip_len, $ort, $pos, $seq, $qual_str, $qual);
			$qual_str = join( "",  (map { chr $_ + FQ_BASE_NUMBER } $a->qscore));
			if($cigar_array[0]->[0] eq 'S' && $validator->validate($a, LEFT_CLIP) ) {
				$sclip_len = $cigar_array[0]->[1]; $ort = "-"; $pos = $a->start;
				$seq = substr($a->query->dna, 0, $sclip_len );
				$qual = substr($qual_str, 0, $sclip_len);
				
				my $print = 1;
				if($rmdup) {
					if(exists $neg_cover{$pos}) {
						$print = 0 if(is_PCR_dup($a, $neg_cover{$pos}, $sclip_len));
					}
					else {
						$neg_cover{$pos} = [];
					}
					if($print == 1) {
						if($a->mpos) {
							push @{$neg_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
						}
						else {
							push @{$neg_cover{$pos}}, [$a->start, $a->end, 0, 0, $sclip_len];
						}
					}
				}
				else {
					if(exists $neg_cover{$pos}) {
						$neg_cover{$pos}++;
					}
					else{
						$neg_cover{$pos} = 1;
					}
				}
				print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
					if($print_read && $print == 1);
			}

			if($cigar_array[$#cigar_array]->[0] eq 'S' &&  $validator->validate($a, RIGHT_CLIP)) {
				$sclip_len = $cigar_array[$#cigar_array]->[1]; $ort = '+'; $pos = $a->end;
				my $l = length($a->query->dna);
				$seq = substr($a->query->dna, $l - $sclip_len);
				$qual = substr($qual_str, $l - $sclip_len);

				my $print = 1;
				if($rmdup) {
					if(exists $plus_cover{$pos}) {
						$print = 0 if(is_PCR_dup($a, $plus_cover{$pos}, $sclip_len));
					}
					else {
						$plus_cover{$pos} = [];
					}
					if($print ==1 ) {
						if($a->mpos) {
							push @{$plus_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
						}
						else {
							push @{$plus_cover{$pos}}, [$a->start, 0, 0, $a->mate_end, $sclip_len];
						}
					}
				}
				else {
					if(exists $plus_cover{$pos}) {
						$plus_cover{$pos}++;
					}
					else{
						$plus_cover{$pos} = 1;
					}
				}
				$print = is_PCR_dup($a, $plus_cover{$pos}, $sclip_len) if($rmdup);
				print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
					if($print_read && $print);
			}
			if($paired && !$a->proper_pair) { #paired but mate is not mapped, add back
				$validator->add_validator("strand_validator"); 
			}
		}
	);
	if($print_read) {
		system("sort -k 2 -n $fname -o $fname.sorted");
		system("cat $fname.sorted >> $output_file");
		system("rm $fname");
		system("rm $fname.sorted");
	}
	return(\%plus_cover, \%neg_cover);
}


=head1 NAME

extractSClip.pl - extract positions with soft clipped read in bam file.


=head1 VERSION

This documentation refers to extractSClip.pl version 0.0.1.


=head1 USAGE

	# extract all positions with soft clipped reads in whole genome:
	./extractSClip.pl -i sample.bam -g hg18.fa 
	# extract chr1 positions with soft clipped reads
	./extractSClip.pl -i sample.bam -g hg18.fa -r chr1


=head1 REQUIRED ARGUMENTS

	-i: Input bam file.
	--ref_genome: The genome file in fa file, must be the same used to map reads.
	

=head1 OPTIONS

	-r: The range of positions need to be extracted. Format: chr1 or chr1:500-5000.
	-o: The output directory, default is current directory.
	--scratch: use scracth space, default off.
	--rmdup: remove PCR dumplicate reads, default on, use --normdup to turn it off.
	--lq_cutoff: low quality cutoff value, default 20. 
	--min_pct_id: minimum percent identify for the aligned high qual part,default 90.
	--min_pct_hq: minimum percent high quality for soft clipped part, default 80.
	--print_read: individual soft-clipped read will be printed, default off.
	-h, --help: The help page.
	--man: Print the man page.
	--usage: Print usage information.
	-v, --version: print version information.


=head1 DESCRIPTION

This is a program to extract all soft-clipped positions such that for each position
a list of requirements need to be satisfied. More specifically, the orientaion of 
pair-end read should be satisfied, the minimum percent identify of aligned part 
need to be satisfied, and the minimum percent of hiqh quality soft-clipped part 
should be satisfied.


=head1 DEPENDENCIES

The program depend on several packages:
1. Bioperl perl module.
2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.

=head1 BUGS AND LIMITATIONS

There are no known bugs in this module, but the method is limitted to bam file 
that has soft-clipping cigar string generated.Please report problems to 
Jianmin Wang  (Jianmin.Wang@stjude.org)
Patches are welcome.

=head1 AUTHOR

Jianmin Wang (Jianmin.Wang@stjude.org)


=head1 LICENCE AND COPYRIGHT

Copyright (c) 2010 by St. Jude Children's Research Hospital.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version. 

This program is distributed in the hope that it will be useful, 
but WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.