Mercurial > repos > dongjun > mosaics
view mosaics/inst/Perl/readfile2wig_PET.pl @ 10:d78c3c5e8ff8 draft
Uploaded
author | dongjun |
---|---|
date | Thu, 10 Jan 2013 16:01:28 -0500 |
parents | b6d0c6ceda2c |
children |
line wrap: on
line source
################################################################### # # Process read files into WIG files # # Command arguments: # - infile: Input file (directory + file name) # - outdir: Directory of output files # - format: File format # - span: wig span # - norm_const: normalizing constant # - collapse: Maximum # of reads allowed at each position (skip if collapse=0) # - bychr: Construct bin-level files by chromosome? (Y or N) # - chrinfo: Is the file for chromosome info provided? # - chrfile: File name for chromosome info (chr size) # - @excludeChr: Chromosomes to be excluded (vector) # # Supported file format: # - eland_result, sam # # Note # - chromosomes are extracted from read files, after excluding invalid lines # - uni-reads are assumed # ################################################################### #!/usr/bin/env perl; use warnings; use strict; use Cwd; use File::Basename; my ($infile, $outdir, $format, $span, $norm_const, $collapse, $bychr, $chrinfo, $chrfile, @exclude_chr) = @ARGV; # extract only filename from $infile my @pr = fileparse( $infile ); my $filename = $pr[0]; # remember current working directory my $pwd = cwd(); # construct bin-level data based on chromsome info, if provided my %bin_count = (); my $bin_start = 0; my $bin_stop = 0; if ( $chrinfo eq "Y" ) { open CHR, "$chrfile" or die "Cannot open $chrfile\n"; while (<CHR>) { chomp; my ( $chrname, $chrsize ) = split /\s+/, $_; $bin_start = 0; $bin_stop = int($chrsize/$span); for (my $i = $bin_start; $i <= $bin_stop; $i++) { ${$bin_count{$chrname}}[$i] = 0; } } close CHR; } # chromosomes to be excluded my $ecyn; my %ec_hash; if ( scalar(@exclude_chr) == 0 ) { $ecyn = "N"; } else { $ecyn = "Y"; @ec_hash{ @exclude_chr } = 0; } # process PET read file # (chromosome information is extracted from read file) open IN, "$infile" or die "Cannot open input file $infile\n"; my %seen =(); my %seen_pos =(); # position (leftmost position, starting from 1) my %seen_str = (); # strand ('F' or 'R') my $start; my $end; while(<IN>){ # parse chomp; # procee read file, based on "format" option my @parsed; if ( $format eq "eland_result" ) { @parsed = eland_result( $_ ); } elsif ( $format eq "sam" ) { @parsed = sam( $_ ); } else { # Unsupported file format -> exit and return 1 to environment exit 1; } # skip if invalid line next if ( $parsed[0] == 0 ); # otherwise, process it my ($status, $id, $chrt, $pos, $str, $read_length, $prob ) = @parsed; my ($id_only, $pair) = split /\//, $id; # skip if chrt \in @exclude_chr if ( $ecyn eq "Y" && exists $ec_hash{ $chrt } ) { next; } # process coordinates if ( exists $seen_pos{$id_only} ) { # if other pair was already observed (w.r.t. id), # then write the coordinates of both reads in the pair if ( $seen_str{$id_only} ne $str ) { # pos1 & pos2 should have different strands and pair numbers my $pos1 = $seen_pos{$id_only}; my $pos2 = $pos; # in the pair, write upstream pos first, # then downstream pos + read length - 1. # [Note] eland_result: pos = leftmost position, starting from 1. if ( $pos1 < $pos2 && $str eq "R" ) { $pos2 += $read_length - 1; $start = $pos1; $end = $pos2; } elsif ( $pos1 > $pos2 && $seen_str{$id_only} eq "R" ) { $pos1 += $read_length - 1; $start = $pos2; $end = $pos1; } else { print "check id $id!\n"; } # process to bin-level files if collapse condition is satisfied my $id_collapse = join("",$chrt,$start,$end); $seen{$id_collapse}++; if ( $collapse > 0 && $seen{$id_collapse} > $collapse ) { next; } # update bin count if ( exists $bin_count{$chrt} ) { # if there is already a matrix for chromosome, update it $bin_start = int(($start-1)/$span) ; $bin_stop = int(($end-1)/$span) ; for (my $i = $bin_start; $i <= $bin_stop; $i++) { ${$bin_count{$chrt}}[$i] += 1; } } else { # if there is no matrix for chromosome yet, construct one @{$bin_count{$chrt}} = (); $bin_start = int(($start-1)/$span) ; $bin_stop = int(($end-1)/$span) ; for (my $i = $bin_start; $i <= $bin_stop; $i++) { ${$bin_count{$chrt}}[$i] += 1; } } # delete the key to save memory delete $seen_pos{$id_only}; delete $seen_str{$id_only}; } else { print "inappropriate pair for id $id.\n"; } } else { # if other pair was not observed yet, record it $seen_pos{$id_only} = $pos; $seen_str{$id_only} = $str; } } close( IN ); # move to output directory chdir($outdir); # write bin-level files if ( $bychr eq "N" ) { # genome-wide version: all chromosome in one file my $outfile = $filename."_span".$span.".wig"; open OUT, ">$outfile" or die "Cannot open $outfile\n"; print OUT "track type=wiggle_0 name=\"".$outfile."\" "; print OUT "description=\"".$outfile."\"\n"; foreach my $chr_id (keys %bin_count) { my @bin_count_chr = @{$bin_count{$chr_id}}; print OUT "variableStep chrom=$chr_id span=$span\n"; for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){ my $coord = $i*$span + 1; if ( $bin_count_chr[$i] ) { my $count_final = $bin_count_chr[$i] * $norm_const; print OUT "$coord $count_final\n"; } else { print OUT "$coord 0\n"; } } } close OUT; } else { # chromosome version: one chromosome in each file foreach my $chr_id (keys %bin_count) { my $outfile = $filename."_span".$span."_".$chr_id.".wig"; open OUT, ">$outfile" or die "Cannot open $outfile\n"; print OUT "track type=wiggle_0 name=\"".$outfile."\" "; print OUT "description=\"".$outfile."\"\n"; my @bin_count_chr = @{$bin_count{$chr_id}}; print OUT "variableStep chrom=$chr_id span=$span\n"; for( my $i = 0; $i < scalar(@bin_count_chr); $i++ ){ my $coord = $i*$span + 1; if ( $bin_count_chr[$i] ) { my $count_final = $bin_count_chr[$i] * $norm_const; print OUT "$coord $count_final\n"; } else { print OUT "$coord 0\n"; } } } close OUT; } ################################################################################ # # # subroutines # # # ################################################################################ # -------------------------- subroutine: eland result -------------------------- sub eland_result { my $line = shift; # parse line my ($id, $seq, $map, $t3, $t4, $t5, $chrt, $pos, $str, @rest) = split /\t/, $line; my $read_length = length $seq; # exclude invalid lines if ( $map eq "QC" || $map eq "NM" ) { my @status = 0; return @status; } if ( $chrt eq "QC" || $chrt eq "NM" ) { my @status = 0; return @status; } # exclude multi-reads if ( $map eq "R0" || $map eq "R1" || $map eq "R2" ) { my @status = 0; return @status; } if ( $chrt eq "R0" || $chrt eq "R1" || $chrt eq "R2" ) { my @status = 0; return @status; } my @pos_sets = split( /,/, $pos ); if ( scalar(@pos_sets) > 1 ) { my @status = 0; return @status; } # return my $status = 1; my $prob = 1; my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob ); return @parsed; } # -------------------------- subroutine: SAM ----------------------------------- sub sam { my $line = shift; # parse line if ( $line =~ /^[^@].+/ ) { # exclude lines starting with "@" (comments) my ($id, $bwflag, $chrt, $pos, $t2, $t3, $t4, $t5, $t6, $seq, $t7 ) = split /\t/, $line; $pos = int($pos); my $read_length = length $seq; # strand my $str; if ( $bwflag & 4 or $bwflag & 512 or $bwflag & 1024 ) { # exclude invalid lines my @status = 0; return @status; } elsif ( $bwflag & 16 ) { # reverse strand $str = "R"; } else { $str = "F"; } # exclude multi-reads? # return my $status = 1; my $prob = 1; my @parsed = ( $status, $id, $chrt, $pos, $str, $read_length, $prob ); return @parsed; } else { my @status = 0; return @status; } }