Mercurial > repos > grau > dimont
diff extract_data_single_galaxy.pl @ 0:ee1124489f9c draft
Uploaded
author | grau |
---|---|
date | Wed, 13 Nov 2013 04:10:29 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_data_single_galaxy.pl Wed Nov 13 04:10:29 2013 -0500 @@ -0,0 +1,128 @@ +use strict; + +if(@ARGV == 0){ +die <<USAGE +usage: +perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile> + + <chromFa>: the chromosome FastA containing all chromosome sequences + <bedfile>: the file containing the peaks in tabular format, + e.g., bed, gff, narrowPeak + <chromcol>: the column of <bedfile> containing the chromosome + <startcol>: the column of <bedfile> containing the start position relative to + the chromosome start + <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)" + <secondcol>: the column of <bedfile> containing the peak center position (center) relative to + <startcol> or the column of <bedfile> containing the end position (end) + <width>: fixed width of all regions + <statcol>: the column of <bedfile> containing the peak statistic + or a similar measure of confidence + <outfile>: the path to the output file, written as FastA +USAGE +} + + +my $chromFa = $ARGV[0]; +my $bed = $ARGV[1]; +my $chromcol = $ARGV[2]-1; +my $startcol = $ARGV[3]-1; +my $seccolm = $ARGV[4]; +my $seccol = $ARGV[5]-1; +my $width = $ARGV[6]; +my $statcol = $ARGV[7]-1; +my $outfile = $ARGV[8]; + +my $sort = 1; + + +sub loadSeq{ + my $prefix = shift; + print $prefix," "; + open(FA,$chromFa); + my $head = ""; + my @lines = (); + while(<FA>){ + chomp(); + if(/^>/){ + if($head){ + last; + } + if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){ + $head = $_; + } + }elsif($head){ + push(@lines,lc($_)); + } + } + my $str = join("",@lines); + print "loaded\n"; + return $str; +} + + + +open(IN,$ARGV[1]); + +my @lines = (); + +while(<IN>){ + chomp(); + my @parts = split("\t",$_); + $parts[$chromcol] =~ s/chr0/chr/g; + my @vals = (); + if($seccolm eq "center"){ + @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]); + }else{ + @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]); + } + push(@vals,$width); + push(@lines,\@vals); +} + +close(IN); +#print "Read input file ".$bed."\n"; + + +if($sort){ + + @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines; + +} + +open(OUT,">".$outfile); + +print "Extracting sequences...\n\n"; + +my $oldchr = ""; +my $sequence = ""; +for my $line (@lines){ + my @ar = @{$line}; + my $chr = $ar[0]; + unless($chr eq $oldchr){ + $sequence = loadSeq($chr); + } + $oldchr = $chr; + my $w = $ar[3]; + if($w <= 0){ + print $w," -> next\n"; + next; + } + if($w % 2 == 0){ + $w = $w/2; + }else{ + $w = ($w-1)/2; + } + + my $start = $ar[1]-$w-1; + + my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n"; + my $curr = substr($sequence,$start,$ar[3]); + if($curr =~ /[^ACGTacgt]/){ + print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n"; + }else{ + print OUT $head,$curr,"\n"; + } +} + +close(OUT); +print "\nDone.\n"; \ No newline at end of file