Mercurial > repos > grau > dimont
diff extract_data_single_galaxy.pl @ 1:083e32135da3 draft default tip
Deleted selected files
author | grau |
---|---|
date | Wed, 13 Nov 2013 04:13:57 -0500 |
parents | ee1124489f9c |
children |
line wrap: on
line diff
--- a/extract_data_single_galaxy.pl Wed Nov 13 04:10:29 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -use strict; - -if(@ARGV == 0){ -die <<USAGE -usage: -perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile> - - <chromFa>: the chromosome FastA containing all chromosome sequences - <bedfile>: the file containing the peaks in tabular format, - e.g., bed, gff, narrowPeak - <chromcol>: the column of <bedfile> containing the chromosome - <startcol>: the column of <bedfile> containing the start position relative to - the chromosome start - <seccolm>: center: "Center of peak (relative to start)", end: "End of peak (global coordinates)" - <secondcol>: the column of <bedfile> containing the peak center position (center) relative to - <startcol> or the column of <bedfile> containing the end position (end) - <width>: fixed width of all regions - <statcol>: the column of <bedfile> containing the peak statistic - or a similar measure of confidence - <outfile>: the path to the output file, written as FastA -USAGE -} - - -my $chromFa = $ARGV[0]; -my $bed = $ARGV[1]; -my $chromcol = $ARGV[2]-1; -my $startcol = $ARGV[3]-1; -my $seccolm = $ARGV[4]; -my $seccol = $ARGV[5]-1; -my $width = $ARGV[6]; -my $statcol = $ARGV[7]-1; -my $outfile = $ARGV[8]; - -my $sort = 1; - - -sub loadSeq{ - my $prefix = shift; - print $prefix," "; - open(FA,$chromFa); - my $head = ""; - my @lines = (); - while(<FA>){ - chomp(); - if(/^>/){ - if($head){ - last; - } - if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){ - $head = $_; - } - }elsif($head){ - push(@lines,lc($_)); - } - } - my $str = join("",@lines); - print "loaded\n"; - return $str; -} - - - -open(IN,$ARGV[1]); - -my @lines = (); - -while(<IN>){ - chomp(); - my @parts = split("\t",$_); - $parts[$chromcol] =~ s/chr0/chr/g; - my @vals = (); - if($seccolm eq "center"){ - @vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]); - }else{ - @vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]); - } - push(@vals,$width); - push(@lines,\@vals); -} - -close(IN); -#print "Read input file ".$bed."\n"; - - -if($sort){ - - @lines = sort { ${$a}[0] cmp ${$b}[0] } @lines; - -} - -open(OUT,">".$outfile); - -print "Extracting sequences...\n\n"; - -my $oldchr = ""; -my $sequence = ""; -for my $line (@lines){ - my @ar = @{$line}; - my $chr = $ar[0]; - unless($chr eq $oldchr){ - $sequence = loadSeq($chr); - } - $oldchr = $chr; - my $w = $ar[3]; - if($w <= 0){ - print $w," -> next\n"; - next; - } - if($w % 2 == 0){ - $w = $w/2; - }else{ - $w = ($w-1)/2; - } - - my $start = $ar[1]-$w-1; - - my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n"; - my $curr = substr($sequence,$start,$ar[3]); - if($curr =~ /[^ACGTacgt]/){ - print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n"; - }else{ - print OUT $head,$curr,"\n"; - } -} - -close(OUT); -print "\nDone.\n"; \ No newline at end of file