diff extract_data_single_galaxy.pl @ 1:083e32135da3 draft default tip

Deleted selected files
author grau
date Wed, 13 Nov 2013 04:13:57 -0500
parents ee1124489f9c
children
line wrap: on
line diff
--- a/extract_data_single_galaxy.pl	Wed Nov 13 04:10:29 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,128 +0,0 @@
-use strict;
-
-if(@ARGV == 0){
-die <<USAGE
-usage:
-perl extract_data.pl <chromFa> <bedfile> <chromcol> <startcol> <seccolm> <secondcol> <width> <statcol> <outfile>
-    
-    <chromFa>: the chromosome FastA containing all chromosome sequences
-    <bedfile>:  the file containing the peaks in tabular format, 
-                e.g., bed, gff, narrowPeak
-    <chromcol>: the column of <bedfile> containing the chromosome
-    <startcol>: the column of <bedfile> containing the start position relative to
-                the chromosome start
-    <seccolm>:   center: "Center of peak (relative to start)", end: "End of peak (global coordinates)"
-    <secondcol>:  the column of <bedfile> containing the peak center position (center) relative to
-                <startcol> or the column of <bedfile> containing the end position (end)
-    <width>:    fixed width of all regions
-    <statcol>:  the column of <bedfile> containing the peak statistic
-                or a similar measure of confidence
-    <outfile>:  the path to the output file, written as FastA
-USAGE
-}
-
-
-my $chromFa = $ARGV[0];
-my $bed = $ARGV[1];
-my $chromcol = $ARGV[2]-1;
-my $startcol = $ARGV[3]-1;
-my $seccolm = $ARGV[4];
-my $seccol = $ARGV[5]-1;
-my $width = $ARGV[6];
-my $statcol = $ARGV[7]-1;
-my $outfile = $ARGV[8];
-
-my $sort = 1;
-
-
-sub loadSeq{
-	my $prefix = shift;
-	print $prefix," ";
-	open(FA,$chromFa);
-	my $head = "";
-	my @lines = ();
-	while(<FA>){
-		chomp();
-		if(/^>/){
-			if($head){
-				last;
-			}
-			if(/^>\s*(${prefix}|chr${prefix})(\s.*$|$)/i){
-				$head = $_;
-			}
-		}elsif($head){
-			push(@lines,lc($_));
-		}
-	}
-	my $str = join("",@lines);
-	print "loaded\n";
-	return $str;
-}
-
-
-
-open(IN,$ARGV[1]);
-
-my @lines = ();
-
-while(<IN>){
-	chomp();
-	my @parts = split("\t",$_);
-	$parts[$chromcol] =~ s/chr0/chr/g;
-	my @vals = ();
-	if($seccolm eq "center"){
-		@vals = ($parts[$chromcol],$parts[$startcol]+$parts[$seccol],$parts[$statcol]);
-	}else{
-		@vals = ($parts[$chromcol],int(($parts[$startcol]+$parts[$seccol])/2),$parts[$statcol]);
-	}
-	push(@vals,$width);
-	push(@lines,\@vals);
-}
-
-close(IN);
-#print "Read input file ".$bed."\n";
-
-
-if($sort){
-
-	@lines = sort { ${$a}[0] cmp ${$b}[0]  } @lines;
-
-}
-
-open(OUT,">".$outfile);
-
-print "Extracting sequences...\n\n";
-
-my $oldchr = "";
-my $sequence = "";
-for my $line (@lines){
-	my @ar = @{$line};
-	my $chr = $ar[0];
-	unless($chr eq $oldchr){
-		$sequence = loadSeq($chr);
-	}
-	$oldchr = $chr;
-	my $w = $ar[3];
-	if($w <= 0){
-		print $w," -> next\n";
-		next;
-	}
-	if($w % 2 == 0){
-		$w = $w/2;
-	}else{
-		$w = ($w-1)/2;
-	}
-
-	my $start = $ar[1]-$w-1;
-
-	my $head = "> chr: ".$chr."; start: ".$start."; peak: ".($ar[1]-$start)."; signal: ".$ar[2]."\n";
-	my $curr = substr($sequence,$start,$ar[3]);
-	if($curr =~ /[^ACGTacgt]/){
-		print "Sequence for\n\t",substr($head,1),"omitted due to ambiguous nucleotides.\n\n";
-	}else{
-		print OUT $head,$curr,"\n";
-	}
-}
-
-close(OUT);
-print "\nDone.\n";
\ No newline at end of file