| Next changeset 1:fafe271a0842 (2015-05-15) |
|
Commit message:
Perl code |
|
added:
AGEseq_web.pl |
| b |
| diff -r 000000000000 -r a9c5e846dd76 AGEseq_web.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AGEseq_web.pl Fri May 15 16:37:02 2015 -0400 |
| [ |
| b'@@ -0,0 +1,1057 @@\n+#!/usr/bin/perl\r\n+use strict;\r\n+my $file_design = $ARGV[0];\r\n+my $read_file = $ARGV[1];\r\n+my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %)\r\n+my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0\r\n+my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20 \r\n+my $indel_report = $ARGV[5]; # report top xx records with indel, default 50 \r\n+my $final_out = $ARGV[6];\r\n+\r\n+#my $blat = \'\'; # working directory or PATH\r\n+my $blat = \'/usr/local/bin/blat\'; # Your prefered full location\r\n+\r\n+my $rand = rand 1000000;\r\n+## setting for reports\r\n+if(not defined ($mismatch_cutoff )){\r\n+ $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %)\r\n+}\r\n+\r\n+if(not defined ($min_cutoff )){\r\n+\t$min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0\r\n+}\r\n+\r\n+if(not defined ($wt_like_report )){\r\n+\t$wt_like_report = 20 ; # report top xx WT like records, default 20 \r\n+}\r\n+\r\n+if(not defined ($indel_report )){\r\n+\t $indel_report = 50 ; # report top xx records with indel, default 50 \r\n+}\r\n+ \r\n+\r\n+my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1\r\n+\r\n+###############################################################\r\n+# default setting\r\n+\r\n+my $remove = \'rm\';\r\n+my $osname = $^O;\r\n+\r\n+my $design_fas = \'/tmp/fasta\'.$rand.\'_DESIGN.fa\';\r\n+my @psl_file_data = ();\r\n+\r\n+if(not defined ($file_design )){\r\n+\tdie "Design file is needed\\n";\r\n+}\r\n+\r\n+if(not defined ($final_out )){\r\n+\t$final_out = \'AGE_output.txt\';\r\n+}\r\n+##########################################################\r\n+### step 1 load design file\r\n+\r\n+open DESIGN,"<$file_design" or die "File $file_design not found error here\\n";\r\n+open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\\n";\r\n+\r\n+my $num = 0;\r\n+my %design_hash = ();\r\n+\r\n+\r\n+while (my $line = <DESIGN>) {\r\n+\t $num ++;\r\n+\t if($num == 1){\r\n+\t \t next;\r\n+\t }\r\n+\t if($line !~ m/\\w/){\r\n+\t \t next;\r\n+\t }\r\n+\t \r\n+\t chomp $line;\r\n+\t my @temp = split(/\\t/,$line);\r\n+\t my $id = $temp[0];\r\n+\t my $seq = $temp[1];\r\n+\t $seq =~ s/\\s+//g;\r\n+\t $seq =~ m/([^\\r\\n]+)/;\r\n+\t $seq = $1;\r\n+\t $id =~ s/\\s+/\\_/g;\r\n+\t\r\n+\t $design_hash{$id} = $seq;\r\n+\t print DESIGNFAS ">$id\\n$seq\\n";\r\n+}\r\n+close(DESIGNFAS);\r\n+\r\n+\t \r\n+#######################################################\r\n+############## step 2 - load read files #############\r\n+\r\n+my %total_read_num = ();\r\n+my $num_c = 0; \r\n+my @fasta_files = ();\r\n+\r\n+my $file_in = $read_file;\t\r\n+my $fasta_out = \'/tmp/fasta\'.$rand.\'_Read.fa\'; \r\n+ \r\n+ \r\n+my $type = check_file_type($file_in ) ;\r\n+ \r\n+if($type eq \'fq\'){\r\n+\t# fastq\r\n+ # print "this is fastq file\\n";\r\n+ my $total_num = &fastq2fasta($file_in,$fasta_out); \r\n+\t$total_read_num{$fasta_out} = $total_num;\r\n+}\r\n+elsif($type eq \'fa\'){\r\n+\t # fasta \r\n+\t# print "this is fasta file\\n";\r\n+\t my $total_num = &fasta2fasta($file_in,$fasta_out);\r\n+\t # load number\r\n+\t $total_read_num{$fasta_out} = $total_num;\r\n+}\r\n+else{\r\n+\tdie "Please check the format of read file\\n";\r\n+}\r\n+\r\n+\r\n+########################################################\t \r\n+## Here will put step3 to step 5 into one loop\r\n+\r\n+\r\n+open (REPORT,">$final_out") || die "cannot write\\n";\r\n+\r\n+\r\n+ my $file_blat_in = $fasta_out;\r\n+ \r\n+ ############ step 3 - blat ###########################\r\n+ my $blat_out = \'/tmp/fasta\'.$rand.\'_blat_crispr.psl\';\t \r\n+ \r\n+\t my $command_line = $blat.\' \'.\' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 \'.$file_blat_in.\' \'. $design_fas.\' \'.$blat_out .\' >/tmp/blat.txt\';\r\n+\t \r\n+\t system("$command_line");\r\n+\r\n+\t #print "blat job $file_blat_in is done\\n"; \r\n+\t ########## step 4 - convert psl -> bed ############## \r\n+\t \r\n+\t my $bed_hash_address = &psl2bed ($blat_out );\t## address of one hash\r\n+\t \r\n+\t #######'..b'\t\t\tif(not exists $psl_hash{$read_target}{$query}){\t\t## all alignment\r\n+\t\t\t#if(not exists $psl_hash{$read_target}){ ## only keep the best hit\t\t\t\r\n+\t\t\t\t $psl_hash{$read_target}{$query} = \\@lineArr;\r\n+\t\t\t}\r\n+\t}# while array\r\n+\t#print $psl_line_num." total psl lines\\n";\r\n+\treturn (\\%psl_hash);\r\n+}\r\n+\r\n+\r\n+\r\n+\r\n+sub fasta2fasta {\r\n+ my $input_file = shift @_;\r\n+ my $out_file = shift @_;\r\n+ \r\n+ open IN,"<$input_file " or die "File $input_file not found error here\\n";\r\n+ open TGT,">$out_file" or die "File $out_file not found error here\\n";\r\n+ my $c = 0;\r\n+ my @line = ();\r\n+ my $id =\'\';\r\n+ my $seq = \'\';\r\n+ my $processed = 0;\r\n+ \r\n+ while(<IN>){\r\n+ chomp;\r\n+ if(m/^\\>/){\r\n+ \t$processed ++;\r\n+ \tif($processed == 1){\r\n+ \t\tmy @seq_id = split(/\\s+/, $_);\r\n+ \t\tmy $seq_id = $seq_id[0];\r\n+ \t\t $seq_id =~ s/\\_//g; \t\t\r\n+ \t\t $seq_id = $seq_id.\'_1\';\r\n+ \t\t print TGT \'>\'.$seq_id,"\\n";\r\n+ \t}\r\n+ \telse{\r\n+ \t\t$seq =~ s/\\s|\\n|\\r//g;\r\n+ \t\tprint TGT $seq,"\\n";\r\n+ \t\t$seq = \'\';\r\n+ \t\t my @seq_id = split(/\\s+/, $_);\r\n+ \t\tmy $seq_id = $seq_id[0];\r\n+ \t\t $seq_id =~ s/\\_//g; \t\t\r\n+ \t\t $seq_id = $seq_id.\'_1\';\r\n+ \t\t print TGT \'>\'.$seq_id,"\\n";\r\n+ \t}\r\n+ \t\r\n+ }\r\n+ else{\r\n+ \t$seq = $seq.$_;\r\n+ }\r\n+ }\r\n+ # last records\r\n+ $seq =~ s/\\s|\\n|\\r//g;\r\n+ print TGT $seq,"\\n";\r\n+ \r\n+ close IN;\r\n+ close TGT;\r\n+ return $processed;\r\n+}\r\n+ \r\n+\t\r\n+\t\r\n+sub fastq2fasta {\r\n+ my $input_file = shift @_;\r\n+ my $out_file = shift @_;\r\n+ \r\n+ open IN,"<$input_file " or die "File $input_file not found error here\\n";\r\n+ open TGT,">$out_file" or die "File $out_file not found error here\\n";\r\n+ my $c = 0;\r\n+ my @line = ();\r\n+ my $id =\'\';\r\n+ my $seq = \'\';\r\n+ my $processed = 0;\r\n+ my %read_hash = ();\r\n+ \r\n+ ##############################################################################################\r\n+\r\n+ while(<IN>){\r\n+ chomp;\r\n+ $c++;\r\n+ if($c == 1){\r\n+ $processed++;\r\n+ # print STDERR "$processed reads processed\\r";\r\n+ @line = split();\r\n+ $id = $line[0];\r\n+ $id =~ s/\\@//;\r\n+ }elsif($c == 2){\r\n+ $seq = $_;\r\n+ }elsif($c == 4){\r\n+ $c = 0;\r\n+ #print TGT ">$id\\n$seq\\n";\r\n+ # print TGT ">seq_$processed\\n$seq\\n";\r\n+ $read_hash{$seq}{$id} = 1;\r\n+ $id =\'\';\r\n+ @line =();\r\n+ $seq =\'\';\r\n+ }else{}\r\n+ }\r\n+ \r\n+ ## write TGT\r\n+ my $count = 0;\r\n+ for my $seq_out (sort keys %read_hash){\r\n+ \t$count++;\r\n+ \tmy @hits = keys %{$read_hash{$seq_out}};\r\n+ \tmy $num = @hits+0;\r\n+ \tmy $id_out = "R".$count.\'_\'.$num;\r\n+ \tprint TGT ">$id_out\\n", $seq_out ,"\\n"; \t \r\n+ }\r\n+ \r\n+ close IN;\r\n+ close TGT;\r\n+ return $processed;\r\n+}\r\n+\r\n+\r\n+sub check_read_num{\r\n+\t my $file_in = shift @_;\r\n+\t # check read number\r\n+\t \r\n+ open FAS_in,"$file_in" || die "can not read fasta file\\n";\r\n+ my $seq_num = 0;\r\n+ while (<FAS_in>){\r\n+\t \tif( m/^\\>/){\r\n+\t \t\t $seq_num ++;\r\n+\t \t}\r\n+\t \tif( $seq_num>1){\r\n+\t \t\t last;\r\n+\t \t}\r\n+ }\r\n+ \r\n+ return $seq_num;\r\n+\t\r\n+}\r\n+\r\n+\r\n+\r\n+sub check_file_type {\r\n+\t my $file_in = shift @_;\r\n+\t my $out = \'no\';\r\n+\t\r\n+\t open FAS_in, "$file_in" || die "can not read fasta file\\n";\r\n+ \r\n+ while (my $line = <FAS_in>){\r\n+ \t if($line =~ m/\\w/){\r\n+ \t \tif($line =~ m/^\\>/){\r\n+ \t \t\t$out = \'fa\';\r\n+ \t \t\tlast;\r\n+ \t \t}\r\n+ \t \tif($line =~ m/^\\@/){\r\n+ \t \t\t$out = \'fq\';\r\n+ \t \t\tlast;\r\n+ \t \t}\r\n+ \t }\r\n+ }\r\n+ \r\n+ return $out;\r\n+}\r\n+\r\n+\r\n+sub rev_com {\r\n+ my $seq = shift @_;\r\n+ $seq = reverse($seq);\r\n+ $seq =~ tr/ACGT/TGCA/;\r\n+ return($seq);\r\n+}\r\n+\t\r\n+\r\n' |