comparison AGEseq_web.pl @ 0:a9c5e846dd76 draft

Perl code
author lxue
date Fri, 15 May 2015 16:37:02 -0400
parents
children 449c8cf8fa3f
comparison
equal deleted inserted replaced
-1:000000000000 0:a9c5e846dd76
1 #!/usr/bin/perl
2 use strict;
3 my $file_design = $ARGV[0];
4 my $read_file = $ARGV[1];
5 my $mismatch_cutoff = $ARGV[2]; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
6 my $min_cutoff = $ARGV[3]; # cutoff to filter reads with low abundance, default 0
7 my $wt_like_report = $ARGV[4]; # report top xx WT like records, default 20
8 my $indel_report = $ARGV[5]; # report top xx records with indel, default 50
9 my $final_out = $ARGV[6];
10
11 #my $blat = ''; # working directory or PATH
12 my $blat = '/usr/local/bin/blat'; # Your prefered full location
13
14 my $rand = rand 1000000;
15 ## setting for reports
16 if(not defined ($mismatch_cutoff )){
17 $mismatch_cutoff = 0.1 ; # mismatch rate to filter low quality alignment, default 0.1 (10 %)
18 }
19
20 if(not defined ($min_cutoff )){
21 $min_cutoff = 0 ; # cutoff to filter reads with low abundance, default 0
22 }
23
24 if(not defined ($wt_like_report )){
25 $wt_like_report = 20 ; # report top xx WT like records, default 20
26 }
27
28 if(not defined ($indel_report )){
29 $indel_report = 50 ; # report top xx records with indel, default 50
30 }
31
32
33 my $remove_files = 1 ; # keep (0) or delete (1) intermediate files, default = 1
34
35 ###############################################################
36 # default setting
37
38 my $remove = 'rm';
39 my $osname = $^O;
40
41 my $design_fas = '/tmp/fasta'.$rand.'_DESIGN.fa';
42 my @psl_file_data = ();
43
44 if(not defined ($file_design )){
45 die "Design file is needed\n";
46 }
47
48 if(not defined ($final_out )){
49 $final_out = 'AGE_output.txt';
50 }
51 ##########################################################
52 ### step 1 load design file
53
54 open DESIGN,"<$file_design" or die "File $file_design not found error here\n";
55 open DESIGNFAS,">$design_fas" or die "can not wirte $design_fas not found error here\n";
56
57 my $num = 0;
58 my %design_hash = ();
59
60
61 while (my $line = <DESIGN>) {
62 $num ++;
63 if($num == 1){
64 next;
65 }
66 if($line !~ m/\w/){
67 next;
68 }
69
70 chomp $line;
71 my @temp = split(/\t/,$line);
72 my $id = $temp[0];
73 my $seq = $temp[1];
74 $seq =~ s/\s+//g;
75 $seq =~ m/([^\r\n]+)/;
76 $seq = $1;
77 $id =~ s/\s+/\_/g;
78
79 $design_hash{$id} = $seq;
80 print DESIGNFAS ">$id\n$seq\n";
81 }
82 close(DESIGNFAS);
83
84
85 #######################################################
86 ############## step 2 - load read files #############
87
88 my %total_read_num = ();
89 my $num_c = 0;
90 my @fasta_files = ();
91
92 my $file_in = $read_file;
93 my $fasta_out = '/tmp/fasta'.$rand.'_Read.fa';
94
95
96 my $type = check_file_type($file_in ) ;
97
98 if($type eq 'fq'){
99 # fastq
100 # print "this is fastq file\n";
101 my $total_num = &fastq2fasta($file_in,$fasta_out);
102 $total_read_num{$fasta_out} = $total_num;
103 }
104 elsif($type eq 'fa'){
105 # fasta
106 # print "this is fasta file\n";
107 my $total_num = &fasta2fasta($file_in,$fasta_out);
108 # load number
109 $total_read_num{$fasta_out} = $total_num;
110 }
111 else{
112 die "Please check the format of read file\n";
113 }
114
115
116 ########################################################
117 ## Here will put step3 to step 5 into one loop
118
119
120 open (REPORT,">$final_out") || die "cannot write\n";
121
122
123 my $file_blat_in = $fasta_out;
124
125 ############ step 3 - blat ###########################
126 my $blat_out = '/tmp/fasta'.$rand.'_blat_crispr.psl';
127
128 my $command_line = $blat.' '.' -tileSize=7 -oneOff=1 -maxGap=20 -minIdentity=70 -minScore=20 '.$file_blat_in.' '. $design_fas.' '.$blat_out .' >/tmp/blat.txt';
129
130 system("$command_line");
131
132 #print "blat job $file_blat_in is done\n";
133 ########## step 4 - convert psl -> bed ##############
134
135 my $bed_hash_address = &psl2bed ($blat_out ); ## address of one hash
136
137 ########## step 5 - get sequences number from bed ##############
138
139 my $read_num_address = &MAIN ($file_blat_in,$bed_hash_address);## address of one hash
140
141
142 ## remove files
143
144 if($remove_files == 1){
145 $command_line = $remove.' '.$file_blat_in;
146 system("$command_line");
147
148 $command_line = $remove.' '.$blat_out ;
149 system("$command_line");
150
151 my $command_line2 = $remove.' '.$design_fas ;
152 system("$command_line2");
153 }
154
155
156 #########################################################################################
157 ## functions
158
159
160 sub MAIN{
161 my $fas_file_in = shift @_;
162 my $bed_file_address_in = shift @_;
163
164 my %bed_hash_in = %{$bed_file_address_in};
165
166
167 #########################################################
168 # read convereted reads fasta file
169
170 open (FAS,"$fas_file_in ")||die "can not open fasta file: $fas_file_in ";
171
172 my %query_with_seq2num = ();
173 my %seq2alignment = ();
174 my %seq_part2assignment = ();
175
176 while (my $lineIn =<FAS>){
177 chomp $lineIn;
178 if($lineIn =~ m/^\>/){
179 my $id = substr $lineIn,1;
180
181 my @numbers = split(/\_/, $id);
182 my $total_num = $numbers[-1];
183
184 $lineIn = <FAS>;
185 chomp $lineIn;
186 my $seq = $lineIn;
187
188 if(exists $bed_hash_in{$id}){
189 # $psl_hash{$read_target}{$query} # format of hash
190 ## asign read to target reference
191
192 ##########################################################
193 my @read_asignment = ();
194 ## loop for best
195 for my $read_key (keys %{$bed_hash_in{$id}}){
196 ### get annotation from array
197 ## Q: design T: read
198 my @get_array = @{$bed_hash_in{$id}{$read_key}};
199 my ($strand, $q_name,$q_size,$q_start,$q_end,
200 $t_name,$t_size,$t_start,$t_end,
201 $block_num,$block_size,$q_start_s,$t_start_s) = @get_array ;
202
203 my $seq_out = substr $seq,$t_start,($t_end-$t_start);
204
205 if($strand eq '-'){
206 $seq_out = &rev_com ($seq_out);
207 }
208 ## get orignal
209 my $ref_ori_seq = '';
210 if(exists $design_hash{$q_name}){
211 $ref_ori_seq = $design_hash{$q_name};
212 }
213
214
215 my $edit_note = &get_editing_note(\@get_array,$ref_ori_seq, $seq_out,$seq);
216 # target # part of read
217 my ($align_ref,$align_read,$note) = split(/\t/, $edit_note);
218
219 my $len_ori = length($ref_ori_seq);
220 my @align_1 = split(//, $align_ref);
221 my @align_2 = split(//, $align_read);
222 my $site_snp = 0;
223 my @site_snp = ();
224 my $iden_num = 0;
225
226 for my $a1(@align_1){
227 my $a2 = shift @align_2;
228 if($a1 ne '-' and $a2 ne '-'){
229 if($a1 ne $a2){
230 push @site_snp, $site_snp;
231 }
232 }
233
234 if($a1 eq $a2){
235 $iden_num++;
236 }
237
238 $site_snp++;
239 }
240 $iden_num = $iden_num-2;
241
242 ## filter on SNP number
243 my $snp_num_for_filter = @site_snp+0;
244
245 my $dis_max = 0;
246 my $dis_current = 0;
247 my @continuous_site_temp = ();
248 my @continuous_site = ();
249 my @site_snp_checking = @site_snp;
250 my $start = shift @site_snp_checking ;
251 for my $site_checking (@site_snp_checking){
252 my $dis_in = $site_checking -$start;
253 if($dis_in <4){
254 $dis_current++;
255 push @continuous_site_temp,$site_checking ;
256 if($dis_current > $dis_max){
257 $dis_max = $dis_current;
258 @continuous_site = @continuous_site_temp;
259 }
260 }
261 else{
262 $dis_current = 0;
263 @continuous_site_temp = ();
264
265 }
266
267 $start = $site_checking;
268 }
269
270
271 if( ($snp_num_for_filter/$len_ori) > 0.5){ # first filtering
272 if($dis_max <5){
273 next;
274 }
275 }
276
277
278 if($dis_max>=5){
279 @site_snp = ();
280 my @align_x = split(//, $align_read);
281 for my $iii(@continuous_site ){
282 $align_x[$iii] = 'N';
283 }
284 $align_read = join('',@align_x);
285 $note = 'strange editing';
286 my @ttt = ($align_ref,$align_read,$note) ;
287
288 $edit_note = join("\t", @ttt );
289 }
290
291
292
293
294 my @ooo = ($iden_num,$ref_ori_seq,$seq_out,$edit_note,\@site_snp,\@get_array);
295 push @read_asignment,[@ooo];
296 }## for loop for best hit of each fasta reads
297
298 ## get the best hits
299
300 if(@read_asignment<1){
301 next;
302 }
303
304 ## assign the first hit
305 @read_asignment = sort {$b->[0] <=> $a->[0]}@read_asignment ;
306 my ($iden_num,$ref_ori_seq,$seq_out,$edit_note,$array_snp_addr,$bed_array_addr) = @{$read_asignment[0]};
307
308 my @snp_filtering = @{$array_snp_addr};
309
310 if((@snp_filtering+0)/length($ref_ori_seq) >$mismatch_cutoff){
311 next;
312 }
313
314
315 ## after asignment
316
317 ## check whether seq_out is assigned earlier
318 ## $seq_out has been asigned to one record
319 ## just assign reads to that one althogh it may be get one new assignment
320 if( exists $seq_part2assignment{$seq_out}){
321 my $first_assignment = $seq_part2assignment{$seq_out};
322 $query_with_seq2num{$first_assignment}{$seq_out} = $query_with_seq2num{$first_assignment}{$seq_out}+$total_num;
323 next;
324 }
325
326
327 # number for part of reads
328 my ($strand, $q_name,$q_size,$q_start,$q_end,
329 $t_name,$t_size,$t_start,$t_end,
330 $block_num,$block_size,$q_start_s,$t_start_s) = @{$bed_array_addr};
331
332 # this the first asignment
333 if(not exists $seq2alignment{$seq_out}){
334 my @out = ($ref_ori_seq,$edit_note,$array_snp_addr);
335 $seq2alignment{$seq_out} = \@out;
336
337 }
338 # assigned
339 if(not exists $seq_part2assignment{$seq_out}) {
340 $seq_part2assignment{$seq_out} = $q_name;
341 $query_with_seq2num{$q_name}{$seq_out} = $total_num;
342 }
343
344
345
346 }# exists bed file
347 }# lineIN
348 }# while file
349 close(FAS);
350
351 ### summary data for output
352 my ($hash_addr1, $hash_addr2) = &get_out_buffer($fas_file_in,\%query_with_seq2num, \%seq2alignment);
353
354 ### write output
355
356 &write_buffer_out($hash_addr1, $hash_addr2);
357
358 } # Main function
359
360
361
362
363 sub get_out_buffer{
364 my $fas_file_in = $_[0];
365 my %query_with_seq2num_in = %{$_[1]};
366 my %seq2alignment_in = %{$_[2]};
367
368 my %buffer_out = ();
369 my %buffer_num = ();
370
371 my $total_non_redun = 0;
372 ## push output in buffer before printing
373 my $test_case_num = $total_read_num{$fas_file_in};
374
375 for my $ref_name (sort keys %query_with_seq2num_in){
376 my %in_hash = %{$query_with_seq2num_in{$ref_name}};
377
378 ## each group
379 my $report_wt_count = 0;
380 my $report_indel_count = 0;
381 my $other_num = 0;
382 my $sub_hit_num = 0;
383 my $indel_hit_num = 0;
384
385
386 my @other_record = ();
387 ### loop for records
388 for my $part_of_read (sort {$in_hash{$b} <=> $in_hash{$a}} keys %in_hash){
389
390 my $hitnum = $in_hash{$part_of_read};
391
392 if($hitnum < $min_cutoff) {
393 next;
394 }
395
396 if($hitnum/$test_case_num < 0.00005) {
397 next;
398 }
399
400 ## get Editing Note for output only
401
402 if(exists $seq2alignment_in{$part_of_read}){
403 my ($ref_ori_seq,$edit_note,$array_snp_addr)= @{$seq2alignment_in{$part_of_read}};
404 my ($align_ref,$align_read,$note) = split(/\t/,$edit_note);
405 my @snp_out = @{$array_snp_addr};
406 my $snp_num = @snp_out +0;
407 # deep sequencing SNP
408 if($test_case_num >500 and $hitnum <=2 and $snp_num/length($ref_ori_seq)>0.05){
409 next;
410 }
411
412 my $snp_note = '';
413 if($snp_num>0 ){
414 $snp_note = $snp_num.' SNP('.join(',',@snp_out).')';
415 }
416
417 $sub_hit_num = $sub_hit_num+$hitnum;
418 $total_non_redun = $total_non_redun + $hitnum;
419
420
421 # output here
422 if($ref_ori_seq ne $design_hash{$ref_name} ){
423 print "error in read assingment \n";
424 }
425
426
427 my @out_line = ($fas_file_in,$ref_name,$ref_ori_seq, $part_of_read,$hitnum, $align_ref,$align_read,$note,$snp_note);
428 ## indel
429 if($edit_note !~ m/no.+indel/ ){
430
431 if((($test_case_num >500 and $hitnum <=3) or ($hitnum /$test_case_num <0.001 ) ) and $edit_note =~ m/strange/){
432 next; # filter strange case in deep sequencing with few reads
433 }
434
435
436 $report_indel_count ++;
437 # total indel
438 $indel_hit_num = $indel_hit_num + $hitnum;
439
440 if($report_indel_count <= $indel_report ){
441 #print REPORT join("\t",@out_line),"\n";
442 ## in buffer
443 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
444 my @bbb = ();
445 push @bbb, [@out_line];
446 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
447 }
448 else{
449 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
450 }
451 }
452 else{
453 $other_num = $other_num+$hitnum;
454 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
455 }
456
457 }
458 else{
459 $report_wt_count ++;
460 if($report_wt_count <= $wt_like_report ){
461 #print REPORT join("\t",@out_line),"\n";
462 # in buffer
463 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
464 my @bbb = ();
465 push @bbb, [@out_line];
466 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
467 }
468 else{
469 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@out_line];
470 }
471 }
472 else{
473 $other_num = $other_num+$hitnum;
474 @other_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'others',$hitnum, '-', '-', '-', '-');
475 }
476
477 }# wt like report
478
479 } # if exists
480
481 }# for part of reads
482
483 ## other report
484 if($other_num>0){
485 $other_record[-5] = $other_num;
486 #print REPORT join("\t",@other_record),"\n";
487
488 if(not exists $buffer_out{$fas_file_in}{$ref_name}{"data"}){
489 my @bbb = ();
490 push @bbb, [@other_record];
491 $buffer_out{$fas_file_in}{$ref_name}{"data"} = \@bbb;
492 }
493 else{
494 push @{$buffer_out{$fas_file_in}{$ref_name}{"data"}},[@other_record];
495 }
496 }
497
498 #my @summary_record = ($fas_file_in,$ref_name,$ref_ori_seq, 'total_hit:'.$total_non_redun,'sub_hit:'.$sub_hit_num, 'indel_hit:'.$indel_hit_num);
499 ## total read number
500 my $total_num = $total_read_num{$fas_file_in};
501
502 # 0 1 2
503 # my @summary_record = ($fas_file_in,$ref_name,'Total Reads: '., 'Total Hits: '.$total_non_redun,'Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
504 my @summary_record = ('Sub Hits: '.$sub_hit_num, 'Indel Hits: '.$indel_hit_num);
505
506 $buffer_out{$fas_file_in}{$ref_name}{'sum'} = \@summary_record ;
507 $buffer_num{$fas_file_in}{'total'} = $total_num ;
508 $buffer_num{$fas_file_in}{'sub'} = $total_non_redun ;
509 #print REPORT join("\t",@summary_record),"\n";
510
511 }# for ref_seq
512
513
514 return (\%buffer_out,\%buffer_num);
515 } # sub
516
517
518 ### write the output
519
520 sub write_buffer_out{
521 my %hash_out = %{$_[0]};
522 my %hash_out_num = %{$_[1]};
523 my $print_tracking = 0;
524 for my $fas_file_in (sort keys %hash_out){
525
526 my $total_num = $hash_out_num{$fas_file_in}{'total'} ;
527 my $total_non_redun = $hash_out_num{$fas_file_in}{'sub'} ;
528
529 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
530 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
531
532 for (@data){
533 $print_tracking++;
534 if($print_tracking==1){
535 print REPORT "INPUT","\t","Target\t","TargetSequence\t","ReadSequence\t","Read#","\t",
536 "AlignedTarget\t","AlignedRead\t", "Indels\t","SNPs\n";
537 }
538 my @ooo = @{$_};
539 $ooo[0] = 'Input1';
540 print REPORT join("\t", @ooo),"\n";
541 }
542
543 # sum
544 my @sum_p2 = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
545
546
547 my @sum_p1 = ('Input1',$ref_name,'Total Reads: '.$total_num, 'Total Hits: '.$total_non_redun);
548
549 print REPORT join("\t", @sum_p1),"\t", join("\t", @sum_p2),"\n";
550
551 }
552
553
554 ##### print summary region
555 print REPORT "Summary \t -----------------------------------------------------\n";
556 print REPORT "Sum:INPUT\t","Target\t","AlignedTarget\t","AlignedRead\t","Total Hits\t","Sub Hits\t","Indel or WT Hits\t","Indel or WT rate %\t","Pattern\n";
557
558
559
560 for my $ref_name (sort keys %{$hash_out{$fas_file_in }}){
561 my @data = @{$hash_out{$fas_file_in}{$ref_name}{"data"}};
562
563 my $wt_pair = ''."\t".'';
564 my $indel_pair = ''."\t".'';
565 my $indel_out = '';
566 my $i = 0;
567
568 for (@data){
569 $i++;
570 my @in= @{$_};
571
572
573 if($in[7] !~ m/no\_indel/ and $in[7] ne '-'){
574 if($indel_pair eq ''."\t".''){
575 $indel_pair = $in[5]."\t".$in[6];
576 my @indel_temp = split(/\s+/,$in[7]);
577
578 my @ooo_indel = ();
579 for my $ooo_in (@indel_temp ){
580 if($ooo_in =~ m/I(\d+)/){
581 push @ooo_indel, '+'.$1;
582 }
583 if($ooo_in =~ m/D(\d+)/){
584 push @ooo_indel, '-'.$1;
585 }
586 if($ooo_in =~ m/strange/){
587 push @ooo_indel, 'strange editing';
588 }
589 }
590 $indel_out = 'Indel:'.join(',', @ooo_indel);
591 }
592 }
593 else{
594 # no indel
595 if($wt_pair eq ''."\t".''){
596 $wt_pair = $in[5]."\t".$in[6];
597 }
598 }
599 }
600
601 # first record
602
603
604
605 # sum
606 my ($sub_string, $indel_string) = @{$hash_out{$fas_file_in}{$ref_name}{'sum'}} ;
607 my $sub_num ;
608 my $indel_num;
609 #print $sub_string. "sub summary\n";
610 #print $indel_string ."sub summary\n";
611
612
613 if($sub_string =~ m/\:[^\d]+(\d+)/){
614 $sub_num = $1;
615 }
616 if($indel_string =~ m/\:[^\d]+(\d+)/){
617 $indel_num = $1;
618 }
619
620 my $rate = int(($indel_num/$sub_num)*10000)/100;
621
622 ## indel
623 # two columns
624 my @out_sum = ("Input1",$ref_name, $indel_pair,$total_non_redun,$sub_num,$indel_num,$rate,$indel_out);
625 print REPORT 'Sum:',join("\t", @out_sum),"\n";
626 ## wt like
627 my $wt_num = $sub_num - $indel_num;
628 my $wt_rate = 100-$rate;
629 my @out_sum = ("Input1",$ref_name, $wt_pair ,$total_non_redun,$sub_num,$wt_num,$wt_rate,"WT_like");
630 print REPORT 'Sum:',join("\t", @out_sum),"\n";
631
632 } # records
633
634 print REPORT "\n";
635
636 } # file
637 } # sub
638 ###################################################################
639
640
641
642 sub get_editing_note{
643 my ($address_in,$query_seq, $target_seq ,$full_read) = @_;
644 # target: part of read
645 # query : original sequence in design file
646
647 my ($strand, $q_name,$q_size,$q_start,$q_end,
648 $t_name,$t_size,$t_start,$t_end,
649 $block_num,$block_size,$q_start_s,$t_start_s) = @{$address_in};
650
651 my @out = ();
652 #print $strand ,"strand\n";
653 if($strand eq '+'){
654 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,$query_seq, $full_read);
655 }
656 else{
657 @out = get_alignment($block_num,$block_size,$q_start_s,$t_start_s,rev_com($query_seq), $full_read);
658 @out = get_reverse(@out);
659 }
660
661 $out[0]='B'.$out[0].'E';
662 $out[1]='B'.$out[1].'E';
663 if($out[2] !~ m/\w/){
664 $out[2] = 'no_indel';
665 }
666 return join("\t",@out);
667 }
668
669
670
671 sub get_alignment{
672 my ($blocks , $blockLengths, $qStarts , $tStarts,$query_seq, $target_seq) = @_;
673 my @blockSizes = split(/\,/,$blockLengths);
674 my @qArray = split(/\,/,$qStarts);
675 my @tArray = split(/\,/,$tStarts);
676
677 # target: part of read
678 # query : original sequence in design file
679
680 # print join("\t",@_),"\n";
681
682 ### before blocks
683 my $before_q = substr $query_seq, 0,$qArray[0];
684 my $before_t = substr $target_seq,0,$tArray[0];
685
686
687 ($before_q,$before_t) = treat_sequence($before_q,$before_t,"before");
688
689 ### after blocks
690 my $after_q = substr $query_seq, $qArray[-1]+$blockSizes[-1];
691 my $after_t = substr $target_seq,$tArray[-1]+$blockSizes[-1];
692
693
694 ($after_q,$after_t) = treat_sequence($after_q,$after_t,"after") ;
695
696 ### blocks
697 my @med_q = ();
698 my @med_t = ();
699
700 my @indel_out = ();
701 my $out = '';
702 ## first block
703 my $med_q_seq = substr $query_seq,$qArray[0],$blockSizes[0];
704 my $med_t_seq = substr $target_seq,$tArray[0],$blockSizes[0];
705 push @med_q,$med_q_seq;
706 push @med_t,$med_t_seq;
707
708 if($blocks>1){
709 for (my $i= 0;$i<($blocks-1);$i++){
710 ####### interval
711 my $inter_q_seq = substr $query_seq,($qArray[$i]+$blockSizes[$i]), ($qArray[$i+1]-($qArray[$i]+$blockSizes[$i]));
712 my $inter_t_seq = substr $target_seq,($tArray[$i]+$blockSizes[$i]),($tArray[$i+1]-($tArray[$i]+$blockSizes[$i]));
713
714 ### count deletion insertion
715 if(length($inter_t_seq) - length($inter_q_seq)>0){
716 # to fix the mismatch near deletion length($inter_q_seq)
717 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_q_seq)).'I'.(length($inter_t_seq) - length($inter_q_seq));
718 }
719 if(length($inter_q_seq)-length($inter_t_seq)>0){
720 $out = ($qArray[$i]+$blockSizes[$i]+length($inter_t_seq)).'D'.(length($inter_q_seq)-length($inter_t_seq));
721 }
722 push @indel_out,$out;
723
724 ($inter_q_seq,$inter_t_seq) = treat_inter ($inter_q_seq,$inter_t_seq) ;
725 push @med_q, $inter_q_seq;
726 push @med_t, $inter_t_seq;
727
728 ###### block after interval
729 $med_q_seq = substr $query_seq,$qArray[$i+1],$blockSizes[$i+1];
730 $med_t_seq = substr $target_seq,$tArray[$i+1],$blockSizes[$i+1];
731 push @med_q,$med_q_seq;
732 push @med_t,$med_t_seq;
733 }
734 }
735
736 push @med_q,$after_q;
737 push @med_t,$after_t;
738
739 unshift @med_q,$before_q;
740 unshift @med_t,$before_t;
741 my $q_string = join('',@med_q);
742 my $t_string = join('',@med_t);
743
744 if($q_string=~m/^(\-+)/){
745 my $len = length($1);
746 $q_string = substr $q_string,$len;
747 $t_string = substr $t_string, $len;
748 }
749 if($q_string=~m/(\-+)$/){
750 my $len = length($1);
751 $q_string = substr $q_string,0,(-1)*$len;
752 $t_string = substr $t_string,0, (-1)*$len;
753 }
754
755
756 return ($q_string,$t_string,join(' ',@indel_out));
757
758 }
759
760
761 sub get_reverse{
762 my ($q_string,$t_string,$indel_string )= @_;
763 $q_string = rev_com($q_string);
764 $t_string = rev_com($t_string);
765
766 my $string_test = $q_string;
767 $string_test =~ s/\-//g;
768 my $len = length($string_test);
769
770 my @indel_in = split(/\s/, $indel_string);
771 my @indel_out = ();
772 for (@indel_in){
773
774 if(m/^(\d+)(.)(\d+)/){
775 my $ooo = ($len-$1-$3).$2.$3;
776 if($2 eq 'I'){
777 $ooo = ($len-$1).$2.$3;
778 }
779
780 unshift @indel_out,$ooo;
781 }
782 }
783 $indel_string = join(' ',@indel_out);
784 return($q_string,$t_string, $indel_string);
785 }
786
787
788
789
790
791 sub treat_inter{
792 my ($q,$t) = @_;
793 my $q_len = length($q);
794 my $t_len = length($t);
795 my $dis = abs($q_len-$t_len);
796 my @oooo = ();
797
798 for(1..$dis){
799 push @oooo,'-';
800 }
801
802 if($q_len-$t_len >0){
803 $t = $t.join('',@oooo);
804 }
805 else{
806 $q = $q.join('',@oooo);
807 }
808 return ($q,$t);
809 }
810
811
812
813 sub treat_sequence{
814 my ($q,$t,$position) = @_;
815 my $q_len= length($q);
816 my $t_len = length($t);
817 my $dis = abs($q_len-$t_len);
818
819 if($dis > 0){
820 my @oooo = ();
821 for(1..$dis){
822 push @oooo,'-';
823 }
824 if($q_len>$t_len){
825 if($position eq 'before'){
826 $t = join('',@oooo).$t ;
827 }
828 else{
829 $t = $t.join('',@oooo) ;
830 }
831 }
832 else{ # q small
833 if($position eq 'before'){
834 $q = join('',@oooo).$q ;
835 }
836 else{
837 $q = $q.join('',@oooo) ;
838 }
839 }
840 }
841 return ($q,$t);
842 }
843
844
845 sub psl2bed {
846 my $psl_file = shift @_; # file in @psl_file_data
847 my @psl_array = ();
848
849
850 open (FILEINBLAT,"$psl_file") || die "Can not open PSL file, Please check BLAT software is working\n";
851
852 ## should be correct
853
854 for(1..5){
855 my $line=<FILEINBLAT>;
856 ## remove head
857 }
858 ## read psl
859 while (<FILEINBLAT>)
860 {
861 my $lineIn = $_;
862 chomp $lineIn;
863 my @lineArr = split ("\t", $lineIn);
864
865 my $q_size = $lineArr[10];
866 my $m_size = $lineArr[0];
867
868 if( $m_size/$q_size > 0.20){
869 push @psl_array,[@lineArr];
870 }
871
872 }# while file
873 close(FILEINBLAT);
874 ## score large to small
875
876 #@psl_array = sort {$b->[0] <=>$a->[0]}@psl_array ;
877
878 my @out_array = ();
879 my $psl_line_num = 0;
880 my %psl_hash = ();
881 for (@psl_array)
882 {
883 my @lineArr = @{$_};
884 $psl_line_num++;
885 my $read_target = $lineArr[13];
886 my $query = $lineArr[9];
887 for(1..8){
888 shift @lineArr;
889 }
890
891 if(not exists $psl_hash{$read_target}{$query}){ ## all alignment
892 #if(not exists $psl_hash{$read_target}){ ## only keep the best hit
893 $psl_hash{$read_target}{$query} = \@lineArr;
894 }
895 }# while array
896 #print $psl_line_num." total psl lines\n";
897 return (\%psl_hash);
898 }
899
900
901
902
903 sub fasta2fasta {
904 my $input_file = shift @_;
905 my $out_file = shift @_;
906
907 open IN,"<$input_file " or die "File $input_file not found error here\n";
908 open TGT,">$out_file" or die "File $out_file not found error here\n";
909 my $c = 0;
910 my @line = ();
911 my $id ='';
912 my $seq = '';
913 my $processed = 0;
914
915 while(<IN>){
916 chomp;
917 if(m/^\>/){
918 $processed ++;
919 if($processed == 1){
920 my @seq_id = split(/\s+/, $_);
921 my $seq_id = $seq_id[0];
922 $seq_id =~ s/\_//g;
923 $seq_id = $seq_id.'_1';
924 print TGT '>'.$seq_id,"\n";
925 }
926 else{
927 $seq =~ s/\s|\n|\r//g;
928 print TGT $seq,"\n";
929 $seq = '';
930 my @seq_id = split(/\s+/, $_);
931 my $seq_id = $seq_id[0];
932 $seq_id =~ s/\_//g;
933 $seq_id = $seq_id.'_1';
934 print TGT '>'.$seq_id,"\n";
935 }
936
937 }
938 else{
939 $seq = $seq.$_;
940 }
941 }
942 # last records
943 $seq =~ s/\s|\n|\r//g;
944 print TGT $seq,"\n";
945
946 close IN;
947 close TGT;
948 return $processed;
949 }
950
951
952
953 sub fastq2fasta {
954 my $input_file = shift @_;
955 my $out_file = shift @_;
956
957 open IN,"<$input_file " or die "File $input_file not found error here\n";
958 open TGT,">$out_file" or die "File $out_file not found error here\n";
959 my $c = 0;
960 my @line = ();
961 my $id ='';
962 my $seq = '';
963 my $processed = 0;
964 my %read_hash = ();
965
966 ##############################################################################################
967
968 while(<IN>){
969 chomp;
970 $c++;
971 if($c == 1){
972 $processed++;
973 # print STDERR "$processed reads processed\r";
974 @line = split();
975 $id = $line[0];
976 $id =~ s/\@//;
977 }elsif($c == 2){
978 $seq = $_;
979 }elsif($c == 4){
980 $c = 0;
981 #print TGT ">$id\n$seq\n";
982 # print TGT ">seq_$processed\n$seq\n";
983 $read_hash{$seq}{$id} = 1;
984 $id ='';
985 @line =();
986 $seq ='';
987 }else{}
988 }
989
990 ## write TGT
991 my $count = 0;
992 for my $seq_out (sort keys %read_hash){
993 $count++;
994 my @hits = keys %{$read_hash{$seq_out}};
995 my $num = @hits+0;
996 my $id_out = "R".$count.'_'.$num;
997 print TGT ">$id_out\n", $seq_out ,"\n";
998 }
999
1000 close IN;
1001 close TGT;
1002 return $processed;
1003 }
1004
1005
1006 sub check_read_num{
1007 my $file_in = shift @_;
1008 # check read number
1009
1010 open FAS_in,"$file_in" || die "can not read fasta file\n";
1011 my $seq_num = 0;
1012 while (<FAS_in>){
1013 if( m/^\>/){
1014 $seq_num ++;
1015 }
1016 if( $seq_num>1){
1017 last;
1018 }
1019 }
1020
1021 return $seq_num;
1022
1023 }
1024
1025
1026
1027 sub check_file_type {
1028 my $file_in = shift @_;
1029 my $out = 'no';
1030
1031 open FAS_in, "$file_in" || die "can not read fasta file\n";
1032
1033 while (my $line = <FAS_in>){
1034 if($line =~ m/\w/){
1035 if($line =~ m/^\>/){
1036 $out = 'fa';
1037 last;
1038 }
1039 if($line =~ m/^\@/){
1040 $out = 'fq';
1041 last;
1042 }
1043 }
1044 }
1045
1046 return $out;
1047 }
1048
1049
1050 sub rev_com {
1051 my $seq = shift @_;
1052 $seq = reverse($seq);
1053 $seq =~ tr/ACGT/TGCA/;
1054 return($seq);
1055 }
1056
1057