Mercurial > repos > devteam > pileup_parser
annotate pileup_parser.pl @ 2:85bedbea8a12 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/pileup_parser commit ab627176cd4f6efe6d1fe4b85baa679aaa651eb1
| author | devteam |
|---|---|
| date | Wed, 05 Oct 2016 06:30:36 -0400 |
| parents | ff1ba9b75337 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #! /usr/bin/perl -w |
| 2 | |
| 3 use strict; | |
| 4 use POSIX; | |
| 5 | |
| 6 | |
| 7 die "Usage: pileup_parser.pl <in_file> <ref_base_column> <read_bases_column> <base_quality_column> <coverage column> <qv cutoff> <coverage cutoff> <SNPs only?> <output bed?> <coord_column> <out_file> <total_diff> <print_qual_bases>\n" unless @ARGV == 13; | |
| 8 | |
| 9 my $in_file = $ARGV[0]; | |
| 10 my $ref_base_column = $ARGV[1]-1; # 1 based | |
| 11 my $read_bases_column = $ARGV[2]-1; # 1 based | |
| 12 my $base_quality_column = $ARGV[3]-1; # 1 based | |
| 13 my $cvrg_column = $ARGV[4]-1; # 1 based | |
| 14 my $quality_cutoff = $ARGV[5]; # phred scale integer | |
| 15 my $cvrg_cutoff = $ARGV[6]; # unsigned integer | |
| 16 my $SNPs_only = $ARGV[7]; # set to "Yes" to print only positions with SNPs; set to "No" to pring everything | |
| 17 my $bed = $ARGV[8]; #set to "Yes" to convert coordinates to bed format (0-based start, 1-based end); set to "No" to leave as is | |
| 18 my $coord_column = $ARGV[9]-1; #1 based | |
| 19 my $out_file = $ARGV[10]; | |
| 20 my $total_diff = $ARGV[11]; # set to "Yes" to print total number of deviant based | |
| 21 my $print_qual_bases = $ARGV[12]; #set to "Yes" to print quality and read base columns | |
| 22 | |
| 23 my $invalid_line_counter = 0; | |
| 24 my $first_skipped_line = ""; | |
| 25 my %SNPs = ('A',0,'T',0,'C',0,'G',0); | |
| 26 my $above_qv_bases = 0; | |
| 27 my $SNPs_exist = 0; | |
| 28 my $out_string = ""; | |
| 29 my $diff_count = 0; | |
| 30 | |
| 31 open (IN, "<$in_file") or die "Cannot open $in_file $!\n"; | |
| 32 open (OUT, ">$out_file") or die "Cannot open $out_file $!\n"; | |
| 33 | |
| 34 while (<IN>) { | |
| 35 chop; | |
| 36 next if m/^\#/; | |
| 37 my @fields = split /\t/; | |
| 38 next if $fields[ $ref_base_column ] eq "*"; # skip indel lines | |
| 39 my $read_bases = $fields[ $read_bases_column ]; | |
|
2
85bedbea8a12
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/pileup_parser commit ab627176cd4f6efe6d1fe4b85baa679aaa651eb1
devteam
parents:
0
diff
changeset
|
40 die "Coverage column" . ($cvrg_column+1) . " contains non-numeric values. Check your input parameters as well as format of input dataset." if ( not $fields[ $cvrg_column ] =~ qr/^[[:digit:]]+$/x ); |
| 0 | 41 next if $fields[ $cvrg_column ] < $cvrg_cutoff; |
| 42 my $base_quality = $fields[ $base_quality_column ]; | |
| 43 if ($read_bases =~ m/[\$\^\+-]/) { | |
| 44 $read_bases =~ s/\^.//g; #removing the start of the read segement mark | |
| 45 $read_bases =~ s/\$//g; #removing end of the read segment mark | |
| 46 while ($read_bases =~ m/[\+-]{1}(\d+)/g) { | |
| 47 my $indel_len = $1; | |
| 48 $read_bases =~ s/[\+-]{1}$indel_len.{$indel_len}//; # remove indel info from read base field | |
| 49 } | |
| 50 } | |
| 51 if ( length($read_bases) != length($base_quality) ) { | |
| 52 $first_skipped_line = $. if $first_skipped_line eq ""; | |
| 53 ++$invalid_line_counter; | |
| 54 next; | |
| 55 } | |
| 56 # after removing read block and indel data the length of read_base | |
| 57 # field should identical to the length of base_quality field | |
| 58 | |
| 59 my @bases = split //, $read_bases; | |
| 60 my @qv = split //, $base_quality; | |
| 61 | |
| 62 for my $base ( 0 .. @bases - 1 ) { | |
| 63 if ( ord( $qv[ $base ] ) - 33 >= $quality_cutoff and $bases[ $base ] ne '*') | |
| 64 { | |
| 65 ++$above_qv_bases; | |
| 66 | |
| 67 if ( $bases[ $base ] =~ m/[ATGC]/i ) | |
| 68 { | |
| 69 $SNPs_exist = 1; | |
| 70 $SNPs{ uc( $bases[ $base ] ) } += 1; | |
| 71 $diff_count += 1; | |
| 72 } elsif ( $bases[ $base ] =~ m/[\.,]/ ) { | |
| 73 $SNPs{ uc( $fields[ $ref_base_column ] ) } += 1; | |
| 74 } | |
| 75 } | |
| 76 } | |
| 77 | |
| 78 if ($bed eq "Yes") { | |
| 79 my $start = $fields[ $coord_column ] - 1; | |
| 80 my $end = $fields[ $coord_column ]; | |
| 81 $fields[ $coord_column ] = "$start\t$end"; | |
| 82 } | |
| 83 | |
| 84 if ($print_qual_bases ne "Yes") { | |
| 85 $fields[ $base_quality_column ] = ""; | |
| 86 $fields[ $read_bases_column ] = ""; | |
| 87 } | |
| 88 | |
| 89 | |
| 90 $out_string = join("\t", @fields); # \t$read_bases\t$base_quality"; | |
| 91 foreach my $SNP (sort keys %SNPs) { | |
| 92 $out_string .= "\t$SNPs{$SNP}"; | |
| 93 } | |
| 94 | |
| 95 if ($total_diff eq "Yes") { | |
| 96 $out_string .= "\t$above_qv_bases\t$diff_count\n"; | |
| 97 } else { | |
| 98 $out_string .= "\t$above_qv_bases\n"; | |
| 99 } | |
| 100 | |
| 101 $out_string =~ s/\t+/\t/g; | |
| 102 | |
| 103 if ( $SNPs_only eq "Yes" ) { | |
| 104 print OUT $out_string if $SNPs_exist == 1; | |
| 105 } else { | |
| 106 print OUT $out_string; | |
| 107 } | |
| 108 | |
| 109 | |
| 110 %SNPs = (); | |
| 111 %SNPs = ('A',0,'T',0,'C',0,'G',0); | |
| 112 $above_qv_bases = 0; | |
| 113 $SNPs_exist = 0; | |
| 114 $diff_count = 0; | |
| 115 | |
| 116 | |
| 117 } | |
| 118 | |
| 119 print "Skipped $invalid_line_counter invalid line(s) beginning with line $first_skipped_line\n" if $invalid_line_counter > 0; | |
| 120 close IN; | |
| 121 close OUT; |
