annotate fasta_summary.pl @ 0:8d1c7f2a3f5c draft default tip

Uploaded
author aaronpetkau
date Sat, 04 Jul 2015 09:43:13 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
1 #!/usr/bin/perl
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
2
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
3 #==============================================================================================
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
4
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
5 # Script to output statistsics and histograms for reads and contigs/isotigs
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
6
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
7
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
8 # Outputs include:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
9 # Mean, N50, StdDev or reads or contig lengths,
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
10 # Mean and Modal read or contig lengths.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
11 # Number of reads or contigs > 1 kb in length
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
12 # Summed contig length (by number of contigs, in sorted order)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
13 # Histogram of read or contig lengths,
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
14 # Graph of sums of read-lengths
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
15 # File of reads or contigs sorted by read or contig length
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
16 # Test for mono/di-nucelotide repeats
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
17 # Randomly selected reads or contigs
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
18
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
19
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
20 # Needs gnuplot installed to create the histograms:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
21 # On Fedora/Redhat linux: sudo yum install gnuplot
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
22 # On Ubuntu/Debian: sudo apt-get install gnuplot
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
23
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
24 # Uses a linux pipe to call gnu-plot directly, rather than as a separate shell script.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
25
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
26 # Original written by Sujai Kumar, 2008-09-05 University of Edinburgh
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
27 # Modified by Stephen: 29-Apr-2009:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
28 # Last changed by Stephen: 9-Aug-2010
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
29
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
30
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
31 # Usage: fasta_summary.pl -i infile.fasta -o process_reads -t read OR contig OR isotig (to use 'read' or 'contig' or 'isotig' in the output table & graphs. Isotig is for 'runAssembly -cdna ...' output file '454Isotigs.fna') [-r 1 to indicate count simple nucleotide repeats] [-n number of random reads to output] [-c cutoff_length] [-l 1 to indicate output the longest read] [-f (s or t or w) for spacer, tab or wiki format table output.]
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
32
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
33 # Note: The parameters above in the [] are optional.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
34
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
35 # eg: fasta_summary.pl -i myfile.fasta -o process_reads -t read
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
36 # Where:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
37 # -i reads or contigs as input, in fasta format.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
38 # -o output_dir (created if it doesn't exist)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
39 # -t read, contig or isotig
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
40
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
41 # Gives back
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
42 # - N50
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
43 # - num of contigs > 1 kb
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
44 # - num of contigs
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
45 # - Read or Contig Histogram and graphs.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
46 # - Summed contig length (by number of contigs, in sorted order)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
47
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
48 #==============================================================================================
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
49
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
50
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
51 use strict;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
52 use warnings;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
53 use Getopt::Long;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
54
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
55 my $infile;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
56 my $output_dir;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
57 my $type='read'; # Defaults to 'read' at present
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
58 my $repeats=1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
59 my $num_random_reads_to_output=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
60 my $cutoff_length=-1; # -1 means won't check this cutoff
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
61 my $longest_read=-1; # -1 mean's don't output the sequence for the longest read.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
62 my $doCommify=1; # Outputs statistics numbers in format: 9,999,999
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
63 my $format="t"; # "s"=spaces between columns, "t"=tabs between columns, "w"=wiki '||' and '|'.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
64 my $bucket1=0; # For optional exact length histogram distribution as asked for by JH.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
65
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
66 if ($#ARGV==-1) {die "
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
67 Usage:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
68
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
69 fasta_summary.pl -i infile.fasta -o output_dir -t ( read | contig | isotig ) [ -r 0 ] [ -n num_reads ] [ -c cutoff_length ] [ -l 1 ] [ -d 0 ] [ -f (w | t ) ] [ -bucket1 ]
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
70
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
71 where:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
72
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
73 -i or -infile infile.fasta : input fatsa file of raeds, contigs or isotigs,
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
74
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
75 -o or -output_dir output_directory : directory to put output stats and graphs into.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
76
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
77 -t or -type (read or contig or isotig) : for displaying the graph title, where type is 'read' or 'contig' or 'isotig'.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
78
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
79 -r or -repeats 0 or 1 : 1=count number of reads that contain over 70% simple mono-nucleotide and di-nucleotide repeat bases; 0=don't count.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
80
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
81 -n or -number num_reads : For outputting specified number of randomly selected reads or contigs.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
82
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
83 -c or -cutoff cutoff_length : Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
84
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
85 -l or -longest 0 or 1 : 1=Output the longest read; 0= don't output the longest read
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
86
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
87 -d or -doCommify 0 or 1 : Output numbers formatted with commas to make easier to read: 0=no commas, default=1
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
88
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
89 -f or -format w or t : w=wiki_format (ie. table with || and | for column dividers), t=tabs between column symbols for the wiki pages, default is spaces between columns.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
90
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
91 -b or -bucket1 : To also output histogram file of exact read lengths (ie. bucket size of 1)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
92
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
93
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
94 eg: For 454 reads: fasta_summary.pl -i RawReads.fna -o read_stats -t read
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
95 For 454 isotigs: fasta_summary.pl -i 454Isotigs.fna -o isotig_stats -t isotig
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
96
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
97 ";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
98
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
99 GetOptions (
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
100 "infile=s" => \$infile,
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
101 "output_dir=s" => \$output_dir,
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
102 "type=s" => \$type, ## type is 'read' or 'contig' or 'isotig' - for displaying the graph title
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
103 "repeats=i" => \$repeats, # To count simple repeats
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
104 "number=i" => \$num_random_reads_to_output, # For outputting specified number of random reads
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
105 "cutoff=i" => \$cutoff_length, # Give a number of reads to do extra analysis (calculating again the number of reads and number of bases in reads above this length)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
106 "longest=i" => \$longest_read, # Output the longest read.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
107 "doCommify=i" => \$doCommify, # Output numbers formatted with commas to make easier to read: 0=no commas, default=1
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
108 "format=s" => \$format, # "w"=wiki_format (ie. table with || and | for column dividers), "t"=tabs between column symbols for the wiki pages, default is spaces between columns.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
109 "bucket1" => \$bucket1, # To also output histogram file of exact read lengths (ie. bucket size of 1)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
110 );
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
111 if ($#ARGV>-1) {die "Unused options specified: @ARGV\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
112 if ( (! defined $infile) || ($infile eq '') ) {die "\nPlease give input fasta file, preceeded by the '-i' option\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
113 if ( (! defined $output_dir) || ($output_dir eq '') ) {die "Please give output_directory, preceeded by the '-o' option\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
114 if ( (! defined $type) || (($type ne 'contig') && ($type ne 'read') && ($type ne 'isotig')) ) {die "ERROR: On commandline: -t type must be 'contig' or 'read' or 'isotig'\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
115
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
116
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
117 my ($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
118 if ($format eq 's') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('',' ','', '',' ','', "\n",'');}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
119 elsif ($format eq 't') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=("\t","\t",'', "","\t",'', "\n",'');} # There is correctly a tab for the $L, but not the $Lh.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
120 elsif ($format eq 'w') {($L,$M,$R, $Lh,$Mh,$Rh, $Lhnewline,$Mhnotab)=('| ',' | ',' |', '|| ',' || ',' ||', '|| ',' || ');}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
121 else {die "\nInvalid output format code: '$format'. Should be 's', 't' or 'w'.\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
122
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
123 ### create output_dir if it doesn't exist
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
124 if (-d $output_dir) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
125 print STDERR " Directory '$output_dir' exists, so the existing fasta_summary.pl output files will be overwritten\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
126 } else {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
127 mkdir $output_dir;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
128 print STDERR " Directory '$output_dir' created\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
129 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
130
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
131 my $gc_count = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
132
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
133 #--------------- Read in contigs from fasta file -------------------
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
134
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
135 open INFILE, "<$infile" or die "Failed to open file: '$infile' : $!";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
136 open STATS, ">$output_dir/stats.txt" or die "Failed to open $output_dir/stats.txt: '' : $!";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
137
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
138 my $header = <INFILE>;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
139 if (! defined($header) ) {print "\n** ERROR: First line of input fasta file is undefined - so file must be empty **\n\n"; print STATS "No sequences found\n"; exit 1;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
140 if ($header!~/^>/) {print "\nERROR: First line of input fasta file should start with '>' BUT first line is: '$header'\n"; print STATS "No sequences found\n"; exit 1;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
141
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
142 my $seq = "";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
143 my @sequences;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
144
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
145 while (<INFILE>) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
146 if (/^>/) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
147 push @sequences, [length($seq), $header, $seq];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
148 $header = $_;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
149 $seq = "";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
150 } else {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
151 chomp;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
152 $gc_count += tr/gcGC/gcGC/;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
153 $seq .= $_;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
154 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
155 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
156 push @sequences, [length($seq), $header, $seq];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
157 close INFILE;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
158 if ($#sequences==-1) {print "\nERROR: There are zero sequences in the input file: $infile\n\n"; print STATS "No sequences found\n"; exit 1;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
159
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
160
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
161 my $all_contigs_length=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
162 foreach (@sequences) {$all_contigs_length += $_->[0];}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
163 if ($all_contigs_length==0) {print "\nERROR: Length of all contigs is zero\n\n"; exit 2;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
164
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
165 # Find number and number of bases in reads greater than the optional cut-off length given at command-line.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
166 my $num_reads_above_cutoff=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
167 my $num_of_bases_in_reads_above_cutoff=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
168 if ($cutoff_length>0)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
169 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
170 foreach (@sequences)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
171 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
172 if ($_->[0]>=$cutoff_length) {$num_of_bases_in_reads_above_cutoff+= $_->[0]; $num_reads_above_cutoff++;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
173 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
174 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
175
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
176
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
177 #--------------- Gather Plots Data, Find N50, Print sorted contig file -------------------
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
178
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
179 my $summed_contig_length = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
180 my @summed_contig_data; # <-- For graph of summed length (in number of bases) versus number of contigs.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
181 my @summed_contig_data_contigLens; # <-- Added by SJBridgett to get graph of summed contig length versus min. contig length included (ie. X-axis is sort of inverse of above)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
182
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
183 my $contig1k_count = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
184 my $contig1k_length = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
185
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
186 open SORTED, ">$output_dir/sorted_contigs.fa" or die $!;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
187
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
188 # top row in stats file
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
189 #print STATS "N50\nMax contig size\nNumber of bases in contigs\nNumber of contigs\nNumber of contigs >=1kb\nNumber of contigs in N50\nNumber of bases in contigs >=1kb\nGC Content of contigs\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
190
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
191 my $N50size=-1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
192 my $N50_contigs = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
193
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
194 my @sorted_by_contig_length = sort {$b->[0] <=> $a->[0]} @sequences;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
195
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
196 ### variables and initialization for histogram (stored in @bins)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
197 my $max = $sorted_by_contig_length[0][0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
198 my $mean= $all_contigs_length/($#sequences+1); # <-- Added by Stephen Bridgett. Note: as $# gives the highest index number, so add 1 as arrays are zero-based.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
199
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
200 # Calculate standard deviation
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
201 my $sumsquares = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
202 foreach (@sequences) {$sumsquares += ($_->[0] - $mean) ** 2;} # <-- Taken from John's "mean_fasta_length.pl" script.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
203 my $stddev = ( $sumsquares/($#sequences+1) ) ** 0.5;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
204
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
205 my $min = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
206 # Aim for approximately 100 bins, so
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
207
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
208 my $bin_size=1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
209 my $min_max_range=$max - $min;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
210 # $bin_size = ($min_max_range)/(99); # (99 is 100-1) so 1000/100
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
211 if ($min_max_range>=100000000) {$bin_size=1000000;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
212 elsif ($min_max_range>=10000000) {$bin_size=100000;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
213 elsif ($min_max_range>=1000000) {$bin_size=10000;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
214 elsif ($min_max_range>=100000) {$bin_size=1000;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
215 elsif ($min_max_range>=10000) {$bin_size=100;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
216 else {$bin_size=10;} # elsif ($min_max_range>=1000) {$bin_size=10;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
217 #elsif ($min_max_range>=100) {$bin_size=1;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
218 #elsif ($min_max_range>=10) {}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
219 #elsif ($min_max_range>=1) {}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
220 # WAS: my $bin_size = ($type eq 'contig') ? 1000 : 10;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
221
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
222 my @bins;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
223 $#bins = int(($min_max_range)/$bin_size) + 1; # <-- Set the bins array size.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
224 foreach (@bins) {$_ = 0};
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
225
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
226 foreach (@sorted_by_contig_length) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
227
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
228 my $curr_contig_length = $_->[0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
229 push @summed_contig_data_contigLens, $curr_contig_length; # <-- added by Stephen.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
230
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
231 $bins[int(($curr_contig_length + 1 - $min)/$bin_size)]++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
232
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
233 $summed_contig_length += $curr_contig_length;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
234 push @summed_contig_data, $summed_contig_length;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
235
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
236 ### sorted contigs file
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
237 print SORTED $_->[1] . $_->[2] . "\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
238
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
239 if ($curr_contig_length >= 1000) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
240 $contig1k_count++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
241 $contig1k_length += $curr_contig_length;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
242 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
243
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
244 $N50_contigs++ unless ($N50size>-1); # Was unless $N50_found
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
245
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
246 if ($summed_contig_length > ($all_contigs_length / 2) and $N50size == -1) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
247 $N50size = $curr_contig_length;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
248 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
249 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
250
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
251
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
252 if ($bucket1!=0)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
253 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
254 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
255 # This firsdt method works and agress with the second, but the lengths are in reverse order, at the @sorted_by_contig_length array was sorted with longest contig first.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
256 open BUCKET1, ">$output_dir/lengths_hist1.txt" or die "Failed to open file '$output_dir/lengths_hist1.txt' : $!\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
257 print BUCKET1 "Length\tFrequency\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
258 my $len=-1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
259 my $count=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
260 foreach (@sorted_by_contig_length)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
261 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
262 if ( $len != $_->[0] ) {if ($len>-1) {print BUCKET1 "$len\t$count\n";} $len=$_->[0]; $count=0;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
263 $count++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
264 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
265 if ($len>-1) {print BUCKET1 "$len\t$count\n";} # Print length of final length grouping.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
266 close BUCKET1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
267 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
268 open BUCKET1, ">$output_dir/lengths_hist1_with_zeros.txt" or die "Failed to open file '$output_dir/lengths_hist1_with_zeros.txt' : $!\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
269 print BUCKET1 "Length\tFrequency\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
270 my @bucket=(); # To check the result by using array.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
271 foreach (@sequences)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
272 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
273 my $len=$_->[0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
274 if (defined $bucket[$len]) {$bucket[$len]++;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
275 else {$bucket[$len]=1;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
276 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
277 for (my $i=0; $i<=$#bucket; $i++)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
278 # for (my $i=$#bucket; $i>=0; $i--) # <-- for reverse order
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
279 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
280 if (defined $bucket[$i]) {print BUCKET1 "$i\t$bucket[$i]\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
281 else {print BUCKET1 "$i\t0\n";} # Can uncomment this later if don't want zeros in the output.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
282 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
283 close BUCKET1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
284 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
285
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
286
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
287 my $type_plural=$type.'s';
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
288 print STATS $Lh."Statistics for $type lengths:".$Mhnotab.$Rh."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
289 print STATS $L."Min $type length:".$M.&commify_if($sorted_by_contig_length[$#sequences][0],$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
290 print STATS $L."Max $type length:".$M.&commify_if($max,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
291 printf STATS $L."Mean %s length:".$M."%.2f".$R."\n", $type,$mean; # <-- Added by Stephen Bridgett, April 2009.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
292 printf STATS $L."Standard deviation of %s length:".$M."%.2f".$R."\n", $type,$stddev; ## <-- Added by Stephen Bridgett, May 2009.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
293 print STATS $L."Median $type length:".$M.&commify_if($sorted_by_contig_length[int($#sequences/2)][0],$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
294 print STATS $L."N50 $type length:".$M.&commify_if($N50size,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
295
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
296 print STATS $Lhnewline."Statistics for numbers of $type_plural:".$Mhnotab.$Rh."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
297 print STATS $L."Number of $type_plural:".$M.&commify_if($#sequences+1,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
298 print STATS $L."Number of $type_plural >=1kb:".$M.&commify_if($contig1k_count,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
299 print STATS $L."Number of $type_plural in N50:".$M.&commify_if($N50_contigs,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
300
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
301 print STATS $Lhnewline."Statistics for bases in the $type_plural:".$Mhnotab.$Rh."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
302 print STATS $L."Number of bases in all $type_plural:".$M.&commify_if($all_contigs_length,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
303 print STATS $L."Number of bases in $type_plural >=1kb:".$M.&commify_if($contig1k_length,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
304 printf STATS $L."GC Content of %s:".$M."%.2f %%".$R."\n", $type_plural,(100*$gc_count/$all_contigs_length);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
305
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
306 if ($cutoff_length>0)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
307 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
308 print STATS $Lhnewline."Statistics for $type_plural >= $cutoff_length bp in length:".$Mhnotab.$Rh."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
309 print STATS $L."Number of $type_plural >= $cutoff_length bp:".$M.&commify($num_reads_above_cutoff,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
310 print STATS $L."\tNumber of bases in $type_plural >= $cutoff_length bp:".$M.&commify($num_of_bases_in_reads_above_cutoff,$doCommify).$R."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
311 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
312
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
313 if ($repeats==1) {&countRepeats();}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
314
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
315 print STATS "\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
316
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
317
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
318 # Output random selection of reads if requested on commandline:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
319 my $fastaLineLen=60; # <-- The line length used for 454 sffinfo output, but could use a value read from input file (but be careful not to read a short line)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
320 if ($num_random_reads_to_output>0)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
321 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
322 my @randlist;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
323 if ($num_random_reads_to_output<($#sequences+1))
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
324 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
325 print STATS "\nSome randomly selected reads:\n\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
326 @randlist= &getListOfRandomNumbers($#sequences, $num_random_reads_to_output); # Don't use ($#sequences + 1), just ($#sequences) otherwise would be outside the array.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
327 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
328 else # Just print all the sequences:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
329 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
330 print STATS "\nAll ".($#sequences+1)." reads:\n\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
331 for (my $i=0;$i<=$#sequences;$i++) {push @randlist,$i;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
332 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
333 &print_sequences(\@randlist)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
334 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
335
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
336
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
337 # Print the longest read:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
338 if ($longest_read>0)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
339 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
340 my $length_of_longest_read=-1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
341 my @longest_read=();
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
342 my $i=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
343 foreach (@sequences)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
344 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
345 if ($_->[0]>$length_of_longest_read) {$length_of_longest_read=$_->[0]; $longest_read[0]=$i;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
346 $i++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
347 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
348 if ($length_of_longest_read>0) {print STATS "\nLongest read:\n"}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
349 &print_sequences(\@longest_read);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
350 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
351
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
352
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
353 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
354 print STATS "\n$type\tSummed\nlength\tlength\n"; # <-- Added by Stephen Bridgett, but better to produce a graph instead.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
355
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
356 my $i=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
357 foreach (@summed_contig_data) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
358 # print STATS $sorted_by_contig_length[$i]->[0]."\t".$summed_contig_data_contigLens[$i]."\t".$_."\t".$summed_contig_data[$i]."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
359 print STATS $sorted_by_contig_length[$i]->[0]."\t".$_."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
360 $i++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
361 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
362 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
363
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
364 open SUMMED, ">$output_dir/summed_contig_lengths.dat" or die $!;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
365 print SUMMED join "\n",@summed_contig_data;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
366 close SUMMED;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
367
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
368 open HISTOGRAMBINS, ">$output_dir/histogram_bins.dat" or die $!;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
369 my $bin_size_counter = 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
370 foreach (@bins) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
371 print HISTOGRAMBINS eval($bin_size_counter++ * $bin_size + $bin_size/2) . "\t$_\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
372 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
373 close HISTOGRAMBINS;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
374
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
375
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
376 # Graph of cumulative (summed) number of reads on y-axis, versus length of read (decending order) on x-axis
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
377 open SUMREAD_READLEN, ">$output_dir/sum_reads_vs_read_len.dat" or die $!;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
378 #my $read_counter= 0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
379 my $read_counter= $#sorted_by_contig_length+1;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
380
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
381 foreach (@sorted_by_contig_length) {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
382 # $read_counter++;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
383 $read_counter--;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
384 print SUMREAD_READLEN "$_->[0]\n"; # $read_counter
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
385 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
386 close SUMREAD_READLEN;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
387
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
388
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
389
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
390
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
391 my $properType=ucfirst($type); # Makes the first letter an upper case letter, ie. 'Config' or 'Read'
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
392 #if ($type eq 'contig')
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
393 # {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
394 # print the outcome of the gnu_plot as may have a write permissions error sometimes.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
395 my $YHistogramScaleType = ($type eq 'read') ? '' : 'log y'; # Not using log scale for reads, just for contig/isotigs.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
396 &plot_graph('histogram', "$output_dir/histogram_bins.dat", "Histogram of $type lengths", "$properType length", "Number of $type_plural", '0.9', $YHistogramScaleType);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
397 &plot_graph('line', "$output_dir/summed_contig_lengths.dat", "Summed $type lengths", "Number of $type_plural", "Summed $type length in bases", '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
398 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", "X-axis gives the Number of $type_plural that are greater than the $properType-length given on the Y-axis", "$properType length", "Cummulative number of $type_plural", '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
399
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
400 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
401 # print `gnuplot_histogram.sh $output_dir/histogram_bins.dat`;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
402
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
403 &plot_graph("$output_dir/summed_contig_lengths.dat", 'Summed contig lengths', 'Number of contigs', 'Summed contig length in bases', '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
404 # print `gnuplot_summedcontigs.sh $output_dir/summed_contig_lengths.dat`;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
405
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
406 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of contigs that are greater than the Contig-length given on the Y-axis', 'Contig length', 'Cummulative number of contigs', '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
407 # print `gnuplot_sum_contig_vs_contig_len.sh $output_dir/sum_reads_vs_read_len.dat`;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
408 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
409 elsif ($type eq 'read')
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
410 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
411
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
412 # print `gnuplot_readshistogram_logY.sh $output_dir/histogram_bins.dat`; # There's also optionally a "...._linearY.sh"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
413
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
414 &plot_graph('line', "$output_dir/summed_contig_lengths.dat",'Summed read lengths', 'Number of reads', 'Summed read length in bases', '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
415 # print `gnuplot_summedreads.sh $output_dir/summed_contig_lengths.dat`;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
416
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
417 &plot_graph('line', "$output_dir/sum_reads_vs_read_len.dat", 'X-axis gives the Number of reads that are greater than the Read-length given on the Y-axis', 'Read length', 'Cummulative number of reads', '0.9', '');
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
418 # print `gnuplot_sum_reads_vs_read_len.sh $output_dir/sum_reads_vs_read_len.dat`;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
419 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
420 else {die "\n** ERROR: Invalid type='$type' **\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
421 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
422
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
423 close SORTED;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
424 close STATS;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
425
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
426
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
427 # Use pipe to plot directly with gnuplot, rather than calling a separate shell script:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
428 # http://www.vioan.ro/wp/2008/09/30/calling-gnuplot-from-perl/
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
429 # http://forums.devshed.com/perl-programming-6/plotting-with-gnuplot-within-perl-script-549682.html
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
430 # Another option is the perl module: GnuplotIF: http://lug.fh-swf.de/perl/GnuplotIF.html OR: http://lug.fh-swf.de/perl/
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
431
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
432 # PlPlot: Perl: http://search.cpan.org/~dhunt/PDL-Graphics-PLplot-0.47/plplot.pd
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
433 # http://plplot.sourceforge.net/
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
434 # dislin: http://www.mps.mpg.de/dislin/overview.html
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
435 # MathGL: http://mathgl.sourceforge.net/index.html
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
436
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
437
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
438 sub plot_graph
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
439 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
440 # Plots a histogram or xy-line graph
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
441 # Parameters: GraphType (histogram/line) DataFile, Title, X-label, Y-label, Y-range
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
442 # Graphfile should end with '.png'
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
443 # The $yloglinear is 'log y' for log, or '' for linear
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
444 my ($graphtype, $datafile, $title,$xlabel,$ylabel,$yrange,$yloglinear)=@_; # yrange for reads: 0.1, and for contigs: 0.9
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
445
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
446 my $graphstyle='';
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
447 if ($graphtype eq 'histogram') {$graphstyle="plot \"$datafile\" using 1:2 with boxes";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
448 elsif ($graphtype eq 'line') {$graphstyle="plot \"$datafile\" using 1 with lines";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
449 else {die "\n** ERROR: Invalid graphtype='$graphtype'\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
450 my $yloglinearscale= ($yloglinear eq '') ? '' : "set $yloglinear";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
451
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
452 # To capture any errors that are normally sent from gnuplot to stderr, could use: open3 pipe interface:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
453 # http://www.clintoneast.com/articles/perl-open3-example.php
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
454 # http://hell.org.ua/Docs/oreilly/perl2/prog/ch16_03.htm
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
455 # But the following should be fine, as the stderr will display when running the script anyway.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
456 # If needed a simpler way would be to sent the output to a file using eg: open (GNUPLOT, "|gnuplot > gnu_out.txt 2>&1") or die .... The read the resulting file.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
457 open (GNUPLOT, "|gnuplot") or die "\n**ERROR: Failed to open gnuplot : $!\n\n **";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
458 print GNUPLOT <<ENDPLOT;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
459 set terminal png
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
460 set output "$datafile.png"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
461 set nokey
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
462 $yloglinearscale
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
463 set xlabel "$xlabel"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
464 set ylabel "$ylabel"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
465 set yrange [$yrange:]
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
466 set title "$title"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
467 $graphstyle
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
468 ENDPLOT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
469 close(GNUPLOT);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
470 if ($? != 0) {print "\n** WARNING: GNUplot pipe returned non-zero status: '$?'\n\n";} # $? is the status returned by the last pipe close (or backtick or system operator)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
471 if (! -e "$datafile.png") {die "\n** ERROR: Failed to create '$datafile.png'**\n\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
472
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
473 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
474 #PNG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
475 set term png small xFFFFFF
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
476 set output "$file.png"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
477 set size 1 ,1
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
478 set nokey
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
479 set data style line
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
480 set xlabel "frequency" font "VeraMono,10"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
481 set title "Fast Fourier Transform" font "VeraMono,10"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
482 set grid xtics ytics
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
483 set xtics 100
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
484 plot "$file" using 1:2 w lines 1
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
485 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
486
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
487 #WAS PREVIOUSLY AS .sh script
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
488 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
489 # The 'gnuplot_readshistogram_logY.sh' is:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
490 set terminal png
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
491 set output "$1.png"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
492 set log y
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
493 set xlabel "Read length"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
494 set ylabel "Frequency"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
495 set yrange [0.9:]
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
496 set title "Histogram of read lengths"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
497 plot "$1" using 1:2 with boxes
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
498 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
499 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
500
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
501
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
502 # Was previously a separate .sh file:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
503 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
504 #!/bin/sh
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
505 gnuplot << EOF
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
506 set terminal png
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
507 set output "$1.png"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
508 set xlabel "Number of contigs"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
509 set ylabel "Summed contig length in bases"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
510 set yrange [0.9:]
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
511 set title "Summed contig lengths"
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
512 plot "$1" using 1 with lines
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
513 EOF
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
514 =cut
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
515
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
516
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
517
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
518
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
519 # Added function to count number of simple dinucleotide repeats:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
520 sub countRepeats
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
521 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
522 # To count the number of sequences that contain mostly repeats.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
523 # This would be faster if called a C program on the file.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
524
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
525 # Common simple repeats are listed here: http://www.bioinfo.de/isb/2005/05/0041/
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
526 # Dinucleotide
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
527 # AT/TA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
528 # AC/TG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
529 # AG/TC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
530 # CG/GC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
531 # Trinucleotide
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
532 # AAT/TTA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
533 # CTA/GAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
534 # ATG/TAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
535 # ACT/TGA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
536 # CTC/GAG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
537 # AGG/TCC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
538 # CAG/GTC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
539 # AAG/TTC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
540 # ATA/TAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
541 # CAA/GTT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
542 # AGC/TCG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
543 # ACA/TGT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
544 # ACG/TGC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
545 # AGA/TCT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
546 # ACC/TGG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
547 # Other
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
548 # Tetranucleotide
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
549 # AAAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
550 # AAAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
551 # CACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
552 # AACA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
553 # AATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
554 # AAGA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
555 # TGAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
556 # AAAG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
557 # ACAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
558 # AATG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
559 # AGCC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
560 # Other
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
561 # Pentanucleotide
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
562 # AAAAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
563 # AATTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
564 # GCTAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
565 # ATAAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
566 # AAAAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
567 # AAACA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
568 # ATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
569 # TTGCC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
570 # Other
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
571
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
572 # I also add mono-nucleotide repeats: - ie. just all T's, or A's, etc
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
573 # Just consider the dinucleotide repeats for now:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
574 my ($ATseq,$CGseq,$ACseq,$TGseq,$AGseq,$TCseq)=(0,0,0,0,0,0);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
575 my ($AAseq,$TTseq,$CCseq,$GGseq)=(0,0,0,0);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
576 foreach (@sequences)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
577 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
578 my $seq_len=$_->[0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
579 my $seq=$_->[2]; # This copy might be slow, maybe should just stick with using the reference.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
580 my $mnt=0.35*$seq_len; # Mononucleotide threshold: HERE 0.35 also means 70%; 0.4 means 80% dinucleotide repeats, as really one base so 0.5 = 100%
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
581 my $dnt=0.35*$seq_len; # Dinucleotide threshold: 0.35 means 70%; 0.4 means 80% dinucleotide repeats, as two bases so 0.5 = 100%
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
582 my ($AT,$CG,$AC,$TG,$AG,$TC)=(0,0,0,0,0,0);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
583 my ($AA,$TT,$CC,$GG)=(0,0,0,0);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
584 # See: http://www.allinterview.com/showanswers/76719.html
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
585
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
586 # AT/TA seems most common repeat so process it first to save time:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
587 while ($seq=~/AT/g) {$AT++;} if ($AT>$dnt) {$ATseq++; next;} # AT is same as TA. If has 80% AT's then won't have 80% AC etc.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
588 while ($seq=~/CG/g) {$CG++;} if ($CG>$dnt) {$CGseq++; next;} # CG is same as GC.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
589 # AC,TG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
590 while ($seq=~/AC/g) {$AC++;} if ($AC>$dnt) {$ACseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
591 while ($seq=~/TG/g) {$TG++;} if ($TG>$dnt) {$TGseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
592 # AG/TC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
593 while ($seq=~/AG/g) {$AG++;} if ($AG>$dnt) {$AGseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
594 while ($seq=~/TC/g) {$TC++;} if ($TC>$dnt) {$TCseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
595
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
596 # Added my simple mononucleotde repeat count:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
597 while ($seq=~/AA/g) {$AA++;} if ($AA>$mnt) {$AAseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
598 while ($seq=~/TT/g) {$TT++;} if ($TT>$mnt) {$TTseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
599 while ($seq=~/CC/g) {$CC++;} if ($CC>$mnt) {$CCseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
600 while ($seq=~/GG/g) {$GG++;} if ($GG>$mnt) {$GGseq++; next;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
601 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
602
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
603 my $num_seq=($#sequences+1);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
604 my $total_din_repeats_seq= $ACseq+$TGseq+$ATseq+$AGseq+$TCseq+$CGseq;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
605 my $percent_din_repeats=100*$total_din_repeats_seq/$num_seq;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
606 print STATS "\nSimple Dinucleotide repeats:\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
607 printf STATS "\tNumber of %s with over 70%% dinucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_din_repeats, $total_din_repeats_seq, $type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
608 printf STATS "\tAT:\t%.2f %% (%d %s)\n", (100*$ATseq/$num_seq),$ATseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
609 printf STATS "\tCG:\t%.2f %% (%d %s)\n", (100*$CGseq/$num_seq),$CGseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
610 printf STATS "\tAC:\t%.2f %% (%d %s)\n", (100*$ACseq/$num_seq),$ACseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
611 printf STATS "\tTG:\t%.2f %% (%d %s)\n", (100*$TGseq/$num_seq),$TGseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
612 printf STATS "\tAG:\t%.2f %% (%d %s)\n", (100*$AGseq/$num_seq),$AGseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
613 printf STATS "\tTC:\t%.2f %% (%d %s)\n", (100*$TCseq/$num_seq),$TCseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
614
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
615 my $total_mono_repeats_seq= $AAseq+$TTseq+$CCseq+$GGseq;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
616 my $percent_mono_repeats=100*$total_mono_repeats_seq/$num_seq;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
617 print STATS "\nSimple mononucleotide repeats:\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
618 printf STATS "\tNumber of %s with over 50%% mononucleotode repeats:\t%.2f %% (%d %s)\n", $type_plural, $percent_mono_repeats, $total_mono_repeats_seq, $type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
619 printf STATS "\tAA:\t%.2f %% (%d %s)\n", (100*$AAseq/$num_seq),$AAseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
620 printf STATS "\tTT:\t%.2f %% (%d %s)\n", (100*$TTseq/$num_seq),$TTseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
621 printf STATS "\tCC:\t%.2f %% (%d %s)\n", (100*$CCseq/$num_seq),$CCseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
622 printf STATS "\tGG:\t%.2f %% (%d %s)\n", (100*$GGseq/$num_seq),$GGseq,$type_plural;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
623
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
624 return $percent_din_repeats+$percent_mono_repeats;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
625 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
626
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
627
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
628 sub commify_if
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
629 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
630 # If doCommify is >0 then converts output to commas.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
631 # Formats '1234567890.01' with commas as "1,234,567,890.01
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
632 # Based on: http://www.perlmonks.org/?node_id=110137
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
633 my ($number,$doCommify)=@_;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
634 if ($doCommify > 0) {$number =~ s/(\d)(?=(\d{3})+(\D|$))/$1\,/g;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
635 return $number;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
636 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
637
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
638
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
639 #--------------- Produce ordered list of random numbers -------------------
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
640 # This is copied from: my_random_contigs.pl
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
641
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
642 sub getListOfRandomNumbers
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
643 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
644 # Use: @list=getListOfRandomNumbers(200,20); to return sorted list of 20 numbers in range from 0 to 200 inclusive.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
645 my %list2=();
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
646 my $i=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
647 my $MaxNumber=$_[0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
648 my $NumToPick=$_[1];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
649 while ($i<$NumToPick)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
650 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
651 my $intRand = int(rand($MaxNumber+1)); # For Zero-based perl-arrays. The +1 means this generates random integers between 0 and $MaxNumber. (See: http://perldoc.perl.org/functions/rand.html )
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
652 if ($intRand>$MaxNumber) {$intRand=$MaxNumber} # Just to be extra sure that don't exceed $MaxNumber.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
653 if ( !exists($list2{$intRand}) ) {$list2{$intRand}=1; $i++;}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
654 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
655 #foreach my $key(keys %list2) {print "$key ";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
656 # Sort the list of numbers:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
657 #my @SortedList2 = sort { $a <=> $b } keys(%list2);
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
658 #return @SortedList2;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
659 return (sort { $a <=> $b } keys(%list2));
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
660 #print "Sorted list of ".$NumToPick." random numbers:\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
661 #foreach my $num(@SortedList2) {print "$num\n";}
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
662 #print "\n\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
663 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
664
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
665
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
666 sub print_sequences
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
667 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
668 # Print the sequences wrapping sequences using index array, at line length of '$fastaLineLen' characters:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
669 # Uses the global '@sequences' array.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
670 my $sequence_indexes_list=$_[0]; # This is an array reference, not the array itself.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
671 foreach my $num(@{$sequence_indexes_list})
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
672 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
673 #print "$num (max=$#sequences)\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
674 print STATS $sequences[$num]->[1]; # Prints the header, no "\n" needed after it.
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
675 my $pos=0;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
676 my $seqlen=$sequences[$num]->[0];
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
677 while ($pos<$seqlen)
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
678 {
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
679 print STATS substr($sequences[$num]->[2],$pos,$fastaLineLen)."\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
680 $pos+=$fastaLineLen;
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
681 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
682 print STATS "\n";
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
683 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
684 }
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
685
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
686
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
687 =pod
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
688 # Some test runs for mono-nucleotides and dinucelotides:
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
689 >FUOMOGO01AQV42DUMMYA length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
690 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
691 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
692 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
693 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
694 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
695 >FUOMOGO01AQV42DUMMYB length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
696 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
697 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
698 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
699 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
700 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
701 >FUOMOGO01AQV42DUMMYC length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
702 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
703 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
704 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
705 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
706 >FUOMOGO01AQV42DUMMYD length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
707 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
708 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
709
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
710 >FUOMOGO01AQV42 length=339 xy=0189_0676 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
711 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
712 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
713 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
714 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
715 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
716 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
717 >FUOMOGO01AUK0D length=214 xy=0231_0843 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
718 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
719 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
720 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
721 ACACACACACACACACACACACGACGACGACGAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
722 >FUOMOGO01AUB7C length=64 xy=0228_1718 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
723 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
724 TACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
725 >FUOMOGO01AU00B length=213 xy=0236_1097 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
726 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
727 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
728 ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
729 ACACACACACACACACACACGACGACGACGACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
730 >FUOMOGO01ATYRT length=169 xy=0224_0695 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
731 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
732 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
733 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
734 >FUOMOGO01ARMLN length=400 xy=0197_2201 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
735 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
736 TATAGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
737 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
738 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
739 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
740 TATATATATATATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
741 TATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
742 >FUOMOGO01AVGRX length=44 xy=0241_1051 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
743 TATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
744 >FUOMOGO01ASZ6K length=315 xy=0213_0922 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
745 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
746 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
747 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
748 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
749 TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
750 TGTGTGTGTGTGTGT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
751 >FUOMOGO01ARSZF length=65 xy=0199_2281 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
752 TATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTAC
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
753 GTACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
754 >FUOMOGO01AYV8U length=49 xy=0280_1324 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
755 ATATATATATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
756 >FUOMOGO01AYV9X length=40 xy=0280_1363 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
757 TATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
758 >FUOMOGO01AUX4M length=40 xy=0235_1460 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
759 TATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
760 >FUOMOGO01AWOTU length=54 xy=0255_0800 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
761 ATATATATATATATATATATATATATATATATATATATATATATATATATAGTA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
762 >FUOMOGO01A11TC length=66 xy=0316_1054 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
763 ATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGTA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
764 CGTACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
765 >FUOMOGO01ASRJP length=401 xy=0210_2019 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
766 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
767 AGTAGTAGTAGTATATATATATATATATATATATATATATATATATATATATATATATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
768 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
769 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
770 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
771 ATATATATATATATATATATATATATATATATATATATATATATATATATATATATATAT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
772 ATATATATATATATATATATATATATATATATATATATATA
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
773 >FUOMOGO01AU1ZH length=67 xy=0236_2363 region=1 run=R_2009_04_23_17_54_06_
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
774 TATATATATATATATATATATATATATATATATATATATATATATATATATATAGTACGT
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
775 ACGTACG
8d1c7f2a3f5c Uploaded
aaronpetkau
parents:
diff changeset
776 =cut