# HG changeset patch
# User devteam
# Date 1390832943 18000
# Node ID 9b7b4e0ca9db1ee4e5b69dc4ca523839a7c2cc6c
Imported from capsule None
diff -r 000000000000 -r 9b7b4e0ca9db fastq_stats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_stats.py Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,48 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator
+
+VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ]  # base-space symbols that each get their own count column
+VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' ]  # '0'..'6' plus '.'; list concatenation relies on Python 2 map() returning a list
+SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ]  # keys of the per-column statistics, in output column order
+
+def main():  # Aggregate per-column quality/base statistics of a FASTQ file into a tab-separated report. NOTE(review): indentation is flattened in this patch view, so block nesting below is not visible.
+ input_filename = sys.argv[1]  # path of the FASTQ file to read
+ output_filename = sys.argv[2]  # path of the tabular report to write
+ input_type = sys.argv[3] or 'sanger'  # an empty third argument falls back to 'sanger'
+
+ aggregator = fastqAggregator()
+ num_reads = None  # stays None when the reader yields nothing
+ fastq_read = None  # after the loop, holds the last read that was consumed
+ for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):  # num_reads is the zero-based index of the current read
+ aggregator.consume_read( fastq_read )
+ out = open( output_filename, 'wb' )
+ valid_nucleotides = VALID_NUCLEOTIDES  # default to base-space symbols; replaced below for non-base reads
+ if fastq_read:  # header layout is chosen from the last read seen
+ if fastq_read.sequence_space == 'base':
+ out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' )
+ else:
+ out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' )
+ valid_nucleotides = VALID_COLOR_SPACE
+ for i in range( aggregator.get_max_read_length() ):  # one output row per read position
+ column_stats = aggregator.get_summary_statistics_for_column( i )
+ out.write( '%i\t' % ( i + 1 ) )  # column numbers are reported 1-based
+ out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) )  # summary statistics in SUMMARY_STAT_ORDER order, tab-terminated
+ out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) )  # outliers as one comma-separated field
+ base_counts = aggregator.get_base_counts_for_column( i )
+ for nuc in valid_nucleotides:
+ out.write( "%s\t" % base_counts.get( nuc, 0 ) )  # expected symbols missing from this column are written as 0
+ extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] )  # symbols outside the expected alphabet, sorted
+ out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) )  # unexpected symbols and their counts, in matching order
+ out.close()
+ if num_reads is None:  # the read loop never executed
+ print "No valid fastq reads could be processed."
+ else:
+ print "%i fastq reads were processed." % ( num_reads + 1 )  # +1 converts the zero-based enumerate index to a count
+ print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" )  # an empty format list is reported as "None"
+ ascii_range = aggregator.get_ascii_range()
+ decimal_range = aggregator.get_decimal_range()
+ print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
+ print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )
+
+if __name__ == "__main__": main()  # run the report only when executed as a script, not on import
diff -r 000000000000 -r 9b7b4e0ca9db fastq_stats.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_stats.xml Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,74 @@
+
+ by column
+
+ galaxy_sequence_utils
+
+ fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool creates summary statistics on a FASTQ file.
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for the **Boxplot** tools.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column = column number (1 to 36 for a Solexa file with 36-cycle reads)
+* count = number of bases found in this column.
+* min = Lowest quality score value found in this column.
+* max = Highest quality score value found in this column.
+* sum = Sum of quality score values for this column.
+* mean = Mean quality score value for this column.
+* Q1 = 1st quartile quality score.
+* med = Median quality score.
+* Q3 = 3rd quartile quality score.
+* IQR = Inter-Quartile range (Q3-Q1).
+* lW = 'Left-Whisker' value (for boxplotting).
+* rW = 'Right-Whisker' value (for boxplotting).
+* outliers = Scores falling beyond the left and right whiskers (comma separated list).
+* A_Count = Count of 'A' nucleotides found in this column.
+* C_Count = Count of 'C' nucleotides found in this column.
+* G_Count = Count of 'G' nucleotides found in this column.
+* T_Count = Count of 'T' nucleotides found in this column.
+* N_Count = Count of 'N' nucleotides found in this column.
+* other_bases = Comma separated list of other nucleotides found in this column.
+* other_base_count = Comma separated count of other nucleotides found in this column.
+
+For example::
+
+ #column count min max sum mean Q1 med Q3 IQR lW rW outliers A_Count C_Count G_Count T_Count N_Count other_bases other_base_count
+ 1 14336356 2 33 450600675 31.4306281875 32.0 33.0 33.0 1.0 31 33 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 4482314 2199633 4425957 3208745 19707
+ 2 14336356 2 34 441135033 30.7703737965 30.0 33.0 33.0 3.0 26 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 4419184 2170537 4627987 3118567 81
+ 3 14336356 2 34 433659182 30.2489127642 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4310988 2941988 3437467 3645784 129
+ 4 14336356 2 34 433635331 30.2472490917 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4110637 3007028 3671749 3546839 103
+ 5 14336356 2 34 432498583 30.167957813 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4348275 2935903 3293025 3759029 124
+
+-----
+
+.. class:: warningmark
+
+Adapter bases in color space reads are excluded from statistics.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_
+
+
+
+
diff -r 000000000000 -r 9b7b4e0ca9db test-data/fastq_stats1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_stats1.fastq Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,36 @@
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GTACGCATGACCGAACCCCCCNCCCCCCAATTGGTT
++CSHL_3_FC042AGLLWW:1:2:7:203
+BBC?7?B6>ABB?B;BBBCC9&;BCBBBBBBBB>>A
+@CSHL_3_FC042AGLLWW:1:2:7:33
+CAATGCCTCCAATTGGTTAATCCCCCTATATATACT
++CSHL_3_FC042AGLLWW:1:2:7:33
+8BBB?B;BB8?6@9B8BB=8.&1?,&;931&&&(BB
+@CSHL_3_FC042AGLLWW:1:2:7:169
+GCAGCAGGCGCGTCAGAGAGCCCCCCCCCCCCCCCC
++CSHL_3_FC042AGLLWW:1:2:7:169
+B@.?B=6BBB@.@BBBBBBBBBBBBBBB7=;6(663
+@CSHL_3_FC042AGLLWW:1:2:7:1436
+AATTATTTATTAAATTTTAATAATATGGGAGACACT
++CSHL_3_FC042AGLLWW:1:2:7:1436
+B?BBBBBBBBBBBBBBB@6ABBBBB@4@BBBBB77<
+@CSHL_3_FC042AGLLWW:1:2:7:292
+GGAGAAATACACACAATTGGTTAATCCCCCTATATA
++CSHL_3_FC042AGLLWW:1:2:7:292
+CBCBBBBBBB6.BBBBBBBBBBB=9&66&1@>6&3&
+@CSHL_3_FC042AGLLWW:1:2:7:1819
+AATTCAAACCACCCCAACCCACACACAGAGATACAA
++CSHL_3_FC042AGLLWW:1:2:7:1819
+B==2777-BB-0&96866&,66-&.6&66,6-*2.6
+@CSHL_3_FC042AGLLWW:1:2:7:1875
+GCAAAAGAGTAGTGTACCCCCCCCCCCCCCCCCCCC
++CSHL_3_FC042AGLLWW:1:2:7:1875
+BBBBBBBBB9699&9BBBBBA@;BBBBBBBBB9&96
+@CSHL_3_FC042AGLLWW:1:2:8:624
+ACTGCAATTGGTTAATCCCCCTATATAGCGCTGTGG
++CSHL_3_FC042AGLLWW:1:2:8:624
+BB<4?A9ABB@>>009.6?@<.6@BBBBBBBBBBBB
+@CSHL_3_FC042AGLLWW:1:2:8:250
+TGCCGCGCACACTGATGCAATTGGTTAATCCCCCTA
++CSHL_3_FC042AGLLWW:1:2:8:250
+BBBBBBBB?BBBBBBCCC<,91&6<39;?+6,3,9&
diff -r 000000000000 -r 9b7b4e0ca9db test-data/fastq_stats_1_out.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_stats_1_out.tabular Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,37 @@
+#column count min max sum mean Q1 med Q3 IQR lW rW outliers A_Count C_Count G_Count T_Count N_Count other_bases other_base_count
+1 9 23 34 288 32.0 33.0 33.0 33.0 0.0 33 33 23,34 3 1 4 1 0
+2 9 28 33 287 31.8888888889 30.5 33.0 33.0 2.5 28 33 3 3 2 1 0
+3 9 13 34 268 29.7777777778 27.5 33.0 33.5 6.0 27 34 13 5 1 0 3 0
+4 9 17 33 261 29.0 24.5 33.0 33.0 8.5 17 33 1 2 3 3 0
+5 9 22 33 269 29.8888888889 26.0 33.0 33.0 7.0 22 33 3 3 3 0 0
+6 9 22 33 277 30.7777777778 29.0 33.0 33.0 4.0 28 33 22 5 3 0 1 0
+7 9 21 33 258 28.6666666667 23.0 33.0 33.0 10.0 21 33 4 1 3 1 0
+8 9 12 33 263 29.2222222222 26.5 33.0 33.0 6.5 21 33 12 2 1 1 5 0
+9 9 29 33 290 32.2222222222 31.5 33.0 33.0 1.5 30 33 29 3 3 2 1 0
+10 9 23 33 277 30.7777777778 28.0 33.0 33.0 5.0 23 33 1 4 2 2 0
+11 9 12 33 245 27.2222222222 21.0 31.0 33.0 12.0 12 33 5 2 1 1 0
+12 9 13 33 214 23.7777777778 14.0 24.0 33.0 19.0 13 33 2 4 2 1 0
+13 9 5 33 249 27.6666666667 26.5 31.0 33.0 6.5 24 33 5 2 1 1 5 0
+14 9 5 33 233 25.8888888889 19.5 33.0 33.0 13.5 5 33 3 3 2 1 0
+15 9 15 33 251 27.8888888889 22.5 33.0 33.0 10.5 15 33 5 1 1 2 0
+16 9 23 34 269 29.8888888889 23.5 33.0 33.0 9.5 23 34 3 1 2 3 0
+17 9 13 34 266 29.5555555556 27.0 33.0 33.0 6.0 21 34 13 2 3 1 3 0
+18 9 21 34 272 30.2222222222 26.0 33.0 33.0 7.0 21 34 0 5 1 3 0
+19 9 5 34 244 27.1111111111 24.0 30.0 33.0 9.0 21 34 5 4 4 1 0 0
+20 9 11 34 241 26.7777777778 17.0 32.0 33.0 16.0 11 34 3 4 2 0 0
+21 9 13 33 240 26.6666666667 22.5 27.0 33.0 10.5 13 33 1 4 0 4 0
+22 9 5 33 190 21.1111111111 9.0 21.0 33.0 24.0 5 33 1 4 0 3 1
+23 9 5 33 205 22.7777777778 14.0 26.0 33.0 19.0 5 33 4 4 1 0 0
+24 9 5 33 247 27.4444444444 24.5 31.0 33.0 8.5 21 33 5 1 5 1 2 0
+25 9 11 34 241 26.7777777778 18.5 33.0 33.0 14.5 11 34 3 4 0 2 0
+26 9 5 33 212 23.5555555556 11.5 31.0 33.0 21.5 5 33 0 6 0 3 0
+27 9 5 33 227 25.2222222222 20.0 26.0 33.0 13.0 5 33 3 4 1 1 0
+28 9 21 33 255 28.3333333333 22.5 31.0 33.0 10.5 21 33 2 4 3 0 0
+29 9 5 33 228 25.3333333333 19.5 30.0 33.0 13.5 5 33 2 4 1 2 0
+30 9 10 33 213 23.6666666667 13.5 28.0 33.0 19.5 10 33 3 4 2 0 0
+31 9 5 33 236 26.2222222222 21.0 31.0 33.0 12.0 5 33 1 4 1 3 0
+32 9 5 33 210 23.3333333333 11.5 29.0 33.0 21.5 5 33 3 3 0 3 0
+33 9 5 33 183 20.3333333333 8.0 21.0 33.0 25.0 5 33 1 4 2 2 0
+34 9 5 33 150 16.6666666667 6.0 17.0 25.5 19.5 5 33 3 4 1 1 0
+35 9 13 33 217 24.1111111111 19.5 24.0 31.0 11.5 13 33 1 4 1 3 0
+36 9 5 33 195 21.6666666667 11.5 21.0 32.5 21.0 5 33 3 2 1 3 0
diff -r 000000000000 -r 9b7b4e0ca9db tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+