Commit message:
Imported from capsule None |
added:
fastqsolexa_to_fasta_qual.py fastqsolexa_to_fasta_qual.xml test-data/1.fastqsolexa test-data/2.fastqsolexa test-data/fastqsolexa_to_fasta_qual_out2.fasta test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa test-data/fastqsolexa_to_fasta_qual_out4.fasta test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa |
b |
diff -r 000000000000 -r ef23c03d7497 fastqsolexa_to_fasta_qual.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqsolexa_to_fasta_qual.py Mon May 19 12:33:24 2014 -0400 |
[ |
#!/usr/bin/env python

"""
Convert a fastqsolexa file into separate sequence and quality files.

Each sequence and each quality string is assumed to occupy exactly one line.
Records are read in blocks of four non-blank, non-comment lines:
1st line: @title_of_seq
2nd line: nucleotides
3rd line: +title_of_qualityscore (the title after the marker may be omitted)
4th line: quality scores
(in three forms: a. digits, b. ASCII codes, the first char as the coding base,
c. ASCII codes without the first char.)

Blank lines and lines starting with '#' are skipped and do not count towards
the four-line block.

Usage:
%python fastqsolexa_to_fasta_qual.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename> [<datatype>]

The optional fourth argument is accepted for compatibility with existing
callers and is ignored.
"""

import os
import sys
from math import *  # retained from the original script; nothing below uses it

assert sys.version_info[:2] >= ( 2, 4 )

# Offset subtracted from each ASCII quality character when the quality line
# does not carry its own leading "coding base" character.
_DEFAULT_CODING_VALUE = 64


def stop_err( msg ):
    """Write msg to stderr and terminate the process with exit status 1."""
    sys.stderr.write( "%s" % msg )
    # A bare sys.exit() would report success (status 0) to the caller.
    sys.exit( 1 )


def _decode_quality( line, read_length, line_number ):
    """Return the text to write to the score file for one quality line.

    line        -- the quality line, already right-stripped and non-empty
    read_length -- number of bases in the matching sequence line
    line_number -- 1-based input line number, used in the error message

    If the first whitespace-separated token parses as an integer the line is
    treated as space-separated digits and returned unchanged.  Otherwise each
    character is converted to ``ord( char ) - base`` followed by one space,
    where base is the ordinal of the first character when the line is exactly
    one character longer than the read, or 64 when the lengths are equal.
    Any other length terminates the program through stop_err().
    """
    # peek: ascii or digits?
    first_token = line.split()[0]
    try:
        int( first_token )
    except ValueError:
        pass
    else:
        return line

    score_count = len( line )
    if score_count == read_length + 1:
        # the extra leading character is the coding base itself
        coding_base = ord( line[0:1] )
        line = line[1:]
    elif score_count == read_length:
        coding_base = _DEFAULT_CODING_VALUE
    else:
        stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( line_number, score_count, read_length ) )
    # Every score, including the last, is followed by a single space.
    return ''.join( [ '%d ' % ( ord( char ) - coding_base ) for char in line ] )


def _convert( lines, outfile_seq, outfile_score ):
    """Split fastqsolexa records from lines into the two output handles.

    lines         -- iterable of input lines (for example an open file)
    outfile_seq   -- writable handle receiving '>title' and sequence lines
    outfile_score -- writable handle receiving '>title' and score lines

    Malformed input terminates the program through stop_err().
    """
    # The first title line and the first quality-title line define the marker
    # characters that every later record has to repeat.
    seq_title_startswith = ''
    qual_title_startswith = ''
    read_title = ''
    read_length = 0
    fastq_block_lines = 0

    for index, raw_line in enumerate( lines ):
        line = raw_line.rstrip()
        if not line or line.startswith( '#' ):
            continue
        line_number = index + 1
        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
        line_startswith = line[0:1]
        if fastq_block_lines == 1:
            # first line is @title_of_seq
            if not seq_title_startswith:
                seq_title_startswith = line_startswith
            if line_startswith != seq_title_startswith:
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( line_number, line ) )
            read_title = line[1:]
            outfile_seq.write( '>%s\n' % read_title )
        elif fastq_block_lines == 2:
            # second line is nucleotides
            read_length = len( line )
            outfile_seq.write( '%s\n' % line )
        elif fastq_block_lines == 3:
            # third line is +title_of_qualityscore ( title might be skipped )
            if not qual_title_startswith:
                qual_title_startswith = line_startswith
            if line_startswith != qual_title_startswith:
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( line_number, line ) )
            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( line_number, read_title, quality_title ) )
            # A non-empty quality title equals read_title at this point, so
            # the read title is the header in both cases.
            outfile_score.write( '>%s\n' % read_title )
        else:
            # fourth line is quality scores
            outfile_score.write( '%s\n' % _decode_quality( line, read_length, line_number ) )


def __main__():
    """Command-line entry point.

    Reads sys.argv[1] (input fastqsolexa file), writes sys.argv[2] (sequence
    output) and sys.argv[3] (score output).  A fourth argument may be present
    and is ignored.  All three files are closed on every exit path, including
    termination through stop_err().
    """
    infile_name = sys.argv[1]
    outfile_seq = open( sys.argv[2], 'w' )
    try:
        outfile_score = open( sys.argv[3], 'w' )
        try:
            infile = open( infile_name )
            try:
                _convert( infile, outfile_seq, outfile_score )
            finally:
                infile.close()
        finally:
            outfile_score.close()
    finally:
        outfile_seq.close()


if __name__ == "__main__":
    __main__()
b |
diff -r 000000000000 -r ef23c03d7497 fastqsolexa_to_fasta_qual.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqsolexa_to_fasta_qual.xml Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,93 @@ +<tool id="fastqsolexa_to_fasta_qual" name="FASTQSOLEXA-to-FASTA-QUAL" version="1.0.0"> + <description>extracts sequences and quality scores from FASTQSOLEXA data</description> + <command interpreter="python">fastqsolexa_to_fasta_qual.py $input1 $output1 $output2 $input1.extension</command> + <inputs> + <param name="input1" type="data" format="fastqsolexa" label="Fastqsolexa file"/> + </inputs> + <outputs> + <data name="output1" format="fasta"/> + <data name="output2" format="qualsolexa"/> + </outputs> + <tests> + <!-- NOTE: this tool generates 2 output files, but our functional tests currently only handle the last one generated --> + <test> + <param name="input1" value="1.fastqsolexa" ftype="fastqsolexa" /> + <output name="output1" file="fastqsolexa_to_fasta_qual_out4.fasta" /> + <output name="output2" file="fastqsolexa_to_fasta_qual_out4.qualsolexa" /> + </test> + <test> + <param name="input1" value="2.fastqsolexa" ftype="fastqsolexa" /> + <output name="output1" file="fastqsolexa_to_fasta_qual_out2.fasta" /> + <output name="output2" file="fastqsolexa_to_fasta_qual_out2.qualsolexa" /> + </test> + </tests> + <help> + +.. class:: warningmark + +IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64. + +----- + +**What it does** + +This tool extracts sequences and quality scores from FASTQ data ( Solexa variant ), producing a FASTA dataset and a QUAL dataset. 
+ +----- + +**Example1** + +- Converting the following Solexa fastq data:: + + @seq1 + GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT + +seq1 + hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh + @seq2 + GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG + +seq2 + hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO + +- will extract the following sequences:: + + >seq1 + GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT + >seq2 + GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG + +- and quality scores:: + + >seq1 + 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 + >seq2 + 40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 + +**Example2** + +- Converting the following Solexa fastq data:: + + @HANNIBAL_1_FC302VTAAXX:2:1:228:167 + GAATTGATCAGGACATAGGACAACTGTAGGCACCAT + +HANNIBAL_1_FC302VTAAXX:2:1:228:167 + 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 + @HANNIBAL_1_FC302VTAAXX:2:1:156:340 + GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG + +HANNIBAL_1_FC302VTAAXX:2:1:156:340 + 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 + +- will extract the following sequences:: + + >HANNIBAL_1_FC302VTAAXX:2:1:228:167 + GAATTGATCAGGACATAGGACAACTGTAGGCACCAT + >HANNIBAL_1_FC302VTAAXX:2:1:156:340 + GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG + +- and quality scores:: + + >HANNIBAL_1_FC302VTAAXX:2:1:228:167 + 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 + >HANNIBAL_1_FC302VTAAXX:2:1:156:340 + 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 + + </help> +</tool> |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/1.fastqsolexa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.fastqsolexa Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,8 @@ +@HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT ++HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +@HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG ++HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 \ No newline at end of file |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/2.fastqsolexa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.fastqsolexa Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,8 @@ +@seq1 +GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT ++seq1 +hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh +@seq2 +GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG ++seq2 +hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO \ No newline at end of file |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out2.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastqsolexa_to_fasta_qual_out2.fasta Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,4 @@ +>seq1 +GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT +>seq2 +GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,4 @@ +>seq1 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 +>seq2 +40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out4.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastqsolexa_to_fasta_qual_out4.fasta Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,4 @@ +>HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT +>HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG |
b |
diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa Mon May 19 12:33:24 2014 -0400 |
b |
@@ -0,0 +1,4 @@ +>HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +>HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 |