Galaxy |

Changeset 0:ef23c03d7497 (2014-05-19)

Commit message:
Imported from capsule None

added:
fastqsolexa_to_fasta_qual.py
fastqsolexa_to_fasta_qual.xml
test-data/1.fastqsolexa
test-data/2.fastqsolexa
test-data/fastqsolexa_to_fasta_qual_out2.fasta
test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa
test-data/fastqsolexa_to_fasta_qual_out4.fasta
test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa

diff -r 000000000000 -r ef23c03d7497 fastqsolexa_to_fasta_qual.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqsolexa_to_fasta_qual.py Mon May 19 12:33:24 2014 -0400

[

@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+"""
+Convert a fastqsolexa file into separate sequence (FASTA) and quality score files.
+
+Assumes that each read's sequence and its quality scores each occupy exactly one line.
+Within each record, the lines must appear in this order:
+1st line: @title_of_seq
+2nd line: nucleotides
+3rd line: +title_of_qualityscore (the title after '+' may be omitted; the sequence title is then used)
+4th line: quality scores
+(in three forms: a. whitespace-separated integers; b. ASCII codes preceded by one extra character that gives the coding base; c. ASCII codes with no leading base character, decoded with the default base of 64.)
+
+Usage:
+%python fastqsolexa_to_fasta_qual.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename> <input_datatype>
+"""
+
+import sys, os
+from math import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( "%s" % msg )
+    sys.exit()
+
+def __main__():
+    infile_name = sys.argv[1]
+    outfile_seq = open( sys.argv[2], 'w' )
+    outfile_score = open( sys.argv[3], 'w' )
+    datatype = sys.argv[4]
+    seq_title_startswith = ''
+    qual_title_startswith = ''
+    default_coding_value = 64
+    fastq_block_lines = 0
+
+    for i, line in enumerate( file( infile_name ) ):
+        line = line.rstrip()
+        if not line or line.startswith( '#' ):
+            continue
+        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
+        line_startswith = line[0:1]
+        if fastq_block_lines == 1:
+            # first line is @title_of_seq
+            if not seq_title_startswith:
+                seq_title_startswith = line_startswith
+            if line_startswith != seq_title_startswith:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
+            read_title = line[1:]
+            outfile_seq.write( '>%s\n' % line[1:] )
+        elif fastq_block_lines == 2:
+            # second line is nucleotides
+            read_length = len( line )
+            outfile_seq.write( '%s\n' % line )
+        elif fastq_block_lines == 3:
+            # third line is +title_of_qualityscore ( might be skipped )
+            if not qual_title_startswith:
+                qual_title_startswith = line_startswith
+            if line_startswith != qual_title_startswith:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
+            quality_title = line[1:]
+            if quality_title and read_title != quality_title:
+                outfile_seq.close()
+                outfile_score.close()
+                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
+            if not quality_title:
+                outfile_score.write( '>%s\n' % read_title )
+            else:
+                outfile_score.write( '>%s\n' % line[1:] )
+        else:
+            # fourth line is quality scores
+            qual = ''
+            fastq_integer = True
+            # peek: ascii or digits?
+            val = line.split()[0]
+            try:
+                check = int( val )
+                fastq_integer = True
+            except:
+                fastq_integer = False
+
+            if fastq_integer:
+                # digits
+                qual = line
+            else:
+                # ascii
+                quality_score_length = len( line )
+                if quality_score_length == read_length + 1:
+                    # first char is qual_score_startswith
+                    qual_score_startswith = ord( line[0:1] )
+                    line = line[1:]
+                elif quality_score_length == read_length:
+                    qual_score_startswith = default_coding_value
+                else:
+                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
+                for j, char in enumerate( line ):
+                    score = ord( char ) - qual_score_startswith    # 64
+                    qual = "%s%s " % ( qual, str( score ) )
+            outfile_score.write( '%s\n' % qual )
+
+    outfile_seq.close()
+    outfile_score.close()
+
+if __name__ == "__main__": __main__()
+
\ No newline at end of file

diff -r 000000000000 -r ef23c03d7497 fastqsolexa_to_fasta_qual.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqsolexa_to_fasta_qual.xml Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,93 @@
+<tool id="fastqsolexa_to_fasta_qual" name="FASTQSOLEXA-to-FASTA-QUAL" version="1.0.0">
+  <description>extracts sequences and quality scores from FASTQSOLEXA data</description>
+  <command interpreter="python">fastqsolexa_to_fasta_qual.py $input1 $output1 $output2 $input1.extension</command>
+  <inputs>
+    <param name="input1" type="data" format="fastqsolexa" label="Fastqsolexa file"/>
+  </inputs>
+  <outputs>
+    <data name="output1" format="fasta"/>
+    <data name="output2" format="qualsolexa"/>
+  </outputs>
+  <tests>
+    
+    <test>
+      <param name="input1" value="1.fastqsolexa" ftype="fastqsolexa" />
+      <output name="output1" file="fastqsolexa_to_fasta_qual_out4.fasta" />
+      <output name="output2" file="fastqsolexa_to_fasta_qual_out4.qualsolexa" />
+    </test>
+    <test>
+      <param name="input1" value="2.fastqsolexa" ftype="fastqsolexa" />
+      <output name="output1" file="fastqsolexa_to_fasta_qual_out2.fasta" />
+      <output name="output2" file="fastqsolexa_to_fasta_qual_out2.qualsolexa" />
+    </test>
+  </tests>
+  <help>
+
+.. class:: warningmark
+
+IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64.
+
+-----
+
+**What it does**
+
+This tool extracts sequences and quality scores from FASTQ data ( Solexa variant ), producing a FASTA dataset and a QUAL dataset.
+
+-----
+
+**Example1**
+
+- Converting the following Solexa fastq data::
+
+    @seq1
+    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
+    +seq1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh
+    @seq2
+    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG
+    +seq2
+    hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO
+
+- will extract the following sequences::
+
+    >seq1
+    GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
+    >seq2
+    GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG
+
+- and quality scores::
+
+    >seq1
+    40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40
+    >seq2
+    40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15
+
+**Example2**
+
+- Converting the following Solexa fastq data::
+
+    @HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+    +HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+    @HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
+    +HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
+
+- will extract the following sequences::
+
+    >HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+    >HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
+
+- and quality scores::
+
+    >HANNIBAL_1_FC302VTAAXX:2:1:228:167
+    40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+    >HANNIBAL_1_FC302VTAAXX:2:1:156:340
+    40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
+
+    </help>
+</tool>

diff -r 000000000000 -r ef23c03d7497 test-data/1.fastqsolexa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.fastqsolexa Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,8 @@
+@HANNIBAL_1_FC302VTAAXX:2:1:228:167
+GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
++HANNIBAL_1_FC302VTAAXX:2:1:228:167
+40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+@HANNIBAL_1_FC302VTAAXX:2:1:156:340
+GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG
++HANNIBAL_1_FC302VTAAXX:2:1:156:340
+40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9
\ No newline at end of file

diff -r 000000000000 -r ef23c03d7497 test-data/2.fastqsolexa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/2.fastqsolexa Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,8 @@
+@seq1
+GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
++seq1
+hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh
+@seq2
+GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG
++seq2
+hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO
\ No newline at end of file

diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out2.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastqsolexa_to_fasta_qual_out2.fasta Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,4 @@
+>seq1
+GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT
+>seq2
+GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG

diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastqsolexa_to_fasta_qual_out2.qualsolexa Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,4 @@
+>seq1
+40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40
+>seq2
+40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15

diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out4.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastqsolexa_to_fasta_qual_out4.fasta Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,4 @@
+>HANNIBAL_1_FC302VTAAXX:2:1:228:167
+GAATTGATCAGGACATAGGACAACTGTAGGCACCAT
+>HANNIBAL_1_FC302VTAAXX:2:1:156:340
+GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG

diff -r 000000000000 -r ef23c03d7497 test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastqsolexa_to_fasta_qual_out4.qualsolexa Mon May 19 12:33:24 2014 -0400

@@ -0,0 +1,4 @@
+>HANNIBAL_1_FC302VTAAXX:2:1:228:167
+40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4
+>HANNIBAL_1_FC302VTAAXX:2:1:156:340
+40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9