Repository 'fasta_compute_length'
hg clone https://toolshed.g2.bx.psu.edu/repos/devteam/fasta_compute_length

Changeset 0:ece409f6573c (2014-05-19)
Next changeset 1:d8cc2c8eef14 (2015-07-21)
Commit message:
Imported from capsule None
added:
fasta_compute_length.py
fasta_compute_length.xml
utils/__init__.py
utils/fasta_to_len.py
b
diff -r 000000000000 -r ece409f6573c fasta_compute_length.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_compute_length.py Mon May 19 12:34:12 2014 -0400
[
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+"""
+Command-line wrapper around utils.fasta_to_len.compute_fasta_length.
+
+Arguments: <input fasta> <output tabular> <number of title characters to keep>
+"""
+
+import sys
+from utils.fasta_to_len import compute_fasta_length
+
+input_path = sys.argv[1]
+output_path = sys.argv[2]
+keep_first_char = sys.argv[3]
+
+# The last argument is keep_first_word; False leaves the full title in place
+# so that only keep_first_char controls truncation.
+compute_fasta_length( input_path, output_path, keep_first_char, False )
\ No newline at end of file
b
diff -r 000000000000 -r ece409f6573c fasta_compute_length.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_compute_length.xml Mon May 19 12:34:12 2014 -0400
b
@@ -0,0 +1,51 @@
+<tool id="fasta_compute_length" name="Compute sequence length">
+ <description></description>
+ <command interpreter="python">fasta_compute_length.py $input $output $keep_first</command>
+ <inputs>
+ <param name="input" type="data" format="fasta" label="Compute length for these sequences"/>
+ <param name="keep_first" type="integer" size="5" value="0" label="How many title characters to keep?" help="'0' = keep the whole thing"/>
+ </inputs>
+ <outputs>
+ <data name="output" format="tabular"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_tool_compute_length_1.out" />
+ </test>
+
+ <test>
+ <param name="input" value="extract_genomic_dna_out1.fasta" />
+ <param name="keep_first" value="0"/>
+ <output name="output" file="fasta_tool_compute_length_2.out" />
+ </test>
+
+ <test>
+ <param name="input" value="454.fasta" />
+ <param name="keep_first" value="14"/>
+ <output name="output" file="fasta_tool_compute_length_3.out" />
+ </test>
+ </tests>
+ <help>
+
+**What it does**
+
+This tool computes the length of each FASTA sequence in the file. The output file has two tab-separated columns per line: the FASTA title and the length of the sequence. The option *How many title characters to keep?* lets you keep only a specified number of characters from the beginning of each FASTA title; '0' keeps the whole title.
+
+-----
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run::
+
+    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
+    TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
+    &gt;EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA
+
+Running this tool while setting **How many title characters to keep?** to **14** will produce this::
+
+ EYKX4VC02EQLO5  108
+ EYKX4VC02D4GS2  60
+
+
+ </help>
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r ece409f6573c utils/fasta_to_len.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/fasta_to_len.py Mon May 19 12:34:12 2014 -0400
[
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Compute the length of every sequence in a FASTA file.
+
+Input: fasta, int
+Output: tabular
+Writes each title with the length of the corresponding sequence.
+
+Command line: fasta_to_len.py <fasta_file> <out_file> <keep_first_char>
+"""
+
+import sys, os
+
+# Refuse to run on interpreters older than Python 2.4.
+# NOTE: this check is skipped when Python runs with -O, which strips asserts.
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def compute_fasta_length( fasta_file, out_file, keep_first_char, keep_first_word=False ):
+    """
+    Write a tab-separated "title<TAB>length" line for each FASTA record.
+
+    fasta_file      -- path of the FASTA file to read
+    out_file        -- path of the tabular file to write (truncated if it exists)
+    keep_first_char -- int or numeric string; number of title characters to
+                       keep, where 0 keeps the whole title
+    keep_first_word -- when true, cut the title at its first whitespace
+                       before applying keep_first_char
+
+    Blank lines and lines starting with '#' are skipped. The length of a
+    record is the sum of the stripped lengths of the lines that follow its
+    '>' header. Sequence lines that appear before the first header are
+    ignored, and an input without any header produces an empty output file.
+
+    Raises ValueError if keep_first_char is not an integer, and
+    IOError/OSError if either file cannot be opened.
+    """
+    keep_first_char = int( keep_first_char )
+
+    # Index 0 of a header line is the '>' marker, so keeping N title
+    # characters means slicing [1:N+1]; None keeps everything after '>'.
+    if keep_first_char == 0:
+        title_end = None
+    else:
+        title_end = keep_first_char + 1
+
+    def write_record( out, header, seq_len ):
+        # Emit one output row for a finished record.
+        if keep_first_word:
+            header = header.split()[0]
+        out.write( "%s\t%d\n" % ( header[ 1:title_end ], seq_len ) )
+
+    # try/finally instead of 'with' keeps this usable on the Python 2.4
+    # floor that this module asserts, while still closing both files on error.
+    infile = open( fasta_file )
+    try:
+        out = open( out_file, 'w' )
+        try:
+            header = None   # None until the first '>' line has been seen
+            seq_len = 0
+            for line in infile:
+                line = line.strip()
+                if not line or line.startswith( '#' ):
+                    continue
+                if line[0] == '>':
+                    # A new header closes the previous record, if there was one.
+                    if header is not None:
+                        write_record( out, header, seq_len )
+                    header = line
+                    seq_len = 0
+                else:
+                    seq_len += len( line )
+
+            # Last record; skipped when the input contained no header at all,
+            # so an empty file no longer yields a bogus "\t0" row.
+            if header is not None:
+                write_record( out, header, seq_len )
+        finally:
+            out.close()
+    finally:
+        infile.close()
+
+if __name__ == "__main__" :
+    # Stand-alone use: titles are cut at their first whitespace (keep_first_word=True).
+    fasta_path, tabular_path, title_chars = sys.argv[1], sys.argv[2], sys.argv[3]
+    compute_fasta_length( fasta_path, tabular_path, title_chars, True )
\ No newline at end of file