changeset 0:04699558a38a draft default tip

planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/fasta_add_barcode commit e857f7126443e115f11954085423f8999bc870aa-dirty
author bebatut
date Fri, 15 Apr 2016 06:04:56 -0400
parents
children
files fasta_add_barcode.py fasta_add_barcode.xml test-data/input_sequences.fasta test-data/mapping_file.txt test-data/output_test.fasta
diffstat 5 files changed, 199 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_add_barcode.py	Fri Apr 15 06:04:56 2016 -0400
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import argparse
+import copy
+import operator
+
+
+def write_seq_fasta_format(seq, output_file):
+    split_seq = [seq[i:i+60] for i in xrange(0,len(seq),60)]
+    for split in split_seq:
+        output_file.write(split + '\n')
+
+def fasta_add_barcode(args):
+    mapping = {}
+    with open(args.input_mapping_file,'r') as input_mapping_file:
+        for line in input_mapping_file:
+            split_line = line[:-1].split('\t')
+
+            if len(split_line) != 2:
+                string = 'Incorrect number of column in mapping file.'
+                string += '\nTwo tabular separated columns are expected'
+                raise ValueError(string)
+
+            mapping[split_line[0]] = split_line[1]
+
+    seq_id = ''
+    seq = ''
+    with open(args.input_sequence_file,'r') as input_sequence_file:
+        with open(args.output_sequence_file, 'w') as output_sequence_file:
+            for line in input_sequence_file:
+                if line.startswith('>'):
+                    if seq != '':
+                        if not mapping.has_key(seq_id):
+                            string = 'A sequence identifier (' + seq_id + ') is'
+                            string += ' not found in mapping file'
+                            raise ValueError(string)
+
+                        output_sequence_file.write('>' + seq_id + '\n')
+
+                        barcode = mapping[seq_id]
+                        seq = barcode + seq
+                        write_seq_fasta_format(seq, output_sequence_file)
+                    seq_id = line[1:-1].split( )[0]
+                    seq = ''
+                else:
+                    seq += line[:-1]
+
+########
+# Main #
+########
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_sequence_file', required=True)
+    parser.add_argument('--input_mapping_file', required=True)
+    parser.add_argument('--output_sequence_file', required=True)
+    args = parser.parse_args()
+
+    fasta_add_barcode(args)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_add_barcode.xml	Fri Apr 15 06:04:56 2016 -0400
@@ -0,0 +1,60 @@
+<tool id="fasta_add_barcode" name="Add barcodes" version="1.0">
+
+    <description>to FASTA sequences</description>
+
+    <requirements>
+    </requirements>
+
+    <stdio>
+    </stdio>
+
+    <!-- Reports the version of the Python interpreter running the script -->
+    <version_command>python --version</version_command>
+
+    <!-- Paths are single-quoted so that they reach the script as one argument each -->
+    <command><![CDATA[
+        python '$__tool_directory__/fasta_add_barcode.py'
+            --input_sequence_file '$sequence_file'
+            --input_mapping_file '$mapping_file'
+            --output_sequence_file '$output_sequence_file'
+        ]]>
+    </command>
+
+    <inputs>
+        <param name="sequence_file" type="data" format="fasta" label="Sequence file" help=""/>
+        <param name="mapping_file" type="data" format="tabular"
+            label="Mapping file between sequence identifier and barcode to add"
+            help="The mapping file must be a tab-delimited file with
+            two columns. The first column contains the sequence identifier and
+            the second column the corresponding barcode to add at the beginning
+            of each sequence."/>
+    </inputs>
+
+    <outputs>
+        <data format="fasta" name="output_sequence_file" metadata_source="sequence_file"
+            label="Sequences with added barcodes from ${on_string}" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="sequence_file" value="input_sequences.fasta"/>
+            <param name="mapping_file" value="mapping_file.txt"/>
+            <output name="output_sequence_file" file="output_test.fasta" />
+        </test>
+    </tests>
+
+    <help><![CDATA[
+
+**What it does**
+
+This tool takes a FASTA file and adds a barcode at the beginning of each sequence.
+
+The barcode of each sequence is determined from its sequence identifier, i.e. the first whitespace-delimited word of the header line; only this identifier is kept in the output header. The mapping between sequence identifiers and the corresponding barcodes is defined in the mapping file. This file must be a tab-delimited file with two columns: the first with the sequence identifiers and the second with the corresponding barcodes.
+
+]]>
+    </help>
+
+    <citations>
+    </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_sequences.fasta	Fri Apr 15 06:04:56 2016 -0400
@@ -0,0 +1,38 @@
+>SRR029699.4565 EXRA6YY02A66ZM
+ACGGCTCGGTGCCGTCCACCGGTCAGCGGCGCCCTGGCCTCCCACGGGCTGACCCCGCAG
+TACTCTCGGCGCGATGGGGCTTAGCTTCCGGGTTCGGAACGGGACCGGGCGTGCCCC
+>SRR029699.5673 EXRA6YY02A5CA6
+GACCAGAACAACGCCGAAGATCAAAACGGTGAAAACAACGGCGAACAGGGAGGAAACGAA
+TAATGAAGAAGCTTGTTCTTTTAGCCCTCGGGGCCTTATTGTTAGCGGGCGCCACCTGCT
+CTCCCACACCGTCTCCAGTGCAGTACCATCGGCCGCTTGGGTCTTAACCATCGTGTTCGG
+GATGGGAACGGGTGTGTCCCCCAAGCGCATCGCCACCAGCAGTAGTTATCCTAGTTTTTG
+AAACTGTTAAAGTCTACGGCTCATCGTCCGTTACCTTAATAACTAAACAGTACGTAAAAC
+CC
+>SRR029699.5714 EXRA6YY02A735O
+ATCCCGAACACGATGGTAAAGACCTAAGCGGCCGATGGTACTGCACTGGAGACGGTGTGG
+GAGAGTAGGTGGCCGCCAAATTAAAAAGAAAATAAACCGAGAGGTTCTGCGATAAAGCTG
+GTCTTCACCAGTGATCAGAGTTTAAAAGAAAGCTTTTAGTCTCTGATGACTGATGAAAAA
+GTCAGTCAGATGAAGAACTGAATAAGGACTTCATCTTATGTACCTTGAAAACTGCATATA
+GTAAAAATCAATAGATTTAGATAAATA
+>SRR029699.14351 EXRA6YY04CD0L5
+CGGTGGCGATGCGCTTTTGGGAAACACCCGTCCACATCCCGAACACGATGGTTAAGACTG
+AAGCGGCCGATAATACTATACTGGTAACGGTATGGGAAGGTAGGTGGCTGCCGGATTTAT
+AAAGAACAGCATAGCATATATGCTTTCAAAATAGAACAGGCTCGAAAGAGCTAACCTACA
+TCAGCAGGGAAGTGCTGTTTAGATAGCTGGTTTTACCAGTGATTAGAGTATTTTAAGATA
+TTAAGGTTTCTAATGACTGAATAAAACAG
+>SRR029699.30831 EYTWXA302GEDQR
+GCTTATGGGACACACCCGTTCCCATCCCGAACACGACGGTTAAGACGTAAGCGGCCGATG
+GTACTATGCTGGAGACGGCATGGGAGAGCAGGTGGGTGCCGGACTGAGACACGCAACAGG
+GGATAGGCAAGGCACACAGGGGATAGG
+>SRR029699.33623 EYTWXA302HT3FX
+GACTACGAGGTTGATAGGCACGATGTGTAAGTGGAGCGATCCATTCAGCAAGTGTGTACT
+AATAGATCGAGGGCTTGACCACAATTCGCTTGAATTCTCAAGTCAATGACAAAATGTTAG
+CAGTGATTATTCAGTTTTGAAGGCACGTCCTTCAAGAAATACTGGACAAAGTAAAACAGA
+AATGTTATACTGAACCAGTCATATTGGTCGGTGACGATGACGGTGAGGTTCCACCTGTTC
+CCATTCCGAACACAG
+>SRR029699.38075 EYTWXA302GR8DJ
+ACTTTATCAAGATACCAAGTGGAGAATACGAGATTCGAACTCGTGACCTCCTGCTTGCAA
+GGCAGGCGCTCTCCCAACTGAGCTAATCCCCCAAGGGAATCCGGCAGCCACCTGCTCTCC
+CATGCCATCTCCAGCATAGTACCATCGGCCGCTCAGGTCTTAACCATCGTGTTCGGGATG
+GGAACGGGTGTGTCCCCTGAGCGCATCGCCACCGGAAATATCTTATCAAGTTTTTGCTTG
+ATAACTGAATAAAC
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mapping_file.txt	Fri Apr 15 06:04:56 2016 -0400
@@ -0,0 +1,7 @@
+SRR029699.4565	AAAAAA
+SRR029699.5673	CCCCCC
+SRR029699.5714	TTTTTT
+SRR029699.14351	GGGGGG
+SRR029699.30831	AAAAAA
+SRR029699.33623	CCCCCC
+SRR029699.38075	TTTTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_test.fasta	Fri Apr 15 06:04:56 2016 -0400
@@ -0,0 +1,33 @@
+>SRR029699.4565
+AAAAAAACGGCTCGGTGCCGTCCACCGGTCAGCGGCGCCCTGGCCTCCCACGGGCTGACC
+CCGCAGTACTCTCGGCGCGATGGGGCTTAGCTTCCGGGTTCGGAACGGGACCGGGCGTGC
+CCC
+>SRR029699.5673
+CCCCCCGACCAGAACAACGCCGAAGATCAAAACGGTGAAAACAACGGCGAACAGGGAGGA
+AACGAATAATGAAGAAGCTTGTTCTTTTAGCCCTCGGGGCCTTATTGTTAGCGGGCGCCA
+CCTGCTCTCCCACACCGTCTCCAGTGCAGTACCATCGGCCGCTTGGGTCTTAACCATCGT
+GTTCGGGATGGGAACGGGTGTGTCCCCCAAGCGCATCGCCACCAGCAGTAGTTATCCTAG
+TTTTTGAAACTGTTAAAGTCTACGGCTCATCGTCCGTTACCTTAATAACTAAACAGTACG
+TAAAACCC
+>SRR029699.5714
+TTTTTTATCCCGAACACGATGGTAAAGACCTAAGCGGCCGATGGTACTGCACTGGAGACG
+GTGTGGGAGAGTAGGTGGCCGCCAAATTAAAAAGAAAATAAACCGAGAGGTTCTGCGATA
+AAGCTGGTCTTCACCAGTGATCAGAGTTTAAAAGAAAGCTTTTAGTCTCTGATGACTGAT
+GAAAAAGTCAGTCAGATGAAGAACTGAATAAGGACTTCATCTTATGTACCTTGAAAACTG
+CATATAGTAAAAATCAATAGATTTAGATAAATA
+>SRR029699.14351
+GGGGGGCGGTGGCGATGCGCTTTTGGGAAACACCCGTCCACATCCCGAACACGATGGTTA
+AGACTGAAGCGGCCGATAATACTATACTGGTAACGGTATGGGAAGGTAGGTGGCTGCCGG
+ATTTATAAAGAACAGCATAGCATATATGCTTTCAAAATAGAACAGGCTCGAAAGAGCTAA
+CCTACATCAGCAGGGAAGTGCTGTTTAGATAGCTGGTTTTACCAGTGATTAGAGTATTTT
+AAGATATTAAGGTTTCTAATGACTGAATAAAACAG
+>SRR029699.30831
+AAAAAAGCTTATGGGACACACCCGTTCCCATCCCGAACACGACGGTTAAGACGTAAGCGG
+CCGATGGTACTATGCTGGAGACGGCATGGGAGAGCAGGTGGGTGCCGGACTGAGACACGC
+AACAGGGGATAGGCAAGGCACACAGGGGATAGG
+>SRR029699.33623
+CCCCCCGACTACGAGGTTGATAGGCACGATGTGTAAGTGGAGCGATCCATTCAGCAAGTG
+TGTACTAATAGATCGAGGGCTTGACCACAATTCGCTTGAATTCTCAAGTCAATGACAAAA
+TGTTAGCAGTGATTATTCAGTTTTGAAGGCACGTCCTTCAAGAAATACTGGACAAAGTAA
+AACAGAAATGTTATACTGAACCAGTCATATTGGTCGGTGACGATGACGGTGAGGTTCCAC
+CTGTTCCCATTCCGAACACAG