Mercurial > repos > bebatut > fasta_add_barcode
changeset 0:04699558a38a draft default tip
planemo upload for repository https://github.com/ASaiM/galaxytools/tree/master/tools/fasta_add_barcode commit e857f7126443e115f11954085423f8999bc870aa-dirty
author | bebatut |
---|---|
date | Fri, 15 Apr 2016 06:04:56 -0400 |
parents | |
children | |
files | fasta_add_barcode.py fasta_add_barcode.xml test-data/input_sequences.fasta test-data/mapping_file.txt test-data/output_test.fasta |
diffstat | 5 files changed, 199 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Prepend a barcode to every sequence of a FASTA file.

The barcode of each sequence is looked up by its sequence identifier in a
mapping file made of two tab-separated columns: identifier, barcode.
"""

import sys
import os
import argparse
import copy
import operator


def write_seq_fasta_format(seq, output_file, line_width=60):
    """Write ``seq`` to ``output_file`` wrapped at ``line_width`` characters.

    Args:
        seq: Sequence string to write.
        output_file: Writable text file-like object.
        line_width: Maximum number of characters per output line.

    An empty ``seq`` writes nothing.
    """
    # ``range`` instead of ``xrange`` so the script runs on Python 2 and 3.
    for start in range(0, len(seq), line_width):
        output_file.write(seq[start:start + line_width] + '\n')


def _read_mapping(mapping_path):
    """Return a dict mapping sequence identifier to barcode.

    Raises:
        ValueError: If a non-blank line does not hold exactly two
            tab-separated columns.
    """
    mapping = {}
    with open(mapping_path, 'r') as mapping_file:
        for line in mapping_file:
            # Strip only the line terminator: slicing with [:-1] would eat a
            # barcode character when the last line has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue

            columns = line.split('\t')
            if len(columns) != 2:
                string = 'Incorrect number of column in mapping file.'
                string += '\nTwo tabular separated columns are expected'
                raise ValueError(string)

            mapping[columns[0]] = columns[1]
    return mapping


def _write_record(seq_id, seq, mapping, output_file):
    """Write one FASTA record with its barcode prepended to the sequence.

    Raises:
        ValueError: If ``seq_id`` has no entry in ``mapping``.
    """
    if seq_id not in mapping:
        string = 'A sequence identifier (' + seq_id + ') is'
        string += ' not found in mapping file'
        raise ValueError(string)

    output_file.write('>' + seq_id + '\n')
    write_seq_fasta_format(mapping[seq_id] + seq, output_file)


def fasta_add_barcode(args):
    """Copy a FASTA file, prepending the mapped barcode to each sequence.

    Args:
        args: Object exposing the attributes ``input_mapping_file``,
            ``input_sequence_file`` and ``output_sequence_file`` (paths).

    Output headers contain only the sequence identifier (the first
    whitespace-separated token of the input header); sequences are
    re-wrapped at 60 characters per line.

    Raises:
        ValueError: If the mapping file is malformed, a sequence identifier
            is missing from it, a header carries no identifier, or sequence
            data appears before the first header.
    """
    mapping = _read_mapping(args.input_mapping_file)

    seq_id = None
    seq_chunks = []
    with open(args.input_sequence_file, 'r') as input_sequence_file:
        with open(args.output_sequence_file, 'w') as output_sequence_file:
            for line in input_sequence_file:
                line = line.rstrip('\r\n')
                if line.startswith('>'):
                    # A new header closes the previous record, if any.
                    if seq_id is not None:
                        _write_record(seq_id, ''.join(seq_chunks), mapping,
                                      output_sequence_file)
                    header_fields = line[1:].split()
                    if not header_fields:
                        raise ValueError(
                            'A FASTA header without sequence identifier'
                            ' was found')
                    seq_id = header_fields[0]
                    seq_chunks = []
                elif line:
                    if seq_id is None:
                        raise ValueError(
                            'Sequence data found before the first FASTA'
                            ' header')
                    seq_chunks.append(line)

            # The final record has no following header to trigger its
            # output, so it must be flushed explicitly.
            if seq_id is not None:
                _write_record(seq_id, ''.join(seq_chunks), mapping,
                              output_sequence_file)


########
# Main #
########
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_sequence_file', required=True)
    parser.add_argument('--input_mapping_file', required=True)
    parser.add_argument('--output_sequence_file', required=True)
    args = parser.parse_args()

    fasta_add_barcode(args)
<tool id="fasta_add_barcode" name="Add barcodes" version="1.0">

    <description>to FASTA sequences</description>

    <requirements>
    </requirements>

    <stdio>
    </stdio>

    <!-- "-version" is not a valid interpreter flag; "--version" prints the version. -->
    <version_command>python --version</version_command>

    <!-- Paths are single-quoted so that unusual characters in them cannot break the command line. -->
    <command><![CDATA[
        python '$__tool_directory__/fasta_add_barcode.py'
            --input_sequence_file '$sequence_file'
            --input_mapping_file '$mapping_file'
            --output_sequence_file '$output_sequence_file'
    ]]>
    </command>

    <inputs>
        <param name="sequence_file" type="data" format="fasta" label="Sequence file" help=""/>
        <param name="mapping_file" type="data" format="tabular"
            label="Mapping file between sequence identifier and barcode to add"
            help="The mapping file must be a tab-delimited file with two columns. The first column contains the sequence identifier and the second column the corresponding barcode to add at the beginning of each sequence."/>
    </inputs>

    <outputs>
        <data format="fasta" name="output_sequence_file" metadata_source="sequence_file"
            label="Sequences with added barcodes from ${on_string}" />
    </outputs>

    <tests>
        <test>
            <param name="sequence_file" value="input_sequences.fasta"/>
            <param name="mapping_file" value="mapping_file.txt"/>
            <output name="output_sequence_file" file="output_test.fasta" />
        </test>
    </tests>

    <help><![CDATA[

**What it does**

This tool takes a FASTA file and adds a barcode at the beginning of each sequence.

The barcode of each sequence is determined from its sequence identifier. The mapping between sequence identifiers and barcodes is defined in the mapping file. This file must be a tab-delimited file with two columns: the first with the sequence identifiers and the second with the corresponding barcodes.

]]>
    </help>

    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_sequences.fasta Fri Apr 15 06:04:56 2016 -0400 @@ -0,0 +1,38 @@ +>SRR029699.4565 EXRA6YY02A66ZM +ACGGCTCGGTGCCGTCCACCGGTCAGCGGCGCCCTGGCCTCCCACGGGCTGACCCCGCAG +TACTCTCGGCGCGATGGGGCTTAGCTTCCGGGTTCGGAACGGGACCGGGCGTGCCCC +>SRR029699.5673 EXRA6YY02A5CA6 +GACCAGAACAACGCCGAAGATCAAAACGGTGAAAACAACGGCGAACAGGGAGGAAACGAA +TAATGAAGAAGCTTGTTCTTTTAGCCCTCGGGGCCTTATTGTTAGCGGGCGCCACCTGCT +CTCCCACACCGTCTCCAGTGCAGTACCATCGGCCGCTTGGGTCTTAACCATCGTGTTCGG +GATGGGAACGGGTGTGTCCCCCAAGCGCATCGCCACCAGCAGTAGTTATCCTAGTTTTTG +AAACTGTTAAAGTCTACGGCTCATCGTCCGTTACCTTAATAACTAAACAGTACGTAAAAC +CC +>SRR029699.5714 EXRA6YY02A735O +ATCCCGAACACGATGGTAAAGACCTAAGCGGCCGATGGTACTGCACTGGAGACGGTGTGG +GAGAGTAGGTGGCCGCCAAATTAAAAAGAAAATAAACCGAGAGGTTCTGCGATAAAGCTG +GTCTTCACCAGTGATCAGAGTTTAAAAGAAAGCTTTTAGTCTCTGATGACTGATGAAAAA +GTCAGTCAGATGAAGAACTGAATAAGGACTTCATCTTATGTACCTTGAAAACTGCATATA +GTAAAAATCAATAGATTTAGATAAATA +>SRR029699.14351 EXRA6YY04CD0L5 +CGGTGGCGATGCGCTTTTGGGAAACACCCGTCCACATCCCGAACACGATGGTTAAGACTG +AAGCGGCCGATAATACTATACTGGTAACGGTATGGGAAGGTAGGTGGCTGCCGGATTTAT +AAAGAACAGCATAGCATATATGCTTTCAAAATAGAACAGGCTCGAAAGAGCTAACCTACA +TCAGCAGGGAAGTGCTGTTTAGATAGCTGGTTTTACCAGTGATTAGAGTATTTTAAGATA +TTAAGGTTTCTAATGACTGAATAAAACAG +>SRR029699.30831 EYTWXA302GEDQR +GCTTATGGGACACACCCGTTCCCATCCCGAACACGACGGTTAAGACGTAAGCGGCCGATG +GTACTATGCTGGAGACGGCATGGGAGAGCAGGTGGGTGCCGGACTGAGACACGCAACAGG +GGATAGGCAAGGCACACAGGGGATAGG +>SRR029699.33623 EYTWXA302HT3FX +GACTACGAGGTTGATAGGCACGATGTGTAAGTGGAGCGATCCATTCAGCAAGTGTGTACT +AATAGATCGAGGGCTTGACCACAATTCGCTTGAATTCTCAAGTCAATGACAAAATGTTAG +CAGTGATTATTCAGTTTTGAAGGCACGTCCTTCAAGAAATACTGGACAAAGTAAAACAGA +AATGTTATACTGAACCAGTCATATTGGTCGGTGACGATGACGGTGAGGTTCCACCTGTTC +CCATTCCGAACACAG +>SRR029699.38075 EYTWXA302GR8DJ +ACTTTATCAAGATACCAAGTGGAGAATACGAGATTCGAACTCGTGACCTCCTGCTTGCAA +GGCAGGCGCTCTCCCAACTGAGCTAATCCCCCAAGGGAATCCGGCAGCCACCTGCTCTCC +CATGCCATCTCCAGCATAGTACCATCGGCCGCTCAGGTCTTAACCATCGTGTTCGGGATG 
+GGAACGGGTGTGTCCCCTGAGCGCATCGCCACCGGAAATATCTTATCAAGTTTTTGCTTG +ATAACTGAATAAAC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mapping_file.txt Fri Apr 15 06:04:56 2016 -0400 @@ -0,0 +1,7 @@ +SRR029699.4565 AAAAAA +SRR029699.5673 CCCCCC +SRR029699.5714 TTTTTT +SRR029699.14351 GGGGGG +SRR029699.30831 AAAAAA +SRR029699.33623 CCCCCC +SRR029699.38075 TTTTTT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_test.fasta Fri Apr 15 06:04:56 2016 -0400 @@ -0,0 +1,33 @@ +>SRR029699.4565 +AAAAAAACGGCTCGGTGCCGTCCACCGGTCAGCGGCGCCCTGGCCTCCCACGGGCTGACC +CCGCAGTACTCTCGGCGCGATGGGGCTTAGCTTCCGGGTTCGGAACGGGACCGGGCGTGC +CCC +>SRR029699.5673 +CCCCCCGACCAGAACAACGCCGAAGATCAAAACGGTGAAAACAACGGCGAACAGGGAGGA +AACGAATAATGAAGAAGCTTGTTCTTTTAGCCCTCGGGGCCTTATTGTTAGCGGGCGCCA +CCTGCTCTCCCACACCGTCTCCAGTGCAGTACCATCGGCCGCTTGGGTCTTAACCATCGT +GTTCGGGATGGGAACGGGTGTGTCCCCCAAGCGCATCGCCACCAGCAGTAGTTATCCTAG +TTTTTGAAACTGTTAAAGTCTACGGCTCATCGTCCGTTACCTTAATAACTAAACAGTACG +TAAAACCC +>SRR029699.5714 +TTTTTTATCCCGAACACGATGGTAAAGACCTAAGCGGCCGATGGTACTGCACTGGAGACG +GTGTGGGAGAGTAGGTGGCCGCCAAATTAAAAAGAAAATAAACCGAGAGGTTCTGCGATA +AAGCTGGTCTTCACCAGTGATCAGAGTTTAAAAGAAAGCTTTTAGTCTCTGATGACTGAT +GAAAAAGTCAGTCAGATGAAGAACTGAATAAGGACTTCATCTTATGTACCTTGAAAACTG +CATATAGTAAAAATCAATAGATTTAGATAAATA +>SRR029699.14351 +GGGGGGCGGTGGCGATGCGCTTTTGGGAAACACCCGTCCACATCCCGAACACGATGGTTA +AGACTGAAGCGGCCGATAATACTATACTGGTAACGGTATGGGAAGGTAGGTGGCTGCCGG +ATTTATAAAGAACAGCATAGCATATATGCTTTCAAAATAGAACAGGCTCGAAAGAGCTAA +CCTACATCAGCAGGGAAGTGCTGTTTAGATAGCTGGTTTTACCAGTGATTAGAGTATTTT +AAGATATTAAGGTTTCTAATGACTGAATAAAACAG +>SRR029699.30831 +AAAAAAGCTTATGGGACACACCCGTTCCCATCCCGAACACGACGGTTAAGACGTAAGCGG +CCGATGGTACTATGCTGGAGACGGCATGGGAGAGCAGGTGGGTGCCGGACTGAGACACGC +AACAGGGGATAGGCAAGGCACACAGGGGATAGG +>SRR029699.33623 +CCCCCCGACTACGAGGTTGATAGGCACGATGTGTAAGTGGAGCGATCCATTCAGCAAGTG +TGTACTAATAGATCGAGGGCTTGACCACAATTCGCTTGAATTCTCAAGTCAATGACAAAA +TGTTAGCAGTGATTATTCAGTTTTGAAGGCACGTCCTTCAAGAAATACTGGACAAAGTAA +AACAGAAATGTTATACTGAACCAGTCATATTGGTCGGTGACGATGACGGTGAGGTTCCAC +CTGTTCCCATTCCGAACACAG