Mercurial > repos > artbio > cherry_pick_fasta
changeset 6:d8fa616a228a draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit 8384f1bb5378232bbe78319e06a3522674c7c1fe"
author | artbio |
---|---|
date | Fri, 08 Apr 2022 16:56:42 +0000 |
parents | 144b856e926c |
children | 6c0aefd9fee3 |
files | cherry_pick_fasta.py cherry_pick_fasta.xml test-data/input_withspace.fa test-data/output_withspace.fa |
diffstat | 4 files changed, 71 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/cherry_pick_fasta.py Tue Apr 05 23:42:28 2022 +0000 +++ b/cherry_pick_fasta.py Fri Apr 08 16:56:42 2022 +0000 @@ -1,10 +1,5 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Chery pick of fasta sequences satisfying a query string in their header/name import argparse -from Bio import SeqIO - def Parser(): the_parser = argparse.ArgumentParser( @@ -68,7 +63,17 @@ def buid_fasta_dict(fasta): - seq_dict = {rec.id: rec.seq for rec in SeqIO.parse(fasta, "fasta")} + seq_dict = dict() + f = open(fasta, 'r') + content = f.read() + segmented_content = content.split('>') + segmented_content = segmented_content[1:] + for seq in segmented_content: + sliced_seq = seq.split('\n') + header = sliced_seq[0] + sliced_seq = sliced_seq[1:] + sequence = ''.join(sliced_seq) + seq_dict[header] = sequence return seq_dict
--- a/cherry_pick_fasta.xml Tue Apr 05 23:42:28 2022 +0000 +++ b/cherry_pick_fasta.xml Fri Apr 08 16:56:42 2022 +0000 @@ -1,9 +1,10 @@ -<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.2.1"> +<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.3"> <description>with header satisfying a string query</description> <requirements> - <requirement type="package" version="1.75">biopython</requirement> + <requirement type="package" version="3.8.0">python</requirement> </requirements> - <command interpreter="python">cherry_pick_fasta.py + <command detect_errors="exit_code"><![CDATA[ + python '$__tool_directory__/cherry_pick_fasta.py' --input $input --searchfor '$search.searchfor' #if $search.options_selector == 'single': @@ -20,7 +21,7 @@ #end if #end if --output $output - </command> + ]]></command> <inputs> <param name="input" type="data" format="fasta" label="Source file" help="Fasta file to parse" /> @@ -69,6 +70,14 @@ <data name="output" format="fasta" label="Fasta sequences ${search.searchfor.value} ${search.options_selector} term(s) in header" /> </outputs> <tests> + <!-- test headers with space --> + <test> + <param ftype="fasta" name="input" value="input_withspace.fa" /> + <param name="query" value="type=rRNA" /> + <param name="searchfor" value="with" /> + <param name="match" value="include" /> + <output name="output" ftype="fasta" file="output_withspace.fa" /> + </test> <!-- exact matches --> <test> <param ftype="fasta" name="input" value="input.fa" />
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_withspace.fa Fri Apr 08 16:56:42 2022 +0000 @@ -0,0 +1,31 @@ +>FBtr0070292 type=snoRNA; loc=X:complement(1482492..1482590); ID=FBtr0070292; name=snoRNA:M-RA; dbxref=FlyBase:FBtr0070292,FlyBase_Annotation_IDs:CR32807-RA,REFSEQ:NR_002093,RNAcentral:URS00002398EB_7227; MD5=81314d41b5db15222ab4c0daca9e3a65; length=99; parent=FBgn0044508; release=r6.45; species=Dmel; +AATTCAATGATTTCAACTTCACTGCTGACCAGCCACGCCTCAACGCCTCGGAGTGTTTGC +CCCCAGTCTCTGATAGATACACCATATTCTAATACACAC +>FBtr0072259 type=snoRNA; loc=2R:24176228..24176367; ID=FBtr0072259; name=snoRNA:Psi18S-1820-RA; dbxref=FlyBase:FBtr0072259,FlyBase_Annotation_IDs:CR32884-RA,REFSEQ:NR_001911,RNAcentral:URS00004071FD_7227; MD5=a2f621766d8c1f39cb12a50f1c35d266; length=140; parent=FBgn0026169; release=r6.45; species=Dmel; +ACCCCATGATATTAAGCCGGCAGGTGCCTGCAATCCCCACGGGCACCTGTGACTATGATT +GGGAACAGCAAAGCTGCTGATGTGTGTATGTTGGGCAGCGCTCTTATCCTGCGCGCTCGA +CAAAGAATATTCGGTCGACA +>FBtr0076634 type=snoRNA; loc=3L:8601473..8601552; ID=FBtr0076634; name=snoRNA:U49:66Da-RA; dbxref=FlyBase:FBtr0076634,FlyBase_Annotation_IDs:CR32911-RA,REFSEQ:NR_001954,RNAcentral:URS000020066A_7227; MD5=9f94db9a46640f6af279661b83c98768; length=80; parent=FBgn0060292; release=r6.45; species=Dmel; +TGCACATGATGATAATTTGTCTTCTTGATAGGAAGTGCCAACTGACACATAAATGAAGCT +AGTTTAACCCTCTGAAAGCA +>FBtr0076635 type=snoRNA; loc=3L:8601948..8602031; ID=FBtr0076635; name=snoRNA:U49:66Db-RA; dbxref=FlyBase:FBtr0076635,FlyBase_Annotation_IDs:CR32910-RA,REFSEQ:NR_001955,RNAcentral:URS000008F83A_7227; MD5=b2175b964d1d7e4cbe00bd8e55b73ca8; length=84; parent=FBgn0060291; release=r6.45; species=Dmel; +CTGCACATGATGATAATTGAAAGTTCTTGATAGGAATTGCCGGCTGACACATATATGAAG +CTAATTCTAGTTTATCTGATTACA +>FBtr0077222 type=snoRNA; loc=X:complement(21386845..21386935); ID=FBtr0077222; name=snoRNA:MeU6-A47-RA; dbxref=FlyBase:FBtr0077222,FlyBase_Annotation_IDs:CR32519-RA,REFSEQ:NR_002137,RNAcentral:URS00005A5367_7227; MD5=77f36daf4093aa585511640befa5a75c; length=91; parent=FBgn0025882; release=r6.45; species=Dmel; +AGCCAATGATGATTAAACCACAAATGCAGGGAGTGGGCAGCTGGCAACAGTTGGCGCACG +CATGACCAGCAACTAATCTTCTCGCTGACGG +>FBtr0078576 type=snoRNA; loc=3L:complement(22869545..22869622); ID=FBtr0078576; name=snoRNA:Me28S-G764-RA; dbxref=FlyBase:FBtr0078576,FlyBase_Annotation_IDs:CR32907-RA,REFSEQ:NR_001980,RNAcentral:URS00004E3D30_7227; MD5=a9683fc8a27d695afdba8fa26613675a; length=78; parent=FBgn0020518; release=r6.45; species=Dmel; +ATATGATGTTAACAGCTTGTAATGATTATGCATTGTGATTATTCAAGTTTATTTTGTTTC +AAGACGGGACTGACTATA +>FBtr0086350 type=rRNA; loc=2R:complement(19764289..19764423); ID=FBtr0086350; name=5SrRNA:CR33358-RA; dbxref=FlyBase:FBtr0086350,FlyBase_Annotation_IDs:CR33358-RA,REFSEQ:NR_001887,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053358; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT +AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT +>FBtr0086351 type=rRNA; loc=2R:complement(19763913..19764047); ID=FBtr0086351; name=5SrRNA:CR33359-RA; dbxref=FlyBase:FBtr0086351,FlyBase_Annotation_IDs:CR33359-RA,REFSEQ:NR_001886,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053359; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT +AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT +>FBtr0086352 type=rRNA; loc=2R:complement(19763544..19763678); ID=FBtr0086352; name=5SrRNA:CR33360-RA; dbxref=FlyBase:FBtr0086352,FlyBase_Annotation_IDs:CR33360-RA,REFSEQ:NR_001885,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053360; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT +AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT +>FBtr0086353 type=rRNA; loc=2R:complement(19763175..19763309); ID=FBtr0086353; name=5SrRNA:CR33361-RA; dbxref=FlyBase:FBtr0086353,FlyBase_Annotation_IDs:CR33361-RA,REFSEQ:NR_001884,RNAcentral:URS00005F0FA0_7227; MD5=9162f14f3b33b2210e2100cada0b7f92; length=135; parent=FBgn0053361; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT +AGATGAGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_withspace.fa Fri Apr 08 16:56:42 2022 +0000 @@ -0,0 +1,16 @@ +>FBtr0086350 type=rRNA; loc=2R:complement(19764289..19764423); ID=FBtr0086350; name=5SrRNA:CR33358-RA; dbxref=FlyBase:FBtr0086350,FlyBase_Annotation_IDs:CR33358-RA,REFSEQ:NR_001887,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053358; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC +GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT +CGTCCACAACTTTTT +>FBtr0086351 type=rRNA; loc=2R:complement(19763913..19764047); ID=FBtr0086351; name=5SrRNA:CR33359-RA; dbxref=FlyBase:FBtr0086351,FlyBase_Annotation_IDs:CR33359-RA,REFSEQ:NR_001886,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053359; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC +GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT +CGTCCACAACTTTTT +>FBtr0086352 type=rRNA; loc=2R:complement(19763544..19763678); ID=FBtr0086352; name=5SrRNA:CR33360-RA; dbxref=FlyBase:FBtr0086352,FlyBase_Annotation_IDs:CR33360-RA,REFSEQ:NR_001885,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053360; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC +GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT +CGTCCACAACTTTTT +>FBtr0086353 type=rRNA; loc=2R:complement(19763175..19763309); ID=FBtr0086353; name=5SrRNA:CR33361-RA; dbxref=FlyBase:FBtr0086353,FlyBase_Annotation_IDs:CR33361-RA,REFSEQ:NR_001884,RNAcentral:URS00005F0FA0_7227; MD5=9162f14f3b33b2210e2100cada0b7f92; length=135; parent=FBgn0053361; release=r6.45; species=Dmel; +GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC +GTCGGGCGCGGTTAGTACTTAGATGAGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT +CGTCCACAACTTTTT