changeset 6:d8fa616a228a draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cherry_pick_fasta commit 8384f1bb5378232bbe78319e06a3522674c7c1fe"
author artbio
date Fri, 08 Apr 2022 16:56:42 +0000
parents 144b856e926c
children 6c0aefd9fee3
files cherry_pick_fasta.py cherry_pick_fasta.xml test-data/input_withspace.fa test-data/output_withspace.fa
diffstat 4 files changed, 71 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/cherry_pick_fasta.py	Tue Apr 05 23:42:28 2022 +0000
+++ b/cherry_pick_fasta.py	Fri Apr 08 16:56:42 2022 +0000
@@ -1,10 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Chery pick of fasta sequences satisfying a query string in their header/name
 import argparse
 
-from Bio import SeqIO
-
 
 def Parser():
     the_parser = argparse.ArgumentParser(
@@ -68,7 +63,17 @@
 
 
 def buid_fasta_dict(fasta):
-    seq_dict = {rec.id: rec.seq for rec in SeqIO.parse(fasta, "fasta")}
+    """Return {full header line without '>': joined sequence} for `fasta`."""
+    seq_dict = dict()
+    # 'with' guarantees the handle is closed; a bare open() leaked it.
+    with open(fasta, 'r') as f:
+        content = f.read()
+    # Split only on '>' at a line start so a '>' inside a header survives;
+    # [1:] drops whatever precedes the first record.
+    for seq in ('\n' + content).split('\n>')[1:]:
+        # First line is the header, remaining lines are the sequence.
+        header, *sliced_seq = seq.split('\n')
+        seq_dict[header] = ''.join(sliced_seq)
     return seq_dict
 
 
--- a/cherry_pick_fasta.xml	Tue Apr 05 23:42:28 2022 +0000
+++ b/cherry_pick_fasta.xml	Fri Apr 08 16:56:42 2022 +0000
@@ -1,9 +1,10 @@
-<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.2.1">
+<tool id="cherry_pick_fasta" name="Pick Fasta sequences" version="3.3">
   <description>with header satisfying a string query</description>
   <requirements>
-        <requirement type="package" version="1.75">biopython</requirement>
+        <requirement type="package" version="3.8.0">python</requirement>
   </requirements>
-  <command interpreter="python">cherry_pick_fasta.py
+  <command detect_errors="exit_code"><![CDATA[
+                            python '$__tool_directory__/cherry_pick_fasta.py'
                                    --input $input
                                    --searchfor '$search.searchfor'
                                    #if $search.options_selector == 'single':
@@ -20,7 +21,7 @@
                                        #end if
                                    #end if
                                    --output $output
-  </command>
+  ]]></command>
 
   <inputs>
     <param name="input" type="data" format="fasta" label="Source file" help="Fasta file to parse" />
@@ -69,6 +70,14 @@
     <data name="output" format="fasta" label="Fasta sequences ${search.searchfor.value} ${search.options_selector} term(s) in header" />
   </outputs>
   <tests>
+    <!-- test headers with space -->
+    <test>
+        <param ftype="fasta" name="input" value="input_withspace.fa" />
+        <param name="query" value="type=rRNA" />
+        <param name="searchfor" value="with" />
+        <param name="match" value="include" />
+        <output name="output" ftype="fasta" file="output_withspace.fa" />
+    </test>
     <!-- exact matches -->
     <test>
         <param ftype="fasta" name="input" value="input.fa" />
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_withspace.fa	Fri Apr 08 16:56:42 2022 +0000
@@ -0,0 +1,31 @@
+>FBtr0070292 type=snoRNA; loc=X:complement(1482492..1482590); ID=FBtr0070292; name=snoRNA:M-RA; dbxref=FlyBase:FBtr0070292,FlyBase_Annotation_IDs:CR32807-RA,REFSEQ:NR_002093,RNAcentral:URS00002398EB_7227; MD5=81314d41b5db15222ab4c0daca9e3a65; length=99; parent=FBgn0044508; release=r6.45; species=Dmel; 
+AATTCAATGATTTCAACTTCACTGCTGACCAGCCACGCCTCAACGCCTCGGAGTGTTTGC
+CCCCAGTCTCTGATAGATACACCATATTCTAATACACAC
+>FBtr0072259 type=snoRNA; loc=2R:24176228..24176367; ID=FBtr0072259; name=snoRNA:Psi18S-1820-RA; dbxref=FlyBase:FBtr0072259,FlyBase_Annotation_IDs:CR32884-RA,REFSEQ:NR_001911,RNAcentral:URS00004071FD_7227; MD5=a2f621766d8c1f39cb12a50f1c35d266; length=140; parent=FBgn0026169; release=r6.45; species=Dmel; 
+ACCCCATGATATTAAGCCGGCAGGTGCCTGCAATCCCCACGGGCACCTGTGACTATGATT
+GGGAACAGCAAAGCTGCTGATGTGTGTATGTTGGGCAGCGCTCTTATCCTGCGCGCTCGA
+CAAAGAATATTCGGTCGACA
+>FBtr0076634 type=snoRNA; loc=3L:8601473..8601552; ID=FBtr0076634; name=snoRNA:U49:66Da-RA; dbxref=FlyBase:FBtr0076634,FlyBase_Annotation_IDs:CR32911-RA,REFSEQ:NR_001954,RNAcentral:URS000020066A_7227; MD5=9f94db9a46640f6af279661b83c98768; length=80; parent=FBgn0060292; release=r6.45; species=Dmel; 
+TGCACATGATGATAATTTGTCTTCTTGATAGGAAGTGCCAACTGACACATAAATGAAGCT
+AGTTTAACCCTCTGAAAGCA
+>FBtr0076635 type=snoRNA; loc=3L:8601948..8602031; ID=FBtr0076635; name=snoRNA:U49:66Db-RA; dbxref=FlyBase:FBtr0076635,FlyBase_Annotation_IDs:CR32910-RA,REFSEQ:NR_001955,RNAcentral:URS000008F83A_7227; MD5=b2175b964d1d7e4cbe00bd8e55b73ca8; length=84; parent=FBgn0060291; release=r6.45; species=Dmel; 
+CTGCACATGATGATAATTGAAAGTTCTTGATAGGAATTGCCGGCTGACACATATATGAAG
+CTAATTCTAGTTTATCTGATTACA
+>FBtr0077222 type=snoRNA; loc=X:complement(21386845..21386935); ID=FBtr0077222; name=snoRNA:MeU6-A47-RA; dbxref=FlyBase:FBtr0077222,FlyBase_Annotation_IDs:CR32519-RA,REFSEQ:NR_002137,RNAcentral:URS00005A5367_7227; MD5=77f36daf4093aa585511640befa5a75c; length=91; parent=FBgn0025882; release=r6.45; species=Dmel; 
+AGCCAATGATGATTAAACCACAAATGCAGGGAGTGGGCAGCTGGCAACAGTTGGCGCACG
+CATGACCAGCAACTAATCTTCTCGCTGACGG
+>FBtr0078576 type=snoRNA; loc=3L:complement(22869545..22869622); ID=FBtr0078576; name=snoRNA:Me28S-G764-RA; dbxref=FlyBase:FBtr0078576,FlyBase_Annotation_IDs:CR32907-RA,REFSEQ:NR_001980,RNAcentral:URS00004E3D30_7227; MD5=a9683fc8a27d695afdba8fa26613675a; length=78; parent=FBgn0020518; release=r6.45; species=Dmel; 
+ATATGATGTTAACAGCTTGTAATGATTATGCATTGTGATTATTCAAGTTTATTTTGTTTC
+AAGACGGGACTGACTATA
+>FBtr0086350 type=rRNA; loc=2R:complement(19764289..19764423); ID=FBtr0086350; name=5SrRNA:CR33358-RA; dbxref=FlyBase:FBtr0086350,FlyBase_Annotation_IDs:CR33358-RA,REFSEQ:NR_001887,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053358; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT
+AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT
+>FBtr0086351 type=rRNA; loc=2R:complement(19763913..19764047); ID=FBtr0086351; name=5SrRNA:CR33359-RA; dbxref=FlyBase:FBtr0086351,FlyBase_Annotation_IDs:CR33359-RA,REFSEQ:NR_001886,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053359; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT
+AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT
+>FBtr0086352 type=rRNA; loc=2R:complement(19763544..19763678); ID=FBtr0086352; name=5SrRNA:CR33360-RA; dbxref=FlyBase:FBtr0086352,FlyBase_Annotation_IDs:CR33360-RA,REFSEQ:NR_001885,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053360; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT
+AGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT
+>FBtr0086353 type=rRNA; loc=2R:complement(19763175..19763309); ID=FBtr0086353; name=5SrRNA:CR33361-RA; dbxref=FlyBase:FBtr0086353,FlyBase_Annotation_IDs:CR33361-RA,REFSEQ:NR_001884,RNAcentral:URS00005F0FA0_7227; MD5=9162f14f3b33b2210e2100cada0b7f92; length=135; parent=FBgn0053361; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGCGTCGGGCGCGGTTAGTACTT
+AGATGAGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCTCGTCCACAACTTTTT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_withspace.fa	Fri Apr 08 16:56:42 2022 +0000
@@ -0,0 +1,16 @@
+>FBtr0086350 type=rRNA; loc=2R:complement(19764289..19764423); ID=FBtr0086350; name=5SrRNA:CR33358-RA; dbxref=FlyBase:FBtr0086350,FlyBase_Annotation_IDs:CR33358-RA,REFSEQ:NR_001887,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053358; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC
+GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT
+CGTCCACAACTTTTT
+>FBtr0086351 type=rRNA; loc=2R:complement(19763913..19764047); ID=FBtr0086351; name=5SrRNA:CR33359-RA; dbxref=FlyBase:FBtr0086351,FlyBase_Annotation_IDs:CR33359-RA,REFSEQ:NR_001886,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053359; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC
+GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT
+CGTCCACAACTTTTT
+>FBtr0086352 type=rRNA; loc=2R:complement(19763544..19763678); ID=FBtr0086352; name=5SrRNA:CR33360-RA; dbxref=FlyBase:FBtr0086352,FlyBase_Annotation_IDs:CR33360-RA,REFSEQ:NR_001885,RNAcentral:URS00003B4856_7227; MD5=dacf3866c94cfe27156e359c01e8f212; length=135; parent=FBgn0053360; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC
+GTCGGGCGCGGTTAGTACTTAGATGGGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT
+CGTCCACAACTTTTT
+>FBtr0086353 type=rRNA; loc=2R:complement(19763175..19763309); ID=FBtr0086353; name=5SrRNA:CR33361-RA; dbxref=FlyBase:FBtr0086353,FlyBase_Annotation_IDs:CR33361-RA,REFSEQ:NR_001884,RNAcentral:URS00005F0FA0_7227; MD5=9162f14f3b33b2210e2100cada0b7f92; length=135; parent=FBgn0053361; release=r6.45; species=Dmel; 
+GCCAACGACCATACCACGCTGAATACATCGGTTCTCGTCCGATCACCGAAATTAAGCAGC
+GTCGGGCGCGGTTAGTACTTAGATGAGGGACCGCTTGGGAACACCGCGTGTTGTTGGCCT
+CGTCCACAACTTTTT