Repository 'find_subsequences'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/find_subsequences

Changeset 0:7f39014f9404 (2015-03-20)
Next changeset 1:d882a0a75759 (2015-04-10)
Commit message:
Imported from capsule None
added:
find_subsequences.py
find_subsequences.xml
test-data/find_subsequences_advanced_result1.bed
test-data/find_subsequences_input1.fasta
test-data/find_subsequences_simple_result1.bed
test-data/find_subsequences_simple_result2.bed
tool_dependencies.xml
b
diff -r 000000000000 -r 7f39014f9404 find_subsequences.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_subsequences.py Fri Mar 20 06:23:17 2015 -0400
[
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import re
+import sys
+import argparse
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqUtils import nt_search
+from Bio.Alphabet import generic_dna
+
+choices = ['embl', 'fasta', 'fastq-sanger', 'fastq', 'fastq-solexa', 'fastq-illumina', 'genbank', 'gb']
+
+def find_pattern(seqs, pattern, outfile_path):
+    """
+    Finds all occurrences of a pattern in the a given sequence.
+    Outputs sequence ID, start and end postion of the pattern.
+    """
+    pattern = pattern.upper()
+    rev_compl = Seq(pattern, generic_dna).complement()
+    search_func = simple_pattern_search
+    if set(pattern).difference(set('ATCG')):
+        search_func = complex_pattern_search
+
+    with open(outfile_path, 'w+') as outfile:
+        for seq in seqs:
+            search_func(seq, pattern, outfile)
+            search_func(seq, rev_compl, outfile, '-')
+
+
+def simple_pattern_search(sequence, pattern, outfile, strand='+'):
+    """
+    Simple regular expression search. This is way faster than the complex search.
+    """
+    bed_template = '%s\t%s\t%s\t%s\t%s\t%s\n'
+    for match in re.finditer( str(pattern), str(sequence.seq) ):
+        outfile.write(bed_template % (sequence.id,  match.start(), match.end(), sequence.description, '', strand))
+
+
+def complex_pattern_search(sequence, pattern, outfile, strand='+'):
+    """
+    Searching for pattern with biopyhon's nt_search().
+    This allows for ambiguous values, like N = A or T or C or G, R = A or G ...
+    """
+    l = len(pattern)
+    matches = nt_search(str(sequence.seq), pattern)
+    bed_template = '%s\t%s\t%s\t%s\t%s\t%s\n'
+    for match in matches[1:]:
+        outfile.write(bed_template % (sequence.id, match, match+l, sequence.description, '', strand) )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input' , required=True)
+    parser.add_argument('-o', '--output' , required=True)
+    parser.add_argument('-p', '--pattern' , required=True)
+    parser.add_argument('-f', '--format', default="fasta", choices=choices)
+    args = parser.parse_args()
+
+    with open(args.input) as handle:
+        find_pattern( SeqIO.parse(handle, args.format), args.pattern, args.output )
+
b
diff -r 000000000000 -r 7f39014f9404 find_subsequences.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_subsequences.xml Fri Mar 20 06:23:17 2015 -0400
[
b'@@ -0,0 +1,291 @@\n+<tool id="bg_find_subsequences" name="Nucleotide subsequence search" version="0.1">\n+    <description>providing regions in BED format</description>\n+    <requirements>\n+        <requirement type="package" version="1.65">biopython</requirement>\n+    </requirements>\n+    <command interpreter="python">\n+    <![CDATA[\n+        find_subsequences.py\n+            --input $input\n+            --output $output\n+            --pattern "$pattern_conditional.pattern"\n+            #if $input.ext == \'fasta\':\n+                --format \'fasta\'\n+            #else:\n+                --format \'fastq\'\n+            #end if\n+    ]]>\n+    </command>\n+    <inputs>\n+        <param format="fasta,fastq" name="input" type="data" label="Nucleotide sequence to be searched"\n+            help="This can be in FASTA or FASTQ format." />\n+\n+        <conditional name="pattern_conditional">\n+            <param name="pattern_conditional_select" type="select" label="Search with a">\n+                <option value="predefined">predefined pattern</option>\n+                <option value="user">user defined pattern</option>\n+            </param>\n+            <when value="predefined">\n+                <param name="pattern" type="select" label="Select one of the predefined pattern" help="">\n+                    <option value=\'CCTCGAGG\'>AbsI - CCTCGAGG</option>\n+                    <option value=\'CGCG\'>AccII - CGCG (FnuDII Bce31293I BceBI BceRI BepI Bpu95I Bsh1236I Bsp50I Bsp70I Bsp123I BspFNI BspJ76I BstFNI BstUI Bsu1192II Bsu1193I Bsu1532I Bsu6633I BsuEII BtkI Cpa1150I CpaAI Csp68KVI EsaBS9I FalII FauBII FspMI Hin1056I HpyF14I HpyF15I HpyF26I HpyF52II HpyF64IV MvaAI MvnI PflAI SceAI SelI ThaI TmaI TneDI Uba1321I Uba1404I Uba1405I Uba1446I)</option>\n+                    <option value=\'GCTAGC\'>AceII - GCTAGC (NheI AsuNHI BmtI BspOI LlaG2I PstNHI)</option>\n+                    <option value=\'AACGTT\'>AclI - AACGTT (Psp1406I UbaN9I)</option>\n+                    <option value=\'YGGCCR\'>AcoI - YGGCCR (CfrI Ava458I Bfi89I Cfr14I Cfr38I Cfr39I Cfr40I Cfr55I Cfr59I EaeI EciBI Eco90I Eco164I EcoHAI EcoHK31I EspHK16I EspHK24I KspHK15I Uba36I Uba1188I Uba1327I)</option>\n+                    <option value=\'CACGTG\'>AcvI - CACGTG (PmaCI BbrPI BcoAI Bsp87I Eco72I Pgl34I PmlI PshCI PshDI Psp38I PspBI PspCI VpaK3AI VpaK3BI)</option>\n+                    <option value=\'CACNNNGTG\'>AdeI - CACNNNGTG (DraIII BstIZ316I)</option>\n+                    <option value=\'CGATCG\'>Afa16RI - CGATCG (PvuI Afa22MI BmaI BmaAI BmaBI BmaCI BmaDI BpvUI BspCI BstZ8I Cas2I Cfr51I DrdIII EagBI EclJI ErhB9I Kpl79I MvrI NblI Ple19I PntI Psu161I Pvu84I RshI RspI SmaAIII SpaPII SplAIII Uba1129I Uba1139I Xgl3216I Xgl3217I Xgl3218I Xgl3219I Xgl3220I XmlI XmlAI XniI XorII XorKI)</option>\n+                    <option value=\'CTTAAG\'>AflII - CTTAAG (BfrI BsaFI BscLI BspTI Bst98I BstAFI BstPZ740I Cfr92I Esp4I MspCI TrsTII Uba1266I Uba1299I Uba1312I Uba1313I Uba1331I Uba1374I Uba1420I Uba1426I Uba1443I VfiI Vha464I)</option>\n+                    <option value=\'ACRYGT\'>AflIII - ACRYGT (Asp90I)</option>\n+                    <option value=\'TTSAA\'>AgsI - TTSAA</option>\n+                    <option value=\'CCSGG\'>AhaI - CCSGG (CauII AseII Asp1I AsuC2I BcnI BpuMI Bsp7I Bsp8I Bsp55I BspF105I BspJ67I EciDI Eco121I Eco179I Eco190I Eco1831I EcoHI HgiS21I HgiS22I Hin3I Kpn49kII Mgl14481I NciI Pae181I RshII SflHK10695I SflHK11086I SflHK11087I SflHK11572I SflHK115731I Ssp2I Tmu1I Uba41I Uba42I Uba1280I Uba1318I Uba1347I Uba1370I Uba1372I Uba1376I Uba1378I Uba1389I Uba1401I Uba1423I Uba1424I UbaN11I)</option>\n+                    <option value=\'ACTAGT\'>AhlI - ACTAGT (SpeI AclNI BcuI)</option>\n+                    <option value=\'AGCT\'>AluI - AGCT (AluBI BsaLI MarI Mho2111I MltI OtuI OtuNI OxaI Uba1433I Uba1441I)</option>\n+                    <option value=\'CAGNNNCTG\'>AlwNI - CAGNNNCTG (CaiI PstNI)</option>\n+                    <option value=\'CYCGRG\'>Ama87I - CYCGRG (AvaI AcrI AquI AspBI AspDI AvrI BcoI BmeT110I Bse15I BsiHKC'..b'utJI)</option>\n+                    <option value=\'CCANNNNNTGG\'>Van91I - CCANNNNNTGG (PflMI AccB7I AcpII Asp10HII BasI Esp1396I PflBI)</option>\n+                    <option value=\'GGWCC\'>VpaK11AI - GGWCC (AvaII AflI Asp697I Asp745I AspBII AspDII BamNII BceSII BcuAI Bme18I Bme216I BmpI Bsp71I Bsp100I Bsp128I Bsp132I Bsp133I Bsp1260I BspF53I BspJ105I BsrAI Bst4QI BthAI BtiI CauI CliI ClmII Csp68KI DsaIV EagMI Eco47I ErpI FdiI FspMSI FssI GspAI HgiBI HgiCII HgiEI HgiHIII HgiJI Hsp2I Kzo49I Mae7806III MfoI MliI MspAI MspNI NliII Nli3877II NmuAII NspDII NspGI NspHII NspKI Pfl19I PolI Psp03I SfnI Sgh1835I SinI SinAI SinBI SinCI SinDI SinEI SinFI SinGI SinHI SinJI SmuEI SynI TruI Tru28I Tsp301I Uba48I Uba62I Uba1131I Uba1249I Uba1272I Uba1278I Uba1304I Uba1314I Uba1373I Uba1413I Uba1438I UbaN15I Uth554I VpaK11I VpaK65I VpaK7AI VpaK13AI VpaK11BI VpaK11CI VpaK11DI)</option>\n+                    <option value=\'CCTNNNNNAGG\'>XagI - CCTNNNNNAGG (EcoNI Bpu1268I BsoEI BstENI BstWI Uba1289I Uba1290I Uba1308I Uba1309I Uba1310I)</option>\n+                    <option value=\'TCTAGA\'>XbaI - TCTAGA (BspAAII BspLU11II BsrXI Msp23I UbaN19I)</option>\n+                    <option value=\'CCANNNNNNNNNTGG\'>XcmI - CCANNNNNNNNNTGG</option>\n+                    <option value=\'CCCGGG\'>XcyI - CCCGGG (SmaI AhyI Cfr9I CfrJ4I CphBI EaeAI EclRI KteAI Pac25I PaeBI PspAI PspALI TspMI Uba1220I Uba1393I XmaI XmaCI)</option>\n+                    <option value=\'CGGCCG\'>XmaIII - CGGCCG (AaaI BseX3I BsoDI BstZI EagI EclXI Eco52I SenPT16I TauII Tsp504I)</option>\n+                    <option value=\'GTMKAC\'>XmiI - GTMKAC (AccI DsaVI FblI OmiBII)</option>\n+                    <option value=\'CTAG\'>XspI - CTAG (MaeI BfaI CchI FgoI FisI FspBI MjaI MthFI MthZI RmaI Rma485I Rma486I Rma490I Rma495I Rma496I Rma497I Rma500I Rma501I Rma503I Rma506I Rma509I Rma510I Rma515I Rma516I Rma517I Rma518I Rma519I Rma522I SspMI UbaN5I)</option>\n+                    <option value=\'GACGTC\'>ZraI - GACGTC (AatII AspJI Ppu1253I Ssp5230I)</option>\n+                </param>\n+\n+            </when>\n+            <when value="user">\n+                <param name="pattern" type="text" value="" label="Search pattern" help="For example GGATCC for BamHI.">\n+                    <validator type="regex" message="Only letters are allowed.">^[a-zA-Z]+$</validator>\n+                </param>\n+            </when>\n+        </conditional>\n+    </inputs>\n+    <outputs>\n+        <data format="bed" name="output" />\n+    </outputs>\n+    <tests>\n+        <test>\n+            <param name="input" value="find_subsequences_input1.fasta" ftype="fasta"/>\n+            <param name="pattern" value="atcg"/>\n+            <param name="pattern_conditional_select" value="user"/>\n+            <output name="output" file="find_subsequences_simple_result1.bed" ftype="bed"/>\n+        </test>\n+        <test>\n+            <param name="input" value="find_subsequences_input1.fasta" ftype="fasta"/>\n+            <param name="pattern" value="AGCT"/>\n+            <param name="pattern_conditional_select" value="predefined"/>\n+            <output name="output" file="find_subsequences_simple_result2.bed" ftype="bed"/>\n+        </test>\n+        <test>\n+            <param name="input" value="find_subsequences_input1.fasta" ftype="fasta"/>\n+            <param name="pattern" value="atnncg"/>\n+            <param name="pattern_conditional_select" value="user"/>\n+            <output name="output" file="find_subsequences_advanced_result1.bed" ftype="bed"/>\n+        </test>\n+    </tests>\n+    <help>\n+<![CDATA[\n+\n+**What it does**\n+\n+Searches for a subsequence in a larger nucleotide sequence. For example to get all restriction enzymes for BamHI.\n+\n+You can use ambiguous values using the standard `nucleotide ambiguity code <http://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html>`_.\n+\n+This tool is searching on both strands.\n+\n+]]>\n+    </help>\n+    <citations>\n+        <citation type="doi">10.1093/bioinformatics/btp163</citation>\n+    </citations>\n+</tool>\n'
b
diff -r 000000000000 -r 7f39014f9404 test-data/find_subsequences_advanced_result1.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_advanced_result1.bed Fri Mar 20 06:23:17 2015 -0400
b
@@ -0,0 +1,2 @@
+forward_advanced 9 15 forward_advanced +
+reverse_advanced 9 15 reverse_advanced -
b
diff -r 000000000000 -r 7f39014f9404 test-data/find_subsequences_input1.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_input1.fasta Fri Mar 20 06:23:17 2015 -0400
b
@@ -0,0 +1,11 @@
+>forward_simple
+ACCTGTAGCATCGGGGCTCGTA
+
+>reverse_simple
+AAAATTTAGCGATTGTAGTGTAGATAGTA
+
+>forward_advanced
+AAAAAAAAAATGGCGTTTTTTTTTT
+
+>reverse_advanced
+AAAAAAAAATAGGGCTTTTTTTTTTT
b
diff -r 000000000000 -r 7f39014f9404 test-data/find_subsequences_simple_result1.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/find_subsequences_simple_result1.bed Fri Mar 20 06:23:17 2015 -0400
b
@@ -0,0 +1,3 @@
+forward_simple 9 13 forward_simple +
+forward_simple 5 9 forward_simple -
+reverse_simple 6 10 reverse_simple -
b
diff -r 000000000000 -r 7f39014f9404 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Fri Mar 20 06:23:17 2015 -0400
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="biopython" version="1.65">
+      <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>