Repository 'sapp'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjkoehorst/sapp

Changeset 12:0773b11fb822 (2015-02-21)
Previous changeset 11:a712b378e090 (2015-02-21) Next changeset 13:1efd1975a68d (2015-02-21)
Commit message:
cutadapters added
modified:
fasta2rdf/fastatordf.xml
added:
protein2rdf/protein_to_ttl.py
protein2rdf/protein_to_ttl.xml
protein2rdf/test-data/NC_017117.faa
rnaseq/cutadapt/cutadapt_adapters.txt
b
diff -r a712b378e090 -r 0773b11fb822 fasta2rdf/fastatordf.xml
--- a/fasta2rdf/fastatordf.xml Sat Feb 21 16:33:42 2015 +0100
+++ b/fasta2rdf/fastatordf.xml Sat Feb 21 16:56:49 2015 +0100
b
@@ -1,4 +1,8 @@
 <tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
+    <requirements>
+        <requirement type='package' version="3.4">python</requirement>
+        <requirement type='package' version="1.0">rdflib</requirement>
+    </requirements>
  <description></description>
  <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
  #for $index, $id in enumerate( $ids ) 
b
diff -r a712b378e090 -r 0773b11fb822 protein2rdf/protein_to_ttl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/protein2rdf/protein_to_ttl.py Sat Feb 21 16:56:49 2015 +0100
[
@@ -0,0 +1,130 @@
+def delete_galaxy():
+    """Blank out every ``sys.path`` entry that points into ``galaxy-dist/``.
+
+    Matching entries are overwritten with ``''`` in place, so ``sys.path``
+    stays the same list object, keeps its length, and every other entry
+    keeps its position.
+    """
+    import sys
+
+    sys.path[:] = [
+        '' if "galaxy-dist/" in entry else entry
+        for entry in sys.path
+    ]
+
+
+# Galaxy ships some of the modules that rdflib requires; leaving Galaxy's
+# copies on the search path breaks the rdflib import, so blank them first.
+delete_galaxy()
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+import hashlib
+
+# In-memory rdflib store created at import time. main() builds its own
+# store for the graph it actually populates.
+store = plugin.get('IOMemory', Store)()
+
+# Base URI under which every resource and predicate of this tool is minted.
+# (The former module-level ``global`` statements were no-ops and are gone.)
+URI = "http://csb.wur.nl/genome/"
+# Prefixed name for rdfs:seeAlso, kept as a plain string.
+seeAlso = "rdfs:seeAlso"
+# Namespace built on URI; indexed as coreURI["name"] throughout this file.
+coreURI = Namespace(URI)
+
+
+def createClass(uri):
+    """Declare ``uri`` as an OWL class in the module-level ``genomeGraph``.
+
+    Adds ``uri rdf:type owl:Class`` and ``uri rdfs:subClassOf owl:Thing``,
+    then returns ``uri`` unchanged so the call can be used inline.
+    """
+    for predicate, target in ((RDF.type, OWL.Class), (RDFS.subClassOf, OWL.Thing)):
+        genomeGraph.add((uri, predicate, target))
+    return uri
+
+def _read_fasta(input_file):
+    """Read a FASTA file into an ordered mapping of header line -> sequence.
+
+    Keys are the raw header lines (leading ">" and line ending included);
+    values are the concatenated, whitespace-stripped sequence lines. File
+    order is preserved so coordinates derived from it are reproducible.
+    """
+    from collections import OrderedDict
+
+    records = OrderedDict()
+    header = ""
+    chunks = []
+    with open(input_file) as handle:
+        for line in handle:
+            if line.startswith(">"):
+                sequence = "".join(chunks)
+                if sequence:
+                    records[header] = sequence
+                header = line
+                chunks = []
+                # Register the header immediately so an entry without any
+                # sequence lines is still present in the result.
+                records[header] = ""
+            else:
+                chunks.append(line.strip())
+    sequence = "".join(chunks)
+    # Nothing read at all (empty file): do not invent an anonymous record.
+    if header or sequence:
+        records[header] = sequence
+    return records
+
+
+def fasta_parser(input_file):
+    """Convert a multi-protein FASTA file into triples in ``genomeGraph``.
+
+    Command-line values are read from ``sys.argv``:
+
+    * ``-idtag`` names the genome resource (spaces become ``_``); when it is
+      empty, ``-id_alternative`` is used instead (spaces and dots become
+      ``_``).
+    * ``-sourcedb`` is recorded on the genome, every CDS and every protein.
+    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-diagnosis``,
+      ``-country``, ``-location``, ``-date`` and ``-ids`` each add one
+      literal to the genome resource wherever they occur in ``sys.argv``.
+
+    Every FASTA record becomes a protein resource keyed by the MD5 of its
+    sequence, linked from a CDS whose begin/end are running offsets over the
+    concatenated sequences in file order.
+
+    Raises ``ValueError`` when ``-idtag`` or ``-sourcedb`` is not present in
+    ``sys.argv`` (and for ``-id_alternative`` when the fallback is needed).
+    """
+    argv = sys.argv
+
+    genome = argv[argv.index('-idtag') + 1].replace(" ", "_")
+    if genome == '':
+        genome = argv[argv.index('-id_alternative') + 1].replace(" ", "_").replace(".", "_")
+
+    genomeURI = coreURI[genome]
+
+    # Command-line flag -> predicate name under coreURI. Both '-idtag' and
+    # '-ids' are stored under "id_tag".
+    metadata_predicates = {
+        '-organism': "organism",
+        '-ncbi_taxid': "taxonomy",
+        '-idtag': "id_tag",
+        '-diagnosis': "diagnosis",
+        '-country': "country",
+        '-location': "location",
+        '-date': "date",
+        '-ids': "id_tag",
+    }
+    for index, element in enumerate(argv):
+        predicate = metadata_predicates.get(element)
+        if predicate is not None:
+            genomeGraph.add((genomeURI, coreURI[predicate], Literal(argv[index + 1])))
+
+    fastadict = _read_fasta(input_file)
+
+    #Create a class, to be the same as all the other genome conversions...
+    #TODO: Proteins are part of cds, cds are part of dnaobject
+    #If CDS is not there... how then?
+    classURI = coreURI[genome + "/" + "protein_fasta"]
+    proteinClass = createClass(coreURI["Protein"])
+    genomeClass = createClass(coreURI["Genome"])
+    typeClass = createClass(coreURI["DnaObject"])
+    cdsClass = createClass(coreURI["Cds"])
+
+    # Looked up once; the same literal is attached to genome, CDS and protein.
+    sourcedb = Literal(argv[argv.index("-sourcedb") + 1])
+
+    genomeGraph.add((genomeURI, RDF.type, genomeClass))
+    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
+    genomeGraph.add((genomeURI, coreURI["dnaobject"], classURI))
+    genomeGraph.add((classURI, RDF.type, typeClass))
+
+    #A theoretical begin, end is created to have a workable GBK generation
+    begin = 0
+    for sequence in fastadict.values():
+        sequence = sequence.encode('utf-8')
+        end = begin + len(sequence)
+        md5_protein = hashlib.md5(sequence).hexdigest()
+        proteinURI = coreURI["protein/" + md5_protein]
+
+        cdsURI = coreURI[genome + "/protein_fasta/" + str(begin) + "_" + str(end)]
+        genomeGraph.add((classURI, coreURI["feature"], cdsURI))
+        genomeGraph.add((cdsURI, coreURI["begin"], Literal(begin)))
+        genomeGraph.add((cdsURI, coreURI["end"], Literal(end)))
+        genomeGraph.add((cdsURI, coreURI["sourcedb"], sourcedb))
+        genomeGraph.add((cdsURI, coreURI["protein"], proteinURI))
+        genomeGraph.add((cdsURI, RDF.type, cdsClass))
+
+        genomeGraph.add((proteinURI, coreURI["md5"], Literal(md5_protein)))
+        genomeGraph.add((proteinURI, coreURI["sequence"], Literal(sequence)))
+        genomeGraph.add((proteinURI, coreURI["sourcedb"], sourcedb))
+        genomeGraph.add((proteinURI, RDF.type, proteinClass))
+
+        # The next CDS starts where this one ended.
+        begin = end
+
+def save():
+    """Serialize ``genomeGraph`` as Turtle to the path that follows ``-output``.
+
+    Raises ``ValueError`` when ``-output`` is not present in ``sys.argv``.
+    """
+    data = genomeGraph.serialize(format='turtle')
+    # serialize() returns bytes on older rdflib releases and str on newer
+    # ones; normalise so the binary write below works with either.
+    if isinstance(data, str):
+        data = data.encode('utf-8')
+    # The context manager guarantees the output file is flushed and closed.
+    with open(sys.argv[sys.argv.index("-output") + 1], "wb") as handle:
+        handle.write(data)
+
+def main():
+    """Create the global graph, fill it from the ``-input`` file and save it.
+
+    Builds a fresh in-memory store, publishes the graph as the module-level
+    ``genomeGraph`` (which ``createClass``, ``fasta_parser`` and ``save``
+    use), binds the ``ssb`` prefix to ``coreURI``, then parses and writes.
+    """
+    global genomeGraph
+    memory_store = plugin.get('IOMemory', Store)()
+    genomeGraph = Graph(memory_store, URIRef(URI))
+    genomeGraph.bind("ssb", coreURI)
+    fasta_parser(sys.argv[sys.argv.index("-input") + 1])
+    save()
+
+
+if __name__ == '__main__':
+    main()
+
b
diff -r a712b378e090 -r 0773b11fb822 protein2rdf/protein_to_ttl.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/protein2rdf/protein_to_ttl.xml Sat Feb 21 16:56:49 2015 +0100
b
@@ -0,0 +1,42 @@
+<tool id="SAPP_protein_rdf" name="Protein FASTA to RDF" version="0.1">
+    <requirements>
+        <requirement type='package' version="3.4">python</requirement>
+        <requirement type='package' version="1.0">rdflib</requirement>
+    </requirements>
+ <description></description>
+ <command interpreter="python3.4">protein_to_ttl.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' '-diagnosis' '$diagnosis' '-country' '$country' '-location' '$location' '-date' '$date' -sourcedb SAPP 
+ #for $index, $id in enumerate( $ids ) 
+ '-ids' '$id.id_tag'
+ #end for
+ '-id_alternative' '$input.name'
+ </command>
+ <inputs>
+ <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
+ <param size="60" name="organism" type="text" format="text" label="organism name"/>
+ <param size="60" name="diagnosis" type="text" format="text" label="Diagnosis of host if applicable"/>
+ <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
+ <param size="60" name="country" type="text" format="text" label="Country of sample"/>
+ <param size="60" name="location" type="text" format="text" label="Location of sample e.g., river, city, hospital"/>
+ <param size="60" name="date" type="text" format="text" label="Sample date"/>
+ <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!"/>
+ <repeat name="ids" title="Identification tags">     
+ <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
+ </repeat>
+ </inputs>
+
+ <outputs>
+ <data format="rdf" name="output" label="proteinTTL: ${input.name}" />
+ </outputs>
+    <tests>
+        <!-- Tool inputs are given as <param> elements named after the
+             parameters declared in <inputs>; only the produced dataset is an
+             <output>. File values are looked up in the test-data directory,
+             so they are given without that prefix. -->
+        <test>
+            <param name="input" value="NC_017117.faa"/>
+            <param name="ncbi_taxid" value="634455"/>
+            <param name="identification_tag" value="Acetobacter pasteurianus IFO 3283-22"/>
+            <param name="organism" value="Acetobacter pasteurianus IFO 3283-22"/>
+            <output name="output" file="NC_017117.rdf"/>
+        </test>
+    </tests>
+ <help>
+ RDF creation from a multi protein fasta file
+ </help>
+</tool>
b
diff -r a712b378e090 -r 0773b11fb822 protein2rdf/test-data/NC_017117.faa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/protein2rdf/test-data/NC_017117.faa Sat Feb 21 16:56:49 2015 +0100
[
b'@@ -0,0 +1,993 @@\n+>gi|384055706|ref|YP_005485330.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MKSDRFTDAQIMGVIRQAEGGVPVPDLCREHGISNATFYRWRAKYGGMDASMISQMKALEEENRRLKRMY\n+ADLSMQTDILKEALGKK\n+>gi|384055707|ref|YP_005485331.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n+MAGHHVEAMIARAHAQKRFMDDAGWRYVVELYGRYQSLLREQNAADFGDLLMWPTLAMLHNDAYRYRWSR\n+RFTAVMADEFQDVNRAQFLWLKMISEVSAEFFAVGDDSQSIL\n+>gi|384055708|ref|YP_005485332.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MVVGRNDCAKGRQMKDTVIGVDLAKNIFQVHGASRAGEVMFRKKLRRQQFMQFMATQPPALVVLEACGSA\n+HYWARELAGAGHEVRLIAPQYVKPFVKRQKNDAADAEAIVIAARQPEMRFVEPRTEAQQARGVLFRARQR\n+LVHQRTELVNALRAVLYEFGLVVPQGIAHIRHIEAMLDEAVLPEAVKQECLDLLRQISEQSVRIDVRTKK\n+IRMLAQESENTCRLQSMPGVGPLTALAIEAFAPDLQSFRRGRDFAAWLGLVPRQFSSGGKERLGKISKAG\n+QADIRRLLIMGAMTQVNWASRKAPAPGSWLARMLARKPRMLVAIALANRMARAIWAMATKQEDYRDPALS\n+VAA\n+>gi|384055709|ref|YP_005485333.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MEQIIRIGMDTSKSVFQLHGVNAKEQPVLRRKLSRREMVKFFEKLPPIEIAIEACGASHYWGRVLSCLGH\n+TVKLIAPQLVKPYVKRGKNDAADAEALCEAMSRPTMRFVPLKSEEEQAALMLIGMRARLIRNRTQLANTI\n+RGYAAEFGITAPKGMCRIEALLDRIAADESLPTLTRELFALHAKEYAELQGEIEQLEGKVMAWHRANECS\n+QRLAKIPGVGPIGAALLMMKTPDPHLFKSGRAFAAWIGLTPRDHSTGGKTRLGRITRAGDEVLRSTLVVG\n+ATAVVSHARRTNGKNASSWLRELLERKKPKLAAVALANKIARIAWKLMVSGEHYKRLLQQPGAAAV\n+>gi|384055710|ref|YP_005485334.1| DNA resolvase [Acetobacter pasteurianus IFO 3283-22]\n+MVPPKPGKTPVGGRLIGYARVSTDDQGTDAQLNELRDAGCTMIFEKHASGADRNRPVLIRLLRDMNAGDT\n+LVVVRLDRLARSVSHLLAVIEQLDYAGAHFRSLDDPIDTTTPQGMFSLQVLGAVAQLDADFFCDGVDGSQ\n+RHRDVPR\n+>gi|384055711|ref|YP_005485335.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MLTSRIHRRKPMGKPMSKATARANAAKSSIRAHVEHVFAHQKNRFNLFIRTIGLARAEAKLTLCNLAYNF\n+NRLIFHERLETAG\n+>gi|384055712|ref|YP_005485336.1| D-mannonate oxidoreductase [Acetobacter pasteurianus IFO 
3283-22]\n+MNLNRNAISHVPDTVYTPRYDPALLRPGIVHLGCGNFHRGHQVVATQAAIDAEGRDGLRWGIVSATMRRP\n+DLATVLQSQDNLYTLLTREPANTVASVMAAITEAVYAGDDNANLAARIADPATAIVTLTVTASGYYLSAD\n+GRLDPTFEAIQADLTAITPRTAPGIIAAGLAQVRQRGGVPPVILCCDNVNSNGATLRQAVIDLAALKGDD\n+LLAAWIETNVQFPDTMVDRIVPTATPDDIADACRLLGGIEDRAPISAEPWFQWVIGEFDGPRPRWVAHPG\n+TKFVSDVGVFERAKLQMLNGTHMLLAYVGALANLNTVSEAASDDALGRIAARFMRNEQTADVSLDTDELD\n+RYTVDLMQRFRNPGIVHEVTRIGRNGSAKMASRIVQPMRSNIEAGRPVDGAVLLIASWIRWFALHEQDEF\n+DIALTDPRAETLRGLCADARDDHKAQAEAFLAMEEVFGAPLPDHGKQVEAIASMLRRLTEESVPELLRTI\n+AH\n+>gi|384055713|ref|YP_005485337.1| phosphatase/phosphohexomutase [Acetobacter pasteurianus IFO 3283-22]\n+MTDTVFPAHLLKHKQEPVHGVVFDMDGLLLDSESLAMEALVFAARDLNYDIPMSFCRTMIGVPADGCRTM\n+VRKTYGQDFPLERFFELQEVHLRNFVDTGKLALKKGVLPLLDLLDTYKIPRAIATSSSRVRTDHHLKLVN\n+LFHRFNAIVTRDDVSKGKPDPEPYLTAAKKIGVNPAHALALEDSHSGARAAHAAGIRVIVVPDLLEATDE\n+IRGKALAIVQDLSIVEAYLKHAITGQA\n+>gi|384055714|ref|YP_005485338.1| hypothetical protein APA22_40090 [Acetobacter pasteurianus IFO 3283-22]\n+MRRDMDLVRQLLLKLEGIEKGPHDVLLIGGNSEEVAVDGRTSDEIYFHLTKIEEAGFLERVGGGAMTAVT\n+FRALSWKGQEFLDTIRDDSIWKKTKEKAGSASFDILAAVAKAVIKDRIKSLTGLDIG\n+>gi|384055715|ref|YP_005485339.1| hypothetical protein APA22_40100 [Acetobacter pasteurianus IFO 3283-22]\n+MRPLGSGLSVRTYGCSEADDQENDGWAKKDTGEIVALYEMSSPVMPSGLVSISRWKIKGCYPKSGLSRAM\n+LCPTKIPQSASNIALLIGSDWSFIEENVFCNHIEWQTCLPVFVMNLDHPA\n+>gi|384055716|ref|YP_005485340.1| DNA helicase superfamily I [Acetobacter pasteurianus IFO 
3283-22]\n+MSSKPSHHSVLSYWHSALLDDAQMKISFSRDNLVALDEEGFEKGKLPPDKTQALRKMHPASRDLAPDDSI\n+IAMAGIRILLGQVSHSTEHSKQPALFCMAMLVNVSPEGTIQPLKDAPPWINRELLEPSDGDVLIGDLATM\n+DTWLQLNPFEGGSLGKTLEWAEKLWNAVTGEDGLPDGYELWERVALQPAEASIGMIATLHQRRFYDTVLA\n+DTGLVTPLLARYIDGGPEPAVVDESQKWAAAGRARGTMTFAYGMSSSQSEAMTAFCSVKDGDILAVNGPP\n+GTGKTTLLQGIVATELVTRALEGGDPAVIVGTSTNNQAVTNIIDAMKKAMASKDSRPWARRWIEGADALG\n+LYFPSGEKEKEALKAGYLIASPGRGLGTMEWKGFPERERDTVDAWASRDAWINGYYGSFYPGVTPPLRKE\n+HLSGHGPQGARHDISLVEDGIAKIRARMKVLVETGRVCAGEARKLNQLYVASGYGTYPDITKAIAQREAL\n+LQERRPREDALKSDLKEKEAAAAVPRARINEENRKTRDLLKQRDDAVHAAGQKVEEVGAHAVALIAALPG\n+GGFFSNLMSGRNWANVERLVAEGRQGSFFRSLMQAQVKSKREWMDAINEMTASAERELATVRESREETRQ\n+ARDTLIQKLEREVAAADLVSKTARAEYDHYVGGSYVLAGRELEKLVTLKHQILQQLQDCCTAIETVLAPS\n+DWAAMFDMPEEKLPWRQSNWTGRLDVIEDFLDR'..b'DEVAPAV\n+RHLISQIQTTIA\n+>gi|384055875|ref|YP_005485499.1| multidrug resistance transporter EmrB/QacA [Acetobacter pasteurianus IFO 3283-22]\n+MGTSMTSSRVTNPLFVLLAASTGCALTVLDTNVVAIILPTIAREFRASFADIEWVISTYVLCFASLLLPA\n+GAIADRYGRRRIYLIGITTFALTSLFCGAAPSATALYLARALQGVSAAFLLAPALAIIGHTFHNPDERNR\n+AWAIWGSIMGLTMVLAPIIGGIIAYALGWRWAFYINIPICVLLAGAVFILVKESRDTDARRLDPVGIIFF\n+AAFMFGLTWGMINGQASGWTSWNALNGFIGGSISLGIFIASERAQSRPMLDLGLFSNPRFLGAVWAMFAY\n+AASAQVMASMLPLFLQNGLGRSALQAGFAMLPFALAMLIFPHIGRLLERHISSSGILAGGLSCVAIGNGI\n+TAWGAYVGSWIIVMAGMVVIGSGGGLLNGETQKAIMSVVPKERSGMASGISTTSRFSGILLGFAMLSGIL\n+ATMVRKWVAAFGCGTGCHHPSDFADAIVAGDLPSAISGLEGSNQEIAIQHAHHAFSYGFAVALLVASIFA\n+LGSSITVFTLMQSKMKQNIT\n+>gi|384055876|ref|YP_005485500.1| transposase, partial [Acetobacter pasteurianus IFO 3283-22]\n+MLAYAVMASVRYQANSLKPKKTQLRTRQSLSAGPFRRSGASS\n+>gi|384055877|ref|YP_005485501.1| transposase [Acetobacter pasteurianus IFO 
3283-22]\n+MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVILVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n+LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGQQADRYCRIIAD\n+HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n+ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n+SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n+CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n+KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n+>gi|384055878|ref|YP_005485502.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MQTECSAGAYEFPASCGRRVVARFDGGRMSSDGGVIVVKQADDILGLSRRFAACFRDKRHPGFVEYRVED\n+LVRQRIMGLALGYEDLNDHDALRHDLIFGLASGRLSGGRANCAALAGKSTLNRLERSGHKADRYCRIIAD\n+HEALATLFVTLFLDQHEHAPARIVLDVDATDDRIHGHQEGRAFHGYYGHNCYLPLYVFCGDHLLSATLRT\n+ADRDPGKEALADIRRIVEQIRSRWPRVRILVRGDSGFARDSLMTWCEDNHVDFLFGLAGNTRLYDRIASL\n+SAEVRDEAATTGRAARGFASFDWITKDSWTRRRRVVAKAEWRHGNRYHRFIVTTLPQGMSDPRHLYEQIY\n+CARGDMENRIKECQMDLFSDRTSSHTIRANQLRLWFSAAAYVLLTALQRLALGQTSLETATCGTIRARLL\n+KIATRVTLSVRRIVLSMPDMFPCQHEFALAHARLRRLRQAI\n+>gi|384055879|ref|YP_005485503.1| DNA helicase II UvrD/Rep [Acetobacter pasteurianus IFO 3283-22]\n+MLQFSYMSEEADAIAAEIGRRAASGCAWHDIAVIYRQNRLSRAIEEALIQARVPYEIVGDVGFYQRVAVK\n+DALALLSLAARPDDRQSDEAFRADFSHLRQFRVIL\n+>gi|384055880|ref|YP_005485504.1| DNA helicase RecD/TraA [Acetobacter pasteurianus IFO 
3283-22]\n+MTSAVVGEQCQTEALAGLVERVTFHNAENGFCVLRVKVRGQRDLVTVVGHAAMISAGEFVQMSGRWFNDH\n+THGLQFKAEFLKASPPTTVEGIERYLGSGMIRGIGPVYAKKLVKAFGEAVFDLIEQEPHRLREVTGIGPK\n+RAERIVGGWADQKVIREIMLFLHSNGVGTSRAVRIFKTYGQDAVRLISENPYRLAKDIRGIGFKTADQIA\n+RKMGIAPDAMIRVRAGISYALGEAMDEGHCGLPVGELLTSTAELLEVAAPLIETALALELEAGDVVADSV\n+GETSCIFLAGLYRAEQSIAERLRACAVGRPPWPEIDAEKAMTWVEGKTGLAMAPSQQEAVRLALRSKVLV\n+ITGGPGVGKTTLVNAILKIVTAKGTDVQLCAPTGRAAKRLSESTGLEGKTIHRLLETDPGNGSFKRDDTN\n+PLTCDLLVVDEASMVDVLLMRSLLRALPDSASLLIVGDVDQLPSVGPGQVLADIIGSDAVPVVRLTEVFR\n+QAAQSRIITNAHRINEGKMPELSAEEGSDFYFVEAAEPEVGLRKLLAVVKDRIPARFGLDPVRDVQVLCP\n+MNRGGLGARSLNIELQQALNPAGDVKVERFGWTYGPGDKVMQIANDYDRDVFNGDLGVIDKIDVEEGELT\n+VLFDGREVVYGFGELDELVLAYATTIHKSQGSEYPVVVIPLVTQHYTMLARNLLYTGVTRGRKLVVLVGQ\n+KKALAIAVRNQGGRLRWSKLRDWLVGTSGTGHLSRLKKP\n+>gi|384055881|ref|YP_005485505.1| phage integrase [Acetobacter pasteurianus IFO 3283-22]\n+MVESQVSHIQPEYKFHINLDEYDRRATLSADELKVVRRWKEENLVITKRQAPRLHKPLTDILYRSNLDRA\n+NSHRALKYLLLTVAHQEKPYWGWSEDLWVEIINNSPVLKKTGMVPQLIAVAYLLCGFRSVYKIQRNVATA\n+VVARLVFGAEIVDTECERLFSALTRVGFVCQTVRPLVPSVFAAVALQGENPKLESFDRKILEHTRECYTG\n+NHIAKRIGILSNGLAAMGLTSKVIHFRAYPPRHGTETDNINPEWMTWCRRWLETTTLREGSRRAVYNTLT\n+RIGIWLGREHPEVTGPEQWTVSVCADYLAAVDRLRVGDWGGSTFDYRLIPTVGQPLQAPTKVAYYQVMRR\n+FLSDIQSWEWARLRCNPRYHLSTPKNIAKYLGVNPRTIDDASWLKLTWASLNIEPDDLSPDCFYPFALLQ\n+AIAVVWTHAGLRSNEIARLRVGCTREQSEDVVDQSGNVVPAGQVCWLDVPEGKTSVAYTKPVGHAVHKYI\n+TAWMKKRASPRKHLDRRTGEHVHFLFQLRNRPIAKEVLNQTVIPLLCKKAGIPIEDSKGRITSHRGRASA\n+VSMLASVPQGMTIFDLAKWCGHTSVQSTMSYVRSKPTQLASAFAKADQAARMIEIVIDNEVIAAGATKDG\n+APWKYYDLGDSYCSNAFWSTCPHRMACARCYFNIPKPSAKGVVLAAQQAANRLLEEVWLSPEERDAVSGD\n+VEALEGMLNKLRDKPALDGRTPGEISATCGSQVSSPFTESE\n+>gi|384055882|ref|YP_005485506.1| transposase [Acetobacter pasteurianus IFO 3283-22]\n+MELGITPGQDADITQAEPLLENIEPDAFLADKAYDADRLIDRLIQRGITPVIPPKRNRTTRRVIPP\n'
b
diff -r a712b378e090 -r 0773b11fb822 rnaseq/cutadapt/cutadapt_adapters.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rnaseq/cutadapt/cutadapt_adapters.txt Sat Feb 21 16:56:49 2015 +0100
b
@@ -0,0 +1,48 @@
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG 
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG