Previous changeset 5:e159dbecdad6 (2015-02-21) Next changeset 8:e51957423315 (2015-02-21) |
Commit message:
FASTA to RDF |
modified:
gbk2rdf/gbktordf.py |
added:
fasta2rdf/fastatordf.py fasta2rdf/fastatordf.xml fasta2rdf/test-data/.DS_Store fasta2rdf/test-data/NC_017117.fna |
b |
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/fastatordf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2rdf/fastatordf.py Sat Feb 21 15:23:15 2015 +0100 |
[ |
#!/usr/bin/env python3.4
# Author: Jasper Jan Koehorst
# Date created: Jan 22 2015
# Function: generation of a RDF file from a genome fasta file
"""Convert a genome FASTA file into an RDF graph serialised as Turtle.

Command-line options (each one is followed by its value):

    -input           path of the FASTA file to read
    -output          path of the Turtle file to write
    -idtag           genome identifier; the literal string 'None' means
                     "fall back to -id_alternative"
    -id_alternative  fallback identifier, only read when -idtag is 'None'
    -organism        stored as the genome's "organism" literal
    -ncbi_taxid      stored as the genome's "taxonomy" literal
    -ids             extra identifier, may be repeated; stored as "id_tag"
    -sourcedb        stored as "sourcedb" on the genome and on every record
"""

import sys
from collections import OrderedDict


def delete_galaxy():
    """Blank out every ``sys.path`` entry that contains ``galaxy-dist/``.

    The matching entries are overwritten with ``''`` in place rather than
    removed, so the length and ordering of ``sys.path`` are unchanged.
    """
    for index, path in enumerate(sys.path):
        if "galaxy-dist/" in path:
            sys.path[index] = ''


# Some modules that are required by RDFLIB are also in galaxy, this messes up
# the RDF import function.  The clean-up therefore has to run before rdflib
# is imported, which is why these imports are not at the very top.
delete_galaxy()

from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, OWL, plugin  # noqa: E402
from rdflib.store import Store  # noqa: E402

# Module-level store kept for backward compatibility; main() builds its own.
store = plugin.get('IOMemory', Store)()

URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# Command-line flag -> local name of the predicate attached to the genome.
_ARG_PREDICATES = {
    '-organism': "organism",
    '-ncbi_taxid': "taxonomy",
    '-idtag': "id_tag",
    '-ids': "id_tag",
}


def _arg_value(flag):
    """Return the command-line value that follows the first ``flag``.

    Raises:
        ValueError: ``flag`` is not present in ``sys.argv``.
        IndexError: ``flag`` is the last argument and has no value.
    """
    try:
        return sys.argv[sys.argv.index(flag) + 1]
    except ValueError:
        raise ValueError("missing required command-line option: " + flag)
    except IndexError:
        raise IndexError("command-line option %s requires a value" % flag)


def _read_fasta(input_file):
    """Parse ``input_file`` into an ``OrderedDict`` of header -> sequence.

    Records keep the order in which their header first appears in the file,
    so the numbering derived from that order is reproducible.  Blank lines
    are ignored.  A header that occurs twice restarts its sequence.

    Raises:
        ValueError: sequence data appears before the first ``>`` header.
    """
    chunks = OrderedDict()
    key = None
    with open(input_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                key = line.lstrip(">").strip()
                chunks[key] = []
            elif key is None:
                raise ValueError(
                    "%s: sequence data found before the first FASTA header"
                    % input_file)
            else:
                chunks[key].append(line)
    # Join once per record instead of growing a string line by line.
    return OrderedDict(
        (header, "".join(parts)) for header, parts in chunks.items())


def createClass(uri):
    """Declare ``uri`` as an ``owl:Class`` that is a subclass of ``owl:Thing``.

    The triples are added to the module-level ``genomeGraph``, so ``main()``
    (or the caller) must have created it first.  Returns ``uri`` unchanged.
    """
    genomeGraph.add((uri, RDF.type, OWL.Class))
    genomeGraph.add((uri, RDFS.subClassOf, OWL.Thing))
    return uri


def fasta_parser(input_file):
    """Add the genome described by ``input_file`` to ``genomeGraph``.

    The genome URI is built from ``-idtag`` (spaces become underscores), or
    from ``-id_alternative`` (spaces and dots become underscores) when
    ``-idtag`` is the string 'None'.  Every FASTA record becomes a
    ``<genome>/dnaobject_<n>`` resource linked from the genome, where ``n``
    is the record's position in the file.

    Due to RDF performance the hierarchy is kept flat: a genome links
    directly to its DNA objects instead of Genome > Class > Scaffold > Contig.
    """
    # The "Type" class is never referenced below but is still declared so the
    # generated graph keeps the same set of classes as before.
    createClass(coreURI["Type"])
    genomeClass = createClass(coreURI["Genome"])
    typeClass = createClass(coreURI["DnaObject"])

    genomeID = _arg_value('-idtag').replace(" ", "_")
    if genomeID == 'None':
        genomeID = (_arg_value('-id_alternative')
                    .replace(" ", "_").replace(".", "_"))
    genomeURI = coreURI[genomeID]

    # Descriptive options may be repeated (e.g. -ids), so scan all of argv.
    for index, element in enumerate(sys.argv):
        predicate = _ARG_PREDICATES.get(element)
        if predicate is not None:
            genomeGraph.add(
                (genomeURI, coreURI[predicate], Literal(sys.argv[index + 1])))

    # Genome-level triples do not depend on the records: add them once.
    sourcedb = Literal(_arg_value("-sourcedb"))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))

    fastadict = _read_fasta(input_file)
    for index, (header, sequence) in enumerate(fastadict.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((typeURI, RDF.type, typeClass))


def save():
    """Serialise ``genomeGraph`` as Turtle into the file named by ``-output``."""
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib version serialize() yields bytes or str; the
    # file is opened in binary mode, so normalise to bytes first.
    if isinstance(data, str):
        data = data.encode("utf-8")
    with open(_arg_value("-output"), "wb") as handle:
        handle.write(data)


def main():
    """Build the graph from the ``-input`` FASTA file and write ``-output``."""
    global genomeGraph
    memory_store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(memory_store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    input_file = _arg_value("-input")
    fasta_parser(input_file)
    save()


if __name__ == '__main__':
    main()
b |
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/fastatordf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2rdf/fastatordf.xml Sat Feb 21 15:23:15 2015 +0100 |
b |
<!-- Galaxy tool wrapper: runs fastatordf.py to turn a genome FASTA file into RDF. -->
<tool id="SAPP_genome_to_ttl" name="FASTA to RDF" version="0.1">
  <description></description>
  <command interpreter="python3">fastatordf.py '-input' '$input' '-output' '$output' '-organism' '$organism' '-ncbi_taxid' '$ncbi_taxid' '-idtag' '$identification_tag' -sourcedb SAPP
    #for $index, $id in enumerate( $ids )
      '-ids' '$id.id_tag'
    #end for
    '-id_alternative' '$input.name'
  </command>
  <inputs>
    <param size="60" name="input" type="data" format="fasta,fa" label="File for annotation, file types used fasta,fa"/>
    <param size="60" name="organism" type="text" format="text" label="organism name" optional="false"/>
    <param size="60" name="ncbi_taxid" type="text" format="text" label="NCBI taxonomy ID"/>
    <param size="60" name="identification_tag" type="text" format="text" label="An identification tag used for RDF storage !Needs to be very unique!" optional="false"/>
    <!-- Each repeat entry is passed to the script as an extra '-ids' option. -->
    <repeat name="ids" title="Identification tags">
      <param size="60" name="id_tag" type="text" format="text" label="An identification tag used by other consortiums"/>
    </repeat>
  </inputs>

  <outputs>
    <data format="rdf" name="output" label="genomeTTL: ${input.name}" />
  </outputs>

  <tests>
    <test>
      <!-- Inputs are <param> elements named after the tool parameters (no '$');
           file values are given relative to the test-data directory. -->
      <param name="input" value="NC_017117.fna"/>
      <param name="organism" value="Acetobacter pasteurianus IFO 3283-22"/>
      <param name="ncbi_taxid" value="634455"/>
      <param name="identification_tag" value="Acetobacter pasteurianus IFO 3283-22"/>
      <output name="output" file="NC_017117.rdf"/>
    </test>
  </tests>

<help> Genome FASTA file to RDF</help>
</tool>
b |
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/test-data/.DS_Store |
b |
Binary file fasta2rdf/test-data/.DS_Store has changed |
b |
diff -r e159dbecdad6 -r c79025539d9b fasta2rdf/test-data/NC_017117.fna --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2rdf/test-data/NC_017117.fna Sat Feb 21 15:23:15 2015 +0100 |
b |
b'@@ -0,0 +1,2736 @@\n+>gi|384055705|ref|NC_017117.1| Acetobacter pasteurianus IFO 3283-22 plasmid pAPA22-010, complete sequence\n+CGCAGGTTGAGTTCCTGTTCCCGATAGATCCGATAAACCCGCTTATGATTCCAGAGCTGTCCCTGCACAT\n+TGCGCAGATACAGGAAACACAGACCAAATCCCCATCTCCTGTGAGCCTGGGTCAGTCCCACCAGAAGAGC\n+GGCAATCCTGTCGTTCTCCGCTGCCAGTCGCGGACGATAGCGAAAGCAGGTCTCGGATATCCCAAAAATC\n+CGACAGGCCAGCGCAATGCTGACCCCATGATGCGCCACAGCTTGTGCGGCCAGTTCCCGGCGCTGGGCTG\n+GCCGCTTCATTTTTTTCCAAGGGCTTCCTTCAGGATATCCGTCTGCATGCTCAAATCCGCATACATGCGC\n+TTCAGCCGACGGTTCTCCTCTTCCAAAGCCTTCATCTGACTGATCATCGAAGCATCCATGCCGCCATATT\n+TCGCGCGCCACCGGTAAAACGTGGCGTTGCTGATCCCATGCTCCCGACACAGGTCAGGAACCGGGACACC\n+GCCCTCAGCCTGGCGGATCACACCCATGATCTGGGCGTCAGTAAAGCGATCACTCTTCATCAGAATCTCC\n+TCAATTCTTACGCTGAGAAAATTCTCATTCAAAAGTCACTCTTTTTATGGGGGGATTACCACTCTAAATC\n+AATGCATTCCAATTAACTTATAAAATGCTTTGAGAGTCATCACCTACAGCAAAGAACTCTGCTGACACCT\n+CTGAAATCATTTTGAGCCATAAAAACTGGGCTCGATTGACGTCCTGAAACTCATCAGCCATCACGGCTGT\n+GAAACGGCGTGACCAGCGGTAGCGATAGGCATCATTGTGCAGCATTGCCAATGTCGGCCACATCAACAGA\n+TCCCCAAAATCTGCAGCATTCTGTTCGCGCAGCAAACTCTGGTAACGACCATACAACTCAACCACATAGC\n+GCCAGCCCGCATCGTCCATAAAACGTTTCTGGGCATGTGCTCGCGCTATCATGGCTTCAACATGATGCCC\n+TGCCATCTCAGGCGTCACCAGATCTTCCTTCAAACGAGATGGGGTGGTTGCCGTCCTCCCCGCACGGCAT\n+CGCAATGTGCCAGAATGGTCGTTGGAAGAAACGACTGCGCGAAAGGACGGCAGATGAAGGATACAGTGAT\n+AGGCGTTGATCTGGCAAAGAACATTTTCCAGGTTCATGGAGCTTCGCGTGCGGGCGAGGTGATGTTTCGC\n+AAAAAGCTGCGTCGTCAGCAGTTTATGCAGTTCATGGCCACGCAGCCGCCTGCTCTGGTCGTTCTTGAAG\n+CGTGCGGGAGCGCGCATTACTGGGCTCGCGAACTGGCAGGAGCTGGTCACGAGGTCAGACTGATCGCTCC\n+GCAGTATGTGAAGCCTTTCGTGAAGCGCCAGAAGAACGATGCTGCTGATGCGGAAGCGATCGTCATTGCG\n+GCCCGTCAGCCGGAAATGCGCTTTGTCGAACCACGCACTGAAGCGCAGCAGGCGCGTGGCGTTCTTTTCC\n+GGGCCCGGCAGCGTCTGGTGCACCAGCGCACGGAACTGGTGAATGCCCTGCGTGCCGTTCTGTATGAATT\n+CGGTCTCGTCGTGCCACAGGGGATTGCGCATATCAGACACATTGAAGCCATGCTGGATGAGGCGGTTCTG\n+CCAGAGGCTGTGAAGCAGGAATGCCTTGATCTGCTGCGACAGATTTCGGAGCAGAGTGTGCGGATTGATG\n+TCAGAACAAAGAAGATCAGGATGCTTGCCCAGGAAAGTGAAAACACCTGCAGATTGCAGAGCATGCCTGG\n+AGTGGGTCCTCTGACCGCTCTTGCGATTGAAGCTTTTGCGCCTG
ACCTGCAGAGCTTCCGGCGCGGGCGC\n+GACTTTGCTGCGTGGCTGGGGCTGGTGCCCCGTCAGTTCTCATCTGGCGGAAAGGAAAGGCTGGGGAAGA\n+TATCAAAAGCCGGGCAGGCTGATATCCGCAGGCTTCTCATCATGGGCGCCATGACCCAGGTGAACTGGGC\n+CAGCCGTAAGGCCCCTGCACCGGGAAGCTGGCTGGCACGGATGCTGGCCCGCAAGCCCCGTATGCTGGTA\n+GCCATTGCGCTGGCCAACAGGATGGCACGAGCCATCTGGGCCATGGCAACAAAACAGGAGGATTATCGGG\n+ATCCGGCCCTGTCCGTGGCAGCCTGAGCGATGGCTCGGCTCCCGCGGATGGAACCGGTAGGGGTGTGAGA\n+GGGCGATGACCTGAATGGGCGCATGATCGTCTGATCCGGATCGGAAAAACCAGTGGATTTCTCTGTGCTT\n+TAAAGCACGCCTGTGAGATTTGGATCTGATCCGCTGATCACCATACTGGCCAGTGGCTTCTGAAAGGCCA\n+CATCAACAGGCCTTACAGAAGACCGCACACGATCACACGTCAATATGGGTCAGAAAACTCTTGCATAACG\n+GACGGCAACCATATGTGGACGGCTCCCCCTTGCAAGAGGCTAGGCAAGAAAATGATCGGATCTTTGCTTC\n+CATATGTCCGGCCTGTTGATGCGGCCATAGGGTCGCTGGCCAAGATGGCTTCCGCAGCGTGAGCCCCAAA\n+CACAGAAGCGGTCTTTGATGACCACTGGTTGCCACGGGTTTTCTCACGCCATGGATCGATCGATCACACC\n+ATCTGCTCTATTACTTGCAAGCCACGACCTCAGCTCGGCACGAGAGCGTCAAATGTCAGCGCATCGTGCC\n+AGGCTAAGCTCAAACAGCAGCTGCGCCGGGTTGCTGCAGAAGGCGCTTATAGTGTTCGCCGCTGACCATC\n+AGTTTCCAAGCAATCCGCGCAATCTTATTGGCAAGGGCCACCGCTGCGAGTTTCGGTTTTTTGCGCTCCA\n+GCAATTCACGTAACCAAGATGAGGCATTCTTCCCATTGGTCCGCCGGGCATGCGACACGACTGCGGTCGC\n+GCCAACCACCAGCGTGCTTCGCAAGACCTCATCGCCAGCGCGTGTGATTCTGCCAAGCCTTGTTTTTCCA\n+CCGGTTGAGTGATCCCTGGGCGTCAATCCGATCCAGGCCGCAAAGGCTCGACCCGATTTGAACAGATGCG\n+GATCAGGCGTTTTCATCATCAGCAGCGCTGCGCCGATCGGGCCAACGCCCGGAATTTTCGCAAGACGCTG\n+ACTGCATTCGTTGGCGCGGTGCCATGCCATCACCTTGCCCTCAAGCTGTTCGATTTCACCTTGCAATTCA\n+GCATATTCCTTTGCGTGAAGGGCAAACAACTCGCGCGTCAATGTGGGCAGGCTTTCGTCCGCAGCGATCC\n+GATCAAGGAGTGCCTCAATCCGGCACATGCCTTTGGGCGCCGTGATCCCAAACTCGGCAGCATATCCCCG\n+GATCGTATTGGCGAGCTGTGTGCGGTTCCGGATAAGTCGTGCCCGCATTCCAATCAGCATCAACGCTGCC\n+TGCTCTTCCTCGCTCTTGAGCGGGACGAACCGCATTGTAGGCCGACTCATCGCTTCACAGAGGGCTTCCG\n+CGTCGGCGGCATCGTTTTTCCCGCGCTTGACATAAGGCTTCACGAGCTGCGGCGCGATCAGCTTCACTGT\n+GTGTCCCAGACACGAGAGCACCCGCCCCCAGTAATGGGAGGCGCCACAGGCCTCAATCGCGATTTCAATC\n+GGGGGCAGTTTCTCAAAAAACTTTACCATCTCCCGGCGGGATAGCTTCCTGCGCAAAACAGGCTGCTCCT\n+TCGCGTTTACACCGTGCAATTGGAAAACACTTTTTGACGTGTCCATGCCAATACGGATAATTTGTTCCAT\n+
GGGTGGCCTCCTCTGTGAGTTCTGCAACGACTTCACCTTGGCACATCGCGATGCCG'..b'TTGCTCCGAAGGCCTGCATGTGTCCACACCACGGCGATTGCCTGCAACAAGGCGAATGGAT\n+AGAAACAGTCAGGAGAAAGGTCGTCTGGTTCGATGTTGAGGCTGGCCCAGGTCAGTTTCAGCCAAGAAGC\n+ATCATCTATGGTCCTAGGATTTACGCCAAGATATTTCGCAATATTCTTCGGTGTCGAGAGATGATATCGC\n+GGATTACACCTGAGCCGCGCCCATTCCCAACTCTGAATATCAGATAAAAACCGGCGCATAACCTGATAAT\n+AAGCAACTTTCGTTGGAGCCTGCAGAGGCTGGCCAACTGTTGGGATCAGACGATAATCAAAGGTGGAACC\n+GCCCCAATCGCCAACACGTAACCTGTCGACGGCGGCGAGATAGTCGGCACATACCGATACCGTCCATTGC\n+TCTGGTCCAGTGACCTCAGGGTGCTCGCGACCCAACCAGATTCCGATACGGGTTAGAGTGTTGTAAACTG\n+CTCGCCTCGACCCTTCTCGCAACGTTGTTGTTTCCAACCAACGTCGGCACCATGTCATCCACTCAGGATT\n+GATATTATCAGTTTCAGTTCCGTGACGAGGTGGATATGCCCGAAAATGGATAACCTTTGATGTTAATCCC\n+ATTGCCGCCAAACCGTTTGACAATATTCCAATCCGCTTGGCGATATGATTTCCTGTGTAACACTCTCGTG\n+TATGTTCCAGTATCTTCCTATCAAAACTTTCAAGTTTTGGATTCTCGCCTTGTAATGCAACTGCAGCAAA\n+TACTGATGGTACGAGGGGCCGAACGGTCTGACAGACGAAGCCGACACGGGTTAGGGCCGAGAACAGACGC\n+TCACATTCTGTATCAACAATCTCCGCTCCAAAAACCAATCGGGCAACGACAGCCGTCGCCACATTGCGTT\n+GAATTTTGTACACGCTTCGAAAACCACACAGAAGATAAGCGACTGCAATCAACTGCGGTACCATTCCGGT\n+TTTTTTCAGAACAGGACTATTATTGATAATCTCAACCCACAGGTCTTCACTCCACCCCCAGTAGGGTTTT\n+TCCTGGTGTGCGACCGTCAGAAGCAAATACTTCAAAGCACGATGGCTGTTAGCGCGGTCGAGATTGCTGC\n+GATACAGGATGTCCGTCAGAGGCTTATGAAGACGCGGAGCCTGCCGTTTCGTAATAACAAGATTTTCTTC\n+TTTCCAACGTCTGACAACTTTTAATTCGTCTGCAGACAATGTCGCCCGTCTGTCGTATTCATCGAGATTA\n+ATGTGGAATTTGTATTCGGGCTGGATGTGAGAAACCTGAGATTCTACCACTTTTCTATCCTCCGAAGACC\n+CTGTGACCAAGCTTCATATCCATCTGTTCGACTGAGTTTGCGATCTTGCGAAGCAGATCTTCACCGGAAA\n+GATGGATATAGAGTGTCGTGCTTTGAACATTGCGATGCCCGGCATACGTCGCAATATCGTGTAGACGCCA\n+GCCAGCACGGGCCAGATGCGTCAATCTCAGGTGACGCAAAGTGTGCGTACTGAACAATGGCATATCAGCC\n+TGGAGAGCAAGACGTCTGACAGTTTTGCTCCATGACCACTTCGTAATAGGCTGCCGAAAGTTCCGATCTG\n+ACTCAGAGAGAAACAGGGCCGCTGAATGAGTCGCTGCGTTGCGCCTTTGATGCAGATATACCGCCAACAC\n+AGGACAGAGCGCCGCTGAATAACAAACCACACGAGGGCGAGCGCTTTTACTTGTTTCGGCCCGAATGGTG\n+AGCAAACGTCTCGCAGGGTCGATATCCGAGACGCGCAAATTTACTACGGCGTGTCGTCAGTTAAGCCCTG\n+AGAGTGGCACGTGAGGGTTGTACTTTGTGTCTGCGTGTGCTGACTGTTTT
CCCATTTTTTGGGGAGACAG\n+ACAGATGCGGCGCTATAGTTTACGCGATGACCAGTGGGAGCGGATAAAGGATCTTCTTCCTGGTCGAGAA\n+GGCTATGTCGGCGGCACTGCGGTGAACAACCGTCTGTTCGTGGAGGCGGTGCTGTATCGCTATCGCGCGG\n+GTATTCCATGGCGCGACCTTCCTGCCCGTTTCGGTGACTGGAAAAACGTGCACCGGCGTCTGCGCCGCTG\n+GTGTGAAAGCGGCGTCATCGAACGGATATTTCGTTATCTGGCCGCTGATTACGACAACGAATACATGATG\n+ATCGACAGCACAATTGTCCGAGCGCATCAGCATAGTGCCGGAGCTCTCAAAAAAGGGGCACGGATCAGGC\n+CATCGGACGATCACGGGCGGGCTAACTACAAAGATCCATGCCATCTGCGACGCTCTGGGCAATCCAGTGG\n+AACTCGGCATCACACCGGGACAGGATGCCGATATCACCCAGGCAGAACCACTTCTGGAAAACATCGAACC\n+GGATGCTTTCCTTGCTGACAAGGCGTATGACGCGGACAGGTTGATCGATCGGCTGATACAGCGCGGGATT\n+ACCCCGGTCATCCCGCCAAAACGCAACAGAACGACACGACGGGTAATCCCCCCATAAAAAGAGTGACTTT\n+TGAATGAGAATTTTCTCAGCGTAAGAATTGAGGAGATTCTGATGAAGAGTGATCGCTTTAGTGACGCCCA\n+GATCATGGGTGTGATCCGCCAGGCTGAGGGCGGTGTCCCGGTTCCTGACCTGTGCCGGGAGCATGGGATC\n+AGCAACGCCACGTTTTACCGGTGGCGCGCGAAATATGGCGGCATGGATGCTTCGATGATCAGTCAGATGA\n+AGGCTTTGGAAGAGGAGAACCGTCGGCTGAAGCGCATGTATGCGGATTTGAGCATGCAGACGGATATCCT\n+GAAGGAAGCCCTTGGAAAAAAATGAAGCGGCCAGCCCAGCGCCGGGAACTGGCCGCACAGGCTGTGGCGC\n+ATCATGGGGTCAGCATTGCGCTGGCCTGTCGGATTTTTGGGATATCCGAGACCTGCTTTCGCTATCGTCC\n+GCGACTGGCAGCGGAGAATGACAGGATTGCCGCTCTTCTGGTGGGACTGACCCAGGCTCACAGGAGATGG\n+GGATTTGGTCTGTGTTTCCTGTATCTGCGCAATGTGCAGGGACAGCTCTGGAATCATAAGCGGGTTTATC\n+GGATCTATCGGGAACTGGAGTTCAACCTGCGGATTAAACCCCGCAGGCGTCTGGTTCGCGAAAAGCCTGA\n+AAAGCTGTCGGTTCCGGCCCTTCCCAACACGGTCTGGTCCATGGATTTCATGGCGGACAGGCTTTTGGAT\n+GGACGCGCTTTTCGGCTCCTGAACATCCTGGATGAGTTCAATCGTGAAGGACTGGCGATCGAGGTTGATT\n+TTTCCCTGCCGGCCTGTCGGGTTGTCCGCTGGTAATCCCCCCATTTTTAGTGGGGCATTGAATGAGAATT\n+CAGGCAGCTGTTTTTAGTTTCTGGGCGGGGGTTAGCCCGCTGTTCCCCATGTTGGGTCTGTCATTGTTAT\n+ATGTCCAGAGCCATTGTGTTGCGACCTCCTGTACGTCCTGAATGCTTTCAAACAAATACTGCTCTAGCCA\n+TTCCTGCCGGACAGTTCTGTTGTAGCGTTCAATATAGGCGTTCTGCTGCGGATTGCCCGGTTGTGTATAG\n+ATCAGGGTAATCCCCTGCTTTTCGGCCCATGAAACCAACGTATGACTGACATATTCAGGGCCATTGTCCA\n+TTCGGATAGCCTCTGGCCTGCCACGCCACTCCATAACCTGTTCCAGACAGCGAACAACCCGACAGGCTGG\n+CAGGGAAAAATCAACCTCAATCGCCAGTCCTTCACGATTGAAATCATCCAGAATGTTCAGGAGCCGAAAA\n+GCACGT
CCATCCATCAGCCTGTCCGCCATAAAATCCATGGACCAGACCCTGTTGGGAAGGGCCGGAACCG\n+ACAGCTTTTCAGGCTTTTCGCGAACCAGACGCCTGCGGGGTTTAATC\n' |
b |
diff -r e159dbecdad6 -r c79025539d9b gbk2rdf/gbktordf.py --- a/gbk2rdf/gbktordf.py Sat Feb 21 07:56:16 2015 -0500 +++ b/gbk2rdf/gbktordf.py Sat Feb 21 15:23:15 2015 +0100 |
[ |
@@ -108,8 +108,12 @@ gi = record.annotations["gi"] typ = str(gi) except: - scaf_value += 1 - typ = "scaffold_"+str(scaf_value) + try: + gi = record.annotations["accessions"][0] + typ = str(gi) + except: + scaf_value += 1 + typ = "scaffold_"+str(scaf_value) genomeURI = coreURI[genome] gbkURI = coreURI[genome + "/" + typ] #To contig connection to connect all data to it @@ -148,8 +152,7 @@ int_add(gbkURI,coreURI[annot.lower()],str(a)) else: int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot])) - - + #####END of RECORD#### if len(sequence) > 0: genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence))) @@ -167,13 +170,6 @@ if strand == 'None': strand = 0 - -# if feature_type == "gene": -# gene = feature - #Store gene in next feature.... -# gene_location_start = end = str(gene.location.end).replace(">","").replace("<","") -# gene_location_stop = str(gene.location.start).replace(">","").replace("<","") -# gene_qualifiers = gene.qualifiers else: if feature.type == "misc_feature": #Store as part of previous cds or something... if strand == "-1": @@ -181,8 +177,6 @@ else: miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)] - # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI)) - # TODO: Check if biopython has an overlap function... 
if int(prevObjStart) <= int(start): if int(end) <= int(prevObjStop): @@ -201,15 +195,12 @@ prevObjStart = start prevObjStop = end - if strand == "-1": typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)] else: typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)] -# cds_sequence = str(feature.extract(sequence)) - #Contig specific connection - + #Contig specific connection genomeGraph.add((gbkURI, coreURI["feature"] , typeURI)) ############################ @@ -228,6 +219,7 @@ genomeGraph.add((typeURI, coreURI["feature"] , subURI)) store_general_information(subURI,subfeature,record,feature) + def store_general_information(generalURI,feature,record,superfeature=""): proteinClass = createClass(coreURI["Protein"], root=True) sequence = str(record.seq) @@ -277,8 +269,6 @@ #And subfeature variable will contain the superfeature if superfeature: codon = superfeature.qualifiers["transl_table"][0] -# else: -# codon = subfeature.qualifiers["transl_table"][0] except: #Default codon table 11 codon = "11" @@ -356,7 +346,6 @@ genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"])) genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) - genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class)) def main(): |