Mercurial > repos > artbio > mircounts
changeset 5:9ea96a02c416 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit 04980585c257ab5f8eb5d10de007316c47c5d1ce
author | artbio |
---|---|
date | Tue, 05 Sep 2017 06:33:16 -0400 |
parents | da1aa7de2b19 |
children | 543472c9e272 |
files | mature_mir_gff_translation.py mircounts.xml test-data/translated_dme.gff3 |
diffstat | 3 files changed, 33 insertions(+), 70 deletions(-) [+] |
line wrap: on
line diff
--- a/mature_mir_gff_translation.py Mon Sep 04 17:55:01 2017 -0400 +++ b/mature_mir_gff_translation.py Tue Sep 05 06:33:16 2017 -0400 @@ -1,6 +1,6 @@ -#!/usr/bin/env python +import argparse -import argparse +from datetime import datetime def Parser(): @@ -14,19 +14,14 @@ return args -GFF3_header = '''##gff-version 3 -##generated by mature_mir_gff_translation.py -# -# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** -# microRNAs: miRBase current_version -# genome-build-id: check http://mirbase.org/ -# -# Hairpin precursor sequences have type "miRNA_primary_transcript". -# Note, these sequences do not represent the full primary transcript, -# rather a predicted stem-loop portion that includes the precursor -# miRNA. Mature sequences have type "miRNA". -# -''' +def get_gff_header(gff_input_file): + string_list = [] + for line in open(gff_input_file, "r"): + if line[0] == '#': + string_list.append(line) + string_list.append('# generated by mature_mir_gff_translation.py %s\n#\n' % + str(datetime.now())) + return ''.join(string_list) def load_gff_in_dict(gff_input_file): @@ -51,9 +46,9 @@ gff_dict[ID]["strand"] = gff_fields[6] gff_dict[ID]["phase"] = gff_fields[7] gff_dict[ID]["attributes"] = gff_fields[8] - if "Derives_from" in gff_dict[ID]["attributes"]: + if "erives_from" in gff_dict[ID]["attributes"]: parent_primary_transcript = gff_dict[ID]["attributes"].split( - "Derives_from=")[1] + "erives_from=")[1] parent_primary_transcript = gff_dict[parent_primary_transcript][ "attributes"].split("Name=")[1] gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % ( @@ -61,7 +56,7 @@ return gff_dict -def genome_to_mir_gff(gff_dict, output): +def genome_to_mir_gff(gff_dict, output, header): ''' Converts seqid field from chromosome to item Name Then converts coordinates relative to "miRNA_primary_transcript" @@ -70,9 +65,9 @@ for key in gff_dict: name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0] gff_dict[key]["seqid"] = name - if "Derives_from=" in gff_dict[key]["attributes"]: + if "erives_from=" in gff_dict[key]["attributes"]: parent_ID = gff_dict[key]["attributes"].split( - "Derives_from=")[1].split(";")[0] + "erives_from=")[1].split(";")[0] gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int( gff_dict[parent_ID]["start"])+1) gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int( @@ -95,7 +90,7 @@ "Name=")[1].split( ";")[0] with open(output, "w") as output: - output.write(GFF3_header) + output.write(header) for ID in sorted(hairpins, key=hairpins.get): output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"], gff_dict[ID]["type"], @@ -117,7 +112,7 @@ def main(infile, outfile): gff_dict = load_gff_in_dict(infile) - genome_to_mir_gff(gff_dict, outfile) + genome_to_mir_gff(gff_dict, outfile, get_gff_header(infile)) if __name__ == "__main__":
--- a/mircounts.xml Mon Sep 04 17:55:01 2017 -0400 +++ b/mircounts.xml Tue Sep 05 06:33:16 2017 -0400 @@ -1,4 +1,4 @@ -<tool id="mircounts" name="miRcounts" version="0.9.4"> +<tool id="mircounts" name="miRcounts" version="1.0.0"> <description> Counts miRNA alignments from small RNA sequence data</description> <requirements> <requirement type="package" version="1.18">gnu-wget</requirement> @@ -9,9 +9,9 @@ <requirement type="package" version="0.20_34=r3.3.2_0">r-lattice</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ - wget ftp://mirbase.org/pub/mirbase/CURRENT/genomes/${genomeKey}.gff3 && ## download gff3 specified by the variable genomeKey + wget ftp://mirbase.org/pub/mirbase/${mirbase_version}/genomes/${genomeKey}.gff3 && ## download the gff3 file specified by the variable genomeKey python '$__tool_directory__'/mature_mir_gff_translation.py --input ${genomeKey}.gff3 --output $gff3 && ## transcode the mature miR genome coordinates into coordinates relative to the corresponding "miRNA_primary_transcript". - wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz && + wget ftp://mirbase.org/pub/mirbase/${mirbase_version}/hairpin.fa.gz && sh '$__tool_directory__'/format_fasta_hairpins.sh $genomeKey && #if $cutadapt.cutoption == "yes": python '$__tool_directory__'/yac.py --input $cutadapt.input @@ -74,48 +74,11 @@ <column name="value" index="0"/> </options> </param> - <!-- work on proper selection of the url (that varies between gff, gff2 and gff3) <param name="mirbase_version" type="select" label="Choose miRbase version"> <option selected="true" value="CURRENT">Current</option> - <option value="1.0">1.0</option> - <option value="1.1">1.1</option> - <option value="1.2">1.2</option> - <option value="1.3">1.3</option> - <option value="1.4">1.4</option> - <option value="1.5">1.5</option> - <option value="2.0">2.0</option> - <option value="2.1">2.0</option> - <option value="2.2">2.2</option> - <option value="3.0">3.0</option> - <option value="3.1">3.1</option> - <option value="4.0">4.0</option> - <option value="5.0">5.0</option> - <option value="5.1">5.1</option> - <option value="6.0">6.0</option> - <option value="7.0">7.0</option> - <option value="7.1">7.1</option> - <option value="8.0">8.0</option> - <option value="8.1">8.1</option> - <option value="8.2">8.2</option> - <option value="9.0">9.0</option> - <option value="9.1">9.1</option> - <option value="9.2">9.2</option> - <option value="10.0">10.0</option> - <option value="10.1">10.1</option> - <option value="11.0">11.0</option> - <option value="12.0">12.0</option> - <option value="13.0">13.0</option> - <option value="14">14</option> - <option value="15">15</option> - <option value="16">16</option> - <option value="17">17</option> - <option value="18">18</option> + <option value="20">20</option> <option value="19">19</option> - <option value="20">20</option> - <option value="21">21</option> - </options> </param> - --> <param help="command [ bowtie -v 0,1,2,3 -M 1 --best --strata --norc ] will be used. Specify a value for -v (number of mismatches allowed)" label="Number of mismatches allowed" name="v" type="select"> <option value="0">0</option> <option selected="true" value="1">1</option> @@ -166,10 +129,11 @@ <param name="clip_sequence" value="TCGTATGCCGTCTTCTGCTTG"/> <param name="v" value="0"/> <param name="genomeKey" value="dme"/> + <param name="mirbase_version" value="CURRENT"/> <param name="input" value="input.unclipped.fastqsanger" ftype="fastqsanger"/> <param name="plottingOption" value="no"/> <output name="output" file="unclipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3"/> + <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_unclipped_count.tab"/> <output name="mir_count_file" file="mirs_unclipped_count.tab"/> </test> @@ -181,11 +145,12 @@ <param name="clip_sequence" value="TCGTATGCCGTCTTCTGCTTG"/> <param name="v" value="0"/> <param name="genomeKey" value="dme"/> + <param name="mirbase_version" value="CURRENT"/> <param name="input" value="input.unclipped.fastqsanger" ftype="fastqsanger"/> <param name="plottingOption" value="yes"/> <param name="display" value="relative"/> <output name="output" file="unclipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3"/> + <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_unclipped_count.tab"/> <output name="mir_count_file" file="mirs_unclipped_count.tab"/> <output name="latticePDF" file="mir_unclipped_coverage.pdf" ftype="pdf"/> @@ -195,11 +160,12 @@ <param name="cutoption" value="no" /> <param name="v" value="1"/> <param name="genomeKey" value="dme"/> + <param name="mirbase_version" value="CURRENT"/> <param name="clipped_input" value="input.clipped.fastqsanger" ftype="fastqsanger"/> <param name="plottingOption" value="yes"/> <param name="display" value="absolute"/> <output name="output" file="clipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3"/> + <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_clipped_count.tab"/> <output name="mir_count_file" file="mirs_clipped_count.tab"/> <output name="latticePDF" file="mir_clipped_coverage.pdf" ftype="pdf"/>
--- a/test-data/translated_dme.gff3 Mon Sep 04 17:55:01 2017 -0400 +++ b/test-data/translated_dme.gff3 Tue Sep 05 06:33:16 2017 -0400 @@ -1,15 +1,17 @@ ##gff-version 3 -##generated by mature_mir_gff_translation.py +##date 2014-6-22 # -# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors ** -# microRNAs: miRBase current_version -# genome-build-id: check http://mirbase.org/ +# Chromosomal coordinates of Drosophila melanogaster microRNAs +# microRNAs: miRBase v21 +# genome-build-id: BDGP5.0 # # Hairpin precursor sequences have type "miRNA_primary_transcript". # Note, these sequences do not represent the full primary transcript, # rather a predicted stem-loop portion that includes the precursor # miRNA. Mature sequences have type "miRNA". # +# generated by mature_mir_gff_translation.py 2017-09-05 11:53:29.042787 +# dme-bantam . miRNA_primary_transcript 1 81 . + . ID=MI0000387;Alias=MI0000387;Name=dme-bantam dme-bantam-5p . miRNA 15 37 . + . ID=MIMAT0020823;Alias=MIMAT0020823;Name=dme-bantam-5p;Derives_from=MI0000387;Parent_mir_Name=dme-bantam dme-bantam-3p . miRNA 52 74 . + . ID=MIMAT0000365;Alias=MIMAT0000365;Name=dme-bantam-3p;Derives_from=MI0000387;Parent_mir_Name=dme-bantam