Mercurial > repos > artbio > mircounts
changeset 10:de227b7307cf draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af0f70b8156c078cc0d832c54ebb678af10c42a0
author | artbio |
---|---|
date | Sun, 29 Apr 2018 18:57:13 -0400 |
parents | 2a08a6eb471c |
children | 7d50d8d0c8c4 |
files | format_fasta_hairpins.py format_fasta_hairpins.sh mature_mir_gff_translation.py mirbase.tar.gz mircounts.xml test-data/clipped.out.bam test-data/unclipped.out.22.bam test-data/unclipped.out.bam |
diffstat | 8 files changed, 96 insertions(+), 44 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/format_fasta_hairpins.py Sun Apr 29 18:57:13 2018 -0400 @@ -0,0 +1,61 @@ +import argparse +import gzip + + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument( + '--hairpins_path', action="store", type=str, + help="BASE url. ex: /pub/mirbase/22/") + the_parser.add_argument( + '--output', action="store", type=str, + help="parsed hairpin output in fasta format") + the_parser.add_argument( + '--basename', action="store", type=str, + help="genome basename of the parsed fasta") + args = the_parser.parse_args() + return args + + +def get_fasta_dic(gzipfile): + ''' + gzipfile value example : 'mirbase/22/hairpin.fa.gz' + ''' + item_dic = {} + with gzip.open(gzipfile, 'rb') as f: + current_item = '' + stringlist = [] + for line in f: + line = line.decode('utf-8').strip('\n') + if (line[0] == ">"): + # dump the sequence of the previous item + if current_item and stringlist: + item_dic[current_item] = "".join(stringlist) + # take first word of item ''' + current_item = line[1:].split()[0] + stringlist = [] + else: + stringlist.append(line) + item_dic[current_item] = "".join(stringlist) # for the last item + return item_dic + + +def convert_and_print_hairpins(gzipfile, basename, fasta_output): + raw_fasta_dict = get_fasta_dic(gzipfile) + parsed_fasta_dict = {} + trs = str.maketrans("uU", "tT") + for head in raw_fasta_dict: + if basename in head: + parsed_fasta_dict[head] = raw_fasta_dict[head].translate(trs) + with open(fasta_output, "w") as output: + for head in sorted(parsed_fasta_dict): + output.write('>%s\n%s\n' % (head, parsed_fasta_dict[head])) + + +def main(hairpins_path, basename, outfile): + convert_and_print_hairpins(hairpins_path, basename, outfile) + + +if __name__ == "__main__": + args = Parser() + main(args.hairpins_path, args.basename, args.output)
--- a/format_fasta_hairpins.sh Wed Apr 25 12:48:27 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -GENOME_KEY=$1 - -gunzip hairpin.fa.gz -sed -i.bak '/^[^>]/ y/uU/tT/' hairpin.fa ## replace U by tT -sed -i.bak2 -E 's/ .+//' hairpin.fa ## just leaves mir name as one word header -awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < hairpin.fa > hairpin.fa.bak3 -tail -n +2 hairpin.fa.bak3 > hairpin.fa ## generate single line sequences -awk 'BEGIN{RS=">"}{gsub("\n","\t",$0); print ">"$0}' < hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.bak4 && tail -n +2 hairpin.fa.tmp > hairpin.fa -rm hairpin.fa.tmp ## tabular sequences -sed -i.bak5 -E $'s/\t$//g' hairpin.fa ## remove tab before end line leaved by previous awk -grep ">${GENOME_KEY}-" hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.fa.bak6 -mv hairpin.fa.tmp hairpin.fa ## filter tabular hairpins with proper genomeKey -tr '\t' '\n' < hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.fa.bak7 -mv hairpin.fa.tmp hairpin.fa ## terminate parsing by regenerating fasta format, bowtie-build ready -rm ./*.bak* ## cleaning job directory -
--- a/mature_mir_gff_translation.py Wed Apr 25 12:48:27 2018 -0400 +++ b/mature_mir_gff_translation.py Sun Apr 29 18:57:13 2018 -0400 @@ -5,10 +5,14 @@ def Parser(): the_parser = argparse.ArgumentParser() the_parser.add_argument( - '--input', action="store", type=str, help="input miRBase GFF3 file") + '--gff_path', action="store", type=str, + help="path to miRBase GFF3 file") the_parser.add_argument( '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") + the_parser.add_argument( + '--basename', action="store", type=str, + help="basename of the parsed gff file returned") args = the_parser.parse_args() return args @@ -95,10 +99,10 @@ output.write('\n') -def main(infile, outfile): - convert_and_print_gff(infile, outfile) +def main(gff_path, outfile): + convert_and_print_gff(gff_path, outfile) if __name__ == "__main__": args = Parser() - main(args.input, args.output) + main(args.gff_path, args.output)
--- a/mircounts.xml Wed Apr 25 12:48:27 2018 -0400 +++ b/mircounts.xml Sun Apr 29 18:57:13 2018 -0400 @@ -1,18 +1,24 @@ -<tool id="mircounts" name="miRcounts" version="1.2.6"> +<tool id="mircounts" name="miRcounts" version="1.3.0"> <description> Counts miRNA alignments from small RNA sequence data</description> <requirements> - <requirement type="package" version="1.18">gnu-wget</requirement> <requirement type="package" version="1.2.0">bowtie</requirement> <requirement type="package" version="1.6.0">samtools</requirement> <requirement type="package" version="0.11.2.2">pysam</requirement> <requirement type="package" version="1.3.2=r3.3.2_0">r-optparse</requirement> <requirement type="package" version="0.20_34=r3.3.2_0">r-lattice</requirement> </requirements> + <stdio> + <exit_code range="1:" level="warning" description="Tool exception" /> + </stdio> <command detect_errors="exit_code"><![CDATA[ - wget --quiet ftp://mirbase.org/pub/mirbase/${mirbase_version}/genomes/${genomeKey}.gff3 && ## download the gff3 file specified by the variable genomeKey - python '$__tool_directory__'/mature_mir_gff_translation.py --input ${genomeKey}.gff3 --output $gff3 && ## transcode the mature miR genome coordinates into coordinates relative to the corresponding "miRNA_primary_transcript". - wget --quiet ftp://mirbase.org/pub/mirbase/${mirbase_version}/hairpin.fa.gz && - sh '$__tool_directory__'/format_fasta_hairpins.sh $genomeKey && + tar -xvzf '$__tool_directory__'/mirbase.tar.gz && + python '$__tool_directory__'/mature_mir_gff_translation.py + --gff_path mirbase/${mirbase_version}/genomes/${genomeKey}.gff3 + --output $gff3 && ## transcode the mature miR genome coordinates into coordinates relative to the corresponding "miRNA_primary_transcript". + python '$__tool_directory__'/format_fasta_hairpins.py + --hairpins_path mirbase/${mirbase_version}/hairpin.fa.gz + --basename ${genomeKey} + --output hairpin.fa && #if $cutadapt.cutoption == "yes": python '$__tool_directory__'/yac.py --input $cutadapt.input --output clipped_input.fastq @@ -21,15 +27,15 @@ --min $cutadapt.min --max $cutadapt.max --Nmode $cutadapt.Nmode && - #else + #else: ln -f -s '$cutadapt.clipped_input' clipped_input.fastq && #end if - bowtie-build hairpin.fa hairpin >/dev/null && - bowtie -v $v -M 1 --best --strata --norc -p \${GALAXY_SLOTS:-4} --sam hairpin -q clipped_input.fastq | samtools sort -O bam -o '$output' 2>&1 && + bowtie-build hairpin.fa hairpin && + bowtie -v $v -M 1 --best --strata --norc -p \${GALAXY_SLOTS:-4} --sam hairpin -q clipped_input.fastq | samtools sort -O bam -o '$output' && samtools index $output && - python '$__tool_directory__'/mircounts.py -pm --alignment $output --gff $gff3 --quality_threshold 10 --pre_mirs $pre_mir_count_file --mirs $mir_count_file --lattice $coverage_dataframe; + python '$__tool_directory__'/mircounts.py --alignment $output --gff $gff3 --quality_threshold 10 --pre_mirs $pre_mir_count_file --mirs $mir_count_file --lattice $coverage_dataframe #if $plotting.plottingOption == 'yes': - Rscript '$__tool_directory__'/coverage_plotting.R --dataframe $coverage_dataframe --type $plotting.display --output $latticePDF + && Rscript '$__tool_directory__'/coverage_plotting.R --dataframe $coverage_dataframe --type $plotting.display --output $latticePDF #end if ]]></command> <inputs> @@ -141,8 +147,8 @@ <param name="plottingOption" value="no"/> <param name="output_premir_counts" value="True"/> <param name="output_mir_counts" value="True"/> - <output name="output" file="unclipped.out.22.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.22.gff3" ftype="gff3" lines_diff="22"/> + <output name="output" file="unclipped.out.22.bam"/> + <output name="gff3" file="translated_dme.22.gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_unclipped_count.22.tab"/> <output name="mir_count_file" file="mirs_unclipped_count.22.tab"/> </test> @@ -159,8 +165,8 @@ <param name="plottingOption" value="no"/> <param name="output_premir_counts" value="True"/> <param name="output_mir_counts" value="True"/> - <output name="output" file="unclipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> + <output name="output" file="unclipped.out.bam" /> + <output name="gff3" file="translated_dme.gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_unclipped_count.tab"/> <output name="mir_count_file" file="mirs_unclipped_count.tab"/> </test> @@ -178,11 +184,11 @@ <param name="display" value="relative"/> <param name="output_premir_counts" value="True"/> <param name="output_mir_counts" value="True"/> - <output name="output" file="unclipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> + <output name="output" file="unclipped.out.bam"/> + <output name="gff3" file="translated_dme.gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_unclipped_count.tab"/> <output name="mir_count_file" file="mirs_unclipped_count.tab"/> - <output name="latticePDF" file="mir_unclipped_coverage.pdf" ftype="pdf"/> + <output name="latticePDF" file="mir_unclipped_coverage.pdf"/> <output name="coverage_dataframe" file="lattice_unclipped_dataframe.tab"/> </test> <test> @@ -195,11 +201,11 @@ <param name="display" value="absolute"/> <param name="output_premir_counts" value="True"/> <param name="output_mir_counts" value="True"/> - <output name="output" file="clipped.out.bam" ftype="bam"/> - <output name="gff3" file="translated_dme.gff3" ftype="gff3" lines_diff="22"/> + <output name="output" file="clipped.out.bam"/> + <output name="gff3" file="translated_dme.gff3" lines_diff="22"/> <output name="pre_mir_count_file" file="pre_mirs_clipped_count.tab"/> <output name="mir_count_file" file="mirs_clipped_count.tab"/> - <output name="latticePDF" file="mir_clipped_coverage.pdf" ftype="pdf"/> + <output name="latticePDF" file="mir_clipped_coverage.pdf"/> <output name="coverage_dataframe" file="lattice_clipped_dataframe.tab"/> </test> </tests>