# HG changeset patch # User artbio # Date 1525042633 14400 # Node ID de227b7307cfe41882098de40fbb54c1c3cedea1 # Parent 2a08a6eb471cf4926a74dcb83f33540d23207d62 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit af0f70b8156c078cc0d832c54ebb678af10c42a0 diff -r 2a08a6eb471c -r de227b7307cf format_fasta_hairpins.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/format_fasta_hairpins.py Sun Apr 29 18:57:13 2018 -0400 @@ -0,0 +1,61 @@ +import argparse +import gzip + + +def Parser(): + the_parser = argparse.ArgumentParser() + the_parser.add_argument( + '--hairpins_path', action="store", type=str, + help="BASE url. ex: /pub/mirbase/22/") + the_parser.add_argument( + '--output', action="store", type=str, + help="parsed hairpin output in fasta format") + the_parser.add_argument( + '--basename', action="store", type=str, + help="genome basename of the parsed fasta") + args = the_parser.parse_args() + return args + + +def get_fasta_dic(gzipfile): + ''' + gzipfile value example : 'mirbase/22/hairpin.fa.gz' + ''' + item_dic = {} + with gzip.open(gzipfile, 'rb') as f: + current_item = '' + stringlist = [] + for line in f: + line = line.decode('utf-8').strip('\n') + if (line[0] == ">"): + # dump the sequence of the previous item + if current_item and stringlist: + item_dic[current_item] = "".join(stringlist) + # take first word of item ''' + current_item = line[1:].split()[0] + stringlist = [] + else: + stringlist.append(line) + item_dic[current_item] = "".join(stringlist) # for the last item + return item_dic + + +def convert_and_print_hairpins(gzipfile, basename, fasta_output): + raw_fasta_dict = get_fasta_dic(gzipfile) + parsed_fasta_dict = {} + trs = str.maketrans("uU", "tT") + for head in raw_fasta_dict: + if basename in head: + parsed_fasta_dict[head] = raw_fasta_dict[head].translate(trs) + with open(fasta_output, "w") as output: + for head in sorted(parsed_fasta_dict): + output.write('>%s\n%s\n' % (head, parsed_fasta_dict[head])) + + +def main(hairpins_path, basename, outfile): + convert_and_print_hairpins(hairpins_path, basename, outfile) + + +if __name__ == "__main__": + args = Parser() + main(args.hairpins_path, args.basename, args.output) diff -r 2a08a6eb471c -r de227b7307cf format_fasta_hairpins.sh --- a/format_fasta_hairpins.sh Wed Apr 25 12:48:27 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -GENOME_KEY=$1 - -gunzip hairpin.fa.gz -sed -i.bak '/^[^>]/ y/uU/tT/' hairpin.fa ## replace U by tT -sed -i.bak2 -E 's/ .+//' hairpin.fa ## just leaves mir name as one word header -awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < hairpin.fa > hairpin.fa.bak3 -tail -n +2 hairpin.fa.bak3 > hairpin.fa ## generate single line sequences -awk 'BEGIN{RS=">"}{gsub("\n","\t",$0); print ">"$0}' < hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.bak4 && tail -n +2 hairpin.fa.tmp > hairpin.fa -rm hairpin.fa.tmp ## tabular sequences -sed -i.bak5 -E $'s/\t$//g' hairpin.fa ## remove tab before end line leaved by previous awk -grep ">${GENOME_KEY}-" hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.fa.bak6 -mv hairpin.fa.tmp hairpin.fa ## filter tabular hairpins with proper genomeKey -tr '\t' '\n' < hairpin.fa > hairpin.fa.tmp -mv hairpin.fa hairpin.fa.bak7 -mv hairpin.fa.tmp hairpin.fa ## terminate parsing by regenerating fasta format, bowtie-build ready -rm ./*.bak* ## cleaning job directory - diff -r 2a08a6eb471c -r de227b7307cf mature_mir_gff_translation.py --- a/mature_mir_gff_translation.py Wed Apr 25 12:48:27 2018 -0400 +++ b/mature_mir_gff_translation.py Sun Apr 29 18:57:13 2018 -0400 @@ -5,10 +5,14 @@ def Parser(): the_parser = argparse.ArgumentParser() the_parser.add_argument( - '--input', action="store", type=str, help="input miRBase GFF3 file") + '--gff_path', action="store", type=str, + help="path to miRBase GFF3 file") the_parser.add_argument( '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates") + the_parser.add_argument( + '--basename', action="store", type=str, + help="basename of the parsed gff file returned") args = the_parser.parse_args() return args @@ -95,10 +99,10 @@ output.write('\n') -def main(infile, outfile): - convert_and_print_gff(infile, outfile) +def main(gff_path, outfile): + convert_and_print_gff(gff_path, outfile) if __name__ == "__main__": args = Parser() - main(args.input, args.output) + main(args.gff_path, args.output) diff -r 2a08a6eb471c -r de227b7307cf mirbase.tar.gz Binary file mirbase.tar.gz has changed diff -r 2a08a6eb471c -r de227b7307cf mircounts.xml --- a/mircounts.xml Wed Apr 25 12:48:27 2018 -0400 +++ b/mircounts.xml Sun Apr 29 18:57:13 2018 -0400 @@ -1,18 +1,24 @@ - + Counts miRNA alignments from small RNA sequence data - gnu-wget bowtie samtools pysam r-optparse r-lattice + + + /dev/null && - bowtie -v $v -M 1 --best --strata --norc -p \${GALAXY_SLOTS:-4} --sam hairpin -q clipped_input.fastq | samtools sort -O bam -o '$output' 2>&1 && + bowtie-build hairpin.fa hairpin && + bowtie -v $v -M 1 --best --strata --norc -p \${GALAXY_SLOTS:-4} --sam hairpin -q clipped_input.fastq | samtools sort -O bam -o '$output' && samtools index $output && - python '$__tool_directory__'/mircounts.py -pm --alignment $output --gff $gff3 --quality_threshold 10 --pre_mirs $pre_mir_count_file --mirs $mir_count_file --lattice $coverage_dataframe; + python '$__tool_directory__'/mircounts.py --alignment $output --gff $gff3 --quality_threshold 10 --pre_mirs $pre_mir_count_file --mirs $mir_count_file --lattice $coverage_dataframe #if $plotting.plottingOption == 'yes': - Rscript '$__tool_directory__'/coverage_plotting.R --dataframe $coverage_dataframe --type $plotting.display --output $latticePDF + && Rscript '$__tool_directory__'/coverage_plotting.R --dataframe $coverage_dataframe --type $plotting.display --output $latticePDF #end if ]]> @@ -141,8 +147,8 @@ - - + + @@ -159,8 +165,8 @@ - - + + @@ -178,11 +184,11 @@ - - + + - + @@ -195,11 +201,11 @@ - - + + - + diff -r 2a08a6eb471c -r de227b7307cf test-data/clipped.out.bam Binary file test-data/clipped.out.bam has changed diff -r 2a08a6eb471c -r de227b7307cf test-data/unclipped.out.22.bam Binary file test-data/unclipped.out.22.bam has changed diff -r 2a08a6eb471c -r de227b7307cf test-data/unclipped.out.bam Binary file test-data/unclipped.out.bam has changed