Previous changeset 46:ca05d68aca13 (2014-11-13) Next changeset 48:28ad3b598670 (2014-12-03) |
Commit message:
Uploaded |
modified:
matching.pl rfam.pl |
added:
html_preprocess.pl preProcess.pl preProcess.xml |
removed:
convert_bowtie_to_blast.pl html.pl miRDeep_plant.pl miRNA_Express_and_sequence.pl miRPlant.pl miRPlant.xml precursors.pl quantify.pl tool_dependencies.xml |
b |
diff -r ca05d68aca13 -r c75593f79aa9 convert_bowtie_to_blast.pl --- a/convert_bowtie_to_blast.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,126 +0,0 @@ -#!/usr/bin/perl - - -use warnings; -use strict; -use Getopt::Std; - -######################################### USAGE ################################ - -my $usage= -"$0 file_bowtie_result file_solexa_seq file_chromosome - -This is a converter which changes Bowtie output into Blast format. -The input includes three files: a Bowtie result file (default Bowtie -output file), a fasta file consisting of small Reads and a chromosome -fasta file. It outputs the alignments in blast_parsed format. - -file_bowtie_result likes: - -AtFlower100010_x2 + MIR319c 508 AAGGAGATTCTTTCAGTCCAG IIIIIIIIIIIIIIIIIIIII 0 -AtFlower1000188_x1 + MIR2933a 421 TCGGAGAGGAAATTCGTCGGCG IIIIIIIIIIIIIIIIIIIIII 0 - -file_solexa_seq likes: - ->AtFlower100010_x2 -AAGGAGATTCTTTCAGTCCAG - -file_chromosome contains chromosome seq in fasta format - -"; - - -####################################### INPUT FILES ############################ - -my $file_bowtie_result=shift or die $usage; -my $file_short_seq=shift or die $usage; -my $file_chromosome_seq=shift or die $usage; - - -##################################### GLOBAL VARIBALES ######################### - -my %short_seq_length=(); -my %chromosome_length=(); - - -######################################### MAIN ################################# - -#get the short sequence id and its length -sequence_length($file_short_seq,\%short_seq_length); - -#get the chromosome sequence id and its length -sequence_length($file_chromosome_seq,\%chromosome_length); - -#convert bowtie result format to blast format; -change_format($file_bowtie_result); - -exit; - - -##################################### SUBROUTINES ############################## - -sub sequence_length{ - my ($file,$hash) = @_; - my ($id, $desc, $sequence, $seq_length) = (); - - open (FASTA, "<$file") or die "can not open $$file\n"; - while (<FASTA>) - { - chomp; - if (/^>(\S+)(.*)/) - { - $id = $1; - $desc = $2; - $sequence = ""; - while (<FASTA>){ - chomp; - if (/^>(\S+)(.*)/){ - $$hash{$id} = length $sequence; - $id = $1; - $desc = $2; - $sequence = ""; - next; - } - $sequence .= $_; - } - } - } - $seq_length=length($sequence); - $$hash{$id} = $seq_length; - close FASTA; -} - - - - - -sub change_format{ - #Change Bowtie format into blast format - my $file=shift @_; - open(FILE,"<$file")||die"can not open the bowtie result file:$!\n"; - #open(BLASTOUT,">blastout")||die"can not create the blastout file:$!\n"; - - while(<FILE>){ - chomp; - my @tmp=split("\t",$_); - #Clean the reads ID - my @tmp1=split(" ",$tmp[0]); - print "$tmp1[0]"."\t"."$short_seq_length{$tmp1[0]}"."\t"."1".'..'."$short_seq_length{$tmp1[0]}"."\t"."$tmp[2]"."\t"."$chromosome_length{$tmp[2]}"."\t"; - if($tmp[1] eq "+"){ - my $seq_end=$tmp[3] + $short_seq_length{$tmp1[0]}; - my $seq_bg=$tmp[3] + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Plus"."\n"; - } - if($tmp[1] eq "-"){ - my $seq_end=$chromosome_length{$tmp[2]} - $tmp[3]; - my $seq_bg=$seq_end - $short_seq_length{$tmp1[0]} + 1; - print "$seq_bg".'..'."$seq_end"."\t"."1e-04"."\t"."1.00"."\t"."42.1"."\t"."Plus / Minus"."\n"; - } - } - -# close BLASTOUT; - -} - - - |
b |
diff -r ca05d68aca13 -r c75593f79aa9 html.pl --- a/html.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,273 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-5-29 -#Modified: -#Description: -my $version=1.00; - -use strict; -use Getopt::Long; -use File::Basename; - -my %opts; -GetOptions(\%opts,"i=s","format=s","o=s","h"); -if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments -&usage; -} -my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath); -my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir); -open IN,"<$opts{i}"; -$config=<IN>; chomp $config; -$prepath=<IN>; chomp $prepath; -$rfampath=<IN>;chomp $rfampath; -$knownpath=<IN>; chomp $knownpath; -$genomepath=<IN>; chomp $genomepath; -$novelpath=<IN>; chomp $novelpath; -close IN; -my @tmp=split/\//,$prepath; -$predir=$tmp[-1]; -@tmp=split/\//,$rfampath; -$rfamdir=$tmp[-1]; -@tmp=split/\//,$knownpath; -$knowndir=$tmp[-1]; -@tmp=split/\//,$genomepath; -$genomedir=$tmp[-1]; -@tmp=split/\//,$novelpath; -$noveldir=$tmp[-1]; - -my $dir=dirname($opts{'o'}); - -open OUT ,">$opts{'o'}"; -print OUT "<HTML>\n <HEAD>\n <TITLE> Analysis Report </TITLE>\n </HEAD> - <BODY bgcolor=\"lightgray\">\n <h1 align=\"center\">\n <font face=\"����\">\n <b>Small RNA Analysis Report</b>\n </font>\n </h1> - <h2>1. Sequence No. and quality</h2> - <h3>1.1 Sequece No.</h3> -"; - -### raw data no -open IN,"<$config"; -my @files;my @marks; my @rawNo; -while (my $aline=<IN>) { - chomp $aline; - my @tmp=split/\t/,$aline; - push @files,$tmp[0]; - - my $no=`less $tmp[0] |wc -l `; - chomp $no; - if ($opts{'format'} eq "fq" || $opts{'format'} eq "fastq") { - $no=$no/4; - } - else{ - $no=$no/2; - } - push @rawNo,$no; - - push @marks,$tmp[1]; -} -close IN; - -### preprocess -unless ($prepath=~/\/$/) { - $prepath .="/"; -} - -my @trimNo;my @collapse; -my $collapsefile=$prepath."collapse_reads.fa"; -open IN,"<$collapsefile"; -while (my $aline=<IN>) { - chomp $aline; - <IN>; - $aline=~/:([\d|_]+)_x(\d+)$/; - my @lng=split/_/,$1; - for (my $i=0;$i<@lng;$i++) { - if ($lng[$i]>0) { - $trimNo[$i] +=$lng[$i]; - $collapse[$i] ++; - } - } -} -close IN; - -my @cleanR;my @cleanT; -my $clean=$prepath."collapse_reads_19_28.fa"; -open IN,"<$clean"; -while (my $aline=<IN>) { - chomp $aline; - <IN>; - $aline=~/:([\d|_]+)_x(\d+)$/; - my @lng=split/_/,$1; - for (my $i=0;$i<@lng;$i++) { - if ($lng[$i]>0) { - $cleanR[$i] +=$lng[$i]; - $cleanT[$i] ++; - } - } -} -close IN; - -print OUT "<table border=\"1\"> -<tr align=\"center\"> -<th> </th> -"; -foreach (@marks) { - print OUT "<th> $_ </th>\n"; -} -print OUT "</tr> -<tr align=\"center\"> -<th align=\"left\">Raw Reads No. </th> -"; -foreach (@rawNo) { - print OUT "<td> $_ </td>\n"; -} -print OUT "</tr> -<tr align=\"center\"> -<th align=\"left\">Reads No. After Trimed 3\' adapter </th> -"; -foreach (@trimNo) { - print OUT "<td> $_ </td>\n"; -} -print OUT "</tr> -<tr align=\"center\"> -<th align=\"left\">Unique Tags No. </th> -"; -foreach (@collapse) { - print OUT "<td> $_ </td>\n"; -} -print OUT "</tr> -<tr align=\"center\"> -<th align=\"left\">Clean Reads No. </th> -"; -foreach (@cleanR) { - print OUT "<td> $_ </td>\n"; -} -print OUT "</tr> -<tr align=\"center\"> -<th align=\"left\">Clean Tags No. </th> -"; -foreach (@cleanT) { - print OUT "<td> $_ </td>\n"; -} -print OUT "</tr>\n</table>"; -print OUT "<p> -Note:<br /> -The raw data file path is: <b>$files[0]</b><br /> -"; -for (my $i=1;$i<@files;$i++) { - print OUT "          <b>$files[$i]</b><br />"; -} -print OUT "The collapsed file path is: <b>$collapsefile</b><br /> -The clean data file path is: <b>$clean</b><br /> -</p> -<h2> 1. Sequence length count</h2> -"; -print OUT "\n"; - -my $length=$prepath."length.html"; -open IN,"<$length"; -while (my $aline=<IN>) { - chomp $aline; - print OUT "$aline\n"; -} - -print OUT "<p> Note:<br />The sequence length data: <a href=\"./$predir/reads_length_distribution.txt\"> length file</a> -</p> -"; - -#### rfam -unless ($rfampath=~/\/$/) { - $rfampath .="/"; -} -print OUT "<h2>2. Rfam non-miRNA annotation</h2> -<h3>2.1 Reads count</h3> -<table border=\"1\"> -<tr align=\"center\"> -"; - -my @rfamR; my @rfamT; -my $tag=1; -open IN,"<$dir/rfam_non-miRNA_annotation.txt"; -while (my $aline=<IN>) { - chomp $aline; - $tag=0 if($aline=~/tags\s+number/); - next if($aline=~/^\#/); - next if($aline=~/^\s*$/); - my @tmp=split/\s+/,$aline; - if($tag == 1){push @rfamR,[@tmp];} - else{push @rfamT,[@tmp];} -} -close IN; - - -print OUT "<th>RNA Name</th>\n"; -foreach (@marks) { - print OUT "<th> $_ </th>\n"; -} -for (my $i=0;$i<@rfamR;$i++) { - print OUT "</tr> - <tr align=\"center\"> - <th align=\"left\">$rfamR[$i][0]</th> - "; - for (my $j=1;$j<@{$rfamR[$i]} ;$j++) { - print OUT "<td> $rfamR[$i][$j]</td>\n"; - } -} - -print OUT "</tr>\n</table> - <h3>2.2 Tags count</h3> - <table border=\"1\"> - <tr align=\"center\"> - <th>RNA Name</th>\n"; -foreach (@marks) { - print OUT "<th> $_ </th>\n"; -} -for (my $i=0;$i<@rfamT;$i++) { - print OUT "</tr> - <tr align=\"center\"> - <th align=\"left\">$rfamT[$i][0]</th> - "; - for (my $j=1;$j<@{$rfamT[$i]} ;$j++) { - print OUT "<td> $rfamT[$i][$j]</td>\n"; - } -} -print OUT "</tr>\n</table> -<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>"; -print OUT "<b>rfam_mapped.bwt</b></p> - <h2>3. MicroRNA result</h2> - <h3>3.1 known microRNA</h3> - <p>The known microRNA express list: <a href=\"./known_microRNA_express.txt\"> known_microRNA_express.txt</a><br/> - The known microRNA alngment file: <a href=\"./known_microRNA_express.aln\"> known_microRNA_express.aln</a><br/> - The known moRs file: <a href=\"./known_microRNA_express.moRs\"> known_microRNA_express.moRs</a><br/> - The known microRNA mature sequence file: <a href=\"./known_microRNA_mature.fa\"> known_microRNA_mature.fa</a><br/> - The knowm microRNA precursor sequence file: <a href=\"./known_microRNA_precursor.fa\"> known_microRNA_precursor.fa</a> - </p> - - <h3>3.2 novel microRNA</h3> - <p>The novel microRNA prediction file:<a href=\"./microRNA_prediction.mrd\"> microRNA_prediction.mrd</a><br/> - The novel microRNA express list: <a href=\"./novel_microRNA_express.txt\"> novel_microRNA_express.txt</a><br/> - The novel microRNA mature sequence file: <a href=\"./novel_microRNA_mature.fa\"> novel_microRNA_mature.fa</a><br/> - The novel microRNA precursor sequence file: <a href=\"./novel_microRNA_precursor.fa\"> novel_microRNA_precursor.fa</a> - </p> -"; - - - -print OUT " - </BODY> -</HTML> -"; -close OUT; - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -o -options: --o output file --h help -USAGE -exit(1); -} - |
b |
diff -r ca05d68aca13 -r c75593f79aa9 html_preprocess.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/html_preprocess.pl Wed Dec 03 01:54:29 2014 -0500 |
[ |
b'@@ -0,0 +1,388 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-5-29\n+#Modified:\n+#Description: \n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use File::Basename;\n+\n+my %opts;\n+GetOptions(\\%opts,"i=s","format=s","min=i","max=i","o=s","h");\n+if (!(defined $opts{o} and defined $opts{format} and defined $opts{i} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+my ($config,$prepath,$rfampath,$knownpath,$genomepath,$novelpath);\n+my ($predir,$rfamdir,$knowndir,$genomedir,$noveldir);\n+open IN,"<$opts{i}";\n+$config=<IN>; chomp $config;\n+$prepath=<IN>; chomp $prepath;\n+$genomepath=<IN>; chomp $genomepath;\n+$rfampath=<IN>;\n+close IN;\n+my @tmp=split/\\//,$prepath;\n+$predir=$tmp[-1];\n+@tmp=split/\\//,$genomepath;\n+$genomedir=$tmp[-1];\n+\n+my $dir=dirname($opts{\'o\'});\n+\n+open OUT ,">$opts{\'o\'}";\n+print OUT "<HTML>\\n <HEAD>\\n <TITLE> Analysis Report </TITLE>\\n </HEAD>\n+ <BODY bgcolor=\\"lightgray\\">\\n <h1 align=\\"center\\">\\n <font face=\\"\xba\xda\xcc\xe5\\">\\n\t<b>Preprocess Report</b>\\n </font>\\n </h1>\n+ <h2>1. Sequence No. and quality</h2>\n+ <h3>1.1 Sequece No.</h3>\n+";\n+\n+### raw data no\n+open IN,"<$config";\n+my @files;my @marks; my @rawNo;\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tmy @tmp=split/\\t/,$aline;\n+\tpush @files,$tmp[0];\n+\t\n+\tmy $no=`less $tmp[0] |wc -l `;\n+\tchomp $no;\n+\tif ($opts{\'format\'} eq "fq" || $opts{\'format\'} eq "fastq") {\n+\t\t$no=$no/4;\n+\t}\n+\telse{\n+\t\t$no=$no/2;\n+\t}\n+\tpush @rawNo,$no;\n+\n+\tpush @marks,$tmp[1];\n+}\n+close IN;\n+\n+### preprocess \n+unless ($prepath=~/\\/$/) {\n+\t$prepath .="/";\n+}\n+\n+my @trimNo;my @collapse;\n+my $collapsefile=$prepath."collapse_reads.fa";\n+open IN,"<$collapsefile";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$trimNo[$i] +=$lng[$i];\n+\t\t\t$collapse[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+my @cleanR;my @cleanT;\n+my $clean=$prepath."collapse_reads_$opts{min}_$opts{max}.fa";\n+open IN,"<$clean";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @lng=split/_/,$1;\n+\tfor (my $i=0;$i<@lng;$i++) {\n+\t\tif ($lng[$i]>0) {\n+\t\t\t$cleanR[$i] +=$lng[$i];\n+\t\t\t$cleanT[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th> </th>\n+";\n+foreach (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Raw Reads No. </th>\n+";\n+foreach (@rawNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Reads No. After Trimed 3\\\' adapter </th>\n+";\n+foreach (@trimNo) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Unique Tags No. </th>\n+";\n+foreach (@collapse) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Reads No. </th>\n+";\n+foreach (@cleanR) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Clean Tags No. </th>\n+";\n+foreach (@cleanT) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\\n</table>";\n+print OUT "<p>\n+Note:<br />\n+The raw data file path is: <b>$files[0]</b><br />\n+";\n+for (my $i=1;$i<@files;$i++) {\n+\tprint OUT "          <b>$files[$i]</b><br />";\n+}\n+print OUT "The collapsed file path is: <b>$collapsefile</b><br />\n+The clean data file path is: <b>$clean</b><br />\n+</p>\n+<h2> 1. Sequence length count</h2>\n+<h3> 1.1 Reads length count </h3>\n+";\n+print OUT "\\n";\n+\n+my (%length); my $key="Tags Length";\n+open IN,"<$prepath/reads_length_distribution.txt";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tnext if($aline=~/^\\s*$/);\n+\tif ($aline=~/^Reads/) { $key="Reads Length";}\n+\tmy @tmp=split/\\t/,$aline;\n+\tmy @array=split/\\s/,$tmp[1];\n+\tpush @{$length{$key}},[$tmp[0],@array];\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">";\n+my $hashkey="Reads Length'..b'+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">";\n+$hashkey="Tags Length";\n+foreach (@{$length{$hashkey}[0]}) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print OUT "</tr>";\n+\n+for (my $i=1;$i<@{$length{$hashkey}};$i++) {\n+\tprint OUT "<tr align=\\"center\\">\n+\t<th > $length{$hashkey}[$i][0] </th>\n+\t";\n+\tfor(my $j=1;$j<@{$length{$hashkey}[$i]};$j++) {\n+\t\tprint OUT "<td> $length{$hashkey}[$i][$j] </td>\\n";\n+\t}\n+\tprint OUT "</tr>\\n";\n+}\n+\n+print OUT "</table>\\n";\n+\n+print OUT "<h2> 2. Sequence length distribution </h2>";\n+my $length=$prepath."length.html";\n+open IN,"<$length";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\tprint OUT "$aline\\n";\n+}\n+\n+#print OUT "<p> Note:<br />The sequence length data: <a href=\\"./$predir/reads_length_distribution.txt\\"> length file</a>\n+#</p>\n+#";\n+\n+\n+\n+\n+####genome map\n+#unless ($genomedir=~/\\/$/) {\n+#\t$genomedir .="/";\n+#}\n+\n+print OUT "<h2>2. Genome Alignment Result</h2>\n+<h3>2.1 Mapping count</h3>\n+";\n+\n+open IN,"<$genomepath/genome_mapped.fa";\n+my (@gread,@gtag);\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t<IN>;\n+\t$aline=~/:([\\d|_]+)_x(\\d+)$/;\n+\tmy @sss=split/_/,$1;\n+\tfor (my $i=0;$i<@sss;$i++) {\n+\t\tif ($sss[$i]>0) {\n+\t\t\t$gread[$i] +=$sss[$i];\n+\t\t\t$gtag[$i] ++;\n+\t\t}\n+\t}\n+}\n+close IN;\n+\n+print OUT "<table border=\\"1\\">\n+<tr align=\\"center\\">\n+<th> </th>\n+";\n+foreach (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Reads No. </th>\n+";\n+foreach (@gread) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Reads Percent </th>\n+";\n+\n+for (my $i=0;$i<@gread;$i++) {\n+\tmy $per=sprintf ("%.2f",$gread[$i]/$cleanR[$i]*100);\n+\tprint OUT "<td> $per\\%</td>\\n";\n+}\n+\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Tags No. </th>\n+";\n+foreach (@gtag) {\n+\tprint OUT "<td> $_ </td>\\n";\n+}\n+print OUT "</tr>\n+<tr align=\\"center\\">\n+<th align=\\"left\\">Genome Mapped Tags Percent </th>\n+";\n+\n+for (my $i=0;$i<@gtag;$i++) {\n+\tmy $per=sprintf ("%.2f",$gtag[$i]/$cleanT[$i]*100);\n+\tprint OUT "<td> $per\\%</td>\\n";\n+}\n+print OUT "</tr>\\n</table>";\n+print OUT "<p>\n+Note:<br />\n+The genome mapped bwt file path is: <b>$genomedir/genome_mapped.bwt</b><br />\n+The genome mapped FASTA file path is: <b>$genomedir/genome_mapped.fa</b>\n+<br />\n+";\n+\n+\n+\n+#### rfam\n+if(defined $rfampath && $rfampath=~/rfam_match/){\n+chomp $rfampath;\n+@tmp=split/\\//,$rfampath;\n+$rfamdir=$tmp[-1];\n+\n+unless ($rfampath=~/\\/$/) {\n+\t$rfampath .="/";\n+}\n+print OUT "<h2>3. Rfam non-miRNA annotation</h2>\n+<h3>3.1 Reads count</h3>\n+<table border=\\"1\\">\n+<tr align=\\"center\\">\n+";\n+\n+my @rfamR; my @rfamT;\n+my $tag=1;\n+open IN,"<$dir/rfam_non-miRNA_annotation.txt";\n+while (my $aline=<IN>) {\n+\tchomp $aline;\n+\t$tag=0 if($aline=~/tags\\s+number/);\n+\tnext if($aline=~/^\\#/);\n+\tnext if($aline=~/^\\s*$/);\n+\tmy @tmp=split/\\s+/,$aline;\n+\tif($tag == 1){push @rfamR,[@tmp];}\n+\telse{push @rfamT,[@tmp];}\n+}\n+close IN;\n+\n+\n+print OUT "<th>RNA Name</th>\\n";\n+foreach (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+for (my $i=0;$i<@rfamR;$i++) {\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">$rfamR[$i][0]</th>\n+\t";\n+\tfor (my $j=1;$j<@{$rfamR[$i]} ;$j++) {\n+\tprint OUT "<td> $rfamR[$i][$j]</td>\\n";\n+\t}\n+}\n+\n+print OUT "</tr>\\n</table>\n+ <h3>3.2 Tags count</h3>\n+ <table border=\\"1\\">\n+ <tr align=\\"center\\">\n+ <th>RNA Name</th>\\n";\n+foreach (@marks) {\n+\tprint OUT "<th> $_ </th>\\n";\n+}\n+for (my $i=0;$i<@rfamT;$i++) {\n+\tprint OUT "</tr>\n+\t<tr align=\\"center\\">\n+\t<th align=\\"left\\">$rfamT[$i][0]</th>\n+\t";\n+\tfor (my $j=1;$j<@{$rfamT[$i]} ;$j++) {\n+\tprint OUT "<td> $rfamT[$i][$j]</td>\\n";\n+\t}\n+}\n+print OUT "</tr>\\n</table>\n+<p>Note:<br />The rfam mapping results is: <b>$rfampath</b>";\n+print OUT "<b>rfam_mapped.bwt</b></p>\n+";\n+}\n+\n+\n+print OUT "\n+ </BODY>\n+</HTML>\n+";\n+close OUT;\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -o\n+options:\n+-o output file\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 matching.pl --- a/matching.pl Thu Nov 13 22:43:35 2014 -0500 +++ b/matching.pl Wed Dec 03 01:54:29 2014 -0500 |
b |
@@ -11,7 +11,7 @@ use Getopt::Long; my %opts; -GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","time:s","h"); +GetOptions(\%opts,"i=s","g=s","index:s","v:i","p:i","r:s","o=s","h"); if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments &usage; } @@ -27,11 +27,7 @@ my $time=&Time(); -if (defined $opts{'time'}) { - $time=$opts{'time'}; -} - -my $mapdir=$fileout."/genome_match_".$time; +my $mapdir=$fileout."/genome_match"; if(not -d $mapdir){ mkdir $mapdir; } |
b |
diff -r ca05d68aca13 -r c75593f79aa9 miRDeep_plant.pl --- a/miRDeep_plant.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
b'@@ -1,1622 +0,0 @@\n-#!/usr/bin/perl\n-\n-use warnings;\n-use strict;\n-use Getopt::Std;\n-#use RNA;\n-\n-\n-################################# MIRDEEP #################################################\n-\n-################################## USAGE ##################################################\n-\n-\n-my $usage=\n-"$0 file_signature file_structure temp_out_directory\n-\n-This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with\n-information on the positions of reads aligned to potential precursor sequences (signature).\n-It also takes as input an RNAfold output file, giving information on the sequence, structure\n-and mimimum free energy of the potential precursor sequences.\n-\n-Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA\n-sequences that should be considered for conservation purposes. -t prints out the potential\n-precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that\n-exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors\n-that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences\n-obtained through conventional cloning. -z consider the number of base pairings in the lower\n-stems (this option is not well tested).\n-\n--h print this usage\n--s fasta file with known miRNAs\n-#-o temp directory ,maked befor running the program.\n--t print filtered\n--u limited output (only ids)\n--v cut-off (default 1)\n--x sensitive option for Sanger sequences\n--y use Randfold\n--z consider Drosha processing\n-";\n-\n-\n-\n-\n-\n-############################################################################################\n-\n-################################### INPUT ##################################################\n-\n-\n-#signature file in blast_parsed format\n-my $file_blast_parsed=shift or die $usage;\n-\n-#structure file outputted from RNAfold\n-my $file_struct=shift or die $usage;\n-\n-my $tmpdir=shift or die $usage;\n-#options\n-my %options=();\n-getopts("hs:tuv:xyz",\\%options);\n-\n-\n-\n-\n-\n-\n-#############################################################################################\n-\n-############################# GLOBAL VARIABLES ##############################################\n-\n-\n-#parameters\n-my $nucleus_lng=11;\n-\n-my $score_star=3.9;\n-my $score_star_not=-1.3;\n-my $score_nucleus=7.63;\n-my $score_nucleus_not=-1.17;\n-my $score_randfold=1.37;\n-my $score_randfold_not=-3.624;\n-my $score_intercept=0.3;\n-my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);\n-my $score_min=1;\n-if($options{v}){$score_min=$options{v};}\n-if($options{x}){$score_min=-5;}\n-\n-my $e=2.718281828;\n-\n-#hashes\n-my %hash_desc;\n-my %hash_seq;\n-my %hash_struct;\n-my %hash_mfe;\n-my %hash_nuclei;\n-my %hash_mirs;\n-my %hash_query;\n-my %hash_comp;\n-my %hash_bp;\n-\n-#other variables\n-my $subject_old;\n-my $message_filter;\n-my $message_score;\n-my $lines;\n-my $out_of_bound;\n-\n-\n-\n-##############################################################################################\n-\n-################################ MAIN ###################################################### \n-\n-\n-#print help if that option is used\n-if($options{h}){die $usage;}\n-unless ($tmpdir=~/\\/$/) {$tmpdir .="/";}\n-if(!(-s $tmpdir)){mkdir $tmpdir;}\n-$tmpdir .="TMP_DIR/";\n-mkdir $tmpdir;\n-\n-#parse structure file outputted from RNAfold\n-parse_file_struct($file_struct);\n-\n-#if conservation is scored, the fasta file of known miRNA sequences is parsed\n-if($options{s}){create_hash_nuclei($options{s})};\n-\n-#parse signature file in blast_parsed format and resolve each potential precursor\n-parse_file_blast_parsed($file_blast_parsed);\n-#`rm -rf $tmpdir`;\n-exit;\n-\n-\n-\n-\n-##############################################################################################\n-\n-############################## SUBROUTINES ###################################################\n-\n-\n-\n-sub parse_file_blast_parsed{\n-\n-# read through the signature blastparsed file, fills up a hash with infor'..b'th\n-# the n\'t nucleotide on the plus strand\n-\n-# This subroutine reverses the begin and end positions of positions of the minus strand so that they\n-# are relative to the 5\' end of the plus strand\t\n- \n- my($beg,$end,$lng)=@_;\n- \n- my $new_end=$lng-$beg+1;\n- my $new_beg=$lng-$end+1;\n- \n- return($new_beg,$new_end);\n-}\n-\n-sub round {\n-\n- #rounds to nearest integer\n- \n- my($number) = shift;\n- return int($number + .5);\n- \n-}\n-\n-\n-sub rev{\n-\n- #reverses the order of nucleotides in a sequence\n-\n- my($sequence)=@_;\n-\n- my $rev=reverse $sequence; \n-\n- return $rev;\n-}\n-\n-sub com{\n-\n- #the complementary of a sequence\n-\n- my($sequence)=@_;\n-\n- $sequence=~tr/acgtuACGTU/TGCAATGCAA/; \n- \n- return $sequence;\n-}\n-\n-sub revcom{\n- \n- #reverse complement\n-\n- my($sequence)=@_;\n-\n- my $revcom=rev(com($sequence));\n-\n- return $revcom;\n-}\n-\n-\n-sub max2 {\n-\n- #max of two numbers\n- \n- my($a, $b) = @_;\n- return ($a>$b ? $a : $b);\n-}\n-\n-sub min2 {\n-\n- #min of two numbers\n-\n- my($a, $b) = @_;\n- return ($a<$b ? $a : $b);\n-}\n-\n-\n-\n-sub score_freq{\n-\n-# scores the count of reads that map to the potential precursor\n-# Assumes geometric distribution as described in methods section of manuscript\n-\n- my $freq=shift;\n-\n- #parameters of known precursors and background hairpins\n- my $parameter_test=0.999;\n- my $parameter_control=0.6;\n-\n- #log_odds calculated directly to avoid underflow\n- my $intercept=log((1-$parameter_test)/(1-$parameter_control));\n- my $slope=log($parameter_test/$parameter_control);\n- my $log_odds=$slope*$freq+$intercept;\n-\n- #if no strong evidence for 3\' overhangs, limit the score contribution to 0\n- unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}\n- \n- return $log_odds;\n-}\n-\n-\n-\n-##sub score_mfe{\n-\n-# scores the minimum free energy in kCal/mol of the potential precursor\n-# Assumes Gumbel distribution as described in methods section of manuscript\n-\n-## my $mfe=shift;\n-\n- #numerical value, minimum 1\n-## my $mfe_adj=max2(1,-$mfe);\n-\n- #parameters of known precursors and background hairpins, scale and location\n-## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n-## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n-\n-## my $odds=$prob_test/$prob_background;\n-## my $log_odds=log($odds);\n-\n-## return $log_odds;\n-##}\n-\n-sub score_mfe{\n-#\tuse bignum;\n-\n-# scores the minimum free energy in kCal/mol of the potential precursor\n-# Assumes Gumbel distribution as described in methods section of manuscript\n-\n- my ($mfe,$mlng)=@_;\n-\n- #numerical value, minimum 1\n- my $mfe_adj=max2(1,-$mfe);\n-my $mfe_adj1=$mfe/$mlng;\n- #parameters of known precursors and background hairpins, scale and location\n-\tmy $a=1.339e-12;my $b=2.778e-13;my $c=45.834;\n-\tmy $ev=$e**($mfe_adj1*$c);\n-\t#print STDERR "\\n***",$ev,"**\\t",$ev+$b,"\\t";\n-\tmy $log_odds=($a/($b+$ev));\n-\n-\n-\tmy $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);\n- my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);\n-\n- my $odds=$prob_test/$prob_background;\n- my $log_odds_2=log($odds);\n-\t#print STDERR "log_odds :",$log_odds,"\\t",$log_odds_2,"\\n";\n- return $log_odds;\n-}\n-\n-\n-\n-sub prob_gumbel_discretized{\n-\n-# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol\n-# uses the subroutine that calculates the cdf to find the probabilities\n-\n- my ($var,$scale,$location)=@_;\n-\n- my $bound_lower=$var-0.5;\n- my $bound_upper=$var+0.5;\n-\n- my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);\n- my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);\n-\n- my $prob=$cdf_upper-$cdf_lower;\n-\n- return $prob;\n-}\n-\n-\n-sub cdf_gumbel{\n-\n-# calculates the cumulative distribution function of the Gumbel distribution\n-\n- my ($var,$scale,$location)=@_;\n-\n- my $cdf=$e**(-($e**(-($var-$location)/$scale)));\n-\n- return $cdf;\n-}\n-\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 miRNA_Express_and_sequence.pl --- a/miRNA_Express_and_sequence.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,173 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-6-4 -#Modified: -#Description: solexa miRNA express and sequence -my $version=1.00; - -use strict; -use Getopt::Long; - -my %opts; -GetOptions(\%opts,"i=s","list=s","fa=s","pre=s","tag=s","h"); -if (!(defined $opts{i} and defined $opts{list} and defined $opts{fa} and defined $opts{pre} and defined $opts{tag}) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $filein=$opts{'i'}; -my $fileout=$opts{'list'}; -my $out=$opts{'fa'}; -my $preout=$opts{'pre'}; - -=cut -my %hash_pri; -open PRI,"<$opts{p}"; -while (my $aline=<PRI>) { - chomp $aline; - if($aline=~/^>(\S+)/){$hash_pri{$1}=$aline;} -} -close PRI; -=cut - -open IN,"<$filein"; #input file -open OUT,">$fileout"; #output file -open FA ,">$out"; -open PRE,">$preout"; - -print OUT "#ID\tcoordinate\tpos1\tpos2"; -my @marks=split/\,/,$opts{'tag'}; -foreach (@marks) { - print OUT "\t",$_,"_matureExp"; -} -foreach (@marks) { - print OUT "\t",$_,"_starExp"; -} -foreach (@marks) { - print OUT "\t",$_,"_totalExp"; -} - -print OUT "\n"; - -my (%uniq_id,$novel); -while (my $aline=<IN>) { - chomp $aline; - until ($aline =~ /^score\s+[-\d\.]+/){ - $aline = <IN>; - if (eof) {last;} - } - if (eof) {last;} -########## miRNA ID ################ - $novel++; -########### annotate#################### - do {$aline=<IN>;} until($aline=~/flank_first_end/) ; - chomp $aline; - my @flank1=split/\t/,$aline; - do {$aline=<IN>;} until($aline=~/flank_second_beg/) ; - chomp $aline; - my @flank2=split/\t/,$aline; -# -########## mature start loop pre #### - do {$aline=<IN>;} until($aline=~/mature_beg/) ; - chomp $aline; - my @start=split/\t/,$aline; -# $start[1] -=$flank1[1]; - do {$aline=<IN>;} until($aline=~/mature_end/) ; - chomp $aline; - my @end=split/\t/,$aline; -# $end[1] -=$flank1[1]; - do {$aline=<IN>;} until($aline=~/mature_seq/) ; - chomp $aline; - my @arr1=split/\t/,$aline; - do {$aline=<IN>;} until($aline=~/pre_seq/) ; - chomp $aline; - my @arr2=split/\t/,$aline; - do {$aline=<IN>;} until($aline=~/pri_id/) ; - chomp $aline; - my @pri_id=split/\t/,$aline; - do {$aline=<IN>;} until($aline=~/pri_seq/) ; - chomp $aline; - my @pri_seq=split/\t/,$aline; - do {$aline=<IN>;} until($aline=~/star_beg/) ; - chomp $aline; - my @star_start=split/\t/,$aline; -# $star_start[1] -=$flank1[1]; - do {$aline=<IN>;} until($aline=~/star_end/) ; - chomp $aline; - my @star_end=split/\t/,$aline; -# $star_end[1] -=$flank1[1]; - do {$aline=<IN>;} until($aline=~/star_seq/) ; - chomp $aline; - my @arr3=split/\t/,$aline; - print OUT "miR-c-$novel\t$pri_id[1]\tmature:$start[1]:$end[1]\tstar:$star_start[1]:$star_end[1]\t"; - #print OUT "$arr1[1]\t$arr3[1]\t$arr2[1]\t\/\t"; - print FA ">miR-c-$novel\n$arr1[1]\n"; - print PRE ">miR-c-$novel\n$pri_seq[1]\n"; -########## reads count ############# - <IN>; - my @count1;my @count2;my @count3;my @count4; - $aline=<IN>; - do { - chomp $aline; - my @reads=split/\t/,$aline; - my @pos=(); - $reads[5]=~/(\d+)\.\.(\d+)/; -# $pos[0] =$1-$flank1[1]; -# $pos[1] =$2-$flank1[1]; - $pos[0]=$1; - $pos[1]=$2; - $reads[0]=~/:([\d|_]+)_x(\d+)$/; - my @ss=split/_/,$1; - for (my $i=0;$i<@ss ;$i++) { - if (!(defined $count3[$i])) { - $count3[$i]=0; - } - if (!(defined $count4[$i])) { - $count4[$i]=0; - } - $count2[$i]+=$ss[$i]; - - } -# $count3 +=$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 ); -# $count4 +=$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 ); -# $count1 =$1 if($end[1]-$pos[0]>=10 && $pos[1]-$start[1]>=10 && $count1<$1); -# $count2 =$1 if($star_end[1]-$pos[0]>=10 && $pos[1]-$star_start[1]>=10 && $count2<$1); - if($end[1]-$pos[1]>=-5 && $end[1]-$pos[1]<=5 && $pos[0]-$start[1]>=-3 && $pos[0]-$start[1]<=3 ) - { - for (my $i=0;$i<@ss;$i++) { - $count3[$i]+=$ss[$i]; - } - } - if($star_end[1]-$pos[1]<=5 && $star_end[1]-$pos[1]>=-5 && $pos[0]-$star_start[1]>=-3 && $pos[0]-$star_start[1]<=3){ - for (my $i=0;$i<@ss;$i++) { - $count4[$i]+=$ss[$i]; - } - } - $aline=<IN>; - chomp $aline; - } until(length $aline < 1) ; - $"="\t"; - print OUT "@count3\t@count4\t@count2\n"; - $"=" "; -} - -close IN; -close OUT; - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -i -list -fa -pre -tag -options: --i input file,predictions file --list output file miRNA list file --fa output file ,miRNA sequence fasta file. --pre output file, miRNA precursor fasta file. --tag string, sample names# eg: samA,samB,samC --h help -USAGE -exit(1); -} - |
b |
diff -r ca05d68aca13 -r c75593f79aa9 miRPlant.pl --- a/miRPlant.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,506 +0,0 @@\n-#!/usr/bin/perl -w\n-#Filename:\n-#Author: Tian Dongmei\n-#Email: tiandm@big.ac.cn\n-#Date: 2014-4-22\n-#Modified:\n-#Description: plant microRNA prediction\n-my $version=1.00;\n-\n-use strict;\n-use Getopt::Long;\n-use threads;\n-#use threads::shared;\n-use File::Path;\n-use File::Basename;\n-#use RNA;\n-#use Term::ANSIColor;\n-\n-my %opts;\n-GetOptions(\\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");\n-if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments\n-&usage;\n-}\n-\n-my $time=&Time();\n-print "miPlant program start:\\n The time is $time!\\n";\n-print "Command line:\\n $0 @ARGV\\n";\n-\n-my $format=$opts{\'format\'};\n-if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \n-\t#&printErr();\n-\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\n-}\n-\n-my $phred_qv=64;\n-\n-\n-my @inputfiles=@{$opts{\'i\'}};\n-my @inputtags=@{$opts{\'tag\'}};\n-\n-my $mypath=`pwd`;\n-chomp $mypath;\n-\n-my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/miRPlant_out/";\n-\n-\n-unless ($dir=~/\\/$/) {$dir.="/";}\n-if (not -d $dir) {\n-\tmkdir $dir;\n-}\n-my $config=$dir."/input_config";\n-open CONFIG,">$config";\n-\tfor (my $i=0;$i<@inputfiles;$i++) {\n-\t\tprint CONFIG $inputfiles[$i],"\\t",$inputtags[$i],"\\n";\n-\t}\n-close CONFIG;\n-\n-my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n-\n-my $a="ATCTCGTATG"; #adapter\n-if (defined $opts{\'a\'}) {$a=$opts{\'a\'};}\n-\n-my $m=6; #adapter minimum mapped nt\n-if (defined $opts{\'M\'}) {$m=$opts{\'M\'};}\n-\n-my $t=1; #threads number\n-if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n-\n-my $min_nt=19; # minimum reads length\n-if (defined $opts{\'min\'}) {$min_nt=$opts{\'min\'};}\n-\n-my $max_nt=28; #maximum reads length\n-if (defined $opts{\'max\'}) {$max_nt=$opts{\'max\'};}\n-\n-my $mis=0; #mismatch number for microRNA\n-if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n-\n-my $mis_rfam=0;# mismatch number for rfam\n-if (defined $opts{\'v\'}) {$mis_rfam=$opts{\'v\'};}\n-\n-my $hit=25; # maximum reads mapping hits in genome\n-if (defined $opts{\'r\'}) {$hit=$opts{\'r\'};}\n-\n-my $upstream = 2; # microRNA 5\' extension\n-$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\n-\n-my $downstream = 5;# microRNA 3\' extension\n-$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\n-\n-my $maxd=defined $opts{\'dis\'} ? $opts{\'dis\'} : 200;\n-my $flank=defined $opts{\'flank\'} ? $opts{\'flank\'} :10;\n-my $mfe=defined $opts{\'mfe\'} ? $opts{\'mfe\'} : -20;\n-\n-$time=&Time();\n-print "$time, Checking input file!\\n";\n-\n-my (@filein,@mark,@clean);\n-#&read_config();\n-@filein=@inputfiles;\n-@mark=@inputtags;\n-\n-&checkfa($opts{pre});\n-&checkfa($opts{mat});\n-&checkfa($opts{gfa});\n-\n-\n-##### clip adpter --> clean data start\n-$time=&Time();\n-print "$time, Preprocess:\\n trim adapter, reads collapse and filter reads by length.\\n";\n-\n-$time=~s/:/-/g;\n-$time=~s/ /-/g;\n-my $preprocess=$dir."preProcess_${time}/";\n-mkdir $preprocess;\n-my $can_use_threads = eval \'use threads; 1\';\n-if ($can_use_threads) {\n-# Do processing using threads\n-\tprint "Do processing using threads\\n";\n-\tmy @filein1=@filein; my @mark1=@mark;\n-\twhile (@filein1>0) {\n-\t\tmy @thrs; my @res;\n-\t\tfor (my $i=0;$i<$t ;$i++) {\n-\t\t\tlast if(@filein1==0);\n-\t\t\tmy $in=shift @filein1;\n-\t\t\tmy $out=shift @mark1;\n-\t\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n-\t\t\t$thrs[$i]=threads->create(\\&clips,$in,$out);\n-\t\t}\n-\t\tfor (my $i=0;$i<@thrs;$i++) {\n-\t\t\t$res[$i]=$thrs[$i]->join();\n-\t\t}\n-\t}\n-} else {\n-# Do not processing using threads\n-\tprint "Do not processing using threads\\n";\n-\tfor (my $i=0;$i<@filein ;$i++) {\n-\t\tmy $in=$filein[$i];\n-\t\tmy $out=$mark[$i];\n-\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n-\t\t&clips($in,$out);\n-\t}\n-}\n-\n-##### clip adpter --> clean data end\n-\n-my $collapsed=$preprocess."collapse_reads.fa";\n-my $da'..b'opendir I,$dir;\n-\tmy @ret;\n-\twhile (my $file=readdir I) {\n-\t\tif ($file=~/$str/) {\n-\t\t\tpush @ret, $file;\n-\t\t}\n-\t}\n-\tclosedir I;\n-\tif (@ret != 1) {\n-\t\t#&printErr();\n-\n-\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n-\t}\n-\treturn $ret[0];\n-}\n-\n-=cut\n-\n-sub printErr{\n- print STDERR color \'bold red\';\n- print STDERR "Error: ";\n- print STDERR color \'reset\';\n-}\n-sub Time{\n- my $time=time();\n- my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n- $month++;\n- $year+=1900;\n- if (length($sec) == 1) {$sec = "0"."$sec";}\n- if (length($min) == 1) {$min = "0"."$min";}\n- if (length($hour) == 1) {$hour = "0"."$hour";}\n- if (length($day) == 1) {$day = "0"."$day";}\n- if (length($month) == 1) {$month = "0"."$month";}\n- #print "$year-$month-$day $hour:$min:$sec\\n";\n- return("$year-$month-$day-$hour-$min-$sec");\n-}\n-=cut\n-sub Time{\n- my $time=time();\n- my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n- $month++;\n- $year+=1900;\n- if (length($sec) == 1) {$sec = "0"."$sec";}\n- if (length($min) == 1) {$min = "0"."$min";}\n- if (length($hour) == 1) {$hour = "0"."$hour";}\n- if (length($day) == 1) {$day = "0"."$day";}\n- if (length($month) == 1) {$month = "0"."$month";}\n- #print "$year-$month-$day $hour:$min:$sec\\n";\n- return("$year-$month-$day $hour:$min:$sec");\n-}\n-\n-\n-sub usage{\n-print <<"USAGE";\n-Version $version\n-Usage:\n-\n-$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t -o -path\n-options:\n--i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...\n--tag string # raw data file names, -tag xxx -tag xxx\n-\n--format string,#specific input rawdata file format : fastq|fq|fasta|fa\n-\n--path scirpt path\n-\n--gfa string, input file # genome fasta. sequence file\n--idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n- string must be the prefix of the bowtie index. For instance, if\n- the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n- the prefix is \'h_sapiens_37_asm\'.##can be null\n-\n--pre string, input file #species specific microRNA precursor sequences\n--mat string, input file #species specific microRNA mature sequences\n-\n--rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.\n--idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\n- string must be the prefix of the bowtie index. For instance, if\n- the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n- the prefix is \'h_sapiens_37_asm\'.##can be null\n-\n--D If [-D] is specified,will discard rfam mapped reads(nead -rfam).\n-\n--a string, ADAPTER string. default is ATCTCGTATG.\n--M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don\'t clip it. \n--min int, reads min length,default is 19.\n--max int, reads max length,default is 28.\n-\n--mis [int] number of allowed mismatches when mapping reads to precursors, default 0\n--e [int] number of nucleotides upstream of the mature sequence to consider, default 2\n--f [int] number of nucleotides downstream of the mature sequence to consider, default 5\n--v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\n--r int a read is allowed to map up to this number of positions in the genome,default is 25 \n-\n--dis <int> Maximal space between miRNA and miRNA* (200)\n--flank <int> Flank sequence length of miRNA precursor (10)\n--mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)\n-\n--t int, number of threads [1]\n-\n--o output directory# absolute path\n--h help\n-USAGE\n-exit(1);\n-}\n-\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 miRPlant.xml --- a/miRPlant.xml Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,89 +0,0 @@ -<tool id="plant_microRNA_v1" name="miRPlant" veision="1.0.0"> - <description>tool for plant microRNA analisis</description> - - <requirements> - <requirement type="package" version="0.0.13">fastx_toolkit </requirement> - <requirement type="package" version="0.12.7">bowtie</requirement> - <requirement type="set_environment">SCRIPT_PATH</requirement> - <!--requirement type="package" version="3.0.1">R</requirement!--> - <requirement type="package" version="2.59">SVG</requirement> - <requirement type="package" version="2.1.8">ViennaRNA</requirement> - </requirements> - - <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command--> - - <command interpreter="perl">miRPlant.pl - ## Change this to accommodate the number of threads you have available. - -t \${GALAXY_SLOTS:-4} - ## Do or not delet rfam mapped tags - #if $params.delet_rfam == "yes": - -D - #end if - -path \$SCRIPT_PATH - - #for $j, $s in enumerate( $series ) - ##rank_of_series=$j - -i ${s.input} - -tag ${s.tag} - #end for - - -format $format -gfa $gfa -pre $pre -mat $mat -rfam $rfam -a $a -M $mapnt -min $min -max $max -mis $mismatch -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe > run.log - </command> - - <inputs> - - <repeat name="series" title="Series"> - <param name="input" type="data" label="Raw data"/> - <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/> - </repeat> - - <conditional name="params"> - <param name="delet_rfam" type="select" label="delet rfam mapped reads"> - <option value="yes" selected="true">yes</option> - <option value="no">no</option> - </param> - </conditional> <!-- params --> - - <!--param name="input" format="tabular" type="data" label="input config file" /--> - - <param name="format" type="select" lable="raw data format" multiple="false"> - <option value="fastq">Raw data is fastq. format</option> - <option value="fasta">Raw data is fasta. format</option> - </param> - - <param name="gfa" type="data" label="genome sequence fasta file"/> - <!--param type="data" name="index" label="genome sequence bowtie index"/--> - <param name="mat" type="data" label="mature microRNA sequence file" /> - <param name="pre" type="data" label="precursor microRNA sequence fie" /> - <param name="rfam" type="data" label="rfam sequence file" /> - <!--param type="data" name="idx2" label="rfam sequence bowtie index " --> - <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" /> - <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" /> - <param name="min" type="integer" value="19" label="minimum microRNA length" /> - <param name="max" type="integer" value="28" label="maximum microRNA length" /> - <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to precursors" /> - <param name="e" type="integer" value="2" label="number of nucleotides upstream of the mature sequence to consider" /> - <param name="f" type="integer" value="5" label="number of nucleotides downstream of the mature sequence to consider" /> - <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches"/> - <param name="r" type="integer" value="25" label="a read is allowed to map up to this number of positions in the genome" /> - <param name="dis" type="integer" value="200" label="Maximal space between miRNA and miRNA*" /> - <param name="flank" type="integer" value="10" label="Flank sequence length of miRNA precursor" /> - <param name="mfe" type="float" value="-30" label="Maximal free energy allowed for a miRNA precursor" /> - </inputs> - - <outputs> - <data format="txt" name="known microRNA express list" from_work_dir="miRPlant_out/known_microRNA_express.txt" label="${tool.name} on ${on_string}: known microRNA express list"/> - <data format="txt" name="known microRNA express alignment" from_work_dir="miRPlant_out/known_microRNA_express.aln" label="${tool.name} on ${on_string}: known microRNA express alignment"/> - <data format="txt" name="known microRNA moRs result" from_work_dir="miRPlant_out/known_microRNA_express.moRs" label="${tool.name} on ${on_string}: known microRNA moRs result"/> - <data format="txt" name="known microRNA precursor file" from_work_dir="miRPlant_out/known_microRNA_precursor.fa" label="${tool.name} on ${on_string}: known microRNA precursor file"/> - <data format="txt" name="known microRNA mature file" from_work_dir="miRPlant_out/known_microRNA_mature.fa" label="${tool.name} on ${on_string}: known microRNA mature file"/> - <data format="txt" name="novel microRNA express list" from_work_dir="miRPlant_out/novel_microRNA_express.txt" label="${tool.name} on ${on_string}: novel microRNA express list"/> - <data format="txt" name="novel microRNA precursor file" from_work_dir="miRPlant_out/novel_microRNA_precursor.fa" label="${tool.name} on ${on_string}: novel microRNA precursor file"/> - <data format="txt" name="novel microRNA mature sequence file" from_work_dir="miRPlant_out/novel_microRNA_mature.fa" label="${tool.name} on ${on_string}: novel microRNA mature sequence file"/> - <data format="html" name="analysis result" from_work_dir="miRPlant_out/result.html" label="${tool.name} on ${on_string}: analysis result"/> - </outputs> - - <help> - - </help> - </tool> |
b |
diff -r ca05d68aca13 -r c75593f79aa9 preProcess.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preProcess.pl Wed Dec 03 01:54:29 2014 -0500 |
[ |
b'@@ -0,0 +1,383 @@\n+#!/usr/bin/perl -w\n+#Filename:\n+#Author: Tian Dongmei\n+#Email: tiandm@big.ac.cn\n+#Date: 2014-12-2\n+#Modified:\n+#Description: RNA-seq data pre-process\n+my $version=1.00;\n+\n+use strict;\n+use Getopt::Long;\n+use threads;\n+#use threads::shared;\n+use File::Path;\n+use File::Basename;\n+#use RNA;\n+#use Term::ANSIColor;\n+\n+my %opts;\n+GetOptions(\\%opts,"i:s@","tag:s@","format=s","phred:i","gfa=s","rfam:s","idx:s","idx2:s","mis:i","v:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","h");\n+if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} ) || defined $opts{h}) { #necessary arguments\n+&usage;\n+}\n+\n+my $time=&Time();\n+print "miPlant program start:\\n The time is $time!\\n";\n+print "Command line:\\n $0 @ARGV\\n";\n+\n+my $format=$opts{\'format\'};\n+if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { \n+\t#&printErr();\n+\tdie "Parameter \\"-format\\" is error! Parameter is fastq, fq, fasta or fa\\n";\n+}\n+\n+my $phred_qv=64;\n+if (defined $opts{\'phred\'}) {$phred_qv=$opts{\'phred\'};}\n+\n+my @inputfiles=@{$opts{\'i\'}};\n+my @inputtags=@{$opts{\'tag\'}};\n+\n+my $mypath=`pwd`;\n+chomp $mypath;\n+\n+my $dir=defined $opts{\'o\'} ? $opts{\'o\'} : "$mypath/preProcess/";\n+\n+\n+unless ($dir=~/\\/$/) {$dir.="/";}\n+if (not -d $dir) {\n+\tmkdir $dir;\n+}\n+my $config=$dir."/input_config";\n+open CONFIG,">$config";\n+\tfor (my $i=0;$i<@inputfiles;$i++) {\n+\t\tprint CONFIG $inputfiles[$i],"\\t",$inputtags[$i],"\\n";\n+\t}\n+close CONFIG;\n+\n+my $scipt_path=defined $opts{\'path\'} ? $opts{\'path\'} : "/Users/big/galaxy-dist/tools/myTools/";\n+\n+my $a="ATCTCGTATG"; #adapter\n+if (defined $opts{\'a\'}) {$a=$opts{\'a\'};}\n+\n+my $m=6; #adapter minimum mapped nt\n+if (defined $opts{\'M\'}) {$m=$opts{\'M\'};}\n+\n+my $t=1; #threads number\n+if (defined $opts{\'t\'}) {$t=$opts{\'t\'};}\n+\n+my $min_nt=19; # minimum reads length\n+if (defined $opts{\'min\'}) {$min_nt=$opts{\'min\'};}\n+\n+my $max_nt=28; #maximum reads length\n+if (defined $opts{\'max\'}) {$max_nt=$opts{\'max\'};}\n+\n+my $mis=0; #mismatch number for microRNA\n+if (defined $opts{\'mis\'}) {$mis=$opts{\'mis\'};}\n+\n+my $mis_rfam=0;# mismatch number for rfam\n+if (defined $opts{\'v\'}) {$mis_rfam=$opts{\'v\'};}\n+\n+my (@filein,@mark,@clean);\n+#&read_config();\n+@filein=@inputfiles;\n+@mark=@inputtags;\n+\n+&checkfa($opts{gfa});\n+\n+\n+##### clip adpter --> clean data start\n+my $preprocess=$dir."preProcess_clean/";\n+mkdir $preprocess;\n+my $can_use_threads = eval \'use threads; 1\';\n+if ($can_use_threads) {\n+# Do processing using threads\n+\tprint "Do processing using threads\\n";\n+\tmy @filein1=@filein; my @mark1=@mark;\n+\twhile (@filein1>0) {\n+\t\tmy @thrs; my @res;\n+\t\tfor (my $i=0;$i<$t ;$i++) {\n+\t\t\tlast if(@filein1==0);\n+\t\t\tmy $in=shift @filein1;\n+\t\t\tmy $out=shift @mark1;\n+\t\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t\t$thrs[$i]=threads->create(\\&clips,$in,$out);\n+\t\t}\n+\t\tfor (my $i=0;$i<@thrs;$i++) {\n+\t\t\t$res[$i]=$thrs[$i]->join();\n+\t\t}\n+\t}\n+} else {\n+# Do not processing using threads\n+\tprint "Do not processing using threads\\n";\n+\tfor (my $i=0;$i<@filein ;$i++) {\n+\t\tmy $in=$filein[$i];\n+\t\tmy $out=$mark[$i];\n+\t\tpush @clean,$preprocess.$out."_clips_adapter.fq";\n+\t\t&clips($in,$out);\n+\t}\n+}\n+\n+##### clip adpter --> clean data end\n+\n+my $collapsed=$preprocess."collapse_reads.fa";\n+my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data\n+&collapse(\\@clean,$collapsed); #collapse reads to tags\n+\n+&filterbylength(); # filter <$min_nt && >$max_nt\n+\n+print "The final clean data file is $data, only contains reads which length is among $min_nt\\~$max_nt\\n\\n";\n+\n+\n+$time=Time();\n+print "$time: Genome alignment!\\n\\n";\n+my $genome_map=$dir."genome_match";\n+&genome($data);\n+#my $genome_map=&search($dir,"genome_match_");\n+my $mapfile=$genome_map."/genome_mapped.bwt";\n+my $mapfa=$genome_map."/genome_mapped.fa";\n+my $unmap=$genome_map."/genome_not_mapped.fa";\n+\n+chdir $dir;\n+my $pathfile="$dir/path.txt";\n+open PA,">$pathfile";\n+print PA "$config\\n";\n+print PA "$preprocess\\n";\n+print PA "$genome_map\\n";\n+\n+if'..b'<$file_reads";\n+\tmy $line=<N>;\n+\tchomp $line;\n+ if($line !~ /^>\\S+/){\n+ #printErr();\n+ die "The first line of file $file_reads does not start with \'>identifier\'\n+Reads file $file_reads is not a valid fasta file\\n\\n";\n+ }\n+ if(<N> !~ /^[ACGTNacgtn]*$/){\n+ #printErr();\n+ die "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+ }\n+\tclose N;\n+}\n+sub checkfq{\n+\tmy ($file_reads)=@_;\n+\n+\topen N,"<$file_reads";\n+\tfor (my $i=0;$i<10;$i++) {\n+\t\tmy $a=<N>;\n+\t\tmy $b=<N>;\n+\t\tmy $c=<N>;\n+\t\tmy $d=<N>;\n+\t\tchomp $a;\n+\t\tchomp $b;\n+\t\tchomp $c;\n+\t\tchomp $d;\n+\t\tif($a!~/^\\@/){\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif($b!~ /^[ACGTNacgtn]*$/){\n+\t\t\t#&printErr();\n+\t\t\tdie "File $file_reads contains not allowed characters in sequences\n+Allowed characters are ACGTN\n+Reads file $file_reads is not a fasta file\\n\\n";\n+\t\t}\n+\t\tif ($c!~/^\\@/ && $c!~/^\\+/) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tif ((length $b) != (length $d)) {\n+\t\t\t#&printErr();\n+\t\t\tdie "$file_reads is not a fastq file\\n\\n";\n+\t\t}\n+\t\tmy @qv=split //,$d;\n+\t\tfor (my $j=0;$j<@qv ;$j++) {\n+\t\t\tmy $q=ord($qv[$j])-64;\n+\t\t\tif($q<0){$phred_qv=33;}\n+\t\t}\n+\t}\n+\tclose N;\n+}\n+\n+sub search{\n+\tmy ($dir,$str)=@_;\n+\topendir I,$dir;\n+\tmy @ret;\n+\twhile (my $file=readdir I) {\n+\t\tif ($file=~/$str/) {\n+\t\t\tpush @ret, $file;\n+\t\t}\n+\t}\n+\tclosedir I;\n+\tif (@ret != 1) {\n+\t\t#&printErr();\n+\n+\t\tdie "Can not find directory or file which name has string: $str !!!\\n";\n+\t}\n+\treturn $ret[0];\n+}\n+\n+sub Time{\n+ my $time=time();\n+ my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\n+ $month++;\n+ $year+=1900;\n+ if (length($sec) == 1) {$sec = "0"."$sec";}\n+ if (length($min) == 1) {$min = "0"."$min";}\n+ if (length($hour) == 1) {$hour = "0"."$hour";}\n+ if (length($day) == 1) {$day = "0"."$day";}\n+ if (length($month) == 1) {$month = "0"."$month";}\n+ #print "$year-$month-$day $hour:$min:$sec\\n";\n+ return("$year-$month-$day $hour:$min:$sec");\n+}\n+\n+\n+sub usage{\n+print <<"USAGE";\n+Version $version\n+Usage:\n+$0 -i -format -gfa -index -rfam -a -M -min -max -mis -v -t -o -path\n+options:\n+-i input files, # raw data file, can be multipe eg. -i xxx.fq -i xxx .fq ...\n+-tag string # raw data file names, -tag xxx -tag xxx\n+\n+-format string,#specific input rawdata file format : fastq|fq|fasta|fa\n+-phred int # phred quality number, default is 64\n+\n+-path scirpt path\n+\n+-gfa string, input file # genome fasta. sequence file\n+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+ string must be the prefix of the bowtie index. For instance, if\n+ the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+ the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.\n+-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter\n+ string must be the prefix of the bowtie index. For instance, if\n+ the first indexed file is called \'h_sapiens_37_asm.1.ebwt\' then\n+ the prefix is \'h_sapiens_37_asm\'.##can be null\n+\n+-a string, ADAPTER string. default is ATCTCGTATG.\n+-M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don\'t clip it. \n+-min int, reads min length,default is 19.\n+-max int, reads max length,default is 28.\n+\n+-mis [int] number of allowed mismatches when mapping reads to genome, default 0\n+-v <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment\n+\n+-t int, number of threads [1]\n+\n+-o output directory# absolute path\n+-h help\n+USAGE\n+exit(1);\n+}\n+\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 preProcess.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/preProcess.xml Wed Dec 03 01:54:29 2014 -0500 |
[ |
@@ -0,0 +1,120 @@ +<tool id="preprocess" name="preProcess" veision="1.0.0"> + <description>tool for Raw data preprocess analisis, including 3' adapter triming, reads collaping, genome mapping and rfam non-miRNA analysis </description> + + <requirements> + <requirement type="package" version="0.0.13">fastx_toolkit </requirement> + <requirement type="package" version="0.12.7">bowtie</requirement> + <requirement type="set_environment">SCRIPT_PATH</requirement> + <!--requirement type="package" version="3.0.1">R</requirement!--> + <requirement type="package" version="2.59">SVG</requirement> + <requirement type="package" version="2.1.8">ViennaRNA</requirement> + </requirements> + + <!--command interpreter="perl">miPlant.pl -i $input -format $format -gfa $gfa -idx $index -pre $pre -mat $mat -rfam $rfam -idx2 $idx2 -D $D -a $a -M $M -min $min -max $max -mis $mis -e $e -f $f -v $v -r $r -dis $dis -flank $flank -mfe $mfe -t $t -o $output</command--> + + <command interpreter="perl">miRPlant.pl + ## Change this to accommodate the number of threads you have available. + -t \${GALAXY_SLOTS:-4} + -path \$SCRIPT_PATH + + #for $j, $s in enumerate( $series ) + ##rank_of_series=$j + -i ${s.input} + -tag ${s.tag} + #end for + + ## Do or not annotate rfam non-miRNA RNAs + #if $params.annotate_rfam == "yes": + -rfam $rfam + #end if + + ## prepare bowtie index + #set index_path = '' + #if str($reference_genome.source) == "history": + bowtie-build "$reference_genome.own_file" genome; ln -s "$reference_genome.own_file" genome.fa; + #set index_path = 'genome' + #else: + #set index_path = $reference_genome.index.fields.path + #end if + + -format $format -gfa ${index_path}.fa -idx $index_path -a $a -M $mapnt -min $min -max $max -mis $mismatch -v $v > run.log + </command> + + <inputs> + + <repeat name="series" title="Series"> + <param name="input" type="data" label="Raw data"/> + <param name="tag" type="text" data_ref="input" label="Sample name of raw data"/> + </repeat> + + <!--param name="input" format="tabular" type="data" label="input config file" /--> + + <param name="format" type="select" lable="raw data format" multiple="false"> + <option value="fastq">Raw data is fastq. format</option> + <option value="fasta">Raw data is fasta. format</option> + </param> + + <!-- reference genome --> + <conditional name="reference_genome"> + <param name="source" type="select" label="Will you select a reference genome from your history or use a built-in index?" help="Built-ins were indexed using default options"> + <option value="indexed">Use a built-in index</option> + <option value="history">Use one from the history</option> + </param> + <when value="indexed"> + <param name="index" type="select" label="Select a reference genome" help="If your genome of interest is not listed, contact the Galaxy team"> + <options from_data_table="bowtie_indexes"> + <filter type="sort_by" column="2"/> + <validator type="no_options" message="No indexes are available for the selected input dataset"/> + </options> + </param> + </when> + <when value="history"> + <param name="own_file" type="data" format="fasta" metadata_name="dbkey" label="Select the reference genome" /> + </when> + </conditional> + + <!--param name="gfa" type="data" label="genome sequence fasta file"/--> + <!--param type="data" name="index" label="genome sequence bowtie index"/--> + <param name="a" type="text" value="ATCTCGTATG" label="3' adapter sequence" /> + <param name="mapnt" type="integer" value="8" label="minimum adapter map nts" /> + <param name="min" type="integer" value="19" label="minimum microRNA length" /> + <param name="max" type="integer" value="28" label="maximum microRNA length" /> + <param name="mismatch" type="integer" value="0" label="number of allowed mismatches when mapping reads to genome" /> + + <conditional name="params"> + <param name="annotate_rfam" type="select" label="annotate rfam nocoding RNAs(excluding miRNA)"> + <option value="yes" selected="true">yes</option> + <option value="no">no</option> + </param> + <when value="yes"> + <param name="rfam" type="data" label="rfam sequence file" /> + <!--param type="data" name="idx2" label="rfam sequence bowtie index " --> + <param name="v" type="integer" value="0" label="report end-to-end hits less than v mismatches for rfam mapping"/> + </when> + </conditional> <!-- params --> + + </inputs> + + <outputs> + <data format="html" name="preprocess result" from_work_dir="preProcess/preprocessResult.html" label="${tool.name} on ${on_string}: preprocess result"/> + + <data format="txt" name="clean FASTA data" from_work_dir="preProcess/preProcess_clean/collapse_reads_$min_$max.fa" label="${tool.name} on ${on_string}: clean FASTA data"/> + + <data format="txt" name="genome mapping result" from_work_dir="preProcess/genome_match/genome_mapped.bwt" label="${tool.name} on ${on_string}: genome mapping result"/> + <data format="txt" name="genome mapped FASTA reads" from_work_dir="preProcess/genome_match/genome_mapped.fa" label="${tool.name} on ${on_string}: genome mapped FASTA reads"/> + + <data format="txt" name="Rfam mapping result" from_work_dir="preProcess/rfam_match/rfam_mapped.bwt" label="${tool.name} on ${on_string}: Rfam mapping result"> + <filter>(params['annotate_rfam'] == 'Yes')</filter> + </data> + <data format="txt" name="Rfam mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_mapped.fa" label="${tool.name} on ${on_string}: Rfam mapped FASTA file"> + <filter>(params['annotate_rfam'] == 'Yes')</filter> + </data> + <data format="txt" name="Rfam not mapped FASTA file" from_work_dir="preProcess/rfam_match/rfam_not_mapped.fa" label="${tool.name} on ${on_string}: Rfam not mapped FASTA file"> + <filter>(params['annotate_rfam'] == 'Yes')</filter> + </data> + </outputs> + + <help> + + </help> + </tool> |
b |
diff -r ca05d68aca13 -r c75593f79aa9 precursors.pl --- a/precursors.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,829 +0,0 @@\n-#!/usr/bin/perl -w\n-#Filename:\n-#Author: Tian Dongmei\n-#Email: tiandm@big.ac.cn\n-#Date: 2013/7/19\n-#Modified:\n-#Description: \n-my $version=1.00;\n-\n-use strict;\n-use Getopt::Long;\n-#use RNA;\n-\n-my %opts;\n-GetOptions(\\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");\n-if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments\n-&usage;\n-}\n-\n-my $checkno=1;\n-my $filein=$opts{\'map\'};\n-my $faout=$opts{\'o\'};\n-my $strout=$opts{\'s\'};\n-my $genome= $opts{\'g\'};\n-\n-my $maxd=defined $opts{\'d\'} ? $opts{\'d\'} : 200;\n-my $flank=defined $opts{\'f\'}? $opts{\'f\'} : 10;\n-\n-my $MAX_ENERGY=-18;\n-if (defined $opts{\'e\'}) {$MAX_ENERGY=$opts{\'e\'};}\n-my $MAX_UNPAIR=5;\n-my $MIN_PAIR=15;\n-my $MAX_SIZEDIFF=4;\n-my $MAX_BULGE=2;\n-my $ASYMMETRY=5;\n-my $MIN_UNPAIR=0;\n-my $MIN_SPACE=5;\n-my $MAX_SPACE=$maxd;\n-my $FLANK=$flank;\n-\n-######### load in genome sequences start ########\n-my %genome;\n-my %lng;\n-my $name;\n-open IN,"<$genome";\n-while (my $aline=<IN>) {\n-\tchomp $aline;\n-\tnext if($aline=~/^\\#/);\n-\tif ($aline=~/^>(\\S+)/) {\n-\t\t$name=$1;\n-\t\tnext;\n-\t}\n-\t$genome{$name} .=$aline;\n-}\n-close IN;\n-foreach my $key (keys %genome) {\n-\t$lng{$key}=length($genome{$key});\n-}\n-####### load in genome sequences end ##########\n-\n-my %breaks; ### reads number bigger than 3\n-open IN,"<$filein"; #input file \n-while (my $aline=<IN>) {\n-\tchomp $aline;\n-\tmy @tmp=split/\\t/,$aline;\n-\t$tmp[0]=~/_x(\\d+)$/;\n-\tmy $no=$1;\n-\tnext if($no<3);\n-\t#my $trand=&find_strand($tmp[9]);\n-\t#my @pos=split/\\.\\./,$tmp[5];\n-\tmy $end=$tmp[3]+length($tmp[4])-1;\n-\tif($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}\n-\tpush @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base\n-}\n-close IN;\n-\n-my %cites; ### peaks\n-foreach my $chr (keys %breaks) {\n-\tforeach my $strand (keys %{$breaks{$chr}}) {\n-\t\tmy @array=@{$breaks{$chr}{$strand}};\n-\t\t@array=sort{$a->[0]<=>$b->[0]} @array;\n-\t\tfor (my $i=0;$i<@array;$i++) {\n-\t\t\tmy $start=$array[$i][0];my $end=$array[$i][1];\n-\t\t\tmy @subarray=();\n-\t\t\tpush @subarray,$array[$i];\n-\n-\t\t\tfor (my $j=$i+1;$j<@array;$j++) {\n-\t\t\t\tif ($start<$array[$j][1] && $end>$array[$j][0]) {\n-\t\t\t\t\tpush @subarray,$array[$j];\n-\t\t\t\t\t($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);\n-\t\t\t\t}\n-\t\t\t\telse{\n-\t\t\t\t\t$i=$j;\n-\t\t\t\t\t&find_cites(\\@subarray,$chr,$strand);\n-\t\t\t\t\tlast;\n-\t\t\t\t}\n-\t\t\t}\n-\t\t}\n-\t}\n-}\n-\n-open FA,">$faout"; #output file \n-open STR,">$strout";\n-foreach my $chr (keys %cites) {\n-\tforeach my $strand (keys %{$cites{$chr}}) {\n-\n-\t\tmy @array2=@{$cites{$chr}{$strand}};\n-\t\t@array2=sort{$a->[0]<=>$b->[0]} @array2;\n-\t\t&excise(\\@array2,$chr,$strand);\n-\t}\n-}\n-close FA;\n-close STR;\n-sub oneCiteDn{\n-\tmy ($array,$a,$chr,$strand)=@_;\n-\n-\tmy $ss=$$array[$a][0]-$flank;\n-\t$ss=0 if($ss<0);\n-\tmy $ee=$$array[$a][1]+$maxd+$flank;\n-\t$ee=$lng{$chr} if($ee>$lng{$chr});\n-\t\n-\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n-\tif($strand eq "-"){$seq=revcom($seq);}\n-\n-\tmy $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);\n-\treturn $val;\n-}\n-sub oneCiteUp{\n-\tmy ($array,$a,$chr,$strand)=@_;\n-\n-\tmy $ss=$$array[$a][0]-$maxd-$flank;\n-\t$ss=0 if($ss<0);\n-\tmy $ee=$$array[$a][1]+$flank;\n-\t$ee=$lng{$chr} if($ee>$lng{$chr});\n-\t\n-\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n-\tif($strand eq "-"){$seq=revcom($seq);}\n-\n-\tmy $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);\n-\treturn $val;\n-\n-}\n-\n-sub twoCites{\n-\tmy ($array,$a,$b,$chr,$strand)=@_;\n-\n-\tmy $ss=$$array[$a][0]-$flank;\n-\t$ss=0 if($ss<0);\n-\tmy $ee=$$array[$b][1]+$flank;\n-\t$ee=$lng{$chr} if($ee>$lng{$chr});\n-\t\n-\tmy $seq=substr($genome{$chr},$ss,$ee-$ss+1);\n-\tif($strand eq "-"){$seq=revcom($seq);}\n-\n-#\tmy( $str,$mfe)=RNA::fold($seq);\n-#\treturn 0 if($mfe>$MAX_ENERGY); ### minimum mfe\n-\tmy $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee);\n-\t\n-\treturn $val;\n-\n-}\n-sub excise{\n-\tmy ($cluster,$chr,$strand)=@_;\n-\n-\tmy $last_pos=0;\n-\tfor (my $i=0;$i<@{$cluster};$i++) {\n-\t\tnext if($$cluster[$i][0]<$last_pos);\n-\t\tmy $ok=0;\n-\t\tfor (my $j=$i+1;$j<@{$cluster} '..b' s2\n-\n-\tmy $pair_num=0;\n-\tmy $overhang1=0;\n-\tmy $overhang2=0;\n-\tmy ($s1,$e1,$s2,$e2);\n-\tforeach my $i ($beg1..$end1) {\n-\t\tif (defined $table->{$i}) {\n-\t\t\tmy $j=$table->{$i};\n-\t\t\tif ($j <= $end2 && $j >= $beg2) {\n-\t\t\t\t$s1=$i;\n-\t\t\t\t$e2=$j;\n-\t\t\t\tlast;\n-\t\t\t}\n-\t\t}\n-\t}\n-\tforeach my $i (reverse ($beg1..$end1)) {\n-\t\tif (defined $table->{$i}) {\n-\t\t\tmy $j=$table->{$i};\n-\t\t\tif ($j <= $end2 && $j >= $beg2) {\n-\t\t\t\t$e1=$i;\n-\t\t\t\t$s2=$j;\n-\t\t\t\tlast;\n-\t\t\t}\n-\t\t}\n-\t}\n-\n-#\tprint "$beg1,$end1 $s1,$e1\\n";\n-#\tprint "$beg2,$end2 $s2,$e2\\n";\n-\n-\tforeach my $i ($beg1..$end1) {\n-\t\tif (defined $table->{$i}) {\n-\t\t\tmy $j=$table->{$i};\n-\t\t\tif ($j <= $end2 && $j >= $beg2) {\n-\t\t\t\t++$pair_num;\n-\t\t\t}\n-\t\t}\n-\t}\n-\tif (defined $s1 && defined $e2) {\n-\t\t$overhang1=($end2-$e2)-($s1-$beg1);\n-\t}\n-\tif (defined $e1 && defined $s2) {\n-\t\t$overhang2=($end1-$e1)-($s2-$beg2);\n-\t}\n-\t\n-\tif ($pair_num < 13) {\n-\t\t$like_mir_duplex=0;\n-\t}\n-\tif ($overhang1 < 0 && $overhang2 < 0) {\n-\t\t$like_mir_duplex=0;\n-\t}\n-\treturn ($like_mir_duplex,$pair_num,$overhang1,$overhang2);\n-}\n-sub parse_struct {\n-\tmy $struct=shift;\n-\tmy $table=shift;\n-\n-\tmy @t=split(\'\',$struct);\n-\tmy @lbs; # left brackets\n-\tforeach my $k (0..$#t) {\n-\t\tif ($t[$k] eq "(") {\n-\t\t\tpush @lbs, $k+1;\n-\t\t}\n-\t\telsif ($t[$k] eq ")") {\n-\t\t\tmy $lb=pop @lbs;\n-\t\t\tmy $rb=$k+1;\n-\t\t\t$table->{$lb}=$rb;\n-\t\t\t$table->{$rb}=$lb;\n-\t\t}\n-\t}\n-\tif (@lbs) {\n-\t\twarn "unbalanced RNA struct.\\n";\n-\t}\n-}\n-sub which_arm {\n-\tmy $substruct=shift;\n-\tmy $arm;\n-\tif ($substruct=~/\\(/ && $substruct=~/\\)/) {\n-\t\t$arm="-";\n-\t}\n-\telsif ($substruct=~/\\(/) {\n-\t\t$arm="5p";\n-\t}\n-\telse {\n-\t\t$arm="3p";\n-\t}\n-\treturn $arm;\n-}\n-sub biggest_bulge {\n-\tmy $struct=shift;\n-\tmy $bulge_size=0;\n-\tmy $max_bulge=0;\n-\twhile ($struct=~/(\\.+)/g) {\n-\t\t$bulge_size=length $1;\n-\t\tif ($bulge_size > $max_bulge) {\n-\t\t\t$max_bulge=$bulge_size;\n-\t\t}\n-\t}\n-\treturn $max_bulge;\n-}\n-sub get_asy {\n-\tmy($table,$a1,$a2)=@_;\n-\tmy ($pre_i,$pre_j);\n-\tmy $asymmetry=0;\n-\tforeach my $i ($a1..$a2) {\n-\t\tif (defined $table->{$i}) {\n-\t\t\tmy $j=$table->{$i};\n-\t\t\tif (defined $pre_i && defined $pre_j) {\n-\t\t\t\tmy $diff=($i-$pre_i)+($j-$pre_j);\n-\t\t\t\t$asymmetry += abs($diff);\n-\t\t\t}\n-\t\t\t$pre_i=$i;\n-\t\t\t$pre_j=$j;\n-\t\t}\n-\t}\n-\treturn $asymmetry;\n-}\n-\n-sub peaks{\n-\tmy @cluster=@{$_[0]};\n-\n-\treturn if(@cluster<1);\n-\n-\tmy $max=0; my $index=-1;\n-\tfor (my $i=0;$i<@cluster;$i++) {\n-\t\tif($cluster[$i][2]>$max){\n-\t\t\t$max=$cluster[$i][2];\n-\t\t\t$index=$i;\n-\t\t}\n-\t}\n-#\t&excise(\\@cluster,$index,$_[1],$_[2]);\n-\treturn($index);\n-}\n-\n-sub find_cites{\n-\tmy @tmp=@{$_[0]};\n-\tmy $i=&peaks(\\@tmp);\n-\t\n-\tmy $start=$tmp[$i][0];\n-\tmy $total=0; my $node5=0;\n-\tfor (my $j=0;$j<@tmp ;$j++) {\n-\t\t$total+=$tmp[$j][2];\n-\t\t$node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);\n-\t}\n-\tpush @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);\n-}\n-\n-sub newpos{\n-\tmy ($a,$b,$c,$d)=@_;\n-\tmy $s= $a>$c ? $c : $a;\n-\tmy $e=$b>$d ? $b : $d;\n-\treturn($s,$e);\n-}\n-\n-sub rev{\n-\n- my($sequence)=@_;\n-\n- my $rev=reverse $sequence; \n-\n- return $rev;\n-}\n-\n-sub com{\n-\n- my($sequence)=@_;\n-\n- $sequence=~tr/acgtuACGTU/TGCAATGCAA/; \n- \n- return $sequence;\n-}\n-\n-sub revcom{\n-\n- my($sequence)=@_;\n-\n- my $revcom=rev(com($sequence));\n-\n- return $revcom;\n-}\n-\n-sub find_strand{\n-\n- #A subroutine to find the strand, parsing different blast formats\n- my($other)=@_;\n-\n- my $strand="+";\n-\n- if($other=~/-/){\n-\t$strand="-";\n- }\n-\n- if($other=~/minus/i){\n-\t$strand="-";\n- }\n-\n- return($strand);\n-}\n-sub usage{\n-print <<"USAGE";\n-Version $version\n-Usage:\n-$0 -map -g -d -f -o -s -e\n-options:\n- -map input file# align result # bst. format\n- -g input file # genome sequence fasta format\n- -d <int> Maximal space between miRNA and miRNA* (200)\n- -f <int> Flank sequence length of miRNA precursor (10)\n- -o output file# percursor fasta file\n- -s output file# precursor structure file\n- -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)\n-\n- -h help\n-USAGE\n-exit(1);\n-}\n-\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 quantify.pl --- a/quantify.pl Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,503 +0,0 @@\n-#!/usr/bin/perl -w\r\n-#Filename:\r\n-#Author: Tian Dongmei\r\n-#Email: tiandm@big.ac.cn\r\n-#Date: 2013/7/19\r\n-#Modified:\r\n-#Description: \r\n-my $version=1.00;\r\n-\r\n-use File::Path;\r\n-use strict;\r\n-use File::Basename;\r\n-#use Getopt::Std;\r\n-use Getopt::Long;\r\n-#use RNA;\r\n-\r\n-my %opts;\r\n-GetOptions(\\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h");\r\n-if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments\r\n-&usage;\r\n-}\r\n-\r\n-my $read=$opts{\'r\'};\r\n-my $pre=$opts{\'p\'};\r\n-my $mature=$opts{\'m\'};\r\n-\r\n-my $dir=$opts{\'o\'};\r\n-unless ($dir=~/\\/$/) {$dir .="/";}\r\n-if (not -d $dir) {\r\n-\tmkdir $dir;\r\n-}\r\n-\r\n-my $threads=defined $opts{\'t\'} ? $opts{\'t\'} : 1;\r\n-my $mismatch=defined $opts{\'mis\'} ? $opts{\'mis\'} : 0;\r\n-\r\n-my $upstream = 2;\r\n-my $downstream = 5;\r\n-\r\n-$upstream = $opts{\'e\'} if(defined $opts{\'e\'});\r\n-$downstream = $opts{\'f\'} if(defined $opts{\'f\'});\r\n-\r\n-my $marks=defined $opts{\'tag\'} ? $opts{\'tag\'} : "";\r\n-\r\n-my $time=Time();\r\n-if (defined $opts{\'time\'}) { $time=$opts{\'time\'};}\r\n-\r\n-my $tmpdir="${dir}/miRNA_Express_${time}";\r\n-if(not -d $tmpdir){\r\n-\tmkdir($tmpdir);\r\n-}\r\n-chdir $tmpdir;\r\n-\r\n-`cp $pre ./`;\r\n-my $pre_file_name=basename($pre);\r\n-\r\n-&mapping(); # matures align to precursors && reads align to precursors;\r\n-\r\n-my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end;\r\n-&maturePosOnPre(); # acknowledge mature positions on precursor \r\n-\r\n-my %pre_read;\r\n-&readPosOnPre(); # acknowledge reads positions on precursors\r\n-\r\n-if(!(defined $opts{\'tag\'})){\r\n-\tforeach my $key (keys %pre_read) {\r\n-\t\t$pre_read{$key}[0][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n-\t\tmy @ss=split/_/,$1;\r\n-\t\tfor (my $i=1;$i<=@ss;$i++) {\r\n-\t\t\t$marks .="Smp$i;";\r\n-\t\t}\r\n-\t\tlast;\r\n-\t}\r\n-}\r\n-\r\n-my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."\r\n-&attachPre();\r\n-\r\n-my $preno=scalar (keys %pre);\r\n-print "Total Precursor Number is $preno !!!!\\n";\r\n-\r\n-my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;\r\n-&structure();\r\n-\r\n-\r\n-##### analysis and print out && moRs\r\n-my $aln=$dir."known_microRNA_express.aln";\r\n-my $list=$dir."known_microRNA_express.txt";\r\n-my $moRs=$dir."known_microRNA_express.moRs";\r\n-\r\n-system("ln $mature $dir/known_microRNA_mature.fa ");\r\n-system("ln $pre $dir/known_microRNA_precursor.fa ");\r\n-\r\n-open ALN,">$aln";\r\n-open LIST,">$list";\r\n-open MORS,">$moRs";\r\n-\r\n-$"="\\t"; ##### @array print in \\t\r\n-\r\n-my @marks=split/\\;/,$marks;\r\n-#print LIST "#matueID\\tpreID\\tpos1\\tpos2\\tmatureExp\\tstarExp\\ttotalExp\\n";\r\n-print LIST "#matueID\\tpreID\\tpos1\\tpos2";\r\n-for (my $i=0;$i<@marks;$i++) {\r\n-\tprint LIST "\\t",$marks[$i],"_matureExp";\r\n-}\r\n-for (my $i=0;$i<@marks;$i++) {\r\n-\tprint LIST "\\t",$marks[$i],"_starExp";\r\n-}\r\n-for (my $i=0;$i<@marks;$i++) {\r\n-\tprint LIST "\\t",$marks[$i],"_totalExp";\r\n-}\r\n-print LIST "\\n";\r\n-print ALN "#>precursor ID \\n#precursor sequence\\n#precursor structure (mfe)\\n#RNA_seq\\t@marks\\ttotal\\n";\r\n-print MORS "#>precursor ID\\tstrand\\texpress_reads\\texpress_reads\\/total_reads\\tblock_number\\tprecursor_sequence\\n#\\tblock_start\\tblock_end\\t@marks\\ttotal\\ttag_number\\tsequence\\n";\r\n-my %moRs;\r\n-\r\n-foreach my $key (keys %pre) {\r\n-\tprint ALN ">$key\\n$pre{$key}\\n$struc{$key}{struc} ($struc{$key}{mfe})\\n";\r\n-\tnext if(! (exists $pre_read{$key}));\r\n-\tmy @array=@{$pre_read{$key}};\r\n-\t@array=sort{$a->[3]<=> $b->[3]} @array;\r\n-\t\r\n-\tmy $length=length($pre{$key});\r\n-\r\n-\tmy $maxline=-1;my $max=0; ### storage the maxinum express read line\r\n-\tmy $totalReadsNo=0; \r\n-\tmy @not_over=(); ### new read format better for moRs analysis\r\n-\r\n-####print out Aln file start\r\n-\tfor (my $i=0;$i<@array;$i++) {\r\n-\t\tmy $maps=$array[$i][3]+1;\r\n-\t\tmy $mape=$array[$i][3]+length($array[$i][4]);\r\n-\t\tmy $str="";\r\n-\t\t$str .= "." x ($maps-1);\r\n-\t\t$str .=$array[$i][4];\r\n-\t\t$str .="." x ($length-$mape);\r\n-\t\t$str .=" ";\r\n-\r\n-\t\t$array[$i][0]=~/:([\\d|_]+)_x(\\d+)$/;\r\n-\t\tmy @samp'..b'b{$i};\r\n-#\t\t\tif($a>$b){\r\n-#\t\t\t\t$startpos=$b-$n+2;\r\n-#\t\t\t\t$endpos=$b-$n+2+($end-$start);\r\n-#\t\t\t}\r\n-#\t\t\tif($a<$b){\r\n-\t\t\t\t$endpos=$b+$n+2;\r\n-\t\t\t\tif($endpos>length($structure)){$endpos=length($structure);}\r\n-\t\t\t\t$startpos=$b+$n+2-($end-$start);\r\n-\t\t\t\tif($startpos<1){$startpos=1;}\r\n-#\t\t\t}\r\n-\t\t\tlast;\r\n-\t\t}\r\n-\t\t$n++;\r\n-\t}\r\n-\treturn ($startpos,$endpos);\r\n-}\r\n-sub attachPre{\r\n-\topen IN, "<$pre_file_name";\r\n-\tmy $name;\r\n-\twhile (my $aline=<IN>) {\r\n-\t\tchomp $aline;\r\n-\t\tif ($aline=~/^>(\\S+)/) {\r\n-\t\t\t$name=$1;\r\n-\t\t\tnext;\r\n-\t\t}\r\n-\t\t$pre{$name} .=$aline;\r\n-\t}\r\n-\tclose IN;\r\n-}\r\n-sub readPosOnPre{\r\n-\topen IN,"<read_mapped.bwt";\r\n-\twhile (my $aline=<IN>) {\r\n-\t\tchomp $aline;\r\n-\t\tmy @tmp=split/\\t/,$aline;\r\n-\t\tmy $id=lc($tmp[2]);\r\n-\t\tpush @{$pre_read{$tmp[2]}},[@tmp];\r\n-\t}\r\n-\tclose IN;\r\n-}\r\n-sub maturePosOnPre{\r\n-\topen IN,"<mature_mapped.bwt";\r\n-\twhile (my $aline=<IN>) {\r\n-\t\tchomp $aline;\r\n-\t\tmy @tmp=split/\\t/,$aline;\r\n-\t\tmy $mm=$tmp[0];\r\n-#\t\t$mm=~s/\\-3P|\\-5P//i;\r\n-\t\t$mm=lc($mm);\r\n-\t\tmy $pm=$tmp[2];\r\n-\t\t$pm=lc($pm);\r\n-\r\n-#\t\tnext if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a\r\n-\t\tnext if($mm!~/$pm/);\r\n-#\t\tprint "$tmp[2]\\t$tmp[0]\\n";\r\n-#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream;\r\n-#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0);\r\n-#\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream;\r\n-\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1;\r\n-\t\t$pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]);\r\n-\t}\r\n-\tclose IN;\r\n-}\r\n-sub mapping{\r\n- my $err;\r\n-## build bowtie index\r\n- #print STDERR "building bowtie index\\n";\r\n- $err = `bowtie-build $pre_file_name miRNA_precursor`;\r\n-\r\n-## map mature sequences against precursors\r\n- #print STDERR "mapping mature sequences against index\\n";\r\n-\t$err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`;\r\n-\r\n-## map reads against precursors\r\n- #print STDERR "mapping read sequences against index\\n";\r\n- $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`;\r\n-\r\n-}\r\n-\r\n-sub subseq{\r\n-\tmy $seq=shift;\r\n-\tmy $beg=shift;\r\n-\tmy $end=shift;\r\n-\tmy $strand=shift;\r\n-\t\r\n-\tmy $subseq=substr($seq,$beg-1,$end-$beg+1);\r\n-\tif ($strand eq "-") {\r\n-\t\t$subseq=revcom($subseq);\r\n-\t}\r\n-\treturn uc $subseq;\r\n-}\r\n-\r\n-sub revcom{\r\n-\tmy $seq=shift;\r\n-\t$seq=~tr/ATCGatcg/TAGCtagc/;\r\n-\t$seq=reverse $seq;\r\n-\treturn uc $seq;\r\n-}\r\n-\r\n-sub Time{\r\n- my $time=time();\r\n- my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];\r\n- $month++;\r\n- $year+=1900;\r\n- if (length($sec) == 1) {$sec = "0"."$sec";}\r\n- if (length($min) == 1) {$min = "0"."$min";}\r\n- if (length($hour) == 1) {$hour = "0"."$hour";}\r\n- if (length($day) == 1) {$day = "0"."$day";}\r\n- if (length($month) == 1) {$month = "0"."$month";}\r\n- #print "$year-$month-$day $hour:$min:$sec\\n";\r\n- return("$year-$month-$day-$hour-$min-$sec");\r\n-}\r\n-\r\n-sub usage{\r\n-print <<"USAGE";\r\n-Version $version\r\n-Usage:\r\n-$0 -r -p -m -mis -t -e -f -tag -o -time\r\n-mandatory parameters:\r\n--p precursor.fa miRNA precursor sequences from miRBase # must be absolute path\r\n--m mature.fa miRNA sequences from miRBase # must be absolute path\r\n--r reads.fa your read sequences #must be absolute path\r\n-\r\n--o output directory\r\n-\r\n-options:\r\n--mis [int] number of allowed mismatches when mapping reads to precursors, default 0\r\n--t [int] threads number,default 1\r\n--e [int] number of nucleotides upstream of the mature sequence to consider, default 2\r\n--f [int] number of nucleotides downstream of the mature sequence to consider, default 5\r\n--tag [string] sample marks# eg. sampleA;sampleB;sampleC\r\n--time sting #make directory time,default is the local time\r\n--h help\r\n-USAGE\r\n-exit(1);\r\n-}\r\n-\r\n' |
b |
diff -r ca05d68aca13 -r c75593f79aa9 rfam.pl --- a/rfam.pl Thu Nov 13 22:43:35 2014 -0500 +++ b/rfam.pl Wed Dec 03 01:54:29 2014 -0500 |
b |
@@ -12,7 +12,7 @@ use File::Basename; my %opts; -GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","time:s","h"); +GetOptions(\%opts,"i=s","ref=s","index:s","v:i","p:i","o=s","h"); if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments &usage; } @@ -31,10 +31,8 @@ my $time=Time(); -if (defined $opts{'time'}) { - $time=$opts{'time'}; -} -my $mapdir=$dir."/rfam_match_".$time; + +my $mapdir=$dir."/rfam_match"; if(not -d $mapdir){ mkdir $mapdir; } @@ -95,7 +93,6 @@ -p/--threads <int> number of alignment threads to launch (default: 1) -o output directory --time sting #make directory time,default is the local time -h help USAGE exit(1); |
b |
diff -r ca05d68aca13 -r c75593f79aa9 tool_dependencies.xml --- a/tool_dependencies.xml Thu Nov 13 22:43:35 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
b |
@@ -1,48 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="fastx_toolkit" version="0.0.13"> - <repository changeset_revision="ec66ae4c269b" name="package_fastx_toolkit_0_0_13" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <package name="bowtie" version="0.12.7"> - <repository changeset_revision="9f9f38617a98" name="package_bowtie_0_12_7" owner="devteam" toolshed="https://toolshed.g2.bx.psu.edu" /> - </package> - <set_environment version="1.0"> - <environment_variable action="set_to" name="SCRIPT_PATH">$REPOSITORY_INSTALL_DIR</environment_variable> - </set_environment> - <!--package name="R" version="3.0.1"> - <repository name="package_r_3_0_1" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package!--> - - <package name="ViennaRNA" version="2.1.8"> - <install version="1.0"> - <actions> - <action type="download_by_url">http://www.tbi.univie.ac.at/RNA/packages/source/ViennaRNA-2.1.8.tar.gz</action> - <action type="shell_command">./configure --prefix=$INSTALL_DIR </action> - <action type="shell_command">make</action> - <action type="shell_command">make install</action> - <action type="set_environment"> - <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable> - </action> - </actions> - </install> - </package> - - <package name="SVG" version="2.59"> - <install version="1.0"> - <actions> - <action type="download_by_url">http://www.cpan.org/authors/id/S/SZ/SZABGAB/SVG-2.59.tar.gz</action> - <action type="make_directory">$INSTALL_DIR/lib/perl5</action> - <action type="shell_command"> - perl Makefile.PL INSTALL_BASE=$INSTALL_DIR && - make && - make install - </action> - <action type="set_environment"> - <environment_variable action="append_to" name="PERL5LIB">$INSTALL_DIR/lib/perl5/:$INSTALL_DIR/lib/perl5/x86_64-linux-gnu-thread-multi/</environment_variable> - </action> - </actions> - </install> - </package> - - -</tool_dependency> |