# HG changeset patch # User m-zytnicki # Date 1366643287 14400 # Node ID 440ceca58672a32dbefc73f5bc29651f8b815be2 # Parent c79b9ae3f65fc30d2c04a42a16cbe6f9c5b436ca Uploaded diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/CleanTranscriptFile.xml --- a/SMART/galaxy/CleanTranscriptFile.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/CleanTranscriptFile.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - - Clean a transcript file so that it is useable for S-MART. + + Clean a transcript file so that it is useable for S-MART. ../Java/Python/CleanTranscriptFile.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'gff': -f gff @@ -57,4 +57,17 @@ + + + + + + + + + + + A GFF/GTF file (please consult http://www.sequenceontology.org/gff3.shtml to know more about the GFF3 format, and http://mblab.wustl.edu/GTF22.html for the GTF format) may contain different sources of information: chromosome size, genes, transcripts, etc. S-MART mostly works on transcripts. This scripts filters the input file to keep the information you really want, based on the feature (3rd column). + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/Clusterize.xml --- a/SMART/galaxy/Clusterize.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/Clusterize.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - - Clusterizes the reads when their genomic intervals overlap. + + Clusterize features when their genomic intervals overlap. 
../Java/Python/clusterize.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': @@ -10,8 +10,6 @@ -f gff2 #elif $formatType.FormatInputFileName == 'gff3': -f gff3 - #elif $formatType.FormatInputFileName == 'csv': - -f csv #elif $formatType.FormatInputFileName == 'sam': -f sam #elif $formatType.FormatInputFileName == 'gtf': @@ -21,7 +19,6 @@ $colinear $normalize -d $distance - $log $outputFileLog @@ -31,7 +28,6 @@ - @@ -47,9 +43,6 @@ - - - @@ -58,16 +51,20 @@ - - - - + + + - - log - + + +The script clusterizes the input genomic data. Two features are clusterized when their genomic intervals overlap. The output is a GFF3 file, where each element is a cluster. The number of elements in the cluster is given by the tag **nbElements**. The name of a cluster is the concatation of the names of its reads (like **read1--read2--read3**). Note that if the size of the name of the cluster exceeds 100 characters, it is truncated to the first 100 characters. + +Some options may clusterize the features which are closer than a given distance. + +By default, the tool clusterizes all features which overlap (or nearly overlap), even if they are on different strands. If you want to clusterize the features which are on the same strand only, you can specify it. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/CollapseReads.xml --- a/SMART/galaxy/CollapseReads.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/CollapseReads.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Merges two reads if they have exactly the same genomic coordinates. + Merges two genomic features if they have exactly the same genomic coordinates. ../Java/Python/CollapseReads.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': @@ -49,11 +49,16 @@ - + + +Merge two input genomic coordinates iff they are exactly the same. If two or more genomic coordinates are merged, the tag **nbElements** is updated accordingly. 
As a consequence, all the reads which are exactly the same appear as one genomic coordinate. + +This is especially useful for short RNA sequencing (where you want to count the number of read per miRNA, siRNA, etc.) or 5' capped short reads. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/CompareOverlappingSmallQuery.xml --- a/SMART/galaxy/CompareOverlappingSmallQuery.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/CompareOverlappingSmallQuery.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - - Provide the queries that overlap with a reference, when the query is small. + + Provide the queries that overlap with a reference, when the query data set is small. ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 #if $formatType.FormatInputFileName1 == 'bed': @@ -36,13 +36,21 @@ #if $OptionMinOverlap.present == 'Yes': -m $OptionMinOverlap.minOverlap #end if + #if $OptionPcOverlapQuery.present == 'Yes': + -p $OptionPcOverlapQuery.minOverlap + #end if + #if $OptionPcOverlapRef.present == 'Yes': + -P $OptionPcOverlapRef.minOverlap + #end if #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear': -c - #elif $OptionCollinearOrAntiSens.OptionCA == 'Antisense': + #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens': -a #end if $InvertMatch $NotOverlapping + $OptionInclusionQuery + $OptionInclusionRef @@ -104,7 +112,7 @@ - + @@ -125,24 +133,68 @@ + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - - + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Invert selection (report those which do not overlap). 
+ +-Restrict to collinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located no further than *n* nucleotides from some reference data. + +-Keep the query data which are strictly included in the reference data, meaning that a query transcript with at least 1 nucleotide that does not overlap with the reference data will not be reported. + +The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream. + +One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/CompareOverlappingSmallRef.xml --- a/SMART/galaxy/CompareOverlappingSmallRef.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/CompareOverlappingSmallRef.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - - Provide the queries that overlap with a reference, when the reference is small. + + Provide the queries that overlap with a reference, when the reference dataset is small. ../Java/Python/CompareOverlappingSmallQuery.py -i $formatType.inputFileName1 #if $formatType.FormatInputFileName1 == 'bed': @@ -155,8 +155,8 @@ - - + + @@ -170,11 +170,31 @@ - - + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. 
The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Invert selection (report those which do not overlap). + +-Restrict to collinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located no further than *n* nucleotides from some reference data. + +-Keep the query data which are strictly included in the reference data, meaning that a query transcript with at least 1 nucleotide that does not overlap with the reference data will not be reported. + +The mechanism of shrinking and extending is also useful to make a fine-grained comparison. For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream. + +One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile.xml --- a/SMART/galaxy/ConvertTranscriptFile.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/ConvertTranscriptFile.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,12 +1,10 @@ - + Convert a file from a format to another. 
../Java/Python/convertTranscriptFile.py -i $inputFormatType.inputFileName #if $inputFormatType.FormatInputFileName == 'gff3': -f gff3 #elif $inputFormatType.FormatInputFileName == 'bed': -f bed - #elif $inputFormatType.FormatInputFileName == 'gff2': - -f gff2 #elif $inputFormatType.FormatInputFileName == 'bam': -f blast #elif $inputFormatType.FormatInputFileName == 'sam': @@ -16,10 +14,6 @@ #end if -g $outputFormatType.outFormat - #if $optionSequence.choose == 'Yes': - -s $optionSequence.value - #end if - -n $name $strand @@ -31,7 +25,6 @@ - @@ -42,9 +35,6 @@ - - - @@ -61,7 +51,6 @@ - @@ -71,8 +60,6 @@ - - @@ -85,18 +72,6 @@ - - - - - - - - - - - - @@ -105,7 +80,7 @@ - + @@ -115,5 +90,6 @@ +Simple conversion tool. diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BedToCsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BedToCsv.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Bed File to Csv File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f bed -o $outputFile -g csv yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BedToGff2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BedToGff2.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Bed File to Gff2 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f bed -o $outputFile -g gff yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BedToGff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BedToGff3.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Bed File to Gff3 File. 
+ ../Java/Python/convertTranscriptFile.py -i $inputFile -f bed -o $outputFile -g gff3 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BedToSam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BedToSam.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Bed File to Sam File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f bed -o $outputFile -g sam yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BlastToCsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BlastToCsv.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Blast (-m 8) File to Csv File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f blast -o $outputFile -g csv yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BlastToGff2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BlastToGff2.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Blast (-m 8) File to Gff2 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f blast -o $outputFile -g gff2 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BlastToGff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BlastToGff3.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Blast (-m 8) File to Gff3 File. 
+ ../Java/Python/convertTranscriptFile.py -i $inputFile -f blast -o $outputFile -g gff3 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_BlastToSam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_BlastToSam.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Blast (-m 8) File to Sam File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f blast -o $outputFile -g sam yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_FastqToFasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_FastqToFasta.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Fastq File to Fasta File. + ../Java/Python/fastqToFasta.py -i $inputFile -o $outputFile 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff2ToCsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff2ToCsv.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff2 File to Csv File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff2 -o $outputFile -g csv yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff2ToGff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff2ToGff3.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff2 File to Gff3 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff2 -o $outputFile -g gff3 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff2ToSam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff2ToSam.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff2 File to Sam File. 
+ ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff2 -o $outputFile -g sam yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff3ToCsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff3ToCsv.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff3 File to Csv File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff3 -o $outputFile -g csv yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff3ToGff2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff3ToGff2.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff3 File to Gff2 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff3 -o $outputFile -g gff2 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff3ToSam.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff3ToSam.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff3 File to Sam File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff3 -o $outputFile -g sam yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_Gff3ToWig.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_Gff3ToWig.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Gff3 File to Wig File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f gff3 -o $outputFile -g wig yes 2>$logFile + + + + + + + + + + + + \ No newline at end of file diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_SamToCsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_SamToCsv.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Sam File to Csv File. 
+ ../Java/Python/convertTranscriptFile.py -i $inputFile -f sam -o $outputFile -g csv yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_SamToGff2.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_SamToGff2.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Sam File to Gff2 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f sam -o $outputFile -g gff2 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/ConvertTranscriptFile_SamToGff3.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/ConvertTranscriptFile_SamToGff3.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,15 @@ + + Convert Sam File to Gff3 File. + ../Java/Python/convertTranscriptFile.py -i $inputFile -f sam -o $outputFile -g gff3 yes 2>$logFile + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/CountReadGCPercent.xml --- a/SMART/galaxy/CountReadGCPercent.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/CountReadGCPercent.xml Mon Apr 22 11:08:07 2013 -0400 @@ -11,6 +11,7 @@ +Count the GC% of a FASTA file. diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/GetDifferentialExpression.xml --- a/SMART/galaxy/GetDifferentialExpression.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/GetDifferentialExpression.xml Mon Apr 22 11:08:07 2013 -0400 @@ -51,7 +51,6 @@ $simple $adjusted - #if $optionSimplePara.simplePara == 'Yes': -S $optionSimplePara.paraValue #end if @@ -63,7 +62,6 @@ #if $optionFDR.FDR == 'Yes': -d $optionFDR.FDRValue #end if - $plot $outputFilePNG @@ -154,8 +152,8 @@ - - + + @@ -170,7 +168,7 @@ - + @@ -193,18 +191,21 @@ - - - - plot - - example: python GetDifferentialExpression.py -i input1 -f gff3 -j input2 -g gff3 -k ref -l gff3 -o output.gff3 +This tool compares two sets of data and find the differential expression. One very important component of the tool is the reference set. 
Actually, to use the tool, you need the two input sets of data, of course, and the reference set. The reference set is a set of genomic coordinates and, for each interval, it will count the number of features in each sample and compute the differential expression. For each reference interval, it will output the direction of the regulation (up or down, with respect to the first input set), and a *p*-value from a Fisher exact test. + +This reference set seems boring. Why not compute the differential expression without this set? The answer is: the differential expression of what? I cannot guess it. Actually, you might want to compare the expression of genes, of small RNAs, of transposable elements, of anything... So the reference set can be a list of genes, and in this case, you can compute the differential expression of genes. But you can also compute many other things. + +Suppose that you cluster the data of your two input samples (you can do it with the *clusterize* and the *mergeTranscriptLists* tools). You now have a list of all the regions which are transcribed in at least one of the input samples. This can be your reference set. This reference set is interesting since you can detect the differential expression of data which is outside any annotation. + +Suppose now that you clusterize using a sliding window the two input samples (you can do it with the *clusterizeBySlidingWindows* and the *mergeSlidingWindowsClusters* tools). You can now select all the regions of a given size which contain at least one read in one of the two input samples (do it with *selectByTag* and the tag **nbElements**). Again, this can be another interesting reference set. + +In most cases, the sizes of the two input samples will be different, so you should probably normalize the data, which is an available option. 
The ---rather crude--- normalization increases the number of data in the least populated sample and decreases the number of data in the most populated sample to the average number of data. diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/GetFlanking.xml --- a/SMART/galaxy/GetFlanking.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/GetFlanking.xml Mon Apr 22 11:08:07 2013 -0400 @@ -52,9 +52,6 @@ #end if -o $outputFile - - - @@ -117,7 +114,7 @@ - + @@ -131,8 +128,8 @@ - - + + @@ -175,5 +172,20 @@ + +This tool prints the elements from the second set of genomic intervals which are closest to (in other words, are flanking) the elements from the first set. You can also adjust several parameters: + +- restrict the search to downstream or upstream elements, or print downstream and upstream elements, + +- only consider collinear flanking elements, + +- only consider anti-sense flanking elements, + +- only consider elements which are close enough (using some given distance), + +- only consider flanking elements which do not overlap with the reference element. + +Notice that elements from the second set may be printed at most once, whether they are the flanking element of several elements from the first set or not. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/SelectByTag.xml --- a/SMART/galaxy/SelectByTag.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/SelectByTag.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,17 +1,13 @@ - Keeps the genomic coordinates such that a value of a given tag. + Keep the genomic coordinates such that a value of a given tag. 
../Java/Python/SelectByTag.py -i $formatType.inputFileName - #if $formatType.FormatInputFileName == 'bed': - -f bed #elif $formatType.FormatInputFileName == 'gff': -f gff #elif $formatType.FormatInputFileName == 'gff2': -f gff2 #elif $formatType.FormatInputFileName == 'gff3': -f gff3 - #elif $formatType.FormatInputFileName == 'sam': - -f sam #elif $formatType.FormatInputFileName == 'gtf': -f gtf #end if @@ -37,16 +33,11 @@ - - - - - @@ -56,9 +47,6 @@ - - - @@ -67,12 +55,12 @@ - + - + @@ -84,7 +72,7 @@ - + @@ -96,14 +84,14 @@ - + - + @@ -119,4 +107,13 @@ + +The script reads a list of genomic coordinates and output all the features with specific tag values. If you want to know more about tags, please consult the GFF format page: http://www.sequenceontology.org/gff3.shtml + +The tools reads the input file, and more specifically the tag that you specified. You can mention a lower and a upper bound for its value, or a specific value, and the tool will print all the features such that the tags are between the specified bounds or matches the string. + +A tag has to be present for each feature. If not, you can specify a default value which will be used if the tag is absent. + +This tool can be used to select the clusters with a minimum number of elements (the tag **nbElements** counts the number of elements per clusters) or to select the reads which have mapped less than *n* times (the tag **nbOccurrences** counts the number of mappings per read). + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/WrappGetLetterDistribution.py --- a/SMART/galaxy/WrappGetLetterDistribution.py Fri Apr 19 10:13:11 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -#! 
/usr/bin/env python - -import os -import sys -import getopt -from pyRepetUnit.commons.checker.CheckerException import CheckerException - -SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"] - -class WrappGetLetterDistribution(object): - - def __init__(self): - self._inputFileName = "" - self._inputFileFormat = "" - self._outputFileName = "tmpOutputFile" - self._csv = False - - def help( self ): - print - print "usage: %s [ options ]" % ( sys.argv[0] ) - print "options:" - print " -h: this help" - print " -i: input file" - print " -f: 'fasta' or 'fastq'" - print " -c: CSV output file" - print " -a: first PNG output file" - print " -b: second PNG output file" - print - print "Exemple:" - print - print "1:\n\tpython WrappGetLetterDistribution.py -i inputFile.fasta -f fasta -c outputFile1.csv -a outputFile2.png -b outputFile3.png" - print - print "2:\n\tpython WrappGetLetterDistribution.py -i inputFile.fastq -f fastq -c outputFile1.csv -a outputFile2.png -b outputFile3.png" - print - print - - - def setAttributesFromCommandLine(self): - try: - opts, args = getopt.getopt( sys.argv[1:], "hi:f:a:b:c:" ) - except getopt.GetoptError, err: - print str(err); sys.exit(1) - for o, a in opts: - if o == "-h": - self.help() - sys.exit(0) - if o == "-i": - self._inputFileName = a - elif o == "-f": - self._inputFileFormat = a - elif o == "-c": - self._outputFileNameCSV = a - self._csv = True - elif o == "-a": - self._outputFileNamePNG = a - elif o == "-b": - self._outputFileNamePerNtPNG = a - - def checkAttributes(self): - lMsg = [] - if self._inputFileName == "" and not os.path.exists(self._inputFileName): - lMsg.append("ERROR: This input file doesn't exist!") - if self._inputFileFormat == "": - lMsg.append("ERROR: No input file format specified in option!") - if self._outputFileNamePNG == "": - lMsg.append("ERROR: No output file.png specified in option!") - if self._outputFileNamePerNtPNG == "": - lMsg.append("ERROR: No output filePerNt.png specified in option!") - if 
self._outputFileNameCSV == "" and self._csv == True : - lMsg.append("ERROR: No output file.csv specified in option!") - - print ">>> lMsg " + str(lMsg) - if lMsg != []: - exp = CheckerException() - exp.setMessages(lMsg) - raise (exp) - - def _cleanWorkingDir(self, cDir): - os.system("rm %s/tmpData* %s/tmpScript*" % (cDir, cDir)) - - def wrapp(self): - self.checkAttributes() - cDir = os.getcwd() - - if self._csv == True: - os.system("python %s/Java/Python/getLetterDistribution.py -i %s -f %s -o %s/%s -c" % (SMART_PATH, self._inputFileName, self._inputFileFormat, cDir, self._outputFileName)) - os.system("mv %s/%s.csv %s" % (cDir, self._outputFileName, self._outputFileNameCSV)) - os.system("mv %s/%s.png %s" % (cDir, self._outputFileName, self._outputFileNamePNG)) - os.system("mv %s/%sPerNt.png %s" % (cDir, self._outputFileName, self._outputFileNamePerNtPNG)) - - self._cleanWorkingDir(cDir) - -if __name__ == '__main__': - launcher = WrappGetLetterDistribution() - launcher.setAttributesFromCommandLine() - launcher.wrapp() - diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/WrappGetLetterDistribution.xml --- a/SMART/galaxy/WrappGetLetterDistribution.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/WrappGetLetterDistribution.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Calculate distribution for each nucleotide per position for all short reads (S-MART) + Calculate distribution for each nucleotide per position for all short reads WrappGetLetterDistribution.py -i $inputFileName #if $formatType.FormatInputFileName == 'fasta': @@ -28,6 +28,18 @@ - + + + + + + + + + + +The script gets the nucleotide distribution of the input sequence list. It outputs two files. The first file shows the nucleotide distribution of the data. More precisely, a point (*x*, *y*) on the curve **A** shows that *y* sequences have *x*% of **A**. + +The second plot shows the average nucleotide distribution for each position of the read. 
You can use it to detect a bias in the first nucleotides, for instance. A point *x*, *y* on the curve **A** shows that at the position *x*, there are *y*% of **A**. A point (*x*, *y*) on the curve **#** tells you that *y*% of the sequences contain not less than *x* nucleotides. By definition, this latter line is a decreasing function. It usually explains why the tail of the other curves are sometimes erratic: there are few sequences. + - diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/__init__.py diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/changeGffFeatures.xml --- a/SMART/galaxy/changeGffFeatures.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/changeGffFeatures.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,16 +1,20 @@ - Changes one feature name by an other name (the feature name can be found on the 3rd column). + Change a feature in a GFF file (the feature is the 3rd column). ../Java/Python/changeGffFeatures.sh $inputFile $inputFeature $outputFeature >$outputFile - - + + + + + This script changes the third column of a GFF3 file (please refer to http://www.sequenceontology.org/gff3.shtml to know more about this format). + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/changeTagName.xml --- a/SMART/galaxy/changeTagName.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/changeTagName.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,9 +1,7 @@ - Changes the name of tag of a list of transcripts. + Change the name of a tag in a GFF file. ../Java/Python/changeTagName.py -i $formatType.inputFileName - #if $formatType.FormatInputFileName == 'bed': - -f bed #elif $formatType.FormatInputFileName == 'gff': -f gff #elif $formatType.FormatInputFileName == 'gff2': @@ -21,14 +19,10 @@ - - - - @@ -40,15 +34,15 @@ - - - - - + + + + Change the name of a tag in the 9th field of a GFF3 file (please consult http://www.sequenceontology.org/gff3.shtml to know more about this format). 
+ diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/cleanGff.xml --- a/SMART/galaxy/cleanGff.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/cleanGff.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Cleans a GFF file as given by NCBI and outpus a Gff3 file. + Clean a GFF file (e.g. as given by NCBI) and produces a new GFF3 file, understood by S-MART. ../Java/Python/cleanGff.py -i $inputFile -t $type -o $outputFile @@ -7,12 +7,15 @@ - + + + A GFF file (please consult http://www.sequenceontology.org/gff3.shtml to know more about it) may contain different sources of information: chromosome size, genes, transcripts, etc. S-MART mostly works on transcripts. This scripts filters the input GFF3 to keep the information you really want, based on the feature (3rd column). + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/clusterize.xml --- a/SMART/galaxy/clusterize.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/clusterize.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Clusterizes the reads when their genomic intervals overlap. + Clusterize features when their genomic intervals overlap. ../Java/Python/clusterize.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': @@ -10,8 +10,6 @@ -f gff2 #elif $formatType.FormatInputFileName == 'gff3': -f gff3 - #elif $formatType.FormatInputFileName == 'csv': - -f csv #elif $formatType.FormatInputFileName == 'sam': -f sam #end if @@ -29,7 +27,6 @@ - @@ -44,24 +41,17 @@ - - - - - - - + + + - - log - diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/clusterizeBySlidingWindows.xml --- a/SMART/galaxy/clusterizeBySlidingWindows.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/clusterizeBySlidingWindows.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Produces a GFF3 file that clusters a list of transcripts using a sliding window. Cluster the data into regions (defined by size and overlap with next region) and keep only highest peaks. 
+ Produces a GFF3 file that clusters a list of transcripts using a sliding window. Cluster the data into regions (defined by size and overlap with next region). ../Java/Python/clusterizeBySlidingWindows.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': @@ -73,16 +73,16 @@ - - + + - + - + @@ -90,12 +90,18 @@ - + - + + + + + + + @@ -114,19 +120,19 @@ - - - - - - excel - - - plot - + + +Sliding windows are a convenient ways to clusterize data mapped on the genome. There are two important parameters of a sliding window: the size of the window and the size of the overlap. + +By default, sliding windows count the number of reads in each window. However, you can basically merge any information which is contained in the tags. You can compute the average, sum, median, max or min of the tags for each window. For instance, every window can contain the average cluster size, if you merge clusters instead of reads. + +The output file is a GFF3 file, where each element is a window. There is a special tag for each window, whose name is **nbElements** if you counted the number of transcripts per sliding window. However, if you performed a **min** (resp. **max**, **sum**, **median**, **average**) operation on the tags **value** of the transcripts, then the tag of the window will be **minValue** (resp. **maxValue**, **sumValue**, **medValue**, **avgValue**). You can also specify the name of your tag (which is actually advised: **nbReadsInSample1** will always be more informative than **nbElements**). + +You also have different option, which can select the *n* % highest regions, or the regions with at least *n* features in it, or even the regions with at least *n* unique features. This last option is useful when you want to cluster the reads which have mapped only once, for instance. 
+ diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/compareOverlapping.xml --- a/SMART/galaxy/compareOverlapping.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/compareOverlapping.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,4 +1,4 @@ - + Print all the transcripts from a first file which overlap with the transcripts from a second file. ../Java/Python/CompareOverlapping.py -i $formatType.inputFileName1 @@ -107,7 +107,7 @@ - + @@ -139,58 +139,54 @@ - - - + - + - + - + - + - + - + - + - - - + @@ -201,9 +197,8 @@ - - + @@ -215,7 +210,7 @@ - + @@ -227,7 +222,7 @@ - + @@ -239,7 +234,7 @@ - + @@ -275,14 +270,38 @@ - - - + + + + - - + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Restrict query / reference set to the first nucleotide. Useful to check if the TSS of one set overlaps with the other one. + +-Extend query / reference set on the 5' / 3' direction. Useful to check if one set is located upstream / downstream of the other one. + +-Include introns in the comparison. + +-Invert selection (report those which do not overlap). + +-Restrict to colinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotides from some reference data. + +-Keep the query data which are strictly included into reference data, meaning that a query transcript such that at least 1 nucleotide does not overlap with reference data will not be presented as a solution. + +The mechanism of shrinking and extending is also useful to make a fine-grained comparison.
For example, if you want to keep those such that the TSS is overlapping the reference set, you just shrink the query set to 1 nucleotide. Now, if you want to keep those which are overlapping your data or located 2kb downstream of it, just extend the query data in the downstream direction, and you will have what you want. You can also extend in the opposite direction to get the possible transcription factor sites which are upstream. + +One option reverses the selection. In other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/computeCoverage.xml --- a/SMART/galaxy/computeCoverage.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/computeCoverage.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,4 +1,4 @@ - + Compute the coverage of a set with respect to another set. ../Java/Python/ComputeCoverage.py -i $formatType.inputFileName1 @@ -103,5 +103,8 @@ + +This tool considers a query file and a reference file, and gives the coverage of the query file by the reference. The output file is similar to the query file, where a tag **coverage** has been added. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/coordinatesToSequence.xml --- a/SMART/galaxy/coordinatesToSequence.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/coordinatesToSequence.xml Mon Apr 22 11:08:07 2013 -0400 @@ -59,4 +59,7 @@ + +You can use this tool, if you just want to convert your mapping data to genomic coordinates, without any filtering. It requires a genomic coordinates file together with its format, an output format (GFF3, BED, etc...), the genome, and prints you the corresponding file.
+ diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getDifference.xml --- a/SMART/galaxy/getDifference.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getDifference.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Gets all the regions of the genome, except the one given or get all the elements from the first set which does not ovelap with the second set (at the nucleotide level). + Gets all the regions of the genome, except the one given in an annotation file. Alternatively, it may also give all the elements from the first set which do not overlap with the second set (at the nucleotide level). ../Java/Python/getDifference.py -i $formatType.inputFileName1 #if $formatType.FormatInputFileName1 == 'bed': @@ -31,7 +31,6 @@ -g gtf #end if - $split #if $OptionSequence.option == "Yes": @@ -102,8 +101,7 @@ - - + @@ -124,4 +122,9 @@ + +This tool has two different (but similar) uses. When given two sets of transcripts, it trims the elements of the first set so that they do not overlap with the second set. + +When only one set of transcripts is given, together with a reference genome, it produces a list of transcripts which complements the first set. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getDistance.xml --- a/SMART/galaxy/getDistance.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getDistance.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Give the distances between every data from the first input set and the data from the second input set + Give the distances between every data from the first input set with respect to the data from the second input set.
../Java/Python/getDistance.py -i $formatType.inputFileName1 #if $formatType.FormatInputFileName1 == 'bed': @@ -40,16 +40,6 @@ -a #end if - #if $OptionFirstNucl5.FirstNu5 == "Yes": - -s $OptionFirstNucl5.first5File1 - -S $OptionFirstNucl5.first5File2 - #end if - - #if $OptionFirstNucl3.FirstNu3 == "Yes": - -e $OptionFirstNucl3.first3File1 - -E $OptionFirstNucl3.first3File2 - #end if - #if $OptionMinDistance.MinD == "Yes": -m $OptionMinDistance.minDistance #end if @@ -77,8 +67,6 @@ #end if -o $outputFilePng - $outputDistance $outputFileDistance - @@ -140,14 +128,12 @@ - - - - + + @@ -159,34 +145,8 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -198,7 +158,7 @@ - + @@ -267,9 +227,18 @@ - - outputDistance - + +Give the distances between every data from the first input set and the data from the second input set. It outputs the size distribution. Each point (*x*, *y*) tells you that there exists *y* pairs of elements which are separated by *x* nucleotides. + +The general algorithm is the following. For each element of the first input set, it finds the closest element of the second set and computes the distance between the two elements. The distance is zero if the two elements overlap. This distance may not exist if the element of the first input set is alone on its chromosome (or contig). + +Actually, considering an element from the first input set, the algorithm will look at the vicinity of this element (1kb by default). You can increase the size of the vicinity using the appropriate option. + +As in *compare overlapping*, you can shrink or extend your sets of genomic coordinates, so that you can get the distance between starts of reads and starts or genes, for instance. You can also compute the distance from elements which are on the same strand only (which is not the case by default) or on the opposite strand only. + +You have several options for the output plot. You can first choose the region on the *x*-axis you want to plot. 
You can also display histograms instead of line plot. In this case, the data are summed into buckets, whose sizes are given as an option. For instance, a bucket of size *s* at the point (*x*, *y*) means that there are *y* pairs of elements which are separated by *x* to *x + s* nucleotides. + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getDistribution.xml --- a/SMART/galaxy/getDistribution.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getDistribution.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Get Distribution: Get the distribution of the genomic coordinates on a genome. + Get Distribution: Get the distribution of the genomic coordinates along a genome. ../Java/Python/GetDistribution.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': @@ -58,7 +58,6 @@ $bothStrands $average - -n $names $normalize $csv $outputCSV $gff $outputGFF @@ -74,7 +73,6 @@ - @@ -90,9 +88,6 @@ - - - @@ -104,7 +99,7 @@ - + @@ -115,8 +110,20 @@ + + + + + + + + + + + + - + @@ -128,7 +135,7 @@ - + @@ -140,7 +147,7 @@ - + @@ -152,7 +159,7 @@ - + @@ -187,18 +194,6 @@ - - - - - - - - - - - - @@ -211,27 +206,35 @@ + + - - - - - - + + + + + + + + + + + + + - - csv - - - - gff - - This script gives a .tar out file, if you want to take look at the results, you have to download it. +This script gives a .tar out file, if you want to take look at the results, you have to download it. + +Print a density profile of the data for each chromosome, see Figure~\ref{fig:getDistribution}. You have to provide the reference genome, to know the sizes of the chromosomes. You can also provide the number of points (called *bins*) you want per chromosome. + +By default, only one curve is plotted per chromosome, but you can plot one curve per strand and per chromosome (the minus strand will be plotted with non-positive values on the *y*-axis). + +If you want, you can also plot a specific region, by mentionning the chromosome, the start and the end positions of the region. 
diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getExons.xml --- a/SMART/galaxy/getExons.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getExons.xml Mon Apr 22 11:08:07 2013 -0400 @@ -69,4 +69,16 @@ + + + + + + + + + + +Provide all the exons of an annotation file. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getIntrons.xml --- a/SMART/galaxy/getIntrons.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getIntrons.xml Mon Apr 22 11:08:07 2013 -0400 @@ -53,4 +53,16 @@ + + + + + + + + + +Provide all the introns of an annotation file. + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getSizes.xml --- a/SMART/galaxy/getSizes.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getSizes.xml Mon Apr 22 11:08:07 2013 -0400 @@ -21,7 +21,6 @@ -b $OptionY.yLabValue #end if $barPlot - $excel $excelOutput @@ -84,7 +83,7 @@ - + @@ -119,17 +118,16 @@ - - - - - - - excel - + + + +Get the sequence/annotation size distribution. A point (*x*, *y*) means that *y* elements have a size of *x* nucleotides. + +When your mapping include exon/intron structures, you can decide to count the size of the introns, the sizes of the exons or the size of the first exons. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getWigData.xml --- a/SMART/galaxy/getWigData.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getWigData.xml Mon Apr 22 11:08:07 2013 -0400 @@ -5,13 +5,21 @@ - - - + + + + + +Reads a transcript list, computes the average value of some WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) for each transcript and adds a tag corresponding to this average value to the transcript. + +The script finds all the data which correspond to the genomic coordinates of a transcript, average these data and store the result into a tag. Then, the transcripts are written in an output file, together with the tag. + +You can then plot your data using *plotTranscriptList.py*. 
+ diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getWigDistance.xml --- a/SMART/galaxy/getWigDistance.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getWigDistance.xml Mon Apr 22 11:08:07 2013 -0400 @@ -5,13 +5,21 @@ - - - - + + + + + + +Plots the average data contained in a set of WIG files (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) around the first nucleotides of an annotation file. + +The tool needs a transcript list, some WIG files, and a distance. For each transcript, it collects all the values around its first nucleotide, the radius being given by the distance. Then, it computes the average value for each position. A point (*x*, *y*) means that the average value in the WIG file for a nucleotide distant by *x* nucleotides from the first nucleotide of an input transcript is *y*. + +You can possibly use a log scale for the *y*-axis. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/getWigProfile.xml --- a/SMART/galaxy/getWigProfile.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/getWigProfile.xml Mon Apr 22 11:08:07 2013 -0400 @@ -64,7 +64,12 @@ - + + +Computes the average distribution of the WIG data (please consult http://genome.ucsc.edu/goldenPath/help/wiggle.html to know more about this format) along the transcripts given in input, and possibly before and after the transcripts. + +The main inputs of the function are a file containing a list of transcripts (or any set of genomic intervals) and a directory containing a set of WIG files (one file per chromosome, or one file per chromosome and per strand). The function then computes the WIG profile of each transcript. The user can also define a region around the transcripts that should also be plotted (in this case, the profile will include the WIG values which overlap with the transcript as well as the 5' and 3' regions).
Since the transcripts do not necessarily have the same sizes, all profiles will be extended or shrunk to fit in a size which is given by the user. If the resulting profile is a bit bumpy, the user can also smoothen the curve by using a linear smoothing function (the size of the smoothing window is given by the user). Finally, the user may want to plot the WIG data for the opposite strand too (if the strand-specific WIG data are available). + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/mapperAnalyzer.xml --- a/SMART/galaxy/mapperAnalyzer.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/mapperAnalyzer.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. + Read the output of an aligner, print statistics and possibly translate into GFF, BED or GBrowse formats. ../Java/Python/mapperAnalyzer.py -i $formatType.inputFileName1 #if $formatType.FormatInputFileName1 == 'bed': @@ -89,7 +89,6 @@ - @@ -183,4 +182,21 @@ + +Maybe the first program you may use. It reads a set of mappings given by the tool you have used to map your data on the reference genome and translates it to a set of genomic coordinates. You also have the possibility to extract only those that you are interested in (few matches in the genome, few errors in the mapping, etc.). You can also select those reads which map fewer than a given number of times in the genome. Moreover, you can output the data in various different formats, which you can use to visualize them *via* UCSC genome browser or GBrowse. Unmatched reads can be written in another file, in case you would like to try to map them with another tool (may sometimes work!).
+ +You can filter your data according to: + +- number of errors in the mapping + +- number of occurrences of the mapping in the genome + +- size of the read mapped + +- number of gaps in the mapping + +The script needs an input file (your mapped reads) together with its format and the read sequences file together with its format (FASTA or FASTQ). If you want, you can also append the results of this script to another GFF3 file. This is useful when the GFF3 file is the result of the mapping using another tool. + +By default, any gap in the alignment to the reference sequence is treated like an exon. You can decide to remove this feature by merging short introns (actually, gaps). + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/mergeSlidingWindowsClusters.xml --- a/SMART/galaxy/mergeSlidingWindowsClusters.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/mergeSlidingWindowsClusters.xml Mon Apr 22 11:08:07 2013 -0400 @@ -99,5 +99,9 @@ + + +Sliding windows are also useful to compare two (or more!) sets of data. This can be very valuable when you want to compare differential expression in two different conditions. When you have two different sliding windows sets, this function merges them into one, where each window contains the two pieces of information. You may want to plot the data afterwards using the *plot transcript list* function. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/mergeTranscriptLists.xml --- a/SMART/galaxy/mergeTranscriptLists.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/mergeTranscriptLists.xml Mon Apr 22 11:08:07 2013 -0400 @@ -45,9 +45,6 @@ #end if -o $outputFileGff - - - @@ -145,4 +142,9 @@ + +The script is similar to *compare overlapping*, except that when data of two different sets overlap, they are merged. You can use the same parameters as *compare overlapping* and use them to look for transcription on both strands, for example. 
+ +Optionally, you can also add to the output all the elements from the first set which do not overlap with the second set. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/modifyGenomicCoordinates.xml --- a/SMART/galaxy/modifyGenomicCoordinates.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/modifyGenomicCoordinates.xml Mon Apr 22 11:08:07 2013 -0400 @@ -66,7 +66,7 @@ - + @@ -78,7 +78,7 @@ - + @@ -122,5 +122,14 @@ +This tool reads a list of transcripts and modifies each feature by: + +- shrinking it to the *n* first nucleotides or the *n* last nucleotides, or + +- extending it to *n* nucleotides towards the 5' direction (upstream) or the 3' direction (downstream). + +Note that the 5' or 3' direction depends on the orientation of the feature (the 5' end of a transcript located on the minus strand is on the right-hand side of this transcript!). + +The tool needs a transcript file, its format, and outputs a new transcript file. diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/modifySequenceList.xml --- a/SMART/galaxy/modifySequenceList.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/modifySequenceList.xml Mon Apr 22 11:08:07 2013 -0400 @@ -43,4 +43,7 @@ + + This tool reads a list of sequences (in multi-FASTA/Q format) that you provide and shrinks each sequence to the *n* first nucleotides or the *n* last nucleotides. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/plotCoverage.xml --- a/SMART/galaxy/plotCoverage.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/plotCoverage.xml Mon Apr 22 11:08:07 2013 -0400 @@ -169,7 +169,7 @@ - + @@ -260,6 +260,12 @@ - This script gives a .tar out file, if you want to take look at the results, you have to download it. +Plot the coverage of the first set of genomic coordinates with respect to the second set of genomic coordinates. For each element of the second set (we will suppose that they are annotated genes), it computes the number of elements of the first set (reads, for instance) which overlap it.
+ +Alternatively, if the first file is in GFF format, and contains the **Target** tag, you can omit the second file. However, a fasta file corresponding to the second file should be given (to compute the size of the reference elements). + +The tool produces two plots per gene. The first plot gives the coverage: a point (*x*, *y*) means that *y* reads cover the *x*th nucleotide of the gene. The second figure displays the (possibly spliced) gene in black, and the overlapping reads (blue is colinear, red is anti-sense). + +This script gives a .tar out file, if you want to take a look at the results, you have to download it. diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/plotTranscriptList.xml --- a/SMART/galaxy/plotTranscriptList.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/plotTranscriptList.xml Mon Apr 22 11:08:07 2013 -0400 @@ -2,16 +2,12 @@ Plot some information from a list of transcripts. ../Java/Python/plotTranscriptList.py -i $formatType.inputFileName - #if $formatType.FormatInputFileName == 'bed': - -f bed - #elif $formatType.FormatInputFileName == 'gff': + #if $formatType.FormatInputFileName == 'gff': -f gff #elif $formatType.FormatInputFileName == 'gff2': -f gff2 #elif $formatType.FormatInputFileName == 'gff3': -f gff3 - #elif $formatType.FormatInputFileName == 'sam': - -f sam #elif $formatType.FormatInputFileName == 'gtf': -f gtf #end if @@ -33,10 +29,7 @@ -m $optionyLab.labVal #end if - #if $optionyLog.log == 'Yes': - -l $optionyLog.logVal - #end if - + $optionLog.log -s $shape -b $bucket @@ -46,16 +39,11 @@ - - - - - @@ -65,16 +53,13 @@ - - - - - + + @@ -92,7 +77,7 @@ - + @@ -117,16 +102,13 @@ - + - - + + + + - - - - - @@ -138,4 +120,15 @@ + +Plot the data attached as tags in a transcript list. This can be used for displaying the comparison of different sets of sliding windows, for instance. + +The tool reads the tags of a transcript file (actually, a GFF3 file). It considers more specifically the tag names that you specify as parameter.
If you use only one tag name, you can display a line plot. In this case, you have to specify a bucket size *s* (which is by default 1) and a point (*x*, *y*) tells you that there are *y* transcripts with tag values *x* to *x + s*. + +You can display cloud plots if you use two tag names. Each point represents the values of the two tags of a transcript. If you use three variables, the third variable will be the color of the point. You can also use a log scale and name the axes of the plot. + +Each transcript must contain the tags which are specified. If not, you should provide a default value, which is used when the tag is not present. + +If you use a cloud plot, you can compute Spearman's rho to quantify a correlation between your two tag values. + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/removeExonLines.sh --- a/SMART/galaxy/removeExonLines.sh Fri Apr 19 10:13:11 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -#!/bin/bash -sed '/exon/d' $1 diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/restrictFromSize.xml --- a/SMART/galaxy/restrictFromSize.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/restrictFromSize.xml Mon Apr 22 11:08:07 2013 -0400 @@ -2,7 +2,9 @@ Select the elements of a list of sequences or transcripts with a given size. ../Java/Python/restrictFromSize.py -i $formatType.inputFileName - #if $formatType.FormatInputFileName == 'bed': + #if $formatType.FormatInputFileName == 'fasta': + -f fasta + #elif $formatType.FormatInputFileName == 'bed': -f bed #elif $formatType.FormatInputFileName == 'gff': -f gff @@ -29,6 +31,7 @@ + @@ -36,6 +39,9 @@ + + + @@ -87,7 +93,7 @@ - command example: restrictFromSize.py -i cis_e10_cluster20InSeed2515_nbEUp10.gff3 -f gff -o cis_e10_cluster20InSeed2515_nbEUp10_lgUp50 -m 50 +Reads a list of sequences or genomic coordinates and outputs those which are longer and / or shorter than a given size, which you provide.
diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/restrictTranscriptList.xml --- a/SMART/galaxy/restrictTranscriptList.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/restrictTranscriptList.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Keep the coordinates which are located in a given position. + Select the features which are located in a given locus. ../Java/Python/restrictTranscriptList.py -i $formatType.inputFileName #if $formatType.FormatInputFileName == 'bed': -f bed @@ -75,7 +75,7 @@ - + @@ -87,7 +87,7 @@ - + @@ -105,4 +105,20 @@ + + + + + + + + + + + + + +Reads a list of genomic coordinates and outputs those which on a given chromosome and / or between two given positions. + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/test/CollapseReads.xml --- a/SMART/galaxy/test/CollapseReads.xml Fri Apr 19 10:13:11 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Merges two reads if they have exactly the same genomic coordinates. - - ../Java/Python/CollapseReads.py -i $formatType.inputFileName - #if $formatType.FormatInputFileName == 'bed': - -f bed - #elif $formatType.FormatInputFileName == 'gff': - -f gff - #elif $formatType.FormatInputFileName == 'gff2': - -f gff2 - #elif $formatType.FormatInputFileName == 'gff3': - -f gff3 - #end if - - -$strand - -o $outputFileGff - --galaxy - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/test/Test_F_WrappGetLetterDistribution.py --- a/SMART/galaxy/test/Test_F_WrappGetLetterDistribution.py Fri Apr 19 10:13:11 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ -import unittest -import os -from commons.core.utils.FileUtils import FileUtils -from SMART.galaxy.WrappGetLetterDistribution import WrappGetLetterDistribution - -SMART_PATH = "%s/SMART" % os.environ["REPET_PATH"] -SMART_DATA = SMART_PATH + "/data" - -class Test_F_WrappGetLetterDistribution(unittest.TestCase): - - - 
def setUp(self): - self._dirTest = "%s/galaxy/test" % SMART_PATH - self._iwrappFastq = WrappGetLetterDistribution() - self._iwrappFasta = WrappGetLetterDistribution() - self._expOutputCSV = "expOutputTomate.csv" - - def test_wrappFasta(self): - self._iwrappFasta._inputFileName = "%s/SR1.fasta" % SMART_DATA - self._iwrappFasta._outputFileNamePrefix = "%s/galaxy/test/TomateFasta_res" % SMART_PATH - self._iwrappFasta._outputFileNamePNG = "%s/galaxy/test/TomateFasta_res.png" % SMART_PATH - self._iwrappFasta._outputFileNamePerNtPNG = "%s/galaxy/test/TomateFasta_resPerNt.png" % SMART_PATH - self._iwrappFasta._outputFileNameCSV = "%s/galaxy/test/TomateFasta_res.csv" % SMART_PATH - self._iwrappFasta._inputFileFormat = "fasta" - self._iwrappFasta._csv = True - if not(FileUtils.isEmpty(self._iwrappFasta._inputFileName)): - self._iwrappFasta.wrapp() - self.assertTrue(os.path.exists(self._iwrappFasta._outputFileNamePNG)) - self.assertTrue (os.path.exists(self._iwrappFasta._outputFileNamePerNtPNG)) - self.assertTrue (os.path.exists(self._iwrappFasta._outputFileNameCSV)) - self.assertTrue(FileUtils.are2FilesIdentical(self._iwrappFasta._outputFileNameCSV,self._expOutputCSV)) - else: - print "Problem : the input fasta file %s is empty!" 
% self._inputFileFasta - - -# def test_wrappFasta_withoutCSV_Opt(self): -# self._iwrappFasta._inputFileName = "%s/SR1.fasta" % SMART_DATA -# self._iwrappFasta._outputFileNamePrefix = "%s/galaxy/test/TomateFasta_res" % SMART_PATH -# self._iwrappFasta._outputFileNamePNG = "%s/galaxy/test/TomateFasta_res.png" % SMART_PATH -# self._iwrappFasta._outputFileNamePerNtPNG = "%s/galaxy/test/TomateFasta_resPerNt.png" % SMART_PATH -# self._iwrappFasta._outputFileNameCSV = "%s/galaxy/test/TomateFasta_res.csv" % SMART_PATH -# self._iwrappFasta._inputFileFormat = "fasta" -# self._iwrappFasta._csv = False -# if not(FileUtils.isEmpty(self._iwrappFasta._inputFileName)): -# self._iwrappFasta.wrapp() -# self.assertTrue(os.path.exists(self._iwrappFasta._outputFileNamePNG)) -# self.assertTrue (os.path.exists(self._iwrappFasta._outputFileNamePerNtPNG)) -# else: -# print "Problem : the input fasta file %s is empty!" % self._inputFileFasta -# os.system("rm %s/galaxy/test/*_res*.png" %SMART_PATH) -# os.system("rm %s/galaxy/test/*_res.csv" %SMART_PATH) -# -# -# def test_wrappFastq(self): -# self._iwrappFastq._inputFileName = "%s/SR1.fastq" % SMART_DATA -# self._iwrappFastq._outputFileNamePrefix = "%s/galaxy/test/TomateFastq_res" % SMART_PATH -# self._iwrappFastq._outputFileNamePNG = "%s/galaxy/test/TomateFastq_res.png" % SMART_PATH -# self._iwrappFastq._outputFileNamePerNtPNG = "%s/galaxy/test/TomateFastq_resPerNt.png" % SMART_PATH -# self._iwrappFastq._outputFileNameCSV = "%s/galaxy/test/TomateFastq_res.csv" % SMART_PATH -# self._iwrappFastq._inputFileFormat = "fastq" -# self._iwrappFastq._csv = True -# if not(FileUtils.isEmpty(self._iwrappFastq._inputFileName)): -# self._iwrappFastq.wrapp() -# self.assertTrue(os.path.exists(self._iwrappFastq._outputFileNamePNG)) -# self.assertTrue (os.path.exists(self._iwrappFastq._outputFileNamePerNtPNG)) -# self.assertTrue (os.path.exists(self._iwrappFastq._outputFileNameCSV)) -# 
self.assertTrue(FileUtils.are2FilesIdentical(self._iwrappFastq._outputFileNameCSV,self._expOutputCSV)) -# else: -# print "Problem : the input fastq file %s is empty!" % self._inputFileFastq -# -# -# def test_wrappFastq_withoutCSV_Opt(self): -# self._iwrappFastq._inputFileName = "%s/SR1.fastq" % SMART_DATA -# self._iwrappFastq._outputFileNamePrefix = "%s/galaxy/test/TomateFastq_res" % SMART_PATH -# self._iwrappFastq._outputFileNamePNG = "%s/galaxy/test/TomateFastq_res.png" % SMART_PATH -# self._iwrappFastq._outputFileNamePerNtPNG = "%s/galaxy/test/TomateFastq_resPerNt.png" % SMART_PATH -# self._iwrappFastq._outputFileNameCSV = "%s/galaxy/test/TomateFastq_res.csv" % SMART_PATH -# self._iwrappFastq._inputFileFormat = "fastq" -# self._iwrappFastq._csv = False -# if not(FileUtils.isEmpty(self._iwrappFastq._inputFileName)): -# self._iwrappFastq.wrapp() -# self.assertTrue(os.path.exists(self._iwrappFastq._outputFileNamePNG)) -# self.assertTrue (os.path.exists(self._iwrappFastq._outputFileNamePerNtPNG)) -# else: -# print "Problem : the input fastq file %s is empty!" % self._inputFileFastq -# os.system("rm %s/galaxy/test/*_res*.png" %SMART_PATH) -# os.system("rm %s/galaxy/test/*_res.csv" %SMART_PATH) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/test/__init__.py diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/testArgum.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/testArgum.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,24 @@ + + To test the arguments from shell. 
+ +../testArgu.sh $test_out +#for $i in $replicate_groups +#for $j in $i.replicates +$j.bam_alignment:#slurp +#end for +#end for + >> $Log_File + + + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/testR.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/testR.xml Mon Apr 22 11:08:07 2013 -0400 @@ -0,0 +1,19 @@ + + Differential expression analysis for sequence count data (DESeq) + ../DiffExpAnal/testR.sh $inputFile $columnsOfGeneName $columnsOfCondition1 $columnsOfCondition2 $outputFileCSV $outputFilePNG 2>$outputLog + + + + + + + + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/trimAdaptor.xml --- a/SMART/galaxy/trimAdaptor.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/trimAdaptor.xml Mon Apr 22 11:08:07 2013 -0400 @@ -33,7 +33,15 @@ noAdaptor - + + + + + + + + + diff -r c79b9ae3f65f -r 440ceca58672 SMART/galaxy/trimSequences.xml --- a/SMART/galaxy/trimSequences.xml Fri Apr 19 10:13:11 2013 -0400 +++ b/SMART/galaxy/trimSequences.xml Mon Apr 22 11:08:07 2013 -0400 @@ -1,5 +1,5 @@ - Remove the 5' and/or 3' adaptors of a list of reads. + Remove the 5' and/or 3' adapters of a list of reads. ../Java/Python/trimSequences.py -i $inputFile -f fastq #if $OptionFPADP.FPADP == "Yes": -5 $OptionFPADP.fivePAdaptor @@ -7,10 +7,7 @@ #if $OptionTPADP.TPADP == "Yes": -3 $OptionTPADP.threePAdaptor #end if - #if $OptionError.Error == "Yes": - -e $OptionError.ErrorVal - #end if - + -e $errors $indels $noAdaptor5p $noAdaptorFile5p $noAdaptor3p $noAdaptorFile3p @@ -23,7 +20,7 @@ - + @@ -35,7 +32,7 @@ - + @@ -46,23 +43,10 @@ - - - - - - - - - - - - + - - - - + + @@ -78,4 +62,21 @@ + + + + + + + + + + + + + + + +This function removes the adaptor from the 5' or 3' end of your reads. It can even recognize the adaptators which are partially present. You can specify whether you are ready to accept indels or not. +