diff pyCRAC/pyReadCounters.xml @ 0:19b20927172d draft

Uploaded
author swebb
date Tue, 18 Jun 2013 09:11:00 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyCRAC/pyReadCounters.xml	Tue Jun 18 09:11:00 2013 -0400
@@ -0,0 +1,359 @@
+<tool id ="pyReadCounters" name="pyReadCounters" force_history_refresh="True">
+	<requirements>
+		<requirement type="package">pyCRAC</requirement>
+	</requirements>
+	<command interpreter="perl">
+		pyReadCounters.pl
+		-f $ftype.input
+		--file_type $ftype.file_type
+		--gtf $addGTF.gtf
+		#if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.disc.discard == "discard":
+			--discarded $discarded
+		#end if#
+		#if ($ftype.file_type == "novo" or $ftype.file_type == "sam") and $ftype.addAlignOpt.alignoptions == "edit":
+			--alignOpt
+			--align_quality $ftype.addAlignOpt.align_quality
+			--align_score $ftype.addAlignOpt.align_score
+			#if int($ftype.addAlignOpt.max) > 0:
+				--max $ftype.addAlignOpt.max
+			#end if#
+			--distance $ftype.addAlignOpt.d
+			--length $ftype.addAlignOpt.length
+			$ftype.addAlignOpt.unique
+			$ftype.addAlignOpt.blocks
+			$ftype.addAlignOpt.mutations
+		#end if#
+		#if $addOpt.options == "edit":
+			--options
+			--range $addOpt.range
+			$addOpt.ignore
+			--overlap $addOpt.overlap
+		#end if#
+
+		--stats $stats
+		--hittable $hittable
+		--intronUTRoverlap $intronUTRoverlap
+
+		#if $ftype.file_type == "novo" or $ftype.file_type == "sam":
+			--countoutput $countoutput
+		#end if#
+
+		--id $stats.id
+	</command>
+	<version_command>/usr/local/bin/pyReadCounters.py --version</version_command> <!-- NOTE(review): hard-coded absolute path to the .py script, whereas the command block runs pyReadCounters.pl through perl; confirm this path exists on the Galaxy host -->
+	<inputs>
+        <conditional name="addGTF"> <!-- GTF annotation source: an entry from the pycrac_gtf data table, or a GTF dataset from the history -->
+            <param name="gtfFile" type="select" label="Choose GTF File from">
+                <option value="default" selected="true">Defaults</option>
+                <option value="other">History</option>
+            </param>
+            <when value="default">
+                <param name="gtf" type="select" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates">
+                    <options from_data_table="pycrac_gtf"/>
+                </param>
+            </when>
+            <when value="other">
+                <param format="gtf" name="gtf" type="data" label="GTF File --gtf" help="GTF file containing gene ID co-ordinates"/>
+            </when>
+        </conditional>
+		<conditional name="ftype"> <!-- Input file type selector; the novo and sam branches expose the same discard and alignment options, the gtf branch only takes an input file -->
+			<param name="file_type" type="select" label="Input File Type --file_type" help="Use .novo, .sam/.bam or .gtf input files">
+				<option value="novo" selected="true">Novo</option>
+				<option value="sam">Sam/Bam</option>
+				<option value="gtf">GTF</option>
+			</param>
+			<when value="novo">
+				<param format="tabular" name="input" type="data" label="Input File --input_file" help="Alignment file of type .novo" />
+				<conditional name="disc">
+					<param name="discard" type="select" label="Print discarded reads to a separate file">
+						<option value="" selected="true">OFF</option>
+						<option value="discard">ON</option>
+					</param>
+					<when value="discard">
+					</when>
+					<when value="">
+					</when>
+				</conditional>
+				<conditional name="addAlignOpt">
+					<param name="alignoptions" type="select" label="Alignment Options">
+						<option value="default" selected="true">Default</option>
+						<option value="edit">Edit</option>
+					</param>
+					<when value="edit">
+						<param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
+							<option value="" selected="true">Off</option>
+							<option value="--mutations delsonly">deletions</option>
+							<option value="--mutations subsonly">substitutions</option>
+							<option value="--mutations TC">T->C substitutions</option>
+							<option value="--mutations nomuts">no mutations</option>
+						</param>
+						<param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
+							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
+						</param>
+						<param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
+							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
+						</param>
+						<param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
+							<validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
+						</param>
+						<param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
+							<validator type="in_range" min="1" message="Please enter a value >= 1"/>
+						</param>
+						<param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
+							<validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
+						</param>
+						<param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
+							<option value="" selected="true">OFF</option>
+							<option value="--unique">ON</option>
+						</param>
+						<param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
+							<option value="" selected="true">OFF</option>
+							<option value="--blocks">ON</option>
+						</param>
+					</when>
+					<when value="default">
+					</when>
+				</conditional>
+			</when>
+			<when value="sam">
+				<param format="sam,bam" name="input" type="data" label="Input File --input_file" help="Alignment file of type .sam or .bam" />
+				<conditional name="disc">
+					<param name="discard" type="select" label="Print discarded reads to a separate file">
+						<option value="" selected="true">OFF</option>
+						<option value="discard">ON</option>
+					</param>
+					<when value="discard">
+					</when>
+					<when value="">
+					</when>
+				</conditional>
+				<conditional name="addAlignOpt">
+					<param name="alignoptions" type="select" label="Alignment Options">
+						<option value="default" selected="true">Default</option>
+						<option value="edit">Edit</option>
+					</param>
+					<when value="edit">
+						<param name="mutations" type="select" label="Option for selecting type of mutations to report --mutations" help="cross-linking sites are often highlighted by deletions and/or substitutions in the reads. You can use this option to select specific mutations that you want to have reported in the GTF output file.">
+							<option value="" selected="true">Off</option>
+							<option value="--mutations delsonly">deletions</option>
+							<option value="--mutations subsonly">substitutions</option>
+							<option value="--mutations TC">T->C substitutions</option>
+							<option value="--mutations nomuts">no mutations</option>
+						</param>
+						<param format="integer" name="align_quality" type="integer" label="Align Quality --align_quality " value="0" size="5" >
+							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
+						</param>
+						<param format="integer" name="align_score" type="integer" label="Align Score --align_score " value="0" size="5" >
+							<validator type="in_range" min="0" message="Please enter a value >= 0"/>
+						</param>
+						<param format="integer" name="max" type="integer" label="Mapped reads to read from input file --max" help="Set to 0 to align all reads." value="0" size="10" >
+							<validator type="in_range" min="0" max="100000000" message="Please enter a value between 1 and 100000000 or 0 to align all reads"/>
+						</param>
+						<param format="integer" name="d" type="integer" label="Distance --distance " value="1000" size="6" help="Set the maximum number of bp allowed between two non-overlapping paired reads">
+							<validator type="in_range" min="1" message="Please enter a value >= 1"/>
+						</param>
+						<param format="integer" name="length" type="integer" label="Set the maximum length of reads --length" value="1000" size="7" help="Set the read length threshold between 15 and 1000">
+							<validator type="in_range" min="15" max="1000" message="Please enter a value between 15 and 1000"/>
+						</param>
+						<param name="unique" type="select" label="Remove reads with multiple alignment locations --unique">
+							<option value="" selected="true">OFF</option>
+							<option value="--unique">ON</option>
+						</param>
+						<param name="blocks" type="select" label="Only count reads with same start and end coords once --blocks">
+							<option value="" selected="true">OFF</option>
+							<option value="--blocks">ON</option>
+						</param>
+					</when>
+					<when value="default">
+					</when>
+				</conditional>
+			</when>
+			<when value="gtf">
+				<param format="gtf" name="input" type="data" label="Input File --input_file" help="File of type .gtf" />
+			</when>
+		</conditional>
+		<conditional name="addOpt"> <!-- Optional range / strand / overlap settings; only emitted on the command line when "edit" is selected -->
+			<param name="options" type="select" label="Standard Options">
+				<option value="default" selected="true">Default</option>
+				<option value="edit">Edit</option>
+			</param>
+			<when value="edit">
+				<param format="integer" name="range" type="integer" label="Range --range" value="0" size="5" help="Manually set the length of the 5' and 3' UTRs (0 to 50000)">
+					<validator type="in_range" min="0" max="50000" message="Please enter a value between 0 and 50000"/>
+				</param>
+				<param name="ignore" type="select" label="Ignore strand information? --ignorestrand">
+					<option value="" selected="true">No</option>
+					<option value="--ignorestrand">Yes</option>
+				</param>
+				<param format="integer" name="overlap" type="integer" label="Overlap --overlap" value="1" size="5" help="Sets the number of nucleotides a read has to overlap with a gene before it is considered a hit. ">
+					<validator type="in_range" min="1" message="Please enter a positive integer"/>
+				</param>
+			</when>
+			<when value="default">
+			</when>
+		</conditional>
+			<param name="label" type="text" format="txt" size="30" value="pyReadCounters" label="Enter output file label -o" /> <!-- Prefix for the output dataset labels (${label.value}); NOTE(review): the command template never references $label, so no -o option reaches the script; confirm this is intended -->
+	</inputs>
+	<outputs> <!-- stats, hittable and intronUTRoverlap are always produced; countoutput and discarded are filtered on the selected input type and discard option -->
+		<data format="tabular" name="stats" label="${label.value}_file_statistics.txt"/>
+		<data format="tabular" name="hittable" label="${label.value}_hittable.txt"/>
+		<data format="gtf" name="intronUTRoverlap" label="${label.value}_intron_and_UTR_overlap.gtf"/>
+		<data format="gtf" name="countoutput" label="${label.value}_count_output.gtf">
+			<filter>ftype['file_type'] == "novo" or ftype['file_type'] == "sam"</filter>
+		</data>
+		<data format="txt" name="discarded" label="${label.value}_discarded.txt">
+			<filter>(ftype['file_type'] == "novo" or ftype['file_type'] == "sam") and ftype['disc']['discard'] == "discard"</filter>
+		</data>
+	</outputs>
+	<help>
+
+.. class:: infomark
+
+**pyReadCounters**
+
+pyReadCounters is part of the pyCRAC_ package. It produces a gene hittable file and two GTF output files showing which genomic features the reads overlap.
+Finally, the tool produces a read statistics file that provides information about the complexity of your dataset.
+
+**Output file examples**
+
+A hittable file::
+
+    # generated by pyReadCounters version 1.1.0, Mon Apr 16 20:34:22 2012
+    # /usr/local/bin/pyReadCounters.py -f RNAseq_data.novo -c 1 --unique
+    # total number of reads 12534556
+    # total number of paired reads  10947376
+    # total number of single reads  483095
+    # total number of mapped reads: 11430471
+    # total number of overlapping genomic features  7019550
+    #       sense   5960669
+    #       anti-sense      1058881
+    # feature       sense_overlap anti-sense_overlap  number of reads
+    
+    ## protein_coding       3190701
+    YEF3        49930       3629        24221
+    PMA1        32621       2650        21776
+    COX1        24559       1037        15174
+    TFP1        21539       1689        13506
+    HSC82       21177       1458        12729
+    ADH1        20245       1467        11351
+    AI5_ALPHA   20022       918         13101
+    AI4         19390       886         12638
+    AI3         17823       798         11473
+    AI2         17590       790         11297
+    RPL10       16822       1113        8797
+    ENO2        16336       1125        8913
+    TEF1        15578       1333        5450
+
+An example of a GTF 'count_output' file::
+
+    ##gff-version 2
+    # generated by Counters version 1.2.0, Tue Jan  8 22:47:29 2013
+    # pyReadCounters.py -f PAR_CLIP_unique.novo --mutations=TC -v
+    # total number of reads:	2455251
+    # total number of paired reads:	0
+    # total number of single reads:	2455251
+    # total number of mapped reads:	2455251
+    # total number of overlapping genomic features:	5153943
+    #	sense:	2640600
+    #	anti-sense:	2513343
+    chrXIV	reads	exon	661572	661605	2	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661596S;
+    chrXIV	reads	exon	661720	661738	1	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661726S;
+    chrXIV	reads	exon	661839	661878	4	+	.   gene_id "INT_0_6716,YNR016C"; gene_name "INT_0_6716,ACC1"; # 661875S;
+    
+This output file also reports whether a read contains a mutation. 
+    
+For example::
+    
+    # 661596S
+    
+Indicates that the read had a nucleotide substitution ("S") at genomic coordinate 661596. The chromosome name can be found in the first column. 
+
+.. _pyCRAC: http://sandergranneman.bio.ed.ac.uk/Granneman_Lab/pyCRAC_software.html
+		
+------
+
+**Parameter list**
+
+File input options::
+
+	-f FILE, --input_file=FILE
+						provide the path to your novo, SAM/BAM or gtf data
+						file. Default is standard input. Make sure to specify
+						the file type of the file you want to have analyzed
+						using the --file_type option!
+	-o OUTPUT_FILE, --output_file=OUTPUT_FILE
+						Use this flag to override the standard file names. Do
+						NOT add an extension.
+	--file_type=FILE_TYPE
+						use this option to specify the file type (i.e.
+						'novo','sam' or 'gtf'). This will tell the program
+						which parsers to use for processing the files. Default
+						= 'novo'
+	--gtf=annotation_file.gtf
+						type the path to the gtf annotation file that you want
+						to use
+
+Common pyCRAC options::
+
+	--ignorestrand
+						ignore strand information; all reads overlapping
+						with genomic features will be considered sense reads.
+						Useful for analysing ChIP or RIP data
+	--overlap=1
+						sets the number of nucleotides a read has to overlap
+						with a gene before it is considered a hit. Default =
+						1 nucleotide
+	-r 100, --range=100
+						allows you to add regions flanking the genomic
+						feature. If you set '-r 50' or '--range=50', then the
+						program will add 50 nucleotides to each feature on
+						each side regardless of whether the GTF file has genes
+						with annotated UTRs
+
+Options for SAM/BAM and Novo files::
+
+	--mutations=delsonly
+						Use this option to only track mutations that are of
+						interest. For CRAC data this is usually deletions
+						(--mutations=delsonly). For PAR-CLIP data this is
+						usually T-C mutations (--mutations=TC). Other options
+						are: do not report any mutations: --mutations=nomuts.
+						Only report specific base mutations, for example only
+						in T's, C's and G's: --mutations=[TCG]. The brackets
+						are essential. Other nucleotide combinations are also
+						possible
+	--align_quality=100, --mapping_quality=100
+						with these options you can set the alignment quality
+						(Novoalign) or mapping quality (SAM) threshold. Reads
+						with qualities lower than the threshold will be
+						ignored. Default = 0
+	--align_score=100
+						with this option you can set the alignment score
+						threshold. Reads with alignment scores lower than the
+						threshold will be ignored. Default = 0
+	--unique
+						with this option reads with multiple alignment
+						locations will be removed. Default = Off
+	--blocks
+						with this option reads with the same start and end
+						coordinates on a chromosome will be counted as one
+						cDNA. Default = Off
+	-m 100000, --max=100000
+						maximum number of mapped reads that will be analyzed.
+						Default = All
+	-d 1000, --distance=1000
+						this option allows you to set the maximum number of
+						base-pairs allowed between two non-overlapping paired
+						reads. Default = 1000
+	--discarded=FILE
+						prints the lines from the alignments file that were
+						discarded by the parsers. This file contains reads
+						that were unmapped (NM), of poor quality (i.e. QC) or
+						paired reads that were mapped to different chromosomal
+						locations or were too far apart on the same
+						chromosome. Useful for debugging purposes
+	-l 100, --length=1000
+						to set read length threshold. Default = 1000
+
+	</help>
+</tool>