Mercurial > repos > yqiancolumbia > ctk_test

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CIMS.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,45 @@
+<tool id="CIMS" name="CIMS analysis">
+	<description></description>
+
+	<command interpreter="perl">
+			/home/galaxy/tools/CTK/CIMS.pl  $bigFileFlag -n $iterationNum -p -v --outp $mutPosFile
+			$tagBedFile $mutationBedFile $outFile
+	</command>
+
+	<inputs>
+
+	<param type="data" format="bed" name="tagBedFile" label="Input file in BED format of unique tags"/>
+	<param type="data" format="bed" name="mutationBedFile" label="Input file in BED format of filtered mutation file corresponding to the tags above"/>
+	<param name="bigFileFlag" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="The tag BED is a big file (over 6M lines)" />
+	<param name="iterationNum" type="integer" value="10" label="Number of iterations of permutation" />
+
+	</inputs>
+
+	<outputs>
+	<data name="outFile" format="tabular" label="CIMS analysis on ${on_string}"/>
+	<data name="mutPosFile" format="tabular" label="Mutation position summary on ${on_string}"/>
+	</outputs>
+
+	<help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Cross-linking induced mutation site (CIMS) analysis
+
+This tool identifies clustered mutation sites and assigns statistical significance of the clustering based on a permutation-based procedure.  This procedure shuffles the position of mutations, but preserves the distribution of CLIP tags and the position of each mutation relative to the 5' end of each tag (specified in the 5th column of the mutation BED file).
+
+-----
+
+.. class:: warningmark
+
+**Requirement on input data**
+
+* Make sure the NAME(4th) column of the two BED files gives the same tag names.
+* Make sure the position of each mutation relative to the read is in the score(5th) column, provided by the iterative alignment tools (currently only for novoalign).
+
+	</help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CITS.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,45 @@
+<tool id="CITS" name="CITS analysis">
+  <description></description>
+
+  <command interpreter="perl">
+	/home/galaxy/tools/CTK/CITS.pl -v $bigFile -p $pvalue $multitest --gap $gap $uniqTagBed $uniqMutationTypeBed $output
+  </command>
+
+  <inputs>
+        <param type="data" format="bed" name="uniqTagBed" label="Input file in BED format of unique tags"/>
+        <param type="data" format="bed" name="uniqMutationTypeBed" label="Input file in BED format of deletions, insertions, or substitutions corresponding to the tags above" />
+
+        <param name="bigFile" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="This is a big file (over 6M lines)" />
+        <param name="pvalue" type="float" value="0.001" label="Threshold of p-value of CITS" />
+	<param name="multitest" type="boolean" truevalue="--multi-test" falsevalue="" checked="yes" label="Do Bonferroni multiple test correction" />
+        <param name="gap" type="integer" value="25" label="Minimum gaps allowed between peaks (no merge cluster peaks if less than 0)" />
+  </inputs>
+
+  <outputs>
+        <data name="output" format="bed" label="CITS analysis on ${on_string}"/>
+  </outputs>
+
+  <help>
+
+.. class:: warningmark
+
+Only certain variations of the CLIP protocol allow you to perform CITS analysis (e.g. iCLIP, BrdU CLIP, etc).
+
+-----
+
+.. class:: infomark
+
+**What this tool does**
+
+Cross-linking induced truncation site (CITS) analysis
+
+Similar to CIMS analysis, this tool identifies clustered truncation sites and assigns statistical significance of the clustering based on a permutation-based procedure.
+
+-----
+
+Reference:
+
+Weyn-Vanhentenryck,S.,M.*, Mele,A.*, Yan,Q.*, Sun,S., Farny,N., Zhang,Z., Xue,C., Herre,M., Silver,P.A., Zhang,M.Q., Krainer,A.R., Darnell,R.B. †, Zhang,C. † 2014. HITS-CLIP and integrative modeling define the Rbfox splicing-regulatory network linked to brain development and autism. Cell Rep. 10.1016/j.celrep.2014.02.005.
+
+   </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/T2C.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,39 @@
+<tool id="T2C" name="Get the distribution of the different substitutions">
+  <description></description>
+
+  <command interpreter="python">
+	/home/galaxy/tools/CTK/T2C.py $input $output_T2Cfiltered $output_substitutions $output_frequencies
+  </command>
+
+  <inputs>
+        <param name="input" type="data" format="tabular" label="Input unique mutation file of PAR-CLIP 4SU in tabular format"/>
+  </inputs>
+
+  <outputs>
+        <data name="output_T2Cfiltered" format="tabular" label="Tag_uniq_T2C file on ${on_string}" />
+        <data name="output_substitutions" format="tabular" label="Tag_uniq_sub file on ${on_string}" />
+        <data name="output_frequencies" format="tabular" label="Tag_uniq_freq file on ${on_string}" />
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+It may be useful to get the distribution of the different substitutions that are prevalent. This is because PAR-CLIP 4SU is known for inducing substitutions (more so than deletions and insertions), specifically by inducing a T to C transition on the forward strand (same as A to G on the negative strand).
+There are three output files, one of which will be used in the CIMS analysis moving forward.
+
+-----
+
+**Output**
+
+(1) T2Cfiltered file = filtered to only contain T to C substitutions, which we will use in CIMS analysis downstream
+
+(2) substitutions file = filtered to only contain ALL substitutions (as opposed to all mutations)
+
+(3) frequencies file = contains a summary of the frequency of all substitutions
+
+Confirm that most of the substitutions are in fact T to C by looking at the frequency column in the frequencies file.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed2annotation.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,78 @@
+<tool id="bed2annotation" name="Annotate genomic intervals">
+	<description></description>
+
+	<command interpreter="perl">
+	/home/galaxy/tools/CTK/bed2annotation.pl -v
+	-conf /home/galaxy/tools/CTK/annotation.loc
+	$bigFileFlag $ssFlag -dbkey $dbkey $geneFlag $miRNAFlag $rmskFlag $regionFlag
+
+#if $customFeature.annotateCustomFeature =="yes":
+	-custom $customFeature.customFeatureBedFile --custom-name "$customFeature.customFeatureName" --custom-summary $customFeature.customSummaryMethod
+#end if
+ -summary $outputSummary $inputBed $outputAnnotation
+	</command>
+	<inputs>
+		<param type="data" format="bed" name="inputBed" label="Select a BED file to be annotated"/>
+		<param name="bigFileFlag" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="Big file (over 6M lines)" />
+                <param name="dbkey" type="select" label="Genome build name">
+                                        <option value="hg19">hg19</option>
+                                        <option value="mm10">mm10</option>
+                </param>
+		<param name="ssFlag" type="boolean" truevalue="-ss" falsevalue="" checked="yes" label="Consider the two strands separately when possible" />
+                <param name="regionFlag" type="boolean" truevalue="-region" falsevalue="" checked="yes"
+                       label="Identify the genomic region where the peaks are located; this reports the percentage of tags mapped to CDS, 3'UTR, introns, etc." /> <!-- passes -region to bed2annotation.pl when checked -->
+		<param name="geneFlag" type="boolean" truevalue="-gene" falsevalue="" checked="no" label="Annotate overlapping genes (RefSeq/UCSC known genes)" />
+		<param name="miRNAFlag" type="boolean" truevalue="-miRNA" falsevalue="" checked="no" label="Annotate overlapping microRNAs" />
+		<param name="rmskFlag" type="boolean" truevalue="-rmsk" falsevalue="" checked="no" label="Annotate overlapping RepeatMasked sequences (type and %)" />
+		<conditional name="customFeature">
+			<param name="annotateCustomFeature" type="select" label="Do you want to also annotate custom feature in history?">
+				<option value="yes">Yes</option>
+				<option value="no" selected="true">No</option>
+			</param>
+			<when value="yes">
+				<param type="data" format="bed" name="customFeatureBedFile" label="Select a BED file with custom features"/>
+				<param type="text" name="customFeatureName" value="custom_feature" size="80" label="Name your custom features (No space, no special character)"/>
+				<param name="customSummaryMethod" type="select" label="Select how you would like to combine multiple items of the annotation">
+					<option value="all" selected="true">List the name of all</option>
+					<option value="max_num">List the name and score of the one with the max score</option>
+					<option value="min_num">List the name and score of the one with the min score</option>
+					<option value="max_overlap">List the name and overlap of the one with the max overlap proportion</option>
+				</param>
+			</when>
+			<when value="no">
+			</when>
+		</conditional>
+
+	</inputs>
+
+	<outputs> <!-- two datasets: the per-interval annotation and its summary (written via -summary) -->
+		<data name="outputAnnotation" format="tabular" label="Annotate intervals on ${on_string}" />
+		<data name="outputSummary" format="tabular" label="Summary of the annotation on ${on_string}" />
+	</outputs>
+	<help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Functional annotation of CLIP tags.
+
+It will take as input files in BED format of unique CLIP tags (with or without RGB color) and annotate genomic intervals with various features built in or from history.
+
+Note 1: The input can be CLIP tags, CLIP cluster, CIMS, or anything else.  However, if the input is big, it might involve extensive computational load. Therefore, it is recommended that tags are clustered for some of the annotations (such as overlapping genes, rmsk, etc).
+
+Note 2: Strand is not considered for RepeatMasked regions even when you choose to separate the two strands (because for some repeats, such as simple_repeat or micro satellites, strand does not make sense).
+
+-----
+
+**Output files**
+
+There are two output files.
+
+The first output file has the detailed annotation, with intervals exactly in the same order as the input file, so that you can easily put different types of information together.
+
+The second output file is a summary of annotation.
+
+	</help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bed2rgb.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,31 @@
+<tool id="bed2rgb" name="Add different colors">
+  <description>to different replicates</description>
+  <command interpreter="perl">
+                    /home/galaxy/tools/CTK/bed2rgb.pl -v -col $color $inputfile $outputfile
+  </command>
+  <inputs>
+        <param name="inputfile" format="bed"  type="data" label="Input BED file" />
+	<param name="color"  type="text" value="" label="Color by name or RGB (red; green; blue; ...; R,G,B; see help below for more information)" />
+  </inputs>
+  <outputs>
+        <data name="outputfile" format="bed" label="Add RGB colors on ${on_string}">
+        </data>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will add different colors to BED files in RGB format.
+
+It will take as input files in BED format of unique CLIP tags and output files in BED format of RGB colored unique CLIP tags.
+
+Repeat the step for all uniq.bed files to generate RGB-bed files, but use different RGB colors "x,x,x" (e.g. 128,0,0).
+See http://www.rapidtables.com/web/color/RGB_Color.htm or other charts.
+
+After getting the unique tags of each library, one might concatenate biological replicates for both BED files and mutation files, which are distinguished by different colors using this tool.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bedExt.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,28 @@
+<tool id="bedExt" name="Get the position before the start site">
+  <description>as a potential cross link site that causes truncation</description>
+  <command interpreter="perl">
+	/home/galaxy/tools/CTK/bedExt.pl -n $NeighborRegion -l $LeftExtension -r $RightExtension -v $input $output
+  </command>
+
+  <inputs> <!-- values are passed to bedExt.pl as -n, -l and -r respectively -->
+        <param name="input" type="data" format="bed" label="Input BED file in which CLIP tags with deletions were removed"/>
+        <param name="NeighborRegion" type="text" value="up" label="Get neighbor region relative to up|down|r=100"/>
+        <param name="LeftExtension" type="integer" value="-1" label="Extension on the left (signed integer; may be negative)"/>
+        <param name="RightExtension" type="integer" value="-1" label="Extension on the right (signed integer; may be negative)"/>
+  </inputs>
+
+  <outputs>
+        <data name="output" format="bed" label="Output file in BED format extended around start site as a potential cross link site that causes truncation" />
+  </outputs>
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will extract sequences specified by a BED file.
+
+Here it will take as input files in BED format in which CLIP tags with deletions were removed and output files in BED format extended around the start site as a potential cross link site that causes truncation.
+  </help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq2collapse.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,34 @@
+<tool id="fastq2collapse" name="Collapse exact PCR duplicates">
+	<description>in FASTQ</description>
+	<command interpreter="perl">
+		/home/galaxy/tools/CTK/fastq2collapse.pl -v  $input $output
+	</command>
+	<inputs>
+		<param type="data" format="fastq" name="input" label="Input FASTQ file (.gz file accepted)"/>
+	</inputs>
+
+	<outputs>
+		<data name="output" format="fastq" label="Collapse exact PCR duplicates on ${on_string}" />
+	</outputs>
+
+	<help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will collapse exact duplicate sequences.
+
+It will take as input files in FASTQ format of filtered and trimmed reads and output files in FASTQ format in which exact PCR duplicates have been collapsed.
+
+-----
+
+.. class:: warningmark
+
+For big files (e.g. those from HiSeq), the program is memory intensive, so run one job at a time in this case.
+
+
+
+	</help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqFilter.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,102 @@
+<tool id="fastqFilter" name="Filter FASTQ files">
+  	<description></description>
+  	<command interpreter="perl">
+		/home/galaxy/tools/CTK/fastq_filter.pl
+		#if $index.indexRequired == "yes":
+		-index $index.sequence
+		#end if
+		-maxN $MaxN -v -if sanger -f $Filter  -of $OutputFormat $inputfile
+		$outputfile
+  	</command>
+
+  	<inputs>
+        <param name="inputfile" format="fastq"  type="data" label="Input Sanger FASTQ file (.gz file accepted; see help below for more information)" />
+
+	<conditional name="index">
+		<param name="indexRequired" type="select" label="Filter by sample index (see help below for parameter suggestion)" >
+		<option value="yes">Yes</option>
+		<option value="no" selected="true">No</option>
+		</param>
+		<when value="yes">
+			<param name="sequence" type="text" value="" label="Index position and sequence" />
+		</when>
+		<when value="no">
+		</when>
+	</conditional>
+
+    	<param name="Filter"  type="text" value="" label="Quality score filter string; format: Method:Start-End:Score (zero-based; see help below for parameter suggestion)" />
+	<param name="MaxN" type="integer" value="-1" label="Max number of N in sequence (default off - value less than 0) " />
+	<param name="OutputFormat" type="select" label="Output data type">
+		<option value="fastq">FASTQ</option>
+		<option value="fasta">FASTA</option>
+	</param>
+
+  	</inputs>
+
+	<outputs>
+	<data name="outputfile" format="fastq" label="Read quality filtering on ${on_string}">
+		<change_format>
+			<when input="OutputFormat" value="fasta" format="fasta" />
+		</change_format>
+	</data>
+	</outputs>
+
+	<help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will extract reads passing quality filters.
+
+It will take as input Sanger FASTQ files and output FASTQ/A files of filtered reads.
+
+-----
+
+**FASTQ format**
+
+Check quality score in the FASTQ file for the right format.
+
+Reference https://en.wikipedia.org/wiki/FASTQ_format#Quality :
+
+* Sanger format can encode a Phred quality score from 0 to 93 using ASCII 33 to 126.
+* Solexa/Illumina 1.0 format can encode a Solexa/Illumina quality score from -5 to 62 using ASCII 59 to 126.
+
+See http://www.asciitable.com/ for ASCII table.
+
+-----
+
+**Filter by sample index (optional)**
+
+For users who would like to start from a FASTQ file consisting of multiple libraries.
+
+For example:
+
+If you have six samples with indexes GTCA, GCATG, ACTG, AGCT, GCATC, TCGA, you can extract reads for each library with indicated index sequences (e.g. GTCA, etc.) starting from position 0 in the read. For example, you could specify 0:GTCA, etc.
+
+-----
+
+**How to set the filter**
+
+You can apply multiple filtering criteria based on the quality scores for each read. They are separated by commas.
+
+Each criterion is composed of four components (e.g. method1:start1-end1:score1,method2:start2-end2:score2)
+
+1. Method: min or mean, which means requirement on minimal or mean score of a region
+2. Start:  the first nucleotide to consider (0-based)
+3. End:    the last nucleotide to consider (0-based)
+4. score:  the threshold required
+
+**Parameter suggestion**
+
+* For Standard CLIP protocol filtering: mean:0-29:20 (this specifies a mean score of 20 or above in the first 30 bases, which includes 5 positions with sample indexes and the random barcode, followed by 25 positions with the actual CLIP tag).
+* For iCLIP/BrdU CLIP filtering: mean:0-38:20 (this specifies a mean score of 20 or above in the first 39 bases, which includes 14 positions with sample indexes and the random barcode, followed by 25 positions with the actual CLIP tag).
+
+The reason for this filtering is that low quality reads can introduce mapping errors and background. They will inflate the number of unique tags after removal of PCR duplicates.
+
+For example:
+
+When you have degenerate barcode at the first 5 nucleotides, you can use min:0-4:20,mean:5-29:20, which means the first 5 nucleotides have a minimal quality score of 20 and the next 25 nucleotides (i.e., the first 25 nucleotides of RNA) have a mean score of 20.
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getMutationType.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,38 @@
+<tool id="getMutationType" name="Get specific types of mutations">
+  <description></description>
+  <command interpreter="perl">
+	/home/galaxy/tools/CTK/getMutationType.pl -mutationType $mutationType $input_tag_uniq_mutation_txt $output_tag_uniq_bed
+  </command>
+
+  <inputs>
+        <param name="mutationType" type="select" label="Select a specific type of mutations ">
+                <option value="deletion">deletion</option>
+		<option value='substitution'>substitution</option>
+                <option value="insertion">insertion</option>
+        </param>
+        <param name="input_tag_uniq_mutation_txt" type="data" format="tabular" label="Input file in tabular format of unique mutations"/>
+  </inputs>
+
+  <outputs>
+	<data name="output_tag_uniq_bed" format="bed" label="Get ${mutationType.value_label} mutation type on ${on_string}">
+        </data>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Get specific types of mutations, such as deletions, substitutions, and insertions around the cross-linked mutation site.
+
+-----
+
+**Substitutions, deletions, and insertions**
+
+Based on the analysis performed so far on Nova, brPTB, Hu CLIP experiments, it appears that only deletions represent bona fide cross-linking induced mutations, while substitutions are mostly (if not all) due to sequencing/alignment errors, polymorphisms, or (potentially) RNA editing. It is estimated that ~7-25% of CLIP tags harbor deletion sites, frequently overlapping the binding motif. Therefore, it is highly recommended to do separate analysis of different types of mutations.
+
+However, it is not excluded that different proteins might differ due to the nature of amino-acid-nucleotide interaction and cross-linking. Therefore, analysis of more proteins will be very interesting.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/notes.txt	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,117 @@
+https://galaxyproject.org/admin/data-integration/
+
+su -l galaxy
+
+prerequisites
+-------------
+cd tools
+ln -s /home/yq2139/czlab_src/CTK/ CTK
+ln -s /home/yq2139/czlab_src/ngs/ ngs
+ln -s /home/yq2139/czlab_src/plib/ plib
+rsync -avzP yq2139@intron.c2b2.columbia.edu:/ifs/data/c2b2/cz_lab/tools/fastx_toolkit_0.0.14/ .
+#rsync -avzP yq2139@intron.c2b2.columbia.edu:/ifs/data/c2b2/cz_lab/tools/bwa-0.7.12/ .
+
+# IMPORTANT add /home/galaxy/tools/fastx_toolkit_0.0.14/bin to PATH in ~/.bash_profile
+# same for samtools installed later
+
+cd /home/galaxy/galaxy_data
+rsync -avzP yq2139@intron.c2b2.columbia.edu:/ifs/data/c2b2/cz_lab/genomes/mm10/bwa/* genomes/mm10/bwa/
+rsync -avzP yq2139@intron.c2b2.columbia.edu:/ifs/data/c2b2/cz_lab/genomes/hg19/bwa/* genomes/hg19/bwa/
+rsync -avzP yq2139@intron.c2b2.columbia.edu://ifs/data/c2b2/cz_lab/genomes/mm10/annotation/rmsk.bed /home/galaxy/galaxy_data/genomes/mm10/annotation/
+rsync -avzP yq2139@intron.c2b2.columbia.edu://ifs/data/c2b2/cz_lab/genomes/mm10/annotation/rmsk.RNA.bed /home/galaxy/galaxy_data/genomes/mm10/annotation/
+rsync -avzP yq2139@intron.c2b2.columbia.edu://ifs/data/c2b2/cz_lab/genomes/hg19/annotation/rmsk.RNA.bed /home/galaxy/galaxy_data/genomes/hg19/annotation/
+rsync -avzP yq2139@intron.c2b2.columbia.edu://ifs/data/c2b2/cz_lab/genomes/hg19/annotation/rmsk.bed /home/galaxy/galaxy_data/genomes/hg19/annotation/
+
+mkdir /home/galaxy/galaxy_data/cache
+# to temporarily store cache for fastq2collapse.pl etc.
+
+install bwa + bam2sam
+---------------------
+# ref https://biostar.usegalaxy.org/p/24896/#24919
+# steps: http://intron.c2b2.columbia.edu:8888/ - admin - search tool shed - galaxy main tool shed - bwa - preview and install
+# when uninstalling tools, one needs to remove the corresponding directories and "reset metadata" in the admin section
+
+# modify tool-data/bwa_mem_index.loc :
+	hg19bwa hg19    hg19  /home/galaxy/galaxy_data/genomes/hg19/bwa/hg19.fa
+	mm10bwa mm10    mm10  /home/galaxy/galaxy_data/genomes/mm10/bwa/mm10.fa
+
+# modify integrated_tool_panel.xml to change showing order
+
+# add these below (taken from galaxy/config/shed_tool_conf.xml) to config/tool_conf.xml :
+
+    <tool file="../shed_tools/toolshed.g2.bx.psu.edu/repos/devteam/bwa/4d82cf59895e/bwa/bwa.xml"
+          guid="toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa/0.7.16.2">
+      <tool_shed>toolshed.g2.bx.psu.edu</tool_shed>
+        <repository_name>bwa</repository_name>
+        <repository_owner>devteam</repository_owner>
+        <installed_changeset_revision>4d82cf59895e</installed_changeset_revision>
+        <id>toolshed.g2.bx.psu.edu/repos/devteam/bwa/bwa/0.7.16.2</id>
+        <version>0.7.16.2</version>
+    </tool>
+
+    <tool file="../shed_tools/toolshed.g2.bx.psu.edu/repos/devteam/bam_to_sam/af7c50162f0b/bam_to_sam/bam_to_sam.xml" guid="toolshed.g2.bx.psu.edu/repos/devteam/bam_to_sam/bam_to_sam/2.0">
+      <tool_shed>toolshed.g2.bx.psu.edu</tool_shed>
+        <repository_name>bam_to_sam</repository_name>
+        <repository_owner>devteam</repository_owner>
+        <installed_changeset_revision>af7c50162f0b</installed_changeset_revision>
+        <id>toolshed.g2.bx.psu.edu/repos/devteam/bam_to_sam/bam_to_sam/2.0</id>
+        <version>2.0</version>
+    </tool>
+
+
+problems fixed
+---------------
+# (done) redirect stderr to stdout (my $msgio = $outBedFile eq '-' ? *STDERR :  *STDOUT;) : fastq_filter.pl stripBarcode.pl parseAlignment.pl fastq2collapse.pl  tag2peak.pl tag2profile.pl CIMS.pl tag2cluster.pl
+# (no need if correctly added environment variables) add a new data type "sai"; ref https://biostar.usegalaxy.org/p/24983/ and  https://galaxyproject.org/admin/datatypes/adding-datatypes/
+	step1 add the line below to the datatypes section in ./config/datatypes_conf.xml.sample file:
+		<datatype extension="sai" type="galaxy.datatypes.binary:Sai" subclass="True"/>
+	step2 In binary.py file, add:
+		class Sai( Binary ):
+			"""Class describing a Sai file"""
+			file_ext = "sai"
+	step3 In registry.py file, add:
+		'sai' : binary.Sai(),
+# (done) bed2annotation.pl problem: "cat: write error: Broken pipe"
+# (done) bedExt.pl problem: /data/galaxy/database/files/000/dataset_241.dat already exists
+
+to upload file >2GB
+--------------------
+#upload the file using rsync or scp to c2b2 server /ifs/scratch/c2b2/cz_lab/web_data/galaxy_tmp
+# Then specify the link to the file in galaxy following the example below:
+# https://zhanglab.c2b2.columbia.edu/data/galaxy_tmp/HepG2.RBFOX2.rep1.R2.fastq.gz
+
+
+test
+----
+cd /home/galaxy/galaxy_test/ctk
+rsync -avzP yq2139@intron.c2b2.columbia.edu://mnt/chromatin/archive_proj/cz2294/CLIP_comparison/BrainRbfox/fastq/Fox1_1.fastq.gz .
+#Fox1_1.fastq.gz is std; Fox1_3.fastq.gz is brdu
+
+gzip -cd Fox1_3.fastq.gz |head -1000000 > Fox1_3.1000000lines.raw.fastq
+perl /home/galaxy/tools/CTK/fastq_filter.pl -v -if sanger -f mean:0-38:20 -maxN -1 -of fastq Fox1_3.1000000lines.raw.fastq Fox1_3.1000000lines.filtered.fastq >& fastqfilter.log
+fastx_clipper -a TCGTATGCCGTCTTCTGCTTG  -l 29 -n -v -i Fox1_3.1000000lines.filtered.fastq -o Fox1_3.1000000lines.trim1.fastq >& fastx_clipper.log
+fastq_quality_trimmer -i Fox1_3.1000000lines.trim1.fastq -v -t 5 -l 29 -o Fox1_3.1000000lines.trim2.fastq >& fastq_quality_trimmer.log
+perl /home/galaxy/tools/CTK/fastq2collapse.pl Fox1_3.1000000lines.trim2.fastq -v Fox1_3.1000000lines.trim2.c.fastq >& fastq2collapse.log
+perl /home/galaxy/tools/CTK/stripBarcode.pl -v -format fastq -len 14 Fox1_3.1000000lines.trim2.c.fastq Fox1_3.1000000lines.trim2.c.tag.fastq >& stripBarcode.log
+/home/galaxy/tools/bwa-0.7.12/bwa aln -t 4 -n 0.06 -q 20 /home/galaxy/galaxy_data/genomes/mm10/bwa/mm10.fa Fox1_3.1000000lines.trim2.c.tag.fastq > Fox1_3.1000000lines.sai
+# cannot redirect the output of bwa aln to a log file, otherwise the sai file is empty
+/home/galaxy/tools/bwa-0.7.12/bwa samse  /home/galaxy/galaxy_data/genomes/mm10/bwa/mm10.fa Fox1_3.1000000lines.sai Fox1_3.1000000lines.trim2.c.tag.fastq > Fox1_3.1000000lines.sam
+perl /home/galaxy/tools/CTK/parseAlignment.pl -v --map-qual 1 --min-len 18 --mutation-file Fox1_3.1000000lines.mutation.txt Fox1_3.1000000lines.sam Fox1_3.1000000lines.tag.bed >& parseAlignment.log
+perl /home/galaxy/tools/CTK/tag2collapse.pl -v -big --random-barcode -EM 30 --seq-error-model alignment -weight --weight-in-name --keep-max-score --keep-tag-name Fox1_3.1000000lines.tag.bed Fox1_3.1000000lines.tag.uniq.bed >& tag2collapse.log
+perl /home/galaxy/tools/CTK/selectRow.pl -q 3 -f 3 Fox1_3.1000000lines.mutation.txt  Fox1_3.1000000lines.tag.uniq.bed   > Fox1_3.1000000lines.tag.uniq.mutation.txt
+perl /home/galaxy/tools/CTK/bed2annotation.pl -conf /home/galaxy/tools/CTK/annotation.loc -dbkey mm10 -ss -big -region  -v Fox1_3.1000000lines.tag.uniq.bed  Fox1_3.1000000lines.tag.uniq.annot.txt >& annot.log
+perl /home/galaxy/tools/CTK/tag2peak.pl -big -ss -v --valley-seeking -p 0.05 --valley-depth 0.9 --dbkey mm10 --multi-test Fox1_3.1000000lines.tag.uniq.bed Fox1_3.1000000lines.pool.tag.uniq.peak.sig.bed >& tag2peak.log
+perl /home/galaxy/tools/CTK/tag2profile.pl -v -ss -exact -of bedgraph  Fox1_3.1000000lines.tag.uniq.bed Fox1_3.1000000lines.tag.uniq.bedgraph >& bedgraph_tag2profile.log
+awk '{if($9=="-") {print $0}}'  Fox1_3.1000000lines.tag.uniq.mutation.txt | cut -f 1-6 > Fox1_3.1000000lines.tag.uniq.del.bed
+perl /home/galaxy/tools/CTK/CIMS.pl -n 10 -p -v Fox1_3.1000000lines.tag.uniq.bed Fox1_3.1000000lines.tag.uniq.del.bed Fox1_3.1000000lines.tag.uniq.del.CIMS.txt >& cims.log
+perl /home/galaxy/tools/CTK/removeRow.pl -q 3 -f 3 -v Fox1_3.1000000lines.tag.uniq.bed Fox1_3.1000000lines.tag.uniq.del.bed > Fox1_3.1000000lines.tag.uniq.clean.bed
+perl /home/galaxy/tools/CTK/bedExt.pl -n up -l -1 -r -1 -v Fox1_3.1000000lines.tag.uniq.clean.bed Fox1_3.1000000lines.tag.uniq.clean.trunc.bed >& bedExt.log
+perl /home/galaxy/tools/CTK/tag2cluster.pl -big -s -maxgap "-1" -of bed -v Fox1_3.1000000lines.tag.uniq.bed Fox1_3.1000000lines.tag.uniq.cluster.0.bed >& tag2cluster.log
+awk '{if($5>2) {print $0}}' Fox1_3.1000000lines.tag.uniq.cluster.0.bed > Fox1_3.1000000lines.tag.uniq.cluster.bed
+perl /home/galaxy/tools/CTK/tag2peak.pl -big -ss -v --prefix "CITS" -gap 25 -p 0.001 -gene Fox1_3.1000000lines.tag.uniq.cluster.bed Fox1_3.1000000lines.tag.uniq.clean.trunc.bed Fox1_3.1000000lines.tag.uniq.clean.CITS.s30.bed >& CITS.log
+
+check
+-----
+cd /home/yq2139/mnt_prj/CTK_testing/filtering
+diff <(cat /home/galaxy/galaxy_test/ctk/testing_std/Fox1_1.1000000lines.filtered.fastq) <(head  -982696 Fox1_1.fastq)|head
+diff <(cat /home/galaxy/galaxy_test/ctk/testing_std/Fox1_1.1000000lines.trim.fastq) <(head  -895544 Fox1_1.trim.fastq)|head
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parseAlignment.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,83 @@
+<tool id="parseAlignment" name="Parse alignment">
+  <description>in SAM format to BED format</description>
+
+  <command interpreter="perl">
+	/home/galaxy/tools/CTK/parseAlignment.pl -v
+	#if $MAPQ.MAPQRequired == "yes":
+		--map-qual $MAPQ.MapQual
+	#end if
+	#if $MinLen.MinLenRequired == "yes":
+		--min-len $MinLen.MinimalReadLength
+	#end if
+	#if $IndeltoEnd.IndeltoEndRequired == "yes":
+		--indel-to-end $IndeltoEnd.NucleotidesfromIndeltoEnd
+	#end if
+	$SplitDel $IndelinScore --mutation-file $outputMutationFile $input $outputBedFile
+  </command>
+
+  <inputs>
+        <param name="input"  format="sam"  type="data" label="Input SAM file (.gz file accepted)" />
+	<conditional name="MAPQ">
+                <param name="MAPQRequired" type="select" label="Min MAPQ score (e.g. to keep only uniquely mapped reads)">
+                <option value="yes" selected="True">Yes</option>
+                <option value="no">No</option>
+                </param>
+                <when value="yes">
+                        <param name="MapQual" type="integer" value="1" label="Input an integer" />
+                </when>
+                <when value="no">
+                </when>
+        </conditional>
+
+        <conditional name="MinLen">
+                <param name="MinLenRequired" type="select" label="Minimal read length to report">
+                <option value="yes" selected="True">Yes</option>
+                <option value="no">No</option>
+                </param>
+                <when value="yes">
+                        <param name="MinimalReadLength" type="integer" value="18" label="Input an integer" />
+                </when>
+                <when value="no">
+                </when>
+        </conditional>
+
+        <conditional name="IndeltoEnd"> <!-- when "yes", adds the indel-to-end option with the integer below to the command -->
+                <param name="IndeltoEndRequired" type="select" label="Nucleotides from indel to end">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+                </param>
+                <when value="yes">
+                        <param name="NucleotidesfromIndeltoEnd" type="integer" value="5" label="Input an integer" /> <!-- pre-filled with the default stated in the help text (5 nt) instead of an empty integer -->
+                </when>
+                <when value="no">
+                </when>
+        </conditional>
+
+        <param name="SplitDel" type="boolean" truevalue="--split-del" falsevalue="" checked="no" label="Split oligo deletion into single nucleotides"/>
+        <param name="IndelinScore" type="boolean" truevalue="--indel-in-score" falsevalue="" checked="no" label="Count indels in as mismatches reported in the score column"/>
+  </inputs>
+
+  <outputs>
+	<data name="outputBedFile" format="bed" label="Parse tag bed file on ${on_string}"></data>
+	<data name="outputMutationFile" format="tabular" label="Parse tag mutation file on ${on_string}"></data>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will parse alignment in SAM format to BED format (both tags and mutations).
+
+It will take as input files in SAM format after alignment and output BED files that keep only unique mappings (with MAPQ >= 1, specific for BWA) and a minimal mapping size of 18 nt. It will also generate mutation files.
+
+Note 1: In the tag bed file, the 5th column (score) records the number of mismatches (substitutions) in each read.
+
+Note 2: The parsing script relies on MD tags, which is an optional field without strict definition in SAM file format specification. Some aligners might have a slightly different format than bwa in how they report mismatches.
+
+Note 3: Other aligners might not use a positive MAPQ as an indication of unique mapping. Another useful option is "nucleotides from indel to end", which specifies the number of nucleotides towards the end from which indels should not be called (default=5 nt).
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/removeLowQualityBases.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,39 @@
+<tool id="removeLowQualityBases" name="Remove low quality bases">
+    <description/>
+    <!-- Thin wrapper: each parameter below is substituted directly into the fastq_quality_trimmer command line. -->
+    <command>
+	fastq_quality_trimmer -i $input -v -l $MinimumLength $CompressOutput -t $Qualitythreshold -o $output
+    </command>
+
+    <inputs>
+        <param name="input" type="data" format="fastq" label="Input FASTQ file" />
+        <param name="Qualitythreshold" type="integer" value="5" label="Quality threshold - nucleotides with lower quality will be trimmed (from the end of the sequence)" />
+        <param name="MinimumLength" type="integer" value="29" label="Minimum length - sequences shorter than this (after trimming) will be discarded (0 = no minimum length)" />
+        <param name="CompressOutput" type="boolean" checked="no" truevalue="-z" falsevalue="" label="Compress output with GZIP" />
+    </inputs>
+
+    <outputs>
+        <data name="output" format="fastq" />
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool (fastq_quality_trimmer) will remove extremely low quality bases (e.g. score less than 5).
+
+It will take as input FASTQ files and output FASTQ files with low quality bases removed.
+
+It is a part of the Fastx Toolkit.
+
+-----
+
+**Parameter suggestion for minimum length**
+
+* For standard CLIP: discard sequences shorter than 20 nucleotides.
+* For BrdU CLIP: discard sequences shorter than 29 nucleotides.
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/removeRNA.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,34 @@
+<tool id="removeRNA" name="Remove tags from rRNA">
+    <description>and other repetitive RNA</description>
+    <command interpreter="perl">
+	/home/galaxy/tools/CTK/tagoverlap.pl -v  $big -region /home/galaxy/tools/CTK/annotation/genomes/${genome}/annotation/rmsk.RNA.bed  $completeOverlap $reverse $keepTagName $keepScore $sepStr $nonRedundant $input $output
+    </command>
+    <inputs>
+        <param name="input" format="bed" type="data" label="Input BED file (.gz files accepted)" />
+        <param name="genome" type="select" label="Genome build name" >
+            <option value="hg19">hg19</option>
+            <option value="mm10">mm10</option>
+        </param>
+        <param name="big" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="Either region or tag file is big" />
+        <param name="completeOverlap" type="boolean" truevalue="--complete-overlap" falsevalue="" checked="yes" label="Requires complete overlap of the tag with the region" />
+        <param name="reverse" type="boolean" truevalue="-r" falsevalue="" checked="yes" label="Reverse mode to print tags without overlap with the region" />
+        <param name="keepTagName" type="boolean" truevalue="--keep-tag-name" falsevalue="" checked="yes" label="Keep tag name" />
+        <param name="keepScore" type="boolean" truevalue="--keep-score" falsevalue="" checked="yes" label="Keep tag score" />
+        <param name="sepStr"  type="boolean" truevalue="-ss" falsevalue="" checked="no" label="Separate strand" />
+        <param name="nonRedundant" type="boolean" truevalue="--non-redundant" falsevalue="" checked="no" label="Remove duplicate tags in output" />
+    </inputs>
+    <outputs>
+        <data name="output" format="bed" label="Remove tags from rRNA on ${on_string}">
+        </data>
+    </outputs>
+    <!-- The region file is the build-specific rmsk.RNA.bed under the CTK annotation directory; "genome" selects which build is used. -->
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Remove tags from rRNA and other repetitive RNA. This is done because ribosomal RNA is a contaminant.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/removeRow.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,34 @@
+<tool id="removeRow" name="Remove CLIP tags with deletions">
+    <description/>
+    <!-- The script's standard output is redirected into the output dataset. -->
+    <command interpreter="perl">
+	/home/galaxy/tools/CTK/removeRow.pl -q $QueryColumnID -f $FilterColumnID $IgnoreCase $ReverseMode  $input1 $input2  &gt; $output
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="bed" label="Input file in BED format of unique CLIP tags" />
+        <param name="input2" type="data" format="bed" label="Input file in BED format of CIMS" />
+
+        <param name="QueryColumnID" type="integer" value="3" label="Query column id (zero-based)" />
+        <param name="FilterColumnID" type="integer" value="3" label="Filter column id (zero-based)" />
+
+        <param name="IgnoreCase" type="boolean" checked="no" truevalue="-i" falsevalue="" label="Ignore case" />
+        <param name="ReverseMode" type="boolean" checked="no" truevalue="-r" falsevalue="" label="Reverse mode" />
+    </inputs>
+
+    <outputs>
+        <data name="output" format="bed" />
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Since the vast majority of deletions are introduced because of cross linking, tags with deletions are treated as read-through tags and removed.
+
+This tool will remove selected rows from a file. Here it will output file in BED format in which CLIP tags with deletions are removed.
+
+Note: The parameters 3 and 3 indicate the columns in the two input files.
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/stripBarcode.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,84 @@
+<tool id="stripBarcode" name="Strip 5' degenerate barcode">
+    <description></description>
+    <command interpreter="perl">
+	/home/galaxy/tools/CTK/stripBarcode.pl -len $Len -format $InputFormat
+	#if $LinkerStart.startRequired == "yes":
+		--barcode-start-with $LinkerStart.startwith
+	#end if
+	#if $LinkerEnd.endRequired == "yes":
+		--barcode-end-with $LinkerEnd.endwith
+	#end if
+	-v $input $output
+    </command>
+
+    <inputs>
+        <param name="InputFormat" type="select" label="Input FASTA or FASTQ format?">
+            <option value="fastq">FASTQ</option>
+            <option value="fasta">FASTA</option>
+        </param>
+        <!-- change_format is only valid on output data elements, so the input simply accepts both FASTQ and FASTA; "InputFormat" is what is passed to the script. -->
+        <param name="input" type="data" format="fastq,fastq.gz,fasta,fasta.gz" label="Input file that contains 5' degenerate barcodes (.gz file accepted)" />
+
+        <param name="Len" type="integer" value="" label="Length of degenerate barcode (also include sample index)" />
+
+        <conditional name="LinkerStart">
+            <param name="startRequired" type="select" label="Filter sequences based on the starting nucleotides in the barcode">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param name="startwith" type="text" value="" label="Starting nucleotides (case sensitive)" />
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="LinkerEnd">
+            <param name="endRequired"  type="select" label="Filter sequences based on the ending nucleotides in the barcode">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param name="endwith" type="text" value="" label="Ending nucleotides (case sensitive)" />
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data name="output" format="fastq" label="Strip 5' degenerate barcode on ${on_string}">
+            <change_format>
+                <when input="InputFormat" value="fasta" format="fasta" />
+            </change_format>
+        </data>
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+The 5' degenerate barcode is used to distinguish PCR duplicates from bona fide unique tags mapped to the same coordinates. This tool will strip random barcode sequences and attach them to the sequence ids.
+
+It will take as input files in FASTQ or FASTA format that contain 5' degenerate barcodes and output files in the same format with barcodes removed and attached to sequence ids.
+
+If you have extra barcodes next to the degenerate barcode, include the barcode in the length.
+
+Examples::
+
+        >seq1
+        ACGTATTTTTTT
+
+will become::
+
+        >seq1#ACGTA
+        TTTTTTT
+
+-----
+
+**Filter sequences**
+
+You can also filter your sequences based on the starting or ending nucleotides in the barcode, which is useful for quality control purposes or multiplex samples.
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tag2cluster.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,26 @@
+<tool id="tag2cluster" name="Cluster genomic regions">
+    <command interpreter="perl">
+		/home/galaxy/tools/CTK/tag2cluster.pl -v $bigFile $strand -maxGap $MaxGap $input $output
+    </command>
+    <inputs>
+        <param name="input" type="data" format="bed" label="Input BED file of unique CLIP tags (.gz file accepted)" />
+        <!-- Each checkbox contributes its flag only when ticked; the unticked value is an empty string. -->
+        <param name="bigFile" type="boolean" checked="no" truevalue="-big" falsevalue="" label="This is a big file (over 6M lines)" />
+        <param name="strand" type="boolean" checked="yes" truevalue="-s" falsevalue="" label="Same strand required" />
+        <param name="MaxGap" type="integer" value="-1" label="The max gap to be considered as an overlap" />
+
+    </inputs>
+    <outputs>
+        <data name="output" format="bed" />
+    </outputs>
+    <help>
+.. class:: infomark
+
+**What it does**
+
+This tool will cluster overlapping tags or nearby regions into clusters and report tag count in each cluster.
+It will take as input files in BED format of unique CLIP tags and output files in BED format of clustered overlapping CLIP tags.
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tag2collapse.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,102 @@
+<tool id="tag2collapse" name="Collapse PCR duplicates">
+    <description>using coordinates</description>
+
+    <command interpreter="perl">
+		/home/galaxy/tools/CTK/tag2collapse.pl $bigFile -v --weight-in-name --keep-max-score --keep-tag-name
+
+		#if $randomBarcode.hasRandomBarcode == "yes":
+		--random-barcode -EM $randomBarcode.confidence --seq-error-model $randomBarcode.seqErrorModel
+		#end if
+
+		$weightFlag $input $output
+    </command>
+    <inputs>
+        <param type="data" format="bed" name="input" label="Input BED file"/>
+        <param name="bigFile" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="Big file (over 6M lines)" />
+        <param name="weightFlag" type="boolean" truevalue="-weight" falsevalue="" checked="yes" label="Consider the weight of each tag - each read has a weight representing its exact copy number in the raw data (see help below for more information)" />
+        <!-- The barcode options below reach the command line only when hasRandomBarcode is "yes". -->
+
+        <conditional name="randomBarcode">
+            <param name="hasRandomBarcode" type="select" label="Is there degenerate barcode (i.e., UMI) attached to the id? (no collapse for different barcodes; see help below for more information)">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="seqErrorModel" type="select" label="How should sequencing error be estimated with sequencing error model?">
+                    <option value="alignment" selected="yes">From mismatches in alignment</option>
+                    <option value="em-local">From degenerate barcode using an EM algorithm</option>
+                </param>
+                <param name="confidence" type="integer" value="30" label="Confidence score for the EM algorithm" />
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data name="output" format="bed" label="Collapse PCR duplicates on ${on_string}"/>
+    </outputs>
+
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool collapses tags according to the start position.
+
+It will take as input files in BED format of tags and output files in BED format of unique tags to eliminate potential PCR duplicates.
+
+It can run in two modes:
+
+1. No degenerate barcode.  In this mode, tags with the same starting coordinates are collapsed and only one is kept.
+2. With degenerate barcode.  In this mode, tags that are mapped to the same position but carry different degenerate barcodes still have a chance to be kept (see below for details).
+
+-----
+
+**Consider the weight of each tag if you collapsed exact duplicates (reads with exactly the same sequences)**
+
+The tool will then consider a "weight" that represents the copy number of each tag.  This weight can be given in the score (5th) column, or attached to the tag NAME before the degenerate barcode sequence (e.g. READ1#10#ACGTA).
+
+-----
+
+.. class:: warningmark
+
+**Input data format (important)**
+
+The input file is the unambiguously mappable tags in a BED file.  However, the tool may need extra information embedded in the BED file depending on the parameters you use. These pieces of extra information are already there if you use the alignment tool provided.
+
+First, the tool tries to keep track of the copy number of each tag, if you collapsed exact duplicates before alignment (which is always recommended).  By default, the copy number was attached to the sequence name (4th column).  Therefore, a sequence id might read like this::
+
+        tag1#3
+
+which means tag1 has 3 exact copies and tag1 is the representative of the three. If your sample has a degenerate barcode, the barcode sequence is also attached to the sequence id, after the copy number, so that a sequence id might read like this::
+
+        tag1#3#AAAGG
+
+If you check "has weight", but do not check "Weight in name", the tool will use the number in the score (5th) column as the copy number. When your data do not have degenerate barcode, the tool will collapse all reads with the same genomic starting coordinates and sum up the copy number of tags in each position and save the information in the score column of the output unique tag BED file.  In this case, whether the copy number information was provided correctly or not will not affect the number and identity of the unique tags it reports, so it does not matter much if you do not care about the total copy number.
+
+However, if your sample has degenerate barcode, it is CRITICAL to specify the information correctly.
+
+
+.. class:: warningmark
+
+Update of input format in the new version (11/22/2010):
+
+A new method is introduced to give more accurate estimate of sequencing error, which is important for samples with degenerate barcode. Now there are two options to estimate sequencing errors, either from the degenerate barcode iteratively in the EM algorithm (the original method), or from mismatches detected during alignment (the new method). To use the new method, the number of mismatches has to be provided in the score column of the BED file (which is already there if you used the alignment tool provided).  In this case, if you collapse exact duplicates before alignment, the copy number must be attached to the sequence id (so that you need to check both "has weight" and "has weight in name").
+
+The results from the two methods are not dramatically different.  For new analysis, the new method (estimate sequencing error from alignment) is recommended and a confidence score of 30 should be suitable in most cases. However, if you want to keep your analysis consistent with previous data, you can still use the original method.
+
+The new method is set to be default on April 8, 2011.
+
+-----
+
+**How this tool determines unique reads with degenerate barcode**
+
+If the raw reads have random barcodes attached to the 5' end of each read, the barcode has to be stripped before alignment.  The barcode is attached to NAME of each tag (e.g., READ1 will become READ1#ACGTA), which will be used here to determine tags that have the same starting coordinates, but have "sufficiently" distinct barcodes. This task is not trivial given that some tags can have thousands of copies in some CLIP experiments, so that sequencing errors are not negligible.
+
+To deal with the problem, the program uses an iterative Expectation-Maximization algorithm that estimates sequencing errors in degenerate barcode, or uses the sequencing error estimated from the mismatches detected during alignment (new, 11/22/2010). It models the copy number and identity of each barcode sequence, and infers which tag is generated by sequencing errors that lead to apparently different barcode.  The confidence measures whether each tag with the observed barcode is "bona fide" in the CLIP library, or generated by sequencing error. To be consistent with other sequence alignment programs, -log10(P) * 10  is reported, so that a score of 50 means the tag has a chance of 10^(-5) to be generated by sequencing error.  The confidence score is in the score (5th) column of the output. If all tags mapped to a position have the same barcode (single-copy tag is a special case), an arbitrary score of 100 is given. Therefore, a confidence score threshold > 100 should never be used.
+
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tag2peak.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,151 @@
+<tool id="tag2peak" name="Find significant peaks">
+    <description>from a BED file with tags</description>
+    <command interpreter="perl">
+		/home/galaxy/tools/CTK/tag2peak.pl -v $bigFile  $separateStrand
+		#if $valleySeeking.requireVS == "yes":
+			--valley-seeking --valley-depth $valleySeeking.valleyDepth
+		#end if
+			--out-boundary $outputClusterBoundaries --out-half-PH $outputHalfPeakHeightBoundaries
+		#if $key.dbkeyrequired == "yes":
+			--dbkey $key.dbkey
+		#end if
+		#if $pvalue.requireP == "yes":
+			-p $pvalue.p $pvalue.multiTest
+		#end if
+		#if $minPeakHeight.minPeakHeightrequired == "yes":
+			-minPH $minPeakHeight.minHeight
+		#end if
+		#if $maxPeakHeight.maxPeakHeightrequired == "yes":
+			-maxPH $maxPeakHeight.maxHeight $maxPeakHeight.skipPeaks
+		#end if
+		#if $gap.gaprequired == "yes":
+		-gap $gap.mingap
+		#end if
+		$input  $outputPeaksCalled
+    </command>
+    <!-- Each conditional below adds its option to the command line only when its selector is "yes"; the two boundary outputs are always requested. -->
+    <inputs>
+        <param type="data" format="bed" name="input" label="Input BED file of unique tags"/>
+        <param name="bigFile" type="boolean" truevalue="-big" falsevalue="" checked="yes" label="Big file (over 6M lines)" />
+        <param name="separateStrand" type="boolean" truevalue="-ss" falsevalue="" checked="yes" label="Consider the two strands separately" />
+        <conditional name="valleySeeking">
+            <param name="requireVS" type="select" label="Find candidate peaks by valley seeking?">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="valleyDepth" type="float" value="0.9" label="The depth of the valley relative to the peak. Valley-depth must be between 0.5 and 1. We choose 0.9 (default) as a more stringent value."/>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="pvalue">
+            <param name="requireP" type="select" label="Significant peak calling?">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="p" type="float" value="0.05" label="Threshold of p-value to call peak" />
+                <param name="multiTest" type="boolean" truevalue="--multi-test" falsevalue="" checked="yes" label="Do Bonferroni multiple test correction?" />
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="key">
+            <param name="dbkeyrequired" type="select" label="Use default built-in gene model to retrieve the default gene bed file?">
+                <option value="yes">Yes</option>
+                <option value="no">No</option>
+            </param>
+            <when value="yes">
+                <param name="dbkey" type="select" label="Genome build name">
+                    <option value="hg19">hg19</option>
+                    <option value="mm10">mm10</option>
+                </param>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="minPeakHeight">
+            <param name="minPeakHeightrequired" type="select" label="Filter by min peak height?">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param  name="minHeight" type="integer" value="" label="Input an integer equal or greater than 2"/>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="maxPeakHeight">
+            <param name="maxPeakHeightrequired" type="select" label="Filter by max peak height?">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param name="maxHeight" type="integer" value="" label="Input an integer"/>
+                <param name="skipPeaks" type="boolean" truevalue="--skip-out-of-range-peaks" falsevalue="" checked="no" label="Skip peaks with PH greater than maxPH"/>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+        <conditional name="gap">
+            <param name="gaprequired" type="select" label="Merge peaks close to each other by specifying the distance between the peaks">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param name="mingap" type="integer" value="" label="Minimum gap allowed between peaks (25 is recommended when merging)" >
+                </param>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data name="outputPeaksCalled" format="bed" label="Cluster peak on ${on_string}" />
+        <data name="outputClusterBoundaries" format="bed" label="Cluster boundaries on ${on_string}" />
+        <data name="outputHalfPeakHeightBoundaries" format="bed" label="Cluster half peak height boundaries on ${on_string}" />
+    </outputs>
+
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will detect peaks from CLIP data.
+
+It will take unique CLIP tags in BED format as input files and detect peak of CLIP tag clusters that are statistically above the uniform background.
+
+The uniform background can be determined in two ways. If gene expression level is not provided, this tool assumes each gene will have the same number of tags, but in random positions in the gene. If gene expression level is provided (in the score column of the gene BED file), this tool will calculate the expected tag number in each gene based on the gene expression level, which are then distributed randomly across the gene.
+
+Note: If valley seeking is not enabled, a dbkey must be specified, and peak merging is strongly recommended.
+
+-----
+
+**Output files**
+
+1. Files in BED format of cluster peaks called
+2. Files in BED format of cluster boundaries
+3. Files in BED format of cluster half peak height boundaries
+
+-----
+
+.. class:: warningmark
+
+**Input files**
+
+The gene BED file can be simple intervals or transcripts with exons and introns. Each gene should have one transcript (can be generated by the "Collapse transcripts" tool).
+
+When transcripts with exons and introns are provided, only the exonic regions are used to estimate the background.
+
+If you want to specify gene expression information, it has to be in the score column of the Gene BED file and the expression level MUST be in the linear scale (e.g. RPKM from RNA-seq or intensity from microarrays).  Also negative gene expression values are not allowed.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tag2profile.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,23 @@
+<tool id="tag2profile" name="Generate bedgraph for visualization">
+    <description/>
+    <command interpreter="perl">
+	/home/galaxy/tools/CTK/tag2profile.pl $bigFlag -v $ssFlag -exact -of bedgraph $weight  $input $output
+    </command>
+    <inputs>
+        <param name="input" type="data" format="bed" label="Input BED file (.gz file is allowed)" />
+        <param name="bigFlag" type="boolean" checked="yes" truevalue="-big" falsevalue="" label="Big file" />
+        <param name="ssFlag" type="boolean" checked="yes" truevalue="-ss" falsevalue="" label="Separate strand" />
+        <param name="weight" type="boolean" checked="no" truevalue="-weight" falsevalue="" label="Weight counts according to the score of each tag " />
+    </inputs>
+    <outputs>
+        <data name="output" format="bedgraph" label="Generate bedgraph for visualization on ${on_string}">
+        </data>
+    </outputs>
+    <help>
+.. class:: infomark
+
+**What this tool does**
+
+This tool will take as input files in BED format and output bedgraph files for visualization in the genome browser.
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimming3.xml	Mon Apr 30 05:25:11 2018 -0400
@@ -0,0 +1,61 @@
+<tool id="trimming3" name="Trim 3' adapter">
+    <description></description>
+    <!-- fastx_clipper output is piped into fastq_quality_trimmer and the clipper's stderr is discarded; DiscardShort is the minimum length (-l) for both programs. -->
+    <command>
+	fastx_clipper -a $Adapter -l $DiscardShort $DiscardNonclipped $DiscardClipped $AdapterOnly $KeepUnknown
+	#if $minAdapterAlignment.Mrequired =="yes":
+		-M $minAdapterAlignment.Min
+	#end if
+	-v -i $input 2>/dev/null | fastq_quality_trimmer -v -l $DiscardShort $CompressOutput -t $Qualitythreshold -o $output
+    </command>
+
+    <inputs>
+        <param name="input" type="data" format="fastq" label="Input FASTQ file"/>
+
+        <param name="Adapter" type="text" value="" label="Adapter sequence (the 3' adapter will vary for different CLIP protocol variations)"/>
+        <param name="DiscardShort" type="integer" value="" label="Discard sequences shorter than N nucleotides (see help below for parameter suggestion)"/>
+        <param name="DiscardNonclipped" type="boolean" truevalue="-c" falsevalue="" checked="no" label="Discard non-trimmed sequences (i.e. - keep only sequences which contained the adapter)" />
+        <param name="DiscardClipped" type="boolean" truevalue="-C" falsevalue="" checked="no" label="Discard trimmed sequences (i.e. - keep only sequences which did not contain the adapter)" />
+        <param name="AdapterOnly" type="boolean" truevalue="-k" falsevalue="" checked="no" label="Report Adapter-Only sequences"/>
+        <param name="KeepUnknown" type="boolean" truevalue="-n" falsevalue="" checked="yes"  label="Keep sequences with unknown nucleotides"/>
+        <conditional name="minAdapterAlignment">
+            <param name="Mrequired" type="select" label="Require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't trim it.">
+                <option value="yes">Yes</option>
+                <option value="no" selected="True">No</option>
+            </param>
+            <when value="yes">
+                <param name="Min" type="integer" value="" label="Input the length"/>
+            </when>
+            <when value="no">
+            </when>
+        </conditional>
+        <param name="Qualitythreshold" type="integer" value="5" label="Quality threshold - nucleotides with lower quality will be trimmed (from the end of the sequence)"/>
+        <param name="CompressOutput" type="boolean" truevalue="-z" falsevalue="" checked="no" label="Compress output with GZIP"/>
+
+    </inputs>
+
+    <outputs>
+        <data name="output" format="fastq" label="Trim 3' adapter on ${on_string}"/>
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+For long reads that are common now, collapsing before trimming is not very helpful. Therefore, trim the 3' adapter first.
+
+This tool (fastx_clipper and fastq_quality_trimmer) will take as input FASTQ files and output FASTQ files with 3' adapters and extremely low quality bases (e.g. score less than 5) removed.
+
+Both programs are part of the FASTX Toolkit.
+
+-----
+
+**Parameter suggestion for discarding sequences**
+
+* For standard CLIP: discard sequences shorter than 20 nucleotides.
+* For BrdU CLIP: discard sequences shorter than 29 nucleotides.
+
+    </help>
+
+</tool>