# HG changeset patch # User john-mccallum # Date 1346987122 14400 # Node ID 84be1fe7e34a9e3f4949eaac2d73afb5b07d1a7a Uploaded diff -r 000000000000 -r 84be1fe7e34a count_cluster_size.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/count_cluster_size.xml Thu Sep 06 23:05:22 2012 -0400 @@ -0,0 +1,18 @@ + + + Get cluster size from DNAclust output + awk 'OFS="\t" {print$1, NF}' $inputClusterFile > $outputfile + + + + + + + + + +Returns the number of members in a cluster by counting columns from DNAclust output + + + + diff -r 000000000000 -r 84be1fe7e34a cut_dnaclust.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cut_dnaclust.xml Thu Sep 06 23:05:22 2012 -0400 @@ -0,0 +1,25 @@ + + + Remove clusters below a certain depth + cut -f $depth- $inputFile | sed '/^$/d' | sort | uniq > $outputfile + + + + + + + + + +.. class:: infomark + +**TIP** + +:: + +This tool simply cuts off columns from the left +e.g. set to 2 to remove singletons, 50 to remove clusters with fewer than 50 reads + + + + diff -r 000000000000 -r 84be1fe7e34a dnaclust.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dnaclust.xml Thu Sep 06 23:05:22 2012 -0400 @@ -0,0 +1,34 @@ + + + Cluster sequences into OTUs using DNAclust + dnaclust -s $similarity -i $inputFastaFile > $outputfile + + + + + + + + + + + + +.. class:: infomark + +**TIP** + +see the DNACLUST documentation at http://dnaclust.sourceforge.net/ + + + +Each line will contain the ids of the sequences in each cluster, and the first id of each line is the cluster representative. + +Example: To cluster a set of 16S rRNA fragments at 0.98 similarity use: +./dnaclust file.fasta -l -s 0.98 > clusters + +You can optionally specify a k-mer length for the filter. The longer k-mers use more memory. Also the filter will be more specific with longer k-mers. The default log_4(median length) should be good for most cases. 
+ + + diff -r 000000000000 -r 84be1fe7e34a dnaclust2tab.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dnaclust2tab.awk Thu Sep 06 23:05:22 2012 -0400 @@ -0,0 +1,10 @@ +#!/bin/awk -f +BEGIN { + FS="\t" + OFS="\t" +} +{ +OTU = $1 +{for (i=2;i + + Convert dnaclust to tabular + dnaclust2tab.awk $inputFile > $outputfile + + + + + + + +.. class:: infomark + +**TIP** + +This tool collapses dnaclust output into 2-column tabular form + + + + + diff -r 000000000000 -r 84be1fe7e34a fastaselect.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastaselect.xml Thu Sep 06 23:05:22 2012 -0400 @@ -0,0 +1,24 @@ + + + Get Fasta file of cluster centres from DNAclust output + cat inputClusterFile | fastaselect -c -f $inputFastaFile > $outputfile + + + + + + + + + + + + + +This tool returns a fasta file containing the subset of sequences from an input multi-fasta file that match a list of identifiers. + +It was developed as part of the DNACLUST package http://dnaclust.sourceforge.net/ for use in retrieving cluster centres but is handy for any extraction of a sequence subset from Galaxy tabular output. + + + +