# HG changeset patch
# User edward-kirton
# Date 1310695555 14400
# Node ID 6f967c3a3f7b8a97e8561eb48a1c3048d4b5ebab
Uploaded
diff -r 000000000000 -r 6f967c3a3f7b usearch/ublast.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/ublast.xml Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,132 @@
+
+Compare query sequences to database
+usearch_wrapper.sh
+usearch
+--db $lib.file
+--blast6out $outfile
+--query $input
+--quiet
+--usersort
+$libonly
+--evalue $evalue
+$nousort
+--maxtargets $maxtargets
+--maxlen $maxlen
+--minlen $minlen
+#if $alphabet.select == 'nucleo':
+$alphabet.rev
+--match $alphabet.match
+--mismatch $alphabet.mismatch
+#end if
+$trunclabels
+$output_rejects
+--id $id
+--maxaccepts $maxaccepts
+--maxrejects $maxrejects
+--bump $bump
+--stepwords $stepwords
+$optimal
+$fastalign
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it Does**
+
+Compare each sequence in the query set to the sequences in the library database. USEARCH can perform nucleotide searches
+(like BLASTN), protein searches (like BLASTP), and translated searches (like BLASTX). Amino acid queries vs nucleotide
+libraries (like tBLASTN) are not supported in this version.
+
+**The libonly option**
+
+Use the libonly option to indicate when query sequences should NOT be added to the reference library (i.e. similar to Blast, etc.).
+
+Unselecting this option invokes the clustering behavior, whereby query sequences without hits are added to the library.
+
+**Author**
+
+Robert C. Edgar (bob@drive5.com)
+
+**Manual**
+
+http://www.drive5.com/usearch/usearch_docs.html
+
+
+
diff -r 000000000000 -r 6f967c3a3f7b usearch/uc2fastax.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/uc2fastax.xml Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,23 @@
+
+Generate Fasta file from UC file
+usearch_wrapper.sh
+usearch --quiet --uc2fastax $uc_file --input $db_file --output $outfile
+
+
+
+
+
+
+
+
+**What it Does**
+
+Generates a Fasta file of the records in a UC file. The UC file may be filtered, for example to elminate low-abundance clusters.
+
+Example sequence Fasta header::
+
+ >5|99.5%|XXXX
+
+Where 5 is the cluster ID and 99.5% is the identity to the seed (will be * for the seed itself) and XXXX is the original read ID.
+
+
diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/uclust.xml Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,96 @@
+
+Cluster sequences by similarity
+usearch_wrapper.sh
+usearch --quiet --uc $uc_outfile
+#if $seedsout.select == 'true':
+--seedsout $seeds_outfile
+#end if
+--cluster $input
+$usersort
+--maxlen $maxlen
+--minlen $minlen
+#if $alphabet.select == 'nucleo':
+$alphabet.rev
+--match $alphabet.match
+--mismatch $alphabet.mismatch
+#end if
+$trunclabels
+$output_rejects
+--id $id
+--maxaccepts $maxaccepts
+--maxrejects $maxrejects
+--bump $bump
+--stepwords $stepwords
+$optimal
+$fastalign
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ seedsout['select'] == 'true'
+
+
+
+**What it Does**
+
+De novo clustering of sequences. To do a database search and cluster sequences which don't match, use usearch tool instead.
+
+**Output**
+
+This tool generates the usearch "uc" logfile and a fasta file of dereplicated sequences, named by cluster ID.
+
+**Author**
+
+Robert C. Edgar (bob@drive5.com)
+
+**Manual**
+
+http://www.drive5.com/usearch/usearch_docs.html
+
+
diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust_composition.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/uclust_composition.pl Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,211 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $usage = <<'ENDHERE';
+NAME:
+ uclust_composition.pl
+PURPOSE:
+ To generate a table of cluster composition by barcode and optionally filter clusters.
+INPUT:
+ --uc : UCLUST results (*.uc file)
+ --barcodes : barcode sequences in either Tabular or Fasta format (optional)
+ --min : minimum fraction of reads (per barcode) in order to be included in output (default=0.00001 for 0.001%)
+OUTPUT:
+ --obs : table of cluster and abundance (per barcode if provided)
+ --filtered : filter UC file
+AUTHOR/SUPPORT:
+ ESKirton@LBL.gov
+ENDHERE
+
+my ( $help, $uc_infile, $outfile, $barcodes_infile, $uc_outfile );
+my $min = 0.00001;
+GetOptions(
+ 'uc=s' => \$uc_infile,
+ 'filtered=s' => \$uc_outfile,
+ 'outfile=s' => \$outfile,
+ 'barcodes=s' => \$barcodes_infile,
+ 'min=f' => \$min,
+ 'help' => \$help );
+if ($help) { print $usage; exit; }
+die("Infile required\n") unless $uc_infile and -f $uc_infile;
+die("Outfile required\n") unless $outfile;
+
+# LOAD BARCODES
+my %barcode_labels = (); # barcode sequence => name
+my @barcode_labels = ('UNKNOWN'); # column order
+my %barcode_to_index = ( 'UNKNOWN' => 0 ); # barcode sequence => array index (in @barcode_labels and @obs)
+if ($barcodes_infile)
+{
+ my $barcode;
+ my $label;
+ my $is_fasta = 0;
+ open( IN, "<$barcodes_infile" )
+ or die("Unable to open barcodes file, $barcodes_infile: $!\n");
+ while ()
+ {
+ chomp;
+ next if /^#/;
+ next unless $_;
+ if (/^>(\S+)/)
+ {
+ $label = $1;
+ $is_fasta = 1;
+ while ()
+ {
+ chomp;
+ $barcode = $_;
+ last;
+ }
+ }
+ elsif ($is_fasta)
+ {
+ die("Barcodes Fasta file must have entire sequence on only one line.\n");
+ }
+ else
+ {
+ ( $barcode, $label ) = split(/\t/);
+ $label = $barcode unless $label;
+ }
+ $label =~ s/[_ ]/./g;
+ $barcode = uc($barcode);
+ push @barcode_labels, $barcode_labels{$barcode} = $label;
+ $barcode_to_index{$barcode} = $#barcode_labels;
+ }
+ close IN;
+}
+
+# PARSE UCLUST FILE
+# NOTE: BARCODES MAY BE DISCOVERED HERE (I.E. NOT PRESPECIFIED ABOVE)
+my $tot_unknown_barcodes = 0;
+# $obs[$cluster] = [ obs for barcode 0, obs for barcode 1, ... ] with order matching @labels
+my @obs = ();
+my ( $index, $cluster, $counter, $barcode_label );
+open( IN, "<$uc_infile" ) or die($!);
+while ()
+{
+ chomp;
+ my @row = split(/\t/);
+ my $code = $row[0];
+ next unless $code eq 'C' or $code eq 'H';
+ $cluster = $row[1];
+ $obs[$cluster] = [] if !defined( $obs[$cluster] );
+ my $read_id = $row[8];
+ if ( $read_id =~ /^\S+#([ATCGN]+)\/?\d?/ )
+ {
+ my $barcode = uc($1);
+ if ( exists( $barcode_labels{$barcode} ) )
+ {
+ $barcode_label = $barcode_labels{$barcode};
+ $index = $barcode_to_index{$barcode};
+ }
+ elsif ( ! $barcodes_infile )
+ {
+ push @barcode_labels, $barcode_label = $barcode_labels{$barcode} = $barcode;
+ $barcode_to_index{$barcode} = $index = $#barcode_labels;
+ }
+ else
+ {
+ $barcode_label = 'UNKNOWN';
+ $index = 0;
+ ++$tot_unknown_barcodes;
+ }
+ }
+ else
+ {
+ $barcode_label = 'UNKNOWN';
+ $index = 0;
+ ++$tot_unknown_barcodes;
+ }
+
+ # INCREMENT COUNTER
+ if ( !defined( $obs[$cluster]->[$index] ) )
+ {
+ $obs[$cluster]->[$index] = 1;
+ }
+ else
+ {
+ $obs[$cluster]->[$index] += 1;
+ }
+}
+close IN;
+
+# DISCARD UNKNOWN CATEGORY IF N/A
+unless ($tot_unknown_barcodes)
+{
+ shift @barcode_labels;
+ for ( my $cluster = 0; $cluster <= $#obs; ++$cluster )
+ {
+ my $a_cluster_obs = $obs[$cluster];
+ shift @$a_cluster_obs;
+ }
+}
+
+# SUM
+my @totals = split( //, '0' x scalar(@barcode_labels) ); # total reads per barcode
+for ( my $cluster = 0; $cluster <= $#obs; ++$cluster )
+{
+ for ( my $i = 0; $i <= $#barcode_labels; $i++ )
+ {
+ $totals[$i] += $obs[$cluster]->[$i] if defined($obs[$cluster]->[$i]);
+ }
+}
+
+# FILTER BY PERCENT
+my @filtered_clusters = ();
+my @filtered_obs = ();
+for ( my $cluster = 0; $cluster <= $#obs; ++$cluster )
+{
+ my $okay = 0;
+ for ( my $i = 0; $i <= $#barcode_labels; $i++ )
+ {
+ my $tot = $totals[$i];
+ my $ob = $obs[$cluster]->[$i];
+ if ( $ob and $ob / $tot > $min )
+ {
+ $okay = 1;
+ last;
+ }
+ }
+ if ($okay) {
+ push @filtered_clusters, $cluster;
+ my $cluster_obsAR=$obs[$cluster];
+ for ( my $i = 0; $i <= $#barcode_labels; $i++ )
+ {
+ $cluster_obsAR->[$i] = 0 if !defined( $cluster_obsAR->[$i] );
+ }
+ push @filtered_obs, $cluster_obsAR;
+ }
+}
+
+## GENERATE OBSERVED COUNTS TABLE FILE
+open( OUT, ">$outfile" ) or die("Unable to write outfile, $outfile: $!\n");
+print OUT join( "\t", "#CLUSTER", @barcode_labels ), "\n";
+for (my $i=0; $i<=$#filtered_clusters; $i++) {
+ print OUT join( "\t", $filtered_clusters[$i], @{$filtered_obs[$i]} ), "\n";
+}
+close OUT;
+
+## GENERATE UC OUTFILE
+my $next_cluster=shift @filtered_clusters;
+exit unless defined($next_cluster);
+open( IN, "<$uc_infile" ) or die($!);
+open(OUT, ">$uc_outfile") or die($!);
+while ()
+{
+ my @row = split(/\t/);
+ my $code = $row[0];
+ next unless $code eq 'C';
+ my $cluster = $row[1];
+ next unless $cluster eq $next_cluster;
+ print OUT $_;
+ last unless @filtered_clusters;
+ $next_cluster=shift @filtered_clusters;
+ last unless defined($next_cluster);
+}
+close IN;
+close OUT;
+
+exit;
diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust_composition.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/uclust_composition.xml Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,37 @@
+
+Create a table of cluster composition by barcode
+uclust_composition.pl --uc $uc_infile --outfile $outfile --min $min --filtered $uc_outfile
+#if $barcodes.select == 'true':
+-barcodes $barcodes.file
+#end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool parses UCLUST output and creates a table of cluster ID and number of members per barcode, plus a UC file with only the seeds of the remaining clusters (use uc2fastax to generate a Fasta file of seeds).
+
+Barcodes are expected to be encoded in the read ID as per Illumina naming convention.
+
+You may predefine the barcodes in order to specify human-readable labels, otherwise the actual sequences will be used.
+
+
diff -r 000000000000 -r 6f967c3a3f7b usearch/usearch_sort.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/usearch_sort.xml Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,31 @@
+
+Sort sequences by decreasing length
+usearch_wrapper.sh
+usearch --quiet $trunclabels --mergesort $input --output $output
+
+
+
+
+
+
+
+
+**What it Does**
+
+Sorts sequences by decreasing length.
+
+You should sort sequences before clustering because as the sequences are analyzed, unique sequences seed new clusters and
+it's best to use the longest sequence as the cluster representative. However for amplicons (i.e. rRNA OTU analysis),
+this is not the case. Instead, it is recommend that you sort by cluster abundance instead, as sequences with more copies
+will be least likely to contain sequencing errors. For this, use the 'usearch sort by abundance' tool.
+
+**Author**
+
+Robert C. Edgar (bob@drive5.com)
+
+**Manual**
+
+http://www.drive5.com/usearch/usearch_docs.html
+
+
+
diff -r 000000000000 -r 6f967c3a3f7b usearch/usearch_wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usearch/usearch_wrapper.sh Thu Jul 14 22:05:55 2011 -0400
@@ -0,0 +1,8 @@
+#!/bin/bash
+OUTPUT=`$* 2>&1`
+if [ $? -ne 0 ]
+then
+ echo "ERROR: $OUTPUT" 1>&2
+ exit
+fi
+echo $OUTPUT