# HG changeset patch # User edward-kirton # Date 1310695555 14400 # Node ID 6f967c3a3f7b8a97e8561eb48a1c3048d4b5ebab Uploaded diff -r 000000000000 -r 6f967c3a3f7b usearch/ublast.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/ublast.xml Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,132 @@ + +Compare query sequences to database +usearch_wrapper.sh +usearch +--db $lib.file +--blast6out $outfile +--query $input +--quiet +--usersort +$libonly +--evalue $evalue +$nousort +--maxtargets $maxtargets +--maxlen $maxlen +--minlen $minlen +#if $alphabet.select == 'nucleo': +$alphabet.rev +--match $alphabet.match +--mismatch $alphabet.mismatch +#end if +$trunclabels +$output_rejects +--id $id +--maxaccepts $maxaccepts +--maxrejects $maxrejects +--bump $bump +--stepwords $stepwords +$optimal +$fastalign + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it Does** + +Compare each sequence in the query set to the sequences in the library database. USEARCH can perform nucleotide searches +(like BLASTN), protein searches (like BLASTP), and translated searches (like BLASTX). Amino acid queries vs nucleotide +libraries (like tBLASTN) are not supported in this version. + +**The libonly option** + +Use the libonly option to indicate when query sequences should NOT be added to the reference library (i.e. similar to Blast, etc.). + +Unselecting this option invokes the clustering behavior, whereby query sequences without hits are added to the library. + +**Author** + +Robert C. Edgar (bob@drive5.com) + +**Manual** + +http://www.drive5.com/usearch/usearch_docs.html + + + diff -r 000000000000 -r 6f967c3a3f7b usearch/uc2fastax.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/uc2fastax.xml Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,23 @@ + +Generate Fasta file from UC file +usearch_wrapper.sh +usearch --quiet --uc2fastax $uc_file --input $db_file --output $outfile + + + + + + + + +**What it Does** + +Generates a Fasta file of the records in a UC file. The UC file may be filtered, for example to elminate low-abundance clusters. + +Example sequence Fasta header:: + + >5|99.5%|XXXX + +Where 5 is the cluster ID and 99.5% is the identity to the seed (will be * for the seed itself) and XXXX is the original read ID. + + diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/uclust.xml Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,96 @@ + +Cluster sequences by similarity +usearch_wrapper.sh +usearch --quiet --uc $uc_outfile +#if $seedsout.select == 'true': +--seedsout $seeds_outfile +#end if +--cluster $input +$usersort +--maxlen $maxlen +--minlen $minlen +#if $alphabet.select == 'nucleo': +$alphabet.rev +--match $alphabet.match +--mismatch $alphabet.mismatch +#end if +$trunclabels +$output_rejects +--id $id +--maxaccepts $maxaccepts +--maxrejects $maxrejects +--bump $bump +--stepwords $stepwords +$optimal +$fastalign + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + seedsout['select'] == 'true' + + + +**What it Does** + +De novo clustering of sequences. To do a database search and cluster sequences which don't match, use usearch tool instead. + +**Output** + +This tool generates the usearch "uc" logfile and a fasta file of dereplicated sequences, named by cluster ID. + +**Author** + +Robert C. Edgar (bob@drive5.com) + +**Manual** + +http://www.drive5.com/usearch/usearch_docs.html + + diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust_composition.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/uclust_composition.pl Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,211 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Getopt::Long; + +my $usage = <<'ENDHERE'; +NAME: + uclust_composition.pl +PURPOSE: + To generate a table of cluster composition by barcode and optionally filter clusters. +INPUT: + --uc : UCLUST results (*.uc file) + --barcodes : barcode sequences in either Tabular or Fasta format (optional) + --min : minimum fraction of reads (per barcode) in order to be included in output (default=0.00001 for 0.001%) +OUTPUT: + --obs : table of cluster and abundance (per barcode if provided) + --filtered : filter UC file +AUTHOR/SUPPORT: + ESKirton@LBL.gov +ENDHERE + +my ( $help, $uc_infile, $outfile, $barcodes_infile, $uc_outfile ); +my $min = 0.00001; +GetOptions( + 'uc=s' => \$uc_infile, + 'filtered=s' => \$uc_outfile, + 'outfile=s' => \$outfile, + 'barcodes=s' => \$barcodes_infile, + 'min=f' => \$min, + 'help' => \$help ); +if ($help) { print $usage; exit; } +die("Infile required\n") unless $uc_infile and -f $uc_infile; +die("Outfile required\n") unless $outfile; + +# LOAD BARCODES +my %barcode_labels = (); # barcode sequence => name +my @barcode_labels = ('UNKNOWN'); # column order +my %barcode_to_index = ( 'UNKNOWN' => 0 ); # barcode sequence => array index (in @barcode_labels and @obs) +if ($barcodes_infile) +{ + my $barcode; + my $label; + my $is_fasta = 0; + open( IN, "<$barcodes_infile" ) + or die("Unable to open barcodes file, $barcodes_infile: $!\n"); + while () + { + chomp; + next if /^#/; + next unless $_; + if (/^>(\S+)/) + { + $label = $1; + $is_fasta = 1; + while () + { + chomp; + $barcode = $_; + last; + } + } + elsif ($is_fasta) + { + die("Barcodes Fasta file must have entire sequence on only one line.\n"); + } + else + { + ( $barcode, $label ) = split(/\t/); + $label = $barcode unless $label; + } + $label =~ s/[_ ]/./g; + $barcode = uc($barcode); + push @barcode_labels, $barcode_labels{$barcode} = $label; + $barcode_to_index{$barcode} = $#barcode_labels; + } + close IN; +} + +# PARSE UCLUST FILE +# NOTE: BARCODES MAY BE DISCOVERED HERE (I.E. NOT PRESPECIFIED ABOVE) +my $tot_unknown_barcodes = 0; +# $obs[$cluster] = [ obs for barcode 0, obs for barcode 1, ... ] with order matching @labels +my @obs = (); +my ( $index, $cluster, $counter, $barcode_label ); +open( IN, "<$uc_infile" ) or die($!); +while () +{ + chomp; + my @row = split(/\t/); + my $code = $row[0]; + next unless $code eq 'C' or $code eq 'H'; + $cluster = $row[1]; + $obs[$cluster] = [] if !defined( $obs[$cluster] ); + my $read_id = $row[8]; + if ( $read_id =~ /^\S+#([ATCGN]+)\/?\d?/ ) + { + my $barcode = uc($1); + if ( exists( $barcode_labels{$barcode} ) ) + { + $barcode_label = $barcode_labels{$barcode}; + $index = $barcode_to_index{$barcode}; + } + elsif ( ! $barcodes_infile ) + { + push @barcode_labels, $barcode_label = $barcode_labels{$barcode} = $barcode; + $barcode_to_index{$barcode} = $index = $#barcode_labels; + } + else + { + $barcode_label = 'UNKNOWN'; + $index = 0; + ++$tot_unknown_barcodes; + } + } + else + { + $barcode_label = 'UNKNOWN'; + $index = 0; + ++$tot_unknown_barcodes; + } + + # INCREMENT COUNTER + if ( !defined( $obs[$cluster]->[$index] ) ) + { + $obs[$cluster]->[$index] = 1; + } + else + { + $obs[$cluster]->[$index] += 1; + } +} +close IN; + +# DISCARD UNKNOWN CATEGORY IF N/A +unless ($tot_unknown_barcodes) +{ + shift @barcode_labels; + for ( my $cluster = 0; $cluster <= $#obs; ++$cluster ) + { + my $a_cluster_obs = $obs[$cluster]; + shift @$a_cluster_obs; + } +} + +# SUM +my @totals = split( //, '0' x scalar(@barcode_labels) ); # total reads per barcode +for ( my $cluster = 0; $cluster <= $#obs; ++$cluster ) +{ + for ( my $i = 0; $i <= $#barcode_labels; $i++ ) + { + $totals[$i] += $obs[$cluster]->[$i] if defined($obs[$cluster]->[$i]); + } +} + +# FILTER BY PERCENT +my @filtered_clusters = (); +my @filtered_obs = (); +for ( my $cluster = 0; $cluster <= $#obs; ++$cluster ) +{ + my $okay = 0; + for ( my $i = 0; $i <= $#barcode_labels; $i++ ) + { + my $tot = $totals[$i]; + my $ob = $obs[$cluster]->[$i]; + if ( $ob and $ob / $tot > $min ) + { + $okay = 1; + last; + } + } + if ($okay) { + push @filtered_clusters, $cluster; + my $cluster_obsAR=$obs[$cluster]; + for ( my $i = 0; $i <= $#barcode_labels; $i++ ) + { + $cluster_obsAR->[$i] = 0 if !defined( $cluster_obsAR->[$i] ); + } + push @filtered_obs, $cluster_obsAR; + } +} + +## GENERATE OBSERVED COUNTS TABLE FILE +open( OUT, ">$outfile" ) or die("Unable to write outfile, $outfile: $!\n"); +print OUT join( "\t", "#CLUSTER", @barcode_labels ), "\n"; +for (my $i=0; $i<=$#filtered_clusters; $i++) { + print OUT join( "\t", $filtered_clusters[$i], @{$filtered_obs[$i]} ), "\n"; +} +close OUT; + +## GENERATE UC OUTFILE +my $next_cluster=shift @filtered_clusters; +exit unless defined($next_cluster); +open( IN, "<$uc_infile" ) or die($!); +open(OUT, ">$uc_outfile") or die($!); +while () +{ + my @row = split(/\t/); + my $code = $row[0]; + next unless $code eq 'C'; + my $cluster = $row[1]; + next unless $cluster eq $next_cluster; + print OUT $_; + last unless @filtered_clusters; + $next_cluster=shift @filtered_clusters; + last unless defined($next_cluster); +} +close IN; +close OUT; + +exit; diff -r 000000000000 -r 6f967c3a3f7b usearch/uclust_composition.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/uclust_composition.xml Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,37 @@ + +Create a table of cluster composition by barcode +uclust_composition.pl --uc $uc_infile --outfile $outfile --min $min --filtered $uc_outfile +#if $barcodes.select == 'true': +-barcodes $barcodes.file +#end if + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool parses UCLUST output and creates a table of cluster ID and number of members per barcode, plus a UC file with only the seeds of the remaining clusters (use uc2fastax to generate a Fasta file of seeds). + +Barcodes are expected to be encoded in the read ID as per Illumina naming convention. + +You may predefine the barcodes in order to specify human-readable labels, otherwise the actual sequences will be used. + + diff -r 000000000000 -r 6f967c3a3f7b usearch/usearch_sort.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/usearch_sort.xml Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,31 @@ + +Sort sequences by decreasing length +usearch_wrapper.sh +usearch --quiet $trunclabels --mergesort $input --output $output + + + + + + + + +**What it Does** + +Sorts sequences by decreasing length. + +You should sort sequences before clustering because as the sequences are analyzed, unique sequences seed new clusters and +it's best to use the longest sequence as the cluster representative. However for amplicons (i.e. rRNA OTU analysis), +this is not the case. Instead, it is recommend that you sort by cluster abundance instead, as sequences with more copies +will be least likely to contain sequencing errors. For this, use the 'usearch sort by abundance' tool. + +**Author** + +Robert C. Edgar (bob@drive5.com) + +**Manual** + +http://www.drive5.com/usearch/usearch_docs.html + + + diff -r 000000000000 -r 6f967c3a3f7b usearch/usearch_wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usearch/usearch_wrapper.sh Thu Jul 14 22:05:55 2011 -0400 @@ -0,0 +1,8 @@ +#!/bin/bash +OUTPUT=`$* 2>&1` +if [ $? -ne 0 ] +then + echo "ERROR: $OUTPUT" 1>&2 + exit +fi +echo $OUTPUT