# HG changeset patch # User Jim Johnson # Date 1372387028 18000 # Node ID 43724ea1c85fc15a7faec44f04259c07ce1f09d1 # Parent cca0838c15972b58a846d3da84b08fe4251bf538 Add cd-hit for protein fastas diff -r cca0838c1597 -r 43724ea1c85f cd_hit_est.xml --- a/cd_hit_est.xml Tue Feb 26 12:11:36 2013 -0600 +++ b/cd_hit_est.xml Thu Jun 27 21:37:08 2013 -0500 @@ -3,8 +3,13 @@ cd-hit + + cdhit_macros.xml + - cd-hit-est \$CDHIT_SITE_OPTIONS -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand + cd-hit-est -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand + #include source=$common_cdhit_options# + #include source=$runtime_tuning# @@ -22,6 +27,8 @@ + + @@ -29,12 +36,40 @@ + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + diff -r cca0838c1597 -r 43724ea1c85f cd_hit_protein.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cd_hit_protein.xml Thu Jun 27 21:37:08 2013 -0500 @@ -0,0 +1,115 @@ + + Cluster a protein dataset into representative sequences + + cd-hit + + + cdhit_macros.xml + + + cd-hit -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize + #include source=$common_cdhit_options# + #include source=$runtime_tuning# + + + + + + + + Suggested word size: + 5 for thresholds 0.7 ~ 1.0; + 4 for thresholds 0.6 ~ 0.7; + 3 for thresholds 0.5 ~ 0.6; + 2 for thresholds 0.4 ~ 0.5; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**CD-HIT** + +CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). 
Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database. + +.. _CD-HIT: http://www.bioinformatics.org/cd-hit/ + +------ + +**Inputs** + +cd-hit requires a protein fasta dataset as input. + +------ + +**Outputs** + +A fasta dataset containing representative sequences. + +A text file listing the mapping of sequences to the representative sequences:: + + >Cluster 0 + 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... * + >Cluster 1 + 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80% + 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84% + 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... * + 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84% + 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63% + >Cluster 2 + 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60% + 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... * + 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73% + 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69% + + + + diff -r cca0838c1597 -r 43724ea1c85f cdhit_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cdhit_macros.xml Thu Jun 27 21:37:08 2013 -0500 @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + if set to 0, it takes the fasta defline and stops at first space + + + + if set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster + + + + if set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60 + + + + + local sequence identity, calculated as: number of identical amino acids in alignment divided by the length of the alignment + You must set alignment coverage by length or fraction. 
+ + + + + + + + + if set to 0.9, the alignment must cover 90% of the sequence + + + + if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues + + + + if set to 0.9, the alignment must cover 90% of the sequence + + + + if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues + + + + alignment must cover at least this value for both sequences + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r cca0838c1597 -r 43724ea1c85f test-data/cd_hit_protein_in.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cd_hit_protein_in.fasta Thu Jun 27 21:37:08 2013 -0500 @@ -0,0 +1,50 @@ +>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>tr|K7D361|K7D361_PANTR Alcohol dehydrogenase 1B (Class I), beta polypeptide OS=Pan troglodytes GN=ADH1B PE=2 SV=1 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPAILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2 +MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT +PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP +RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG +YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ +NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF +DLLRSGKSIRTVLTF +>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV 
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA +AILKSSDVISFHCLGYNRILGGGCACCPFYLICD +>sp|P00338-5|LDHA_HUMAN Isoform 5 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVERVFT +E +>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3 +MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG +EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI +HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI +TDVVKMILKPDEEEKIKKSADTLWGIQKELQF +>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2 +MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI +SDVVKVTLTHEEEACLKKSADTLWGIQKELQF