Repository 'pileup_to_vcf'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjohnson/pileup_to_vcf

Changeset 0:3890f8ba0e4d (2013-02-18)
Next changeset 1:bb0ac51f1b02 (2013-02-18)
Commit message:
Uploaded
added:
pileup_to_vcf.py
pileup_to_vcf.xml
test-data/test.pileup
b
diff -r 000000000000 -r 3890f8ba0e4d pileup_to_vcf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pileup_to_vcf.py Mon Feb 18 11:32:50 2013 -0500
[
b'@@ -0,0 +1,263 @@\n+#!/usr/bin/env python\n+"""\n+#\n+#------------------------------------------------------------------------------\n+#                         University of Minnesota\n+#         Copyright 2012, Regents of the University of Minnesota\n+#------------------------------------------------------------------------------\n+# Author:\n+#\n+#  James E Johnson\n+#  Jesse Erdmann\n+#\n+#------------------------------------------------------------------------------\n+"""\n+\n+"""\n+Generate a VCF file from a samtools pileup\n+filtering on read coverage, base call quality, and allele frequency\n+\n+Pileup Format\n+http://samtools.sourceforge.net/pileup.shtml\n+Columns:\n+chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, base qualities\n+\n+VCF Format\n+http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41\n+The header line names the 8 fixed, mandatory columns. These columns are as follows:\n+CHROM POS ID REF ALT QUAL FILTER INFO\n+"""\n+\n+import sys,re,os.path\n+import optparse\n+from optparse import OptionParser\n+\n+vcf_header =  """\\\n+##fileformat=VCFv4.0\n+##source=pileup_to_vcf.pyV1.0\n+##INFO=<ID=DP,Number=1,Type=Integer,Description=\\"Total Depth\\">\n+##INFO=<ID=$SAF_header,Number=.,Type=Float,Description=\\"Specific Allele Frequency\\">\n+##FILTER=<ID=DP,Description=\\"Minimum depth of %s\\">\n+##FILTER=<ID=$SAF_header,Description=\\"Allele frequency of at least %s with base quality minimum %d\\">\n+#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\\\n+"""\n+\n+def __main__():\n+  #Parse Command Line\n+  parser = optparse.OptionParser()\n+  # files\n+  parser.add_option( \'-i\', \'--input\', dest=\'input\', help=\'The input pileup file (else read from stdin)\' )\n+  parser.add_option( \'-o\', \'--output\', dest=\'output\', help=\'The output vcf file (else write to stdout)\' )\n+  # filters\n+  parser.add_option( \'-c\', \'--min_coverage\', type=\'int\', default=\'0\', dest=\'min_coverage\', help=\'The minimum read coverage depth to report a variant [default 0]\' )\n+  parser.add_option( \'-r\', \'--report_depth\', type=\'choice\', choices=[\'all\',\'ref\',\'qual\'], default=\'ref\', dest=\'report_depth\', help=\'Coverage depth as: ref - reads covercovering base, all - reads spanning base, qual - reads with base call above min_base_quality\' )\n+  parser.add_option( \'-b\', \'--min_base_quality\', type=\'int\', default=\'0\', dest=\'min_base_quality\', help=\'The minimum base quality for a base call to be counted\' )\n+  parser.add_option( \'-f\', \'--min_allele_freq\', type=\'float\', default=\'.5\', dest=\'min_allele_freq\', help=\'The minimum frequency of an allele for it to be reported (default .5)\' )\n+  parser.add_option( \'-m\', \'--allow_multiples\', action="store_true", dest=\'allow_multiples\', default=False, help=\'Allow multiple alleles to be reported\' )\n+  parser.add_option( \'-s\', \'--snps_only\', action="store_true", dest=\'snps_only\', default=False, help=\'Only report SNPs, not indels\' )\n+  # select columns\n+  parser.add_option( \'-C\', \'--chrom_col\', type=\'int\', default=\'1\', dest=\'chrom_col\', help=\'The ordinal position (starting with 1) of the chromosome column\' )\n+  parser.add_option( \'-P\', \'--pos_col\', type=\'int\', default=\'2\', dest=\'pos_col\', help=\'The ordinal position (starting with 1) of the position column\' )\n+  parser.add_option( \'-R\', \'--ref_col\', type=\'int\', default=\'3\', dest=\'ref_col\', help=\'The ordinal position (starting with 1) of the reference base  column\' )\n+  parser.add_option( \'-D\', \'--coverage_col\', type=\'int\', default=\'4\', dest=\'coverage_col\', help=\'The ordinal position (starting with 1) of the read coverage depth column\' )\n+  parser.add_option( \'-B\', \'--base_call_col\', type=\'int\', default=\'5\', dest=\'base_call_col\', help=\'The ordinal position (starting with 1) of the base call column\' )\n+  parser.add_option( \'-Q\', \'--base_qual_col\', type=\'int\', default=\'6\', dest=\'base_qual_col\', help=\'The ordinal position (starting with 1) of the base quality column\' )\n+  # debug\n+  parser.add_option( \'-d\', \'--debug\', action="store_true", dest=\'debug\', de'..b'f qual >= min_base_quality:\n+            qdp += 1\n+        elif base in \'ACGTNacgtn\' : # SNP on forward/reverse strand\n+          rdp += 1\n+          qual = ord(quals[qi]) -33\n+          qi += 1\n+          # record snp variant\n+          if qual >= min_base_quality:\n+            qdp += 1\n+            snps[base.upper()] += 1\n+        elif base in \'+-\' : # indel \n+          rdp += 1\n+          qdp += 1 # no related quality score, so count it\n+          m = re.match(indel_len_pattern,bases[bi:]) \n+          indel_len_str = m.group(0) # get the indel len string\n+          bi += len(indel_len_str) # increment the index by the len of the len string\n+          indel_len = int(indel_len_str)\n+          indel = bases[bi:bi+indel_len].upper() # get the indel bases\n+          bi += indel_len # increment the index by the len of the indel \n+          if base == \'+\':\n+            # record insertion variant\n+            if indel in insertions: \n+              insertions[indel] += 1\n+            else:\n+              insertions[indel] = 1\n+          elif base == \'-\': \n+            # record deletion variant\n+            max_deletion = max(indel_len,max_deletion)\n+            if indel in deletions: \n+              deletions[indel] += 1\n+            else:\n+              deletions[indel] = 1\n+        elif base == \'^\' : #start of read (followed by read quality char)\n+          read_mapping_qual = ord(bases[bi]) - 33 \n+          bi += 1\n+        elif base == \'$\' : #end of read\n+          pass # no associated quality in either bases or quals\n+      dp = rdp if dp_as == \'ref\' else qdp if dp_as == \'qual\' else adp\n+      if debug: print >> sys.stderr, "snps: %s\\tins: %s\\tdel: %s" % (snps,insertions,deletions)\n+      # output any variants that meet the depth coverage requirement\n+      vcf_pos = pos\n+      vcf_ref = ref\n+      suffix = \'\'\n+      alts = []\n+      safs = []\n+      # If we have any deletions to report, need to modify the ref string\n+      if debug: print >> sys.stderr, "max deletions: %s" % deletions\n+      if report_indels:\n+        for k,v in deletions.items():\n+          saf = v/float(dp)\n+          if saf >= min_allele_freq:\n+            if len(k) > len(suffix):\n+              suffix = k\n+        vcf_ref = (ref + suffix).upper()\n+      if debug: print >> sys.stderr, "snps: %s" % snps\n+      for k,v in snps.items():\n+        if debug: print >> sys.stderr, "snp: %s %d" % (k,v)\n+        saf = v/float(dp)\n+        if saf >= min_allele_freq:\n+          alts.append(k + suffix)\n+          safs.append(saf)\n+      if debug: print >> sys.stderr, "insertions: %s" % insertions\n+      if report_indels:\n+        for k,v in insertions.items():\n+          saf = v/float(dp)\n+          if saf >= min_allele_freq:\n+            alts.append(ref + k + suffix)\n+            safs.append(saf)\n+        if debug: print >> sys.stderr, "deletions: %s" % deletions\n+        for k,v in deletions.items():\n+          saf = v/float(dp)\n+          if saf >= min_allele_freq:\n+            alts.append(vcf_ref[:len(vcf_ref) - len(k)])   # TODO alt will be a substring of vcf_ref,  test this\n+            safs.append(saf)\n+      if len(alts) > 0:\n+        vcf_id = "."\n+        vcf_qual = "." \n+        vcf_filter = "PASS"\n+        # if not allow_multiples, report only the most freq alt\n+        if not allow_multiples and len(alts) > 1:\n+          max_saf = max(safs)\n+          for i,v in enumerate(safs):\n+            if v == max_saf:\n+              vcf_alt = alts[i]\n+          info = "SAF=%f;DP=%d" % (max_saf,dp)\n+        # if allow_multiples\n+        else:\n+          vcf_alt = \',\'.join(alts)\n+          isafs = []\n+          for saf in safs:\n+            isafs.append("SAF=%f" % saf)\n+          info = "%s;DP=%d" % (\',\'.join(isafs),dp)\n+        print >> outputFile,"%s\\t%d\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s" % (chrom,vcf_pos,vcf_id,vcf_ref,vcf_alt,vcf_qual,vcf_filter,info)\n+  except Exception, e:\n+    print >> sys.stderr, "failed: %s" % e\n+    exit(1)\n+\n+if __name__ == "__main__": __main__()\n+\n'
b
diff -r 000000000000 -r 3890f8ba0e4d pileup_to_vcf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pileup_to_vcf.xml Mon Feb 18 11:32:50 2013 -0500
b
@@ -0,0 +1,112 @@
+<tool id="pileup_to_vcf" name="Pileup to VCF" version="2.0">
+  <description>Converts a pileup to VCF with filtering</description>
+  <command interpreter="python">pileup_to_vcf.py -i $input_file -o $output_file 
+    #if $min_cvrg.__str__  != '':
+      --min_coverage $min_cvrg 
+    #end if
+    #if $min_base_qual.__str__  != '':
+      --min_base_qual $min_base_qual
+    #end if
+    #if $min_var_pct.__str__  != '':
+      --min_allele_freq $min_var_pct 
+    #end if
+    #if $depth_as.__str__ != 'None':
+      --report_depth $depth_as
+    #end if
+    $allow_multiples
+    $snps_only
+    #if $cols.select_order == 'yes' :
+      #if $chrom_col.__str__  != '':
+        --chrom_col $chrom_col 
+      #end if
+      #if $pos_col.__str__  != '':
+        --pos_col $pos_col 
+      #end if
+      #if $ref_col.__str__  != '':
+        --ref_col $ref_col 
+      #end if
+      #if $cvrg_col.__str__  != '':
+        --coverage_col $cvrg_col 
+      #end if
+      #if $base_call_col.__str__  != '':
+        --base_call_col $base_call_col 
+      #end if
+      #if $base_qual_col.__str__  != '':
+        --base_qual_col $base_qual_col 
+      #end if
+    #end if
+  </command>
+  <inputs>
+    <param name="input_file" type="data" format="pileup,tabular" label="Source File" optional="false"/>
+    <conditional name="cols">
+      <param name="select_order" type="select" label="Set column positions for non-standard pileup">
+        <option value="no" selected="true">Use the default pileup columns</option>
+        <option value="yes">Select the column position that represents each pileup column</option>
+      </param>
+      <when value="no"/>
+      <when value="yes">
+        <param name="chrom_col" type="data_column" data_ref="input_file" label="Chromosome Column"/>
+        <param name="pos_col" type="data_column" data_ref="input_file" label="Position Column"/>
+        <param name="ref_col" type="data_column" data_ref="input_file" label="Reference Base Column"/>
+        <param name="cvrg_col" type="data_column" data_ref="input_file" label="Depth Column"/>
+        <param name="base_call_col" type="data_column" data_ref="input_file" label="Base Call Column"/>
+        <param name="base_qual_col" type="data_column" data_ref="input_file" label="Base Quality Column"/>
+      </when>
+    </conditional>
+    <param name="min_base_qual" type="integer" label="Minimum Base Quality" optional="true" value="20" help="Don't consider a read if the base call quality is below this threshold"/>
+    <param name="min_cvrg" type="integer" label="Minimum Coverage Depth" optional="true" value="5" help="Any position below the threshold will be omitted from the resulting VCF"/>
+    <param name="min_var_pct" type="float" label="Minimum Frequency of a Specific Allele" option="true" value="0.5" help="If an allele does not meet the minimum frequency it will be omitted from the resulting VCF."/>
+    <param name="allow_multiples" type="boolean" truevalue="-m" falsevalue="" chacked="true" label="Allow Multiple Alleles for a Position?" 
+           help="Multiple alleles may be output in the VCF if the allowable frequency is below 0.5, otherwise only one will be reported"/>
+    <param name="snps_only" type="boolean" truevalue="-s" falsevalue="" chacked="false" label="Only report SNPs, not indels" />
+    <param name="depth_as" type="select" label="Report DP and SAF with read coverage of" help="The reported read voverage depth: DP, and the calculation of specific allele frequency (SAF) of variants">
+      <option value="ref" selected="true">Reads at this position</option>
+      <option value="qual">Reads at this position taht pass the base call quality threshold</option>
+      <option value="all"></option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="vcf" metadata_source="input_file" name="output_file" />
+  </outputs>
+  <stdio>
+    <exit_code range="1:"  level="fatal"   description="Bad input dataset" />
+  </stdio>
+  <tests>
+    <test>
+      <param name="input_file" ftype="pileup" value="test.pileup" />
+      <param name="select_order" value="no"/>
+      <param name="min_base_qual" value="0"/>
+      <param name="min_cvrg" value="0"/>
+      <param name="min_var_pct" value=".1"/>
+      <param name="allow_multiples" value="True"/>
+      <param name="snps_only" value="False"/>
+      <param name="depth_as" value="ref"/>
+      <output name="output_file">
+        <assert_contents>
+            <has_text_matching expression="seq2\t156\t.\tA\tG,AAG\t.\tPASS\t.*" />
+            <has_text_matching expression="chr1\t158571283\t.\tA\tC,T\t.\tPASS\t.*" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="input_file" ftype="pileup" value="test.pileup" />
+      <param name="select_order" value="no"/>
+      <param name="min_base_qual" value="20"/>
+      <param name="min_cvrg" value="5"/>
+      <param name="min_var_pct" value=".1"/>
+      <param name="allow_multiples" value="True"/>
+      <param name="snps_only" value="False"/>
+      <param name="depth_as" value="ref"/>
+      <output name="output_file">
+        <assert_contents>
+            <has_text_matching expression="seq2\t156\t.\tA\tG,AAG\t.\tPASS\t.*" />
+            <has_text_matching expression="chr1\t158571283\t.\tA\tC\t.\tPASS\t.*" />
+        </assert_contents>
+      </output>
+    </test>
+
+  </tests>
+  <help>
+    Pileup to VCF converts the output of a pileup tool to a VCF representing any alleles that surpass a user specified frequency, optionally presenting multiple alleles for a given position if the allele frequency is set below 0.5.  This tool assumes that any filtering for base call quality and mapping quality has been done in previous processing.
+  </help>
+</tool>
b
diff -r 000000000000 -r 3890f8ba0e4d test-data/test.pileup
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.pileup Mon Feb 18 11:32:50 2013 -0500
b
@@ -0,0 +1,13 @@
+seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
+seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
+seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
+seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
+seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
+seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+seq2 156 A 11 .$......+2AG.+2AG.+2AGGG <975;:<<<<<
+seq3 200 A 20 ,,,,,..,.-4CACC.-4CACC....,.,,.^~. ==<<<<<<<<<<<::<;2<<
+chr1 95935183 G 34 .,.,,,+1c,+1c.+1C,+1c,+1c.+1C.+1C.+1C,+1c.+1C,+1c,+1c.+1C,+1c,+1c.+1C,+1c.+1C.+1C,+1c,+1c,+1c.+1C.+1C.+1C.+1C.+1C^~c^~c IFEGIHIBIIIBAGIDI@IHCIIGHIHHEIIG!!
+chr1 158571283 A 8 <,.c.c^~t^~c GIC#I#'!
+chr6 125070485 C 216 <><<>>>>>>><<>>>><<<><<<<<>>>>><>>>>>><>>>>>>><><><>><><><>><>><<><<<>>>><<<>>>><<>>>><<<<<<<<<<<><<<<<<<><><<<<<><<<>><<<<<<<<>>>><<<<<<><><>>>>>>>>>>><<<<<<>><<<<>><><>>>>><>...,,,,,,,GGGGGGggggggGGGGggggggggg^~G^~g^~g^~g^~g IHHH:GFDH<H@IHCEIIIIHIIIDI#EACGGIEHB#HDHEBHGGDI?BHIEIIIGHBHIEDIIHHHIGHFIIDI<IIIIEEIIIIGAHIHEGGIBHBIHE;@CFIBHGGCDBGDGGGIIDC@GEAGDIHIIEEHBIIGIEGHIHHIHHHH2@G#>EBHICD@#GHHHFI9HII#I##AIIIE5@G!!!!!!!!!#!!!!!!!!$$!!$!$!#!!!