| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 # Code by Boris Rebolledo-Jaramillo | 
|  | 4 # (boris-at-bx.psu.edu) | 
|  | 5 # Edited by Nick Stoler | 
|  | 6 # (nick-at-bx.psu.edu) | 
|  | 7 # New in this version: | 
|  | 8 # - Add in proper header line if not present | 
|  | 9 | 
|  | 10 | 
|  | 11 import os | 
|  | 12 import sys | 
|  | 13 import array | 
|  | 14 import numpy | 
|  | 15 from rpy2.robjects import Formula | 
|  | 16 from rpy2.robjects.packages import importr | 
|  | 17 from rpy2 import robjects | 
|  | 18 | 
|  | 19 def fail(message): | 
|  | 20   sys.stderr.write(message+'\n') | 
|  | 21   sys.exit(1) | 
|  | 22 | 
|  | 23 COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', | 
|  | 24                  'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | 
|  | 25 | 
|  | 26 COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR',  'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T', | 
|  | 27                          'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] | 
|  | 28 | 
|  | 29 args = sys.argv[1:] | 
|  | 30 if len(args) >= 1: | 
|  | 31   infile = args[0] | 
|  | 32 else: | 
|  | 33   fail('Error: No input filename provided (as argument 1).') | 
|  | 34 if len(args) >= 2: | 
|  | 35   outfile = args[1] | 
|  | 36 else: | 
|  | 37   fail('Error: No output filename provided (as argument 2).') | 
|  | 38 if len(args) >= 3: | 
|  | 39   report = args[2] | 
|  | 40 else: | 
|  | 41   report = '' | 
|  | 42 | 
|  | 43 # Check input file | 
|  | 44 add_header = False | 
|  | 45 if not os.path.exists(infile): | 
|  | 46   fail('Error: Input file '+infile+' could not be found.') | 
|  | 47 with open(infile, 'r') as lines: | 
|  | 48   line = lines.readline() | 
|  | 49   if not line: | 
|  | 50     fail('Error: Input file seems to be empty') | 
|  | 51   line = line.strip().lstrip('#') # rm whitespace, comment chars | 
|  | 52   labels = line.split("\t") | 
|  | 53   if 'SAMPLE' not in labels: | 
|  | 54     sys.stderr.write("Error: Input file does not seem to have a proper header " | 
|  | 55       +"line.\nAdding an artificial header..") | 
|  | 56     add_header = True | 
|  | 57 | 
|  | 58 r = robjects.r | 
|  | 59 base = importr('base') | 
|  | 60 utils = importr('utils') | 
|  | 61 stats = importr('stats') | 
|  | 62 rprint = robjects.globalenv.get("print") | 
|  | 63 graphics = importr('graphics') | 
|  | 64 grdevices = importr('grDevices') | 
|  | 65 grdevices.png(file=outfile, width=1024, height=768, type="cairo") | 
|  | 66 | 
|  | 67 # Read file into a data frame | 
|  | 68 if add_header: | 
|  | 69     # add header line manually if not present | 
|  | 70     DATA = utils.read_delim(infile, header=False) | 
|  | 71     labels = robjects.r.names(DATA) | 
|  | 72     for i in range(len(labels)): | 
|  | 73         try: | 
|  | 74             labels[i] = COLUMN_LABELS[i] | 
|  | 75         except IndexError, e: | 
|  | 76             try: | 
|  | 77                 labels[i] = COLUMN_LABELS_EXTENDED[i] | 
|  | 78             except: | 
|  | 79                 fail("Error in input file: Too many columns (does not match hardcoded " | 
|  | 80                 +"column labels).") | 
|  | 81 else: | 
|  | 82   DATA = utils.read_delim(infile) | 
|  | 83   # Remove comment from header, if present | 
|  | 84   labels = robjects.r.names(DATA) | 
|  | 85   if labels[0][0:2] == 'X.': | 
|  | 86     labels[0] = labels[0][2:] | 
|  | 87 | 
|  | 88 # Multiply minor allele frequencies by 100 to get percentage | 
|  | 89 #  .rx2() looks up a column by its label and returns it as a vector | 
|  | 90 #  .ro turns the returned object into one that can be operated on per-element | 
|  | 91 minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 | 
|  | 92 samples    = DATA.rx2('SAMPLE') | 
|  | 93 | 
|  | 94 # Formula() creates a Python object representing the R object returned by x ~ y | 
|  | 95 formula = Formula('minor_freq ~ samples') | 
|  | 96 # The "environment" in .getenvironment() is the entire R workspace in which the | 
|  | 97 # Formula object exists. The R workspace meaning all the defined variables. | 
|  | 98 # Here, the .getenvironment() method is being used to set some variables in the | 
|  | 99 # R workspace | 
|  | 100 | 
|  | 101 formula.getenvironment()['minor_freq'] = minor_freq | 
|  | 102 formula.getenvironment()['samples']    = samples | 
|  | 103 | 
|  | 104 | 
|  | 105 r.par(oma=array.array('i', [0,0,0,0])) | 
|  | 106 r.par(mar=array.array('i', [10,4,4,2])) | 
|  | 107 ylimit = array.array('i',[-5,50]) | 
|  | 108 | 
|  | 109 # create boxplot - fill kwargs1 with the options for the boxplot function | 
|  | 110 kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", | 
|  | 111            'outpch':"*",'main':"Distribution of minor allele frequencies", | 
|  | 112            'cex.lab':"1.5"} | 
|  | 113 p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1) | 
|  | 114 | 
|  | 115 table = base.table(DATA.rx2('SAMPLE')) | 
|  | 116 graphics.text(0, -1, 'N:', font=2) | 
|  | 117 for i in range(1, base.length(table)[0]+1, 1): | 
|  | 118     graphics.text(i, -1, table[i-1], font=2) | 
|  | 119 | 
|  | 120 graphlabels = base.names(table) | 
|  | 121 kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"} | 
|  | 122 graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3) | 
|  | 123 graphics.axis(2,at=(range(0,60,10)),pos=0,font=2) | 
|  | 124 grdevices.dev_off() | 
|  | 125 | 
|  | 126 if not report: | 
|  | 127     sys.exit(0) | 
|  | 128 | 
|  | 129 | 
|  | 130 SAMPLES=[] | 
|  | 131 for i in range(len(table)): | 
|  | 132     SAMPLES.append(base.names(table)[i]) | 
|  | 133 | 
|  | 134 def boxstats(data,sample): | 
|  | 135     VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] | 
|  | 136     NoHET  = len(VALUES) | 
|  | 137     MEDIAN = numpy.median(VALUES) | 
|  | 138     MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) | 
|  | 139     return [NoHET,MEDIAN, MAD] | 
|  | 140 | 
|  | 141 boxreport = open(report, "w+") | 
|  | 142 boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n") | 
|  | 143 | 
|  | 144 for sample in SAMPLES: | 
|  | 145     ENTRY = [sample] + boxstats(infile,sample) | 
|  | 146     boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]])) | 
|  | 147 boxreport.close() | 
|  | 148 | 
|  | 149 | 
|  | 150 | 
|  | 151 |