# HG changeset patch # User xuebing # Date 1331340473 18000 # Node ID 33c067c3ae34b5102bec00ac97d87c1fc99b93f4 # Parent c2a356708570a886143ca45f196a4000e7fa40ef Deleted selected files diff -r c2a356708570 -r 33c067c3ae34 fimo2.xml --- a/fimo2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ - - using FIMO - fimo - #if $background_select.bg_select == "fromfile": - -bgfile $bgfile - #end if - - $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database - && mv fimo_out/fimo.html ${html_outfile} - - && mv fimo_out/fimo.txt ${txt_outfile} - - && rm -rf fimo_out - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool uses FIMO to find matches of a motif in a fasta file. See more details: - -http://meme.sdsc.edu/meme/fimo-intro.html - - - diff -r c2a356708570 -r 33c067c3ae34 mytools.zip Binary file mytools.zip has changed diff -r c2a356708570 -r 33c067c3ae34 tools/.DS_Store Binary file tools/.DS_Store has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._.DS_Store Binary file tools/._.DS_Store has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._mytools Binary file tools/._mytools has changed diff -r c2a356708570 -r 33c067c3ae34 tools/._tool_conf.xml Binary file tools/._tool_conf.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/annotation_profiler/annotation_profiler.xml --- a/tools/annotation_profiler/annotation_profiler.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ - - for a set of genomic intervals - annotation_profiler_for_interval.py -i $input1 -c ${input1.metadata.chromCol} -s ${input1.metadata.startCol} -e ${input1.metadata.endCol} -o $out_file1 $keep_empty -p ${GALAXY_DATA_INDEX_DIR}/annotation_profiler/$dbkey $summary -b 3 -t $table_names - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Takes an input set of intervals and for each interval determines the base coverage of the interval by a set of features (tables) available from UCSC. Genomic regions from the input feature data have been merged by overlap / direct adjacency (e.g. a table having ranges of: 1-10, 6-12, 12-20 and 25-28 results in two merged ranges of: 1-20 and 25-28). - -By default, this tool will check the coverage of your intervals against all available features; you may, however, choose to select only those tables that you want to include. Selecting a section heading will effectively cause all of its children to be selected. - -You may alternatively choose to receive a summary across all of the intervals that you provide. 
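The merge rule described above (overlap or direct adjacency collapses ranges) is easy to pin down concretely. A minimal illustrative sketch, not the tool's own code, reproducing the 1-10, 6-12, 12-20, 25-28 example::

    def merge_regions(regions):
        # Sort by start, then fold a range into the previous merged range
        # whenever it overlaps or directly abuts it (start <= previous end).
        merged = []
        for start, end in sorted(regions):
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    print(merge_regions([(1, 10), (6, 12), (12, 20), (25, 28)]))
    # -> [[1, 20], [25, 28]]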
- ------ - -**Example** - -Using the interval below and selecting several tables:: - - chr1 4558 14764 uc001aab.1 0 - - -results in:: - - chr1 4558 14764 uc001aab.1 0 - snp126Exceptions 151 142 - chr1 4558 14764 uc001aab.1 0 - genomicSuperDups 10206 1 - chr1 4558 14764 uc001aab.1 0 - chainOryLat1 3718 1 - chr1 4558 14764 uc001aab.1 0 - multiz28way 10206 1 - chr1 4558 14764 uc001aab.1 0 - affyHuEx1 3553 32 - chr1 4558 14764 uc001aab.1 0 - netXenTro2 3050 1 - chr1 4558 14764 uc001aab.1 0 - intronEst 10206 1 - chr1 4558 14764 uc001aab.1 0 - xenoMrna 10203 1 - chr1 4558 14764 uc001aab.1 0 - ctgPos 10206 1 - chr1 4558 14764 uc001aab.1 0 - clonePos 10206 1 - chr1 4558 14764 uc001aab.1 0 - chainStrPur2Link 1323 29 - chr1 4558 14764 uc001aab.1 0 - affyTxnPhase3HeLaNuclear 9011 8 - chr1 4558 14764 uc001aab.1 0 - snp126orthoPanTro2RheMac2 61 58 - chr1 4558 14764 uc001aab.1 0 - snp126 205 192 - chr1 4558 14764 uc001aab.1 0 - chainEquCab1 10206 1 - chr1 4558 14764 uc001aab.1 0 - netGalGal3 3686 1 - chr1 4558 14764 uc001aab.1 0 - phastCons28wayPlacMammal 10172 3 - -Where:: - - The first added column is the table name. - The second added column is the number of bases covered by the table. - The third added column is the number of regions from the table that is covered by the interval. - -Alternatively, requesting a summary, using the intervals below and selecting several tables:: - - chr1 4558 14764 uc001aab.1 0 - - chr1 4558 19346 uc001aac.1 0 - - -results in:: - - #tableName tableSize tableRegionCount allIntervalCount allIntervalSize allCoverage allTableRegionsOverlaped allIntervalsOverlapingTable nrIntervalCount nrIntervalSize nrCoverage nrTableRegionsOverlaped nrIntervalsOverlapingTable - snp126Exceptions 133601 92469 2 24994 388 359 2 1 14788 237 217 1 - genomicSuperDups 12268847 657 2 24994 24994 2 2 1 14788 14788 1 1 - chainOryLat1 70337730 2542 2 24994 7436 2 2 1 14788 3718 1 1 - affyHuEx1 15703901 112274 2 24994 7846 70 2 1 14788 4293 38 1 - netXenTro2 111440392 1877 2 24994 6100 2 2 1 14788 3050 1 1 - snp126orthoPanTro2RheMac2 700436 690674 2 24994 124 118 2 1 14788 63 60 1 - intronEst 135796064 2332 2 24994 24994 2 2 1 14788 14788 1 1 - xenoMrna 129031327 1586 2 24994 20406 2 2 1 14788 10203 1 1 - snp126 956976 838091 2 24994 498 461 2 1 14788 293 269 1 - clonePos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1 - chainStrPur2Link 7948016 119841 2 24994 2646 58 2 1 14788 1323 29 1 - affyTxnPhase3HeLaNuclear 136797870 140244 2 24994 22601 17 2 1 14788 13590 9 1 - multiz28way 225928588 38 2 24994 24994 2 2 1 14788 14788 1 1 - ctgPos 224999719 39 2 24994 24994 2 2 1 14788 14788 1 1 - chainEquCab1 246306414 141 2 24994 24994 2 2 1 14788 14788 1 1 - netGalGal3 203351973 461 2 24994 7372 2 2 1 14788 3686 1 1 - phastCons28wayPlacMammal 221017670 22803 2 24994 24926 6 2 1 14788 14754 3 1 - -Where:: - - tableName is the name of the table - tableChromosomeCoverage is the number of positions existing in the table for only the chromosomes that were referenced by the interval file - tableChromosomeCount is the number of regions existing in the table for only the chromosomes that were referenced by the interval file - tableRegionCoverage is the number of positions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file - tableRegionCount is the number of regions existing in the table between the minimal and maximal bounding regions that were referenced by the interval file - - allIntervalCount is the number of provided intervals - allIntervalSize is 
the sum of the lengths of the provided interval file - allCoverage is the sum of the coverage for each provided interval - allTableRegionsOverlapped is the sum of the number of regions of the table (non-unique) that were overlapped for each interval - allIntervalsOverlappingTable is the number of provided intervals which overlap the table - - nrIntervalCount is the number of non-redundant intervals - nrIntervalSize is the sum of the lengths of non-redundant intervals - nrCoverage is the sum of the coverage of non-redundant intervals - nrTableRegionsOverlapped is the number of regions of the table (unique) that were overlapped by the non-redundant intervals - nrIntervalsOverlappingTable is the number of non-redundant intervals which overlap the table - - -.. class:: infomark - -**TIP:** non-redundant (nr) refers to the set of intervals that remains after the intervals provided have been merged to resolve overlaps - - - diff -r c2a356708570 -r 33c067c3ae34 tools/annotation_profiler/annotation_profiler_for_interval.py --- a/tools/annotation_profiler/annotation_profiler_for_interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,360 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -#For a set of intervals, this tool returns the same set of intervals -#with 2 additional fields: the name of a Table/Feature and the number of -#bases covered. The original intervals are repeated for each Table/Feature. - -import sys, struct, optparse, os, random -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.intervals.io -import bx.bitset -try: - import psyco - psyco.full() -except: - pass - -assert sys.version_info[:2] >= ( 2, 4 ) - -class CachedRangesInFile: - DEFAULT_STRUCT_FORMAT = ' self._coverage[-1][1]: - return len( self._coverage ) - 1 - i = 0 - j = len( self._coverage) - 1 - while i < j: - k = ( i + j ) / 2 - if start <= self._coverage[k][1]: - j = k - else: - i = k + 1 - return i - def get_coverage( self, start, end ): - return self.get_coverage_regions_overlap( start, end )[0] - def get_coverage_regions_overlap( self, start, end ): - return self.get_coverage_regions_index_overlap( start, end )[0:2] - def get_coverage_regions_index_overlap( self, start, end ): - if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]: - return 0, 0, 0 - if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]: - return self._total_coverage, len( self._coverage ), 0 - coverage = 0 - region_count = 0 - start_index = self.get_start_index( start ) - for i in xrange( start_index, len( self._coverage ) ): - c_start, c_end = self._coverage[i] - if c_start > end: - break - if c_start <= end and c_end >= start: - coverage += min( end, c_end ) - max( start, c_start ) - region_count += 1 - return coverage, region_count, start_index - -class CachedCoverageReader: - def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ): - self._base_file_path = base_file_path - self._buffer = buffer #number of chromosomes to keep in memory at a time - self._coverage = {} - if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ] - for tablename in table_names: self._coverage[tablename] = {} - if profiler_info is None: profiler_info = {} - self._profiler_info = profiler_info - def iter_table_coverage_by_region( self, chrom, start, end ): - for 
tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ): - yield tablename, coverage - def iter_table_coverage_regions_by_region( self, chrom, start, end ): - for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ): - yield tablename, coverage, regions - def iter_table_coverage_regions_index_by_region( self, chrom, start, end ): - for tablename, chromosomes in self._coverage.iteritems(): - if chrom not in chromosomes: - if len( chromosomes ) >= self._buffer: - #randomly remove one chromosome from this table - del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ] - chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info ) - coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end ) - yield tablename, coverage, regions, index - -class TableCoverageSummary: - def __init__( self, coverage_reader, chrom_lengths ): - self.coverage_reader = coverage_reader - self.chrom_lengths = chrom_lengths - self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals - self.total_interval_size = 0 #total size of user's input intervals - self.total_interval_count = 0 #total number of user's input intervals - self.table_coverage = {} #dict of total coverage by user's input intervals by table - self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom - self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom - self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique) - self.interval_table_overlap_count = {} #total number of user input intervals which overlap table - self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome - def add_region( self, chrom, start, end ): - chrom_length = self.chrom_lengths.get( chrom ) - region_start = min( start, chrom_length ) - region_end = min( end, chrom_length ) - region_length = region_end - region_start - - if region_length < 1 or region_start != start or region_end != end: - if chrom not in self.region_size_errors: - self.region_size_errors[chrom] = [] - self.region_size_errors[chrom].append( ( start, end ) ) - if region_length < 1: return - - self.total_interval_size += region_length - self.total_interval_count += 1 - if chrom not in self.chromosome_coverage: - self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length ) - - self.chromosome_coverage[chrom].set_range( region_start, region_length ) - for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ): - if table_name not in self.table_coverage: - self.table_coverage[table_name] = 0 - self.table_chromosome_size[table_name] = {} - self.table_regions_overlaped_count[table_name] = 0 - self.interval_table_overlap_count[table_name] = 0 - self.table_chromosome_count[table_name] = {} - if chrom not in self.table_chromosome_size[table_name]: - self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage - self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage ) - self.table_coverage[table_name] += coverage - if coverage: - self.interval_table_overlap_count[table_name] += 1 - 
self.table_regions_overlaped_count[table_name] += regions - def iter_table_coverage( self ): - def get_nr_coverage(): - #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps - table_coverage = {} #dictionary of tables containing number of table bases overlaped by nr intervals - interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlaping table - table_regions_overlap_count = {} #dictionary of tables containing number of regions overlaped (unique) - interval_count = 0 #total number of nr intervals - interval_size = 0 #holds total size of nr intervals - region_start_end = {} #holds absolute start,end for each user input chromosome - for chrom, chromosome_bitset in self.chromosome_coverage.iteritems(): - #loop through user's collapsed input intervals - end = 0 - last_end_index = {} - interval_size += chromosome_bitset.count_range() - while True: - if end >= chromosome_bitset.size: break - start = chromosome_bitset.next_set( end ) - if start >= chromosome_bitset.size: break - end = chromosome_bitset.next_clear( start ) - interval_count += 1 - if chrom not in region_start_end: - region_start_end[chrom] = [start, end] - else: - region_start_end[chrom][1] = end - for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ): - if table_name not in table_coverage: - table_coverage[table_name] = 0 - interval_table_overlap_count[table_name] = 0 - table_regions_overlap_count[table_name] = 0 - table_coverage[table_name] += coverage - if coverage: - interval_table_overlap_count[table_name] += 1 - table_regions_overlap_count[table_name] += region_count - if table_name in last_end_index and last_end_index[table_name] == start_index: - table_regions_overlap_count[table_name] -= 1 - last_end_index[table_name] = start_index + region_count - 1 - table_region_coverage = {} #total coverage for tables by bounding nr interval region - table_region_count = {} #total number for tables by bounding nr interval region - for chrom, start_end in region_start_end.items(): - for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ): - if table_name not in table_region_coverage: - table_region_coverage[table_name] = 0 - table_region_count[table_name] = 0 - table_region_coverage[table_name] += coverage - table_region_count[table_name] += region_count - return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count - table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage() - for table_name in self.table_coverage: - #TODO: determine a type of statistic, then calculate and report here - yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size, self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( 
table_name, 0 ) - -def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ): - out = open( out_filename, 'wb' ) - for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ): - for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ): - if keep_empty or coverage: - #only output regions that have atleast 1 base covered unless empty are requested - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) ) - out.close() - -def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ): - out = open( out_filename, 'wb' ) - table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths ) - for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ): - table_coverage_summary.add_region( region.chrom, region.start, region.end ) - - out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" ) - for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage(): - if keep_empty or total_coverage: - #only output tables that have atleast 1 base covered unless empty are requested - out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) ) - out.close() - - #report chrom size errors as needed: - if table_coverage_summary.region_size_errors: - print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:" - for chrom, regions in table_coverage_summary.region_size_errors.items(): - if len( regions ) > 3: - extra_region_info = ", ... " - else: - extra_region_info = "" - print "%s has max length of %s, exceeded by %s%s." 
% ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info ) - -class ChromosomeLengths: - def __init__( self, profiler_info ): - self.chroms = {} - self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) ) - chroms = profiler_info.get( 'chromosomes', None ) - if chroms: - for chrom in chroms.split( ',' ): - for fields in chrom.rsplit( '=', 1 ): - if len( fields ) == 2: - self.chroms[ fields[0] ] = int( fields[1] ) - else: - self.chroms[ fields[0] ] = self.default_bitset_size - def get( self, name ): - return self.chroms.get( name, self.default_bitset_size ) - -def parse_profiler_info( filename ): - profiler_info = {} - try: - for line in open( filename ): - fields = line.rstrip( '\n\r' ).split( '\t', 1 ) - if len( fields ) == 2: - if fields[0] in profiler_info: - if not isinstance( profiler_info[ fields[0] ], list ): - profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ] - profiler_info[ fields[0] ].append( fields[1] ) - else: - profiler_info[ fields[0] ] = fields[1] - except: - pass #likely missing file - return profiler_info - -def __main__(): - parser = optparse.OptionParser() - parser.add_option( - '-k','--keep_empty', - action="store_true", - dest='keep_empty', - default=False, - help='Keep tables with 0 coverage' - ) - parser.add_option( - '-b','--buffer', - dest='buffer', - type='int',default=10, - help='Number of Chromosomes to keep buffered' - ) - parser.add_option( - '-c','--chrom_col', - dest='chrom_col', - type='int',default=1, - help='Chromosome column' - ) - parser.add_option( - '-s','--start_col', - dest='start_col', - type='int',default=2, - help='Start Column' - ) - parser.add_option( - '-e','--end_col', - dest='end_col', - type='int',default=3, - help='End Column' - ) - parser.add_option( - '-p','--path', - dest='path', - type='str',default='/galaxy/data/annotation_profiler/hg18', - help='Path to profiled data for this organism' - ) - parser.add_option( - '-t','--table_names', - dest='table_names', - type='str',default='None', - help='Table names requested' - ) - parser.add_option( - '-i','--input', - dest='interval_filename', - type='str', - help='Input Interval File' - ) - parser.add_option( - '-o','--output', - dest='out_filename', - type='str', - help='Input Interval File' - ) - parser.add_option( - '-S','--summary', - action="store_true", - dest='summary', - default=False, - help='Display Summary Results' - ) - - options, args = parser.parse_args() - - assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path ) - - #get profiler_info - profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) ) - - table_names = options.table_names.split( "," ) - if table_names == ['None']: table_names = None - coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info ) - - if options.summary: - profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) ) - else: - profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader ) - - #print out data version info - print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), 
profiler_info.get( 'dump_time', 'unknown' ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/bedtools/._bedToBam.xml Binary file tools/bedtools/._bedToBam.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/bedtools/bedToBam.xml --- a/tools/bedtools/bedToBam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - convert BED or GFF or VCF to BAM - bedToBam -i $input -g $genome $bed12 $mapq $ubam > $outfile - - - - - - - - - - - - - - - - - -**What it does** - -Program: bedToBam (v2.13.3) -Author: Aaron Quinlan (aaronquinlan@gmail.com) -Summary: Converts feature records to BAM format. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_destination/epigraph.xml --- a/tools/data_destination/epigraph.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - - and prediction with EpiGRAPH - GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info} - - - - - - - - - - - -.. class:: warningmark - -After clicking the **Execute** button, you will be redirected to the EpiGRAPH website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance. - ------ - -.. class:: infomark - -**What it does** - -This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods. - ------ - -.. class:: infomark - -**EpiGRAPH outline** - -The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties. - -.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_destination/epigraph_test.xml --- a/tools/data_destination/epigraph_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - and prediction with EpiGRAPH Test - GENOME=${input1.dbkey} NAME=${input1.name} INFO=${input1.info} - - - - - - - - - - - -.. class:: warningmark - -After clicking the **Execute** button, you will be redirected to the EpiGRAPH test website. Please be patient while the dataset is being imported. Inside EpiGRAPH, buttons are available to send the results of the EpiGRAPH analysis back to Galaxy. In addition, you can always abandon an EpiGRAPH session and return to Galaxy by directing your browser to your current Galaxy instance. - ------ - -.. class:: infomark - -**What it does** - -This tool sends the selected dataset to EpiGRAPH in order to perform an in-depth analysis with statistical and machine learning methods. - ------ - -.. class:: infomark - -**EpiGRAPH outline** - -The EpiGRAPH_ web service enables biologists to uncover hidden associations in vertebrate genome and epigenome datasets. 
Users can upload or import sets of genomic regions and EpiGRAPH will test a wide range of attributes (including DNA sequence and structure, gene density, chromatin modifications and evolutionary conservation) for enrichment or depletion among these regions. Furthermore, EpiGRAPH learns to predictively identify genomic regions that exhibit similar properties. - -.. _EpiGRAPH: http://epigraph.mpi-inf.mpg.de/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/access_libraries.xml --- a/tools/data_source/access_libraries.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ - - - stored locally - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/bed_convert.xml --- a/tools/data_source/bed_convert.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ - - creates a bed or xbed file containing from text query - noop - - creates a bed or xbed file containing user assigned input of $input - - - - - - - User specifies delimiter, header information, and column assignments and the file will be converted to BED or xBED. - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/biomart.xml --- a/tools/data_source/biomart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Central server - data_source.py $output $__app__.config.output_size_limit - - go to BioMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/biomart_test.xml --- a/tools/data_source/biomart_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Test server - data_source.py $output $__app__.config.output_size_limit - - go to BioMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/bx_browser.xml --- a/tools/data_source/bx_browser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - - - browser - data_source.py $output $__app__.config.output_size_limit - - go to BX Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/cbi_rice_mart.xml --- a/tools/data_source/cbi_rice_mart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - - - rice mart - data_source.py $output $__app__.config.output_size_limit - - go to RMap rice mart $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/data_source.py --- a/tools/data_source/data_source.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# Retrieves data from external data source applications and stores in a dataset file. -# Data source application parameters are temporarily stored in the dataset file. -import socket, urllib, sys, os -from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg... 
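A note on data_source.py, which begins here: as its own comment explains further down, Python 2's urllib exposes no per-request timeout, so the script sets a process-wide socket default before fetching anything to avoid hanging on a dead server. A condensed sketch of that pattern (Python 2; the URL is a placeholder, not from the tool)::

    import socket
    import urllib

    # Every socket created after this call inherits the 600-second timeout,
    # including those opened internally by urllib.urlopen().
    socket.setdefaulttimeout(600)

    page = urllib.urlopen('http://example.org/dataset')  # placeholder URL
    data = page.read()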
-from galaxy.util.json import from_json_string, to_json_string -import galaxy.model # need to import model before sniff to resolve a circular import dependency -from galaxy.datatypes import sniff -from galaxy.datatypes.registry import Registry -from galaxy.jobs import TOOL_PROVIDED_JOB_METADATA_FILE - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -GALAXY_PARAM_PREFIX = 'GALAXY' -GALAXY_ROOT_DIR = os.path.realpath( os.path.join( os.path.split( os.path.realpath( __file__ ) )[0], '..', '..' ) ) -GALAXY_DATATYPES_CONF_FILE = os.path.join( GALAXY_ROOT_DIR, 'datatypes_conf.xml' ) - -def load_input_parameters( filename, erase_file = True ): - datasource_params = {} - try: - json_params = from_json_string( open( filename, 'r' ).read() ) - datasource_params = json_params.get( 'param_dict' ) - except: - json_params = None - for line in open( filename, 'r' ): - try: - line = line.strip() - fields = line.split( '\t' ) - datasource_params[ fields[0] ] = fields[1] - except: - continue - if erase_file: - open( filename, 'w' ).close() #open file for writing, then close, removes params from file - return json_params, datasource_params - -def __main__(): - filename = sys.argv[1] - try: - max_file_size = int( sys.argv[2] ) - except: - max_file_size = 0 - - job_params, params = load_input_parameters( filename ) - if job_params is None: #using an older tabular file - enhanced_handling = False - job_params = dict( param_dict = params ) - job_params[ 'output_data' ] = [ dict( out_data_name = 'output', - ext = 'data', - file_name = filename, - extra_files_path = None ) ] - job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE ) - else: - enhanced_handling = True - json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata - - datatypes_registry = Registry( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) - - URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded - URL_method = params.get( 'URL_method', None ) - - # The Python support for fetching resources from the web is layered. urllib uses the httplib - # library, which in turn uses the socket library. As of Python 2.3 you can specify how long - # a socket should wait for a response before timing out. By default the socket module has no - # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 - # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by - # doing the following. - socket.setdefaulttimeout( 600 ) - - for data_dict in job_params[ 'output_data' ]: - cur_filename = data_dict.get( 'file_name', filename ) - cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL ) - if not cur_URL: - open( cur_filename, 'w' ).write( "" ) - stop_err( 'The remote data source application has not sent back a URL parameter in the request.' 
) - - # The following calls to urllib.urlopen() will use the above default timeout - try: - if not URL_method or URL_method == 'get': - page = urllib.urlopen( cur_URL ) - elif URL_method == 'post': - page = urllib.urlopen( cur_URL, urllib.urlencode( params ) ) - except Exception, e: - stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) ) - if max_file_size: - file_size = int( page.info().get( 'Content-Length', 0 ) ) - if file_size > max_file_size: - stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) ) - #do sniff stream for multi_byte - try: - cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename ) - except Exception, e: - stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) ) - - #here import checks that upload tool performs - if enhanced_handling: - try: - ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext = data_dict[ 'ext' ], is_multi_byte = is_multi_byte ) - except Exception, e: - stop_err( str( e ) ) - info = dict( type = 'dataset', - dataset_id = data_dict[ 'dataset_id' ], - ext = ext) - - json_file.write( "%s\n" % to_json_string( info ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/echo.py --- a/tools/data_source/echo.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -#!/usr/bin/env python - -""" -Script that just echos the command line. -""" - -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -print '-' * 20, "
" -for elem in sys.argv: - print elem, "
" -print '-' * 20, "
" \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/echo.xml --- a/tools/data_source/echo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - - - echoes parameters - - - echo.py $input $database $output - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/encode_db.xml --- a/tools/data_source/encode_db.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - - - - at NHGRI - - - - fetch.py "$url" $output - - - - - go to EncodeDB $GALAXY_URL - - - - - - - - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/epigraph_import.xml --- a/tools/data_source/epigraph_import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to EpiGRAPH server $GALAXY_URL - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/epigraph_import_test.xml --- a/tools/data_source/epigraph_import_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - - - test server - data_source.py $output $__app__.config.output_size_limit - - go to EpiGRAPH server $GALAXY_URL - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/eupathdb.xml --- a/tools/data_source/eupathdb.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - server - data_source.py $output $__app__.config.output_size_limit - - go to EuPathDB server $GALAXY_URL - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/fetch.py --- a/tools/data_source/fetch.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -#!/usr/bin/env python - -""" -Script that just echos the command line. 
-""" - -import sys, os, urllib - -assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -url = sys.argv[1] -out_name = sys.argv[2] - -out = open(out_name, 'wt') -try: - page = urllib.urlopen(url) - while 1: - data = page.read(BUFFER) - if not data: - break - out.write(data) -except Exception, e: - print 'Error getting the data -> %s' % e -out.close() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/fly_modencode.xml --- a/tools/data_source/fly_modencode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE fly server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/flymine.xml --- a/tools/data_source/flymine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Flymine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/flymine_test.xml --- a/tools/data_source/flymine_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Flymine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/genbank.py --- a/tools/data_source/genbank.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python -from Bio import GenBank -import sys, os, textwrap - -assert sys.version_info[:2] >= ( 2, 4 ) - -def make_fasta(rec): - '''Creates fasta format from a record''' - gi = rec.annotations.get('gi','') - org = rec.annotations.get('organism','') - date = rec.annotations.get('date','') - head = '>gi:%s, id:%s, org:%s, date:%s\n' % (gi, rec.id, org, date) - body = '\n'.join(textwrap.wrap(rec.seq.data, width=80)) - return head, body - -if __name__ == '__main__': - - mode = sys.argv[1] - text = sys.argv[2] - output_file = sys.argv[3] - - print 'Searching for %s
' % text - - # check if inputs are all numbers - try: - gi_list = text.split() - tmp = map(int, gi_list) - except ValueError: - gi_list = GenBank.search_for(text, max_ids=10) - - fp = open(output_file, 'wt') - record_parser = GenBank.FeatureParser() - ncbi_dict = GenBank.NCBIDictionary(mode, 'genbank', parser = record_parser) - for gid in gi_list: - res = ncbi_dict[gid] - head, body = make_fasta(res) - fp.write(head+body+'\n') - print head - fp.close() - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/genbank.xml --- a/tools/data_source/genbank.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - - - genbank.py $mode "$text" $output - - - - - - - - - - - - - - -At the moment this tool allows the following simple searches: - -- by GI: **51594135** -- by accession: **CF622840** -- using text: **human hbb1** (this feature is experimental) - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/gramene_mart.xml --- a/tools/data_source/gramene_mart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - Central server - data_source.py $output $__app__.config.output_size_limit - - go to GrameneMart Central $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hapmapmart.xml --- a/tools/data_source/hapmapmart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - - - HapMap Biomart - data_source.py $output $__app__.config.output_size_limit - - go to HapMap BioMart $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hbvar.xml --- a/tools/data_source/hbvar.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - - - - Human Hemoglobin Variants and Thalassemias - - - - - go to HbVar database $GALAXY_URL $tool_id - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/hbvar_filter.py --- a/tools/data_source/hbvar_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#TODO: Set dbkey to proper UCSC build, if known -import urllib - -from galaxy import datatypes, config -import tempfile, shutil - -def exec_before_job( app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - data_name = param_dict.get( 'name', 'HbVar query' ) - data_type = param_dict.get( 'type', 'txt' ) - if data_type == 'txt': data_type='interval' #All data is TSV, assume interval - name, data = out_data.items()[0] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = data_name - out_data[name] = data - -def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verifies the data after the run""" - - URL = param_dict.get( 'URL', None ) - URL = URL + '&_export=1&GALAXY_URL=0' - if not URL: - raise Exception('Datasource has not sent back a URL parameter') - - CHUNK_SIZE = 2**20 # 1Mb - MAX_SIZE = CHUNK_SIZE * 100 - - try: - page = urllib.urlopen(URL) - except Exception, exc: - raise Exception('Problems connecting to %s (%s)' % (URL, exc) ) - - name, data = out_data.items()[0] - - fp = open(data.file_name, 'wb') - size = 0 - while 1: - chunk = page.read(CHUNK_SIZE) - if not chunk: - break - if size > MAX_SIZE: - raise Exception('----- maximum datasize exceeded ---') - size += len(chunk) - fp.write(chunk) 
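The download loop above streams the response in 1 MB chunks and aborts once the hard MAX_SIZE cap is exceeded, rather than trusting a Content-Length header. A self-contained sketch of the same chunked-read-with-limit pattern (Python 2; URL and output path are placeholders, not from the tool)::

    import urllib

    CHUNK_SIZE = 2 ** 20           # 1 MB per read
    MAX_SIZE = CHUNK_SIZE * 100    # refuse anything over ~100 MB

    page = urllib.urlopen('http://example.org/query')  # placeholder URL
    fp = open('result.dat', 'wb')                      # placeholder path
    size = 0
    while 1:
        chunk = page.read(CHUNK_SIZE)
        if not chunk:
            break
        if size > MAX_SIZE:
            fp.close()
            raise Exception('maximum datasize exceeded')
        size += len(chunk)
        fp.write(chunk)
    fp.close()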
- - fp.close() - #Set meta data, format file to be valid interval type - if isinstance(data.datatype, datatypes.interval.Interval): - data.set_meta(first_line_is_header=True) - #check for missing meta data, if all there, comment first line and process file - if not data.missing_meta(): - line_ctr = -1 - temp = tempfile.NamedTemporaryFile('w') - temp_filename = temp.name - temp.close() - temp = open(temp_filename,'w') - chromCol = int(data.metadata.chromCol) - 1 - startCol = int(data.metadata.startCol) - 1 - strandCol = int(data.metadata.strandCol) - 1 - - - for line in open(data.file_name, 'r'): - line_ctr += 1 - - fields = line.strip().split('\t') - - temp.write("%s\n" % '\t'.join(fields)) - - temp.close() - shutil.move(temp_filename,data.file_name) - - else: - data = app.datatypes_registry.change_datatype(data, 'tabular') - data.set_size() - data.set_peek() - app.model.context.add( data ) - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/import.py --- a/tools/data_source/import.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -#!/usr/bin/env python - -""" -Script that imports locally stored data as a new dataset for the user -Usage: import id outputfile -""" -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -dataid = sys.argv[1] -out_name = sys.argv[2] - - -id2name = { - 'eryth' : 'ErythPreCRMmm3_cusTrk.txt', - 'cishg16' : 'ReglRegHBBhg16CusTrk.txt', - 'cishg17' : 'ReglRegHBBhg17CusTrk.txt', - 'exons' : 'ExonsKnownGenes_mm3.txt', - 'krhg16' : 'known_regulatory_hg16.bed', - 'krhg17' : 'known_regulatory_hg17.bed', - 'tARhg16mmc' : 'hg16.mouse.t_AR.cold.bed', - 'tARhg16mmm' : 'hg16.mouse.t_AR.medium.bed', - 'tARhg16mmh' : 'hg16.mouse.t_AR.hot.bed', - 'tARhg16rnc' : 'hg16.rat.t_AR.cold.bed', - 'tARhg16rnm' : 'hg16.rat.t_AR.medium.bed', - 'tARhg16rnh' : 'hg16.rat.t_AR.hot.bed', - 'phastConsHg16' : 'phastConsMost_hg16.bed', - 'omimhg16' : 'omimDisorders_hg16.tab', - 'omimhg17' : 'omimDisorders_hg17.tab', - -} - -fname = id2name.get(dataid, '') -if not fname: - print 'Importing invalid data %s' % dataid - sys.exit() -else: - print 'Imported %s' % fname - -# this path is hardcoded -inp_name = os.path.join('database', 'import', fname) - -try: - inp = open(inp_name, 'rt') -except: - print 'Could not find file %s' % inp_name - sys.exit() - -out = open(out_name, 'wt') - -while 1: - data = inp.read(BUFFER) - if not data: - break - out.write(data) - -inp.close() -out.close() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/import.xml --- a/tools/data_source/import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - (PSU prepared queries) - import.py $data $output - - $data - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/metabolicmine.xml --- a/tools/data_source/metabolicmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to metabolicMine server $GALAXY_URL - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import.py --- a/tools/data_source/microbial_import.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -#!/usr/bin/env python - -""" -Script that imports locally stored data as a new dataset for the user -Usage: import id outputfile -""" -import sys, os -from shutil import copyfile - 
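The configuration parsing further down in this script reads tab-separated records out of microbial_data.loc. A minimal sketch, not the tool's code, of how one DATA record unpacks into the fields the script uses (the sample line comes from a comment in microbial_import_code.py later in this patch)::

    line = "DATA\t12521_12521_CDS\t12521\tCP000315\tCDS\tbed\t/home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed"
    fields = line.rstrip('\r\n').split('\t')
    info_type, uid, org_num, chr_acc, feature, filetype, path = fields
    # microbial_import.py keys its lookup table on uid, storing
    # (description, path, build, file_type, chr_acc):
    available_files = {uid: (uid, path, org_num, filetype, chr_acc)}
    print(available_files['12521_12521_CDS'])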
-assert sys.version_info[:2] >= ( 2, 4 ) - -BUFFER = 1048576 - -uids = sys.argv[1].split(",") -out_file1 = sys.argv[2] - -#remove NONE from uids -have_none = True -while have_none: - try: - uids.remove('None') - except: - have_none = False - - -#create dictionary keyed by uid of tuples of (displayName,filePath,build) for all files -available_files = {} -try: - filename = sys.argv[-1] - for i, line in enumerate( file( filename ) ): - if not line or line[0:1] == "#" : continue - fields = line.split('\t') - try: - info_type = fields.pop(0) - - if info_type.upper()=="DATA": - uid = fields.pop(0) - org_num = fields.pop(0) - chr_acc = fields.pop(0) - feature = fields.pop(0) - filetype = fields.pop(0) - path = fields.pop(0).replace("\r","").replace("\n","") - - file_type = filetype - build = org_num - description = uid - else: - continue - except: - continue - - available_files[uid]=(description,path,build,file_type,chr_acc) -except: - print >>sys.stderr, "It appears that the configuration file for this tool is missing." - -#create list of tuples of (displayName,FileName,build) for desired files -desired_files = [] -for uid in uids: - try: - desired_files.append(available_files[uid]) - except: - continue - -#copy first file to contents of given output file -file1_copied = False -while not file1_copied: - try: - first_file = desired_files.pop(0) - except: - print >>sys.stderr, "There were no valid files requested." - sys.exit() - file1_desc, file1_path, file1_build, file1_type,file1_chr_acc = first_file - try: - copyfile(file1_path,out_file1) - print "#File1\t"+file1_desc+"\t"+file1_chr_acc+"\t"+file1_build+"\t"+file1_type - file1_copied = True - except: - print >>sys.stderr, "The file specified is missing." - continue - #print >>sys.stderr, "The file specified is missing." - - -#Tell post-process filter where remaining files reside -for extra_output in desired_files: - file_desc, file_path, file_build, file_type,file_chr_acc = extra_output - print "#NewFile\t"+file_desc+"\t"+file_chr_acc+"\t"+file_build+"\t"+file_path+"\t"+file_type diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import.xml --- a/tools/data_source/microbial_import.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ - - microbial_import.py $CDS,$tRNA,$rRNA,$sequence,$GeneMark,$GeneMarkHMM,$Glimmer3 $output ${GALAXY_DATA_INDEX_DIR}/microbial_data.loc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool will allow you to obtain various genomic datasets for any completed Microbial Genome Project as listed at NCBI_. - -.. _NCBI: http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?view=1 - -Current datasets available include - 1. CDS - 2. tRNA - 3. rRNA - 4. FASTA Sequences - 5. GeneMark Annotations - 6. GeneMarkHMM Annotations - 7. Glimmer3 Annotations - ------ - -Organisms in **bold** are available at the UCSC Browser. - ------ - -.. class:: infomark - -**Note:** Having trouble locating your organism? Click here_ for a list of available species and their location. - -.. 
_here: http://wiki.g2.bx.psu.edu/Main/Data%20Libraries/Microbes - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/microbial_import_code.py --- a/tools/data_source/microbial_import_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ - -def load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ): - # FIXME: this function is duplicated in the DynamicOptions class. It is used here only to - # set data.name in exec_after_process(). - microbe_info= {} - orgs = {} - - filename = "%s/microbial_data.loc" % GALAXY_DATA_INDEX_DIR - for i, line in enumerate( open( filename ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( sep ) - #read each line, if not enough fields, go to next line - try: - info_type = fields.pop(0) - if info_type.upper() == "ORG": - #ORG 12521 Clostridium perfringens SM101 bacteria Firmicutes CP000312,CP000313,CP000314,CP000315 http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=genomeprj&cmd=Retrieve&dopt=Overview&list_uids=12521 - org_num = fields.pop(0) - name = fields.pop(0) - kingdom = fields.pop(0) - group = fields.pop(0) - chromosomes = fields.pop(0) - info_url = fields.pop(0) - link_site = fields.pop(0) - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - orgs[ org_num ][ 'name' ] = name - orgs[ org_num ][ 'kingdom' ] = kingdom - orgs[ org_num ][ 'group' ] = group - orgs[ org_num ][ 'chromosomes' ] = chromosomes - orgs[ org_num ][ 'info_url' ] = info_url - orgs[ org_num ][ 'link_site' ] = link_site - elif info_type.upper() == "CHR": - #CHR 12521 CP000315 Clostridium perfringens phage phiSM101, complete genome 38092 110684521 CP000315.1 - org_num = fields.pop(0) - chr_acc = fields.pop(0) - name = fields.pop(0) - length = fields.pop(0) - gi = fields.pop(0) - gb = fields.pop(0) - info_url = fields.pop(0) - chr = {} - chr[ 'name' ] = name - chr[ 'length' ] = length - chr[ 'gi' ] = gi - chr[ 'gb' ] = gb - chr[ 'info_url' ] = info_url - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - orgs[ org_num ][ 'chrs' ][ chr_acc ] = chr - elif info_type.upper() == "DATA": - #DATA 12521_12521_CDS 12521 CP000315 CDS bed /home/djb396/alignments/playground/bacteria/12521/CP000315.CDS.bed - uid = fields.pop(0) - org_num = fields.pop(0) - chr_acc = fields.pop(0) - feature = fields.pop(0) - filetype = fields.pop(0) - path = fields.pop(0) - data = {} - data[ 'filetype' ] = filetype - data[ 'path' ] = path - data[ 'feature' ] = feature - - if org_num not in orgs: - orgs[ org_num ] = {} - orgs[ org_num ][ 'chrs' ] = {} - if 'data' not in orgs[ org_num ][ 'chrs' ][ chr_acc ]: - orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ] = {} - orgs[ org_num ][ 'chrs' ][ chr_acc ][ 'data' ][ uid ] = data - else: continue - except: continue - for org_num in orgs: - org = orgs[ org_num ] - if org[ 'kingdom' ] not in microbe_info: - microbe_info[ org[ 'kingdom' ] ] = {} - if org_num not in microbe_info[ org[ 'kingdom' ] ]: - microbe_info[ org[ 'kingdom' ] ][org_num] = org - return microbe_info - -#post processing, set build for data and add additional data to history -from galaxy import datatypes, config, jobs, tools -from shutil import copyfile - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_dataset = out_data.items()[0][1] - history = base_dataset.history - if history == None: - print "unknown history!" 
- return - kingdom = param_dict.get( 'kingdom', None ) - #group = param_dict.get( 'group', None ) - org = param_dict.get( 'org', None ) - - #if not (kingdom or group or org): - if not (kingdom or org): - print "Parameters are not available." - #workflow passes galaxy.tools.parameters.basic.UnvalidatedValue instead of values - if isinstance( kingdom, tools.parameters.basic.UnvalidatedValue ): - kingdom = kingdom.value - if isinstance( org, tools.parameters.basic.UnvalidatedValue ): - org = org.value - - GALAXY_DATA_INDEX_DIR = app.config.tool_data_path - microbe_info = load_microbial_data( GALAXY_DATA_INDEX_DIR, sep='\t' ) - new_stdout = "" - split_stdout = stdout.split("\n") - basic_name = "" - for line in split_stdout: - fields = line.split("\t") - if fields[0] == "#File1": - description = fields[1] - chr = fields[2] - dbkey = fields[3] - file_type = fields[4] - name, data = out_data.items()[0] - data.set_size() - basic_name = data.name - data.name = data.name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for " + microbe_info[kingdom][org]['name'] + ":" + chr + ")" - data.dbkey = dbkey - data.info = data.name - data = app.datatypes_registry.change_datatype( data, file_type ) - data.init_meta() - data.set_peek() - app.model.context.add( data ) - app.model.context.flush() - elif fields[0] == "#NewFile": - description = fields[1] - chr = fields[2] - dbkey = fields[3] - filepath = fields[4] - file_type = fields[5] - newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) #This import should become a library - newdata.set_size() - newdata.extension = file_type - newdata.name = basic_name + " (" + microbe_info[kingdom][org]['chrs'][chr]['data'][description]['feature'] +" for "+microbe_info[kingdom][org]['name']+":"+chr + ")" - app.model.context.add( newdata ) - app.model.context.flush() - app.security_agent.copy_dataset_permissions( base_dataset.dataset, newdata.dataset ) - history.add_dataset( newdata ) - app.model.context.add( history ) - app.model.context.flush() - try: - copyfile(filepath,newdata.file_name) - newdata.info = newdata.name - newdata.state = jobs.JOB_OK - except: - newdata.info = "The requested file is missing from the system." 
- newdata.state = jobs.JOB_ERROR - newdata.dbkey = dbkey - newdata.init_meta() - newdata.set_peek() - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/modmine.xml --- a/tools/data_source/modmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE modMine server $GALAXY_URL - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ratmine.xml --- a/tools/data_source/ratmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ - - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Ratmine server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_archaea.xml --- a/tools/data_source/ucsc_archaea.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser - - - ucsc_proxy.py $param_file $output - - - - go to UCSC $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_filter.py --- a/tools/data_source/ucsc_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -# runs after the job (and after the default post-filter) -from galaxy import datatypes, jobs - -def validate(incoming): - """Validator""" - #raise Exception, 'not quite right' - pass - -def exec_before_job( app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - outputType = param_dict.get( 'hgta_outputType', None ) - if isinstance(outputType, list) and len(outputType)>0: outputType = outputType[-1] - items = out_data.items() - - for name, data in items: - data.name = param_dict.get('display', data.name) - data.dbkey = param_dict.get('dbkey', '???') - - if outputType == 'wigData': - ext = "wig" - elif outputType == 'maf': - ext = "maf" - elif outputType == 'gff': - ext = "gff" - elif outputType == 'gff3': - ext = "gff3" - else: - if 'hgta_doPrintSelectedFields' in param_dict: - ext = "interval" - elif 'hgta_doGetBed' in param_dict: - ext = "bed" - elif 'hgta_doGenomicDna' in param_dict: - ext = "fasta" - elif 'hgta_doGenePredSequence' in param_dict: - ext = "fasta" - else: - ext = "interval" - - data = app.datatypes_registry.change_datatype(data, ext) - out_data[name] = data - -def exec_after_process( app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verifies the data after the run""" - items = out_data.items() - for name, data in items: - data.set_size() - try: - err_msg, err_flag = 'Errors:', False - line_count = 0 - num_lines = len(file(data.file_name).readlines()) - for line in file(data.file_name): - line_count += 1 - if line and line[0] == '-': - if line_count + 3 == num_lines and not err_flag: - err_flag = True - err_msg = "Warning: It appears that your results have been truncated by UCSC. View the bottom of your result file for details." 
- break - err_flag = True - err_msg = err_msg +" (line "+str(line_count)+")"+line - data.set_peek() - if isinstance(data.datatype, datatypes.interval.Interval) and data.missing_meta(): - data = app.datatypes_registry.change_datatype(data, 'tabular') - out_data[name] = data - if err_flag: - raise Exception(err_msg) - except Exception, exc: - data.info = data.info + "\n" + str(exc) - data.blurb = "error" diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_proxy.py --- a/tools/data_source/ucsc_proxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#!/usr/bin/env python -import urllib -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -CHUNK = 2**20 # 1Mb -MAXSIZE = CHUNK * 100 -if __name__ == '__main__': - - if len(sys.argv) != 3: - print 'Usage ucsc.py input_params output_file' - sys.exit() - - inp_file = sys.argv[1] - out_file = sys.argv[2] - - DEFAULT_URL = "http://genome.ucsc.edu/hgTables?" - - # this must stay a list to allow multiple selections for the same widget name (checkboxes) - params = [] - for line in file(inp_file): - line = line.strip() - if line: - parts = line.split('=') - if len(parts) == 0: - key = "" - value = "" - elif len(parts) == 1: - key = parts[0] - value = "" - else: - key = parts[0] - value = parts[1] - if key == 'display': - print value - # get url from params, refered from proxy.py, initialized by the tool xml - elif key == 'proxy_url': - DEFAULT_URL = value - else: - params.append( (key, value) ) - - #print params - - encoded_params = urllib.urlencode(params) - url = DEFAULT_URL + encoded_params - - #print url - - page = urllib.urlopen(url) - - fp = open(out_file, 'wt') - size = 0 - while 1: - data = page.read(CHUNK) - if not data: - break - if size > MAXSIZE: - fp.write('----- maximum datasize exceeded ---\n') - break - size += len(data) - fp.write(data) - - fp.close() - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_proxy.xml --- a/tools/data_source/ucsc_proxy.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser proxy - - - ucsc_proxy.py $param_file $output - - - - go to UCSC $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser.xml --- a/tools/data_source/ucsc_tablebrowser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser_archaea.xml --- a/tools/data_source/ucsc_tablebrowser_archaea.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_tablebrowser_test.xml --- a/tools/data_source/ucsc_tablebrowser_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - table browser - data_source.py $output $__app__.config.output_size_limit - - go to UCSC Table Browser $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/ucsc_testproxy.xml --- 
a/tools/data_source/ucsc_testproxy.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - - - table browser proxy - - - ucsc_proxy.py $param_file $output - - - - go to UCSC genome-test $init $hgta_outputType - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/upload.py --- a/tools/data_source/upload.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,394 +0,0 @@ -#!/usr/bin/env python -#Processes uploads from the user. - -# WARNING: Changes in this tool (particularly as related to parsing) may need -# to be reflected in galaxy.web.controllers.tool_runner and galaxy.tools - -import urllib, sys, os, gzip, tempfile, shutil, re, gzip, zipfile, codecs, binascii -from galaxy import eggs -# need to import model before sniff to resolve a circular import dependency -import galaxy.model -from galaxy.datatypes.checkers import * -from galaxy.datatypes import sniff -from galaxy.datatypes.binary import * -from galaxy.datatypes.images import Pdf -from galaxy.datatypes.registry import Registry -from galaxy import util -from galaxy.datatypes.util.image_util import * -from galaxy.util.json import * - -try: - import Image as PIL -except ImportError: - try: - from PIL import Image as PIL - except: - PIL = None - -try: - import bz2 -except: - bz2 = None - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg, ret=1 ): - sys.stderr.write( msg ) - sys.exit( ret ) -def file_err( msg, dataset, json_file ): - json_file.write( to_json_string( dict( type = 'dataset', - ext = 'data', - dataset_id = dataset.dataset_id, - stderr = msg ) ) + "\n" ) - # never remove a server-side upload - if dataset.type in ( 'server_dir', 'path_paste' ): - return - try: - os.remove( dataset.path ) - except: - pass -def safe_dict(d): - """ - Recursively clone json structure with UTF-8 dictionary keys - http://mellowmachines.com/blog/2009/06/exploding-dictionary-with-unicode-keys-as-python-arguments/ - """ - if isinstance(d, dict): - return dict([(k.encode('utf-8'), safe_dict(v)) for k,v in d.iteritems()]) - elif isinstance(d, list): - return [safe_dict(x) for x in d] - else: - return d -def check_bam( file_path ): - return Bam().sniff( file_path ) -def check_sff( file_path ): - return Sff().sniff( file_path ) -def check_pdf( file_path ): - return Pdf().sniff( file_path ) -def check_bigwig( file_path ): - return BigWig().sniff( file_path ) -def check_bigbed( file_path ): - return BigBed().sniff( file_path ) -def parse_outputs( args ): - rval = {} - for arg in args: - id, files_path, path = arg.split( ':', 2 ) - rval[int( id )] = ( path, files_path ) - return rval -def add_file( dataset, registry, json_file, output_path ): - data_type = None - line_count = None - converted_path = None - stdout = None - link_data_only = dataset.get( 'link_data_only', 'copy_files' ) - - try: - ext = dataset.file_type - except AttributeError: - file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file ) - return - - if dataset.type == 'url': - try: - temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' ) - except Exception, e: - file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file ) - return - dataset.path = temp_name - # See if we have an empty file - if not os.path.exists( dataset.path ): - file_err( 'Uploaded temporary file (%s) does not exist.' 
% dataset.path, dataset, json_file ) - return - if not os.path.getsize( dataset.path ) > 0: - file_err( 'The uploaded file is empty', dataset, json_file ) - return - if not dataset.type == 'url': - # Already set is_multi_byte above if type == 'url' - try: - dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) ) - except UnicodeDecodeError, e: - dataset.is_multi_byte = False - # Is dataset an image? - image = check_image( dataset.path ) - if image: - if not PIL: - image = None - # get_image_ext() returns None if not a supported image type - ext = get_image_ext( dataset.path, image ) - data_type = ext - # Is dataset content multi-byte? - elif dataset.is_multi_byte: - data_type = 'multi-byte char' - ext = sniff.guess_ext( dataset.path, is_multi_byte=True ) - # Is dataset content supported sniffable binary? - elif check_bam( dataset.path ): - ext = 'bam' - data_type = 'bam' - elif check_sff( dataset.path ): - ext = 'sff' - data_type = 'sff' - elif check_pdf( dataset.path ): - ext = 'pdf' - data_type = 'pdf' - elif check_bigwig( dataset.path ): - ext = 'bigwig' - data_type = 'bigwig' - elif check_bigbed( dataset.path ): - ext = 'bigbed' - data_type = 'bigbed' - if not data_type: - # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress - is_gzipped, is_valid = check_gzip( dataset.path ) - if is_gzipped and not is_valid: - file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file ) - return - elif is_gzipped and is_valid: - if link_data_only == 'copy_files': - # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - gzipped_file = gzip.GzipFile( dataset.path, 'rb' ) - while 1: - try: - chunk = gzipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing gzipped data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - gzipped_file.close() - # Replace the gzipped file with the decompressed file if it's safe to do so - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - else: - shutil.move( uncompressed, dataset.path ) - if dataset.name.endswith( '.gz' ): - dataset.name = dataset.name[:-3] - data_type = 'gzip' - if not data_type and bz2 is not None: - # See if we have a bz2 file, much like gzip - is_bzipped, is_valid = check_bz2( dataset.path ) - if is_bzipped and not is_valid: - file_err( 'The bz2-compressed uploaded file contains inappropriate content', dataset, json_file ) - return - elif is_bzipped and is_valid: - if link_data_only == 'copy_files': - # We need to uncompress the temp_name file - CHUNK_SIZE = 2**20 # 1Mb - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - bzipped_file = bz2.BZ2File( dataset.path, 'rb' ) - while 1: - try: - chunk = bzipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing bz2 compressed data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - bzipped_file.close() - # Replace the bzipped file with the decompressed file if it's safe to do so - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - 
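# For server-side imports ( 'server_dir', 'path_paste' ) the original
# file must stay untouched, so only dataset.path is repointed at the
# decompressed temp copy; an ordinary upload is replaced in place below.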
else: - shutil.move( uncompressed, dataset.path ) - if dataset.name.endswith( '.bz2' ): - dataset.name = dataset.name[:-4] - data_type = 'bz2' - if not data_type: - # See if we have a zip archive - is_zipped = check_zip( dataset.path ) - if is_zipped: - if link_data_only == 'copy_files': - CHUNK_SIZE = 2**20 # 1Mb - uncompressed = None - uncompressed_name = None - unzipped = False - z = zipfile.ZipFile( dataset.path ) - for name in z.namelist(): - if name.endswith('/'): - continue - if unzipped: - stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' - break - fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname( output_path ), text=False ) - if sys.version_info[:2] >= ( 2, 6 ): - zipped_file = z.open( name ) - while 1: - try: - chunk = zipped_file.read( CHUNK_SIZE ) - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - if not chunk: - break - os.write( fd, chunk ) - os.close( fd ) - zipped_file.close() - uncompressed_name = name - unzipped = True - else: - # python < 2.6 doesn't have a way to read members in chunks(!) - try: - outfile = open( uncompressed, 'wb' ) - outfile.write( z.read( name ) ) - outfile.close() - uncompressed_name = name - unzipped = True - except IOError: - os.close( fd ) - os.remove( uncompressed ) - file_err( 'Problem decompressing zipped data', dataset, json_file ) - return - z.close() - # Replace the zipped file with the decompressed file if it's safe to do so - if uncompressed is not None: - if dataset.type in ( 'server_dir', 'path_paste' ): - dataset.path = uncompressed - else: - shutil.move( uncompressed, dataset.path ) - dataset.name = uncompressed_name - data_type = 'zip' - if not data_type: - if check_binary( dataset.path ): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - #binary_ok = False - parts = dataset.name.split( "." ) - if len( parts ) > 1: - ext = parts[-1].strip().lower() - if ext not in unsniffable_binary_formats: - file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) - return - elif ext in unsniffable_binary_formats and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) - file_err( err_msg, dataset, json_file ) - return - if not data_type: - # We must have a text file - if check_html( dataset.path ): - file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) - return - if data_type != 'binary': - if link_data_only == 'copy_files': - in_place = True - if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: - in_place = False - if dataset.space_to_tab: - line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place ) - else: - line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place ) - if dataset.file_type == 'auto': - ext = sniff.guess_ext( dataset.path, registry.sniff_order ) - else: - ext = dataset.file_type - data_type = ext - # Save job info for the framework - if ext == 'auto' and dataset.ext: - ext = dataset.ext - if ext == 'auto': - ext = 'data' - datatype = registry.get_datatype_by_extension( ext ) - if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files': - # Never alter a file that will not be copied to Galaxy's local file store. 
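# "Grooming" is a datatype-specific normalization pass (for BAM data,
# for example, a coordinate sort); it rewrites the file, which is why a
# linked-but-not-copied dataset that needs grooming is rejected below.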
- if datatype.dataset_content_needs_grooming( dataset.path ): - err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ - 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' - file_err( err_msg, dataset, json_file ) - return - if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: - # Move the dataset to its "real" path - if converted_path is not None: - shutil.copy( converted_path, output_path ) - try: - os.remove( converted_path ) - except: - pass - else: - # This should not happen, but it's here just in case - shutil.copy( dataset.path, output_path ) - elif link_data_only == 'copy_files': - shutil.move( dataset.path, output_path ) - # Write the job info - stdout = stdout or 'uploaded %s file' % data_type - info = dict( type = 'dataset', - dataset_id = dataset.dataset_id, - ext = ext, - stdout = stdout, - name = dataset.name, - line_count = line_count ) - json_file.write( to_json_string( info ) + "\n" ) - if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path ): - # Groom the dataset content if necessary - datatype.groom_dataset_content( output_path ) -def add_composite_file( dataset, registry, json_file, output_path, files_path ): - if dataset.composite_files: - os.mkdir( files_path ) - for name, value in dataset.composite_files.iteritems(): - value = util.bunch.Bunch( **value ) - if dataset.composite_file_paths[ value.name ] is None and not value.optional: - file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file ) - break - elif dataset.composite_file_paths[value.name] is not None: - dp = dataset.composite_file_paths[value.name][ 'path' ] - isurl = dp.find('://') <> -1 # todo fixme - if isurl: - try: - temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' ) - except Exception, e: - file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file ) - return - dataset.path = temp_name - dp = temp_name - if not value.is_binary: - if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ): - sniff.convert_newlines_sep2tabs( dp ) - else: - sniff.convert_newlines( dp ) - shutil.move( dp, os.path.join( files_path, name ) ) - # Move the dataset to its "real" path - shutil.move( dataset.primary_file, output_path ) - # Write the job info - info = dict( type = 'dataset', - dataset_id = dataset.dataset_id, - stdout = 'uploaded %s file' % dataset.file_type ) - json_file.write( to_json_string( info ) + "\n" ) - -def __main__(): - - if len( sys.argv ) < 4: - print >>sys.stderr, 'usage: upload.py ...' 
- sys.exit( 1 ) - - output_paths = parse_outputs( sys.argv[4:] ) - json_file = open( 'galaxy.json', 'w' ) - - registry = Registry( sys.argv[1], sys.argv[2] ) - - for line in open( sys.argv[3], 'r' ): - dataset = from_json_string( line ) - dataset = util.bunch.Bunch( **safe_dict( dataset ) ) - try: - output_path = output_paths[int( dataset.dataset_id )][0] - except: - print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id - sys.exit( 1 ) - if dataset.type == 'composite': - files_path = output_paths[int( dataset.dataset_id )][1] - add_composite_file( dataset, registry, json_file, output_path, files_path ) - else: - add_file( dataset, registry, json_file, output_path ) - # clean up paramfile - try: - os.remove( sys.argv[3] ) - except: - pass - -if __name__ == '__main__': - __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/upload.xml --- a/tools/data_source/upload.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,213 +0,0 @@ - - - - - from your computer - - - - upload.py $GALAXY_ROOT_DIR $GALAXY_DATATYPES_CONF_FILE $paramfile - #set $outnum = 0 - #while $varExists('output%i' % $outnum): - #set $output = $getVar('output%i' % $outnum) - #set $outnum += 1 - #set $file_name = $output.file_name - ## FIXME: This is not future-proof for other uses of external_filename (other than for use by the library upload's "link data" feature) - #if $output.dataset.dataset.external_filename: - #set $file_name = "None" - #end if - ${output.dataset.dataset.id}:${output.files_path}:${file_name} - #end while - - - - - - - - - - - - - - not ( ( isinstance( value, unicode ) or isinstance( value, str ) ) and value != "" ) - - - - - - - - - - - - - -**Auto-detect** - -The system will attempt to detect Axt, Fasta, Fastqsolexa, Gff, Gff3, Html, Lav, Maf, Tabular, Wiggle, Bed and Interval (Bed with headers) formats. If your file is not detected properly as one of the known formats, it most likely means that it has some format problems (e.g., different number of columns on different rows). You can still coerce the system to set your data to the format you think it should be. You can also upload compressed files, which will automatically be decompressed. - ------ - -**Ab1** - -A binary sequence file in 'ab1' format with a '.ab1' file extension. You must manually select this 'File Format' when uploading the file. - ------ - -**Axt** - -blastz pairwise alignment format. Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. The summary line contains chromosomal position and size information about the alignment. It consists of 9 required fields. - ------ - -**Bam** - -A binary file compressed in the BGZF format with a '.bam' file extension. - ------ - -**Bed** - -* Tab delimited format (tabular) -* Does not require header line -* Contains 3 required fields: - - - chrom - The name of the chromosome (e.g. chr3, chrY, chr2_random) or contig (e.g. ctgY1). - - chromStart - The starting position of the feature in the chromosome or contig. The first base in a chromosome is numbered 0. - - chromEnd - The ending position of the feature in the chromosome or contig. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. - -* May contain 9 additional optional BED fields: - - - name - Defines the name of the BED line. 
This label is displayed to the left of the BED line in the Genome Browser window when the track is open to full display mode or directly to the left of the item in pack mode. - - score - A score between 0 and 1000. If the track line useScore attribute is set to 1 for this annotation data set, the score value will determine the level of gray in which this feature is displayed (higher numbers = darker gray). - - strand - Defines the strand - either '+' or '-'. - - thickStart - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays). - - thickEnd - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays). - - itemRgb - An RGB value of the form R,G,B (e.g. 255,0,0). If the track line itemRgb attribute is set to "On", this RGB value will determine the display color of the data contained in this BED line. NOTE: It is recommended that a simple color scheme (eight colors or fewer) be used with this attribute to avoid overwhelming the color resources of the Genome Browser and your Internet browser. - - blockCount - The number of blocks (exons) in the BED line. - - blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - - blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - -* Example:: - - chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512 - chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601 - ------ - -**Fasta** - -A sequence in FASTA format consists of a single-line description, followed by lines of sequence data. The first character of the description line is a greater-than (">") symbol in the first column. All lines should be shorter than 80 characters:: - - >sequence1 - atgcgtttgcgtgc - gtcggtttcgttgc - >sequence2 - tttcgtgcgtatag - tggcgcggtga - ------ - -**FastqSolexa** - -FastqSolexa is the Illumina (Solexa) variant of the Fastq format, which stores sequences and quality scores in a single file:: - - @seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh - @seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - +seq2 - hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO - -Or:: - - @seq1 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - +seq1 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - @seq2 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - +seq2 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - ------ - -**Gff** - -GFF lines have nine required fields that must be tab-separated. - ------ - -**Gff3** - -The GFF3 format addresses the most common extensions to GFF, while preserving backward compatibility with previous formats. - ------ - -**Interval (Genomic Intervals)** - -- Tab delimited format (tabular) -- File must start with definition line in the following format (columns may be in any order).:: - - #CHROM START END STRAND - -- CHROM - The name of the chromosome (e.g. chr3, chrY, chr2_random) or contig (e.g. ctgY1). -- START - The starting position of the feature in the chromosome or contig. The first base in a chromosome is numbered 0. -- END - The ending position of the feature in the chromosome or contig. The chromEnd base is not included in the display of the feature. 
For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. -- STRAND - Defines the strand - either '+' or '-'. - -- Example:: - - #CHROM START END STRAND NAME COMMENT - chr1 10 100 + exon myExon - chrX 1000 10050 - gene myGene - ------ - -**Lav** - -Lav is the primary output format for BLASTZ. The first line of a .lav file begins with #:lav.. - ------ - -**MAF** - -TBA and multiz multiple alignment format. The first line of a .maf file begins with ##maf. This word is followed by white-space-separated "variable=value" pairs. There should be no white space surrounding the "=". - ------ - -**Scf** - -A binary sequence file in 'scf' format with a '.scf' file extension. You must manually select this 'File Format' when uploading the file. - ------ - -**Sff** - -A binary file in 'Standard Flowgram Format' with a '.sff' file extension. - ------ - -**Tabular (tab delimited)** - -Any data in tab delimited format (tabular) - ------ - -**Wig** - -The wiggle format is line-oriented. Wiggle data is preceded by a track definition line, which adds a number of options for controlling the default display of this track. - ------ - -**Other text type** - -Any text file - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/worm_modencode.xml --- a/tools/data_source/worm_modencode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to modENCODE worm server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/wormbase.xml --- a/tools/data_source/wormbase.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to Wormbase server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/wormbase_test.xml --- a/tools/data_source/wormbase_test.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - - test server - data_source.py $output $__app__.config.output_size_limit - - go to Wormbase test server $GALAXY_URL - - - - - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/data_source/yeastmine.xml --- a/tools/data_source/yeastmine.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@ - - - server - data_source.py $output $__app__.config.output_size_limit - - go to yeastMine server $GALAXY_URL - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_IvC_all.pl --- a/tools/discreteWavelet/execute_dwt_IvC_all.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,210 +0,0 @@ -#!/usr/bin/perl -w -use warnings; -use IO::Handle; - -$usage = "execute_dwt_IvC_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || 
die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "IvC"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_IvC_test.r"; -print "$r_script \n"; - -# R script -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ########################################################################################### - # code to do wavelet Indel vs. Control - # signal is the difference I-C; function is second moment i.e. variance from zero not mean - # to perform wavelet transf. of signal, scale-by-scale analysis of the function - # create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ############################################################################################ - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"moment2\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); - } - print(title); - - # loop to compare a vs a - for(i in 1:length(names.short)){ - wave1.dwt = NULL; - m2.dwt = diff = var.dwt = NULL; - out = NULL; - out <- vector(length = length(title)); - - print(names.short[i]); - print(names.long[i]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] != names.long[i]){ - stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \")); - } - else { - # signal is the difference I-C data sets - diff<-data.short[,i]-data.long[,i]; - - # normalize the signal - diff<-norm(diff); - - # function is 2nd moment - # 2nd moment m_j = 1/N[sum_N(W_j + V_J)^2] = 1/N sum_N(W_j)^2 + (X_bar)^2 - wave1.dwt <- dwt(diff, wf = wf, short.levels, boundary = boundary); - var.dwt <- wave.variance(wave1.dwt); - m2.dwt <- vector(length = short.levels) - for(level in 1:short.levels){ - m2.dwt[level] <- var.dwt[level, 1] + (mean(diff)^2); - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, i]; - null = results = med = NULL; - m2_25 = m2_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - m2_null = var_null = NULL; - null.levels = null_wave1 = null_diff = NULL; - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - null_diff <- nk_1-nk_2; - null_diff <- norm(null_diff); - null_wave1 <- 
dwt(null_diff, wf = wf, short.levels, boundary = boundary); - var_null <- wave.variance(null_wave1); - m2_null <- vector(length = null.levels); - for(level in 1:null.levels){ - m2_null[level] <- var_null[level, 1] + (mean(null_diff)^2); - } - null= rbind(null, m2_null); - } - - null <- apply(null, 2, sort, na.last = TRUE); - m2_25 <- null[25,]; - m2_975 <- null[975,]; - med <- apply(null, 2, median, na.rm = TRUE); - - # plot - results <- cbind(m2.dwt, m2_25, m2_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), xlab = \"Wavelet Scale\", ylab = c(\"Wavelet 2nd Moment\", test), main = (names.short[i]), cex.main = 0.75); - abline(h = 1); - - # get pvalues by comparison to null distribution - out <- c(names.short[i]); - for (m in 1:length(m2.dwt)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"m2\", m2.dwt[m], sep = \" \")); - print(paste(\"median\", med[m], sep = \" \")); - out <- c(out, format(m2.dwt[m], digits = 4)); - pv = NULL; - if(is.na(m2.dwt[m])){ - pv <- \"NA\"; - } - else { - if (m2.dwt[m] >= med[m]){ - # R tail test - tail <- \"R\"; - pv <- (length(which(null[, m] >= m2.dwt[m])))/(length(na.exclude(null[, m]))); - } - else{ - if (m2.dwt[m] < med[m]){ - # L tail test - tail <- \"L\"; - pv <- (length(which(null[, m] <= m2.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - } - out <- c(out, pv); - print(pv); - out <- c(out, tail); - } - final_pvalue <-rbind(final_pvalue, out); - print(out); - } - } - - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE); - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData <- read.delim(\"$firstInputFile\"); - inputDataNames <- colnames(inputData); - - controlData <- read.delim(\"$secondInputFile\"); - controlDataNames <- colnames(controlData); - - # call the test function to implement IvC test - dwt_cor(inputData, inputDataNames, controlData, controlDataNames, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); -\n"; - -print Rcmd "#eof\n"; - -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_IvC_all.xml --- a/tools/discreteWavelet/execute_dwt_IvC_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - between two datasets using Discrete Wavelet Transforms - - - execute_dwt_IvC_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table of second moments, p-values, and test orientations at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S2, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. 
Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the count results in each interval of S1 and S2. These are the input files of the program. - -The program gives two output files: - -- The first output file is a TABULAR format file representing the second moments, p-values, and test orientations for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the second moment for that feature at every scale. - ------ - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. - ------ - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 226 403 416 221 1165 - 236 444 380 241 1223 - 242 496 391 195 1116 - 243 429 364 191 1118 - 244 410 371 236 1063 - 230 386 370 217 1087 - 275 404 402 214 1044 - 265 443 365 231 1086 - 255 390 354 246 1114 - 281 384 406 232 1102 - 263 459 369 251 1135 - 280 433 400 251 1159 - 278 385 382 231 1147 - 248 393 389 211 1162 - 251 403 385 246 1114 - 239 383 347 227 1172 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 235 374 407 257 1159 - 244 356 353 212 1128 - 233 343 322 204 1110 - 222 329 398 253 1054 - 216 325 328 253 1129 - 257 368 352 221 1115 - 238 360 346 224 1102 - 225 350 377 248 1107 - 230 330 365 236 1132 - 241 389 357 220 1120 - 274 354 392 235 1120 - 250 379 354 210 1102 - 254 329 320 251 1080 - 221 355 406 279 1127 - 224 330 390 249 1129 - 246 366 364 218 1176 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif 1_moment2 1_pval 1_test 2_moment2 2_pval 2_test 3_moment2 3_pval 3_test 4_moment2 4_pval 4_test - - deletionHoptspot 0.8751 0.376 L 1.549 0.168 R 0.6152 0.434 L 0.5735 0.488 R - insertionHoptspot 0.902 0.396 L 1.172 0.332 R 0.6843 0.456 L 1.728 0.213 R - dnaPolPauseFrameshift 1.65 0.013 R 0.267 0.055 L 0.1387 0.124 L 0.4516 0.498 L - topoisomeraseCleavageSite 0.7443 0.233 L 1.023 0.432 R 1.933 0.155 R 1.09 0.3 R - translinTarget 0.5084 0.057 L 0.8219 0.446 L 3.604 0.019 R 0.4377 0.492 L - -The second output file: - -.. image:: ./static/operation_icons/dwt_IvC_1.png -.. image:: ./static/operation_icons/dwt_IvC_2.png -.. image:: ./static/operation_icons/dwt_IvC_3.png -.. image:: ./static/operation_icons/dwt_IvC_4.png -.. 
image:: ./static/operation_icons/dwt_IvC_5.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl --- a/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,221 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; - -$usage = "execute_dwt_cor_aVa_perClass.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "cor_aVa"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_cor_aVa_test.r"; -print "$r_script \n"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ################################################################################## - # code to do all correlation tests of form: motif(a) vs. 
motif(a) - # add code to create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ################################################################################## - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data.short <- apply(data.short, 2, norm); - data.long <- apply(data.long, 2, norm); - - for(i in 1:length(names.short)){ - # Kendall Tau - # DWT wavelet correlation function - # include significance to compare - wave1.dwt = wave2.dwt = NULL; - tau.dwt = NULL; - out = NULL; - - print(names.short[i]); - print(names.long[i]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] != names.long[i]){ - stop(paste(\"motif\", names.short[i], \"is not the same as\", names.long[i], sep = \" \")); - } - else { - wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary); - wave2.dwt <- dwt(data.long[, i], wf = wf, short.levels, boundary = boundary); - tau.dwt <- vector(length=short.levels) - - #perform cor test on wavelet coefficients per scale - for(level in 1:short.levels){ - w1_level = w2_level = NULL; - w1_level <- (wave1.dwt[[level]]); - w2_level <- (wave2.dwt[[level]]); - tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate; - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, i]; - null = results = med = NULL; - cor_25 = cor_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - null.levels = NULL; - cor = NULL; - null_wave1 = null_wave2 = NULL; - - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - cor <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary); - - for(level in 1:null.levels){ - null_level1 = null_level2 = NULL; - null_level1 <- (null_wave1[[level]]); - null_level2 <- (null_wave2[[level]]); - cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate; - } - null = rbind(null, cor); - } - - null <- apply(null, 2, sort, na.last = TRUE); - print(paste(\"NAs\", length(which(is.na(null))), sep = \" \")); - cor_25 <- null[25,]; - cor_975 <- null[975,]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(tau.dwt, cor_25, cor_975); - matplot(results, type = \"b\", pch = \"*\" , lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], sep = \" \")), cex.main = 0.75); - abline(h = 0); - - # get pvalues by 
comparison to null distribution - ### modify pval calculation for error type II of T test #### - out <- (names.short[i]); - for (m in 1:length(tau.dwt)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"tau\", tau.dwt[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - out <- c(out, format(tau.dwt[m], digits = 3)); - pv = NULL; - if(is.na(tau.dwt[m])){ - pv <- \"NA\"; - } - else { - if (tau.dwt[m] >= med[m]){ - # R tail test - print(paste(\"R\")); - ### per sv ok to use inequality not strict - pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m]))); - if (tau.dwt[m] == med[m]){ - print(\"tau == med\"); - print(summary(null[, m])); - } - } - else if (tau.dwt[m] < med[m]){ - # L tail test - print(paste(\"L\")); - pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - out <- c(out, pv); - print(paste(\"pval\", pv, sep = \" \")); - } - final_pvalue <- rbind(final_pvalue, out); - print(out); - } - } - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE) - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData1 = inputData2 = NULL; - inputData.short1 = inputData.short2 = NULL; - inputDataNames.short1 = inputDataNames.short2 = NULL; - - inputData1 <- read.delim(\"$firstInputFile\"); - inputData.short1 <- inputData1[, +c(1:ncol(inputData1))]; - inputDataNames.short1 <- colnames(inputData.short1); - - inputData2 <- read.delim(\"$secondInputFile\"); - inputData.short2 <- inputData2[, +c(1:ncol(inputData2))]; - inputDataNames.short2 <- colnames(inputData.short2); - - # cor test for motif(a) in inputData1 vs motif(a) in inputData2 - dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); - - #eof\n"; -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml --- a/tools/discreteWavelet/execute_dwt_cor_aVa_perClass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - between two datasets using Discrete Wavelet Transforms - - - execute_dwt_cor_aVa_perClass.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table of correlation coefficients and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another, using a multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S2, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. 
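As an illustrative aside (not part of the original tool), the relationship between the interval count n and the scale count k can be checked with a few lines of Python; the helper name below is hypothetical::

    def num_scales(n):
        # n must be a power of two, n = 2^k with k >= 1
        if n < 2 or n & (n - 1) != 0:
            raise ValueError("interval count must be a power of two >= 2")
        return n.bit_length() - 1

    assert num_scales(16) == 4   # 16 intervals per sequence -> 4 scales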
- -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S2, and builds two tabular files representing the count results in each interval of S1 and S2. These are the input files of the program. - -The program gives two output files: - -- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlation for that feature at every scale. - ------ - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, the program implements a random permutation test, so it gives slightly different results each time it is run on the same input file. - ------ - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 269 366 330 238 1129 - 239 328 327 283 1188 - 254 351 358 297 1151 - 262 371 355 256 1107 - 254 361 352 234 1192 - 265 354 367 240 1182 - 255 359 333 235 1217 - 271 389 387 272 1241 - 240 305 341 249 1159 - 272 351 337 257 1169 - 275 351 337 233 1158 - 305 331 361 253 1172 - 277 341 343 253 1113 - 266 362 355 267 1162 - 235 326 329 241 1230 - 254 335 360 251 1172 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 104 146 142 113 478 - 89 146 151 94 495 - 100 176 151 88 435 - 96 163 128 114 468 - 99 138 144 91 513 - 112 126 162 106 468 - 86 127 145 83 491 - 104 145 171 110 496 - 91 121 147 104 469 - 103 141 145 98 458 - 92 134 142 117 468 - 97 146 145 107 471 - 115 121 136 109 470 - 113 135 138 101 491 - 111 150 138 102 451 - 94 128 151 138 481 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval - - deletionHoptspot 0.4 0.072 0.143 0.394 -0.667 0.244 1 0.491 - insertionHoptspot 0.343 0.082 -0.0714 0.446 -1 0.12 1 0.502 - dnaPolPauseFrameshift 0.617 0.004 -0.5 0.13 0.667 0.234 1 0.506 - topoisomeraseCleavageSite -0.183 0.242 -0.286 0.256 0.333 0.353 -1 0.489 - translinTarget 0.0167 0.503 -0.0714 0.469 1 0.136 1 0.485 - -The second output file: - -.. image:: ./static/operation_icons/dwt_cor_aVa_1.png -.. image:: ./static/operation_icons/dwt_cor_aVa_2.png -.. image:: ./static/operation_icons/dwt_cor_aVa_3.png -.. image:: ./static/operation_icons/dwt_cor_aVa_4.png -.. 
image:: ./static/operation_icons/dwt_cor_aVa_5.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVb_all.pl --- a/tools/discreteWavelet/execute_dwt_cor_aVb_all.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; - -$usage = "execute_dwt_cor_aVb_all.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $firstInputFile = $ARGV[0]; -my $secondInputFile = $ARGV[1]; -my $firstOutputFile = $ARGV[2]; -my $secondOutputFile = $ARGV[3]; - -open (INPUT1, "<", $firstInputFile) || die("Could not open file $firstInputFile \n"); -open (INPUT2, "<", $secondInputFile) || die("Could not open file $secondInputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -print "There are two input data files: \n"; -print "The input data file is: $firstInputFile \n"; -print "The control data file is: $secondInputFile \n"; - -# IvC test -$test = "cor_aVb_all"; - -# construct an R script to implement the IvC test -print "\n"; - -$r_script = "get_dwt_cor_aVa_test.r"; -print "$r_script \n"; - - -# R script -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; -print Rcmd " - ################################################################################# - # code to do all correlation tests of form: motif(a) vs. 
motif(b) - # add code to create null bands by permuting the original data series - # generate plots and table matrix of correlation coefficients including p-values - ################################################################################# - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data - mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v <- data; - } - return(v); - } - - dwt_cor <- function(data.short, names.short, data.long, names.long, test, pdf, table, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - print(test); - print(pdf); - print(table); - - pdf(file = pdf); - final_pvalue = NULL; - title = NULL; - - short.levels <- wd(data.short[, 1], filter.number = filter, bc = bc)\$nlevels; - title <- c(\"motif1\", \"motif2\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"cor\", sep = \"_\"), paste(i, \"pval\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data.short <- apply(data.short, 2, norm); - data.long <- apply(data.long, 2, norm); - - # loop to compare a vs b - for(i in 1:length(names.short)){ - for(j in 1:length(names.long)){ - if(i >= j){ - next; - } - else { - # Kendall Tau - # DWT wavelet correlation function - # include significance to compare - wave1.dwt = wave2.dwt = NULL; - tau.dwt = NULL; - out = NULL; - - print(names.short[i]); - print(names.long[j]); - - # need exit if not comparing motif(a) vs motif(a) - if (names.short[i] == names.long[j]){ - stop(paste(\"motif\", names.short[i], \"is the same as\", names.long[j], sep = \" \")); - } - else { - wave1.dwt <- dwt(data.short[, i], wf = wf, short.levels, boundary = boundary); - wave2.dwt <- dwt(data.long[, j], wf = wf, short.levels, boundary = boundary); - tau.dwt <-vector(length = short.levels) - - # perform cor test on wavelet coefficients per scale - for(level in 1:short.levels){ - w1_level = w2_level = NULL; - w1_level <- (wave1.dwt[[level]]); - w2_level <- (wave2.dwt[[level]]); - tau.dwt[level] <- cor.test(w1_level, w2_level, method = method)\$estimate; - } - - # CI bands by permutation of time series - feature1 = feature2 = NULL; - feature1 = data.short[, i]; - feature2 = data.long[, j]; - null = results = med = NULL; - cor_25 = cor_975 = NULL; - - for (k in 1:1000) { - nk_1 = nk_2 = NULL; - null.levels = NULL; - cor = NULL; - null_wave1 = null_wave2 = NULL; - - nk_1 <- sample(feature1, length(feature1), replace = FALSE); - nk_2 <- sample(feature2, length(feature2), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - cor <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - null_wave2 <- dwt(nk_2, wf = wf, short.levels, boundary = boundary); - - for(level in 1:null.levels){ - null_level1 = null_level2 = NULL; - null_level1 <- (null_wave1[[level]]); - null_level2 <- (null_wave2[[level]]); - cor[level] <- cor.test(null_level1, null_level2, method = method)\$estimate; - } - null = rbind(null, cor); - } - - null <- apply(null, 2, sort, na.last = TRUE); - cor_25 <- null[25, ]; - cor_975 <- null[975, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(tau.dwt, cor_25, cor_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), ylim = c(-1, 1), xlab = \"Wavelet Scale\", ylab = \"Wavelet Correlation Kendall's Tau\", main = (paste(test, names.short[i], \"vs.\", names.long[j], sep = \" 
\")), cex.main = 0.75); - abline(h = 0); - - # get pvalues by comparison to null distribution - ### modify pval calculation for error type II of T test #### - out <- c(names.short[i],names.long[j]); - for (m in 1:length(tau.dwt)){ - print(m); - print(tau.dwt[m]); - out <- c(out, format(tau.dwt[m], digits = 3)); - pv = NULL; - if(is.na(tau.dwt[m])){ - pv <- \"NA\"; - } - else{ - if (tau.dwt[m] >= med[m]){ - # R tail test - pv <- (length(which(null[, m] >= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - else{ - if (tau.dwt[m] < med[m]){ - # L tail test - pv <- (length(which(null[, m] <= tau.dwt[m])))/(length(na.exclude(null[, m]))); - } - } - } - out <- c(out, pv); - print(pv); - } - final_pvalue <-rbind(final_pvalue, out); - print(out); - } - } - } - } - colnames(final_pvalue) <- title; - write.table(final_pvalue, file = table, sep = \"\\t\", quote = FALSE, row.names = FALSE) - dev.off(); - }\n"; - -print Rcmd " - # execute - # read in data - - inputData1 = inputData2 = NULL; - inputData.short1 = inputData.short2 = NULL; - inputDataNames.short1 = inputDataNames.short2 = NULL; - - inputData1 <- read.delim(\"$firstInputFile\"); - inputData.short1 <- inputData1[, +c(1:ncol(inputData1))]; - inputDataNames.short1 <- colnames(inputData.short1); - - inputData2 <- read.delim(\"$secondInputFile\"); - inputData.short2 <- inputData2[, +c(1:ncol(inputData2))]; - inputDataNames.short2 <- colnames(inputData.short2); - - # cor test for motif(a) in inputData1 vs motif(b) in inputData2 - dwt_cor(inputData.short1, inputDataNames.short1, inputData.short2, inputDataNames.short2, test = \"$test\", pdf = \"$secondOutputFile\", table = \"$firstOutputFile\"); - print (\"done with the correlation test\"); - - #eof\n"; -close Rcmd; - -system("echo \"wavelet IvC test started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out\n"); -system("echo \"wavelet IvC test ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT2); -close(INPUT1); diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_cor_aVb_all.xml --- a/tools/discreteWavelet/execute_dwt_cor_aVb_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - between two datasets using Discrete Wavelet Transfoms - - - execute_dwt_cor_aVb_all.pl $inputFile1 $inputFile2 $outputFile1 $outputFile2 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes table matrix of coefficient correlations and p-values at multiple scales for the correlation between the occurrences of features in one dataset and their occurrences in another using multiscale wavelet analysis technique. - -The program assumes that the user has two sets of DNA sequences, S1 and S1, each of which consists of one or more sequences of equal length. Each sequence in each set is divided into the same number of multiple intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has two input files obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S1 and S1, and builds two tabular files representing the count results in each interval of S1 and S1. These are the input files of the program. 
- -The program gives two output files: - -- The first output file is a TABULAR format file representing the coefficient correlations and p-values for each feature at each scale. -- The second output file is a PDF file consisting of as many figures as the number of features, such that each figure represents the values of the coefficient correlations for that feature at every scale. - ----- - -.. class:: warningmark - -**Note** - -In order to obtain empirical p-values, a random permutation test is implemented by the program, so the program gives slightly different results each time it is run on the same input file. - ----- - -**Example** - -Counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S1 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 82 162 158 79 459 - 111 196 154 75 459 - 98 178 160 79 475 - 113 201 170 113 436 - 113 173 147 95 446 - 107 150 155 84 436 - 106 166 175 96 448 - 113 176 135 106 514 - 113 170 152 87 450 - 95 152 167 93 467 - 91 171 169 118 426 - 84 139 160 100 459 - 92 154 164 104 440 - 100 145 154 98 472 - 91 161 152 71 461 - 117 164 139 97 463 - -And counting the occurrences of 5 features (motifs) in 16 intervals (one line per interval) of the DNA sequences in S2 gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift topoisomeraseCleavageSite translinTarget - 269 366 330 238 1129 - 239 328 327 283 1188 - 254 351 358 297 1151 - 262 371 355 256 1107 - 254 361 352 234 1192 - 265 354 367 240 1182 - 255 359 333 235 1217 - 271 389 387 272 1241 - 240 305 341 249 1159 - 272 351 337 257 1169 - 275 351 337 233 1158 - 305 331 361 253 1172 - 277 341 343 253 1113 - 266 362 355 267 1162 - 235 326 329 241 1230 - 254 335 360 251 1172 - - -We notice that the number of scales here is 4 because 16 = 2^4. Running the program on the above input files gives the following output: - -The first output file:: - - motif1 motif2 1_cor 1_pval 2_cor 2_pval 3_cor 3_pval 4_cor 4_pval - - deletionHoptspot insertionHoptspot -0.1 0.346 -0.214 0.338 1 0.127 1 0.467 - deletionHoptspot dnaPolPauseFrameshift 0.167 0.267 -0.214 0.334 1 0.122 1 0.511 - deletionHoptspot topoisomeraseCleavageSite 0.167 0.277 0.143 0.412 -0.667 0.243 1 0.521 - deletionHoptspot translinTarget 0 0.505 0.0714 0.441 1 0.124 1 0.518 - insertionHoptspot dnaPolPauseFrameshift -0.202 0.238 0.143 0.379 -1 0.122 1 0.517 - insertionHoptspot topoisomeraseCleavageSite -0.0336 0.457 0.214 0.29 0.667 0.252 1 0.503 - insertionHoptspot translinTarget 0.0672 0.389 0.429 0.186 -1 0.119 1 0.506 - dnaPolPauseFrameshift topoisomeraseCleavageSite -0.353 0.101 0.357 0.228 0 0.612 -1 0.49 - dnaPolPauseFrameshift translinTarget -0.151 0.303 -0.571 0.09 -0.333 0.37 -1 1 - topoisomeraseCleavageSite translinTarget -0.37 0.077 -0.222 0.297 0.667 0.234 -1 0.471 - -The second output file: - -.. image:: ./static/operation_icons/dwt_cor_aVb_all_1.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_2.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_3.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_4.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_5.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_6.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_7.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_8.png -.. image:: ./static/operation_icons/dwt_cor_aVb_all_9.png -.. 
image:: ./static/operation_icons/dwt_cor_aVb_all_10.png - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perClass.pl --- a/tools/discreteWavelet/execute_dwt_var_perClass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -#!/usr/bin/perl -w - -use warnings; -use IO::Handle; -use POSIX qw(floor ceil); - -# example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del - -$usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $inputFile = $ARGV[0]; -my $firstOutputFile = $ARGV[1]; -my $secondOutputFile = $ARGV[2]; -my $thirdOutputFile = $ARGV[3]; - -open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); -open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); -open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); -open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \n"); -open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -# choosing meaningful names for the output files -$max_dwt = $firstOutputFile; -$pvalue = $secondOutputFile; -$pdf = $thirdOutputFile; - -# count the number of columns in the input file -while($buffer = <INPUT>){ - #if ($buffer =~ m/interval/){ - chomp($buffer); - $buffer =~ s/^#\s*//; - @contrl = split(/\t/, $buffer); - last; - #} -} -print "The number of columns in the input file is: " . (@contrl) . "\n"; -print "\n"; - -# count the number of motifs in the input file -$count = 0; -for ($i = 0; $i < @contrl; $i++){ - $count++; - print "# $contrl[$i]\n"; -} -print "The number of motifs in the input file is: $count \n"; - -# check if the number of motifs is not a multiple of 12, and round up if so -$count2 = ($count/12); -if ($count2 =~ m/(\D)/){ - print "the number of motifs is not a multiple of 12 \n"; - $count2 = ceil($count2); -} -else { - print "the number of motifs is a multiple of 12 \n"; } -print "There will be $count2 subfiles\n\n"; - -# split infile into subfiles only 12 motifs per file for R plotting -for ($x = 1; $x <= $count2; $x++){ - $a = (($x - 1) * 12 + 1); - $b = $x * 12; - - if ($x < $count2){ - print "# data.short $x <- data_test[, +c($a:$b)]; \n"; - } - else{ - print "# data.short $x <- data_test[, +c($a:ncol(data_test))]; \n"; - } -} - -print "\n"; -print "There are 4 output files: \n"; -print "The first output file is a pdf file\n"; -print "The second output file is a max_dwt file\n"; -print "The third output file is a pvalues file\n"; -print "The fourth output file is a test_final_pvalues file\n"; - -# write R script -$r_script = "get_dwt_varPermut_getMax.r"; -print "The R file name is: $r_script \n"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; - -print Rcmd " - ###################################################################### - # plot power spectra, i.e. 
wavelet variance by class - # add code to create null bands by permuting the original data series - # get class of maximum significant variance per feature - # generate plots and table matrix of variance including p-values - ###################################################################### - library(\"Rwave\"); - library(\"wavethresh\"); - library(\"waveslim\"); - - options(echo = FALSE) - - # normalize data - norm <- function(data){ - v <- (data-mean(data))/sd(data); - if(sum(is.na(v)) >= 1){ - v<-data; - } - return(v); - } - - dwt_var_permut_getMax <- function(data, names, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - max_var = NULL; - matrix = NULL; - title = NULL; - final_pvalue = NULL; - short.levels = NULL; - scale = NULL; - - print(names); - - par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \"s\", cex = 1, las = 1); - - short.levels <- wd(data[, 1], filter.number = filter, bc = bc)\$nlevels; - - title <- c(\"motif\"); - for (i in 1:short.levels){ - title <- c(title, paste(i, \"var\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); - } - print(title); - - # normalize the raw data - data<-apply(data,2,norm); - - for(i in 1:length(names)){ - for(j in 1:length(names)){ - temp = NULL; - results = NULL; - wave1.dwt = NULL; - out = NULL; - - out <- vector(length = length(title)); - temp <- vector(length = short.levels); - - if(i < j) { - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - if (i > j){ - plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); - box(col = \"grey\"); - grid(ny = 0, nx = NULL); - } else { - - wave1.dwt <- dwt(data[, i], wf = wf, short.levels, boundary = boundary); - - temp_row = (short.levels + 1 ) * -1; - temp_col = 1; - temp <- wave.variance(wave1.dwt)[temp_row, temp_col]; - - #permutations code : - feature1 = NULL; - null = NULL; - var_25 = NULL; - var_975 = NULL; - med = NULL; - - feature1 = data[, i]; - for (k in 1:1000) { - nk_1 = NULL; - null.levels = NULL; - var = NULL; - null_wave1 = NULL; - - nk_1 = sample(feature1, length(feature1), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; - var <- vector(length = null.levels); - null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); - var <- wave.variance(null_wave1)[-(null.levels + 1), 1]; - null= rbind(null, var); - } - null <- apply(null, 2, sort, na.last = TRUE); - var_25 <- null[25, ]; - var_975 <- null[975, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(temp, var_25, var_975); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), axes = F); - - # get pvalues by comparison to null distribution - out <- (names[i]); - for (m in 1:length(temp)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"var\", temp[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - pv = tail = NULL; - out <- c(out, format(temp[m], digits = 3)); - if (temp[m] >= med[m]){ - # R tail test - print(\"R\"); - tail <- \"R\"; - pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); - - } else { - if (temp[m] < med[m]){ - # L tail test - print(\"L\"); - tail <- \"L\"; - pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); - } - } - out <- c(out, pv); - print(pv); - out <- c(out, tail); - } - final_pvalue <-rbind(final_pvalue, out); - - - # get variances outside 
null bands by comparing temp to null - ## temp stores variance for each scale, and null stores permuted variances for null bands - for (n in 1:length(temp)){ - if (temp[n] <= var_975[n]){ - temp[n] <- NA; - } else { - temp[n] <- temp[n]; - } - } - matrix <- rbind(matrix, temp) - } - } - # labels - if (i == 1){ - mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); - } - if (j == 1){ - mtext(names[i], side = 3, line = 0.5, cex = 0.25); - } - if (j == length(names)){ - axis(1, at = (1:short.levels), las = 3, cex.axis = 0.5); - } - } - } - colnames(final_pvalue) <- title; - #write.table(final_pvalue, file = \"test_final_pvalue.txt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - - # get maximum variance larger than expectation by comparison to null bands - varnames <- vector(); - for(i in 1:length(names)){ - name1 = paste(names[i], \"var\", sep = \"_\") - varnames <- c(varnames, name1) - } - rownames(matrix) <- varnames; - colnames(matrix) <- (1:short.levels); - max_var <- names; - scale <- vector(length = length(names)); - for (x in 1:nrow(matrix)){ - if (length(which.max(matrix[x, ])) == 0){ - scale[x] <- NA; - } - else{ - scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; - } - } - max_var <- cbind(max_var, scale); - write.table(max_var, file = \"$max_dwt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); - return(final_pvalue); - }\n"; - -print Rcmd " - # execute - # read in data - - data_test = NULL; - data_test <- read.delim(\"$inputFile\"); - - pdf(file = \"$pdf\", width = 11, height = 8); - - # loop to read and execute on all $count2 subfiles - final = NULL; - for (x in 1:$count2){ - sub = NULL; - sub_names = NULL; - a = NULL; - b = NULL; - - a = ((x - 1) * 12 + 1); - b = x * 12; - - if (x < $count2){ - sub <- data_test[, +c(a:b)]; - sub_names <- colnames(data_test)[a:b]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - } - else{ - sub <- data_test[, +c(a:ncol(data_test))]; - sub_names <- colnames(data_test)[a:ncol(data_test)]; - final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); - - } - } - - dev.off(); - - write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); - - #eof\n"; - -close Rcmd; - -system("echo \"wavelet ANOVA started on \`hostname\` at \`date\`\"\n"); -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); -system("echo \"wavelet ANOVA ended on \`hostname\` at \`date\`\"\n"); - -#close the input and output and error files -close(ERROR); -close(OUTPUT3); -close(OUTPUT2); -close(OUTPUT1); -close(INPUT); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perClass.xml --- a/tools/discreteWavelet/execute_dwt_var_perClass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ - - in one dataset using Discrete Wavelet Transforms - - - execute_dwt_var_perClass.pl $inputFile $outputFile1 $outputFile2 $outputFile3 - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program generates plots and computes a table matrix of maximum variances, p-values, and test orientations at multiple scales for the occurrences of a class of features in one dataset of DNA sequences, using a multiscale wavelet analysis technique. - -The program assumes that the user has one set of DNA sequences, S, which consists of one or more sequences of equal length. 
Each sequence in S is divided into the same number of intervals n such that n = 2^k, where k is a positive integer and k >= 1. Thus, n could be any value of the set {2, 4, 8, 16, 32, 64, 128, ...}. k represents the number of scales. - -The program has one input file obtained as follows: - -For a given set of features, say motifs, the user counts the number of occurrences of each feature in each interval of each sequence in S, and builds a tabular file representing the count results in each interval of S. This is the input file of the program. - -The program gives three output files: - -- The first output file is a TABULAR format file giving the scale at which each feature has its maximum variance. - -- The second output file is a TABULAR format file representing the variances, p-values, and test orientation for the occurrences of features at each scale, based on a random permutation test and using a multiscale wavelet analysis technique. - -- The third output file is a PDF file plotting the wavelet variances of each feature at each scale. - ----- - -.. class:: warningmark - -**Note** - -- If the number of features is greater than 12, the program will divide each output file into subfiles, such that each subfile represents the results of a group of 12 features, except the last subfile, which represents the results of the rest. For example, if the number of features is 17, the p-values file will consist of two subfiles, the first for the features 1-12 and the second for the features 13-17. As for the PDF file, it will consist of two pages in this case. - -- In order to obtain empirical p-values, a random permutation test is implemented by the program, so the program gives slightly different results each time it is run on the same input file. - ----- - - -**Example** - -Counting the occurrences of 8 features (motifs) in 16 intervals (one line per interval) of a set of DNA sequences in S gives the following tabular file:: - - deletionHoptspot insertionHoptspot dnaPolPauseFrameshift indelHotspot topoisomeraseCleavageSite translinTarget vDjRecombinationSignal x-likeSite - 226 403 416 221 1165 832 749 1056 - 236 444 380 241 1223 746 782 1207 - 242 496 391 195 1116 643 770 1219 - 243 429 364 191 1118 694 783 1223 - 244 410 371 236 1063 692 805 1233 - 230 386 370 217 1087 657 787 1215 - 275 404 402 214 1044 697 831 1188 - 265 443 365 231 1086 694 782 1184 - 255 390 354 246 1114 642 773 1176 - 281 384 406 232 1102 719 787 1191 - 263 459 369 251 1135 643 810 1215 - 280 433 400 251 1159 701 777 1151 - 278 385 382 231 1147 697 707 1161 - 248 393 389 211 1162 723 759 1183 - 251 403 385 246 1114 752 776 1153 - 239 383 347 227 1172 759 789 1141 - -We notice that the number of scales here is 4 because 16 = 2^4. 
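To make the permutation test in the Note above concrete, its core can be sketched in R. This is a minimal sketch, assuming the waveslim package is installed and that a data frame named counts (a hypothetical name) holds the example table above; taking the 2.5%/97.5% quantiles stands in for the script's sorting of 1,000 permuted values and reading off rows 25 and 975::

    library(waveslim)

    x <- counts$topoisomeraseCleavageSite  # one feature, 16 = 2^4 intervals
    J <- log2(length(x))
    w <- dwt(x, wf = "haar", n.levels = J, boundary = "reflection")
    obs <- wave.variance(w)[1:J, 1]        # observed wavelet variance per scale

    # null bands: wavelet variance of 1,000 random permutations of the series
    null <- t(replicate(1000, {
        p <- dwt(sample(x), wf = "haar", n.levels = J, boundary = "reflection")
        wave.variance(p)[1:J, 1]
    }))
    lower <- apply(null, 2, quantile, probs = 0.025)
    upper <- apply(null, 2, quantile, probs = 0.975)

    # a scale is called significant when obs falls outside [lower, upper];
    # the max_dwt output reports, per feature, the scale whose variance
    # exceeds the upper band by the largest amount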
Running the program on the above input file gives the following 3 output files: - -The first output file:: - - motifs max_var at scale - deletionHoptspot NA - insertionHoptspot NA - dnaPolPauseFrameshift NA - indelHotspot NA - topoisomeraseCleavageSite 3 - translinTarget NA - vDjRecombinationSignal NA - x.likeSite NA - -The second output file:: - - motif 1_var 1_pval 1_test 2_var 2_pval 2_test 3_var 3_pval 3_test 4_var 4_pval 4_test - - deletionHoptspot 0.457 0.048 L 1.18 0.334 R 1.61 0.194 R 3.41 0.055 R - insertionHoptspot 0.556 0.109 L 1.34 0.272 R 1.59 0.223 R 2.02 0.157 R - dnaPolPauseFrameshift 1.42 0.089 R 0.66 0.331 L 0.421 0.305 L 0.121 0.268 L - indelHotspot 0.373 0.021 L 1.36 0.254 R 1.24 0.301 R 4.09 0.047 R - topoisomeraseCleavageSite 0.305 0.002 L 0.936 0.489 R 3.78 0.01 R 1.25 0.272 R - translinTarget 0.525 0.061 L 1.69 0.11 R 2.02 0.131 R 0.00891 0.069 L - vDjRecombinationSignal 0.68 0.138 L 0.957 0.46 R 2.35 0.071 R 1.03 0.357 R - x.likeSite 0.928 0.402 L 1.33 0.261 R 0.735 0.431 L 0.783 0.422 R - -The third output file: - -.. image:: ./static/operation_icons/dwt_var_perClass.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perFeature.pl --- a/tools/discreteWavelet/execute_dwt_var_perFeature.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ -#!/usr/bin/perl -w -# Author: Erika Kvikstad - -use warnings; -use IO::Handle; -use POSIX qw(floor ceil); - -$usage = "execute_dwt_var_perFeature.pl [TABULAR.in] [FEATURE] [ALPHA] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 5; - -#get the input arguments -my $inputFile = $ARGV[0]; -my @features = split(/,/,$ARGV[1]); -my $features_count = scalar(@features); -my $alpha = $ARGV[2]; -my $outFile1 = $ARGV[3]; -my $outFile2 = $ARGV[4]; - -open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); -open (OUTPUT2, ">", $outFile1) || die("Could not open file $outFile1 \n"); -open (OUTPUT3, ">", $outFile2) || die("Could not open file $outFile2 \n"); -#open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -# choosing meaningful names for the output files -$pvalue = $outFile1; -$pdf = $outFile2; - -# write R script -$r_script = "get_dwt_varPermut.r"; - -open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; - -print Rcmd " - ###################################################################### - # plot multiscale wavelet variance - # create null bands by permuting the original data series - # generate plots and table of wavelet variance including p-values - ###################################################################### - options(echo = FALSE) - #library(\"Rwave\"); - #library(\"wavethresh\"); - #library(\"waveslim\"); - # turn off diagnostics for debugging only; turn back on for functional tests - require(\"Rwave\",quietly=TRUE,warn.conflicts = FALSE); - require(\"wavethresh\",quietly=TRUE,warn.conflicts = FALSE); - require(\"waveslim\",quietly=TRUE,warn.conflicts = FALSE); - require(\"bitops\",quietly=TRUE,warn.conflicts = FALSE); - - # to determine if data is properly formatted 2^N observations - is.power2<- function(x){x && !(bitAnd(x,x - 1));} - - # dwt : discrete wavelet transform using the Haar wavelet filter, the simplest wavelet function; can later be modified to let the user define the wavelet filter function - dwt_var_permut_getMax <- function(data, names, alpha, filter = 1,family=\"DaubExPhase\", bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { - 
max_var = NULL; - matrix = NULL; - title = NULL; - final_pvalue = NULL; - J = NULL; - scale = NULL; - out = NULL; - - print(class(data)); - print(names); - print(alpha); - - par(mar=c(5,4,4,3),oma = c(4, 4, 3, 2), xaxt = \"s\", cex = 1, las = 1); - - title<-c(\"Wavelet\",\"Variance\",\"Pvalue\",\"Test\"); - print(title); - - for(i in 1:length(names)){ - temp = NULL; - results = NULL; - wave1.dwt = NULL; - - # if data fails formatting check, do something - - print(is.numeric(as.matrix(data)[, i])); - if(!is.numeric(as.matrix(data)[, i])) - stop(\"data must be a numeric vector\"); - - print(length(as.matrix(data)[, i])); - print(is.power2(length(as.matrix(data)[, i]))); - if(!is.power2(length(as.matrix(data)[, i]))) - stop(\"data length must be a power of two\"); - - - J <- wd(as.matrix(data)[, i], filter.number = filter, family=family, bc = bc)\$nlevels; - print(J); - temp <- vector(length = J); - wave1.dwt <- dwt(as.matrix(data)[, i], wf = wf, J, boundary = boundary); - #print(wave1.dwt); - - temp <- wave.variance(wave1.dwt)[-(J+1), 1]; - print(temp); - - #permutations code : - feature1 = NULL; - null = NULL; - var_lower=limit_lower=NULL; - var_upper=limit_upper=NULL; - med = NULL; - - limit_lower = alpha/2*1000; - print(limit_lower); - limit_upper = (1-alpha/2)*1000; - print(limit_upper); - - feature1 = as.matrix(data)[,i]; - for (k in 1:1000) { - nk_1 = NULL; - null.levels = NULL; - var = NULL; - null_wave1 = NULL; - - nk_1 = sample(feature1, length(feature1), replace = FALSE); - null.levels <- wd(nk_1, filter.number = filter,family=family ,bc = bc)\$nlevels; - var <- vector(length = length(null.levels)); - null_wave1 <- dwt(nk_1, wf = wf, J, boundary = boundary); - var<- wave.variance(null_wave1)[-(null.levels+1), 1]; - null= rbind(null, var); - } - null <- apply(null, 2, sort, na.last = TRUE); - var_lower <- null[limit_lower, ]; - var_upper <- null[limit_upper, ]; - med <- (apply(null, 2, median, na.rm = TRUE)); - - # plot - results <- cbind(temp, var_lower, var_upper); - print(results); - matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2),xaxt='n',xlab=\"Wavelet Scale\",ylab=\"Wavelet variance\" ); - mtext(names[i], side = 3, line = 0.5, cex = 1); - axis(1, at = 1:J , labels=c(2^(0:(J-1))), las = 3, cex.axis = 1); - - # get pvalues by comparison to null distribution - #out <- (names[i]); - for (m in 1:length(temp)){ - print(paste(\"scale\", m, sep = \" \")); - print(paste(\"var\", temp[m], sep = \" \")); - print(paste(\"med\", med[m], sep = \" \")); - pv = tail =scale = NULL; - scale=2^(m-1); - #out <- c(out, format(temp[m], digits = 3)); - if (temp[m] >= med[m]){ - # R tail test - print(\"R\"); - tail <- \"R\"; - pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); - - } else { - if (temp[m] < med[m]){ - # L tail test - print(\"L\"); - tail <- \"L\"; - pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); - } - } - print(pv); - out<-rbind(out,c(paste(\"Scale\", scale, sep=\"_\"),format(temp[m], digits = 3),pv,tail)); - } - final_pvalue <-rbind(final_pvalue, out); - } - colnames(final_pvalue) <- title; - return(final_pvalue); -}\n"; - -print Rcmd " -# execute -# read in data -data_test = final = NULL; -sub = sub_names = NULL; -data_test <- read.delim(\"$inputFile\",header=FALSE); -pdf(file = \"$pdf\", width = 11, height = 8)\n"; - -for ($x=0;$x<$features_count;$x++){ - $feature=$features[$x]; -print Rcmd " - if ($feature > ncol(data_test)) - stop(\"column $feature doesn't exist\"); - sub<-data_test[,$feature]; - 
#sub_names <- colnames(data_test); - sub_names<-colnames(data_test)[$feature]; - final <- rbind(final,dwt_var_permut_getMax(sub, sub_names,$alpha));\n"; -} - -print Rcmd " - - dev.off(); - write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); - -#eof\n"; - -close Rcmd; -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); - -#close the input and output and error files -close(OUTPUT3); -close(OUTPUT2); -close(INPUT); diff -r c2a356708570 -r 33c067c3ae34 tools/discreteWavelet/execute_dwt_var_perFeature.xml --- a/tools/discreteWavelet/execute_dwt_var_perFeature.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - using Discrete Wavelet Transforms - - - execute_dwt_var_perFeature.pl $inputFile $feature $alpha $outputFile1 $outputFile2 - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This tool computes the scale-specific variance in wavelet coefficients obtained from the discrete wavelet transform of a feature of interest. - -Input data consists of an ordered series of data, S, equispaced and of sample size N, where N is of the form N = 2^k, and k is a positive integer and represents the number of levels of wavelet decomposition. S could be a time series, or a set of DNA sequences. The user calculates a statistic of interest for each feature in each interval of S: say, expression level of a particular gene in a time course, or the number of LINE elements per window across a chromosome. This tool then performs a discrete wavelet transform of the feature of interest, and plots the resulting variance in wavelet coefficients per wavelet scale. In addition, statistical significance of variances is determined by 1,000 random permutations of the intervals in S, to generate null bands (representing the user-provided alpha value) corresponding to the empirical distribution of wavelet variances under the null hypothesis of no inherent order to the series in S. - -This tool generates two output files: - -- The first output file is a TABULAR format file representing the variances, p-values, and test orientation for the features at each wavelet scale based on a random permutation test. - -- The second output file is a PDF image plotting the wavelet variances of each feature at each scale. - ----- - -.. class:: warningmark - -**Note** -In order to obtain empirical p-values, a random permutation scheme is implemented by the tool, so the output may show slight variations in results each time it is run on the same input file. - ----- - - -**Example** - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_antigenic.xml --- a/tools/emboss_5/emboss_antigenic.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Predicts potentially antigenic regions of a protein sequence, using the method of Kolaskar and Tongaonkar. - emboss - antigenic -sequence $input1 -outfile $out_file1 -minlen $minlen -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/antigenic.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_backtranseq.xml --- a/tools/emboss_5/emboss_backtranseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,220 +0,0 @@ - - Back translate a protein sequence - emboss - backtranseq -sequence $input1 -outfile $out_file1 -cfile $cfile -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/backtranseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_banana.pl --- a/tools/emboss_5/emboss_banana.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -#! /usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -#my $cmd_string = "/home/djb396/temp/emboss/bin/banana -sequence /home/djb396/universe-prototype/test.fasta -outfile result.txt -graph png -goutfile results -auto"; -my $results = `$cmd_string`; -my @files = split("\n",$results); -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /i) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - print "outfile: $thisLine\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_banana.xml --- a/tools/emboss_5/emboss_banana.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Bending and curvature plot in B-DNA - emboss - banana -sequence $input1 -outfile $out_file1 -graph none -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/banana.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_biosed.xml --- a/tools/emboss_5/emboss_biosed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Replace or delete sequence sections - emboss - biosed -sequence $input1 -outseq $out_file1 -target $target -replace $replace -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/biosed.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_btwisted.xml --- a/tools/emboss_5/emboss_btwisted.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Calculates the twisting in a B-DNA sequence - emboss - btwisted -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/btwisted.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cai.xml --- a/tools/emboss_5/emboss_cai.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,185 +0,0 @@ - - CAI codon adaptation index - emboss - cai -seqall $input1 -outfile $out_file1 -cfile $cfile -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cai_custom.xml --- a/tools/emboss_5/emboss_cai_custom.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ - - CAI codon adaptation index using custom codon usage file - emboss - cai -seqall $input1 -outfile $out_file1 -cfile $input2 -auto - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cai_custom.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_chaos.xml --- a/tools/emboss_5/emboss_chaos.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - Create a chaos game representation plot for a sequence - emboss - emboss_single_outputfile_wrapper.pl chaos -sequence $input1 -graph png -goutfile $out_file1 -auto - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chaos.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_charge.xml --- a/tools/emboss_5/emboss_charge.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Protein charge plot - emboss - charge -seqall $input1 -outfile $out_file1 -window $window -auto - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/charge.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_checktrans.xml --- a/tools/emboss_5/emboss_checktrans.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - Reports STOP codons and ORF statistics of a protein - emboss - checktrans -sequence $input1 -outfile $out_file1 -outseq $out_file2 -osformat3 $out_format2 -outfeat $out_file3 -offormat4 $out_format3 -orfml $orfml -addlast $addlast -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/checktrans.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_chips.xml --- a/tools/emboss_5/emboss_chips.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Codon usage statistics - emboss - chips -seqall $input1 -outfile $out_file1 -sum $sum -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/chips.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cirdna.xml --- a/tools/emboss_5/emboss_cirdna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ - - Draws circular maps of DNA constructs - emboss - emboss_single_outputfile_wrapper.pl cirdna -infile $input1 -graphout png -goutfile $out_file1 -auto - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cirdna.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_codcmp.xml --- a/tools/emboss_5/emboss_codcmp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,330 +0,0 @@ - - Codon usage table comparison - emboss - codcmp -first $cfile1 -second $cfile2 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/codcmp.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_coderet.xml --- a/tools/emboss_5/emboss_coderet.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Extract CDS, mRNA and translations from feature tables - emboss - - coderet -seqall $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/coderet.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_compseq.xml --- a/tools/emboss_5/emboss_compseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - Count composition of dimer/trimer/etc words in a sequence - emboss - compseq -sequence $input1 -outfile $out_file1 -word $word -frame $frame -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/compseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgplot.xml --- a/tools/emboss_5/emboss_cpgplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - Plot CpG rich areas - emboss - emboss_cpgplot_wrapper.pl cpgplot -sequence $input1 -window $window -minlen $minlen -minpc $minpc -outfile $outfile -graph png -goutfile $goutfile -outfeat $outfeat -minoe $minoe -auto - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgplot.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgplot_wrapper.pl --- a/tools/emboss_5/emboss_cpgplot_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -#! /usr/bin/perl -w -use strict; -use File::Copy; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -my $fileNameOut = $ARGV[14]; -move($fileNameOut.".1.png",$fileNameOut); diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cpgreport.xml --- a/tools/emboss_5/emboss_cpgreport.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Reports all CpG rich regions - emboss - cpgreport -sequence $input1 -outfile $out_file1 -outfeat $out_file2 -offormat3 $out_format2 -score $score -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cpgreport.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cusp.xml --- a/tools/emboss_5/emboss_cusp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Create a codon usage table - emboss - cusp -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cusp.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_cutseq.xml --- a/tools/emboss_5/emboss_cutseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Removes a specified section from a sequence - emboss - cutseq -sequence $input1 -outseq $out_file1 -from $from -to $to -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/cutseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dan.xml --- a/tools/emboss_5/emboss_dan.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - Calculates DNA RNA/DNA melting temperature - emboss - emboss_single_outputfile_wrapper.pl dan -sequence $input1 -windowsize $window -goutfile $out_file1 -graph png -plot $plot1 -shiftincrement $shift -dnaconc $dnaconc - -saltconc $saltconc -product $product -formamide $formamide -mismatch $mismatch -prodlen $prodlen -thermo $thermo -temperature $temperature -rna $rna -outfile $out_file1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dan.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_degapseq.xml --- a/tools/emboss_5/emboss_degapseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Removes gap characters from sequences - emboss - degapseq -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/degapseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_descseq.xml --- a/tools/emboss_5/emboss_descseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Alter the name or description of a sequence - emboss - descseq -sequence $input1 -outseq $out_file1 -name "$seqname" -description "$desc" -append $append -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/descseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_diffseq.xml --- a/tools/emboss_5/emboss_diffseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ - - Find differences between nearly identical sequences - emboss - diffseq -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize $wordsize -globaldifferences $globaldifferences -rformat3 - $out_format1 -offormat4 $out_format2 -offormat5 $out_format3 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/diffseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_digest.xml --- a/tools/emboss_5/emboss_digest.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Protein proteolytic enzyme or reagent cleavage digest - emboss - digest -seqall $input1 -outfile $out_file1 -menu $menu -unfavoured $unfavoured -overlap $overlap -allpartials $allpartials -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/digest.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dotmatcher.xml --- a/tools/emboss_5/emboss_dotmatcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - Displays a thresholded dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dotmatcher -asequence $input1 -bsequence $input2 -goutfile $out_file1 -windowsize $windowsize -threshold $threshold -graph png -xygraph png - -auto - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotmatcher.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dotpath.xml --- a/tools/emboss_5/emboss_dotpath.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - Non-overlapping wordmatch dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dotpath -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -overlaps $overlaps -boxit $boxit -graph png - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dotpath.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dottup.xml --- a/tools/emboss_5/emboss_dottup.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Displays a wordmatch dotplot of two sequences - emboss - emboss_single_outputfile_wrapper.pl dottup -asequence $input1 -bsequence $input2 -goutfile $out_file1 -wordsize $wordsize -boxit $boxit -graph png -xygraph png -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dottup.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_dreg.xml --- a/tools/emboss_5/emboss_dreg.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - Regular expression search of a nucleotide sequence - emboss - dreg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -raccshow3 "no" -rusashow3 "no" -rdesshow3 "no" -auto - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/dreg.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_einverted.xml --- a/tools/emboss_5/emboss_einverted.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - Finds DNA inverted repeats - emboss - einverted -sequence $input1 -outfile $out_file1 -gap $gap -threshold $threshold -match $match -mismatch $mismatch -maxrepeat $maxrepeat -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/einverted.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_epestfind.xml --- a/tools/emboss_5/emboss_epestfind.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Finds PEST motifs as potential proteolytic cleavage sites - emboss - emboss_single_outputfile_wrapper.pl epestfind -sequence $input1 -goutfile $ofile2 -outfile $ofile1 -window $window -order $order -potential $potential -poor $poor - -invalid $invalid -map $map -graph png -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/epestfind.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_equicktandem.xml --- a/tools/emboss_5/emboss_equicktandem.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - Finds tandem repeats - emboss - equicktandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -maxrepeat $maxrepeat -threshold $threshold -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/equicktandem.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_est2genome.xml --- a/tools/emboss_5/emboss_est2genome.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ - - Align EST and genomic DNA sequences - emboss - est2genome -estsequence $input1 -genomesequence $input2 -outfile $out_file1 -match $match -mismatch $mismatch -gappenalty $gappenalty -intronpenalty $intronpenalty -splicepenalty - $splicepenalty -minscore $minscore -reverse $reverse -splice $splice -mode $mode -best $best -shuffle $shuffle -seed $seed -align $align -width $width -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/est2genome.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_etandem.xml --- a/tools/emboss_5/emboss_etandem.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - Looks for tandem repeats in a nucleotide sequence - emboss - etandem -sequence $input1 -outfile $out_file1 -origfile $ofile2 -minrepeat $minrepeat -maxrepeat $maxrepeat -threshold $threshold -mismatch $mismatch -uniform $uniform -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/etandem.html - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_extractfeat.xml --- a/tools/emboss_5/emboss_extractfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ - - - Extract features from a sequence - emboss - extractfeat -sequence $input1 -outseq $out_file1 -before $before -after $after -source "$source" -type "$type" -sense $sense -minscore $minscore -maxscore $maxscore -tag "$tag" -value - "$value" -join $join -featinname $featinname -describe "$describe" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_extractseq.xml --- a/tools/emboss_5/emboss_extractseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - Extract regions from a sequence - emboss - extractseq -sequence $input1 -outseq $out_file1 -regions $regions -separate $separate -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/extractseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_format_corrector.py --- a/tools/emboss_5/emboss_format_corrector.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#EMBOSS format corrector - -import operator -#from galaxy import datatypes - -#Properly set file formats after job run -def exec_after_process( app, inp_data, out_data, param_dict,tool, stdout, stderr): -#Properly set file formats before job run -#def exec_before_job(trans, inp_data, out_data, param_dict,tool): - #why isn't items an ordered list? - items = out_data.items() - #lets sort it ourselves.... - items = sorted(items, key=operator.itemgetter(0)) - #items is now sorted... 
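# note (added for orientation): the three passes below all walk the sorted
# outputs in order; the first maps each out_format<N> parameter onto a Galaxy
# datatype (ncbi -> fasta, excel -> tabular, text -> txt), the second switches
# a dataset to 'html' when html_out<N> is "yes", and the third switches a
# dataset to 'png' when plot<N> is "yes"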
- - #normal filetype correction - data_count=1 - for name, data in items: - outputType = param_dict.get( 'out_format'+str(data_count), None ) - #print "data_count",data_count, "name", name, "outputType", outputType - if outputType !=None: - if outputType == 'ncbi': - outputType = "fasta" - elif outputType == 'excel': - outputType = "tabular" - elif outputType == 'text': - outputType = "txt" - data = app.datatypes_registry.change_datatype(data, outputType) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 - - #html filetype correction - data_count=1 - for name, data in items: - wants_plot = param_dict.get( 'html_out'+str(data_count), None ) - ext = "html" - if wants_plot == "yes": - data = app.datatypes_registry.change_datatype(data, ext) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 - - #png file correction - data_count=1 - for name, data in items: - wants_plot = param_dict.get( 'plot'+str(data_count), None ) - ext = "png" - if wants_plot == "yes": - data = app.datatypes_registry.change_datatype(data, ext) - app.model.context.add( data ) - app.model.context.flush() - data_count+=1 diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_freak.xml --- a/tools/emboss_5/emboss_freak.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - Residue/base frequency table or plot - emboss - freak -seqall $input1 -outfile $out_file1 -window $window -letters $letters -graph png -step $step -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/freak.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzznuc.xml --- a/tools/emboss_5/emboss_fuzznuc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - Nucleic acid pattern search - emboss - fuzznuc -sequence $input1 -outfile $out_file1 -pattern '$pattern' -pmismatch $mismatch -complement $complement -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzznuc.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzzpro.xml --- a/tools/emboss_5/emboss_fuzzpro.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Protein pattern search - emboss - fuzzpro -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzzpro.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_fuzztran.xml --- a/tools/emboss_5/emboss_fuzztran.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - Protein pattern search after translation - emboss - fuzztran -sequence $input1 -outfile $out_file1 -pattern "$pattern" -pmismatch $mismatch -frame $frame -table $table -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/fuzztran.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_garnier.xml --- a/tools/emboss_5/emboss_garnier.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Predicts protein secondary structure - emboss - garnier -sequence $input1 -outfile $out_file1 -idc $idc -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/4.0/emboss/apps/garnier.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_geecee.xml --- a/tools/emboss_5/emboss_geecee.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ - - Calculates fractional GC content of nucleic acid sequences - emboss - geecee -sequence $input1 -outfile $out_file1 -auto - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/geecee.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_getorf.xml --- a/tools/emboss_5/emboss_getorf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,129 +0,0 @@ - - Finds and extracts open reading frames (ORFs) - emboss - getorf -sequence $input1 -outseq $out_file1 -table $table -minsize $minsize -maxsize $maxsize -find $find -methionine $methionine -circular $circular -reverse $reverse -flanking $flanking - -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/getorf.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_helixturnhelix.xml --- a/tools/emboss_5/emboss_helixturnhelix.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Report nucleic acid binding motifs - emboss - helixturnhelix -sequence $input1 -outfile $out_file1 -mean $mean -sd $sd -minsd $minsd -eightyseven $eightyseven -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/helixturnhelix.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_hmoment.xml --- a/tools/emboss_5/emboss_hmoment.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - Hydrophobic moment calculation - emboss - hmoment -seqall $input1 -outfile $out_file1 -window $window -aangle $aangle -graph png -auto - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/hmoment.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_iep.xml --- a/tools/emboss_5/emboss_iep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - Calculates the isoelectric point of a protein - emboss - iep -sequence $input1 -outfile $out_file1 -step $step -amino $amino -graph png -termini $termini -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/iep.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_infoseq.xml --- a/tools/emboss_5/emboss_infoseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ - - - Displays some simple information about sequences - emboss - infoseq -sequence $input1 -outfile $out_file1 -html $html_out1 -heading $heading -usa $usa -name $disname -accession $accession -gi $gi -version $version -type $type -length $length -pgc - $pgc -description $description -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/infoseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_infoseq_wrapper.pl --- a/tools/emboss_5/emboss_infoseq_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -#! /usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -if ($ARGV[6]=~/yes/) -{ - print "Extension: html\n"; -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_isochore.xml --- a/tools/emboss_5/emboss_isochore.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - Plots isochores in large DNA sequences - emboss - emboss_single_outputfile_wrapper.pl isochore -sequence $input1 -outfile $ofile2 -goutfile $ofile1 -graph png -window $window -shift $shift -auto - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - -**Syntax** - -This application plots GC content over a sequence. It is intended for large sequences such as complete chromosomes or large genomic contigs, although interesting results can also be obtained from shorter sequences. You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/isochore.html - -- Both **Window size** and **Shift increment** are intergers. - ------ - -**Example** - -- Input sequences:: - - >hg18_dna range=chrX:151073054-151073376 5'pad=0 3'pad=0 revComp=FALSE strand=? 
repeatMasking=none - TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA - GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATT - TGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCT - GCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGA - CAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACT - CAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCA - GCGGTTTCCAAGAGGATACAGTA - -- Output data file:: - - Position Percent G+C 1 .. 323 - 80 0.422 - 112 0.460 - 144 0.509 - 176 0.534 - 208 0.553 - 240 0.553 - -- Output graphics file: - -.. image:: ./static/emboss_icons/isochore.png - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_lindna.xml --- a/tools/emboss_5/emboss_lindna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ - - - Draws linear maps of DNA constructs - emboss - lindna -infile $input1 -graphout png -goutfile $out_file1 -ruler $ruler -blocktype $blocktype -maxgroups $maxgroups -maxlabels $maxlabels -intersymbol $intersymbol -intercolour $intercolour - -interticks $interticks -gapsize $gapsize -ticklines $ticklines -textheight $textheight -textlength $textlength -margin $margin -tickheight $tickheight -blockheight $blockheight -rangeheight - $rangeheight -gapgroup $gapgroup -postext $postext -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/lindna.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_marscan.xml --- a/tools/emboss_5/emboss_marscan.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ - - Finds MAR/SAR sites in nucleic sequences - emboss - marscan -sequence $input1 -outfile $out_file1 -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/marscan.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_maskfeat.xml --- a/tools/emboss_5/emboss_maskfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Mask off features of a sequence - emboss - maskfeat -sequence $input1 -outseq $out_file1 -type "$type" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_maskseq.xml --- a/tools/emboss_5/emboss_maskseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Mask off regions of a sequence - emboss - maskseq -sequence $input1 -outseq $out_file1 -regions "$regions" -tolower $tolower -maskchar "$maskchar" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
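As a concrete reference for the isochore example above, the windowed G+C table can be reproduced in a few lines of Python. A sketch under assumptions: the window advances by the shift increment and each value is reported at the window midpoint (consistent with the 32-base spacing in the sample output, though the exact window and shift behind it are not recoverable from this hunk)::

    def gc_profile(seq, window, shift):
        # Yield (midpoint position, fraction G+C) per window, mirroring the
        # "Position  Percent G+C" table in the isochore example.
        seq = seq.upper()
        for start in range(0, len(seq) - window + 1, shift):
            chunk = seq[start:start + window]
            gc = (chunk.count('G') + chunk.count('C')) / float(len(chunk))
            yield start + window // 2, round(gc, 3)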
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/maskseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_matcher.xml --- a/tools/emboss_5/emboss_matcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ - - Finds the best local alignments between two sequences - emboss - matcher -asequence $input1 -bsequence $input2 -outfile $out_file1 -alternatives $alternatives -gapopen $gapopen -gapextend $gapextend -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/matcher.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_megamerger.xml --- a/tools/emboss_5/emboss_megamerger.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Merge two large overlapping nucleic acid sequences - emboss - megamerger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -wordsize $wordsize -prefer $prefer -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/megamerger.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_merger.xml --- a/tools/emboss_5/emboss_merger.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ - - Merge two overlapping nucleic acid sequences - emboss - merger -asequence $input1 -bsequence $input2 -outseq $out_file1 -outfile $out_file2 -gapopen $gapopen -gapextend $gapextend -osformat4 $out_format1 -aformat3 $out_format2 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/merger.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_msbar.xml --- a/tools/emboss_5/emboss_msbar.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - Mutate sequence beyond all recognition - emboss - msbar -sequence $input1 -outseq $out_file1 -count $count -point $point -block $block -codon $codon -inframe $inframe -minimum $minimum -maximum $maximum -osformat2 $out_format1 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/msbar.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_multiple_outputfile_wrapper.pl --- a/tools/emboss_5/emboss_multiple_outputfile_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w -use strict; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - print "outfile: $thisLine\n"; - } - else - { - print $thisLine,"\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_needle.xml --- a/tools/emboss_5/emboss_needle.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ - - Needleman-Wunsch global alignment - emboss - needle -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen $gapopen -gapextend $gapextend -brief $brief -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -needle reads any two sequences of the same type (DNA or protein). - ------ - -**Syntax** - -This tool uses the Needleman-Wunsch global alignment algorithm to find the optimum alignment (including gaps) of two sequences when considering their entire length. - -- **Optimal alignment:** Dynamic programming methods ensure the optimal global alignment by exploring all possible alignments and choosing the best. - -- **The Needleman-Wunsch algorithm** is a member of the class of algorithms that can calculate the best score and alignment in the order of mn steps, (where 'n' and 'm' are the lengths of the two sequences). - -- **Gap open penalty:** [10.0 for any sequence] The gap open penalty is the score taken away when a gap is created. The best value depends on the choice of comparison matrix. The default value assumes you are using the EBLOSUM62 matrix for protein sequences, and the EDNAFULL matrix for nucleotide sequences. (Floating point number from 1.0 to 100.0) - -- **Gap extension penalty:** [0.5 for any sequence] The gap extension, penalty is added to the standard gap penalty for each base or residue in the gap. This is how long gaps are penalized. Usually you will expect a few long gaps rather than many short gaps, so the gap extension penalty should be lower than the gap penalty. An exception is where one or both sequences are single reads with possible sequencing errors in which case you would expect many single base gaps. You can get this result by setting the gap open penalty to zero (or very low) and using the gap extension penalty to control gap scoring. (Floating point number from 0.0 to 10.0) - -You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/needle.html - ------ - -**Example** - -- Input File:: - - >hg18_dna range=chrX:151073054-151073136 5'pad=0 3'pad=0 revComp=FALSE strand=? 
repeatMasking=none - TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA - GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG - -- If both Sequence1 and Sequence2 take the above file as input, Gap open penalty equals 10.0, Gap extension penalty equals 0.5, Brief identity and similarity is set to Yes, Output Alignment File Format is set to SRS pairs, the output file is:: - - ######################################## - # Program: needle - # Rundate: Mon Apr 02 2007 14:23:16 - # Align_format: srspair - # Report_file: ./database/files/dataset_7.dat - ######################################## - - #======================================= - # - # Aligned_sequences: 2 - # 1: hg18_dna - # 2: hg18_dna - # Matrix: EDNAFULL - # Gap_penalty: 10.0 - # Extend_penalty: 0.5 - # - # Length: 83 - # Identity: 83/83 (100.0%) - # Similarity: 83/83 (100.0%) - # Gaps: 0/83 ( 0.0%) - # Score: 415.0 - # - #======================================= - - hg18_dna 1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA 50 - |||||||||||||||||||||||||||||||||||||||||||||||||| - hg18_dna 1 TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCA 50 - - hg18_dna 51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG 83 - ||||||||||||||||||||||||||||||||| - hg18_dna 51 GTAAAAAGAAGGCTGTTGTTCCGTGAAATACTG 83 - - #--------------------------------------- - #--------------------------------------- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newcpgreport.xml --- a/tools/emboss_5/emboss_newcpgreport.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Report CpG rich areas - emboss - newcpgreport -sequence $input1 -window $window -shift $shift -minlen $minlen -minpc $minpc -outfile $out_file1 -minoe $minoe -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgreport.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newcpgseek.xml --- a/tools/emboss_5/emboss_newcpgseek.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Reports CpG rich region - emboss - newcpgseek -sequence $input1 -outfile $out_file1 -score $score -auto - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newcpgseek.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_newseq.xml --- a/tools/emboss_5/emboss_newseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - Type in a short new sequence - emboss - newseq -outseq $out_file1 -name "$seqname" -description "$description" -type $type -sequence "$sequence" -osformat5 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
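The needle help above describes the Needleman-Wunsch recurrence in words; the toy scorer below makes it concrete. This is an illustrative sketch, not needle itself: it uses EDNAFULL-style +5/-4 match/mismatch scores but a single linear gap penalty, whereas needle applies separate -gapopen/-gapextend penalties and full scoring matrices::

    def nw_score(a, b, match=5, mismatch=-4, gap=-10):
        # F[i][j] = best global-alignment score of a[:i] versus b[:j].
        # Filling the (m+1) x (n+1) table is the "order of mn steps" noted above.
        m, n = len(a), len(b)
        F = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            F[i][0] = i * gap
        for j in range(1, n + 1):
            F[0][j] = j * gap
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                s = match if a[i - 1] == b[j - 1] else mismatch
                F[i][j] = max(F[i - 1][j - 1] + s,
                              F[i - 1][j] + gap,
                              F[i][j - 1] + gap)
        return F[m][n]

With no gaps and a +5 match score, aligning the 83-base example sequence against itself gives 83 * 5 = 415, matching the "Score: 415.0" line in the sample report.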
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/newseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_noreturn.xml --- a/tools/emboss_5/emboss_noreturn.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ - - Removes carriage return from ASCII files - emboss - noreturn -infile $input1 -outfile $out_file1 -system $system -auto - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/noreturn.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_notseq.xml --- a/tools/emboss_5/emboss_notseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Exclude a set of sequences and write out the remaining ones - emboss - notseq -sequence $input1 -outseq $out_file1 -exclude "$exclude" -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/notseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_nthseq.xml --- a/tools/emboss_5/emboss_nthseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Writes one sequence from a multiple set of sequences - emboss - nthseq -sequence $input1 -outseq $out_file1 -number $number -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/nthseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_octanol.xml --- a/tools/emboss_5/emboss_octanol.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - - Displays protein hydropathy - emboss - emboss_single_outputfile_wrapper.pl octanol -sequence $input1 -graph png -goutfile $out_file1 -width $width -octanolplot $octanolplot -interfaceplot $interfaceplot - -differenceplot $differenceplot -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/octanol.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_oddcomp.xml --- a/tools/emboss_5/emboss_oddcomp.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - Find protein sequence regions with a biased composition - emboss - oddcomp -sequence $input1 -infile $input2 -outfile $out_file1 -window $window -ignorebz $ignorebz -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/oddcomp.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_palindrome.xml --- a/tools/emboss_5/emboss_palindrome.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Looks for inverted repeats in a nucleotide sequence - emboss - palindrome -sequence $input1 -outfile $out_file1 -minpallen $minpallen -maxpallen $maxpallen -gaplimit $gaplimit -nummismatches $nummismatches -overlap $overlap -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/palindrome.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pasteseq.xml --- a/tools/emboss_5/emboss_pasteseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Insert one sequence into another - emboss - pasteseq -asequence $input2 -bsequence $input1 -outseq $out_file1 -pos $pos -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pasteseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_patmatdb.xml --- a/tools/emboss_5/emboss_patmatdb.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ - - Search a protein sequence with a motif - emboss - patmatdb -sequence $input1 -outfile $out_file1 -motif "$motif" -rformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/patmatdb.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepcoil.xml --- a/tools/emboss_5/emboss_pepcoil.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - Predicts coiled coil regions - emboss - pepcoil -sequence $input1 -outfile $out_file1 -window $window -coil $coil -frame $frame -other $other -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepcoil.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepinfo.xml --- a/tools/emboss_5/emboss_pepinfo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ - - - Plots simple amino acid properties in parallel - emboss - emboss_single_outputfile_wrapper.pl pepinfo -sequence $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -hwindow $hwindow $plot_type -auto - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepinfo.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepnet.xml --- a/tools/emboss_5/emboss_pepnet.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - - Displays proteins as a helical net - emboss - pepnet -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic $amphipathic -auto - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepnet.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepstats.xml --- a/tools/emboss_5/emboss_pepstats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Protein statistics - emboss - pepstats -sequence $input1 -outfile $out_file1 -termini $termini -auto - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepstats.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwheel.xml --- a/tools/emboss_5/emboss_pepwheel.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ - - - Shows protein sequences as helices - emboss - emboss_single_outputfile_wrapper.pl pepwheel -sequence $input1 -graph png -goutfile $out_file1 -squares $squares -diamonds $diamonds -octags $octags -amphipathic - $amphipathic -steps $steps -turns $turns -wheel $wheel -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwheel.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwindow.xml --- a/tools/emboss_5/emboss_pepwindow.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Displays protein hydropathy - emboss - emboss_single_outputfile_wrapper.pl pepwindow -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindow.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_pepwindowall.xml --- a/tools/emboss_5/emboss_pepwindowall.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Displays protein hydropathy of a set of sequences - emboss - emboss_single_outputfile_wrapper.pl pepwindowall -sequence $input1 -graph png -goutfile $out_file1 -length $length -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/pepwindowall.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_plotcon.xml --- a/tools/emboss_5/emboss_plotcon.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ - - - Plot quality of conservation of a sequence alignment - emboss - emboss_single_outputfile_wrapper.pl plotcon -sequences $input1 -graph png -goutfile $out_file1 -winsize $winsize -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotcon.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_plotorf.xml --- a/tools/emboss_5/emboss_plotorf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - - Plot potential open reading frames - emboss - emboss_single_outputfile_wrapper.pl plotorf -sequence $input1 -graph png -goutfile $out_file1 -start $start -stop $stop -auto - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/plotorf.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_polydot.xml --- a/tools/emboss_5/emboss_polydot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - - Displays all-against-all dotplots of a set of sequences - emboss - emboss_single_outputfile_wrapper.pl polydot -sequence $input1 -graph png -goutfile $output2 -outfeat $output1 -wordsize $wordsize -boxit $boxit -dumpfeat yes -gap - $gap -auto - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/polydot.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_preg.xml --- a/tools/emboss_5/emboss_preg.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ - - Regular expression search of a protein sequence - emboss - preg -sequence $input1 -outfile $out_file1 -pattern "$pattern" -auto - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/preg.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_prettyplot.xml --- a/tools/emboss_5/emboss_prettyplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ - - - Displays aligned sequences, with colouring and boxing - emboss - prettyplot -sequences $input1 -graph png -goutfile $out_file1 -residuesperline $residuesperline -resbreak $resbreak -ccolours $ccolours -cidentity $cidentity -csimilarity $csimilarity - -cother $cother -docolour $docolour -gtitle $title -pair $pair -identity $identity -box $box -boxcol $boxcol -boxcolval $boxcolval -name $name -maxnamelen $maxnamelen -number $number -listoptions - $listoptions -consensus $consensus -collision $collision -alternative $alternative -showscore $showscore -portrait $portrait -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyplot.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_prettyseq.xml --- a/tools/emboss_5/emboss_prettyseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Output sequence with translated ranges - emboss - prettyseq -sequence $input1 -outfile $out_file1 -ruler $ruler -plabel $plabel -nlabel $nlabel -width $width -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/prettyseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_primersearch.xml --- a/tools/emboss_5/emboss_primersearch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ - - Searches DNA sequences for matches with primer pairs - emboss - primersearch -seqall $input1 -infile $input2 -outfile $out_file1 -mismatchpercent $mismatchpercent -auto - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/primersearch.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_revseq.xml --- a/tools/emboss_5/emboss_revseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - Reverse and complement a sequence - emboss - revseq -sequence $input1 -outseq $out_file1 -reverse $reverse -complement $complement -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/revseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_seqmatchall.xml --- a/tools/emboss_5/emboss_seqmatchall.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - All-against-all comparison of a set of sequences - emboss - seqmatchall -sequence $input1 -outfile $out_file1 -wordsize $wordsize -aformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - . - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqmatchall.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_seqret.xml --- a/tools/emboss_5/emboss_seqret.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - Reads and writes sequences - emboss - seqret -sequence $input1 -outseq $out_file1 -feature $feature -firstonly $firstonly -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/seqret.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_showfeat.xml --- a/tools/emboss_5/emboss_showfeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - - Show features of a sequence - emboss - showfeat -sequence $input1 -outfile $out_file1 -matchsource "$matchsource" -matchtype "$matchtype" -matchtag "$matchtag" -matchvalue "$matchvalue" -sort $sort -annotation "$annotation" -id - $id -description "$description" -scale "$scale" -width "$width" -collapse $collapse -forward $forward -reverse $reverse -unknown $unknown -strand $strand -source $source -position $position -type - $type -tags $tags -values $values -stricttags $stricttags -html $html_out1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/showfeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_shuffleseq.xml --- a/tools/emboss_5/emboss_shuffleseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ - - - Shuffles a set of sequences maintaining composition - emboss - shuffleseq -sequence $input1 -outseq $out_file1 -shuffle "$shuffle" -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/shuffleseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sigcleave.xml --- a/tools/emboss_5/emboss_sigcleave.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - Reports protein signal cleavage sites - emboss - sigcleave -sequence $input1 -outfile $out_file1 -minweight "$minweight" -prokaryote $prokaryote -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sigcleave.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_single_outputfile_wrapper.pl --- a/tools/emboss_5/emboss_single_outputfile_wrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#! /usr/bin/perl -w -use strict; -use File::Copy; - -my $cmd_string = join (" ",@ARGV); -my $results = `$cmd_string`; -my @files = split("\n",$results); -my $fileNameOut = $ARGV[6]; -my ($drive, $outputDir, $file) = File::Spec->splitpath( $fileNameOut ); -my $destination = $fileNameOut; - -foreach my $thisLine (@files) -{ - if ($thisLine =~ /Created /) - { - $thisLine =~ /[\w|\.]+$/; - $thisLine =$&; - #print "outfile: $thisLine\n"; - #there is only one file to move, so we can quit after finding it - move($drive.$outputDir.$thisLine,$fileNameOut); - exit(1); - } - else - { - print $thisLine,"\n"; - } -} diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sirna.xml --- a/tools/emboss_5/emboss_sirna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ - - Finds siRNA duplexes in mRNA - emboss - sirna -sequence $input1 -outfile $ofile1 -outseq $ofile2 -poliii $poliii -aa $aa -tt $tt -polybase $polybase -context $context -rformat2 $out_format1 -osformat3 $out_format2 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sirna.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_sixpack.xml --- a/tools/emboss_5/emboss_sixpack.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ - - - Display a DNA sequence with 6-frame translation and ORFs - emboss - sixpack -sequence $input1 -outfile $ofile1 -outseq $ofile2 -table $table -firstorf $firstorf -lastorf $lastorf -mstart $mstart -reverse $reverse -orfminsize $orfminsize -uppercase - "$uppercase" -number $number -width "$width" -length "$length" -margin "$margin" -name $disp_name -description $description -offset "$offset" -html $html_out1 -osformat $out_format2 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/sixpack.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_skipseq.xml --- a/tools/emboss_5/emboss_skipseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - Reads and writes sequences, skipping first few - emboss - skipseq -sequence $input1 -outseq $out_file1 -skip "$skip" -feature $feature -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/skipseq.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_splitter.xml --- a/tools/emboss_5/emboss_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - Split a sequence into (overlapping) smaller sequences - emboss - splitter -sequence $input1 -outseq $out_file1 -size "$size" -overlap "$overlap" -addoverlap $addoverlap -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/splitter.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_supermatcher.xml --- a/tools/emboss_5/emboss_supermatcher.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ - - - Match large sequences against one or more other sequences - emboss - supermatcher -asequence $input1 -bsequence $input2 -gapopen "$gapopen" -gapextend "$gapextend" -width "$width" -wordlen "$wordlen" -outfile $ofile1 -errorfile $ofile2 -aformat3 - $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/supermatcher.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_syco.xml --- a/tools/emboss_5/emboss_syco.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,197 +0,0 @@ - - - Synonymous codon usage Gribskov statistic plot - emboss - emboss_single_outputfile_wrapper.pl syco -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -cfile $cfile -window "$window" -uncommon $uncommon -minimum "$minimum" - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/syco.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tcode.xml --- a/tools/emboss_5/emboss_tcode.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Fickett TESTCODE statistic to identify protein-coding DNA - emboss - tcode -sequence $input1 -outfile $out_file1 -window "$window" -step "$step" -rformat $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tcode.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_textsearch.xml --- a/tools/emboss_5/emboss_textsearch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - Search sequence documentation. Slow, use SRS and Entrez! - emboss - textsearch -sequence $input1 -outfile $out_file1 -pattern "$pattern" -casesensitive -heading $heading -usa $usa -accession $accession -name $search_name -description $description -html - $html_out1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/textsearch.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tmap.xml --- a/tools/emboss_5/emboss_tmap.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - Displays membrane spanning regions - emboss - emboss_single_outputfile_wrapper.pl tmap -sequences $input1 -outfile $out_file1 -goutfile $out_file2 -graph png -rformat $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tmap.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_tranalign.xml --- a/tools/emboss_5/emboss_tranalign.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - Align nucleic coding regions given the aligned proteins - emboss - tranalign -asequence $input1 -bsequence $input2 -outseq $out_file1 -table $table -osformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/tranalign.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_transeq.xml --- a/tools/emboss_5/emboss_transeq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - Translate nucleic acid sequences - emboss - transeq -sequence $input1 -outseq $out_file1 -frame $frame -table $table -regions "$regions" -trim $trim -clean $clean -alternative $alternative -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/transeq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_trimest.xml --- a/tools/emboss_5/emboss_trimest.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - Trim poly-A tails off EST sequences - emboss - trimest -sequence $input1 -outseq $out_file1 -minlength "$minlength" -mismatches "$mismatches" -reverse $reverse -tolower $tolower -fiveprime $fiveprime -osformat2 $out_format1 - -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimest.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_trimseq.xml --- a/tools/emboss_5/emboss_trimseq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ - - Trim ambiguous bits off the ends of sequences - emboss - trimseq -sequence $input1 -outseq $out_file1 -window "$window" -percent "$percent" -strict $strict -star $star -left $left -right $right -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/trimseq.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_twofeat.xml --- a/tools/emboss_5/emboss_twofeat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - Finds neighbouring pairs of features in sequences - emboss - twofeat -sequence $input1 -outfile $out_file1 -atype "$atype" -btype "$btype" -minrange "$minrange" -maxrange "$maxrange" -asource "$asource" -asense $asense -aminscore "$aminscore" - -amaxscore "$amaxscore" -atag "$atag" -avalue "$avalue" -bsource "$bsource" -bsense "$bsense" -bminscore "$bminscore" -bmaxscore "$bmaxscore" -btag "$btag" -bvalue "$bvalue" -overlap $overlap - -rangetype $rangetype -sense $sense -order $order -twoout $twoout -typeout "$typeout" -rformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/twofeat.html - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_union.xml --- a/tools/emboss_5/emboss_union.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - Reads sequence fragments and builds one sequence - emboss - union -sequence $input1 -outseq $out_file1 -osformat2 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/union.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_vectorstrip.xml --- a/tools/emboss_5/emboss_vectorstrip.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - Strips out DNA between a pair of vector sequences - emboss - vectorstrip -sequence $input1 -vectorsfile $input2 -outseq $ofile1 -outfile $ofile2 -vectorfile yes -mismatch "$mismatch" -besthits $besthits -linkera "$linkera" -linkerb - "$linkerb" -osformat4 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/vectorstrip.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_water.xml --- a/tools/emboss_5/emboss_water.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - Smith-Waterman local alignment - emboss - water -asequence $input1 -bsequence $input2 -outfile $out_file1 -gapopen "$gapopen" -gapextend "$gapextend" -brief $brief -aformat3 $out_format1 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/water.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wobble.xml --- a/tools/emboss_5/emboss_wobble.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - Wobble base plot - emboss - emboss_single_outputfile_wrapper.pl wobble -sequence $input1 -graph png -goutfile $ofile1 -outfile $ofile2 -window "$window" -bases "$bases" -auto - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wobble.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wordcount.xml --- a/tools/emboss_5/emboss_wordcount.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ - - Counts words of a specified size in a DNA sequence - emboss - wordcount -sequence $input1 -outfile $out_file1 -wordsize "$wordsize" -auto - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input dataset needs to be sequences. - ------ - - You can view the original documentation here_. - - .. _here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordcount.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/emboss_5/emboss_wordmatch.xml --- a/tools/emboss_5/emboss_wordmatch.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - Finds all exact matches of a given size between 2 sequences - emboss - wordmatch -asequence $input1 -bsequence $input2 -outfile $out_file1 -aoutfeat $out_file2 -boutfeat $out_file3 -wordsize "$wordsize" -aformat3 $out_format1 -offormat4 $out_format2 - -offormat5 $out_format3 -auto - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -The input datasets need to be sequences. - ------ - - You can view the original documentation here_. - - .. 
_here: http://emboss.sourceforge.net/apps/release/5.0/emboss/apps/wordmatch.html - - diff -r c2a356708570 -r 33c067c3ae34 tools/encode/gencode_partition.xml --- a/tools/encode/gencode_partition.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ - - an interval file - split_by_partitions.py ${GALAXY_DATA_INDEX_DIR} $input1 $out_file1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} - - - - - - - - - - - - - -For detailed information about partitioning, click here_. - -.. _here: http://genome.imim.es/gencode/wiki/index.php/Collecting_Feature_Sets_from_All_Analysis_Groups - -Datasets are partitioned according to the protocol below: - -A partition scheme has been defined that is similar to what has previously been done with TARs/TRANSFRAGs such that any feature can be classified as falling into one of the following 6 categories: - 1. **Coding** -- coding exons defined from the GENCODE experimentally verified coding set (coding in any transcript) - 2. **5UTR** -- 5' UTR exons defined from the GENCODE experimentally verified coding set (5' UTR in some transcript but never coding in any other) - 3. **3UTR** -- 3' UTR exons defined from the GENCODE experimentally verified coding set (3' UTR in some transcript but never coding in any other) - 4. **Intronic Proximal** -- intronic and no more than 5kb away from an exon. - 5. **Intergenic Proximal** -- between genes and no more than 5kb away from an exon. - 6. **Intronic Distal** -- intronic and greater than 5kb away from an exon. - 7. **Intergenic Distal** -- between genes and greater than 5kb away from an exon. - ------ - -.. class:: infomark - -**Note:** Features overlapping more than one partition will take the identity of the lower-numbered partition. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/encode/random_intervals.xml --- a/tools/encode/random_intervals.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - -create a random set of intervals - random_intervals_no_bits.py $regions $input2 $input1 $out_file1 ${input2.metadata.chromCol} ${input2.metadata.startCol} ${input2.metadata.endCol} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $use_mask $strand_overlaps ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This tool currently only works with ENCODE data from genome builds hg16 or hg17. - ------ - -.. class:: infomark - -**Note:** If you do not wish to mask a set of intervals, change the Use Mask option to No, this option will override any Mask files selected. - ------ - -**Syntax** - -This tool will attempt to create a random set of intervals that mimic those found within your source file. You may also specify a set of intervals to mask. - -**Allow overlaps** options - * **Across Strands** - random regions are allowed to overlap only if they are on different strands. - * **Any** - all overlaps are allowed. - * **None** - no overlapping regions are allowed. - -**Regions to use** options - * Bounding region of interest based on the dataset build. 
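The help text above outlines the sampling problem informally; at its core is a rejection loop per mimicked interval. The sketch below is deliberately simplified: one bounding region, no strand handling, and the "None" overlap policy folded into the mask, unlike the full random_intervals_no_bits.py deleted below::

    import random

    def sample_intervals(lengths, bound_start, bound_end, mask, max_tries=1000):
        # Place one random same-length interval per source interval, rejecting
        # candidates that touch a masked or already-placed range.
        placed = []
        busy = list(mask)
        for length in sorted(lengths, reverse=True):  # longest first, as in the tool
            for _ in range(max_tries):
                start = random.randint(bound_start, bound_end - length)
                end = start + length
                if all(end <= s or start >= e for s, e in busy):
                    placed.append((start, end))
                    busy.append((start, end))
                    break
        return placed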
- - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/encode/random_intervals_no_bits.py --- a/tools/encode/random_intervals_no_bits.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,253 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -#%prog bounding_region_file mask_intervals_file intervals_to_mimic_file out_file mask_chr mask_start mask_end interval_chr interval_start interval_end interval_strand use_mask allow_strand_overlaps -import sys, random -from copy import deepcopy -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.intervals.io -import bx.intervals.intersection -import psyco_full - -assert sys.version_info[:2] >= ( 2, 4 ) - -max_iters = 5 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -#Try to add a random region -def add_random_region( mimic_region, bound, exist_regions, plus_mask, minus_mask, overlaps ): - region_length, region_strand = mimic_region - plus_count = plus_mask.count_range() - minus_count = minus_mask.count_range() - gaps = [] - - if region_strand == "-": - gaps = minus_mask.get_gaps( region_length ) - else: - gaps = plus_mask.get_gaps( region_length ) - - while True: - try: - gap_length, gap_start, gap_end = gaps.pop( random.randint( 0, len( gaps ) - 1 ) ) - except: - break - try: - start = random.randint( bound.start + gap_start, bound.start + gap_end - region_length - 1 ) - except ValueError, ve: - stop_err( "Exception thrown generating random start value: %s" %str( ve ) ) - - end = start + region_length - try_plus_mask = plus_mask.copy() - try_minus_mask = minus_mask.copy() - - if region_strand == "-": - try_minus_mask.set_range( start - bound.start, end - bound.start ) - else: - try_plus_mask.set_range( start - bound.start, end - bound.start ) - - rand_region = bx.intervals.io.GenomicInterval( None, [bound.chrom, start, end, region_strand], 0, 1, 2, 3, "+", fix_strand=True ) - - if try_plus_mask.count_range() == plus_count + region_length or try_minus_mask.count_range() == minus_count + region_length: - if overlaps in ["strand", "all"]: #overlaps allowed across strands - exist_regions.append( rand_region ) - if overlaps == "strand": - return exist_regions, True, try_plus_mask, try_minus_mask - else: #overlaps allowed everywhere - return exist_regions, True, plus_mask, minus_mask - else: #no overlapping anywhere - exist_regions.append( rand_region ) - if region_strand == "-": - return exist_regions, True, try_minus_mask.copy(), try_minus_mask - else: - return exist_regions, True, try_plus_mask, try_plus_mask.copy() - return exist_regions, False, plus_mask, minus_mask - -def main(): - includes_strand = False - region_uid = sys.argv[1] - mask_fname = sys.argv[2] - intervals_fname = sys.argv[3] - out_fname = sys.argv[4] - try: - mask_chr = int( sys.argv[5] ) - 1 - except: - stop_err( "'%s' is an invalid chrom column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[5] ) ) - try: - mask_start = int( sys.argv[6] ) - 1 - except: - stop_err( "'%s' is an invalid start column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[6] ) ) - try: - mask_end = int( sys.argv[7] ) - 1 - except: - stop_err( "'%s' is an invalid end column for 'Intervals to Mask' dataset, click the pencil icon in the history item to edit column settings." 
% str( sys.argv[7] ) ) - try: - interval_chr = int( sys.argv[8] ) - 1 - except: - stop_err( "'%s' is an invalid chrom column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[8] ) ) - try: - interval_start = int( sys.argv[9] ) - 1 - except: - stop_err( "'%s' is an invalid start column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[9] ) ) - try: - interval_end = int( sys.argv[10] ) - 1 - except: - stop_err( "'%s' is an invalid end column for 'File to Mimick' dataset, click the pencil icon in the history item to edit column settings." % str( sys.argv[10] ) ) - try: - interval_strand = int( sys.argv[11] ) - 1 - includes_strand = True - except: - interval_strand = -1 - if includes_strand: - use_mask = sys.argv[12] - overlaps = sys.argv[13] - else: - use_mask = sys.argv[11] - overlaps = sys.argv[12] - available_regions = {} - loc_file = "%s/regions.loc" % sys.argv[-1] - - for i, line in enumerate( file( loc_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - #read each line, if not enough fields, go to next line - try: - build = fields[0] - uid = fields[1] - description = fields[2] - filepath = fields[3] - available_regions[uid] = filepath - except: - continue - - if region_uid not in available_regions: - stop_err( "Region '%s' is invalid." % region_uid ) - region_fname = available_regions[region_uid].strip() - - #set up bounding regions to hold random intervals - bounds = [] - for bound in bx.intervals.io.NiceReaderWrapper( open( region_fname, 'r' ), chrom_col=0, start_col=1, end_col=2, fix_strand=True, return_header=False, return_comments=False ): - bounds.append( bound ) - #set up length and number of regions to mimic - regions = [ [] for i in range( len( bounds ) ) ] - - for region in bx.intervals.io.NiceReaderWrapper( open( intervals_fname, 'r' ), chrom_col=interval_chr, start_col=interval_start, end_col=interval_end, strand_col=interval_strand, fix_strand=True, return_header=False, return_comments=False ): - #loop through bounds, find first proper bounds then add - #if an interval crosses bounds, it will be added to the first bound - for i in range( len( bounds ) ): - if bounds[i].chrom != region.chrom: - continue - intersecter = bx.intervals.intersection.Intersecter() - intersecter.add_interval( bounds[i] ) - if len( intersecter.find( region.start, region.end ) ) > 0: - regions[i].append( ( region.end - region.start, region.strand ) ) #add region to proper bound and go to next region - break - for region in regions: - region.sort() - region.reverse() - - #read mask file - mask = [] - if use_mask != "no_mask": - for region in bx.intervals.io.NiceReaderWrapper( open( mask_fname, 'r' ), chrom_col=mask_chr, start_col=mask_start, end_col=mask_end, fix_strand=True, return_header=False, return_comments=False ): - mask.append( region ) - - try: - out_file = open ( out_fname, "w" ) - except: - stop_err( "Error opening output file '%s'." 
% out_fname ) - - i = 0 - i_iters = 0 - region_count = 0 - best_regions = [] - num_fail = 0 - while i < len( bounds ): - i_iters += 1 - #order regions to mimic - regions_to_mimic = regions[i][0:] - if len( regions_to_mimic ) < 1: #if no regions to mimic, skip - i += 1 - i_iters = 0 - continue - #set up region mask - plus_mask = Region( bounds[i].end - bounds[i].start ) - for region in mask: - if region.chrom != bounds[i].chrom: continue - mask_start = region.start - bounds[i].start - mask_end = region.end - bounds[i].start - if mask_start >= 0 and mask_end > 0: - plus_mask.set_range( mask_start, mask_end ) - minus_mask = plus_mask.copy() - random_regions = [] - num_added = 0 - for j in range( len( regions[i] ) ): - random_regions, added, plus_mask, minus_mask = add_random_region( regions_to_mimic[j], bounds[i], random_regions, plus_mask, minus_mask, overlaps ) - if added: - num_added += 1 - if num_added == len( regions_to_mimic ) or i_iters >= max_iters: - if len( best_regions ) > len( random_regions ): - random_regions = best_regions.copy() - num_fail += ( len( regions_to_mimic ) - len( random_regions ) ) - i_iters = 0 - best_regions = [] - for region in random_regions: - print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s" % ( region.chrom, region.start, region.end, "region_" + str( region_count ), "0", region.strand ) - region_count += 1 - else: - i -= 1 - if len( best_regions ) < len( random_regions ): - best_regions = random_regions[:] - i+=1 - - out_file.close() - if num_fail: - print "After %i iterations, %i regions could not be added." % (max_iters, num_fail) - if use_mask == "use_mask": - print "The mask you have provided may be too restrictive." - -class Region( list ): - """ - A list for on/off regions - """ - def __init__( self, size=0 ): - for i in range( size ): - self.append( False ) - def copy( self ): - return deepcopy( self ) - def set_range( self, start=0, end=None ): - if start < 0: - start = 0 - if ( not end and end != 0 ) or end > len( self ): - end = len( self ) - for i in range( start, end ): - self[i]=True - def count_range( self, start=0, end=None ): - if start < 0: - start = 0 - if ( not end and end != 0 ) or end > len( self ): - end = len( self ) - return self[start:end].count( True ) - def get_gaps( self, min_size = 0 ): - gaps = [] - start = end = 0 - while True: - try: - start = self[end:].index( False ) + end - except: - break - try: - end = self[start:].index( True ) + start - except: - end = len( self ) - if end > start and end - start >= min_size: - gaps.append( ( end - start, start, end ) ) - gaps.sort() - gaps.reverse() - return gaps - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/encode/split_by_partitions.py --- a/tools/encode/split_by_partitions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,125 +0,0 @@ -#!/usr/bin/env python -#Original script from /home/james/work/encode/feature_partitions/split_by_partitions.py - -#Usage: python(2.4) split_by_partitions.py partition_index in_file out_file chrCol startCol endCol strandCol - -from __future__ import division - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.bitset import * -from bx.bitset_builders import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - GALAXY_DATA_INDEX_DIR = sys.argv[1] - partition_index = '%s/encode_feature_partitions/partition_list.txt' % GALAXY_DATA_INDEX_DIR - partition_offset = 
"%s/encode_feature_partitions/" % GALAXY_DATA_INDEX_DIR - - warnings = [] - - # Load up the partitions - partitions = list() - try: - for line in open( partition_index ): - name, score, filename = line.split() - partitions.append( ( name, score, binned_bitsets_from_file( open( partition_offset+filename ) ) ) ) - except: - stop_err( "Error loading partitioning dataset." ) - - try: - in_file = open( sys.argv[2] ) - except: - stop_err( "Bad input data." ) - - try: - out_file = open( sys.argv[3], "w" ) - except: - stop_err( "Bad output file." ) - - try: - chrCol = int( sys.argv[4] ) - 1 - except: - stop_err( "Bad chr column: %s" % ( str( sys.argv[4] ) ) ) - try: - startCol = int( sys.argv[5] ) - 1 - except: - stop_err( "Bad start column: %s" % ( str( sys.argv[5] ) ) ) - try: - endCol = int( sys.argv[6] ) - 1 - except: - stop_err( "Bad end column: %s" % ( str( sys.argv[6] ) ) ) - try: - strandCol = int( sys.argv[7] )-1 - except: - strandCol = -1 - - line_count = 0 - skipped_lines = 0 - first_invalid_line = None - invalid_line = '' - try: - for line in in_file: - line_count += 1 - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - try: - chr, start, end = fields[chrCol], int( fields[startCol] ), int( fields[endCol] ) - except: - skipped_lines += 1 - if first_invalid_line is None: - first_invalid_line = line_count - invalid_line = line - continue - label = "input_line_" + str( line_count ) #if input file type was known to be bed, then could guess at label column - - if strandCol < 0: - strand = "+" - else: - try: - strand = fields[strandCol] - except: - strand = "+" - - # Find which partition it overlaps - overlap = 0 - for name, score, bb in partitions: - # Is there at least 1bp overlap? - if chr in bb: - overlap = bb[chr].count_range( start, end-start ) - if overlap > 0: - break - else: - # No overlap with any partition? For now throw this since the - # partitions tile the encode regions completely, indicate an interval - # that does not even overlap an encode region - warning = "warning: Interval (%s, %d, %d) does not overlap any partition" % ( chr, start, end ) + ", line[" + str( line_count ) + "]. " - warnings.append( warning ) - name = "no_overlap" - score = 0 - # Annotate with the name of the partition - frac_overlap = overlap / ( end-start ) - # BED6 plus? - print >>out_file, "%s\t%d\t%d\t%s\t%s\t%s\t%s\t%0.4f" % ( chr, start, end, label, score, strand, name, frac_overlap ) - except: - out_file.close() - in_file.close() - stop_err( "Unknown error while processing line # %d: %s" % ( line_count, line ) ) - out_file.close() - in_file.close() - - if warnings: - warn_msg = "This tool is useful on ENCODE regions only, %d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - print "Skipped %d invalid lines starting at line # %d: %s" % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/add_scores.xml --- a/tools/evolution/add_scores.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - interspecies conservation scores - - - add_scores $input1 ${input1.metadata.dbkey} ${input1.metadata.chromCol} ${input1.metadata.startCol} ${GALAXY_DATA_INDEX_DIR}/add_scores.loc $out_file1 - - - - - - - - - - - - - - - add_scores - - - - - - - - - - - - - - -.. class:: warningmark - -This currently works only for build hg18. 
-
------
-
-**Dataset formats**
-
-The input can be any interval_ format dataset. The output is also in interval format.
-(`Dataset missing?`_)
-
-.. _interval: ./static/formatHelp.html#interval
-.. _Dataset missing?: ./static/formatHelp.html
-
------
-
-**What it does**
-
-This tool adds a column that measures interspecies conservation at each SNP
-position, using conservation scores for primates pre-computed by the
-phyloP program. PhyloP performs an exact P-value computation under a
-continuous Markov substitution model.
-
-The chromosome and start position
-are used to look up the scores, so if a larger interval is in the input,
-only the score for the first nucleotide is returned.
-
------
-
-**Example**
-
-- input file, with SNPs::
-
- chr22 14440426 14440427 C/T
- chr22 14494851 14494852 A/G
- chr22 14494911 14494912 A/T
- chr22 14550435 14550436 A/G
- chr22 14611956 14611957 G/T
- chr22 14612076 14612077 A/G
- chr22 14668537 14668538 C
- chr22 14668703 14668704 A/T
- chr22 14668775 14668776 G
- chr22 14680074 14680075 A/T
- etc.
-
-- output file, showing conservation scores for primates::
-
- chr22 14440426 14440427 C/T 0.509
- chr22 14494851 14494852 A/G 0.427
- chr22 14494911 14494912 A/T NA
- chr22 14550435 14550436 A/G NA
- chr22 14611956 14611957 G/T -2.142
- chr22 14612076 14612077 A/G 0.369
- chr22 14668537 14668538 C 0.419
- chr22 14668703 14668704 A/T -1.462
- chr22 14668775 14668776 G 0.470
- chr22 14680074 14680075 A/T 0.303
- etc.
-
- "NA" means that the phyloP score was not available.
-
------
-
-**Reference**
-
-Siepel A, Pollard KS, Haussler D. (2006)
-New methods for detecting lineage-specific selection.
-In Proceedings of the 10th International Conference on Research in Computational
-Molecular Biology (RECOMB 2006), pp. 190-205.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps.pl
--- a/tools/evolution/codingSnps.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,528 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-#########################################################################
-# codingSnps.pl
-# This takes a bed file with the names being / separated nts
-# and a gene bed file with cds start and stop.
-# It then checks for changes in coding regions, reporting
-# those that cause a frameshift or substitution in the amino acid.
-#########################################################################
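
The frameshift test applied later in the script reduces to modular arithmetic: an indel shifts the reading frame exactly when the number of inserted or deleted bases is not a multiple of three. The script implements this by counting '-' characters in the '/'-separated allele string; a minimal sketch of that rule, with an illustrative function name::

    def is_frameshift(allele):
        # Mirrors the script's test: count '-' characters in an allele string
        # such as '---/ACT' or 'A/-'; a multiple of three is an in-frame indel.
        return allele.count('-') % 3 != 0

    print(is_frameshift('---/ACT'))  # False: three bases deleted, frame kept
    print(is_frameshift('A/-'))      # True: single-base deletion shifts frame
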
-
-my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
-if (!@ARGV or scalar @ARGV < 3) {
- print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n";
- exit;
-}
-my $uniq = 0; #flag for whether want uniq positions
-my $syn = 0; #flag for if want synonymous changes rather than non-syn
-my $keep = 0; #keep old columns and append new ones
-my $snpFile = shift @ARGV;
-my $geneFile = shift @ARGV;
-my $nibDir = shift @ARGV; #2bit or nib, depending on flag above
-if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
-my $col0 = 0; #bed like columns in default positions
-my $col1 = 1;
-my $col2 = 2;
-my $col3 = 3;
-#column positions 1 based coming in (for Galaxy)
-foreach (@ARGV) {
- if (/chr=(\d+)/) { $col0 = $1 -1; }
- elsif (/start=(\d+)/) { $col1 = $1 -1; }
- elsif (/end=(\d+)/) { $col2 = $1 -1; }
- elsif (/snp=(\d+)/) { $col3 = $1 -1; }
- elsif (/keepColumns=1/) { $keep = 1; }
-}
-if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
- print STDERR "ERROR column numbers are given with origin 1\n";
- exit 1;
-}
-my @genes; #bed lines for genes, sorted by chrom and start
-my %chrSt; #index in array where each chrom starts
-my %codon; #hash of codon amino acid conversions
-my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
-my $ignoreN = 1; #skip N
-
-my %amb = (
-"R" => "A/G",
-"Y" => "C/T",
-"S" => "C/G",
-"W" => "A/T",
-"K" => "G/T",
-"M" => "A/C",
-"B" => "C/G/T",
-"D" => "A/G/T",
-"H" => "A/C/T",
-"V" => "A/C/G",
-"N" => "A/C/G/T"
-);
-fill_codon();
-open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
- or die "Couldn't open and sort $geneFile, $!\n";
-my $i = 0;
-while(<FH>) {
- chomp;
- if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
- push(@genes, "$_");
- my @f = split(/\t/);
- if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
- $i++;
-}
-close FH or die "Couldn't close $geneFile, $!\n";
-
-if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
-
-#open snps sorted as well
-my $s1 = $col0 + 1; #sort order is origin 1
-my $s2 = $col1 + 1;
-open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
- or die "Couldn't open and sort $snpFile, $!\n";
-$i = 0;
-my @g; #one genes fields, should be used repeatedly
-my %done;
-while(<FH>) {
- chomp;
- if (/^\s*#/) { next; } #comment
- my @s = split(/\t/); #SNP fields
- if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
- my $size = $#s;
- if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
- print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
- exit 1;
- }
- if ($s[$col1] =~ /\D/) {
- print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
- exit 1;
- }
- if ($s[$col2] =~ /\D/) {
- print STDERR "ERROR the end point must be an integer not $s[$col2]\n";
- exit 1;
- }
- if ($s[$col3] eq 'N' && $ignoreN) { next; }
- if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
- if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
- $i = $chrSt{$s[$col0]};
- @g = split(/\t/, $genes[$i]);
- if (scalar @g < 12) {
- print STDERR "ERROR the gene file must be the whole genes in BED format\n";
- exit 1;
- }
- }elsif (!@g) {
- next; #no gene for this chrom
- }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
- $i = $chrSt{$s[$col0]};
- @g = split(/\t/, $genes[$i]);
- }elsif ($s[$col0] ne
$g[0]) { - next; #no gene for this chrom - }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { - next; #before any genes - }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { - next; #after all genes on chr - }else { - while ($s[$col1] > $g[2] && $i < $#genes) { - $i++; - @g = split(/\t/, $genes[$i]); - if ($s[$col0] ne $g[0]) { last; } #end of gene - } - if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { - next; #no overlap with genes - } - } - - processSnp(\@s, \@g); - if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } - - my $k = $i + 1; #check for more genes without losing data of first - if ($k <= $#genes) { - my @g2 = split(/\t/, $genes[$k]); - while (@g2 && $k <= $#genes) { - @g2 = split(/\t/, $genes[$k]); - if ($s[$col0] ne $g2[0]) { - undef @g2; - last; #not same chrom - }else { - while ($s[$col1] > $g2[2] && $k <= $#genes) { - $k++; - @g2 = split(/\t/, $genes[$k]); - if ($s[$col0] ne $g2[0]) { last; } #end of chrom - } - if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { - undef @g2; - last; #no overlap with more genes - } - processSnp(\@s, \@g2); - if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } - } - $k++; - } - } -} -close FH or die "Couldn't close $snpFile, $!\n"; - -exit; - -######################################################################## -sub processSnp { - my $sref = shift; - my $gref = shift; - #overlaps gene, but maybe not coding seq - #inside cds - if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { - return; #outside of coding - } - #now check exon - my $i = 0; - my @st = split(/,/, $gref->[11]); - my @size = split(/,/, $gref->[10]); - if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } - my @pos; - my $in = 0; - for($i = 0; $i < $gref->[9]; $i++) { - my $sta = $gref->[1] + $st[$i] + 1; #1 based position - my $end = $sta + $size[$i] - 1; # - if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes - if ($end < $gref->[6]) { next; } #utr only - if ($sta > $gref->[7]) { next; } #utr only - #shorten to coding only - if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } - if ($end > $gref->[7]) { $end = $gref->[7]; } - if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } - elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } - push(@pos, ($sta .. $end)); #add exon worth of positions - } - #@pos has coding positions for whole gene (chr coors), - #and $in has whether we need to continue - if (!$in) { return; } #not in coding exon - if ((scalar @pos) % 3 != 0) { return; } #partial gene? 
not even codons - if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or - $sref->[$col3] =~ /^-+$/) { #indel or del - my $copy = $sref->[$col3]; - my $c = ($copy =~ tr/-//); - if ($c % 3 == 0) { return; } #not frameshift - #handle bed4 or any interval file - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\tframeshift\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), "\t$gref->[3]\tframeshift\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - return; - }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion - my $copy = $sref->[$col3]; - my $c = ($copy =~ tr/\[ACTG]+//); - if ($c % 3 == 0) { return; } #not frameshift - #handle bed4 or any interval file - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\tframeshift\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), "\t$gref->[3]\tframeshift\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - return; - }elsif ($sref->[$col3] =~ /-/) { #indel and sub? - return; #skip - } - #check for amino acid substitutions - my $s = $sref->[$col1] + 1; - my $e = $sref->[$col2]; - my $len = $sref->[$col2] - $sref->[$col1]; - if ($gref->[5] eq '-') { - @pos = reverse(@pos); - my $t = $s; - $s = $e; - $e = $t; - } - $i = 0; - my $found = 0; - foreach (@pos) { - if ($s == $_) { - $found = 1; - last; - } - $i++; - } - if ($found) { - my $fs = $i; #keep original start index - #have index where substitution starts - my $cp = $i % 3; - $i -= $cp; #i is now first position in codon - my $cdNum = int($i / 3) + 1; - my $ls = $i; - if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } - if (!@pos) { die "ERROR not defined array pos\n"; } - if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } - if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } - while ($ls <= $#pos && $pos[$ls] ne $e) { - $ls++; - } - my $i2 = $ls + (2 - ($ls % 3)); - if ($i2 > $#pos) { return; } #not a full codon, partial gene? - - if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } - my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); - if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. 
$pos[$i2]\n"; } - my @vars = split(/\//, $sref->[$col3]); - if ($gref->[5] eq '-') { #complement oldnts and revcomp vars - $oldnts = compl($oldnts); - if (!$oldnts) { return; } #skip this one - $oldnts = join('', (reverse(split(/ */, $oldnts)))); - foreach (@vars) { - $_ = reverse(split(/ */)); #needed for indels - $_ = compl($_); - } - } - my $r = $fs - $i; #difference in old indexes gives new index - my @newnts; - my $changed = ''; - foreach my $v (@vars) { - if (!$v or length($v) != 1) { return; } #only simple changes - my @new = split(/ */, $oldnts); - $changed = splice(@new, $r, $len, split(/ */, $v)); - #should only change single nt - push(@newnts, join("", @new)); - } - #now compute amino acids - my $oldaa = getaa($oldnts); - my @newaa; - my $change = 0; #flag for if there is a change - foreach my $v (@newnts) { - my $t = getaa($v); - if ($t ne $oldaa) { $change = 1; } - push(@newaa, $t); - } - if (!$change && $syn) { - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; - }else { - my @s = @{$sref}; - print join("\t", @s), - "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; - } - return; - }elsif ($syn) { return; } #only want synonymous changes - if (!$change) { return; } #no change in amino acids - if (!$keep) { - print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; - if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref - if (!$changed) { return; } #skip this one - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; - }else { - my @s = @{$sref}; - print join("\t", @s); - if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref - if (!$changed) { return; } #skip this one - print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; - } - $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; - } -} - -sub getnts { - my $chr = shift; - my @pos = @_; #list of positions not necessarily in order - #list may be reversed or have gaps(introns), at least 3 bps - my $seq = ''; - if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } - if ($pos[0] < $pos[1]) { #not reversed - my $s = $pos[0]; - for(my $i = 1; $i <= $#pos; $i++) { - if ($pos[$i] == $pos[$i-1] + 1) { next; } - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); - } - $s = $pos[$i]; - } - if (length $seq != scalar @pos) { #still need to fetch seq - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); - } - } - }else { #reversed - my $s = $pos[$#pos]; - for(my $i = $#pos -1; $i >= 0; $i--) { - if ($pos[$i] == $pos[$i+1] + 1) { next; } - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); - } - $s = $pos[$i]; - } - if (length $seq != scalar @pos) { #still need to fetch seq - if ($seqFlag eq '2bit') { - $seq .= fetchSeq2bit($chr, $s, $pos[0]); - }else { - $seq .= fetchSeqNib($chr, $s, $pos[0]); - } - } - } -} - -sub fetchSeq2bit { - my $chr = shift; - my $st = shift; - my $end = shift; - my $strand = '+'; - $st--; #change to UCSC numbering - open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or - die "Couldn't run twoBitToFa, $!\n"; - my $seq = ''; - while () { - chomp; - if (/^>/) { next; } #header - $seq .= uc($_); - } - close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; 
- return $seq; -} - -sub fetchSeqNib { - my $chr = shift; - my $st = shift; - my $end = shift; - my $strand = '+'; - $st--; #change to UCSC numbering - open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; - my $seq = ''; - while () { - chomp; - if (/^>/) { next; } #header - $seq .= $_; - } - close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; - return $seq; -} - -sub compl { - my $nts = shift; - my $comp = ''; - if (!$nts) { die "ERROR called compl with nts undefined"; } - foreach my $n (split(/ */, $nts)) { - if ($n eq 'A') { $comp .= 'T'; } - elsif ($n eq 'T') { $comp .= 'A'; } - elsif ($n eq 'C') { $comp .= 'G'; } - elsif ($n eq 'G') { $comp .= 'C'; } - elsif ($n eq 'N') { $comp .= 'N'; } - elsif ($n eq '-') { $comp .= '-'; } #deletion - else { $comp = undef; } - } - return $comp; -} - -sub getaa { - my $nts = shift; #in multiples of 3 - my $aa = ''; - my @n = split(/ */, $nts); - while (@n) { - my @t = splice(@n, 0, 3); - my $n = uc(join("", @t)); - if (!exists $codon{$n}) { $aa .= 'N'; next; } - $aa .= $codon{$n}; - } - return $aa; -} - -sub fill_codon { -$codon{GCA} = 'Ala'; -$codon{GCC} = 'Ala'; -$codon{GCG} = 'Ala'; -$codon{GCT} = 'Ala'; -$codon{CGG} = 'Arg'; -$codon{CGT} = 'Arg'; -$codon{CGC} = 'Arg'; -$codon{AGA} = 'Arg'; -$codon{AGG} = 'Arg'; -$codon{CGA} = 'Arg'; -$codon{AAC} = 'Asn'; -$codon{AAT} = 'Asn'; -$codon{GAC} = 'Asp'; -$codon{GAT} = 'Asp'; -$codon{TGC} = 'Cys'; -$codon{TGT} = 'Cys'; -$codon{CAG} = 'Gln'; -$codon{CAA} = 'Gln'; -$codon{GAA} = 'Glu'; -$codon{GAG} = 'Glu'; -$codon{GGG} = 'Gly'; -$codon{GGA} = 'Gly'; -$codon{GGC} = 'Gly'; -$codon{GGT} = 'Gly'; -$codon{CAC} = 'His'; -$codon{CAT} = 'His'; -$codon{ATA} = 'Ile'; -$codon{ATT} = 'Ile'; -$codon{ATC} = 'Ile'; -$codon{CTA} = 'Leu'; -$codon{CTC} = 'Leu'; -$codon{CTG} = 'Leu'; -$codon{CTT} = 'Leu'; -$codon{TTG} = 'Leu'; -$codon{TTA} = 'Leu'; -$codon{AAA} = 'Lys'; -$codon{AAG} = 'Lys'; -$codon{ATG} = 'Met'; -$codon{TTC} = 'Phe'; -$codon{TTT} = 'Phe'; -$codon{CCT} = 'Pro'; -$codon{CCA} = 'Pro'; -$codon{CCC} = 'Pro'; -$codon{CCG} = 'Pro'; -$codon{TCA} = 'Ser'; -$codon{AGC} = 'Ser'; -$codon{AGT} = 'Ser'; -$codon{TCC} = 'Ser'; -$codon{TCT} = 'Ser'; -$codon{TCG} = 'Ser'; -$codon{TGA} = 'Stop'; -$codon{TAG} = 'Stop'; -$codon{TAA} = 'Stop'; -$codon{ACT} = 'Thr'; -$codon{ACA} = 'Thr'; -$codon{ACC} = 'Thr'; -$codon{ACG} = 'Thr'; -$codon{TGG} = 'Trp'; -$codon{TAT} = 'Tyr'; -$codon{TAC} = 'Tyr'; -$codon{GTC} = 'Val'; -$codon{GTA} = 'Val'; -$codon{GTG} = 'Val'; -$codon{GTT} = 'Val'; -} - -sub getGalaxyInfo { - my $build; - my $locFile; - foreach (@ARGV) { - if (/build=(.*)/) { $build = $1; } - elsif (/loc=(.*)/) { $locFile = $1; } - } - if (!$build or !$locFile) { - print STDERR "ERROR missing build or locfile for Galaxy input\n"; - exit 1; - } - # read $locFile to get $nibDir (ignoring commets) - open(LF, "< $locFile") || die "open($locFile): $!\n"; - while() { - s/#.*$//; - s/(?:^\s+|\s+$)//g; - next if (/^$/); - - my @t = split(/\t/); - if ($t[0] eq $build) { $nibDir = $t[1]; } - } - close(LF); - if ($nibDir eq 'Galaxy') { - print STDERR "Failed to find sequence directory in locfile $locFile\n"; - } - $nibDir .= "/$build.2bit"; #we want full path and filename -} - diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps.xml --- a/tools/evolution/codingSnps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - amino-acid changes caused by a set of SNPs - - - codingSnps.pl $input1 $input2 Galaxy 
build=${input1.metadata.dbkey} loc=${GALAXY_DATA_INDEX_DIR}/codingSnps.loc chr=${input1.metadata.chromCol} start=${input1.metadata.startCol} end=${input1.metadata.endCol} snp=$col1 > $out_file1 - - - - - - - - - - - - - - - - - - - - cat - sort - ucsc_tools - - - - - - - - - - - - - - - - - - -.. class:: infomark - -The build must be defined for the input files and must be the same for both files. -Use the pencil icon to add the build to the files if necessary. - ------ - -**Dataset formats** - -The SNP dataset is in interval_ format, with a column of SNPs as described below. -The gene dataset is in BED_ format with 12 columns. The output dataset is also interval. -(`Dataset missing?`_) - -.. _interval: ./static/formatHelp.html#interval -.. _BED: ./static/formatHelp.html#bed -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool identifies which SNPs create amino-acid changes in the specified -coding regions. The first input file contains the SNPs and must be an interval file. -It needs the chromosome, start, and end position as well as the SNP. The -SNP can be given using ambiguous-nucleotide symbols or a list of two to four -alleles -separated by '/'. Any other columns in the first input file will not be -used but will be kept for the output. The second input file contains the genes -to be used for defining the coding regions. This file must be a BED file with -the first 12 columns standard BED columns. The output is the same as the -first input file with -several columns added: the name field from the line of the gene input file -used, the amino acids, the codon number, and the reference nucleotide that -changed in the amino acid. -The amino acids are listed with the reference amino acid first, then a colon, -and then the amino acids for the alleles. If a SNP is not in a coding region -or is synonymous then it is not included in the output file. - ------ - -**Example** - -- first input file, with SNPs:: - - chr22 15660821 15660822 A/G - chr22 15825725 15825726 G/T - chr22 15827035 15827036 G - chr22 15827135 15827136 C/G - chr22 15830928 15830929 A/G - chr22 15830951 15830952 G - chr22 15830955 15830956 C/T - chr22 15848885 15848886 C/T - chr22 15849048 15849049 A/C - chr22 15919711 15919712 A/G - etc. - - or, indicating polymorphisms using ambiguous-nucleotide symbols:: - - chr22 15660821 15660822 R - chr22 15825725 15825726 K - chr22 15827035 15827036 G - chr22 15827135 15827136 S - chr22 15830928 15830929 R - chr22 15830951 15830952 G - chr22 15830955 15830956 Y - chr22 15848885 15848886 Y - chr22 15849048 15849049 M - chr22 15919711 15919712 R - etc. - -- second input file, with UCSC annotations for human genes:: - - chr22 15688363 15690225 uc010gqr.1 0 + 15688363 15688363 0 2 587,794, 0,1068, - chr22 15822826 15869112 uc002zlw.1 0 - 15823622 15869004 0 10 940,105,97,91,265,86,251,208,304,282, 0,1788,2829,3241,4163,6361,8006,26023,29936,46004, - chr22 15826991 15869112 uc010gqs.1 0 - 15829218 15869004 0 5 1380,86,157,304,282, 0,2196,21858,25771,41839, - chr22 15897459 15919682 uc002zlx.1 0 + 15897459 15897459 0 4 775,128,103,1720, 0,8303,10754,20503, - chr22 15945848 15971389 uc002zly.1 0 + 15945981 15970710 0 13 271,25,147,113,127,48,164,84,85,12,102,42,2193, 0,12103,12838,13816,15396,17037,17180,18535,19767,20632,20894,22768,23348, - etc. 
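
Before the output example, it helps to see how the script reduces a BED12 record like those above to coding positions: each exon block (blockSizes/blockStarts) is clipped to the CDS bounds (thickStart/thickEnd), and blocks left empty are UTR-only and dropped. A sketch using 0-based half-open coordinates with illustrative names; the script itself works in 1-based positions::

    def coding_spans(chrom_start, thick_start, thick_end, block_sizes, block_starts):
        # Clip each exon block of a BED12 record to the CDS bounds
        # [thick_start, thick_end); blocks clipped to nothing are UTR-only.
        spans = []
        for size, rel in zip(block_sizes, block_starts):
            s = max(chrom_start + rel, thick_start)
            e = min(chrom_start + rel + size, thick_end)
            if s < e:
                spans.append((s, e))
        return spans

    # Two exon blocks; the first is partly 5' UTR, the second partly 3' UTR.
    print(coding_spans(100, 150, 800, [200, 300], [0, 500]))  # [(150, 300), (600, 800)]
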
- -- output file, showing non-synonymous substitutions in coding regions:: - - chr22 15825725 15825726 G/T uc002zlw.1 Gln:Pro/Gln 469 T - chr22 15827035 15827036 G uc002zlw.1 Glu:Asp 414 C - chr22 15827135 15827136 C/G uc002zlw.1 Gly:Gly/Ala 381 C - chr22 15830928 15830929 A/G uc002zlw.1 Ala:Ser/Pro 281 C - chr22 15830951 15830952 G uc002zlw.1 Leu:Pro 273 A - chr22 15830955 15830956 C/T uc002zlw.1 Ser:Gly/Ser 272 T - chr22 15848885 15848886 C/T uc002zlw.1 Ser:Trp/Stop 217 G - chr22 15848885 15848886 C/T uc010gqs.1 Ser:Trp/Stop 200 G - chr22 15849048 15849049 A/C uc002zlw.1 Gly:Stop/Gly 163 C - etc. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/codingSnps_filter.py --- a/tools/evolution/codingSnps_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -#!/usr/bin/env python - -# runs after the job (and after the default post-filter) -import os -from galaxy import eggs -from galaxy import jobs -from galaxy.tools.parameters import DataToolParameter -# Older py compatibility -try: - set() -except: - from sets import Set as set - -def validate_input( trans, error_map, param_values, page_param_map ): - dbkeys = set() - data_param_names = set() - data_params = 0 - for name, param in page_param_map.iteritems(): - if isinstance( param, DataToolParameter ): - # for each dataset parameter - if param_values.get(name, None) != None: - dbkeys.add( param_values[name].dbkey ) - data_params += 1 - # check meta data - try: - param = param_values[name] - startCol = int( param.metadata.startCol ) - endCol = int( param.metadata.endCol ) - chromCol = int( param.metadata.chromCol ) - if param.metadata.strandCol is not None: - strandCol = int ( param.metadata.strandCol ) - else: - strandCol = 0 - except: - error_msg = "The attributes of this dataset are not properly set. " + \ - "Click the pencil icon in the history item to set the chrom, start, end and strand columns." - error_map[name] = error_msg - data_param_names.add( name ) - if len( dbkeys ) > 1: - for name in data_param_names: - error_map[name] = "All datasets must belong to same genomic build, " \ - "this dataset is linked to build '%s'" % param_values[name].dbkey - if data_params != len(data_param_names): - for name in data_param_names: - error_map[name] = "A dataset of the appropriate type is required" diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/mutate_snp_codon.py --- a/tools/evolution/mutate_snp_codon.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -#!/usr/bin/env python -""" -Script to mutate SNP codons. 
-Dan Blankenberg -""" - -import sys, string - -def strandify( fields, column ): - strand = '+' - if column >= 0 and column < len( fields ): - strand = fields[ column ] - if strand not in [ '+', '-' ]: - strand = '+' - return strand - -def main(): - # parse command line - input_file = sys.argv[1] - out = open( sys.argv[2], 'wb+' ) - codon_chrom_col = int( sys.argv[3] ) - 1 - codon_start_col = int( sys.argv[4] ) - 1 - codon_end_col = int( sys.argv[5] ) - 1 - codon_strand_col = int( sys.argv[6] ) - 1 - codon_seq_col = int( sys.argv[7] ) - 1 - - snp_chrom_col = int( sys.argv[8] ) - 1 - snp_start_col = int( sys.argv[9] ) - 1 - snp_end_col = int( sys.argv[10] ) - 1 - snp_strand_col = int( sys.argv[11] ) - 1 - snp_observed_col = int( sys.argv[12] ) - 1 - - max_field_index = max( codon_chrom_col, codon_start_col, codon_end_col, codon_strand_col, codon_seq_col, snp_chrom_col, snp_start_col, snp_end_col, snp_strand_col, snp_observed_col ) - - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - skipped_lines = 0 - errors = {} - for name, message in [ ('max_field_index','not enough fields'), ( 'codon_len', 'codon length must be 3' ), ( 'codon_seq', 'codon sequence must have length 3' ), ( 'snp_len', 'SNP length must be 3' ), ( 'snp_observed', 'SNP observed values must have length 3' ), ( 'empty_comment', 'empty or comment'), ( 'no_overlap', 'codon and SNP do not overlap' ) ]: - errors[ name ] = { 'count':0, 'message':message } - line_count = 0 - for line_count, line in enumerate( open( input_file ) ): - line = line.rstrip( '\n\r' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - if max_field_index >= len( fields ): - skipped_lines += 1 - errors[ 'max_field_index' ]['count'] += 1 - continue - - #read codon info - codon_chrom = fields[codon_chrom_col] - codon_start = int( fields[codon_start_col] ) - codon_end = int( fields[codon_end_col] ) - if codon_end - codon_start != 3: - #codons must be length 3 - skipped_lines += 1 - errors[ 'codon_len' ]['count'] += 1 - continue - codon_strand = strandify( fields, codon_strand_col ) - codon_seq = fields[codon_seq_col].upper() - if len( codon_seq ) != 3: - #codon sequence must have length 3 - skipped_lines += 1 - errors[ 'codon_seq' ]['count'] += 1 - continue - - #read snp info - snp_chrom = fields[snp_chrom_col] - snp_start = int( fields[snp_start_col] ) - snp_end = int( fields[snp_end_col] ) - if snp_end - snp_start != 1: - #snps must be length 1 - skipped_lines += 1 - errors[ 'snp_len' ]['count'] += 1 - continue - snp_strand = strandify( fields, snp_strand_col ) - snp_observed = fields[snp_observed_col].split( '/' ) - snp_observed = [ observed for observed in snp_observed if len( observed ) == 1 ] - if not snp_observed: - #sequence replacements must be length 1 - skipped_lines += 1 - errors[ 'snp_observed' ]['count'] += 1 - continue - - #Determine index of replacement for observed values into codon - offset = snp_start - codon_start - #Extract DNA on neg strand codons will have positions reversed relative to interval positions; i.e. 
position 0 == position 2 - if codon_strand == '-': - offset = 2 - offset - if offset < 0 or offset > 2: #assert offset >= 0 and offset <= 2, ValueError( 'Impossible offset determined: %s' % offset ) - #codon and snp do not overlap - skipped_lines += 1 - errors[ 'no_overlap' ]['count'] += 1 - continue - - for observed in snp_observed: - if codon_strand != snp_strand: - #if our SNP is on a different strand than our codon, take complement of provided observed SNP base - observed = observed.translate( DNA_COMP ) - snp_codon = [ char for char in codon_seq ] - snp_codon[offset] = observed.upper() - snp_codon = ''.join( snp_codon ) - - if codon_seq != snp_codon: #only output when we actually have a different codon - out.write( "%s\t%s\n" % ( line, snp_codon ) ) - else: - skipped_lines += 1 - errors[ 'empty_comment' ]['count'] += 1 - if skipped_lines: - print "Skipped %i (%4.2f%%) of %i lines; reasons: %s" % ( skipped_lines, ( float( skipped_lines )/float( line_count ) ) * 100, line_count, ', '.join( [ "%s (%i)" % ( error['message'], error['count'] ) for error in errors.itervalues() if error['count'] ] ) ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/evolution/mutate_snp_codon.xml --- a/tools/evolution/mutate_snp_codon.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - with SNPs - mutate_snp_codon.py $input1 $output1 ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} ${input1.metadata.strandCol} $codon_seq_col $snp_chrom_col $snp_start_col $snp_end_col $snp_strand_col $snp_observed_col - - - - - - - - - - - - - - - - - - - - - - - - - -This tool takes an interval file as input. This input should contain a set of codon locations and corresponding DNA sequence (such as from the *Extract Genomic DNA* tool) joined to SNP locations with observed values (such as *all fields from selected table* from the snp130 table of hg18 at the UCSC Table browser). This interval file should have the metadata (chromosome, start, end, strand) set for the columns containing the locations of the codons. The user needs to specify the columns containing the sequence for the codon as well as the genomic positions and observed values (values should be split by '/') for the SNP data as tool input; SNPs positions and sequence substitutes must have a length of exactly 1. Only genomic intervals which yield a different sequence string are output. All sequence characters are converted to uppercase during processing. 
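
The core of the offset arithmetic described above, as a minimal sketch: the SNP's offset within the codon is mirrored for '-' strand codons, and an allele reported on the opposite strand is complemented before substitution. mutate_codon is an illustrative name; the sample values come from the first row of the worked example that follows::

    COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    def mutate_codon(codon_seq, codon_start, codon_strand, snp_start, snp_strand, observed):
        # 0-based coordinates; on '-' strand codons the offset is mirrored
        # (position 0 <-> 2), and an allele from the other strand is
        # complemented before substitution, as in the script above.
        offset = snp_start - codon_start
        if codon_strand == '-':
            offset = 2 - offset
        if offset < 0 or offset > 2:
            return None  # SNP does not fall inside this codon
        if snp_strand != codon_strand:
            observed = COMP[observed.upper()]
        codon = list(codon_seq.upper())
        codon[offset] = observed.upper()
        return ''.join(codon)

    # First row of the worked example below: GAA with G observed at offset 1.
    print(mutate_codon('GAA', 58995, '+', 58996, '+', 'G'))  # GGA
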
- - For example, using these settings: - - * **metadata** **chromosome**, **start**, **end** and **strand** set to **1**, **2**, **3** and **6**, respectively - * **Codon Sequence column** set to **c8** - * **SNP chromosome column** set to **c17** - * **SNP start column** set to **c18** - * **SNP end column** set to **c19** - * **SNP strand column** set to **c22** - * **SNP observed column** set to **c25** - - with the following input:: - - chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 - chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 - chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 - chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 - - - will produce:: - - chr1 58995 58998 NM_001005484 0 + GAA GAA Glu GAA 1177632 28.96 0 2787607 0.422452662804 585 chr1 58996 58997 rs1638318 0 + A A A/G genomic single by-submitter 0 0 unknown exact 3 GGA - chr1 59289 59292 NM_001005484 0 + TTT TTT Phe TTT 714298 17.57 0 1538990 0.464134269878 585 chr1 59290 59291 rs71245814 0 + T T G/T genomic single unknown 0 0 unknown exact 3 TGT - chr1 59313 59316 NM_001005484 0 + AAG AAG Lys AAG 1295568 31.86 0 2289189 0.565950648898 585 chr1 59315 59316 rs2854682 0 - G G C/T genomic single by-submitter 0 0 unknown exact 3 AAA - chr1 59373 59376 NM_001005484 0 + ACA ACA Thr ACA 614523 15.11 0 2162384 0.284187729839 585 chr1 59373 59374 rs2691305 0 - A A C/T genomic single unknown 0 0 unknown exact 3 GCA - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs2531266 0 + G G C/G genomic single by-submitter 0 0 unknown exact 3 GCC - chr1 59412 59415 NM_001005484 0 + GCG GCG Ala GCG 299495 7.37 0 2820741 0.106176001271 585 chr1 59414 59415 rs55874132 0 + G G C/G genomic single unknown 0 0 coding-synon exact 1 GCC - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/extract_genomic_dna.py --- a/tools/extract/extract_genomic_dna.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,283 +0,0 @@ -#!/usr/bin/env python -""" -usage: %prog $input $out_file1 - -1, --cols=N,N,N,N: Columns for start, end, strand in input file - -d, --dbkey=N: Genome build of input file - -o, --output_format=N: the data type of the output file - -g, --GALAXY_DATA_INDEX_DIR=N: the directory containing alignseq.loc - -I, --interpret_features: if true, complete features are interpreted when input is GFF - -F, --fasta=: genomic sequences to use for extraction - -G, --gff: input and output file, when it is interval, coordinates are treated as GFF format (1-based, half-open) rather than 'traditional' 0-based, closed format. 
-""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, string, os, re, tempfile, subprocess -from bx.cookbook import doc_optparse -from bx.intervals.io import Header, Comment -import bx.seq.nib -import bx.seq.twobit -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util import gff_util - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def reverse_complement( s ): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" } - reversed_s = [] - for i in s: - reversed_s.append( complement_dna[i] ) - reversed_s.reverse() - return "".join( reversed_s ) - -def check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ): - seq_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR - seq_path = '' - for line in open( seq_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ) and line.startswith( 'seq' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - # - # Parse options, args. - # - options, args = doc_optparse.parse( __doc__ ) - try: - chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols ) - dbkey = options.dbkey - output_format = options.output_format - gff_format = options.gff - interpret_features = options.interpret_features - GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR - fasta_file = options.fasta - input_filename, output_filename = args - except: - doc_optparse.exception() - - includes_strand_col = strand_col >= 0 - strand = None - nibs = {} - twobits = {} - - # - # Set path to sequence data. - # - if fasta_file: - # Need to create 2bit file from fasta file. - try: - seq_path = tempfile.NamedTemporaryFile( dir="." ).name - cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path ) - - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running faToTwoBit. ' + str( e ) ) - else: - seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR ) - if not os.path.exists( seq_path ): - # If this occurs, we need to fix the metadata validator. - stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey ) - - # - # Fetch sequences. - # - - # Get feature's line(s). - def get_lines( feature ): - if isinstance( feature, gff_util.GFFFeature ): - return feature.lines() - else: - return [ feature.rstrip( '\r\n' ) ] - - skipped_lines = 0 - first_invalid_line = 0 - invalid_lines = [] - fout = open( output_filename, "w" ) - warnings = [] - warning = '' - twobitfile = None - file_iterator = open( input_filename ) - if gff_format and interpret_features: - file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False ) - line_count = 1 - for feature in file_iterator: - # Ignore comments, headers. 
- if isinstance( feature, ( Header, Comment ) ): - line_count += 1 - continue - - if gff_format and interpret_features: - # Processing features. - gff_util.convert_gff_coords_to_bed( feature ) - chrom = feature.chrom - start = feature.start - end = feature.end - strand = feature.strand - else: - # Processing lines, either interval or GFF format. - line = feature.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - try: - chrom = fields[chrom_col] - start = int( fields[start_col] ) - end = int( fields[end_col] ) - if gff_format: - start, end = gff_util.convert_gff_coords_to_bed( [start, end] ) - if includes_strand_col: - strand = fields[strand_col] - except: - warning = "Invalid chrom, start or end column values. " - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if start > end: - warning = "Invalid interval, start '%d' > end '%d'. " % ( start, end ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - - if strand not in ['+', '-']: - strand = '+' - sequence = '' - else: - continue - - # Open sequence file and get sequence for feature/interval. - if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ): - # TODO: improve support for GFF-nib interaction. - if chrom in nibs: - nib = nibs[chrom] - else: - nibs[chrom] = nib = bx.seq.nib.NibFile( file( "%s/%s.nib" % ( seq_path, chrom ) ) ) - try: - sequence = nib.get( start, end-start ) - except Exception, e: - warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - elif seq_path and os.path.isfile( seq_path ): - if not(twobitfile): - twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) ) - try: - if options.gff and interpret_features: - # Create sequence from intervals within a feature. - sequence = '' - for interval in feature.intervals: - sequence += twobitfile[interval.chrom][interval.start:interval.end] - else: - sequence = twobitfile[chrom][start:end] - except: - warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " %( start, end-start, chrom ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - else: - warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if sequence == '': - warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. 
" \ - % ( chrom, start, end, dbkey ) - warnings.append( warning ) - if not invalid_lines: - invalid_lines = get_lines( feature ) - first_invalid_line = line_count - skipped_lines += len( invalid_lines ) - continue - if includes_strand_col and strand == "-": - sequence = reverse_complement( sequence ) - - if output_format == "fasta" : - l = len( sequence ) - c = 0 - if gff_format: - start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] ) - fields = [dbkey, str( chrom ), str( start ), str( end ), strand] - meta_data = "_".join( fields ) - fout.write( ">%s\n" % meta_data ) - while c < l: - b = min( c + 50, l ) - fout.write( "%s\n" % str( sequence[c:b] ) ) - c = b - else: # output_format == "interval" - if gff_format and interpret_features: - # TODO: need better GFF Reader to capture all information needed - # to produce this line. - meta_data = "\t".join( - [feature.chrom, "galaxy_extract_genomic_dna", "interval", \ - str( feature.start ), str( feature.end ), feature.score, feature.strand, - ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] ) - else: - meta_data = "\t".join( fields ) - if gff_format: - format_str = "%s seq \"%s\";\n" - else: - format_str = "%s\t%s\n" - fout.write( format_str % ( meta_data, str( sequence ) ) ) - - # Update line count. - if isinstance( feature, gff_util.GFFFeature ): - line_count += len( feature.intervals ) - else: - line_count += 1 - - fout.close() - - if warnings: - warn_msg = "%d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - # Error message includes up to the first 10 skipped lines. - print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/extract/extract_genomic_dna.xml --- a/tools/extract/extract_genomic_dna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,174 +0,0 @@ - - using coordinates from assembled/unassembled genomes - - extract_genomic_dna.py $input $out_file1 -o $out_format -d $dbkey - - #if str( $interpret_features ) == "yes": - -I - #end if - - ## Columns to use in input file. - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff - #else: - -1 ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} - #end if - - #if $seq_source.index_source == "cached": - ## Genomic data from cache. - -g ${GALAXY_DATA_INDEX_DIR} - #else: - ## Genomic data from history. - -F $seq_source.ref_file - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - faToTwoBit - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This tool requires interval or gff (special tabular formatted data). If your data is not TAB delimited, first use *Text Manipulation->Convert*. - -.. class:: warningmark - -Make sure that the genome build is specified for the dataset from which you are extracting sequences (click the pencil icon in the history item if it is not specified). - -.. class:: warningmark - -All of the following will cause a line from the input dataset to be skipped and a warning generated. The number of warnings and skipped lines is documented in the resulting history item. - - Any lines that do not contain at least 3 columns, a chromosome and numerical start and end coordinates. 
- - Sequences that fall outside of the range of a line's start and end coordinates. - - Chromosome, start or end coordinates that are invalid for the specified build. - - Any lines whose data columns are not separated by a **TAB** character ( other white-space characters are invalid ). - -.. class:: infomark - - **Extract genomic DNA using coordinates from ASSEMBLED genomes and UNassembled genomes** previously were achieved by two separate tools. - ------ - -**What it does** - -This tool uses coordinate, strand, and build information to fetch genomic DNAs in FASTA or interval format. - -If strand is not defined, the default value is "+". - ------ - -**Example** - -If the input dataset is:: - - chr7 127475281 127475310 NM_000230 0 + - chr7 127485994 127486166 NM_000230 0 + - chr7 127486011 127486166 D49487 0 + - -Extracting sequences with **FASTA** output data type returns:: - - >hg17_chr7_127475281_127475310_+ - GTAGGAATCGCAGCGCCAGCGGTTGCAAG - >hg17_chr7_127485994_127486166_+ - GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCG - GATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATC - CAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAG - GATCAATGACATTTCACACACG - >hg17_chr7_127486011_127486166_+ - TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGG - CCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGA - CACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCAC - ACACG - -Extracting sequences with **Interval** output data type returns:: - - chr7 127475281 127475310 NM_000230 0 + GTAGGAATCGCAGCGCCAGCGGTTGCAAG - chr7 127485994 127486166 NM_000230 0 + GCCCAAGAAGCCCATCCTGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG - chr7 127486011 127486166 D49487 0 + TGGGAAGGAAAATGCATTGGGGAACCCTGTGCGGATTCTTGTGGCTTTGGCCCTATCTTTTCTATGTCCAAGCTGTGCCCATCCAAAAAGTCCAAGATGACACCAAAACCCTCATCAAGACAATTGTCACCAGGATCAATGACATTTCACACACG - - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/liftOver_wrapper.py --- a/tools/extract/liftOver_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Converts coordinates from one build/assembly to another using liftOver binary and mapping files downloaded from UCSC. -""" - -import os, string, subprocess, sys -import tempfile -import re - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def safe_bed_file(infile): - """Make a BED file with track and browser lines ready for liftOver. - - liftOver will fail with track or browser lines. We can make it happy - by converting these to comments. 
See: - - https://lists.soe.ucsc.edu/pipermail/genome/2007-May/013561.html - """ - fix_pat = re.compile("^(track|browser)") - (fd, fname) = tempfile.mkstemp() - in_handle = open(infile) - out_handle = open(fname, "w") - for line in in_handle: - if fix_pat.match(line): - line = "#" + line - out_handle.write(line) - in_handle.close() - out_handle.close() - return fname - -if len( sys.argv ) < 9: - stop_err( "USAGE: prog input out_file1 out_file2 input_dbkey output_dbkey infile_type minMatch multiple " ) - -infile = sys.argv[1] -outfile1 = sys.argv[2] -outfile2 = sys.argv[3] -in_dbkey = sys.argv[4] -mapfilepath = sys.argv[5] -infile_type = sys.argv[6] -gff_option = "" -if infile_type == "gff": - gff_option = "-gff " -minMatch = sys.argv[7] -multiple = int(sys.argv[8]) -multiple_option = "" -if multiple: - minChainT = sys.argv[9] - minChainQ = sys.argv[10] - minSizeQ = sys.argv[11] - multiple_option = " -multiple -minChainT=%s -minChainQ=%s -minSizeQ=%s " %(minChainT,minChainQ,minSizeQ) - -try: - assert float(minMatch) -except: - minMatch = 0.1 -#ensure dbkey is set -if in_dbkey == "?": - stop_err( "Input dataset genome build unspecified, click the pencil icon in the history item to specify it." ) - -if not os.path.isfile( mapfilepath ): - stop_err( "%s mapping is not currently available." % ( mapfilepath.split('/')[-1].split('.')[0] ) ) - -safe_infile = safe_bed_file(infile) -cmd_line = "liftOver " + gff_option + "-minMatch=" + str(minMatch) + multiple_option + " " + safe_infile + " " + mapfilepath + " " + outfile1 + " " + outfile2 + " > /dev/null" - -try: - # have to nest try-except in try-finally to handle 2.4 - try: - proc = subprocess.Popen( args=cmd_line, shell=True, stderr=subprocess.PIPE ) - returncode = proc.wait() - stderr = proc.stderr.read() - if returncode != 0: - raise Exception, stderr - except Exception, e: - raise Exception, 'Exception caught attempting conversion: ' + str( e ) -finally: - os.remove(safe_infile) diff -r c2a356708570 -r 33c067c3ae34 tools/extract/liftOver_wrapper.xml --- a/tools/extract/liftOver_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ - - between assemblies and genomes - - liftOver_wrapper.py - $input - "$out_file1" - "$out_file2" - $dbkey - $to_dbkey - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) or isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gtf').__class__): - "gff" - #else: - "interval" - #end if - $minMatch ${multiple.choice} ${multiple.minChainT} ${multiple.minChainQ} ${multiple.minSizeQ} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -Make sure that the genome build of the input dataset is specified (click the pencil icon in the history item to set it if necessary). - -.. class:: warningmark - -This tool can work with interval, GFF, and GTF datasets. It requires the interval datasets to have chromosome in column 1, -start co-ordinate in column 2 and end co-ordinate in column 3. BED comments -and track and browser lines will be ignored, but if other non-interval lines -are present the tool will return empty output datasets. - ------ - -.. class:: infomark - -**What it does** - -This tool is based on the LiftOver utility and Chain track from `the UC Santa Cruz Genome Browser`__. 
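
In outline, the wrapper does two things: it comments out track and browser lines, which liftOver refuses to process, and then shells out to the UCSC liftOver binary. A condensed sketch of that flow; the file names in the commented usage line are placeholders, not files shipped with the tool::

    import re
    import subprocess
    import tempfile

    def liftover(bed_in, chain, mapped, unmapped, min_match=0.1):
        # liftOver rejects track/browser lines, so comment them out first.
        fix = re.compile(r'^(track|browser)')
        safe = tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False)
        with open(bed_in) as src:
            for line in src:
                safe.write('#' + line if fix.match(line) else line)
        safe.close()
        # Invocation: liftOver oldFile map.chain newFile unMapped
        subprocess.check_call(['liftOver', '-minMatch=%s' % min_match,
                               safe.name, chain, mapped, unmapped])

    # liftover('peaks_hg16.bed', 'hg16ToHg18.over.chain', 'mapped.bed', 'unmapped.bed')
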
- -It converts coordinates and annotations between assemblies and genomes. It produces 2 files, one containing all the mapped coordinates and the other containing the unmapped coordinates, if any. - - .. __: http://genome.ucsc.edu/ - ------ - -**Example** - -Converting the following hg16 intervals to hg18 intervals:: - - chrX 85170 112199 AK002185 0 + - chrX 110458 112199 AK097346 0 + - chrX 112203 121212 AK074528 0 - - -will produce the following hg18 intervals:: - - chrX 132991 160020 AK002185 0 + - chrX 158279 160020 AK097346 0 + - chrX 160024 169033 AK074528 0 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/extract/phastOdds/get_scores_galaxy.py --- a/tools/extract/phastOdds/get_scores_galaxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -""" -usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options] - -p, --perCol: standardize to lod per column -""" - -from __future__ import division - -import sys -from galaxy import eggs -from numpy import * -from tables import * - -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -from bx import intervals - -# ignore wanrnings about NumArray flavor -from warnings import filterwarnings -from tables.exceptions import FlavorWarning -filterwarnings("ignore", category=FlavorWarning) - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write(msg) - sys.exit() - -def main(): - # Parse command line - options, args = doc_optparse.parse( __doc__ ) - try: - h5_fname = args[0] - mapping_fname = args[1] - in_fname = args[2] - out_fname = args[3] - chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] ) - per_col = bool( options.perCol ) - except Exception, e: - doc_optparse.exception() - - if h5_fname == 'None.h5': - stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' ) - - # Open the h5 file - h5 = openFile( h5_fname, mode = "r" ) - # Load intervals and names for the subregions - intersecters = {} - for i, line in enumerate( file( mapping_fname ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - chr, start, end, name = line.split()[0:4] - if not intersecters.has_key( chr ): - intersecters[ chr ] = intervals.Intersecter() - intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) ) - - # Find the subregion containing each input interval - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = '' - out_file = open( out_fname, "w" ) - warnings = [] - warning = '' - for i, line in enumerate( file( in_fname ) ): - line = line.rstrip( '\r\n' ) - if line.startswith( '#' ): - if i == 0: - out_file.write( "%s\tscore\n" % line ) - else: - out_file.write( "%s\n" % line ) - fields = line.split( "\t" ) - try: - chr = fields[ chrom_col ] - start = int( fields[ start_col ] ) - end = int( fields[ end_col ] ) - except: - warning = "Invalid value for chrom, start or end column." - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - # Find matching interval - try: - matches = intersecters[ chr ].find( start, end ) - except: - warning = "'%s' is not a valid chrom value for the region. 
" %chr - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - if not len( matches ) == 1: - warning = "Interval must match exactly one target region. " - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - region = matches[0] - if not ( start >= region.start and end <= region.end ): - warning = "Interval must fall entirely within region. " - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - region_name = region.value - rel_start = start - region.start - rel_end = end - region.start - if not rel_start < rel_end: - warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end ) - warnings.append( warning ) - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - s = h5.getNode( h5.root, "scores_" + region_name ) - c = h5.getNode( h5.root, "counts_" + region_name ) - score = s[rel_end-1] - count = c[rel_end-1] - if rel_start > 0: - score -= s[rel_start-1] - count -= c[rel_start-1] - if per_col: - score /= count - fields.append( str( score ) ) - out_file.write( "%s\n" % "\t".join( fields ) ) - # Close the file handle - h5.close() - out_file.close() - - if warnings: - warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings ) - warn_msg += warnings[0] - print warn_msg - if skipped_lines: - print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/extract/phastOdds/phastOdds_tool.xml --- a/tools/extract/phastOdds/phastOdds_tool.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ - - for each interval - get_scores_galaxy.py $per_col ${score_file}.h5 ${score_file}.mapping.bed $input $output ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol} - - - - - - - - - - - - - - - - - - - - numpy - tables - - - - - - - - - - - -.. class:: warningmark - -This tool currently only works with interval data from genome build hg17. - -.. class:: warningmark - -This tool assumes that the input dataset is in interval format and contains at least a chrom column, a start column and an end column. These 3 columns can be dispersed throughout any number of other data columns. - ------ - -**Syntax** - -Append a column to each line of an interval file containing the phastOdds score for that interval. 
- ------ - -**Example** - -If your original data has the following format: - -+-----+-----+---+ -|chrom|start|end| -+-----+-----+---+ - -and you choose to compute phastOdds scores, your output will look like this: - -+-----+-----+---+-----+ -|chrom|start|end|score| -+-----+-----+---+-----+ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_compute_length.py --- a/tools/fasta_tools/fasta_compute_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, int -Output: tabular -Return titles with lengths of corresponding seq -""" - -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - infile = sys.argv[1] - out = open( sys.argv[2], 'w') - keep_first_char = int( sys.argv[3] ) - - fasta_title = '' - seq_len = 0 - - # number of char to keep in the title - if keep_first_char == 0: - keep_first_char = None - else: - keep_first_char += 1 - - first_entry = True - - for line in open( infile ): - line = line.strip() - if not line or line.startswith( '#' ): - continue - if line[0] == '>': - if first_entry == False: - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) - else: - first_entry = False - fasta_title = line - seq_len = 0 - else: - seq_len += len(line) - - # last fasta-entry - out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) ) - out.close() - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_compute_length.xml --- a/tools/fasta_tools/fasta_compute_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - - fasta_compute_length.py $input $output $keep_first - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool counts the length of each fasta sequence in the file. The output file has two columns per line (separated by tab): fasta titles and lengths of the sequences. The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAAfa - -Running this tool while setting **How many characters to keep?** to **14** will produce this:: - - EYKX4VC02EQLO5 108 - EYKX4VC02D4GS2 60 - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_concatenate_by_species.py --- a/tools/fasta_tools/fasta_concatenate_by_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Takes a Multiple Alignment FASTA file and concatenates -sequences for each species, resulting in one sequence -alignment per species. 
-""" - -import sys, tempfile -from galaxy import eggs -from galaxy.tools.util.maf_utilities import iter_fasta_alignment -from galaxy.util.odict import odict - -def __main__(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - species = odict() - cur_size = 0 - for components in iter_fasta_alignment( input_filename ): - species_not_written = species.keys() - for component in components: - if component.species not in species: - species[component.species] = tempfile.TemporaryFile() - species[component.species].write( "-" * cur_size ) - species[component.species].write( component.text ) - try: - species_not_written.remove( component.species ) - except ValueError: - #this is a new species - pass - for spec in species_not_written: - species[spec].write( "-" * len( components[0].text ) ) - cur_size += len( components[0].text ) - out = open( output_filename, 'wb' ) - for spec, f in species.iteritems(): - f.seek( 0 ) - out.write( ">%s\n%s\n" % ( spec, f.read() ) ) - out.close() - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_concatenate_by_species.xml --- a/tools/fasta_tools/fasta_concatenate_by_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - FASTA alignment by species - fasta_concatenate_by_species.py $input1 $out_file1 - - - - - - - - - - - - - - -**What it does** - -This tools attempts to parse FASTA headers to determine the species for each sequence in a multiple FASTA alignment. -It then linearly concatenates the sequences for each species in the file, creating one sequence per determined species. - -------- - -**Example** - -Starting FASTA:: - - >hg18.chr1(+):10016339-10016341|hg18_0 - GT - >panTro2.chr1(+):10195380-10195382|panTro2_0 - GT - >rheMac2.chr1(+):13119747-13119749|rheMac2_0 - GT - >mm8.chr4(-):148269679-148269681|mm8_0 - GT - >canFam2.chr5(+):66213635-66213637|canFam2_0 - GT - - >hg18.chr1(-):100323677-100323679|hg18_1 - GT - >panTro2.chr1(-):101678671-101678673|panTro2_1 - GT - >rheMac2.chr1(-):103154011-103154013|rheMac2_1 - GT - >mm8.chr3(+):116620616-116620618|mm8_1 - GT - >canFam2.chr6(+):52954092-52954094|canFam2_1 - GT - - - -becomes:: - - >hg18 - GTGT - >panTro2 - GTGT - >rheMac2 - GTGT - >mm8 - GTGT - >canFam2 - GTGT - - -.. class:: warningmark - - This tool will only work properly on files with Galaxy style FASTA headers. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_filter_by_length.py --- a/tools/fasta_tools/fasta_filter_by_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. -""" - -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - input_filename = sys.argv[1] - try: - min_length = int( sys.argv[2] ) - except: - stop_err( "Minimal length of the return sequence requires a numerical value." ) - try: - max_length = int( sys.argv[3] ) - except: - stop_err( "Maximum length of the return sequence requires a numerical value." 
) - output_filename = sys.argv[4] - output_handle = open( output_filename, 'w' ) - tmp_size = 0 #-1 - tmp_buf = '' - at_least_one = 0 - for line in file(input_filename): - if not line or line.startswith('#'): - continue - if line[0] == '>': - if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): - output_handle.write(tmp_buf) - at_least_one = 1 - tmp_buf = line - tmp_size = 0 - else: - if max_length == 0 or tmp_size < max_length: - tmp_size += len(line.rstrip('\r\n')) - tmp_buf += line - # final flush of buffer - if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): - output_handle.write(tmp_buf.rstrip('\r\n')) - at_least_one = 1 - output_handle.close() - if at_least_one == 0: - print "There is no sequence that falls within your range." - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_filter_by_length.xml --- a/tools/fasta_tools/fasta_filter_by_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - - fasta_filter_by_length.py $input $min_length $max_length $output - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP**. To return sequences longer than a certain length, set *Minimal length* to desired value and leave *Maximum length* set to '0'. - ------ - -**What it does** - -Outputs sequences between *Minimal length* and *Maximum length*. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences:: - - >seq1 - TCATTTAATGAC - >seq2 - ATGGC - >seq3 - TCACATGATGCCG - >seq4 - ATGGAAGC - -Setting the **Minimal length** to **10**, and the **Maximum length** to **0** will return all sequences longer than 10 bp:: - - >seq1 - TCATTTAATGAC - >seq3 - TCACATGATGCCG - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_to_tabular.py --- a/tools/fasta_tools/fasta_to_tabular.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools -""" -Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description) -Output: tabular -format convert: fasta to tabular -""" - -import sys, os - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - if len(sys.argv) != 5: - stop_err("Wrong number of argument. 
Expect four (fasta, tabular, truncation, columns)") - infile = sys.argv[1] - outfile = sys.argv[2] - keep_first = int( sys.argv[3] ) - descr_split = int( sys.argv[4] ) - fasta_title = fasta_seq = '' - if keep_first == 0: - keep_first = None - elif descr_split == 1: - #Added one for the ">" character - #(which is removed if using descr_split > 1) - keep_first += 1 - if descr_split < 1: - stop_err("Bad description split value (should be 1 or more)") - out = open( outfile, 'w' ) - for i, line in enumerate( open( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - #Don't want any existing tabs to trigger extra columns: - line = line.replace('\t', ' ') - if i > 0: - out.write('\n') - if descr_split == 1: - out.write(line[1:keep_first]) - else: - words = line[1:].split(None, descr_split-1) - #apply any truncation to first word (the id) - words[0] = words[0][0:keep_first] - #pad with empty columns if required - words += [""]*(descr_split-len(words)) - out.write("\t".join(words)) - out.write('\t') - else: - out.write(line) - if i > 0: - out.write('\n') - out.close() - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/fasta_to_tabular.xml --- a/tools/fasta_tools/fasta_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ - - converter - fasta_to_tabular.py $input $output $keep_first $descr_columns - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTA formatted sequences to TAB-delimited format. - -Many tools consider the first word of the FASTA ">" title line to be an identifier, and any remaining text to be a free form description. -It is therefore useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**. -In some cases the description can be usefully broken up into more columns -- see the examples . - -The option *How many characters to keep?* allows to select a specified number of letters from the beginning of each FASTA entry. -With the introduction of the **How many columns to divide title string into?** option this setting is of limited use, but does still allow you to truncate the identifier. - ------ - -**Example** - -Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run:: - - >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ - TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG - TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG - >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ - AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAA - -Running this tool with the default settings will produce this (2 column output): - -========================================================================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -========================================================================== ======================================= - -Having the full title line (the FASTA ">" line text) as a column is not always ideal. 
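A compact Python 3 sketch of the conversion the deleted fasta_to_tabular.py performs: the title line is split into a fixed number of columns (identifier first, with optional truncation), followed by the concatenated sequence. The fasta_to_tabular() helper name below is illustrative only::

    def fasta_to_tabular(lines, keep_first=0, descr_split=1):
        rows, title, seq = [], None, []
        for line in lines:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                if title is not None:
                    rows.append(title + ["".join(seq)])
                # Existing tabs must not create extra columns.
                words = line[1:].replace("\t", " ").split(None, descr_split - 1)
                if keep_first:
                    words[0] = words[0][:keep_first]        # truncate identifier
                words += [""] * (descr_split - len(words))  # pad missing columns
                title, seq = words, []
            elif title is not None:
                seq.append(line)
        if title is not None:
            rows.append(title + ["".join(seq)])
        return ["\t".join(r) for r in rows]

    print("\n".join(fasta_to_tabular(
        [">EYKX4VC02EQLO5 length=108", "TCCGCGCC", "GAGCATG"],
        keep_first=10, descr_split=2)))
    # EYKX4VC02E <TAB> length=108 <TAB> TCCGCGCCGAGCATG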
- -The **How many characters to keep?** option is useful if your identifiers are all the same length. -In this example the identifier is 14 characters, so setting **How many characters to keep?** to **14** (and leaving **How many columns to divide title string into?** as the default, **1**) will produce this (2 column output): - -============== ======================================= -EYKX4VC02EQLO5 TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== ======================================= - -If however your FASTA file has identifiers of variable length, it is better to split the text into at least two columns. -Running this tool with **How many columns to divide title string into?** to **2** will produce this (3 column output): - -============== =========================================================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== =========================================================== ======================================= - -Running this tool with **How many columns to divide title string into?** to **5** will produce this (5 column output): - -============== ========== ============ ======== ========================== ======================================= -EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -============== ========== ============ ======== ========================== ======================================= - -Running this tool with **How many columns to divide title string into?** to **5** and **How many characters to keep?** to **10** will produce this (5 column output). -Notice that only the first column is truncated to 10 characters -- and be careful not to trim your sequence names too much (generally they should be unique): - -========== ========== ============ ======== ========================== ======================================= -EYKX4VC02E length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_ TCCGCGCCGAGCATGCCCATCTTGGATTCCGGC...ACG -EYKX4VC02D length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_ AATAAAACTAAATCAGCAAAGACTGGCAAATAC...TAA -========== ========== ============ ======== ========================== ======================================= - -Note the sequences have been truncated for display purposes in the above tables. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/tabular_to_fasta.py --- a/tools/fasta_tools/tabular_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. -""" -import sys, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - infile = sys.argv[1] - title_col = sys.argv[2] - seq_col = sys.argv[3] - outfile = sys.argv[4] - - if title_col == None or title_col == 'None' or seq_col == None or seq_col == 'None': - stop_err( "Columns not specified." ) - try: - seq_col = int( seq_col ) - 1 - except: - stop_err( "Invalid Sequence Column: %s." 
%str( seq_col ) ) - - title_col_list = title_col.split( ',' ) - out = open( outfile, 'w' ) - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = "" - i = 0 - - for i, line in enumerate( open( infile ) ): - error = False - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - fasta_title = [] - for j in title_col_list: - try: - j = int( j ) - 1 - fasta_title.append( fields[j] ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - error = True - break - if not error: - try: - fasta_seq = fields[seq_col] - if fasta_title[0].startswith( ">" ): - fasta_title[0] = fasta_title[0][1:] - print >> out, ">%s\n%s" % ( "_".join( fasta_title ), fasta_seq ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - out.close() - - if skipped_lines > 0: - print 'Data issue: skipped %d blank or invalid lines starting at #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fasta_tools/tabular_to_fasta.xml --- a/tools/fasta_tools/tabular_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ - - converts tabular file to FASTA format - tabular_to_fasta.py $input $title_col $seq_col $output - - - - - - - - - - - - - - - - - - -**What it does** - -Converts tab delimited data into FASTA formatted sequences. - ------------ - -**Example** - -Suppose this is a sequence file produced by Illumina (Solexa) sequencer:: - - 5 300 902 419 GACTCATGATTTCTTACCTATTAGTGGTTGAACATC - 5 300 880 431 GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT - -Selecting **c3** and **c4** as the **Title column(s)** and **c5** as the **Sequence column** will result in:: - - >902_419 - GACTCATGATTTCTTACCTATTAGTGGTTGAACATC - >880_431 - GTGATATGTATGTTGACGGCCATAAGGCTGCTTCTT - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_combiner.py --- a/tools/fastq/fastq_combiner.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqWriter, fastqSequencingRead, fastqCombiner, fastqFakeFastaScoreReader -from galaxy_utils.sequence.fasta import fastaReader, fastaNamedReader - -def main(): - #Read command line arguments - fasta_filename = sys.argv[1] - fasta_type = sys.argv[2] or 'fasta' #should always be fasta or csfasta? what if txt? 
- qual_filename = sys.argv[3] - qual_type = sys.argv[4] or 'qualsanger' #qual454 qualsolid - output_filename = sys.argv[5] - force_quality_encoding = sys.argv[6] - if force_quality_encoding == 'None': - force_quality_encoding = None - - format = 'sanger' - if fasta_type == 'csfasta' or qual_type == 'qualsolid': - format = 'cssanger' - elif qual_type == 'qualsolexa': - format = 'solexa' - elif qual_type == 'qualillumina': - format = 'illumina' - - out = fastqWriter( open( output_filename, 'wb' ), format = format, force_quality_encoding = force_quality_encoding ) - if qual_filename == 'None': - qual_input = fastqFakeFastaScoreReader( format, quality_encoding = force_quality_encoding ) - else: - qual_input = fastaNamedReader( open( qual_filename, 'rb' ) ) - - fastq_combiner = fastqCombiner( format ) - i = None - skip_count = 0 - for i, sequence in enumerate( fastaReader( open( fasta_filename, 'rb' ) ) ): - quality = qual_input.get( sequence ) - if quality: - fastq_read = fastq_combiner.combine( sequence, quality ) - out.write( fastq_read ) - else: - skip_count += 1 - out.close() - if i is None: - print "Your file contains no valid FASTA sequences." - else: - print qual_input.has_data() - print 'Combined %s of %s sequences with quality scores (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_combiner.xml --- a/tools/fastq/fastq_combiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - into FASTQ - fastq_combiner.py '$fasta_file' '${fasta_file.extension}' '$qual_file' '${qual_file.extension}' '$output_file' '$force_quality_encoding' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins a FASTA file to a Quality Score file, creating a single FASTQ block for each read. - -Specifying a set of quality scores is optional; when not provided, the output will be fastqsanger or fastqcssanger (when a csfasta is provided) with each quality score being the maximal allowed value (93). - -Use this tool, for example, to convert 454-type output to FASTQ. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_filter.py --- a/tools/fastq/fastq_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - script_filename = sys.argv[2] - output_filename = sys.argv[3] - additional_files_path = sys.argv[4] - input_type = sys.argv[5] or 'sanger' - - #Save script file for debuging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - - i = None - reads_kept = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - local = {'fastq_read':fastq_read, 'ret_val':False} - execfile( script_filename, {}, local ) - if local['ret_val']: - out.write( fastq_read ) - reads_kept += 1 - out.close() - if i is None: - print "Your file contains no valid fastq reads." - else: - print 'Kept %s of %s reads (%.2f%%).' % ( reads_kept, i + 1, float( reads_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_filter.xml --- a/tools/fastq/fastq_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,318 +0,0 @@ - - reads by quality score and length - fastq_filter.py $input_file $fastq_filter_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -def fastq_read_pass_filter( fastq_read ): - def mean( score_list ): - return float( sum( score_list ) ) / float( len( score_list ) ) - if len( fastq_read ) < $min_size: - return False - if $max_size > 0 and len( fastq_read ) > $max_size: - return False - num_deviates = $max_num_deviants - qual_scores = fastq_read.get_decimal_quality_scores() - for qual_score in qual_scores: - if qual_score < $min_quality or ( $max_quality > 0 and qual_score > $max_quality ): - if num_deviates == 0: - return False - else: - num_deviates -= 1 -#if $paired_end.value == 'single_end': - qual_scores_split = [ qual_scores ] -#else: - qual_scores_split = [ qual_scores[ 0:int( len( qual_scores ) / 2 ) ], qual_scores[ int( len( qual_scores ) / 2 ): ] ] -#end if -#for $fastq_filter in $fastq_filters: - for split_scores in qual_scores_split: - left_column_offset = $fastq_filter[ 'offset_type' ][ 'left_column_offset' ] - right_column_offset = $fastq_filter[ 'offset_type' ][ 'right_column_offset' ] -#if $fastq_filter[ 'offset_type' ]['base_offset_type'] == 'offsets_percent': - left_column_offset = int( round( float( left_column_offset ) / 100.0 * float( len( split_scores ) ) ) ) - right_column_offset = int( round( float( right_column_offset ) / 100.0 * float( len( split_scores ) ) ) ) -#end if - if right_column_offset > 0: - split_scores = split_scores[ left_column_offset:-right_column_offset] - else: - split_scores = split_scores[ left_column_offset:] - if split_scores: ##if a read doesn't have enough columns, it passes by default - if not ( ${fastq_filter[ 'score_operation' ]}( split_scores ) $fastq_filter[ 'score_comparison' ] $fastq_filter[ 
'score' ] ): - return False -#end for - return True -ret_val = fastq_read_pass_filter( fastq_read ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to build complex filters to be applied to each read in a FASTQ file. - -**Basic Options:** - * You can specify a minimum and maximum read lengths. - * You can specify minimum and maximum per base quality scores, with optionally specifying the number of bases that are allowed to deviate from this range (default of 0 deviant bases). - * If your data is paired-end, select the proper checkbox; this will cause each read to be internally split down the middle and filters applied to each half using the offsets specified. - -**Advance Options:** - * You can specify any number of advanced filters. - * 5' and 3' offsets are defined, starting at zero, increasing from the respective end of the reads. For example, a quality string of "ABCDEFG", with 5' and 3' offsets of 1 and 1, respectively, specified will yield "BCDEF". - * You can specify either absolute offset values, or percentage offset values. *Absolute Values* based offsets are useful for fixed length reads (e.g. Illumina or SOLiD data). *Percentage of Read Length* based offsets are useful for variable length reads (e.g. 454 data). When using the percent-based method, offsets are rounded to the nearest integer. - * The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found between the specified offsets to be used with the user defined comparison operation and comparison value. - * If a set of offsets is specified that causes the remaining quality score list to be of length zero, then the read will **pass** the quality filter unless the size range filter is used to remove these reads. - ------ - -.. class:: warningmark - -Adapter bases in color space reads are excluded from filtering. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_groomer.py --- a/tools/fastq/fastq_groomer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter - -def main(): - input_filename = sys.argv[1] - input_type = sys.argv[2] - output_filename = sys.argv[3] - output_type = sys.argv[4] - force_quality_encoding = sys.argv[5] - summarize_input = sys.argv[6] == 'summarize_input' - if force_quality_encoding == 'None': - force_quality_encoding = None - - aggregator = fastqAggregator() - out = fastqWriter( open( output_filename, 'wb' ), format = output_type, force_quality_encoding = force_quality_encoding ) - read_count = None - if summarize_input: - reader = fastqVerboseErrorReader - else: - reader = fastqReader - for read_count, fastq_read in enumerate( reader( open( input_filename ), format = input_type, apply_galaxy_conventions = True ) ): - if summarize_input: - aggregator.consume_read( fastq_read ) - out.write( fastq_read ) - out.close() - - if read_count is not None: - print "Groomed %i %s reads into %s reads." % ( read_count + 1, input_type, output_type ) - if input_type != output_type and 'solexa' in [ input_type, output_type ]: - print "Converted between Solexa and PHRED scores." - if summarize_input: - print "Based upon quality and sequence, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" ) - ascii_range = aggregator.get_ascii_range() - decimal_range = aggregator.get_decimal_range() - print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed - print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] ) - else: - print "No valid FASTQ reads were provided." - - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_groomer.xml --- a/tools/fastq/fastq_groomer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,371 +0,0 @@ - - convert between various FASTQ quality formats - fastq_groomer.py '$input_file' '$input_type' '$output_file' -#if str( $options_type['options_type_selector'] ) == 'basic': -#if str( $input_type ) == 'cssanger': -'cssanger' -#else: -'sanger' -#end if -'ascii' 'summarize_input' -#else: -'${options_type.output_type}' '${options_type.force_quality_encoding}' '${options_type.summarize_input}' -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool offers several conversions options relating to the FASTQ format. - -When using *Basic* options, the output will be *sanger* formatted or *cssanger* formatted (when the input is Color Space Sanger). 
- -When converting, if a quality score falls outside of the target score range, it will be coerced to the closest available value (i.e. the minimum or maximum). - -When converting between Solexa and the other formats, quality scores are mapped between Solexa and PHRED scales using the equations found in `Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.`_ - -When converting between color space (csSanger) and base/sequence space (Sanger, Illumina, Solexa) formats, adapter bases are lost or gained; if gained, the base 'G' is used as the adapter. You cannot convert a color space read to base space if there is no adapter present in the color space sequence. Any masked or ambiguous nucleotides in base space will be converted to 'N's when determining color space encoding. - ------ - -**Quality Score Comparison** - -:: - - SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS - ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII - ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ - | | | | | | - 33 59 64 73 104 126 - - S - Sanger Phred+33, 93 values (0, 93) (0 to 60 expected in raw reads) - I - Illumina 1.3 Phred+64, 62 values (0, 62) (0 to 40 expected in raw reads) - X - Solexa Solexa+64, 67 values (-5, 62) (-5 to 40 expected in raw reads) - -Diagram adapted from http://en.wikipedia.org/wiki/FASTQ_format - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - -.. _Cock PJ, Fields CJ, Goto N, Heuer ML, Rice PM. The Sanger FASTQ file format for sequences with quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids Res. 2009 Dec 16.: http://www.ncbi.nlm.nih.gov/pubmed/20015970 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_manipulation.py --- a/tools/fastq/fastq_manipulation.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -import imp -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - script_filename = sys.argv[2] - output_filename = sys.argv[3] - additional_files_path = sys.argv[4] - input_type = sys.argv[5] or 'sanger' - - #Save script file for debuging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_filename, os.path.join( additional_files_path, 'debug.txt' ) ) - - fastq_manipulator = imp.load_module( 'fastq_manipulator', open( script_filename ), script_filename, ( '', 'r', imp.PY_SOURCE ) ) - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - - i = None - reads_manipulated = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - new_read = fastq_manipulator.match_and_manipulate_read( fastq_read ) - if new_read: - out.write( new_read ) - if new_read != fastq_read: - reads_manipulated += 1 - out.close() - if i is None: - print "Your file contains no valid FASTQ reads." 
- else: - print 'Manipulated %s of %s reads (%.2f%%).' % ( reads_manipulated, i + 1, float( reads_manipulated ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_manipulation.xml --- a/tools/fastq/fastq_manipulation.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,429 +0,0 @@ - - - reads on various attributes - fastq_manipulation.py $input_file $fastq_manipulation_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##create an importable module -#import binascii -import re -import binascii -from string import maketrans -##does read match -def match_read( fastq_read ): - #for $match_block in $match_blocks: - #if $match_block['match_type']['match_type_selector'] == 'identifier': - search_target = fastq_read.identifier[1:] ##don't include @ - #elif $match_block['match_type']['match_type_selector'] == 'sequence': - search_target = fastq_read.sequence - #elif $match_block['match_type']['match_type_selector'] == 'quality': - search_target = fastq_read.quality - #else: - #continue - #end if - if not re.search( binascii.unhexlify( "${ binascii.hexlify( str( match_block['match_type']['match']['match_by'] ) ) }" ), search_target ): - return False - #end for - return True -##modify matched reads -def manipulate_read( fastq_read ): - new_read = fastq_read.clone() - #for $manipulate_block in $manipulate_blocks: - #if $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'identifier': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.identifier = "@%s" % new_read.identifier[1:].translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'sequence': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.sequence = new_read.sequence.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_comp': - new_read = new_read.reverse_complement() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_no_comp': - new_read = new_read.reverse() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'no_rev_comp': - new_read = new_read.complement() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'trim': - #if $manipulate_block['manipulation_type']['manipulation']['offset_type']['base_offset_type'] == 'offsets_percent': - left_column_offset = int( round( 
float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) ) - right_column_offset = int( round( float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) ) - #else - left_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] } - right_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] } - #end if - if right_column_offset > 0: - right_column_offset = -right_column_offset - else: - right_column_offset = None - new_read = new_read.slice( left_column_offset, right_column_offset ) - if not ( ${str( manipulate_block['manipulation_type']['manipulation']['keep_zero_length'] ) == 'keep_zero_length'} or len( new_read ) ): - return None - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'dna_to_rna': - new_read = new_read.sequence_as_DNA() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rna_to_dna': - new_read = new_read.sequence_as_RNA() - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'change_adapter': - if new_read.sequence_space == 'color': - new_read = new_read.change_adapter( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['new_adapter'] ) ) }" ) ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'quality': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate': - new_read.quality = new_read.quality.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) ) - #elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'map_score': - def score_method( score ): - raise Exception, "Unimplemented" ##This option is not yet available, need to abstract out e.g. column adding tool action: preventing users from using 'harmful' actions - new_read.quality_map( score_method ) - #end if - #elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'miscellaneous': - #if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'remove': - return None - #end if - #else: - #continue - #end if - #end for - if new_read.description != "+": - new_read.description = "+%s" % new_read.identifier[1:] ##ensure description is still valid - return new_read -def match_and_manipulate_read( fastq_read ): - new_read = fastq_read - if match_read( fastq_read ): - new_read = manipulate_read( fastq_read ) - return new_read - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to build complex manipulations to be applied to each matching read in a FASTQ file. A read must match all matching directives in order for it to be manipulated; if a read does not match, it is output in a non-modified manner. 
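A stripped-down Python 3 sketch of that match-then-manipulate flow (the real tool compiles the Cheetah template above into an importable module; the regex search and str.translate below mirror its identifier-match and string-translate directives, and the read tuple layout is illustrative only)::

    import re

    def match_and_manipulate(reads, pattern, table):
        for identifier, sequence, quality in reads:
            if re.search(pattern, identifier):
                # Matched reads are manipulated; others pass through unchanged.
                sequence = sequence.translate(table)
            yield identifier, sequence, quality

    # Double-encode color space digits as pseudo-nucleotides
    # (see the worked example below):
    table = str.maketrans("0123.", "ACGTN")
    reads = [("1539:931/1", "T0123.", "IIIIII")]
    for read in match_and_manipulate(reads, r".*", table):
        print(read)  # ('1539:931/1', 'TACGTN', 'IIIIII')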
All reads matching will have each of the specified manipulations performed upon them, in the order specified. - -Regular Expression Matches are made using re.search, see http://docs.python.org/library/re.html for more information. - All matching is performed on a single line string, regardless if e.g. the sequence or quality score spans multiple lines in the original file. - -String translations are performed using string.translate, see http://docs.python.org/library/string.html#string.translate and http://docs.python.org/library/string.html#string.maketrans for more information. - -.. class:: warningmark - -Only color space reads can have adapter bases substituted. - - ------ - -**Example** - -Suppose you have a color space sanger formatted sequence (fastqcssanger) and you want to double-encode the color space into psuedo-nucleotide space (this is different from converting) to allow these reads to be used in tools which do not natively support it (using specially designed indexes). This tool can handle this manipulation, however, this is generally not recommended as results tend to be poorer than those produced from tools which are specially designed to handle color space data. - -Steps: - -1. Click **Add new Match Reads** and leave the matching options set to the default (Matching by sequence name/identifier using the regular expression "\*."; thereby matching all reads). -2. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "Change Adapter Base" and set **New Adapter** to "" (an empty text field). -3. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "String Translate" and set **From** to "0123." and **To** to "ACGTN". -4. Click Execute. The new history item will contained double-encoded psuedo-nucleotide space reads. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_masker_by_quality.py --- a/tools/fastq/fastq_masker_by_quality.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#Dan Blankenberg -import string -from optparse import OptionParser -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - - -def get_score_comparer( operator ): - if operator == 'gt': - return compare_gt - elif operator == 'ge': - return compare_ge - elif operator == 'eq': - return compare_eq - elif operator == 'lt': - return compare_lt - elif operator == 'le': - return compare_le - elif operator == 'ne': - return compare_ne - raise 'Invalid operator provided: %s' % operator - -def compare_gt( quality_score, threshold_value ): - return quality_score > threshold_value - -def compare_ge( quality_score, threshold_value ): - return quality_score >= threshold_value - -def compare_eq( quality_score, threshold_value ): - return quality_score == threshold_value - -def compare_ne( quality_score, threshold_value ): - return quality_score != threshold_value - -def compare_lt( quality_score, threshold_value ): - return quality_score < threshold_value - -def compare_le( quality_score, threshold_value ): - return quality_score <= threshold_value - -class BaseReplacer( object ): - def __init__( self, replace_character ): - self.replace_character = replace_character - def __call__( self, base_character ): - return self.replace_character - -def main(): - usage = "usage: %prog [options] input_file output_file" - parser = OptionParser( usage=usage ) - parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) - parser.add_option( '-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use' ) - parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt','ge','eq','lt', 'le', 'ne' ), help='Mask base when score is' ) - parser.add_option( '-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) - parser.add_option( "-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking") - ( options, args ) = parser.parse_args() - - if len ( args ) != 2: - parser.error( "Need to specify an input file and an output file" ) - - score_comparer = get_score_comparer( options.score_comparison ) - - if options.lowercase: - base_masker = string.lower - else: - base_masker = BaseReplacer( options.mask_character ) - - out = fastqWriter( open( args[1], 'wb' ), format = options.format ) - - num_reads = None - num_reads_excluded = 0 - for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): - sequence_list = list( fastq_read.sequence ) - for i, quality_score in enumerate( fastq_read.get_decimal_quality_scores() ): - if score_comparer( quality_score, options.quality_score ): - sequence_list[ i ] = base_masker( sequence_list[ i ] ) - fastq_read.sequence = "".join( sequence_list ) - out.write( fastq_read ) - - if num_reads is not None: - print "Processed %i %s reads." % ( num_reads + 1, options.format ) - else: - print "No valid FASTQ reads were provided." 
- -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_masker_by_quality.xml --- a/tools/fastq/fastq_masker_by_quality.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - by quality score - fastq_masker_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '${quality_score}' -c '${score_comparison}' - #if $mask_type.value == 'lowercase' - --lowercase - #else - -m '${mask_type}' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows masking base characters in FASTQ format files dependent upon user specified quality score value and comparison method. - -This tool is not available for use on color space (csSanger) formats. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_deinterlacer.py --- a/tools/fastq/fastq_paired_end_deinterlacer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#Florent Angly -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner - -def main(): - input_filename = sys.argv[1] - input_type = sys.argv[2] or 'sanger' - mate1_filename = sys.argv[3] - mate2_filename = sys.argv[4] - single1_filename = sys.argv[5] - single2_filename = sys.argv[6] - - type = input_type - input = fastqNamedReader( open( input_filename, 'rb' ), format = type ) - mate1_out = fastqWriter( open( mate1_filename, 'wb' ), format = type ) - mate2_out = fastqWriter( open( mate2_filename, 'wb' ), format = type ) - single1_out = fastqWriter( open( single1_filename, 'wb' ), format = type ) - single2_out = fastqWriter( open( single2_filename, 'wb' ), format = type ) - joiner = fastqJoiner( type ) - - i = None - skip_count = 0 - found = {} - for i, mate1 in enumerate( fastqReader( open( input_filename, 'rb' ), format = type ) ): - - if mate1.identifier in found: - del found[mate1.identifier] - continue - - mate2 = input.get( joiner.get_paired_identifier( mate1 ) ) - - if mate2: - # This is a mate pair - found[mate2.identifier] = None - if joiner.is_first_mate( mate1 ): - mate1_out.write( mate1 ) - mate2_out.write( mate2 ) - else: - mate1_out.write( mate2 ) - mate2_out.write( mate1 ) - else: - # This is a single - skip_count += 1 - if joiner.is_first_mate( mate1 ): - single1_out.write( mate1 ) - else: - single2_out.write( mate1 ) - - if i is None: - print "Your input file contained no valid FASTQ sequences." - else: - if skip_count: - print 'There were %i reads with no mate.' % skip_count - print 'De-interlaced %s pairs of sequences.' 
% ( (i - skip_count + 1)/2 ) - - input.close() - mate1_out.close() - mate2_out.close() - single1_out.close() - single2_out.close() - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_deinterlacer.xml --- a/tools/fastq/fastq_paired_end_deinterlacer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - on paired end reads - fastq_paired_end_deinterlacer.py '$input_file' '${input_file.extension[len( 'fastq' ):]}' '$output1_pairs_file' '$output2_pairs_file' '$output1_singles_file' '$output2_singles_file' - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -De-interlaces a single fastq dataset representing paired-end run into two fastq datasets containing only the first or second mate read. Reads without mate are saved in separate output files. - -Sequence identifiers for paired-end reads must follow the /1 and /2 convention. - ------ - -**Input** - -A multiple-fastq file containing paired-end reads, for example:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - ------ - -**Output** - -Multi-fastq file with left-hand mate only:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -Multi-fastq file with right-hand mate only:: - - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_interlacer.py --- a/tools/fastq/fastq_paired_end_interlacer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -#Florent Angly -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqNamedReader, fastqJoiner - -def main(): - mate1_filename = sys.argv[1] - mate1_type = sys.argv[2] or 'sanger' - mate2_filename = sys.argv[3] - mate2_type = sys.argv[4] or 'sanger' - outfile_pairs = sys.argv[5] - outfile_singles = sys.argv[6] - - if mate1_type != mate2_type: - print "WARNING: You are trying to interlace files of two different types: %s and %s." % ( mate1_type, mate2_type ) - return - - type = mate1_type - joiner = fastqJoiner( type ) - out_pairs = fastqWriter( open( outfile_pairs, 'wb' ), format = type ) - out_singles = fastqWriter( open( outfile_singles, 'wb' ), format = type ) - - # Pairs + singles present in mate1 - nof_singles = 0 - nof_pairs = 0 - mate2_input = fastqNamedReader( open( mate2_filename, 'rb' ), format = type ) - i = None - for i, mate1 in enumerate( fastqReader( open( mate1_filename, 'rb' ), format = type ) ): - mate2 = mate2_input.get( joiner.get_paired_identifier( mate1 ) ) - if mate2: - out_pairs.write( mate1 ) - out_pairs.write( mate2 ) - nof_pairs += 1 - else: - out_singles.write( mate1 ) - nof_singles += 1 - - # Singles present in mate2 - mate1_input = fastqNamedReader( open( mate1_filename, 'rb' ), format = type ) - j = None - for j, mate2 in enumerate( fastqReader( open( mate2_filename, 'rb' ), format = type ) ): - mate1 = mate1_input.get( joiner.get_paired_identifier( mate2 ) ) - if not mate1: - out_singles.write( mate2 ) - nof_singles += 1 - - if (i is None) and (j is None): - print "Your input files contained no valid FASTQ sequences." 
- else: - print 'There were %s single reads.' % ( nof_singles ) - print 'Interlaced %s pairs of sequences.' % ( nof_pairs ) - - mate1_input.close() - mate2_input.close() - out_pairs.close() - out_singles.close() - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_interlacer.xml --- a/tools/fastq/fastq_paired_end_interlacer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - on paired end reads - fastq_paired_end_interlacer.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$outfile_pairs' '$outfile_singles' - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool joins paired end FASTQ reads from two separate files, one with the left mates and one with the right mates, into a single files where left mates alternate with their right mates. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is included in a separate file. - -Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. - ------ - -**Input** - -Left-hand mates, for example:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -Right-hand mates, for example:: - - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - ------ - -**Output** - -A multiple-fastq file containing interlaced left and right paired reads:: - - @1539:931/1 - ACTTCCCGCGCGTGAAGGCGCCGGCAAACGAGGCTCGGGAAGGGGCTCCCG - +1539:931/1 - BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - @1539:931/2 - CGCCATTCCGAATCGTAGTTGTCGGCGTCTTCCAGTGCGGCAAGGCATCGT - +1539:931/2 - WNUUZ\P^`BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB - -A multiple-fastq file containing reads that have no mate is also produced. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_joiner.py --- a/tools/fastq/fastq_paired_end_joiner.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner - -def main(): - #Read command line arguments - input1_filename = sys.argv[1] - input1_type = sys.argv[2] or 'sanger' - input2_filename = sys.argv[3] - input2_type = sys.argv[4] or 'sanger' - output_filename = sys.argv[5] - - if input1_type != input2_type: - print "WARNING: You are trying to join files of two different types: %s and %s." 
% ( input1_type, input2_type ) - - input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type ) - joiner = fastqJoiner( input1_type ) - out = fastqWriter( open( output_filename, 'wb' ), format = input1_type ) - - i = None - skip_count = 0 - for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ): - identifier = joiner.get_paired_identifier( fastq_read ) - fastq_paired = input2.get( identifier ) - if fastq_paired is None: - skip_count += 1 - else: - out.write( joiner.join( fastq_read, fastq_paired ) ) - out.close() - - if i is None: - print "Your file contains no valid FASTQ reads." - else: - print input2.has_data() - print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_joiner.xml --- a/tools/fastq/fastq_paired_end_joiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,62 +0,0 @@ - - on paired end reads - fastq_paired_end_joiner.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$input2_file' '${input2_file.extension[len( 'fastq' ):]}' '$output_file' - - - - - - - - - - - - - - - -**What it does** - -This tool joins paired end FASTQ reads from two separate files into a single read in one file. The join is performed using sequence identifiers, allowing the two files to contain differing ordering. If a sequence identifier does not appear in both files, it is excluded from the output. - -Sequence identifiers with /1 and /2 appended override the left-hand and right-hand designation; i.e. if the reads end with /1 and /2, the read containing /1 will be used as the left-hand read and the read containing /2 will be used as the right-hand read. Sequences without this designation will follow the left-hand and right-hand settings set by the user. - ------ - -**Input formats** - -Left-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -Right-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------ - -**Output** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. 
<http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_splitter.py --- a/tools/fastq/fastq_paired_end_splitter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#Dan Blankenberg -import sys, os, shutil -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter, fastqSplitter - -def main(): - #Read command line arguments - input_filename = sys.argv[1] - input_type = sys.argv[2] or 'sanger' - output1_filename = sys.argv[3] - output2_filename = sys.argv[4] - - splitter = fastqSplitter() - out1 = fastqWriter( open( output1_filename, 'wb' ), format = input_type ) - out2 = fastqWriter( open( output2_filename, 'wb' ), format = input_type ) - - i = None - skip_count = 0 - for i, fastq_read in enumerate( fastqReader( open( input_filename, 'rb' ), format = input_type ) ): - read1, read2 = splitter.split( fastq_read ) - if read1 and read2: - out1.write( read1 ) - out2.write( read2 ) - else: - skip_count += 1 - out1.close() - out2.close() - if i is None: - print "Your file contains no valid FASTQ reads." - else: - print 'Split %s of %s reads (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_paired_end_splitter.xml --- a/tools/fastq/fastq_paired_end_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - on joined paired end reads - fastq_paired_end_splitter.py '$input1_file' '${input1_file.extension[len( 'fastq' ):]}' '$output1_file' '$output2_file' - - - - - - - - - - - - - - - -**What it does** - -Splits a single fastq dataset representing paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length. - -Sequence identifiers will have /1 or /2 appended for the split left-hand and right-hand reads, respectively. - ------ - -**Input format** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - ------ - -**Outputs** - -Left-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -Right-hand Read:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_stats.py --- a/tools/fastq/fastq_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator - -VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ] -VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' 
] -SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ] - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - input_type = sys.argv[3] or 'sanger' - - aggregator = fastqAggregator() - num_reads = None - fastq_read = None - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - aggregator.consume_read( fastq_read ) - out = open( output_filename, 'wb' ) - valid_nucleotides = VALID_NUCLEOTIDES - if fastq_read: - if fastq_read.sequence_space == 'base': - out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' ) - else: - out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' ) - valid_nucleotides = VALID_COLOR_SPACE - for i in range( aggregator.get_max_read_length() ): - column_stats = aggregator.get_summary_statistics_for_column( i ) - out.write( '%i\t' % ( i + 1 ) ) - out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) ) - out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) ) - base_counts = aggregator.get_base_counts_for_column( i ) - for nuc in valid_nucleotides: - out.write( "%s\t" % base_counts.get( nuc, 0 ) ) - extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] ) - out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) ) - out.close() - if num_reads is None: - print "No valid fastq reads could be processed." - else: - print "%i fastq reads were processed." % ( num_reads + 1 ) - print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" ) - ascii_range = aggregator.get_ascii_range() - decimal_range = aggregator.get_decimal_range() - print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed - print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_stats.xml --- a/tools/fastq/fastq_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ - - by column - fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - -This tool creates summary statistics on a FASTQ file. - -.. class:: infomark - -**TIP:** This statistics report can be used as input for the **Boxplot** and **Nucleotides Distribution** tools. - ------ - -**The output file will contain the following fields:** - -* column = column number (1 to 36 for a 36-cycles read Solexa file) -* count = number of bases found in this column. -* min = Lowest quality score value found in this column. -* max = Highest quality score value found in this column. -* sum = Sum of quality score values for this column. -* mean = Mean quality score value for this column. -* Q1 = 1st quartile quality score. -* med = Median quality score. -* Q3 = 3rd quartile quality score. -* IQR = Inter-Quartile range (Q3-Q1). 
-* lW = 'Left-Whisker' value (for boxplotting). -* rW = 'Right-Whisker' value (for boxplotting). -* outliers = Scores falling beyond the left and right whiskers (comma separated list). -* A_Count = Count of 'A' nucleotides found in this column. -* C_Count = Count of 'C' nucleotides found in this column. -* G_Count = Count of 'G' nucleotides found in this column. -* T_Count = Count of 'T' nucleotides found in this column. -* N_Count = Count of 'N' nucleotides found in this column. -* Other_Nucs = Comma separated list of other nucleotides found in this column. -* Other_Count = Comma separated count of other nucleotides found in this column. - -For example:: - - #column count min max sum mean Q1 med Q3 IQR lW rW outliers A_Count C_Count G_Count T_Count N_Count other_bases other_base_count - 1 14336356 2 33 450600675 31.4306281875 32.0 33.0 33.0 1.0 31 33 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 4482314 2199633 4425957 3208745 19707 - 2 14336356 2 34 441135033 30.7703737965 30.0 33.0 33.0 3.0 26 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 4419184 2170537 4627987 3118567 81 - 3 14336356 2 34 433659182 30.2489127642 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4310988 2941988 3437467 3645784 129 - 4 14336356 2 34 433635331 30.2472490917 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4110637 3007028 3671749 3546839 103 - 5 14336356 2 34 432498583 30.167957813 29.0 32.0 33.0 4.0 23 34 2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22 4348275 2935903 3293025 3759029 124 - ------ - -.. class:: warningmark - -Adapter bases in color space reads are excluded from statistics. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_fasta.py --- a/tools/fastq/fastq_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader -from galaxy_utils.sequence.fasta import fastaWriter - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - input_type = sys.argv[3] or 'sanger' #input type should ordinarily be unnecessary - - num_reads = None - fastq_read = None - out = fastaWriter( open( output_filename, 'wb' ) ) - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - out.write( fastq_read ) - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were converted to FASTA." % ( num_reads + 1 ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_fasta.xml --- a/tools/fastq/fastq_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - converter - fastq_to_fasta.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTQ sequencing reads to FASTA sequences. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. 
Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_tabular.py --- a/tools/fastq/fastq_to_tabular.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - if len(sys.argv) != 5: - stop_err("Wrong number of arguments. Expect: fasta tabular desrc_split [type]") - input_filename = sys.argv[1] - output_filename = sys.argv[2] - descr_split = int( sys.argv[3] ) - 1 - if descr_split < 0: - stop_err("Bad description split value (should be 1 or more)") - input_type = sys.argv[4] or 'sanger' #input type should ordinarily be unnecessary - - num_reads = None - fastq_read = None - out = open( output_filename, 'wb' ) - if descr_split == 0: - #Don't divide the description into multiple columns - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - out.write( "%s\t%s\t%s\n" % ( fastq_read.identifier[1:].replace( '\t', ' ' ), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) ) - else: - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - words = fastq_read.identifier[1:].replace( '\t', ' ' ).split(None, descr_split) - #pad with empty columns if required - words += [""]*(descr_split-len(words)) - out.write( "%s\t%s\t%s\n" % ("\t".join(words), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) ) - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were converted to Tabular." % ( num_reads + 1 ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_to_tabular.xml --- a/tools/fastq/fastq_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - converter - fastq_to_tabular.py '$input_file' '$output_file' $descr_columns '${input_file.extension[len( 'fastq' ):]}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts FASTQ sequencing reads to a Tabular file. - -It is conventional to take the first word of the FASTQ "@" title line as the identifier, and any remaining text to be a free form description. -It is therefore often useful to split this text into two columns in Galaxy (identifier and any description) by setting **How many columns to divide title string into?** to **2**. -In some cases the description can be usefully broken up into more columns -- see the examples . - -Tab characters, if present in the source FASTQ title, will be converted to spaces. 
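The column-splitting rule in fastq_to_tabular.py above boils down to the following sketch (the title string is borrowed from the example below)::

    title = "FSRRS4401BE7HA [length=395] [gc=36.46]"
    n_cols = 2                                          # "How many columns to divide title string into?"
    words = title.replace( '\t', ' ' ).split( None, n_cols - 1 )
    words += [ "" ] * ( n_cols - len( words ) )         # pad with empty columns if required
    print "\t".join( words )                            # -> FSRRS4401BE7HA<tab>[length=395] [gc=36.46]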
- ------ - -**Example** - -Consider the following two 454 reads in Sanger FASTQ format (using line wrapping for display, but do note not all tools will accept line wrapped FASTQ files):: - - @FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] - tcagTTAAGATGGGATAATATCCTCAGATTGCGTGATGAACTTTGTTCTGGTGGAGGAGAAGGAAGTGCATTCGACGTAT - GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcacaaatcagtgacataaatattatttagatttcgggagcaact - ttatttattccacaagcaggtttaaattttaaatttaaattattgcagaagactttaaattaacctcgttgtcggagtca - tttgttcggttattggtcgaaagtaaccncgggaagtgccgaaaactaacaaacaaaagaagatagtgaaattttaatta - aaanaaatagccaaacgtaactaactaaaacggacccgtcgaggaactgccaacggacgacacagggagtagnnn - +FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] - FFFDDDDDDDA666?688FFHGGIIIIIIIIIIIIIIIIIIHHHIIIIIIIIIGHGFFFFF====DFFFFFFFFFFFFFF - D???:3104/76=:5...4.3,,,366////4<ABBAAA=CCFDDDDDDDD:666CDFFFF=<ABA=;:333111<===9 - 9;B889FFFFFFDDBDBDDD=8844231..,,,-,,,,,,,,1133..---17111,,,,,22555131121.--.,333 - 11,.,,3--,,.,,--,3511123..--!,,,,--,----9,,,,8=,,-,,,-,,,,---26:9:5-..1,,,,11//, - ,,,!,,1917--,,,,-3.,--,,17,,,,---+11113.030000,,,044400036;96662.//;7><;!!! - @FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] - tcagCCAGCAATTCCGACTTAATTGTTCTTCTTCCATCATTCATCTCGACTAACAGTTCTACGATTAATGAGTTTGGCtt - taatttgttgttcattattgtcacaattacactactgagactgccaaggcacncagggataggnn - +FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] - FFFFFFFFFDDDDFFFFGFDDDDBAAAAA=<4444@@B=555:BBBBB@@?8:8<?<89898<84442;==3,,,514,, - ,11,,,.,,21777555513,..--1115758.//34488><<;;;;9944/!/4,,,57855!! - -By default this is converted into a 3 column tabular file, with the full FASTQ title used as column 1: - -=================================================================================================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! -=================================================================================================== ============== ============== - -If you specified the title should be turned into 2 columns, you'd get 4 columns in total: - -============== ==================================================================================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! -============== ==================================================================================== ============== ============== - -Similarly, for this example treating the title string as 7 columns makes sense: - -============== ============ ========== =========== ============= ============== =================== ============== ============== -FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95] tcagTTAA...nnn FFFDDDDD...!!! -FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74] tcagCCAG...gnn FFFFFFFF...5!! 
-============== ============ ========== =========== ============= ============== =================== ============== ============== - -Note the sequences and quality strings have been truncated for display purposes in the above tables. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer.py --- a/tools/fastq/fastq_trimmer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#Dan Blankenberg -import sys -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - left_offset = sys.argv[3] - right_offset = sys.argv[4] - percent_offsets = sys.argv[5] == 'offsets_percent' - input_type = sys.argv[6] or 'sanger' - keep_zero_length = sys.argv[7] == 'keep_zero_length' - - out = fastqWriter( open( output_filename, 'wb' ), format = input_type ) - num_reads_excluded = 0 - num_reads = None - for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ): - if percent_offsets: - left_column_offset = int( round( float( left_offset ) / 100.0 * float( len( fastq_read ) ) ) ) - right_column_offset = int( round( float( right_offset ) / 100.0 * float( len( fastq_read ) ) ) ) - else: - left_column_offset = int( left_offset ) - right_column_offset = int( right_offset ) - if right_column_offset > 0: - right_column_offset = -right_column_offset - else: - right_column_offset = None - fastq_read = fastq_read.slice( left_column_offset, right_column_offset ) - if keep_zero_length or len( fastq_read ): - out.write( fastq_read ) - else: - num_reads_excluded += 1 - out.close() - if num_reads is None: - print "No valid fastq reads could be processed." - else: - print "%i fastq reads were processed." % ( num_reads + 1 ) - if num_reads_excluded: - print "%i reads of zero length were excluded from the output." % num_reads_excluded - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer.xml --- a/tools/fastq/fastq_trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - by column - fastq_trimmer.py '$input_file' '$output_file' '${offset_type['left_column_offset']}' '${offset_type['right_column_offset']}' '${offset_type['base_offset_type']}' '${input_file.extension[len( 'fastq' ):]}' '$keep_zero_length' - - - - - - - - - - - int( float( value ) ) == float( value ) - - - - int( float( value ) ) == float( value ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to trim the ends of reads. - -You can specify either absolute or percent-based offsets. Offsets are calculated, starting at 0, from the respective end to be trimmed. When using the percent-based method, offsets are rounded to the nearest integer. 
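A minimal sketch of the percent-based arithmetic, using the 36-base read from the example below::

    sequence = "CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA"
    left_pct, right_pct = 6.0, 20.0
    left = int( round( left_pct / 100.0 * len( sequence ) ) )     # -> 2
    right = int( round( right_pct / 100.0 * len( sequence ) ) )   # -> 7
    print sequence[ left : -right or None ]                       # -> ATATGTNCTCACTGATAAGTGGATATN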
- -For example, if you have a read of length 36:: - - @Some FASTQ Sanger Read - CAATATGTNCTCACTGATAAGTGGATATNAGCNCCA - + - =@@.@;B-%?8>CBA@>7@7BBCA4-48%<;;%<B@ - -And you set absolute offsets of 2 and 9:: - - @Some FASTQ Sanger Read - ATATGTNCTCACTGATAAGTGGATA - + - @.@;B-%?8>CBA@>7@7BBCA4-4 - -Or you set percent offsets of 6% and 20% (corresponds to absolute offsets of 2,7 for a read length of 36):: - - @Some FASTQ Sanger Read - ATATGTNCTCACTGATAAGTGGATATN - + - @.@;B-%?8>CBA@>7@7BBCA4-48% - ------ - -.. class:: warningmark - -Trimming a color space read will cause any adapter base to be lost. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer_by_quality.py --- a/tools/fastq/fastq_trimmer_by_quality.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,126 +0,0 @@ -#Dan Blankenberg -from optparse import OptionParser -from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - -def mean( score_list ): - return float( sum( score_list ) ) / float( len( score_list ) ) - -ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean } - -def compare( aggregated_value, operator, threshold_value ): - if operator == '>': - return aggregated_value > threshold_value - elif operator == '>=': - return aggregated_value >= threshold_value - elif operator == '==': - return aggregated_value == threshold_value - elif operator == '<': - return aggregated_value < threshold_value - elif operator == '<=': - return aggregated_value <= threshold_value - elif operator == '!=': - return aggregated_value != threshold_value - -def exclude( value_list, exclude_indexes ): - rval = [] - for i, val in enumerate( value_list ): - if i not in exclude_indexes: - rval.append( val ) - return rval - -def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ): - if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ): - return True - if exclude_indexes: - for exclude_index in exclude_indexes: - excluded_list = exclude( aggregate_list, exclude_index ) - if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ): - return True - return False - -def main(): - usage = "usage: %prog [options] input_file output_file" - parser = OptionParser( usage=usage ) - parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) - parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', help='Window size' ) - parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' ) - parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' ) - parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' ) - parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' ) - parser.add_option( '-c', 
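- # '-c' selects the operator compared between the window's aggregate score and the '-q' cut-off; trimming advances until the comparison holds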
'--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' ) - parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) - parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length") - ( options, args ) = parser.parse_args() - - if len ( args ) != 2: - parser.error( "Need to specify an input file and an output file" ) - - if options.window_size < 1: - parser.error( 'You must specify a strictly positive window size' ) - - if options.window_step < 1: - parser.error( 'You must specify a strictly positive step size' ) - - #determine an exhaustive list of window indexes that can be excluded from aggregation - exclude_window_indexes = [] - last_exclude_indexes = [] - for exclude_count in range( min( options.exclude_count, options.window_size ) ): - if last_exclude_indexes: - new_exclude_indexes = [] - for exclude_list in last_exclude_indexes: - for window_index in range( options.window_size ): - if window_index not in exclude_list: - new_exclude = sorted( exclude_list + [ window_index ] ) - if new_exclude not in exclude_window_indexes + new_exclude_indexes: - new_exclude_indexes.append( new_exclude ) - exclude_window_indexes += new_exclude_indexes - last_exclude_indexes = new_exclude_indexes - else: - for window_index in range( options.window_size ): - last_exclude_indexes.append( [ window_index ] ) - exclude_window_indexes = list( last_exclude_indexes ) - - out = fastqWriter( open( args[1], 'wb' ), format = options.format ) - action = ACTION_METHODS[ options.aggregation_action ] - - num_reads = None - num_reads_excluded = 0 - for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): - for trim_end in options.trim_ends: - quality_list = fastq_read.get_decimal_quality_scores() - if trim_end == '5': - lwindow_position = 0 #left position of window - while True: - if lwindow_position >= len( quality_list ): - fastq_read.sequence = '' - fastq_read.quality = '' - break - if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ): - fastq_read = fastq_read.slice( lwindow_position, None ) - break - lwindow_position += options.window_step - else: - rwindow_position = len( quality_list ) #right position of window - while True: - lwindow_position = rwindow_position - options.window_size #left position of window - if rwindow_position <= 0 or lwindow_position < 0: - fastq_read.sequence = '' - fastq_read.quality = '' - break - if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ): - fastq_read = fastq_read.slice( None, rwindow_position ) - break - rwindow_position -= options.window_step - if options.keep_zero_length or len( fastq_read ): - out.write( fastq_read ) - else: - num_reads_excluded += 1 - out.close() - if num_reads is None: - print "No valid FASTQ reads could be processed." - else: - print "%i FASTQ reads were processed." % ( num_reads + 1 ) - if num_reads_excluded: - print "%i reads of zero length were excluded from the output." 
% num_reads_excluded - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/fastq_trimmer_by_quality.xml --- a/tools/fastq/fastq_trimmer_by_quality.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ - - by sliding window - fastq_trimmer_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '$window_size' - -t '$step_size' -e '$trim_ends' -a '$aggregation_action' -x '$exclude_count' -c '$score_comparison' -q '$quality_score' - #if $keep_zero_length.value: - -k - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends. - -The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found within the sliding window to be used with the user defined comparison operation and comparison value. - -The user can provide a maximum count of bases that can be excluded from the aggregation within the window. When set, this tool will first check the aggregation of the entire window, then after removing 1 value, then after removing 2 values, up to the number declared. Setting this value to be equal to or greater than the window size will cause no trimming to occur. - ------ - -.. class:: warningmark - -Trimming a color space read will cause any adapter base to be lost. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/tabular_to_fastq.py --- a/tools/fastq/tabular_to_fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#Dan Blankenberg -import sys - -def main(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - identifier_col = int( sys.argv[3] ) - 1 - sequence_col = int( sys.argv[4] ) - 1 - quality_col = int( sys.argv[5] ) - 1 - - max_col = max( identifier_col, sequence_col, quality_col ) - num_reads = None - fastq_read = None - skipped_lines = 0 - out = open( output_filename, 'wb' ) - for num_reads, line in enumerate( open( input_filename ) ): - fields = line.rstrip( '\n\r' ).split( '\t' ) - if len( fields ) > max_col: - out.write( "@%s\n%s\n+\n%s\n" % ( fields[identifier_col], fields[sequence_col], fields[quality_col] ) ) - else: - skipped_lines += 1 - - out.close() - if num_reads is None: - print "Input was empty." - else: - print "%i tabular lines were written as FASTQ reads. Be sure to use the FASTQ Groomer tool on this output before further analysis." 
% ( num_reads + 1 - skipped_lines ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/fastq/tabular_to_fastq.xml --- a/tools/fastq/tabular_to_fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - converter - tabular_to_fastq.py '$input_file' '$output_file' '$identifier' '$sequence' '$quality' - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool attempts to convert a tabular file containing sequencing read data to a FASTQ formatted file. The FASTQ Groomer tool should always be used on the output of this tool. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. <http://www.ncbi.nlm.nih.gov/pubmed/20562416>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_clipping_histogram.xml --- a/tools/fastx_toolkit/fasta_clipping_histogram.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ - - chart - fastx_toolkit - fasta_clipping_histogram.pl $input $outfile - - - - - - - - - - -**What it does** - -This tool creates a histogram image of sequence lengths distribution in a given fasta dataset file. - -**TIP:** Use this tool after clipping your library (with **FASTX Clipper tool**), to visualize the clipping results. - ------ - -**Output Examples** - -In the following library, most sequences are 24-mers to 27-mers. -This could indicate an abundance of endo-siRNAs (depending of course of what you've tried to sequence in the first place). - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_1.png - - -In the following library, most sequences are 19,22 or 23-mers. -This could indicate an abundance of miRNAs (depending of course of what you've tried to sequence in the first place). - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_2.png - - ------ - - -**Input Formats** - -This tool accepts short-reads FASTA files. The reads don't have to be short, but they do have to be on a single line, like so:: - - >sequence1 - AGTAGTAGGTGATGTAGAGAGAGAGAGAGTAG - >sequence2 - GTGTGTGTGGGAAGTTGACACAGTA - >sequence3 - CCTTGAGATTAACGCTAATCAAGTAAAC - - -If the sequences span over multiple lines:: - - >sequence1 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG - TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG - aactggtctttacctTTAAGTTG - -Use the **FASTA Width Formatter** tool to re-format the FASTA into a single-lined sequences:: - - >sequence1 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - - ------ - - - -**Multiplicity counts (a.k.a reads-count)** - -If the sequence identifier (the text after the '>') contains a dash and a number, it is treated as a multiplicity count value (i.e. how many times that individual sequence repeated in the original FASTA file, before collapsing). - -Example 1 - The following FASTA file *does not* have multiplicity counts:: - - >seq1 - GGATCC - >seq2 - GGTCATGGGTTTAAA - >seq3 - GGGATATATCCCCACACACACACAC - -Each sequence is counts as one, to produce the following chart: - -.. 
image:: ./static/fastx_icons/fasta_clipping_histogram_3.png - - -Example 2 - The following FASTA file have multiplicity counts:: - - >seq1-2 - GGATCC - >seq2-10 - GGTCATGGGTTTAAA - >seq3-3 - GGGATATATCCCCACACACACACAC - -The first sequence counts as 2, the second as 10, the third as 3, to produce the following chart: - -.. image:: ./static/fastx_icons/fasta_clipping_histogram_4.png - -Use the **FASTA Collapser** tool to create FASTA files with multiplicity counts. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_formatter.xml --- a/tools/fastx_toolkit/fasta_formatter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - formatter - fastx_toolkit - - zcat -f '$input' | fasta_formatter -w $width -o $output - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool re-formats a FASTA file, changing the width of the nucleotides lines. - -**TIP:** Outputting a single line (with **width = 0**) can be useful for scripting (with **grep**, **awk**, and **perl**). Every odd line is a sequence identifier, and every even line is a nucleotides line. - --------- - -**Example** - -Input FASTA file (each nucleotides line is 50 characters long):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTC - CCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTG - TTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACA - ATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAG - TCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAG - aactggtctttacctTTAAGTTG - - -Output FASTA file (with width=80):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTT - ATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCA - ATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTAC - GTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - -Output FASTA file (with width=0 => single line):: - - >Scaffold3648 - AGGAATGATGACTACAATGATCAACTTAACCTATCTATTTAATTTAGTTCCCTAATGTCAGGGACCTACCTGTTTTTGTTATGTTTGGGTTTTGTTGTTGTTGTTTTTTTAATCTGAAGGTATTGTGCATTATATGACCTGTAATACACAATTAAAGTCAATTTTAATGAACATGTAGTAAAAACT - >Scaffold9299 - CAGCATCTACATAATATGATCGCTATTAAACTTAAATCTCCTTGACGGAGTCTTCGGTCATAACACAAACCCAGACCTACGTATATGACAAAGCTAATAGaactggtctttacctTTAAGTTG - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fasta_nucleotide_changer.xml --- a/tools/fastx_toolkit/fasta_nucleotide_changer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - converter - fastx_toolkit - zcat -f '$input' | fasta_nucleotide_changer $mode -v -o $output - - - - - - - - - - - - - - - - -**What it does** - -This tool converts RNA FASTA files to DNA (and vice-versa). - -In **RNA-to-DNA** mode, U's are changed into T's. - -In **DNA-to-RNA** mode, T's are changed into U's. 
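The conversion itself is a plain character substitution; as a minimal sketch, using the let-7 sequence from the example below::

    rna = "UGAGGUAGUAGGUUGUAUAGUU"
    print rna.replace( 'U', 'T' ).replace( 'u', 't' )   # RNA-to-DNA mode
    # -> TGAGGTAGTAGGTTGTATAGTT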
- --------- - -**Example** - -Input RNA FASTA file ( from Sanger's mirBase ):: - - >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7 - UGAGGUAGUAGGUUGUAUAGUU - >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4 - UCCCUGAGACCUCAAGUGUGA - >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1 - UGGAAUGUAAAGAAGUAUGUA - -Output DNA FASTA file (with RNA-to-DNA mode):: - - >cel-let-7 MIMAT0000001 Caenorhabditis elegans let-7 - TGAGGTAGTAGGTTGTATAGTT - >cel-lin-4 MIMAT0000002 Caenorhabditis elegans lin-4 - TCCCTGAGACCTCAAGTGTGA - >cel-miR-1 MIMAT0000003 Caenorhabditis elegans miR-1 - TGGAATGTAAAGAAGTATGTA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_boxplot.xml --- a/tools/fastx_toolkit/fastq_quality_boxplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - - fastx_toolkit - - fastq_quality_boxplot_graph.sh -t '$input.name' -i $input -o $output - - - - - - - - - - -**What it does** - -Creates a boxplot graph for the quality scores in the library. - -.. class:: infomark - -**TIP:** Use the **FASTQ Statistics** tool to generate the report file needed for this tool. - ------ - -**Output Examples** - -* Black horizontal lines are medians -* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) -* Whiskers show outlier at max. 1.5*IQR - - -An excellent quality library (median quality is 40 for almost all 36 cycles): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_1.png - - -A relatively good quality library (median quality degrades towards later cycles): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_2.png - -A low quality library (median drops quickly): - -.. image:: ./static/fastx_icons/fastq_quality_boxplot_3.png - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_converter.xml --- a/tools/fastx_toolkit/fastq_quality_converter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ - - (ASCII-Numeric) - fastx_toolkit - zcat -f $input | fastq_quality_converter $QUAL_FORMAT -o $output -Q $offset - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts a Solexa FASTQ file to/from numeric or ASCII quality format. - -.. class:: warningmark - -Re-scaling is **not** performed. (e.g. conversion from Phred scale to Solexa scale). - - ------ - -FASTQ with Numeric quality scores:: - - @CSHL__2_FC042AGWWWXX:8:1:120:202 - ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC - +CSHL__2_FC042AGWWWXX:8:1:120:202 - 40 40 40 40 20 40 40 40 40 6 40 40 28 40 40 25 40 20 40 -1 30 40 14 27 40 8 1 3 7 -1 11 10 -1 21 10 8 - @CSHL__2_FC042AGWWWXX:8:1:103:1185 - ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC - +CSHL__2_FC042AGWWWXX:8:1:103:1185 - 40 40 40 40 40 35 33 31 40 40 40 32 30 22 40 -0 9 22 17 14 8 36 15 34 22 12 23 3 10 -0 8 2 4 25 30 2 - - -FASTQ with ASCII quality scores:: - - @CSHL__2_FC042AGWWWXX:8:1:120:202 - ACGATAGATCGGAAGAGCTAGTATGCCGTTTTCTGC - +CSHL__2_FC042AGWWWXX:8:1:120:202 - hhhhThhhhFhh\hhYhTh?^hN[hHACG?KJ?UJH - @CSHL__2_FC042AGWWWXX:8:1:103:1185 - ATCACGATAGATCGGCAGAGCTCGTTTACCGTCTTC - +CSHL__2_FC042AGWWWXX:8:1:103:1185 - hhhhhca_hhh`^Vh@IVQNHdObVLWCJ@HBDY^B - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. 
__: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_quality_filter.xml --- a/tools/fastx_toolkit/fastq_quality_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - - fastx_toolkit - - zcat -f '$input' | fastq_quality_filter -q $quality -p $percent -v -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool filters reads based on quality scores. - -.. class:: infomark - -Using **percent = 100** requires all cycles of all reads to be at least the quality cut-off value. - -.. class:: infomark - -Using **percent = 50** requires the median quality of the cycles (in each read) to be at least the quality cut-off value. - --------- - -Quality score distribution (of all cycles) is calculated for each read. If it is lower than the quality cut-off value - the read is discarded. - - -**Example**:: - - @CSHL_4_FC042AGOOII:1:2:214:584 - GACAATAAAC - +CSHL_4_FC042AGOOII:1:2:214:584 - 30 30 30 30 30 30 30 30 20 10 - -Using **percent = 50** and **cut-off = 30** - This read will not be discarded (the median quality is higher than 30). - -Using **percent = 90** and **cut-off = 30** - This read will be discarded (90% of the cycles do no have quality equal to / higher than 30). - -Using **percent = 100** and **cut-off = 20** - This read will be discarded (not all cycles have quality equal to / higher than 20). - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastq_to_fasta.xml --- a/tools/fastx_toolkit/fastq_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - converter - fastx_toolkit - gunzip -cf $input | fastq_to_fasta $SKIPN $RENAMESEQ -o $output -v -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from Solexa format to FASTA format (scroll down for format description). - --------- - -**Example** - -The following data in Solexa-FASTQ format:: - - @CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +CSHL_4_FC042GAMMII_2_1_517_596 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Will be converted to FASTA (with 'rename sequence names' = NO):: - - >CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - -Will be converted to FASTA (with 'rename sequence names' = YES):: - - >1 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_artifacts_filter.xml --- a/tools/fastx_toolkit/fastx_artifacts_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_artifacts_filter -v -o "$output" -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool filters sequencing artifacts (reads with all but 3 identical bases). 
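One way to read that rule, as a hedged sketch rather than the tool's actual implementation::

    from collections import Counter

    def looks_like_artifact( sequence ):
        # artifact: at most 3 positions differ from the single most frequent base
        return len( sequence ) - max( Counter( sequence ).values() ) <= 3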
- --------- - -**The following is an example of sequences which will be filtered out**:: - - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC - AAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAACACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA - AAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAA - AAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAA - AAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAA - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_barcode_splitter.xml --- a/tools/fastx_toolkit/fastx_barcode_splitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - - fastx_toolkit - fastx_barcode_splitter_galaxy_wrapper.sh $BARCODE $input "$input.name" "$output.files_path" --mismatches $mismatches --partial $partial $EOL > $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool splits a Solexa library (FASTQ file) or a regular FASTA file into several files, using barcodes as the split criteria. 
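Conceptually, the start of each read (or its end, when the end-of-line option is set) is matched against every barcode, and the read is assigned to the closest barcode within the allowed number of mismatches. A rough sketch of the distance test::

    def count_mismatches( read, barcode ):
        # position-by-position comparison of the read's start against the barcode
        return sum( 1 for a, b in zip( read, barcode ) if a != b )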
- --------- - -**Barcode file Format** - -Barcode files are simple text files. -Each line should contain an identifier (descriptive name for the barcode), and the barcode itself (A/C/G/T), separated by a TAB character. -Example:: - - #This line is a comment (starts with a 'number' sign) - BC1 GATCT - BC2 ATCGT - BC3 GTGAT - BC4 TGTCT - -For each barcode, a new FASTQ file will be created (with the barcode's identifier as part of the file name). -Sequences matching the barcode will be stored in the appropriate file. - -One additional FASTQ file will be created (the 'unmatched' file), where sequences not matching any barcode will be stored. - -The output of this tool is an HTML file, displaying the split counts and the file locations. - -**Output Example** - -.. image:: ./static/fastx_icons/barcode_splitter_output_example.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh --- a/tools/fastx_toolkit/fastx_barcode_splitter_galaxy_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/bin/bash - -# FASTX-toolkit - FASTA/FASTQ preprocessing tools. -# Copyright (C) 2009 A. Gordon (gordon@cshl.edu) -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -# -#This is a shell script wrapper for 'fastx_barcode_splitter.pl' -# -# 1. Output files are saved at the dataset's files_path directory. -# -# 2. 'fastx_barcode_splitter.pl' outputs a textual table. -# This script turns it into pretty HTML with working URL -# (so lazy users can just click on the URLs and get their files) - -BARCODE_FILE="$1" -FASTQ_FILE="$2" -LIBNAME="$3" -OUTPUT_PATH="$4" -shift 4 -# The rest of the parameters are passed to the split program - -if [ "$OUTPUT_PATH" == "" ]; then - echo "Usage: $0 [BARCODE FILE] [FASTQ FILE] [LIBRARY_NAME] [OUTPUT_PATH]" >&2 - exit 1 -fi - -#Sanitize library name, make sure we can create a file with this name -LIBNAME=${LIBNAME//\.gz/} -LIBNAME=${LIBNAME//\.txt/} -LIBNAME=${LIBNAME//[^[:alnum:]]/_} - -if [ ! -r "$FASTQ_FILE" ]; then - echo "Error: Input file ($FASTQ_FILE) not found!" >&2 - exit 1 -fi -if [ ! -r "$BARCODE_FILE" ]; then - echo "Error: barcode file ($BARCODE_FILE) not found!" >&2 - exit 1 -fi -mkdir -p "$OUTPUT_PATH" -if [ ! -d "$OUTPUT_PATH" ]; then - echo "Error: failed to create output path '$OUTPUT_PATH'" >&2 - exit 1 -fi - -PUBLICURL="" -BASEPATH="$OUTPUT_PATH/" -#PREFIX="$BASEPATH"`date "+%Y-%m-%d_%H%M__"`"${LIBNAME}__" -PREFIX="$BASEPATH""${LIBNAME}__" -SUFFIX=".txt" - -RESULTS=`zcat -f "$FASTQ_FILE" | fastx_barcode_splitter.pl --bcfile "$BARCODE_FILE" --prefix "$PREFIX" --suffix "$SUFFIX" "$@"` -if [ $? != 0 ]; then - echo "error" -fi - -# -# Convert the textual tab-separated table into simple HTML table, -# with the local path replaces with a valid URL -echo "" -echo "$RESULTS" | sed -r "s|$BASEPATH(.*)|\\1|" | sed ' -i
-s|\t|<\/td><td>|g -a<\/td><\/tr> -' -echo "

" -echo "

" diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_clipper.xml --- a/tools/fastx_toolkit/fastx_clipper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ - - adapter sequences - fastx_toolkit - - zcat -f $input | fastx_clipper -l $minlength -a $clip_source.clip_sequence -d $keepdelta -o $output -v $KEEP_N $DISCARD_OPTIONS -#if $input.ext == "fastqsanger": - -Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - use this for hairpin barcoding. keep at 0 unless you know what you're doing. - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool clips adapters from the 3'-end of the sequences in a FASTA/FASTQ file. - --------- - - -**Clipping Illustration:** - -.. image:: ./static/fastx_icons/fastx_clipper_illustration.png - - - - - - - - -**Clipping Example:** - -.. image:: ./static/fastx_icons/fastx_clipper_example.png - - - -**In the above example:** - -* Sequence no. 1 was discarded since it wasn't clipped (i.e. didn't contain the adapter sequence). (**Output** parameter). -* Sequence no. 5 was discarded --- it's length (after clipping) was shorter than 15 nt (**Minimum Sequence Length** parameter). - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_collapser.xml --- a/tools/fastx_toolkit/fastx_collapser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ - - sequences - fastx_toolkit - zcat -f '$input' | fastx_collapser -v -o '$output' -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - -**What it does** - -This tool collapses identical sequences in a FASTA file into a single sequence. - --------- - -**Example** - -Example Input File (Sequence "ATAT" appears multiple times):: - - >CSHL_2_FC0042AGLLOO_1_1_605_414 - TGCG - >CSHL_2_FC0042AGLLOO_1_1_537_759 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_774_520 - TGGC - >CSHL_2_FC0042AGLLOO_1_1_742_502 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_781_514 - TGAG - >CSHL_2_FC0042AGLLOO_1_1_757_487 - TTCA - >CSHL_2_FC0042AGLLOO_1_1_903_769 - ATAT - >CSHL_2_FC0042AGLLOO_1_1_724_499 - ATAT - -Example Output file:: - - >1-1 - TGCG - >2-4 - ATAT - >3-1 - TGGC - >4-1 - TGAG - >5-1 - TTCA - -.. class:: infomark - -Original Sequence Names / Lane descriptions (e.g. "CSHL_2_FC0042AGLLOO_1_1_742_502") are discarded. - -The output sequence name is composed of two numbers: the first is the sequence's number, the second is the multiplicity value. - -The following output:: - - >2-4 - ATAT - -means that the sequence "ATAT" is the second sequence in the file, and it appeared 4 times in the input FASTA file. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_nucleotides_distribution.xml --- a/tools/fastx_toolkit/fastx_nucleotides_distribution.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - - fastx_toolkit - fastx_nucleotide_distribution_graph.sh -t '$input.name' -i $input -o $output - - - - - - - - - - -**What it does** - -Creates a stacked-histogram graph for the nucleotide distribution in the Solexa library. - -.. class:: infomark - -**TIP:** Use the **FASTQ Statistics** tool to generate the report file needed for this tool. - ------ - -**Output Examples** - -The following chart clearly shows the barcode used at the 5'-end of the library: **GATCT** - -.. 
image:: ./static/fastx_icons/fastq_nucleotides_distribution_1.png - -In the following chart, one can almost 'read' the most abundant sequence by looking at the dominant values: **TGATA TCGTA TTGAT GACTG AA...** - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_2.png - -The following chart shows a growing number of unknown (N) nucleotides towards later cycles (which might indicate a sequencing problem): - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_3.png - -But most of the time, the chart will look rather random: - -.. image:: ./static/fastx_icons/fastq_nucleotides_distribution_4.png - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_quality_statistics.xml --- a/tools/fastx_toolkit/fastx_quality_statistics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ - - - fastx_toolkit - zcat -f $input | fastx_quality_stats -o $output -Q 33 - - - - - - - - - - - - - - - - - - -**What it does** - -Creates quality statistics report for the given Solexa/FASTQ library. - -.. class:: infomark - -**TIP:** This statistics report can be used as input for **Quality Score** and **Nucleotides Distribution** tools. - ------ - -**The output file will contain the following fields:** - -* column = column number (1 to 36 for a 36-cycles read Solexa file) -* count = number of bases found in this column. -* min = Lowest quality score value found in this column. -* max = Highest quality score value found in this column. -* sum = Sum of quality score values for this column. -* mean = Mean quality score value for this column. -* Q1 = 1st quartile quality score. -* med = Median quality score. -* Q3 = 3rd quartile quality score. -* IQR = Inter-Quartile range (Q3-Q1). -* lW = 'Left-Whisker' value (for boxplotting). -* rW = 'Right-Whisker' value (for boxplotting). -* A_Count = Count of 'A' nucleotides found in this column. -* C_Count = Count of 'C' nucleotides found in this column. -* G_Count = Count of 'G' nucleotides found in this column. -* T_Count = Count of 'T' nucleotides found in this column. -* N_Count = Count of 'N' nucleotides found in this column. - - -For example:: - - 1 6362991 -4 40 250734117 39.41 40 40 40 0 40 40 1396976 1329101 678730 2958184 0 - 2 6362991 -5 40 250531036 39.37 40 40 40 0 40 40 1786786 1055766 1738025 1782414 0 - 3 6362991 -5 40 248722469 39.09 40 40 40 0 40 40 2296384 984875 1443989 1637743 0 - 4 6362991 -4 40 248214827 39.01 40 40 40 0 40 40 2536861 1167423 1248968 1409739 0 - 36 6362991 -5 40 117158566 18.41 7 15 30 23 -5 40 4074444 1402980 63287 822035 245 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_renamer.xml --- a/tools/fastx_toolkit/fastx_renamer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ - - - fastx_toolkit - zcat -f $input | fastx_renamer -n $TYPE -o $output -v -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - -**What it does** - -This tool renames the sequence identifiers in a FASTQ/A file. - -.. class:: infomark - -Use this tool at the beginning of your workflow, as a way to keep the original sequence (before trimming, clipping, barcode-removal, etc). 
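The renaming itself is mechanical: walk the FASTQ file four lines at a time and rewrite the '@' and '+' header lines. A minimal Python sketch of the numeric-counter mode (illustrative only; the real fastx_renamer is part of the compiled FASTX-toolkit and supports more modes)::

    import sys
    from itertools import islice

    def rename_to_counter(in_handle, out_handle):
        # FASTQ records are four lines: @id, sequence, +id, qualities
        counter = 0
        while True:
            record = list(islice(in_handle, 4))
            if len(record) < 4:
                break
            counter += 1
            out_handle.write("@%d\n" % counter)
            out_handle.write(record[1])      # sequence line, kept verbatim
            out_handle.write("+%d\n" % counter)
            out_handle.write(record[3])      # quality line, kept verbatim

    if __name__ == "__main__":
        rename_to_counter(sys.stdin, sys.stdout)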
- --------- - -**Example** - -The following Solexa-FASTQ file:: - - @CSHL_4_FC042GAMMII_2_1_517_596 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +CSHL_4_FC042GAMMII_2_1_517_596 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Renamed to **nucleotides sequence**:: - - @GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - -Renamed to **numeric counter**:: - - @1 - GGTCAATGATGAGTTGGCACTGTAGGCACCATCAAT - +1 - 40 40 40 40 40 40 40 40 40 40 38 40 40 40 40 40 14 40 40 40 40 40 36 40 13 14 24 24 9 24 9 40 10 10 15 40 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_reverse_complement.xml --- a/tools/fastx_toolkit/fastx_reverse_complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_reverse_complement -v -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool reverse-complements each sequence in a library. -If the library is a FASTQ, the quality-scores are also reversed. - --------- - -**Example** - -Input FASTQ file:: - - @CSHL_1_FC42AGWWWXX:8:1:3:740 - TGTCTGTAGCCTCNTCCTTGTAATTCAAAGNNGGTA - +CSHL_1_FC42AGWWWXX:8:1:3:740 - 33 33 33 34 33 33 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 27 21 27 33 32 31 29 26 24 5 5 15 17 27 26 - - -Output FASTQ file:: - - @CSHL_1_FC42AGWWWXX:8:1:3:740 - TACCNNCTTTGAATTACAAGGANGAGGCTACAGACA - +CSHL_1_FC42AGWWWXX:8:1:3:740 - 26 27 17 15 5 5 24 26 29 31 32 33 27 21 27 33 33 33 33 33 33 27 5 27 33 33 33 33 33 33 33 33 34 33 33 33 - ------- - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/fastx_toolkit/fastx_trimmer.xml --- a/tools/fastx_toolkit/fastx_trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ - - - fastx_toolkit - zcat -f '$input' | fastx_trimmer -v -f $first -l $last -o $output -#if $input.ext == "fastqsanger": --Q 33 -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool trims (cut bases from) sequences in a FASTA/Q file. - --------- - -**Example** - -Input Fasta file (with 36 bases in each sequences):: - - >1-1 - TATGGTCAGAAACCATATGCAGAGCCTGTAGGCACC - >2-1 - CAGCGAGGCTTTAATGCCATTTGGCTGTAGGCACCA - - -Trimming with First=1 and Last=21, we get a FASTA file with 21 bases in each sequences (starting from the first base):: - - >1-1 - TATGGTCAGAAACCATATGCA - >2-1 - CAGCGAGGCTTTAATGCCATT - -Trimming with First=6 and Last=10, will generate a FASTA file with 5 bases (bases 6,7,8,9,10) in each sequences:: - - >1-1 - TCAGA - >2-1 - AGGCT - - ------ - -This tool is based on `FASTX-toolkit`__ by Assaf Gordon. - - .. __: http://hannonlab.cshl.edu/fastx_toolkit/ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/CreateInterval.pl --- a/tools/filters/CreateInterval.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w - -# Accepts chrom, start, end, name, and strand -# If strand is void sets it to plus -# CreateInterval.pl $chrom $start $end $name $strand $output - -my $strand = "+"; - -die "Not enough arguments\n" unless @ARGV == 6; - -open OUT, ">$ARGV[5]" or die "Cannot open $ARGV[5]:$!\n"; - -$strand = "-" if $ARGV[4] eq "minus"; -$ARGV[3] =~ s/\s+/_/g; -$ARGV[3] =~ s/\t+/_/g; - -print OUT "$ARGV[0]\t$ARGV[1]\t$ARGV[2]\t$ARGV[3]\t0\t$strand\n"; -close OUT; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/CreateInterval.xml --- a/tools/filters/CreateInterval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ - - as a new dataset - CreateInterval.pl $chrom $start $end "$name" $strand $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**TIP**. Once your interval appears in history, you must tell Galaxy which genome it belongs to by clicking pencil icon or the "?" link in the history item. - ------ - -**What it does** - -This tool allows you to create a single genomic interval. The resulting history item will be in the BED format. - ------ - -**Example** - -Typing the following values in the form:: - - Chromosome: chrX - Start position: 151087187 - End position: 151370486 - Name: NM_000808 - Strand: minus - -will create a single interval:: - - chrX 151087187 151370486 NM_000808 0 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_concat_fasta.py --- a/tools/filters/axt_to_concat_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#!/usr/bin/env python -""" -Adapted from bx/scripts/axt_to_concat_fasta.py -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys -import bx.align.axt - -def usage(s=None): - message = """ -axt_to_fasta species1 species2 < axt_file > fasta_file -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # check the command line - species1 = sys.argv[1] - species2 = sys.argv[2] - - # convert the alignment blocks - - reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\ - species1=species1,species2=species2) - sp1text = list() - sp2text = list() - for a in reader: - sp1text.append(a.components[0].text) - sp2text.append(a.components[1].text) - sp1seq = "".join(sp1text) - sp2seq = "".join(sp2text) - print_component_as_fasta(sp1seq,species1) - print_component_as_fasta(sp2seq,species2) - - - -# $$$ this should be moved to a bx.align.fasta module - -def print_component_as_fasta(text,src): - header = ">" + src - print header - print text - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_concat_fasta.xml --- a/tools/filters/axt_to_concat_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - Converts an AXT formatted file to a concatenated FASTA alignment - axt_to_concat_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1 - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - -**Syntax** - -This tool converts an AXT formatted file to the FASTA format, and concatenates the results in the same build. 
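Stripped of the bx-python plumbing the actual converter uses, the concatenation logic reduces to the sketch below. It assumes well-formed AXT input, i.e. every block is exactly one summary line followed by two sequence lines::

    import sys

    def axt_to_concat_fasta(handle, species1, species2):
        # Gather the two aligned sequences of each block, then join them
        lines = [line.strip() for line in handle]
        lines = [line for line in lines if line and not line.startswith("#")]
        pieces = {species1: [], species2: []}
        for i in range(0, len(lines), 3):   # summary + primary + secondary
            pieces[species1].append(lines[i + 1])
            pieces[species2].append(lines[i + 2])
        for name in (species1, species2):
            sys.stdout.write(">%s\n%s\n" % (name, "".join(pieces[name])))

    if __name__ == "__main__":
        axt_to_concat_fasta(sys.stdin, sys.argv[1], sys.argv[2])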
- -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a " >" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. - ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to concatenated FASTA format:: - - >hg16 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGACACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - >mm5 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGACACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_fasta.py --- a/tools/filters/axt_to_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -#!/usr/bin/env python -""" -Adapted from bx/scripts/axt_to_fasta.py -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys -import bx.align.axt - -def usage(s=None): - message = """ -axt_to_fasta species1 species2 < axt_file > fasta_file -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - - # check the command line - species1 = sys.argv[1] - species2 = sys.argv[2] - - # convert the alignment blocks - - reader = bx.align.axt.Reader(sys.stdin,support_ids=True,\ - species1=species1,species2=species2) - - for a in reader: - if ("id" in a.attributes): id = a.attributes["id"] - else: id = None - print_component_as_fasta(a.components[0],id) - print_component_as_fasta(a.components[1],id) - print - - -# $$$ this should be moved to a bx.align.fasta module - -def print_component_as_fasta(c,id=None): - header = ">%s_%s_%s" % (c.src,c.start,c.start+c.size) - if (id != None): header += " " + id - print header - print c.text - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_fasta.xml --- a/tools/filters/axt_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Converts an AXT formatted file to FASTA format - axt_to_fasta.py $dbkey_1 $dbkey_2 < $axt_input > $out_file1 - - - - - - - - - - - - - - 
- - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - - -**Syntax** - -This tool converts an AXT formatted file to the FASTA format. - -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a " >" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. - ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to FASTA format:: - - >hg16.chr19(+):3001012-3001075|hg16_0 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - >mm5.chr11(-):70568380-70568443|mm5_0 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - >hg16.chr19(+):3008279-3008357|hg16_1 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - >mm5.chr11(-):70573976-70574054|mm5_1 - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav.py --- a/tools/filters/axt_to_lav.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -#!/usr/bin/env python -""" -Application to convert AXT file to LAV file -------------------------------------------- - -:Author: Bob Harris (rsharris@bx.psu.edu) -:Version: $Revision: $ - -The application reads an AXT file from standard input and writes a LAV file to -standard out; some statistics are written to standard error. -""" - -import sys, copy -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.align.axt -import bx.align.lav - -assert sys.version_info[:2] >= ( 2, 4 ) - -def usage(s=None): - message = """ -axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file - Each spec is of the form seq_file[:species_name]:lengths_file. - - seq_file should be a format string for the file names for the individual - sequences, with %s to be replaced by the alignment's src field. 
For example, - "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib", - etc. - - species_name is optional. If present, it is prepended to the alignment's src - field. - - Lengths files provide the length of each chromosome (lav format needs this - information but axt file does not contain it). The format is a series of - lines of the form - - The chromosome field in each axt block must match some in - the lengths file. -""" - if (s == None): sys.exit (message) - else: sys.exit ("%s\n%s" % (s,message)) - - -def main(): - global debug - - # parse the command line - - primary = None - secondary = None - silent = False - - # pick off options - - args = sys.argv[1:] - seq_file2 = open(args.pop(-1),'w') - seq_file1 = open(args.pop(-1),'w') - lav_out = args.pop(-1) - axt_in = args.pop(-1) - while (len(args) > 0): - arg = args.pop(0) - val = None - fields = arg.split("=",1) - if (len(fields) == 2): - arg = fields[0] - val = fields[1] - if (val == ""): - usage("missing a value in %s=" % arg) - - if (arg == "--silent") and (val == None): - silent = True - elif (primary == None) and (val == None): - primary = arg - elif (secondary == None) and (val == None): - secondary = arg - else: - usage("unknown argument: %s" % arg) - - if (primary == None): - usage("missing primary file name and length") - - if (secondary == None): - usage("missing secondary file name and length") - - try: - (primaryFile,primary,primaryLengths) = parse_spec(primary) - except: - usage("bad primary spec (must be seq_file[:species_name]:lengths_file") - - try: - (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary) - except: - usage("bad secondary spec (must be seq_file[:species_name]:lengths_file") - - # read the lengths - - speciesToLengths = {} - speciesToLengths[primary] = read_lengths (primaryLengths) - speciesToLengths[secondary] = read_lengths (secondaryLengths) - - # read the alignments - - out = bx.align.lav.Writer(open(lav_out,'w'), \ - attributes = { "name_format_1" : primaryFile, - "name_format_2" : secondaryFile }) - - axtsRead = 0 - axtsWritten = 0 - for axtBlock in bx.align.axt.Reader(open(axt_in), \ - species_to_lengths = speciesToLengths, - species1 = primary, - species2 = secondary, - support_ids = True): - axtsRead += 1 - out.write (axtBlock) - primary_c = axtBlock.get_component_by_src_start(primary) - secondary_c = axtBlock.get_component_by_src_start(secondary) - - print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size) - print >>seq_file1,primary_c.text - print >>seq_file1 - - print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size) - print >>seq_file2,secondary_c.text - print >>seq_file2 - axtsWritten += 1 - - out.close() - seq_file1.close() - seq_file2.close() - - if (not silent): - sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten)) - -def parse_spec(spec): # returns (seq_file,species_name,lengths_file) - fields = spec.split(":") - if (len(fields) == 2): return (fields[0],"",fields[1]) - elif (len(fields) == 3): return (fields[0],fields[1],fields[2]) - else: raise ValueError - -def read_lengths (fileName): - - chromToLength = {} - - f = file (fileName, "r") - - for lineNumber,line in enumerate(f): - line = line.strip() - if (line == ""): continue - if (line.startswith("#")): continue - - fields = line.split () - if (len(fields) != 2): - raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) - - chrom = fields[0] 
- try: - length = int(fields[1]) - except: - raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) - - if (chrom in chromToLength): - raise "%s appears more than once (%s:%d): %s" \ - % (chrom,fileName,lineNumber) - - chromToLength[chrom] = length - - f.close () - - return chromToLength - - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav.xml --- a/tools/filters/axt_to_lav.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ - - Converts an AXT formatted file to LAV format - axt_to_lav.py /galaxy/data/$dbkey_1/seq/%s.nib:$dbkey_1:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_1}.len /galaxy/data/$dbkey_2/seq/%s.nib:$dbkey_2:${GALAXY_DATA_INDEX_DIR}/shared/ucsc/chrom/${dbkey_2}.len $align_input $lav_file $seq_file1 $seq_file2 - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**: AXT formatted alignments will be phased out from Galaxy in the coming weeks. They will be replaced with pairwise MAF alignments, which are already available. To try pairwise MAF alignments use "Extract Pairwise MAF blocks" tool in *Fetch Sequences and Alignments* section. - --------- - - -**Syntax** - -This tool converts an AXT formatted file to the LAV format. - -- **AXT format** The alignments are produced from Blastz, an alignment tool available from Webb Miller's lab at Penn State University. The lav format Blastz output, which does not include the sequence, was converted to AXT format with lavToAxt. Each alignment block in an AXT file contains three lines: a summary line and 2 sequence lines. Blocks are separated from one another by blank lines. - -- **LAV format** LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ. - -- **FASTA format** a text-based format for representing both nucleic and protein sequences, in which base pairs or proteins are represented using a single-letter code. - - - This format contains an one line header. It starts with a ">" symbol. The first word on this line is the name of the sequence. The rest of the line is a description of the sequence. - - The remaining lines contain the sequence itself. - - Blank lines in a FASTA file are ignored, and so are spaces or other gap symbols (dashes, underscores, periods) in a sequence. - - Fasta files containing multiple sequences are just the same, with one sequence listed right after another. This format is accepted for many multiple sequence alignment programs. 
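The converter reads each lengths file into a chromosome-to-length dictionary. A cleaned-up sketch of that parser (the original raises bare string exceptions, which modern Python rejects)::

    def read_lengths(filename):
        # Useful lines are "<chromosome> <length>"; blanks and '#' comments are skipped
        chrom_to_length = {}
        handle = open(filename)
        for number, line in enumerate(handle):
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split()
            if len(fields) != 2:
                raise ValueError("bad lengths line (%s:%d): %s"
                                 % (filename, number + 1, line))
            chrom, length = fields[0], int(fields[1])
            if chrom in chrom_to_length:
                raise ValueError("%s appears more than once (%s:%d)"
                                 % (chrom, filename, number + 1))
            chrom_to_length[chrom] = length
        handle.close()
        return chrom_to_length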
- ------ - -**Example** - -- AXT format:: - - 0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - 1 chr19 3008279 3008357 chr11 70573976 70574054 - 3900 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - -- Convert the above file to LAV format:: - - #:lav - s { - "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1 - "/galaxy/data/mm5/seq/chr11.nib-" 1 121648857 0 1 - } - h { - "> hg16.chr19" - "> mm5.chr11 (reverse complement)" - } - a { - s 3500 - b 3001012 70568380 - e 3001075 70568443 - l 3001012 70568380 3001075 70568443 81 - } - a { - s 3900 - b 3008279 70573976 - e 3008357 70574054 - l 3008279 70573976 3008357 70574054 78 - } - #:eof - -- With two files in the FASTA format:: - - >hg16.chr19_-_3001011_3001075 - TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA - - >hg16.chr19_-_3008278_3008357 - CACAATCTTCACATTGAGATCCTGAGTTGCTGATCAGAATGGAAGGCTGAGCTAAGATGAGCGACGAGGCAATGTCACA - - **and**:: - - >mm5.chr11_-_70568379_70568443 - TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA - - >mm5.chr11_-_70573975_70574054 - CACAGTCTTCACATTGAGGTACCAAGTTGTGGATCAGAATGGAAAGCTAGGCTATGATGAGGGACAGTGCGCTGTCACA - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/axt_to_lav_code.py --- a/tools/filters/axt_to_lav_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - for name,data in out_data.items(): - if name == "seq_file2": - data.dbkey = param_dict['dbkey_2'] - app.model.context.add( data ) - app.model.context.flush() - break \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed2gff.xml --- a/tools/filters/bed2gff.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - converter - bed_to_gff_converter.py $input $out_file1 - - - - - - - - - - - - - - -**What it does** - -This tool converts data from BED format to GFF format (scroll down for format description). - --------- - -**Example** - -The following data in BED format:: - - chr28 346187 388197 BC114771 0 + 346187 388197 0 9 144,81,115,63,155,96,134,105,112, 0,24095,26190,31006,32131,33534,36994,41793,41898, - -Will be converted to GFF (**note** that the start coordinate is incremented by 1):: - - ##gff-version 2 - ##bed_to_gff_converter.py - - chr28 bed2gff mRNA 346188 388197 0 + . mRNA BC114771; - chr28 bed2gff exon 346188 346331 0 + . exon BC114771; - chr28 bed2gff exon 370283 370363 0 + . exon BC114771; - chr28 bed2gff exon 372378 372492 0 + . exon BC114771; - chr28 bed2gff exon 377194 377256 0 + . exon BC114771; - chr28 bed2gff exon 378319 378473 0 + . exon BC114771; - chr28 bed2gff exon 379722 379817 0 + . exon BC114771; - chr28 bed2gff exon 383182 383315 0 + . exon BC114771; - chr28 bed2gff exon 387981 388085 0 + . exon BC114771; - chr28 bed2gff exon 388086 388197 0 + . exon BC114771; - - ------- - -.. class:: informark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. 
chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed_to_bigbed.xml --- a/tools/filters/bed_to_bigbed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - converter - bedToBigBed $input1 $chromInfo $out_file1 - #if $settings.settingsType == "full": - -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.unc} - #end if - 2>&1 || echo "Error running bedToBigBed." >&2 - - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool converts a **sorted** BED file into a bigBed file. - -Currently, the bedFields option to specify the number of non-standard fields is not supported as an AutoSQL file must be provided, which is a format -currently not supported by Galaxy. 
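Note that bedToBigBed requires BED input sorted by chromosome and then by numeric start position. A hypothetical pre-sorting helper, equivalent to ``sort -k1,1 -k2,2n`` (not part of this tool)::

    import sys

    def sort_bed(in_handle, out_handle):
        # Sort by chromosome name, then numeric start (columns 1 and 2)
        def key(line):
            fields = line.split("\t")
            return (fields[0], int(fields[1]))
        data = [line for line in in_handle
                if line.strip() and not line.startswith(("#", "track", "browser"))]
        data.sort(key=key)
        out_handle.writelines(data)

    if __name__ == "__main__":
        sort_bed(sys.stdin, sys.stdout)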
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/bed_to_gff_converter.py --- a/tools/filters/bed_to_gff_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -# This code exists in 2 places: ~/datatypes/converters and ~/tools/filters -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_name = sys.argv[1] - output_name = sys.argv[2] - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - out.write( "##gff-version 2\n" ) - out.write( "##bed_to_gff_converter.py\n\n" ) - i = 0 - for i, line in enumerate( file( input_name ) ): - complete_bed = False - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and not line.startswith( 'track' ) and not line.startswith( 'browser' ): - try: - elems = line.split( '\t' ) - if len( elems ) == 12: - complete_bed = True - chrom = elems[0] - if complete_bed: - feature = "mRNA" - else: - try: - feature = elems[3] - except: - feature = 'feature%d' % ( i + 1 ) - start = int( elems[1] ) + 1 - end = int( elems[2] ) - try: - score = elems[4] - except: - score = '0' - try: - strand = elems[5] - except: - strand = '+' - try: - group = elems[3] - except: - group = 'group%d' % ( i + 1 ) - if complete_bed: - out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s %s;\n' % ( chrom, feature, start, end, score, strand, feature, group ) ) - else: - out.write( '%s\tbed2gff\t%s\t%d\t%d\t%s\t%s\t.\t%s;\n' % ( chrom, feature, start, end, score, strand, group ) ) - if complete_bed: - # We have all the info necessary to annotate exons for genes and mRNAs - block_count = int( elems[9] ) - block_sizes = elems[10].split( ',' ) - block_starts = elems[11].split( ',' ) - for j in range( block_count ): - exon_start = int( start ) + int( block_starts[j] ) - exon_end = exon_start + int( block_sizes[j] ) - 1 - out.write( '%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\texon %s;\n' % ( chrom, exon_start, exon_end, score, strand, group ) ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - out.close() - info_msg = "%i lines converted to GFF version 2. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/catWrapper.py --- a/tools/filters/catWrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#!/usr/bin/env python -#By, Guruprasad Ananda. 
- -from galaxy import eggs -import sys, os - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - outfile = sys.argv[1] - infile = sys.argv[2] - - try: - fout = open(sys.argv[1],'w') - except: - stop_err("Output file cannot be opened for writing.") - - try: - fin = open(sys.argv[2],'r') - except: - stop_err("Input file cannot be opened for reading.") - - if len(sys.argv) < 4: - os.system("cp %s %s" %(infile,outfile)) - sys.exit() - - cmdline = "cat %s " %(infile) - for inp in sys.argv[3:]: - cmdline = cmdline + inp + " " - cmdline = cmdline + ">" + outfile - try: - os.system(cmdline) - except: - stop_err("Error encountered with cat.") - -if __name__ == "__main__": main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/catWrapper.xml --- a/tools/filters/catWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - tail-to-head - - catWrapper.py - $out_file1 - $input1 - #for $q in $queries - ${q.input2} - #end for - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING:** Be careful not to concatenate datasets of different kinds (e.g., sequences with intervals). This tool does not check if the datasets being concatenated are in the same format. - ------ - -**What it does** - -Concatenates datasets - ------ - -**Example** - -Concatenating Dataset:: - - chrX 151087187 151087355 A 0 - - chrX 151572400 151572481 B 0 + - -with Dataset1:: - - chr1 151242630 151242955 X 0 + - chr1 151271715 151271999 Y 0 + - chr1 151278832 151279227 Z 0 - - -and with Dataset2:: - - chr2 100000030 200000955 P 0 + - chr2 100000015 200000999 Q 0 + - -will result in the following:: - - chrX 151087187 151087355 A 0 - - chrX 151572400 151572481 B 0 + - chr1 151242630 151242955 X 0 + - chr1 151271715 151271999 Y 0 + - chr1 151278832 151279227 Z 0 - - chr2 100000030 200000955 P 0 + - chr2 100000015 200000999 Q 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/changeCase.pl --- a/tools/filters/changeCase.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -my $columns = {}; -my $del = ""; -my @in = (); -my @out = (); -my $command = ""; -my $field = 0; - -# a wrapper for changing the case of columns from within galaxy -# isaChangeCase.pl [filename] [columns] [delim] [casing] [output] - -die "Check arguments: $0 [filename] [columns] [delim] [casing] [output]\n" unless @ARGV == 5; - -# process column input -$ARGV[1] =~ s/\s+//g; -foreach ( split /,/, $ARGV[1] ) { - if (m/^c\d{1,}$/i) { - s/c//ig; - $columns->{$_} = --$_; - } -} - -die "No columns specified, columns are not preceeded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if keys %$columns == 0; - -my $column_delimiters_href = { - 'TAB' => q{\t}, - 'COMMA' => ",", - 'DASH' => "-", - 'UNDERSCORE' => "_", - 'PIPE' => q{\|}, - 'DOT' => q{\.}, - 'SPACE' => q{\s+} -}; - -$del = $column_delimiters_href->{$ARGV[2]}; - -open (OUT, ">$ARGV[4]") or die "Cannot create $ARGV[4]:$!\n"; -open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; -while () { - chop; - @in = split /$del/; - for ( my $i = 0; $i <= $#in; ++$i) { - if (exists $columns->{$i}) { - push(@out, $ARGV[3] eq 'up' ? 
uc($in[$i]) : lc($in[$i])); - } else { - push(@out, $in[$i]); - } - } - print OUT join("\t",@out), "\n"; - @out = (); -} -close IN; - -close OUT; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/changeCase.xml --- a/tools/filters/changeCase.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ - - of selected columns - changeCase.pl $input "$cols" $delimiter $casing $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**This tool breaks column assignments.** To re-establish column assignments run the tool and click on the pencil icon in the resulting history item. - -.. class:: warningmark - -The format of the resulting dataset from this tool is always tabular. - ------ - -**What it does** - -This tool selects specified columns from a dataset and converts the values of those columns to upper or lower case. - -- Columns are specified as **c1**, **c2**, and so on. -- Columns can be specified in any order (e.g., **c2,c1,c6**) - ------ - -**Example** - -Changing columns 1 and 3 ( delimited by Comma ) to upper case in:: - - apple,is,good - windows,is,bad - -will result in:: - - APPLE is GOOD - WINDOWS is BAD - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/commWrapper.pl --- a/tools/filters/commWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -use File::Temp "tempfile"; -#use POSIX qw(tmpnam); - -my ($input1, $input2, $mode, $out_file1) = @ARGV; - -my ($fh, $file1) = tempfile(); -my ($fh1,$file2) = tempfile(); - -`sort $input1 > $file1`; -`sort $input2 > $file2`; -`comm $mode $file1 $file2 > $out_file1`; -`rm $file1 ; rm $file2`; - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/commWrapper.xml --- a/tools/filters/commWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - - between two datasets - commWrapper.pl $input1 $input2 $mode $out_file1 - - - - - - - - - - - - -This tool is based on UNIX shell command comm. It compares two datasets and returns similarities or differences. For example, if you have two datasets:: - - a 1 - b 2 - c 3 - -and:: - - a 1 - f 6 - h 8 - -Using this tool with **Lines unique to Dataset1** option will return:: - - b 2 - c 3 - -If you use **Lines shared between Dataset1 and Dataset2** option output will look like this:: - - a 1 - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/compare.xml --- a/tools/filters/compare.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - to find common or distinct rows - joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool finds lines in one dataset that HAVE or DO NOT HAVE a common field with another dataset. 
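In essence this is a hash join on the selected columns: collect the key column of the second dataset into a set, then stream the first dataset and test membership. A minimal sketch of the idea (the tool itself delegates the work to joinWrapper.py)::

    def matching_lines(filename1, filename2, col1, col2, keep_matching=True):
        # Columns are 1-based; datasets are TAB-delimited
        keys = set()
        for line in open(filename2):
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) >= col2:
                keys.add(fields[col2 - 1])
        for line in open(filename1):
            fields = line.rstrip("\r\n").split("\t")
            found = len(fields) >= col1 and fields[col1 - 1] in keys
            if found == keep_matching:
                yield line

    # e.g. list(matching_lines("first.bed", "second.tab", 4, 1)) keeps the lines
    # of the first dataset whose column 4 occurs in column 1 of the second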
- ------ - -**Example** - -If this is **First dataset**:: - - chr1 10 20 geneA - chr1 50 80 geneB - chr5 10 40 geneL - -and this is **Second dataset**:: - - geneA tumor-suppressor - geneB Foxp2 - geneC Gnas1 - geneE INK4a - -Finding lines of the **First dataset** whose 4th column matches the 1st column of the **Second dataset** yields:: - - chr1 10 20 geneA - chr1 50 80 geneB - -Conversely, using option **Non Matching rows of First dataset** on the same fields will yield:: - - chr5 10 40 geneL - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/condense_characters.pl --- a/tools/filters/condense_characters.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# condenses all consecutive characters of one type -# convert_characters.pl [input] [character] [output] - -die "Check arguments" unless @ARGV == 3; - -my $inputfile = $ARGV[0]; -my $character = $ARGV[1]; -my $outputfile = $ARGV[2]; - - -my $convert_from; -my $convert_to; - - -if ($character eq "s") -{ - $convert_from = '\s'; -} -elsif ($character eq "T") -{ - $convert_from = '\t'; -} -elsif ($character eq "Sp") -{ - $convert_from = " "; -} -elsif ($character eq "Dt") -{ - $convert_from = '\.'; -} -elsif ($character eq "C") -{ - $convert_from = ","; -} -elsif ($character eq "D") -{ - $convert_from = "-"; -} -elsif ($character eq "U") -{ - $convert_from = "_"; -} -elsif ($character eq "P") -{ - $convert_from = '\|'; -} -else -{ - die "Invalid value specified for convert from\n"; -} - - -if ($character eq "T") -{ - $convert_to = "\t"; -} -elsif ($character eq "Sp") -{ - $convert_to = " "; -} -elsif ($character eq "Dt") -{ - $convert_to = "\."; -} -elsif ($character eq "C") -{ - $convert_to = ","; -} -elsif ($character eq "D") -{ - $convert_to = "-"; -} -elsif ($character eq "U") -{ - $convert_to = "_"; -} -elsif ($character eq "P") -{ - $convert_to = "|"; -} -else -{ - die "Invalid value specified for Convert to\n"; -} - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - my $thisLine = $_; - chomp $thisLine; - $thisLine =~ s/${convert_from}+/$convert_to/g; - print $fhOut $thisLine,"\n"; -} -close ($fhIn) or die "Cannot close source file"; -close ($fhOut) or die "Cannot close output file"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/condense_characters.xml --- a/tools/filters/condense_characters.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,48 +0,0 @@ - - consecutive characters - condense_characters.pl $input $character $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool condenses all consecutive characters of a specified type. - ------ - -**Example** - -- Input file:: - - geneX,,,10,,,,,20 - geneY,,5,,,,,12,15,9, - -- Condense all consecutive commas. The above file will be converted into:: - - geneX,10,20 - geneY,5,12,15,9 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.pl --- a/tools/filters/convert_characters.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#! 
/usr/bin/perl -w - -use strict; -use warnings; - -# converts all characters of one type into another -# convert_characters.pl [input] [convert_from] [convert_to] [output] - -die "Check argument\n" unless @ARGV == 4; - -my $inputfile = $ARGV[0]; -my $convert_from = $ARGV[1]; -my $convert_to = $ARGV[2]; -my $outputfile = $ARGV[3]; - -if ($convert_from eq "s") -{ - $convert_from = '\s'; -} -elsif ($convert_from eq "T") -{ - $convert_from = '\t'; -} -elsif ($convert_from eq "Sp") -{ - $convert_from = '\s'; -} -elsif ($convert_from eq "Dt") -{ - $convert_from = '\.'; -} -elsif ($convert_from eq "C") -{ - $convert_from = ","; -} -elsif ($convert_from eq "D") -{ - $convert_from = "-"; -} -elsif ($convert_from eq "U") -{ - $convert_from = "_"; -} -elsif ($convert_from eq "P") -{ - $convert_from = '\|'; -} -else -{ - die "Invalid value specified for convert from\n"; -} - - -if ($convert_to eq "T") -{ - $convert_to = "\t"; -} -elsif ($convert_to eq "Sp") -{ - $convert_to = '\s'; -} -elsif ($convert_to eq "Dt") -{ - $convert_to = "\."; -} -elsif ($convert_to eq "C") -{ - $convert_to = ","; -} -elsif ($convert_to eq "D") -{ - $convert_to = "-"; -} -elsif ($convert_to eq "U") -{ - $convert_to = "_"; -} -elsif ($convert_to eq "P") -{ - $convert_to = "|"; -} -else -{ - die "Invalid value specified for convert to\n"; -} - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - my $thisLine = $_; - chomp $thisLine; - $thisLine =~ s/$convert_from{1,}/$convert_to/g; - print $fhOut $thisLine,"\n"; -} -close ($fhIn) or die "Cannot close source file\n"; -close ($fhOut) or die "Cannot close output fil\n"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.py --- a/tools/filters/convert_characters.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python -#By, Guruprasad Ananda. - -from galaxy import eggs -import sys, re - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - if len(sys.argv) != 4: - stop_err("usage: convert_characters infile from_char outfile") - - try: - fin = open(sys.argv[1],'r') - except: - stop_err("Input file cannot be opened for reading.") - - from_char = sys.argv[2] - - try: - fout = open(sys.argv[3],'w') - except: - stop_err("Output file cannot be opened for writing.") - - char_dict = {'T':'\t','s':'\s','Dt':'\.','C':',','D':'-','U':'_','P':'\|','Co':':'} - from_ch = char_dict[from_char] + '+' #making an RE to match 1 or more occurences. - skipped = 0 - - for line in fin: - line = line.strip() - try: - fout.write("%s\n" %(re.sub(from_ch,'\t',line))) - except: - skipped += 1 - - if skipped: - print "Skipped %d lines as invalid." %skipped - -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/convert_characters.xml --- a/tools/filters/convert_characters.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - delimiters to TAB - convert_characters.py $input $convert_from $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts all delimiters of a specified type into TABs. Consecutive characters are condensed. For example, if columns are separated by 5 spaces they will converted into 1 tab. 
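As convert_characters.py above shows, the core of the conversion is a single regular-expression substitution; a condensed sketch using the same delimiter codes::

    import re

    # Delimiter codes as used by the tool (see char_dict above)
    CHAR_PATTERNS = {'T': '\\t', 's': '\\s', 'Dt': '\\.', 'C': ',',
                     'D': '-', 'U': '_', 'P': '\\|', 'Co': ':'}

    def convert_delimiters(line, from_char):
        # One or more consecutive delimiters collapse into a single TAB
        return re.sub(CHAR_PATTERNS[from_char] + '+', '\t', line.strip())

    # convert_delimiters("chrX|151283558|151283724", "P")
    #   -> "chrX\t151283558\t151283724"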
- ------ - -**Example** - -- Input file:: - - chrX||151283558|151283724|NM_000808_exon_8_0_chrX_151283559_r|0|- - chrX|151370273|151370486|NM_000808_exon_9_0_chrX_151370274_r|0|- - chrX|151559494|151559583|NM_018558_exon_1_0_chrX_151559495_f|0|+ - chrX|151564643|151564711|NM_018558_exon_2_0_chrX_151564644_f||||0|+ - -- Converting all pipe delimiters of the above file to TABs will get:: - - chrX 151283558 151283724 NM_000808_exon_8_0_chrX_151283559_r 0 - - chrX 151370273 151370486 NM_000808_exon_9_0_chrX_151370274_r 0 - - chrX 151559494 151559583 NM_018558_exon_1_0_chrX_151559495_f 0 + - chrX 151564643 151564711 NM_018558_exon_2_0_chrX_151564644_f 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/cutWrapper.pl --- a/tools/filters/cutWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; - -my @columns = (); -my $del = ""; -my @in = (); -my @out = (); -my $command = ""; -my $field = 0; - -# a wrapper for cut for use in galaxy -# cutWrapper.pl [filename] [columns] [delim] [output] - -die "Check arguments\n" unless @ARGV == 4; - -$ARGV[1] =~ s/\s+//g; -foreach ( split /,/, $ARGV[1] ) { - if (m/^c\d{1,}$/i) { - push (@columns, $_); - $columns[@columns-1] =~s/c//ig; - } -} - -die "No columns specified, columns are not preceded with 'c', or commas are not used to separate column numbers: $ARGV[1]\n" if @columns == 0; - -my $column_delimiters_href = { - 'T' => q{\t}, - 'C' => ",", - 'D' => "-", - 'U' => "_", - 'P' => q{\|}, - 'Dt' => q{\.}, - 'Sp' => q{\s+} -}; - -$del = $column_delimiters_href->{$ARGV[2]}; - -open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n"; -open (IN, "<$ARGV[0]") or die "Cannot open $ARGV[0]:$!\n"; - -while (my $line=) { - if ($line =~ /^#/) { - #Ignore comment lines - } else { - chop($line); - @in = split(/$del/, $line); - foreach $field (@columns) { - if (defined($in[$field-1])) { - push(@out, $in[$field-1]); - } else { - push(@out, "."); - } - } - print OUT join("\t",@out), "\n"; - @out = (); - } -} - -#while () { -# chop; -# @in = split /$del/; -# foreach $field (@columns) { -# if (defined($in[$field-1])) { -# push(@out, $in[$field-1]); -# } else { -# push(@out, "."); -# } -# } -# print OUT join("\t",@out), "\n"; -# @out = (); -#} -close IN; - -close OUT; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/cutWrapper.xml --- a/tools/filters/cutWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - columns from a table - cutWrapper.pl $input "$columnList" $delimiter $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool breaks column assignments.** To re-establish column assignments run the tools and click on the pencil icon in the latest history item. - -.. class:: infomark - -The output of this tool is always in tabular format (e.g., if your original delimiters are commas, they will be replaced with tabs). For example: - - Cutting columns 1 and 3 from:: - - apple,is,good - windows,is,bad - - will give:: - - apple good - windows bad - ------ - -**What it does** - -This tool selects (cuts out) specified columns from the dataset. - -- Columns are specified as **c1**, **c2**, and so on. 
Column count begins with **1** -- Columns can be specified in any order (e.g., **c2,c1,c6**) -- If you specify more columns than actually present - empty spaces will be filled with dots - ------ - -**Example** - -Input dataset (six columns: c1, c2, c3, c4, c5, and c6):: - - chr1 10 1000 gene1 0 + - chr2 100 1500 gene2 0 + - -**cut** on columns "**c1,c4,c6**" will return:: - - chr1 gene1 + - chr2 gene2 + - -**cut** on columns "**c6,c5,c4,c1**" will return:: - - + 0 gene1 chr1 - + 0 gene2 chr2 - - -**cut** on columns "**c8,c7,c4**" will return:: - - . . gene1 - . . gene2 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fileGrep.xml --- a/tools/filters/fileGrep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - a column from one Query against another Query - cut -f $col $input1 | grep -f - $match $input2 > $out_file1 - - - - - - - - - - - - - -This tool is based on UNIX command grep with option -f. It matches content of one query against another. For example, assume you have two queries - one that contains EST accession numbers and some other information:: - - AA001229 12 12 - A001501 7 7 - AA001641 6 6 - AA001842 6 6 - AA002047 6 6 - AA004638 3 3 - -and another that is a typical BED file describing genomic location of some ESTs:: - - chr7 115443235 115443809 CA947954_exon_0_0_chr7_115443236_f 0 + - chr7 115443236 115443347 DB338189_exon_0_0_chr7_115443237_f 0 + - chr7 115443347 115443768 DB338189_exon_1_0_chr7_115443348_f 0 + - chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 + - chr7 115443243 115443347 DB331869_exon_0_0_chr7_115443244_f 0 + - chr7 115443347 115443373 DB331869_exon_1_0_chr7_115443348_f 0 + - -Using this tool you will be able to tell how many ESTs in Query1 are also preset in Query2 and will output this:: - - chr7 115443239 115443802 AA001842_exon_0_0_chr7_115443240_f 0 - -if **Match** option is chosen. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fixedValueColumn.pl --- a/tools/filters/fixedValueColumn.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# fixedValueColumn.pl $input $out_file1 "expression" "iterate [yes|no]" - -my ($input, $out_file1, $expression, $iterate) = @ARGV; -my $i = 0; -my $numeric = 0; - -die "Check arguments\n" unless @ARGV == 4; - -open (DATA, "<$input") or die "Cannot open $input:$!\n"; -open (OUT, ">$out_file1") or die "Cannot create $out_file1:$!\n"; - -if ($expression =~ m/^\d+$/) { - $numeric = 1; - $i = $expression; -} - -while () { - chop; - if ($iterate eq "no") { - print OUT "$_\t$expression\n"; - } else { - print OUT "$_\t$i\n" if $numeric == 1; - print OUT "$_\t$expression-$i\n" if $numeric == 0; - ++$i; - } -} - -close DATA; -close OUT; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/fixedValueColumn.xml --- a/tools/filters/fixedValueColumn.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - to an existing dataset - fixedValueColumn.pl $input $out_file1 "$exp" $iterate - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -You can enter any value and it will be added as a new column to your dataset - ------ - -**Example** - -If you original data looks like this:: - - chr1 10 100 geneA - chr2 200 300 geneB - chr2 400 500 geneC - -Typing **+** in the text box will generate:: - - chr1 10 100 geneA + - chr2 200 300 geneB + - chr2 400 500 geneC + - - -You can also add line numbers by selecting **Iterate: YES**. In this case if you enter **1** in the text box you will get:: - - chr1 10 100 geneA 1 - chr2 200 300 geneB 2 - chr2 400 500 geneC 3 - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/extract_GFF_Features.py --- a/tools/filters/gff/extract_GFF_Features.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Extract features from GFF file. - -usage: %prog input1 out_file1 column features -""" - -import sys, os - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp_file, out_file, column, features = args - except: - stop_err( "One or more arguments is missing or invalid.\nUsage: prog input output column features" ) - try: - column = int( column ) - except: - stop_err( "Column %s is an invalid column." % column ) - - if features == None: - stop_err( "Column %d has no features to display, select another column." %( column + 1 ) ) - - fo=open( out_file, 'w' ) - for i, line in enumerate( file( inp_file ) ): - line = line.rstrip( '\r\n' ) - if line and line.startswith( '#' ): - # Keep valid comment lines in the output - fo.write( "%s\n" % line ) - else: - try: - if line.split( '\t' )[column] in features.split( ',' ): - fo.write( "%s\n" % line ) - except: - pass - fo.close() - - print 'Column %d features: %s' %( column + 1, features ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/extract_GFF_Features.xml --- a/tools/filters/gff/extract_GFF_Features.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ - - from GFF data - extract_GFF_Features.py $input1 $out_file1 ${column_choice.col} ${column_choice.feature} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool extracts selected features from GFF data. - ------ - -**Example** - -Selecting **promoter** from the following GFF data:: - - chr22 GeneA enhancer 10000000 10001000 500 + . TGA - chr22 GeneA promoter 10010000 10010100 900 + . TGA - chr22 GeneB promoter 10020000 10025000 400 - . TGB - chr22 GeneB CCDS2220 10030000 10065000 800 - . TGB - -will produce the following output:: - - chr22 GeneA promoter 10010000 10010100 900 + . TGA - chr22 GeneB promoter 10020000 10025000 400 - . TGB - ----- - -.. class:: infomark - -**About formats** - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. 
feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_attribute.py --- a/tools/filters/gff/gff_filter_by_attribute.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,163 +0,0 @@ -#!/usr/bin/env python -# This tool takes a gff file as input and creates filters on attributes based on certain properties. -# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped. -# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be -# abstracted and leveraged in each filtering tool. - -from __future__ import division -import sys -from galaxy import eggs -from galaxy.util.json import to_json_string, from_json_string - -# Older py compatibility -try: - set() -except: - from sets import Set as set - -assert sys.version_info[:2] >= ( 2, 4 ) - -# -# Helper functions. -# - -def get_operands( filter_condition ): - # Note that the order of all_operators is important - items_to_strip = ['+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~', '<=', '<', '>=', '>', '==', '!=', '<>', ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in '] - for item in items_to_strip: - if filter_condition.find( item ) >= 0: - filter_condition = filter_condition.replace( item, ' ' ) - operands = set( filter_condition.split( ' ' ) ) - return operands - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def check_for_executable( text, description='' ): - # Attempt to determine if the condition includes executable stuff and, if so, exit. - secured = dir() - operands = get_operands( text ) - for operand in operands: - try: - check = int( operand ) - except: - if operand in secured: - stop_err( "Illegal value '%s' in %s '%s'" % ( operand, description, text ) ) - -# -# Process inputs. -# - -in_fname = sys.argv[1] -out_fname = sys.argv[2] -cond_text = sys.argv[3] -attribute_types = from_json_string( sys.argv[4] ) - -# Convert types from str to type objects. -for name, a_type in attribute_types.items(): - check_for_executable(a_type) - attribute_types[ name ] = eval( a_type ) - -# Unescape if input has been escaped -mapped_str = { - '__lt__': '<', - '__le__': '<=', - '__eq__': '==', - '__ne__': '!=', - '__gt__': '>', - '__ge__': '>=', - '__sq__': '\'', - '__dq__': '"', -} -for key, value in mapped_str.items(): - cond_text = cond_text.replace( key, value ) - -# Attempt to determine if the condition includes executable stuff and, if so, exit. -check_for_executable( cond_text, 'condition') - -# Prepare the column variable names and wrappers for column data types. Only -# prepare columns up to largest column in condition. 
-attrs, type_casts = [], [] -for name, attr_type in attribute_types.items(): - attrs.append( name ) - type_cast = "get_value('%(name)s', attribute_types['%(name)s'], attribute_values)" % ( {'name': name} ) - type_casts.append( type_cast ) - -attr_str = ', '.join( attrs ) # 'c1, c2, c3, c4' -type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)' -wrap = "%s = %s" % ( attr_str, type_cast_str ) - -# Stats -skipped_lines = 0 -first_invalid_line = 0 -invalid_line = None -lines_kept = 0 -total_lines = 0 -out = open( out_fname, 'wt' ) - -# Helper function to safely get and type cast a value in a dict. -def get_value(name, a_type, values_dict): - if name in values_dict: - return (a_type)(values_dict[ name ]) - else: - return None - -# Read and filter input file, skipping invalid lines -code = ''' -for i, line in enumerate( file( in_fname ) ): - total_lines += 1 - line = line.rstrip( '\\r\\n' ) - if not line or line.startswith( '#' ): - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - continue - try: - # Place attribute values into variables with attribute - # name; type casting is done as well. - elems = line.split( '\t' ) - attribute_values = {} - for name_value_pair in elems[8].split(";"): - pair = name_value_pair.strip().split(" ") - if pair == '': - continue - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from value and typecast. - attribute_values[name] = pair[1].strip(" \\"") - %s - if %s: - lines_kept += 1 - print >> out, line - except Exception, e: - print e - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line -''' % ( wrap, cond_text ) - -valid_filter = True -try: - exec code -except Exception, e: - out.close() - if str( e ).startswith( 'invalid syntax' ): - valid_filter = False - stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text ) - else: - stop_err( str( e ) ) - -if valid_filter: - out.close() - valid_lines = total_lines - skipped_lines - print 'Filtering with %s, ' % ( cond_text ) - if valid_lines > 0: - print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) - else: - print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text - if skipped_lines > 0: - print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_attribute.xml --- a/tools/filters/gff/gff_filter_by_attribute.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - using simple expressions - - gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}' - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -Double equal signs, ==, must be used as *"equal to"* (e.g., **c1 == 'chr22'**) - -.. class:: infomark - -**TIP:** Attempting to apply a filtering condition may throw exceptions if the data type (e.g., string, integer) in every line of the attribute being filtered is not appropriate for the condition (e.g., attempting certain numerical calculations on strings). If an exception is thrown when applying the condition to a line, that line is skipped as invalid for the filter condition. The number of invalid skipped lines is documented in the resulting history item as a "Condition/data issue". - -.. 
class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -The filter tool allows you to restrict the dataset using simple conditional statements. - -- Make sure that multi-character operators contain no white space ( e.g., **<=** is valid while **< =** is not valid ) -- When using 'equal-to' operator **double equal sign '==' must be used** ( e.g., **attribute_name=='chr1'** ) -- Non-numerical values must be included in single or double quotes ( e.g., **attribute_name=='XX22'** ) - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_feature_count.py --- a/tools/filters/gff/gff_filter_by_feature_count.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -#!/usr/bin/env python -""" -Filter a gff file using a criterion based on feature counts for a transcript. - -Usage: -%prog input_name output_name feature_name condition -""" -import sys -from galaxy import eggs -from galaxy.datatypes.util.gff_util import GFFReaderWrapper -from bx.intervals.io import GenomicInterval - -# Valid operators, ordered so that complex operators (e.g. '>=') are -# recognized before simple operators (e.g. '>') -ops = [ - '>=', - '<=', - '<', - '>', - '==', - '!=' -] - -# Escape sequences for valid operators. -mapped_ops = { - '__ge__': ops[0], - '__le__': ops[1], - '__lt__': ops[2], - '__gt__': ops[3], - '__eq__': ops[4], - '__ne__': ops[5], -} - - -def __main__(): - # Get args. - input_name = sys.argv[1] - output_name = sys.argv[2] - feature_name = sys.argv[3] - condition = sys.argv[4] - - # Unescape operations in condition str. - for key, value in mapped_ops.items(): - condition = condition.replace( key, value ) - - # Error checking: condition should be of the form - for op in ops: - if op in condition: - empty, number_str = condition.split( op ) - try: - number = float( number_str ) - except: - number = None - if empty != "" or not number: - print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition - return - break - - # Do filtering. - kept_features = 0 - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ): - if not isinstance( feature, GenomicInterval ): - continue - count = 0 - for interval in feature.intervals: - if interval.feature == feature_name: - count += 1 - if eval( '%s %s' % ( count, condition ) ): - # Keep feature. - for interval in feature.intervals: - out.write( "\t".join(interval.fields) + '\n' ) - kept_features += 1 - - # Needed because i is 0-based but want to display stats using 1-based. - i += 1 - - # Clean up. - out.close() - info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \ - ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gff_filter_by_feature_count.xml --- a/tools/filters/gff/gff_filter_by_feature_count.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - using simple expressions - - gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -Valid comparison operators are: > < >=, <=, !=, and == - ------ - -**Syntax** - -The filter tool allows you to restrict the dataset based on transcripts' feature counts. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gtf_filter_by_attribute_values_list.py --- a/tools/filters/gff/gtf_filter_by_attribute_values_list.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ -# -# Filters a GFF file using a list of attribute values. Attribute values must -# be in the first column of the file; subsequent columns are ignored. -# Usage: -# python gff_filter_by_attribute_values.py -# - -import sys - -def parse_gff_attributes( attr_str ): - """ - Parses a GFF/GTF attribute string and returns a dictionary of name-value - pairs. The general format for a GFF3 attributes string is - name1=value1;name2=value2 - The general format for a GTF attribute string is - name1 "value1" ; name2 "value2" - The general format for a GFF attribute string is a single string that - denotes the interval's group; in this case, method returns a dictionary - with a single key-value pair, and key name is 'group' - """ - attributes_list = attr_str.split(";") - attributes = {} - for name_value_pair in attributes_list: - # Try splitting by space and, if necessary, by '=' sign. - pair = name_value_pair.strip().split(" ") - if len( pair ) == 1: - pair = name_value_pair.strip().split("=") - if len( pair ) == 1: - # Could not split for some reason -- raise exception? - continue - if pair == '': - continue - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[ name ] = value - - if len( attributes ) == 0: - # Could not split attributes string, so entire string must be - # 'group' attribute. This is the case for strictly GFF files. - attributes['group'] = attr_str - return attributes - -def filter( gff_file, attribute_name, ids_file, output_file ): - # Put ids in dict for quick lookup. - ids_dict = {} - for line in open( ids_file ): - ids_dict[ line.split('\t')[0].strip() ] = True - - # Filter GFF file using ids. - output = open( output_file, 'w' ) - for line in open( gff_file ): - fields = line.split( '\t' ) - attributes = parse_gff_attributes( fields[8] ) - if ( attribute_name in attributes ) and ( attributes[ attribute_name ] in ids_dict ): - output.write( line ) - output.close() - -if __name__ == "__main__": - # Handle args. - if len( sys.argv ) != 5: - print >> sys.stderr, "usage: python %s " % sys.argv[0] - sys.exit( -1 ) - gff_file, attribute_name, ids_file, output_file = sys.argv[1:] - filter( gff_file, attribute_name, ids_file, output_file ) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff/gtf_filter_by_attribute_values_list.xml --- a/tools/filters/gff/gtf_filter_by_attribute_values_list.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - - - gtf_filter_by_attribute_values_list.py $input $attribute_name $ids $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool filters a GTF file using a list of attribute values. The attribute values are -taken from the first column in the file; additional columns in the file are ignored. An example -use of this tool is to filter a GTF file using a list of transcript_ids or gene_ids obtained from Cuffdiff. 
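-
-The core of the filter is small enough to sketch inline. This minimal
-version reuses the parse_gff_attributes helper defined above; the file
-names and the choice of transcript_id as the attribute stand in for the
-tool's parameters::
-
-    ids = set(line.split('\t')[0].strip() for line in open('ids.txt'))
-    with open('filtered.gtf', 'w') as out:
-        for line in open('input.gtf'):
-            fields = line.split('\t')
-            if len(fields) > 8:
-                attrs = parse_gff_attributes(fields[8])
-                if attrs.get('transcript_id') in ids:
-                    out.write(line)
-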
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff2bed.xml --- a/tools/filters/gff2bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - converter - gff_to_bed_converter.py $input $out_file1 - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from GFF format to BED format (scroll down for format description). - --------- - -**Example** - -The following data in GFF format:: - - chr22 GeneA enhancer 10000000 10001000 500 + . TGA - chr22 GeneA promoter 10010000 10010100 900 + . TGA - -Will be converted to BED (**note** that 1 is subtracted from the start coordinate):: - - chr22 9999999 10001000 enhancer 0 + - chr22 10009999 10010100 promoter 0 + - ------- - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - -**GFF format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - All lines with the same group are linked together into a single item. 
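-
-The key arithmetic in the conversion is the coordinate change: GFF
-coordinates are 1-based with inclusive ends, while BED is 0-based and
-half-open, so only the start moves. As a one-function sketch::
-
-    def gff_to_bed_coords(gff_start, gff_end):
-        # GFF: 1-based, end inclusive; BED: 0-based, end exclusive.
-        # 10000000..10001000 in GFF becomes 9999999..10001000 in BED.
-        return gff_start - 1, gff_end
-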
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gff_to_bed_converter.py --- a/tools/filters/gff_to_bed_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python -import sys -from galaxy import eggs -from galaxy.datatypes.util.gff_util import parse_gff_attributes - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_bed_line( chrom, name, strand, blocks ): - """ Returns a BED line for given data. """ - - - if len( blocks ) == 1: - # Use simple BED format if there is only a single block: - # chrom, chromStart, chromEnd, name, score, strand - # - start, end = blocks[0] - return "%s\t%i\t%i\t%s\t0\t%s\n" % ( chrom, start, end, name, strand ) - - # - # Build lists for transcript blocks' starts, sizes. - # - - # Get transcript start, end. - t_start = sys.maxint - t_end = -1 - for block_start, block_end in blocks: - if block_start < t_start: - t_start = block_start - if block_end > t_end: - t_end = block_end - - # Get block starts, sizes. - block_starts = [] - block_sizes = [] - for block_start, block_end in blocks: - block_starts.append( str( block_start - t_start ) ) - block_sizes.append( str( block_end - block_start ) ) - - # - # Create BED entry. - # Bed format: chrom, chromStart, chromEnd, name, score, strand, \ - # thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts - # - # Render complete feature with thick blocks. There's no clear way to do this unless - # we analyze the block names, but making everything thick makes more sense than - # making everything thin. - # - return "%s\t%i\t%i\t%s\t0\t%s\t%i\t%i\t0\t%i\t%s\t%s\n" % \ - ( chrom, t_start, t_end, name, strand, t_start, t_end, len( block_starts ), - ",".join( block_sizes ), ",".join( block_starts ) ) - -def __main__(): - input_name = sys.argv[1] - output_name = sys.argv[2] - skipped_lines = 0 - first_skipped_line = 0 - out = open( output_name, 'w' ) - i = 0 - cur_transcript_chrom = None - cur_transcript_id = None - cur_transcript_strand = None - cur_transcripts_blocks = [] # (start, end) for each block. - for i, line in enumerate( file( input_name ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - try: - # GFF format: chrom source, name, chromStart, chromEnd, score, strand, attributes - elems = line.split( '\t' ) - start = str( long( elems[3] ) - 1 ) - coords = [ long( start ), long( elems[4] ) ] - strand = elems[6] - if strand not in ['+', '-']: - strand = '+' - attributes = parse_gff_attributes( elems[8] ) - t_id = attributes.get( "transcript_id", None ) - - if not t_id: - # - # No transcript ID, so write last transcript and write current line as its own line. - # - - # Write previous transcript. - if cur_transcript_id: - # Write BED entry. - out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - - # Replace any spaces in the name with underscores so UCSC will not complain. - name = elems[2].replace(" ", "_") - out.write( get_bed_line( elems[0], name, strand, [ coords ] ) ) - continue - - # There is a transcript ID, so process line at transcript level. - if t_id == cur_transcript_id: - # Line is element of transcript and will be a block in the BED entry. - cur_transcripts_blocks.append( coords ) - continue - - # - # Line is part of new transcript; write previous transcript and start - # new transcript. - # - - # Write previous transcript. - if cur_transcript_id: - # Write BED entry. 
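-                    # A transcript is flushed as a single multi-block BED
-                    # line the first time a line with a different
-                    # transcript_id (or none at all) is seen; its blocks
-                    # were accumulated in cur_transcripts_blocks.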
- out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - - # Start new transcript. - cur_transcript_chrome = elems[0] - cur_transcript_id = t_id - cur_transcript_strand = strand - cur_transcripts_blocks = [] - cur_transcripts_blocks.append( coords ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - - # Write last transcript. - if cur_transcript_id: - # Write BED entry. - out.write( get_bed_line( cur_transcript_chrome, cur_transcript_id, cur_transcript_strand, cur_transcripts_blocks ) ) - out.close() - info_msg = "%i lines converted to BED. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/grep.py --- a/tools/filters/grep.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -# Filename: grep.py -# Author: Ian N. Schenck -# Version: 8/23/2005 -# -# This script accepts regular expressions, as well as an "invert" -# option, and applies the regular expression using grep. This wrapper -# provides security and pipeline. -# -# Grep is launched based on these inputs: -# -i Input file -# -o Output file -# -pattern RegEx pattern -# -v true or false (output NON-matching lines) - -import sys -import os -import re -import string -import commands -from tempfile import NamedTemporaryFile - -# This function is exceedingly useful, perhaps package for reuse? -def getopts(argv): - opts = {} - while argv: - if argv[0][0] == '-': - opts[argv[0]] = argv[1] - argv = argv[2:] - else: - argv = argv[1:] - return opts - -def main(): - args = sys.argv[1:] - - try: - opts = getopts(args) - except IndexError: - print "Usage:" - print " -i Input file" - print " -o Output file" - print " -pattern RegEx pattern" - print " -v true or false (Invert match)" - return 0 - - outputfile = opts.get("-o") - if outputfile == None: - print "No output file specified." - return -1 - - inputfile = opts.get("-i") - if inputfile == None: - print "No input file specified." - return -2 - - invert = opts.get("-v") - if invert == None: - print "Match style (Invert or normal) not specified." - return -3 - - pattern = opts.get("-pattern") - if pattern == None: - print "RegEx pattern not specified." - return -4 - - # All inputs have been specified at this point, now validate. - - # replace if input has been escaped, remove sq - # characters that are allowed but need to be escaped - mapped_chars = { '>' :'__gt__', - '<' :'__lt__', - '\'' :'__sq__', - '"' :'__dq__', - '[' :'__ob__', - ']' :'__cb__', - '{' :'__oc__', - '}' :'__cc__' - } - - #with new sanitizing we only need to replace for single quote, but this needs to remain for backwards compatibility - for key, value in mapped_chars.items(): - pattern = pattern.replace(value, key) - - fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$") #why? - invertRegEx = re.compile("(true)|(false)") #why? - - if not fileRegEx.match(outputfile): - print "Illegal output filename." - return -5 - if not fileRegEx.match(inputfile): - print "Illegal input filename." - return -6 - if not invertRegEx.match(invert): - print "Illegal invert option." - return -7 - - # invert grep search? 
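-    # The user-supplied pattern never reaches the shell: it is written to a
-    # temporary file below and handed to grep via -f, so only the fixed
-    # command-line skeleton is interpolated.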
- if invert == "true": - invertflag = " -v" - print "Not matching pattern: %s" % pattern - else: - invertflag = "" - print "Matching pattern: %s" % pattern - - #Create temp file holding pattern - #By using a file to hold the pattern, we don't have worry about sanitizing grep commandline and can include single quotes in pattern - pattern_file_name = NamedTemporaryFile().name - open( pattern_file_name, 'w' ).write( pattern ) - - #generate grep command - commandline = "grep -E %s -f %s %s > %s" % ( invertflag, pattern_file_name, inputfile, outputfile ) - - #run grep - errorcode, stdout = commands.getstatusoutput(commandline) - - #remove temp pattern file - os.unlink( pattern_file_name ) - - #return error code - return errorcode - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/grep.xml --- a/tools/filters/grep.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ - - lines that match an expression - grep.py -i $input -o $out_file1 -pattern '$pattern' -v $invert - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -The select tool searches the data for lines containing or not containing a match to the given pattern. Regular Expression is introduced in this tool. A Regular Expression is a pattern describing a certain amount of text. - -- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. -- **\\A** matches the beginning of a string(but not an internal line). -- **\\d** matches a digit, same as [0-9]. -- **\\D** matches a non-digit. -- **\\s** matches a whitespace character. -- **\\S** matches anything BUT a whitespace. -- **\\t** matches a tab. -- **\\w** matches an alphanumeric character. -- **\\W** matches anything but an alphanumeric character. -- **(** .. **)** groups a particular pattern. -- **\\Z** matches the end of a string(but not a internal line). -- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. - - - **{n}** The preceding item is matched exactly n times. - - **{n,}** The preceding item is matched n or more times. - - **{n,m}** The preceding item is matched at least n times but not more than m times. - -- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. -- **.** Matches any single character except a newline. -- ***** The preceding item will be matched zero or more times. -- **?** The preceding item is optional and matched at most once. -- **+** The preceding item will be matched one or more times. -- **^** has two meaning: - - matches the beginning of a line or string. - - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. -- **$** matches the end of a line or string. -- **\|** Separates alternate possibilities. - ------ - -**Example** - -- **^chr([0-9A-Za-z])+** would match lines that begin with chromosomes, such as lines in a BED format file. -- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively. -- **([^,][0-9]{1,3})(,[0-9]{3})\*** would match a large integer that is properly separated with commas such as 23,078,651. -- **(abc)|(def)** would match either "abc" or "def". 
-- **^\\W+#** would match any line that is a comment. - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gtf2bedgraph.xml --- a/tools/filters/gtf2bedgraph.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ - - converter - gtf_to_bedgraph_converter.py $input $out_file1 $attribute_name - - - - - - - - - - - - - - - - -**What it does** - -This tool converts data from GTF format to BEDGraph format (scroll down for format description). - --------- - -**Example** - -The following data in GFF format:: - - chr22 GeneA enhancer 10000000 10001000 500 + . gene_id "GeneA"; transcript_id "TranscriptAlpha"; FPKM "2.75"; frac "1.000000"; - chr22 GeneA promoter 10010000 10010100 900 + . gene_id "GeneA"; transcript_id "TranscriptsAlpha"; FPKM "2.25"; frac "1.000000"; - -using the attribute name 'FPKM' will be converted to BEDGraph (**note** that 1 is subtracted from the start coordinate):: - - - chr22 9999999 10001000 2.75 - chr22 10009999 10010100 2.25 - ------- - -.. class:: infomark - -**About formats** - -**GTF format** Gene Transfer Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GTF lines have nine tab-separated fields:: - - 1. seqname - Must be a chromosome or scaffold. - 2. source - The program that generated this feature. - 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", "start_codon", "stop_codon", and "exon". - 4. start - The starting position of the feature in the sequence. The first base is numbered 1. - 5. end - The ending position of the feature (inclusive). - 6. score - A score between 0 and 1000. If there is no score value, enter ".". - 7. strand - Valid entries include '+', '-', or '.' (for don't know/care). - 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'. - 9. group - The group field is a list of attributes. Each attribute consists of a type/value pair. Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space. The attribute list must begin with the two mandatory attributes: (i) gene_id value - A globally unique identifier for the genomic source of the sequence and (ii) transcript_id value - A globally unique identifier for the predicted transcript. - -**BEDGraph format** - -The bedGraph format is line-oriented. Bedgraph data are preceeded by a track definition line, which adds a number of options for controlling the default display of this track. 
- -For the track definition line, all options are placed in a single line separated by spaces: - track type=bedGraph name=track_label description=center_label - visibility=display_mode color=r,g,b altColor=r,g,b - priority=priority autoScale=on|off alwaysZero=on|off - gridDefault=on|off maxHeightPixels=max:default:min - graphType=bar|points viewLimits=lower:upper - yLineMark=real-value yLineOnOff=on|off - windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 - -The track type is REQUIRED, and must be bedGraph: - type=bedGraph - -Following the track definition line are the track data in four column BED format:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/gtf_to_bedgraph_converter.py --- a/tools/filters/gtf_to_bedgraph_converter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/usr/bin/env python -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - # Read parms. - input_name = sys.argv[1] - output_name = sys.argv[2] - attribute_name = sys.argv[3] - - # Create temp files. - tmp_name1 = tempfile.NamedTemporaryFile().name - tmp_name2 = tempfile.NamedTemporaryFile().name - - # Do conversion. - skipped_lines = 0 - first_skipped_line = 0 - out = open( tmp_name1, 'w' ) - - # Write track data to temporary file. - i = 0 - for i, line in enumerate( file( input_name ) ): - line = line.rstrip( '\r\n' ) - - if line and not line.startswith( '#' ): - try: - elems = line.split( '\t' ) - start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based. - strand = elems[6] - if strand not in ['+', '-']: - strand = '+' - attributes_list = elems[8].split(";") - attributes = {} - for name_value_pair in attributes_list: - pair = name_value_pair.strip().split(" ") - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[name] = value - value = attributes[ attribute_name ] - # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. - # BedGraph format: chrom, chromStart, chromEnd, value - out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) ) - except: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - out.close() - - # Sort tmp file by chromosome name and chromosome start to create ordered track data. - cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 ) - try: - os.system(cmd) - os.remove(tmp_name1) - except Exception, ex: - sys.stderr.write( "%s\n" % ex ) - sys.exit(1) - - # Create bedgraph file by combining track definition with ordered track data. - cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name ) - try: - os.system(cmd) - os.remove(tmp_name2) - except Exception, ex: - sys.stderr.write( "%s\n" % ex ) - sys.exit(1) - - info_msg = "%i lines converted to BEDGraph. " % ( i + 1 - skipped_lines ) - if skipped_lines > 0: - info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) - print info_msg - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/headWrapper.pl --- a/tools/filters/headWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! 
/usr/bin/perl -w - -use strict; -use warnings; - -# a wrapper for head for use in galaxy -# headWrapper.pl [filename] [# lines to show] [output] - -die "Check arguments" unless @ARGV == 3; -die "Line number must be an integer\n" unless $ARGV[1]=~ m/^\d+$/; - -open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; -open (HEAD, "head -n $ARGV[1] $ARGV[0]|") or die "Cannot run head:$!\n"; -while () { - print OUT; -} -close OUT; -close HEAD; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/headWrapper.xml --- a/tools/filters/headWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - lines from a dataset - headWrapper.pl $input $lineNum $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs specified number of lines from the **beginning** of a dataset - ------ - -**Example** - -Selecting 2 lines from this:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -will produce:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/join.py --- a/tools/filters/join.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,370 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Script to Join Two Files on specified columns. - -Takes two tab delimited files, two column numbers (base 1) and outputs a new tab delimited file with lines joined by tabs. -User can also opt to have have non-joining rows of file1 echoed. - -""" - -import optparse, os, sys, tempfile, struct -import psyco_full - -try: - simple_json_exception = None - from galaxy import eggs - from galaxy.util.bunch import Bunch - from galaxy.util import stringify_dictionary_keys - import pkg_resources - pkg_resources.require("simplejson") - import simplejson -except Exception, e: - simplejson_exception = e - simplejson = None - - -class OffsetList: - def __init__( self, filesize = 0, fmt = None ): - self.file = tempfile.NamedTemporaryFile( 'w+b' ) - if fmt: - self.fmt = fmt - elif filesize and filesize <= sys.maxint * 2: - self.fmt = 'I' - else: - self.fmt = 'Q' - self.fmt_size = struct.calcsize( self.fmt ) - @property - def size( self ): - self.file.flush() - return self.file_size / self.fmt_size - @property - def file_size( self ): - self.file.flush() - return os.stat( self.file.name ).st_size - def add_offset( self, offset ): - if not isinstance( offset, list ): - offset = [offset] - self.file.seek( self.file_size ) - for off in offset: - self.file.write( struct.pack( self.fmt, off ) ) - def get_offsets( self, start = 0 ): - self.file.seek( start * self.fmt_size ) - while True: - packed = self.file.read( self.fmt_size ) - if not packed: break - yield struct.unpack( self.fmt, packed )[0] - def get_offset_by_index( self, index ): - self.file.seek( index * self.fmt_size ) - return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0] - def set_offset_at_index( self, index, offset ): - if not isinstance( offset, list ): - offset = [offset] - if index >= self.size: - self.add_offset( offset ) - else: - temp_file = tempfile.NamedTemporaryFile( 'w+b' ) - self.file.seek( 0 ) - temp_file.write( self.file.read( ( index ) * self.fmt_size ) ) - for off in offset: - temp_file.write( struct.pack( self.fmt, off ) ) - temp_file.write( self.file.read() ) - self.file = temp_file - -class 
SortedOffsets( OffsetList ): - def __init__( self, indexed_filename, column, split = None ): - OffsetList.__init__( self, os.stat( indexed_filename ).st_size ) - self.indexed_filename = indexed_filename - self.indexed_file = open( indexed_filename, 'rb' ) - self.column = column - self.split = split - self.last_identifier = None - self.last_identifier_merged = None - self.last_offset_merged = 0 - def merge_with_dict( self, new_offset_dict ): - if not new_offset_dict: return #no items to merge in - keys = new_offset_dict.keys() - keys.sort() - identifier2 = keys.pop( 0 ) - - result_offsets = OffsetList( fmt = self.fmt ) - offsets1 = enumerate( self.get_offsets() ) - try: - index1, offset1 = offsets1.next() - identifier1 = self.get_identifier_by_offset( offset1 ) - except StopIteration: - offset1 = None - identifier1 = None - index1 = 0 - - while True: - if identifier1 is None and identifier2 is None: - self.file = result_offsets.file #self is now merged results - return - elif identifier1 is None or ( identifier2 and identifier2 < identifier1 ): - result_offsets.add_offset( new_offset_dict[identifier2] ) - if keys: - identifier2 = keys.pop( 0 ) - else: - identifier2 = None - elif identifier2 is None: - result_offsets.file.seek( result_offsets.file_size ) - self.file.seek( index1 * self.fmt_size ) - result_offsets.file.write( self.file.read() ) - identifier1 = None - offset1 = None - else: - result_offsets.add_offset( offset1 ) - try: - index1, offset1 = offsets1.next() - identifier1 = self.get_identifier_by_offset( offset1 ) - except StopIteration: - offset1 = None - identifier1 = None - index1 += 1 -#methods to help link offsets to lines, ids, etc - def get_identifier_by_line( self, line ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( self.split ) - if self.column < len( fields ): - return fields[self.column] - return None - def get_line_by_offset( self, offset ): - self.indexed_file.seek( offset ) - return self.indexed_file.readline() - def get_identifier_by_offset( self, offset ): - return self.get_identifier_by_line( self.get_line_by_offset( offset ) ) - -#indexed set of offsets, index is built on demand -class OffsetIndex: - def __init__( self, filename, column, split = None, index_depth = 3 ): - self.filename = filename - self.file = open( filename, 'rb' ) - self.column = column - self.split = split - self._offsets = {} - self._index = None - self.index_depth = index_depth - def _build_index( self ): - self._index = {} - for start_char, sorted_offsets in self._offsets.items(): - self._index[start_char]={} - for i, offset in enumerate( sorted_offsets.get_offsets() ): - identifier = sorted_offsets.get_identifier_by_offset( offset ) - if identifier[0:self.index_depth] not in self._index[start_char]: - self._index[start_char][identifier[0:self.index_depth]] = i - def get_lines_by_identifier( self, identifier ): - if not identifier: return - #if index doesn't exist, build it - if self._index is None: self._build_index() - - #identifier cannot exist - if identifier[0] not in self._index or identifier[0:self.index_depth] not in self._index[identifier[0]]: - return - #identifier might exist, search for it - offset_index = self._index[identifier[0]][identifier[0:self.index_depth]] - while True: - if offset_index >= self._offsets[identifier[0]].size: - return - offset = self._offsets[identifier[0]].get_offset_by_index( offset_index ) - identifier2 = self._offsets[identifier[0]].get_identifier_by_offset( offset ) - if not identifier2 or identifier2 > identifier: - return 
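-            # Offsets within each first-character bucket are kept sorted by
-            # identifier, so the scan stops as soon as it passes the target;
-            # every duplicate of the target is yielded on the way.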
- if identifier2 == identifier: - yield self._offsets[identifier[0]].get_line_by_offset( offset ) - offset_index += 1 - def get_offsets( self ): - keys = self._offsets.keys() - keys.sort() - for key in keys: - for offset in self._offsets[key].get_offsets(): - yield offset - def get_line_by_offset( self, offset ): - self.file.seek( offset ) - return self.file.readline() - def get_identifiers_offsets( self ): - keys = self._offsets.keys() - keys.sort() - for key in keys: - for offset in self._offsets[key].get_offsets(): - yield self._offsets[key].get_identifier_by_offset( offset ), offset - def get_identifier_by_line( self, line ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( self.split ) - if self.column < len( fields ): - return fields[self.column] - return None - def merge_with_dict( self, d ): - if not d: return #no data to merge - self._index = None - keys = d.keys() - keys.sort() - identifier = keys.pop( 0 ) - first_char = identifier[0] - temp = { identifier: d[identifier] } - while True: - if not keys: - if first_char not in self._offsets: - self._offsets[first_char] = SortedOffsets( self.filename, self.column, self.split ) - self._offsets[first_char].merge_with_dict( temp ) - return - identifier = keys.pop( 0 ) - if identifier[0] == first_char: - temp[identifier] = d[identifier] - else: - if first_char not in self._offsets: - self._offsets[first_char] = SortedOffsets( self.filename, self.column, self.split ) - self._offsets[first_char].merge_with_dict( temp ) - temp = { identifier: d[identifier] } - first_char = identifier[0] - -class BufferedIndex: - def __init__( self, filename, column, split = None, buffer = 1000000, index_depth = 3 ): - self.index = OffsetIndex( filename, column, split, index_depth ) - self.buffered_offsets = {} - f = open( filename, 'rb' ) - offset = f.tell() - identified_offset_count = 1 - while True: - offset = f.tell() - line = f.readline() - if not line: break #EOF - identifier = self.index.get_identifier_by_line( line ) - if identifier: - #flush buffered offsets, if buffer size reached - if buffer and identified_offset_count % buffer == 0: - self.index.merge_with_dict( self.buffered_offsets ) - self.buffered_offsets = {} - if identifier not in self.buffered_offsets: - self.buffered_offsets[identifier] = [] - self.buffered_offsets[identifier].append( offset ) - identified_offset_count += 1 - f.close() - - def get_lines_by_identifier( self, identifier ): - for line in self.index.get_lines_by_identifier( identifier ): - yield line - if identifier in self.buffered_offsets: - for offset in self.buffered_offsets[identifier]: - yield self.index.get_line_by_offset( offset ) - - -def fill_empty_columns( line, split, fill_values ): - if not fill_values: - return line - filled_columns = [] - for i, field in enumerate( line.split( split ) ): - if field or i >= len( fill_values ): - filled_columns.append( field ) - else: - filled_columns.append( fill_values[i] ) - if len( fill_values ) > len( filled_columns ): - filled_columns.extend( fill_values[ len( filled_columns ) : ] ) - return split.join( filled_columns ) - - -def join_files( filename1, column1, filename2, column2, out_filename, split = None, buffer = 1000000, keep_unmatched = False, keep_partial = False, index_depth = 3, fill_options = None ): - #return identifier based upon line - def get_identifier_by_line( line, column, split = None ): - if isinstance( line, str ): - fields = line.rstrip( '\r\n' ).split( split ) - if column < len( fields ): - return fields[column] - return None - if 
fill_options is None: - fill_options = Bunch( fill_unjoined_only = True, file1_columns = None, file2_columns = None ) - out = open( out_filename, 'w+b' ) - index = BufferedIndex( filename2, column2, split, buffer, index_depth ) - for line1 in open( filename1, 'rb' ): - identifier = get_identifier_by_line( line1, column1, split ) - if identifier: - written = False - for line2 in index.get_lines_by_identifier( identifier ): - if not fill_options.fill_unjoined_only: - out.write( "%s%s%s\n" % ( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ), split, fill_empty_columns( line2.rstrip( '\r\n' ), split, fill_options.file2_columns ) ) ) - else: - out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) ) - written = True - if not written and keep_unmatched: - out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) - if fill_options: - if fill_options.file2_columns: - out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) - out.write( "\n" ) - elif keep_partial: - out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) - if fill_options: - if fill_options.file2_columns: - out.write( "%s%s" % ( split, fill_empty_columns( "", split, fill_options.file2_columns ) ) ) - out.write( "\n" ) - out.close() - -def main(): - parser = optparse.OptionParser() - parser.add_option( - '-b','--buffer', - dest='buffer', - type='int',default=1000000, - help='Number of lines to buffer at a time. Default: 1,000,000 lines. A buffer of 0 will attempt to use memory only.' - ) - parser.add_option( - '-d','--index_depth', - dest='index_depth', - type='int',default=3, - help='Depth to use on filebased offset indexing. Default: 3.' - ) - parser.add_option( - '-p','--keep_partial', - action='store_true', - dest='keep_partial', - default=False, - help='Keep rows in first input which are missing identifiers.') - parser.add_option( - '-u','--keep_unmatched', - action='store_true', - dest='keep_unmatched', - default=False, - help='Keep rows in first input which are not joined with the second input.') - parser.add_option( - '-f','--fill_options_file', - dest='fill_options_file', - type='str',default=None, - help='Fill empty columns with a values from a JSONified file.') - - - options, args = parser.parse_args() - - fill_options = None - if options.fill_options_file is not None: - try: - if simplejson is None: - raise simplejson_exception - fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) ) #simplejson.load( open( options.fill_options_file ) ) - except Exception, e: - print "Warning: Ignoring fill options due to simplejson error (%s)." % e - if fill_options is None: - fill_options = Bunch() - if 'fill_unjoined_only' not in fill_options: - fill_options.fill_unjoined_only = True - if 'file1_columns' not in fill_options: - fill_options.file1_columns = None - if 'file2_columns' not in fill_options: - fill_options.file2_columns = None - - - try: - filename1 = args[0] - filename2 = args[1] - column1 = int( args[2] ) - 1 - column2 = int( args[3] ) - 1 - out_filename = args[4] - except: - print >> sys.stderr, "Error parsing command line." 
- sys.exit() - - #Character for splitting fields and joining lines - split = "\t" - - return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options = fill_options ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joinWrapper.pl --- a/tools/filters/joinWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -use File::Temp "tempfile"; - -my ($input1, $input2, $field1, $field2, $mode, $OOption, $out_file1) = @ARGV; - -die "No arguments\n" unless @ARGV == 7; - -my ($fh1, $file1) = tempfile(); -my ($fh2, $file2) = tempfile(); - -`sort -k $field1 $input1 > $file1`; -`sort -k $field2 $input2 > $file2`; - -my $option = ""; -my @fields = (); -my $line = ""; - -if ($OOption eq "Y") { - if (defined($fh1)) { - $line = <$fh1>; - } else { - die "Failed to create file $file1\n"; - } - @fields = split /\t/, $line; - die "The field you selected does not exist in the input file" if (@fields < $field1); - my @optionO = (); - my $i = 0; - foreach (@fields) { - ++$i; - push(@optionO, "1.$i"); - } - $option = "-o " . join(",", @optionO); -} else { - $option = ""; -} - -$ENV{'LC_ALL'} = 'POSIX'; - -if ($mode eq "V") { - `join -v 1 $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; -} else { - `join $option -1 $field1 -2 $field2 $file1 $file2 | tr " " "\t" > $out_file1`; -} - -`rm $file1 ; rm $file2`; - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joinWrapper.py --- a/tools/filters/joinWrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -This tool provides the UNIX "join" functionality. -""" -import sys, os, tempfile, subprocess - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - infile1 = sys.argv[1] - infile2 = sys.argv[2] - field1 = int(sys.argv[3]) - field2 = int(sys.argv[4]) - mode =sys.argv[5] - outfile = sys.argv[6] - - tmpfile1 = tempfile.NamedTemporaryFile() - tmpfile2 = tempfile.NamedTemporaryFile() - - try: - #Sort the two files based on specified fields - os.system("sort -t ' ' -k %d,%d -o %s %s" %(field1, field1, tmpfile1.name, infile1)) - os.system("sort -t ' ' -k %d,%d -o %s %s" %(field2, field2, tmpfile2.name, infile2)) - except Exception, exc: - stop_err( 'Initialization error -> %s' %str(exc) ) - - option = "" - for line in file(tmpfile1.name): - line = line.strip() - if line: - elems = line.split('\t') - for j in range(1,len(elems)+1): - if j == 1: - option = "1.1" - else: - option = option + ",1." + str(j) - break - - #check if join has --version option. BSD join doens't have this option, while GNU join does. - #The return value in the latter case will be 0, and non-zero in the latter case. - ret = subprocess.call('join --version 2>/dev/null', shell=True) - # check if we are a version later than 7 of join. If so, we want to skip - # checking the order since join will raise an error with duplicated items in - # the two files being joined. 
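-    # (GNU join answers --version with exit status 0; BSD join does not,
-    # which is how the two implementations are distinguished here.)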
- if ret == 0: - cl = subprocess.Popen(["join", "--version"], stdout=subprocess.PIPE) - (stdout, _) = cl.communicate() - version_line = stdout.split("\n")[0] - (version, _) = version_line.split()[-1].split(".") - if int(version) >= 7: - flags = "--nocheck-order" - else: - flags = "" - else: - flags = "" - - if mode == "V": - cmdline = "join %s -t ' ' -v 1 -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) - else: - cmdline = "join %s -t ' ' -o %s -1 %d -2 %d %s %s > %s" %(flags, option, field1, field2, tmpfile1.name, tmpfile2.name, outfile) - - try: - os.system(cmdline) - except Exception, exj: - stop_err('Error joining the two datasets -> %s' %str(exj)) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joiner.xml --- a/tools/filters/joiner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,180 +0,0 @@ - - side by side on a specified field - join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000 --fill_options_file=$fill_options_file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <% -import simplejson -%> -#set $__fill_options = {} -#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': - #set $__fill_options['fill_unjoined_only'] = $fill_empty_columns['fill_columns_by'].value == 'fill_unjoined_only' - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': - #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value - #else: - #set $__start_fill = "" - #end if - #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ] - #set $__fill_options['file2_columns'] = [ __start_fill for i in range( int( $input2.metadata.columns ) ) ] - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': - #for column_fill1 in $fill_empty_columns['do_fill_empty_columns']['column_fill1']: - #set $__fill_options['file1_columns'][ int( column_fill1['column_number1'].value ) - 1 ] = column_fill1['fill_value1'].value - #end for - #for column_fill2 in $fill_empty_columns['do_fill_empty_columns']['column_fill2']: - #set $__fill_options['file2_columns'][ int( column_fill2['column_number2'].value ) - 1 ] = column_fill2['fill_value2'].value - #end for - #end if -#end if -${simplejson.dumps( __fill_options )} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**This tool will attempt to reuse the metadata from your first input.** To change metadata assignments click on the "edit attributes" link of the history item generated by this tool. - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool joins lines of two datasets on a common field. An empty string ("") is not a valid identifier. -You may choose to include lines of your first input that do not join with your second input. - -- Columns are referenced with a **number**. For example, **3** refers to the 3rd column of a tab-delimited file. 
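-
-For reference, the --fill_options_file handed to join.py above is a small
-JSON document; a hand-written equivalent of what the tool generates (the
-values below are hypothetical) would be::
-
-    {"fill_unjoined_only": true,
-     "file1_columns": null,
-     "file2_columns": ["", "", "0"]}
-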
- ------ - -**Example** - -Dataset1:: - - chr1 10 20 geneA - chr1 50 80 geneB - chr5 10 40 geneL - -Dataset2:: - - geneA tumor-supressor - geneB Foxp2 - geneC Gnas1 - geneE INK4a - -Joining the 4th column of Dataset1 with the 1st column of Dataset2 will yield:: - - chr1 10 20 geneA geneA tumor-suppressor - chr1 50 80 geneB geneB Foxp2 - -Joining the 4th column of Dataset1 with the 1st column of Dataset2, while keeping all lines from Dataset1, will yield:: - - chr1 10 20 geneA geneA tumor-suppressor - chr1 50 80 geneB geneB Foxp2 - chr5 10 40 geneL - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/joiner2.xml --- a/tools/filters/joiner2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - two datasets a specific column of which has the same value - sort -k $col1 $input1 > $input1.tmp; sort -k $col2 $input2 > $input2.tmp; join -1 $col1 -2 $col2 $input1.tmp $input2.tmp | tr " " "\t" > $out_file1; rm -rf $input1.tmp $input2.tmp - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed.py --- a/tools/filters/lav_to_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -#!/usr/bin/env python -#Reads a LAV file and writes two BED files. -import sys -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import bx.align.lav - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - try: - lav_file = open(sys.argv[1],'r') - bed_file1 = open(sys.argv[2],'w') - bed_file2 = open(sys.argv[3],'w') - except Exception, e: - stop_err( str( e ) ) - - lavsRead = 0 - bedsWritten = 0 - species = {} - # TODO: this is really bad since everything is read into memory. Can we eliminate this tool? - for lavBlock in bx.align.lav.Reader( lav_file ): - lavsRead += 1 - for c in lavBlock.components: - spec, chrom = bx.align.lav.src_split( c.src ) - if bedsWritten < 1: - if len( species )==0: - species[spec]=bed_file1 - elif len( species )==1: - species[spec]=bed_file2 - else: - continue #this is a pairwise alignment... - if spec in species: - species[spec].write( "%s\t%i\t%i\t%s_%s\t%i\t%s\n" % ( chrom, c.start, c.end, spec, str( bedsWritten ), 0, c.strand ) ) - bedsWritten += 1 - - - for spec,file in species.items(): - print "#FILE\t%s\t%s" % (file.name, spec) - - lav_file.close() - bed_file1.close() - bed_file2.close() - - print "%d lav blocks read, %d regions written\n" % (lavsRead,bedsWritten) - - - -if __name__ == "__main__": main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed.xml --- a/tools/filters/lav_to_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - Converts a LAV formatted file to BED format - lav_to_bed.py $lav_file $bed_file1 $bed_file2 - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts a LAV formatted file to the BED format. - -- **LAV format** LAV is an alignment format developed by Webb Miller's group at Penn State University. It is the primary output format for BLASTZ. - -- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. 
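-
-A minimal reading loop with bx-python, mirroring the converter above (the
-input file name is a placeholder)::
-
-    import bx.align.lav
-
-    for block in bx.align.lav.Reader(open('alignment.lav')):
-        for c in block.components:
-            spec, chrom = bx.align.lav.src_split(c.src)
-            print "%s\t%i\t%i\t%s\t0\t%s" % (chrom, c.start, c.end, spec, c.strand)
-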
- ------ - -**Example** - -- Convert LAV format:: - - #:lav - s { - "/galaxy/data/hg16/seq/chr19.nib" 1 63811651 0 1 - "/galaxy/data/mm5/seq/chr11.nib" 1 121648857 0 1 - } - h { - "> hg16.chr19" - "> mm5.chr11 (reverse complement)" - } - a { - s 3500 - b 3001012 70568380 - e 3001075 70568443 - l 3001012 70568380 3001075 70568443 81 - } - a { - s 3900 - b 3008279 70573976 - e 3008357 70574054 - l 3008279 70573976 3008357 70574054 78 - } - #:eof - -- To two BED formatted files:: - - chr19 3001011 3001075 hg16_0 0 + - chr19 3008278 3008357 hg16_1 0 + - - **and**:: - - chr11 70568379 70568443 mm5_0 0 + - chr11 70573975 70574054 mm5_1 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/lav_to_bed_code.py --- a/tools/filters/lav_to_bed_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#Set build, name, and info for each output BED file -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - new_stdout = "" - filename_to_build = {} - for line in stdout.split("\n"): - if line.startswith("#FILE"): - fields = line.split("\t") - filename_to_build[fields[1]]=fields[2].strip() - else: - new_stdout = "%s%s" % ( new_stdout, line ) - for name,data in out_data.items(): - try: - data.info = "%s\n%s" % ( new_stdout, stderr ) - data.dbkey = filename_to_build[data.file_name] - data.name = "%s (%s)" % ( data.name, data.dbkey ) - app.model.context.add( data ) - app.model.context.flush() - except: - continue diff -r c2a356708570 -r 33c067c3ae34 tools/filters/mergeCols.py --- a/tools/filters/mergeCols.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -import sys, re - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - try: - infile = open ( sys.argv[1], 'r') - outfile = open ( sys.argv[2], 'w') - except: - stop_err( 'Cannot open or create a file\n' ) - - if len( sys.argv ) < 4: - stop_err( 'No columns to merge' ) - else: - cols = sys.argv[3:] - - skipped_lines = 0 - - for line in infile: - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - fields = line.split( '\t' ) - line += '\t' - for col in cols: - try: - line += fields[ int( col ) -1 ] - except: - skipped_lines += 1 - - print >>outfile, line - - if skipped_lines > 0: - print 'Skipped %d invalid lines' % skipped_lines - -if __name__ == "__main__" : __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/mergeCols.xml --- a/tools/filters/mergeCols.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ - - together - - mergeCols.py - $input1 - $out_file1 - $col1 - $col2 - #for $col in $columns - ${col.datacol} - #end for - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**What it does** - -This tool merges columns together. Any number of valid columns can be merged in any order. - ------ - -**Example** - -Input dataset (five columns: c1, c2, c3, c4, and c5):: - - 1 10 1000 gene1 chr - 2 100 1500 gene2 chr - -merging columns "**c5,c1**" will return:: - - 1 10 1000 gene1 chr chr1 - 2 100 1500 gene2 chr chr2 - -.. class:: warningmark - -Note that all original columns are preserved and the result of merge is added as the rightmost column. 
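-
-The merge itself is a one-liner over 1-based column numbers; a sketch of
-the step applied to each line (the column order shown is hypothetical)::
-
-    def merge_columns(fields, cols):
-        # fields = ['1', '10', '1000', 'gene1', 'chr'], cols = [5, 1]
-        # returns 'chr1', which is appended as the new rightmost column
-        return ''.join(fields[c - 1] for c in cols)
-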
- - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/pasteWrapper.pl --- a/tools/filters/pasteWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; -my $command = ""; -# a wrapper for paste for use in galaxy -# pasteWrapper.pl [filename1] [filename2] [delimiter] [output] - -die "Check arguments" unless @ARGV == 4; - -if ($ARGV[2] eq 'T') { - $command = "paste $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'C') { - $command = "paste -d \",\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'D') { - $command = "paste -d \"-\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'U') { - $command = "paste -d \"_\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'P') { - $command = "paste -d \"|\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'Dt') { - $command = "paste -d \".\" $ARGV[0] $ARGV[1]"; -} elsif ($ARGV[2] eq 'Sp') { - $command = "paste -d \" \" $ARGV[0] $ARGV[1]"; -} - -open (OUT, ">$ARGV[3]") or die "Cannot create $ARGV[2]:$!\n"; -open (PASTE, "$command |") or die "Cannot run paste:$!\n"; - -while () { - print OUT; -} -close OUT; -close PASTE; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/pasteWrapper.xml --- a/tools/filters/pasteWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ - - two files side by side - pasteWrapper.pl $input1 $input2 $delimiter $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -Paste preserves column assignments of the first dataset. - ------ - -**What it does** - -This tool merges two datasets side by side. If the first (left) dataset contains column assignments such as chromosome, start, end and strand, these will be preserved. However, if you would like to change column assignments, click the pencil icon in the history item. - ------ - -**Example** - -First dataset:: - - a 1 - a 2 - a 3 - -Second dataset:: - - 20 - 30 - 40 - -Pasting them together will produce:: - - a 1 20 - a 2 30 - a 3 40 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/randomlines.py --- a/tools/filters/randomlines.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#!/usr/bin/env python -# Kanwei Li, 2010 -# Selects N random lines from a file and outputs to another file - -import random, sys - -def main(): - infile = open(sys.argv[1], 'r') - total_lines = int(sys.argv[2]) - - if total_lines < 1: - sys.stderr.write( "Must select at least one line." ) - sys.exit() - - kept = [] - n = 0 - for line in infile: - line = line.rstrip("\n") - n += 1 - if (n <= total_lines): - kept.append(line) - elif random.randint(1, n) <= total_lines: - kept.pop(random.randint(0, total_lines-1)) - kept.append(line) - - if n < total_lines: - sys.stderr.write( "Error: asked to select more lines than there were in the file." ) - sys.exit() - - open(sys.argv[3], 'w').write( "\n".join(kept) ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/randomlines.xml --- a/tools/filters/randomlines.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - from a file - randomlines.py $input $num_lines $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool selects N random lines from a file, with no repeats, and preserving ordering. 
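
randomlines.py gets all three properties from single-pass reservoir sampling:
the first N lines fill the reservoir, and line number n > N then enters it
with probability N/n. Because the evicted survivor is popped at a random index
while new lines are always appended in arrival order, the reservoir stays in
file order. A condensed sketch of the same scheme (illustrative; the tool
itself reads its arguments from the command line)::

    import random

    def sample_lines(lines, k):
        kept = []
        for n, line in enumerate(lines, 1):
            if n <= k:
                kept.append(line)               # fill the reservoir
            elif random.randint(1, n) <= k:     # keep with probability k/n
                kept.pop(random.randrange(k))   # evict a uniform survivor
                kept.append(line)               # appending keeps file order
        if len(kept) < k:
            raise ValueError('fewer than k lines in the input')
        return kept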
- ------ - -**Example** - -Input File:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -Selecting 2 random lines might return this:: - - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56775 56795 D17003_CTCF_R4 207 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/remove_beginning.pl --- a/tools/filters/remove_beginning.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# Removes the specified number of lines from the beginning of the file. -# remove_beginning.pl [input] [num_lines] [output] - -die "Check arguments" unless @ARGV == 3; - -my $inputfile = $ARGV[0]; -my $num_lines = $ARGV[1]; -my $outputfile = $ARGV[2]; - -my $curCount=0; - -my $fhIn; -open ($fhIn, "< $inputfile") or die "Cannot open source file"; - -my $fhOut; -open ($fhOut, "> $outputfile"); - -while (<$fhIn>) -{ - $curCount++; - if ($curCount<=$num_lines) - { - next; - } - print $fhOut $_; -} -close ($fhIn) or die "Cannot close source file"; -close ($fhOut) or die "Cannot close output file"; diff -r c2a356708570 -r 33c067c3ae34 tools/filters/remove_beginning.xml --- a/tools/filters/remove_beginning.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - of a file - remove_beginning.pl $input $num_lines $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool removes a specified number of lines from the beginning of a dataset. - ------ - -**Example** - -Input File:: - - chr7 56632 56652 D17003_CTCF_R6 310 + - chr7 56736 56756 D17003_CTCF_R7 354 + - chr7 56761 56781 D17003_CTCF_R4 220 + - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - -After removing the first 3 lines the dataset will look like this:: - - chr7 56772 56792 D17003_CTCF_R7 372 + - chr7 56775 56795 D17003_CTCF_R4 207 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sff_extract.py --- a/tools/filters/sff_extract.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1505 +0,0 @@ -#!/usr/bin/python -'''This software extracts the seq, qual and ancillary information from an sff -file, like the ones used by the 454 sequencer. - -Optionally, it can also split paired-end reads if given the linker sequence. -The splitting is done with maximum match, i.e., every occurence of the linker -sequence will be removed, even if occuring multiple times.''' - -#copyright Jose Blanca and Bastien Chevreux -#COMAV institute, Universidad Politecnica de Valencia (UPV) -#Valencia, Spain - -# additions to handle paired end reads by Bastien Chevreux -# bugfixes for linker specific lengths: Lionel Guy - -#This program is free software: you can redistribute it and/or modify -#it under the terms of the GNU General Public License as published by -#the Free Software Foundation, either version 3 of the License, or -#(at your option) any later version. -#This program is distributed in the hope that it will be useful, -#but WITHOUT ANY WARRANTY; without even the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -#GNU General Public License for more details. -#You should have received a copy of the GNU General Public License -#along with this program. If not, see . 
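# --- Editor's sketch, not part of the original sff_extract.py ------------
# "Maximum match" splitting, as described in the module docstring above,
# means every linker hit is excised and all flanking fragments are kept.
# The real tool locates hits via SSAHA2 alignments, so sequencing errors in
# the linker are tolerated; the exact-match version here is illustrative.
import re

def split_on_linker_sketch(read, linker):
    '''Drop every occurrence of linker and keep the non-empty fragments.'''
    return [p for p in re.split(re.escape(linker), read) if p]

# split_on_linker_sketch('ACGTTTLINKERGGGACTLINKERAA', 'LINKER')
# -> ['ACGTTT', 'GGGACT', 'AA']
# --- end editor's sketch --------------------------------------------------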
- -__author__ = 'Jose Blanca and Bastien Chevreux' -__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' -__license__ = 'GPLv3 or later' -__version__ = '0.2.8' -__email__ = 'jblanca@btc.upv.es' -__status__ = 'beta' - -import struct -import sys -import os -import subprocess -import tempfile - - -fake_sff_name = 'fake_sff_name' - - -# readname as key: lines with matches from SSAHA, one best match -ssahapematches = {} -# linker readname as key: length of linker sequence -linkerlengths = {} - -# set to true if something really fishy is going on with the sequences -stern_warning = True - -def read_bin_fragment(struct_def, fileh, offset=0, data=None, - byte_padding=None): - '''It reads a chunk of a binary file. - - You have to provide the struct, a file object, the offset (where to start - reading). - Also you can provide an optional dict that will be populated with the - extracted data. - If a byte_padding is given the number of bytes read will be a multiple of - that number, adding the required pad at the end. - It returns the number of bytes reads and the data dict. - ''' - if data is None: - data = {} - - #we read each item - bytes_read = 0 - for item in struct_def: - #we go to the place and read - fileh.seek(offset + bytes_read) - n_bytes = struct.calcsize(item[1]) - buffer = fileh.read(n_bytes) - read = struct.unpack('>' + item[1], buffer) - if len(read) == 1: - read = read[0] - data[item[0]] = read - bytes_read += n_bytes - - #if there is byte_padding the bytes_to_read should be a multiple of the - #byte_padding - if byte_padding is not None: - pad = byte_padding - bytes_read = ((bytes_read + pad - 1) // pad) * pad - - return (bytes_read, data) - - -def check_magic(magic): - '''It checks that the magic number of the file matches the sff magic.''' - if magic != 779314790: - raise RuntimeError('This file does not seems to be an sff file.') - -def check_version(version): - '''It checks that the version is supported, otherwise it raises an error.''' - supported = ('\x00', '\x00', '\x00', '\x01') - i = 0 - for item in version: - if version[i] != supported[i]: - raise RuntimeError('SFF version not supported. 
Please contact the author of the software.') - i += 1 - -def read_header(fileh): - '''It reads the header from the sff file and returns a dict with the - information''' - #first we read the first part of the header - head_struct = [ - ('magic_number', 'I'), - ('version', 'cccc'), - ('index_offset', 'Q'), - ('index_length', 'I'), - ('number_of_reads', 'I'), - ('header_length', 'H'), - ('key_length', 'H'), - ('number_of_flows_per_read', 'H'), - ('flowgram_format_code', 'B'), - ] - data = {} - first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh, - offset=0, data=data) - check_magic(data['magic_number']) - check_version(data['version']) - #now that we know the number_of_flows_per_read and the key_length - #we can read the second part of the header - struct2 = [ - ('flow_chars', str(data['number_of_flows_per_read']) + 'c'), - ('key_sequence', str(data['key_length']) + 'c') - ] - read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes, - data=data) - return data - - -def read_sequence(header, fileh, fposition): - '''It reads one read from the sff file located at the fposition and - returns a dict with the information.''' - header_length = header['header_length'] - index_offset = header['index_offset'] - index_length = header['index_length'] - - #the sequence struct - read_header_1 = [ - ('read_header_length', 'H'), - ('name_length', 'H'), - ('number_of_bases', 'I'), - ('clip_qual_left', 'H'), - ('clip_qual_right', 'H'), - ('clip_adapter_left', 'H'), - ('clip_adapter_right', 'H'), - ] - def read_header_2(name_length): - '''It returns the struct definition for the second part of the header''' - return [('name', str(name_length) +'c')] - def read_data(number_of_bases): - '''It returns the struct definition for the read data section.''' - #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8} - if header['flowgram_format_code'] == 1: - flow_type = 'H' - else: - raise Error('file version not supported') - number_of_bases = str(number_of_bases) - return [ - ('flowgram_values', str(header['number_of_flows_per_read']) + - flow_type), - ('flow_index_per_base', number_of_bases + 'B'), - ('bases', number_of_bases + 'c'), - ('quality_scores', number_of_bases + 'B'), - ] - - data = {} - #we read the first part of the header - bytes_read, data = read_bin_fragment(struct_def=read_header_1, - fileh=fileh, offset=fposition, data=data) - - read_bin_fragment(struct_def=read_header_2(data['name_length']), - fileh=fileh, offset=fposition + bytes_read, data=data) - #we join the letters of the name - data['name'] = ''.join(data['name']) - offset = data['read_header_length'] - #we read the sequence and the quality - read_data_st = read_data(data['number_of_bases']) - bytes_read, data = read_bin_fragment(struct_def=read_data_st, - fileh=fileh, offset=fposition + offset, - data=data, byte_padding=8) - #we join the bases - data['bases'] = ''.join(data['bases']) - - #print data - #print "pre cqr: ", data['clip_qual_right'] - #print "pre car: ", data['clip_adapter_right'] - #print "pre cql: ", data['clip_qual_left'] - #print "pre cal: ", data['clip_adapter_left'] - - # correct for the case the right clip is <= than the left clip - # in this case, left clip is 0 are set to 0 (right clip == 0 means - # "whole sequence") - if data['clip_qual_right'] <= data['clip_qual_left'] : - data['clip_qual_right'] = 0 - data['clip_qual_left'] = 0 - if data['clip_adapter_right'] <= data['clip_adapter_left'] : - data['clip_adapter_right'] = 0 - data['clip_adapter_left'] = 0 - - #the clipping section follows the 
NCBI's guidelines Trace Archive RFC - #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc - #if there's no adapter clip: qual -> vector - #else: qual-> qual - # adapter -> vector - - if not data['clip_adapter_left']: - data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left'] - if not data['clip_adapter_right']: - data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right'] - - # see whether we have to override the minimum left clips - if config['min_leftclip'] > 0: - if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']: - data['clip_adapter_left'] = config['min_leftclip'] - if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']: - data['clip_qual_left'] = config['min_leftclip'] - - - #print "post cqr: ", data['clip_qual_right'] - #print "post car: ", data['clip_adapter_right'] - #print "post cql: ", data['clip_qual_left'] - #print "post cal: ", data['clip_adapter_left'] - - - # for handling the -c (clip) option gently, we already clip here - # and set all clip points to the sequence end points - if config['clip']: - data['bases'], data['quality_scores'] = clip_read(data) - - data['number_of_bases']=len(data['bases']) - data['clip_qual_right'] = data['number_of_bases'] - data['clip_adapter_right'] = data['number_of_bases'] - data['clip_qual_left'] = 0 - data['clip_adapter_left'] = 0 - - return data['read_header_length'] + bytes_read, data - - -def sequences(fileh, header): - '''It returns a generator with the data for each read.''' - #now we can read all the sequences - fposition = header['header_length'] #position in the file - reads_read = 0 - while True: - if fposition == header['index_offset']: - #we have to skip the index section - fposition += index_length - continue - else: - bytes_read, seq_data = read_sequence(header=header, fileh=fileh, - fposition=fposition) - yield seq_data - fposition += bytes_read - reads_read += 1 - if reads_read >= header['number_of_reads']: - break - - -def remove_last_xmltag_in_file(fname, tag=None): - '''Given an xml file name and a tag, it removes the last tag of the - file if it matches the given tag. Tag removal is performed via file - truncation. - - It the given tag is not the last in the file, a RunTimeError will be - raised. - - The resulting xml file will be not xml valid. This function is a hack - that allows to append records to xml files in a quick and dirty way. - ''' - - fh = open(fname, 'r+') - #we have to read from the end to the start of the file and keep the - #string enclosed by - i = -1 - last_tag = [] #the chars that form the last tag - start_offset = None #in which byte does the last tag starts? - end_offset = None #in which byte does the last tag ends? - while True: - fh.seek(i, 2) - char = fh.read(1) - if not char.isspace(): - last_tag.append(char) - if char == '>': - end_offset = i - if char == '<': - start_offset = i - break - i -= 1 - - #we have read the last tag backwards - last_tag = ''.join(last_tag[::-1]) - #we remove the - last_tag = last_tag.rstrip('>').lstrip('\n'] - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - - #extra information - #do we have extra info for this file? - info = None - if config['xml_info']: - #with this name? - if fname in config['xml_info']: - info = config['xml_info'][fname] - else: - #with no name? 
- try: - info = config['xml_info'][fake_sff_name] - except KeyError: - pass - #we print the info that we have - if info: - for key in info: - to_print.append(' <' + key + '>' + info[key] + \ - '\n') - - return ''.join(to_print) - - -def create_clip_xml_info(readlen, adapl, adapr, quall, qualr): - '''Takes the clip values of the read and formats them into XML - Corrects "wrong" values that might have resulted through - simplified calculations earlier in the process of conversion - (especially during splitting of paired-end reads) - ''' - - to_print = [""] - - # if right borders are >= to read length, they don't need - # to be printed - if adapr >= readlen: - adapr = 0 - if qualr >= readlen: - qualr = 0 - - # BaCh - # when called via split_paired_end(), some values may be < 0 - # (when clip values were 0 previously) - # instead of putting tons of if clauses for different calculations there, - # I centralise corrective measure here - # set all values <0 to 0 - - if adapr < 0: - adapr = 0 - if qualr <0: - qualr = 0 - if adapl < 0: - adapl = 0 - if quall <0: - quall = 0 - - if quall: - to_print.append(' ') - to_print.append(str(quall)) - to_print.append('\n') - if qualr: - to_print.append(' ') - to_print.append(str(qualr)) - to_print.append('\n') - if adapl: - to_print.append(' ') - to_print.append(str(adapl)) - to_print.append('\n') - if adapr: - to_print.append(' ') - to_print.append(str(adapr)) - to_print.append('\n') - return ''.join(to_print) - - -def create_xml_for_unpaired_read(data, fname): - '''Given the data for one read it returns an str with the xml ancillary - data.''' - to_print = [create_basic_xml_info(data['name'],fname)] - #clippings in the XML only if we do not hard clip - if not config['clip']: - to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right'])); - to_print.append(' \n') - return ''.join(to_print) - - -def format_as_fasta(name,seq,qual): - name_line = ''.join(('>', name,'\n')) - seqstring = ''.join((name_line, seq, '\n')) - qual_line = ' '.join([str(q) for q in qual]) - qualstring = ''.join((name_line, qual_line, '\n')) - return seqstring, qualstring - -def format_as_fastq(name,seq,qual): - qual_line = ''.join([chr(q+33) for q in qual]) - #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n')) - seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n')) - return seqstring - - -def get_read_data(data): - '''Given the data for one read it returns 2 strs with the fasta seq - and fasta qual.''' - #seq and qual - if config['mix_case']: - seq = sequence_case(data) - qual = data['quality_scores'] - else : - seq = data['bases'] - qual = data['quality_scores'] - - return seq, qual - -def extract_read_info(data, fname): - '''Given the data for one read it returns 3 strs with the fasta seq, fasta - qual and xml ancillary data.''' - - seq,qual = get_read_data(data) - seqstring, qualstring = format_as_fasta(data['name'],seq,qual) - - #name_line = ''.join(('>', data['name'],'\n')) - #seq = ''.join((name_line, seq, '\n')) - #qual_line = ' '.join([str(q) for q in qual]) - #qual = ''.join((name_line, qual_line, '\n')) - - xmlstring = create_xml_for_unpaired_read(data, fname) - - return seqstring, qualstring, xmlstring - -def write_sequence(name,seq,qual,seq_fh,qual_fh): - '''Write sequence and quality FASTA and FASTA qual filehandles - (or into FASTQ and XML) - if sequence length is 0, don't write''' - - if len(seq) == 0 : return - - if qual_fh 
is None: - seq_fh.write(format_as_fastq(name,seq,qual)) - else: - seqstring, qualstring = format_as_fasta(name,seq,qual) - seq_fh.write(seqstring) - qual_fh.write(qualstring) - return - -def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh): - '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles - (or into FASTQ and XML) - if sequence length is 0, don't write''' - - seq,qual = get_read_data(data) - if len(seq) == 0 : return - - write_sequence(data['name'],seq,qual,seq_fh,qual_fh) - - anci = create_xml_for_unpaired_read(data, sff_fh.name) - if anci is not None: - xml_fh.write(anci) - return - - -def reverse_complement(seq): - '''Returns the reverse complement of a DNA sequence as string''' - - compdict = { - 'a': 't', - 'c': 'g', - 'g': 'c', - 't': 'a', - 'u': 't', - 'm': 'k', - 'r': 'y', - 'w': 'w', - 's': 's', - 'y': 'r', - 'k': 'm', - 'v': 'b', - 'h': 'd', - 'd': 'h', - 'b': 'v', - 'x': 'x', - 'n': 'n', - 'A': 'T', - 'C': 'G', - 'G': 'C', - 'T': 'A', - 'U': 'T', - 'M': 'K', - 'R': 'Y', - 'W': 'W', - 'S': 'S', - 'Y': 'R', - 'K': 'M', - 'V': 'B', - 'H': 'D', - 'D': 'H', - 'B': 'V', - 'X': 'X', - 'N': 'N', - '*': '*' - } - - complseq = ''.join([compdict[base] for base in seq]) - # python hack to reverse a list/string/etc - complseq = complseq[::-1] - return complseq - - -def mask_sequence(seq, maskchar, fpos, tpos): - '''Given a sequence, mask it with maskchar starting at fpos (including) and - ending at tpos (excluding) - ''' - - if len(maskchar) > 1: - raise RuntimeError("Internal error: more than one character given to mask_sequence") - if fpos<0: - fpos = 0 - if tpos > len(seq): - tpos = len(seq) - - newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:])) - - return newseq - - -def fragment_sequences(sequence, qualities, splitchar): - '''Works like split() on strings, except it does this on a sequence - and the corresponding list with quality values. - Returns a tuple for each fragment, each sublist has the fragment - sequence as first and the fragment qualities as second elemnt''' - - # this is slow (due to zip and list appends... use an iterator over - # the sequence find find variations and splices on seq and qual - - if len(sequence) != len(qualities): - print sequence, qualities - raise RuntimeError("Internal error: length of sequence and qualities don't match???") - - retlist = ([]) - if len(sequence) == 0: - return retlist - - actseq = ([]) - actqual = ([]) - if sequence[0] != splitchar: - inseq = True - else: - inseq = False - for char,qual in zip(sequence,qualities): - if inseq: - if char != splitchar: - actseq.append(char) - actqual.append(qual) - else: - retlist.append((''.join(actseq), actqual)) - actseq = ([]) - actqual = ([]) - inseq = False - else: - if char != splitchar: - inseq = True - actseq.append(char) - actqual.append(qual) - - if inseq and len(actseq): - retlist.append((''.join(actseq), actqual)) - - return retlist - - -def calc_subseq_boundaries(maskedseq, maskchar): - '''E.g.: - ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx......... 
- to - (0,8),(8,16),(16,26),(26,47),(47,56) - ''' - - blist = ([]) - if len(maskedseq) == 0: - return blist - - inmask = True - if maskedseq[0] != maskchar: - inmask = False - - start = 0 - for spos in range(len(maskedseq)): - if inmask and maskedseq[spos] != maskchar: - blist.append(([start,spos])) - start = spos - inmask = False - elif not inmask and maskedseq[spos] == maskchar: - blist.append(([start,spos])) - start = spos - inmask = True - - blist.append(([start,spos+1])) - - return blist - - -def correct_for_smallhits(maskedseq, maskchar, linkername): - '''If partial hits were found, take preventive measure: grow - the masked areas by 20 bases in each direction - Returns either unchanged "maskedseq" or a new sequence - with some more characters masked. - ''' - global linkerlengths - - CEBUG = 0 - - if CEBUG : print "correct_for_smallhits" - if CEBUG : print "Masked seq\n", maskedseq - if CEBUG : print "Linkername: ", linkername - - if len(maskedseq) == 0: - return maskedseq - - growl=40 - growl2=growl/2 - - boundaries = calc_subseq_boundaries(maskedseq,maskchar) - if CEBUG : print "Boundaries: ", boundaries - - foundpartial = False - for bounds in boundaries: - if CEBUG : print "\tbounds: ", bounds - left, right = bounds - if left != 0 and right != len(maskedseq): - if maskedseq[left] == maskchar: - # allow 10% discrepancy - # -linkerlengths[linkername]/10 - # that's a kind of safety net if there are slight sequencing - # errors in the linker itself - if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10: - if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long." - foundpartial = True - - if not foundpartial: - return maskedseq - - # grow - newseq = "" - for bounds in boundaries: - if CEBUG : print "Bounds: ", bounds - left, right = bounds - if maskedseq[left] == maskchar: - newseq += maskedseq[left:right] - else: - clearstart = 0 - if left > 0 : - clearstart = left+growl2 - clearstop = len(maskedseq) - if right < len(maskedseq): - clearstop = right-growl2 - - if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop - - if clearstop <= clearstart: - newseq += maskchar * (right-left) - else: - if clearstart != left: - newseq += maskchar * growl2 - newseq += maskedseq[clearstart:clearstop] - if clearstop != right: - newseq += maskchar * growl2 - - #print "newseq\n",newseq - - return newseq - - -def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): - '''Splits a paired end read and writes sequences into FASTA, FASTA qual - and XML traceinfo file. Returns the number of sequences created. - - As the linker sequence may be anywhere in the read, including the ends - and overlapping with bad quality sequence, we need to perform some - computing and eventually set new clip points. - - If the resulting split yields only one sequence (because linker - was not present or overlapping with left or right clip), only one - sequence will be written with ".fn" appended to the name. - - If the read can be split, two reads will be written. The side left of - the linker will be named ".r" and will be written in reverse complement - into the file to conform with what approximately all assemblers expect - when reading paired-end data: reads in forward direction in file. The side - right of the linker will be named ".f" - - If SSAHA found partial linker (linker sequences < length of linker), - the sequences will get a "_pl" furthermore be cut back thoroughly. 
- - If SSAHA found multiple occurences of the linker, the names will get an - additional "_mlc" within the name to show that there was "multiple - linker contamination". - - For multiple or partial linker, the "good" parts of the reads are - stored with a ".part" name, additionally they will not get - template information in the XML - ''' - - global ssahapematches - - CEBUG = 0 - - maskchar = "#" - - if CEBUG : print "Need to split: " + data['name'] - - numseqs = 0; - readname = data['name'] - readlen = data['number_of_bases'] - - leftclip, rightclip = return_merged_clips(data) - seq, qual = get_read_data(data) - - if CEBUG : print "Original read:\n",seq - - maskedseq = seq - if leftclip > 0: - maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) - if rightclip < len(maskedseq): - maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) - - leftclip, rightclip = return_merged_clips(data) - readlen = data['number_of_bases'] - - if CEBUG : print "Readname:", readname - if CEBUG : print "Readlen:", readlen - if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) - if CEBUG : print "matches:", ssahapematches[data['name']] - - for match in ssahapematches[data['name']]: - score = int(match[0]) - linkername = match[2] - leftreadhit = int(match[3]) - rightreadhit = int(match[4]) - #leftlinkerhit = int(match[5]) - #rightlinkerhit = int(match[6]) - #direction = match[7] - #hitlen = int(match[8]) - #hitidentity = float(match[9]) - - if CEBUG : print match - if CEBUG : print "Match with score:", score - if CEBUG : print "Read before:\n", maskedseq - maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) - if CEBUG : print "Masked seq:\n", maskedseq - - correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) - - if len(maskedseq) != len(correctedseq): - raise RuntimeError("Internal error: maskedseq != correctedseq") - - partialhits = False - if correctedseq != maskedseq: - if CEBUG : print "Partial hits in", readname - if CEBUG : print "Original seq:\n", seq - if CEBUG : print "Masked seq:\n", maskedseq - if CEBUG : print "Corrected seq\n", correctedseq - partialhits = True - readname += "_pl" - maskedseq = correctedseq - - fragments = fragment_sequences(maskedseq, qual, maskchar) - - if CEBUG : print "Fragments (", len(fragments), "): ", fragments - - mlcflag = False - #if len(ssahapematches[data['name']]) > 1: - # #print "Multi linker contamination" - # mlcflag = True - # readname += "_mlc" - - if len(fragments) > 2: - if CEBUG : print "Multi linker contamination" - mlcflag = True - readname += "_mlc" - - - #print fragments - if mlcflag or partialhits: - fragcounter = 1 - readname += ".part" - for frag in fragments: - actseq = frag[0] - if len(actseq) >= 20: - actqual = frag[1] - oname = readname + str(fragcounter) - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in actqual))) - #qual_fh.write("\n") - write_sequence(oname,actseq,actqual,seq_fh,qual_fh) - to_print = [create_basic_xml_info(oname,sff_fh.name)] - # No clipping in XML ... the multiple and partial fragments - # are clipped "hard" - # No template ID and trace_end: we don't know the - # orientation of the frahments. 
Even if it were - # only two, the fact we had multiple linkers - # says something went wrong, so simply do not - # write any paired-end information for all these fragments - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - numseqs += 1 - fragcounter += 1 - else: - if len(fragments) >2: - raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.") - # nothing will happen for 0 fragments - if len(fragments) == 1: - #print "Tada1" - boundaries = calc_subseq_boundaries(maskedseq,maskchar) - if len(boundaries) < 1 or len(boundaries) >3: - raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) - if len(boundaries) == 3: - # case: mask char on both sides of sequence - #print "bounds3" - data['clip_adapter_left']=1+boundaries[0][1] - data['clip_adapter_right']=boundaries[2][0] - elif len(boundaries) == 2: - # case: mask char left or right of sequence - #print "bounds2", - if maskedseq[0] == maskchar : - # case: mask char left - #print "left" - data['clip_adapter_left']=1+boundaries[0][1] - else: - # case: mask char right - #print "right" - data['clip_adapter_right']=boundaries[1][0] - data['name'] = data['name'] + ".fn" - write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) - numseqs = 1 - elif len(fragments) == 2: - #print "Tada2" - oname = readname + ".r" - seq, qual = get_read_data(data) - - startsearch = False - for spos in range(len(maskedseq)): - if maskedseq[spos] != maskchar: - startsearch = True; - else: - if startsearch: - break - - #print "\nspos: ", spos - lseq=seq[:spos] - #print "lseq:", lseq - actseq = reverse_complement(lseq) - lreadlen = len(actseq) - lqual = qual[:spos]; - # python hack to reverse a list/string/etc - lqual = lqual[::-1]; - - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in lqual))) - #qual_fh.write("\n") - - write_sequence(oname,actseq,lqual,seq_fh,qual_fh) - - to_print = [create_basic_xml_info(oname,sff_fh.name)] - to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - to_print.append(' r\n') - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - - oname = readname + ".f" - startsearch = False - for spos in range(len(maskedseq)-1,-1,-1): - if maskedseq[spos] != maskchar: - startsearch = True; - else: - if startsearch: - break - - actseq = seq[spos+1:] - actqual = qual[spos+1:]; - - #print "\nspos: ", spos - #print "rseq:", actseq - - #seq_fh.write(">"+oname+"\n") - #seq_fh.write(actseq+"\n") - #qual_fh.write(">"+oname+"\n") - #qual_fh.write(' '.join((str(q) for q in actqual))) - #qual_fh.write("\n") - write_sequence(oname,actseq,actqual,seq_fh,qual_fh) - - rreadlen = len(actseq) - to_print = [create_basic_xml_info(oname,sff_fh.name)] - to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); - to_print.append(' ') - to_print.append(readname) - to_print.append('\n') - to_print.append(' f\n') - to_print.append(' \n') - xml_fh.write(''.join(to_print)) - numseqs = 2 - - return numseqs - - - -def extract_reads_from_sff(config, sff_files): - '''Given the configuration and the list of sff_files it writes the seqs, - qualities and ancillary data into the output file(s). 
- - If file for paired-end linker was given, first extracts all sequences - of an SFF and searches these against the linker(s) with SSAHA2 to - create needed information to split reads. - ''' - - global ssahapematches - - - if len(sff_files) == 0 : - raise RuntimeError("No SFF file given?") - - #we go through all input files - for sff_file in sff_files: - if not os.path.getsize(sff_file): - raise RuntimeError('Empty file? : ' + sff_file) - fh = open(sff_file, 'r') - fh.close() - - openmode = 'w' - if config['append']: - openmode = 'a' - - seq_fh = open(config['seq_fname'], openmode) - xml_fh = open(config['xml_fname'], openmode) - if config['want_fastq']: - qual_fh = None - try: - os.remove(config['qual_fname']) - except : - python_formattingwithoutbracesisdumb_dummy = 1 - else: - qual_fh = open(config['qual_fname'], openmode) - - if not config['append']: - xml_fh.write('\n\n') - else: - remove_last_xmltag_in_file(config['xml_fname'], "trace_volume") - - #we go through all input files - for sff_file in sff_files: - #print "Working on '" + sff_file + "':" - ssahapematches.clear() - - seqcheckstore = ([]) - - debug = 0 - - if not debug and config['pelinker_fname']: - #print "Creating temporary sequences from reads in '" + sff_file + "' ... ", - sys.stdout.flush() - - if 0 : - # for debugging - pid = os.getpid() - tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' - tmpfasta_fh = open(tmpfasta_fname, 'w') - else: - tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_', - suffix = '.fasta') - - sff_fh = open(sff_file, 'rb') - header_data = read_header(fileh=sff_fh) - for seq_data in sequences(fileh=sff_fh, header=header_data): - seq,qual = get_read_data(seq_data) - seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual) - tmpfasta_fh.write(seqstring) - #seq, qual, anci = extract_read_info(seq_data, sff_fh.name) - #tmpfasta_fh.write(seq) - #print "done." - tmpfasta_fh.seek(0) - - if 0 : - # for debugging - tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' - tmpssaha_fh = open(tmpssaha_fname, 'w+') - else: - tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_', - suffix = '.ssaha2') - - launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) - tmpfasta_fh.close() - - tmpssaha_fh.seek(0) - read_ssaha_data(tmpssaha_fh) - tmpssaha_fh.close() - - if debug: - tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r') - read_ssaha_data(tmpssaha_fh) - - #print "Converting '" + sff_file + "' ... ", - sys.stdout.flush() - sff_fh = open(sff_file, 'rb') - #read_header(infile) - header_data = read_header(fileh=sff_fh) - - #now convert all reads - nseqs_sff = 0 - nseqs_out = 0 - for seq_data in sequences(fileh=sff_fh, header=header_data): - nseqs_sff += 1 - - seq, qual = clip_read(seq_data) - seqcheckstore.append(seq[0:50]) - - #if nseqs_sff >1000: - # check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) - # sys.exit() - - if ssahapematches.has_key(seq_data['name']): - #print "Paired end:",seq_data['name'] - nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) - else: - #print "Normal:",seq_data['name'] - if config['pelinker_fname']: - seq_data['name'] = seq_data['name'] + ".fn" - write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) - nseqs_out += 1 - #print "done." - #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.' 
- sff_fh.close() - - check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) - seqcheckstore = ([]) - - xml_fh.write('\n') - - xml_fh.close() - seq_fh.close() - if qual_fh is not None: - qual_fh.close() - - return - -def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): - - global stern_warning - - foundproblem = "" - for checklen in range(1,len(seqcheckstore[0])): - foundinloop = False - seqdict = {} - for seq in seqcheckstore: - shortseq = seq[0:checklen] - if shortseq in seqdict: - seqdict[shortseq] += 1 - else: - seqdict[shortseq] = 1 - - for shortseq, count in seqdict.items(): - if float(count)/len(seqcheckstore) >= 0.5: - foundinloop = True - stern_warning - foundproblem = "\n"+"*" * 80 - foundproblem += "\nWARNING: " - foundproblem += "weird sequences in file " + sffname + "\n\n" - foundproblem += "After applying left clips, " + str(count) + " sequences (=" - foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) - foundproblem += "%) start with these bases:\n" + shortseq - foundproblem += "\n\nThis does not look sane.\n\n" - foundproblem += "Countermeasures you *probably* must take:\n" - foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" - foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" - left,right = return_merged_clips(seqdata) - foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" - foundproblem += "*" * 80 + "\n" - if not foundinloop : - break - if len(foundproblem): - print foundproblem - - -def parse_extra_info(info): - '''It parses the information that will go in the xml file. - - There are two formats accepted for the extra information: - key1:value1, key2:value2 - or: - file1.sff{key1:value1, key2:value2};file2.sff{key3:value3} - ''' - if not info: - return info - finfos = info.split(';') #information for each file - data_for_files = {} - for finfo in finfos: - #we split the file name from the rest - items = finfo.split('{') - if len(items) == 1: - fname = fake_sff_name - info = items[0] - else: - fname = items[0] - info = items[1] - #now we get each key,value pair in the info - info = info.replace('}', '') - data = {} - for item in info.split(','): - key, value = item.strip().split(':') - key = key.strip() - value = value.strip() - data[key] = value - data_for_files[fname] = data - return data_for_files - - -def return_merged_clips(data): - '''It returns the left and right positions to clip.''' - def max(a, b): - '''It returns the max of the two given numbers. - - It won't take into account the zero values. - ''' - if not a and not b: - return None - if not a: - return b - if not b: - return a - if a >= b: - return a - else: - return b - def min(a, b): - '''It returns the min of the two given numbers. - - It won't take into account the zero values. - ''' - if not a and not b: - return None - if not a: - return b - if not b: - return a - if a <= b: - return a - else: - return b - left = max(data['clip_adapter_left'], data['clip_qual_left']) - right = min(data['clip_adapter_right'], data['clip_qual_right']) - #maybe both clips where zero - if left is None: - left = 1 - if right is None: - right = data['number_of_bases'] - return left, right - -def sequence_case(data): - '''Given the data for one read it returns the seq with mixed case. - - The regions to be clipped will be lower case and the rest upper case. 
- ''' - left, right = return_merged_clips(data) - seq = data['bases'] - new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower())) - return new_seq - -def clip_read(data): - '''Given the data for one read it returns clipped seq and qual.''' - - qual = data['quality_scores'] - left, right = return_merged_clips(data) - seq = data['bases'] - qual = data['quality_scores'] - new_seq = seq[left-1:right] - new_qual = qual[left-1:right] - - return new_seq, new_qual - - - -def tests_for_ssaha(linker_fname): - '''Tests whether SSAHA2 can be successfully called.''' - - try: - print "Testing whether SSAHA2 is installed and can be launched ... ", - sys.stdout.flush() - fh = open('/dev/null', 'w') - retcode = subprocess.call(["ssaha2", "-v"], stdout = fh) - fh.close() - print "ok." - except : - print "nope? Uh oh ...\n\n" - raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?') - - -def load_linker_sequences(linker_fname): - '''Loads all linker sequences into memory, storing only the length - of each linker.''' - - global linkerlengths - - if not os.path.getsize(linker_fname): - raise RuntimeError("File empty? '" + linker_fname + "'") - fh = open(linker_fname, 'r') - linkerseqs = read_fasta(fh) - if len(linkerseqs) == 0: - raise RuntimeError(linker_fname + ": no sequence found?") - for i in linkerseqs: - if linkerlengths.has_key(i.name): - raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.") - linkerlengths[i.name] = len(i.sequence) - fh.close() - - -def launch_ssaha(linker_fname, query_fname, output_fh): - '''Launches SSAHA2 on the linker and query file, string SSAHA2 output - into the output filehandle''' - - try: - print "Searching linker sequences with SSAHA2 (this may take a while) ... ", - sys.stdout.flush() - retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh) - if retcode: - raise RuntimeError('Ups.') - else: - print "ok." - except: - print "\n" - raise RuntimeError('An error occured during the SSAHA2 execution, aborting.') - -def read_ssaha_data(ssahadata_fh): - '''Given file handle, reads file generated with SSAHA2 (with default - output format) and stores all matches as list ssahapematches - (ssaha paired-end matches) dictionary''' - - global ssahapematches - - print "Parsing SSAHA2 result file ... ", - sys.stdout.flush() - - for line in ssahadata_fh: - if line.startswith('ALIGNMENT'): - ml = line.split() - if len(ml) != 12 : - print "\n", line, - raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml))) - if not ssahapematches.has_key(ml[2]) : - ssahapematches[ml[2]] = ([]) - if ml[8] == 'F': - #print line, - - # store everything except the first element (output - # format name (ALIGNMENT)) and the last element - # (length) - ssahapematches[ml[2]].append(ml[1:-1]) - else: - #print ml - ml[4],ml[5] = ml[5],ml[4] - #print ml - ssahapematches[ml[2]].append(ml[1:-1]) - - print "done." 
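# --- Editor's sketch, not part of the original sff_extract.py ------------
# return_merged_clips() above combines the adapter and quality clip pairs
# stored in an SFF record, where 0 means "no clip on this side": the merged
# left clip is the larger of the two left values, the merged right clip the
# smaller of the two right values, and clip_read() keeps the 1-based
# inclusive range [left, right]. The same rule, condensed:
def merged_clips_sketch(adap_l, qual_l, adap_r, qual_r, nbases):
    '''Return (left, right), 1-based inclusive bounds of the good region.'''
    lefts = [c for c in (adap_l, qual_l) if c]
    rights = [c for c in (adap_r, qual_r) if c]
    left = max(lefts) if lefts else 1           # no left clip: start at base 1
    right = min(rights) if rights else nbases   # no right clip: run to the end
    return left, right

def clip_sketch(seq, left, right):
    return seq[left - 1:right]   # 1-based inclusive bounds -> Python slice

# merged_clips_sketch(4, 7, 0, 95, 100) -> (7, 95)
# --- end editor's sketch --------------------------------------------------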
- - -########################################################################## -# -# BaCh: This block was shamelessly copied from -# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ -# and then subsequently modified to read fasta correctly -# It's still not fool proof, but should be good enough -# -########################################################################## - -class Fasta: - def __init__(self, name, sequence): - self.name = name - self.sequence = sequence - -def read_fasta(file): - items = [] - aninstance = Fasta('', '') - linenum = 0 - for line in file: - linenum += 1 - if line.startswith(">"): - if len(aninstance.sequence): - items.append(aninstance) - aninstance = Fasta('', '') - # name == all characters until the first whitespace - # (split()[0]) but without the starting ">" ([1:]) - aninstance.name = line.split()[0][1:] - aninstance.sequence = '' - if len(aninstance.name) == 0: - raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?') - - else: - if len(aninstance.name) == 0: - raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?') - aninstance.sequence += line.strip() - - if len(aninstance.name) and len(aninstance.sequence): - items.append(aninstance) - - return items -########################################################################## - -def version_string (): - return "sff_extract " + __version__ - -def read_config(): - '''It reads the configuration options from the command line arguments and - it returns a dict with them.''' - from optparse import OptionParser, OptionGroup - usage = "usage: %prog [options] sff1 sff2 ..." - desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\ - " and XML traceinfo format. When a paired-end linker sequence"\ - " is given (-l), use SSAHA2 to scan the sequences for the linker,"\ - " then split the sequences, removing the linker." 
- parser = OptionParser(usage = usage, version = version_string(), description = desc) - parser.add_option('-a', '--append', action="store_true", dest='append', - help='append output to existing files', default=False) - parser.add_option('-i', '--xml_info', dest='xml_info', - help='extra info to write in the xml file') - parser.add_option("-l", "--linker_file", dest="pelinker_fname", - help="FASTA file with paired-end linker sequences", metavar="FILE") - - group = OptionGroup(parser, "File name options","") - group.add_option('-c', '--clip', action="store_true", dest='clip', - help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False) - group.add_option('-u', '--upper_case', action="store_false", dest='mix_case', - help='all bases in upper case, including clipped ends', default=True) - group.add_option('', '--min_left_clip', dest='min_leftclip', - metavar="INTEGER", type = "int", - help='if the left clip coming from the SFF is smaller than this value, override it', default=0) - group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq', - help='store as FASTQ file instead of FASTA + FASTA quality file', default=False) - parser.add_option_group(group) - - group = OptionGroup(parser, "File name options","") - group.add_option("-o", "--out_basename", dest="basename", - help="base name for all output files") - group.add_option("-s", "--seq_file", dest="seq_fname", - help="output sequence file name", metavar="FILE") - group.add_option("-q", "--qual_file", dest="qual_fname", - help="output quality file name", metavar="FILE") - group.add_option("-x", "--xml_file", dest="xml_fname", - help="output ancillary xml file name", metavar="FILE") - parser.add_option_group(group) - - #default fnames - #is there an sff file? 
- basename = 'reads' - if sys.argv[-1][-4:].lower() == '.sff': - basename = sys.argv[-1][:-4] - def_seq_fname = basename + '.fasta' - def_qual_fname = basename + '.fasta.qual' - def_xml_fname = basename + '.xml' - def_pelinker_fname = '' - parser.set_defaults(seq_fname = def_seq_fname) - parser.set_defaults(qual_fname = def_qual_fname) - parser.set_defaults(xml_fname = def_xml_fname) - parser.set_defaults(pelinker_fname = def_pelinker_fname) - - #we parse the cmd line - (options, args) = parser.parse_args() - - #we put the result in a dict - global config - config = {} - for property in dir(options): - if property[0] == '_' or property in ('ensure_value', 'read_file', - 'read_module'): - continue - config[property] = getattr(options, property) - - if config['basename'] is None: - config['basename']=basename - - #if we have not set a file name with -s, -q or -x we set the basename - #based file name - if config['want_fastq']: - config['qual_fname'] = '' - if config['seq_fname'] == def_seq_fname: - config['seq_fname'] = config['basename'] + '.fastq' - else: - if config['seq_fname'] == def_seq_fname: - config['seq_fname'] = config['basename'] + '.fasta' - if config['qual_fname'] == def_qual_fname: - config['qual_fname'] = config['basename'] + '.fasta.qual' - - if config['xml_fname'] == def_xml_fname: - config['xml_fname'] = config['basename'] + '.xml' - - #we parse the extra info for the xml file - config['xml_info'] = parse_extra_info(config['xml_info']) - return config, args - - - -########################################################################## - - -def testsome(): - sys.exit() - return - - -def debug(): - try: - dummy = 1 - #debug() - #testsome() - - config, args = read_config() - load_linker_sequences(config['pelinker_fname']) - - #pid = os.getpid() - pid = 15603 - - #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' - #tmpfasta_fh = open(tmpfasta_fname, 'w') - tmpfasta_fname = 'FLVI58L05.fa' - tmpfasta_fh = open(tmpfasta_fname, 'r') - - tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' - tmpssaha_fh = open(tmpssaha_fname, 'w') - - launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) - - tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r') - read_ssaha_data(tmpssaha_fh) - - sys.exit() - - extract_reads_from_sff(config, args) - - except (OSError, IOError, RuntimeError), errval: - print errval - sys.exit() - - sys.exit() - - -def main(): - - argv = sys.argv - if len(argv) == 1: - sys.argv.append('-h') - read_config() - sys.exit() - try: - #debug(); - - config, args = read_config() - - if config['pelinker_fname']: - #tests_for_ssaha(config['pelinker_fname']) - load_linker_sequences(config['pelinker_fname']) - if len(args) == 0: - raise RuntimeError("No SFF file given?") - extract_reads_from_sff(config, args) - except (OSError, IOError, RuntimeError), errval: - print errval - return 1 - - if stern_warning: - return 1 - - return 0 - - - -if __name__ == "__main__": - sys.exit(main()) diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sff_extractor.xml --- a/tools/filters/sff_extractor.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - - - #if str($fastq_output) == "fastq_false" #sff_extract.py $clip --seq_file=$out_file3 --qual_file=$out_file4 --xml_file=$out_file2 $input - #elif str($fastq_output) == "fastq_true" #sff_extract.py $clip --fastq --seq_file=$out_file1 --xml_file=$out_file2 $input - #end if# - - - - - - - - - - - - fastq_output is True - - - - - fastq_output is False - - - fastq_output is False - - - - - - - - - 
- - - - - - - - - - - - -**What it does** - -This tool extracts data from the 454 Sequencer SFF format and creates three files containing the: -Sequences (FASTA), -Qualities (QUAL) and -Clippings (XML) - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sorter.py --- a/tools/filters/sorter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -""" -Sorts tabular data on one or more columns. - -usage: %prog [options] - -i, --input=i: Tabular file to be sorted - -o, --out_file1=o: Sorted output file - -c, --column=c: First column to sort on - -s, --style=s: Sort style (numerical or alphabetical) - -r, --order=r: Order (ASC or DESC) - -usage: %prog input out_file1 column style order [column style ...] -""" - -import os, re, string, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def main(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - try: - inputfile = options.input - outputfile = '-o %s' % options.out_file1 - columns = [options.column] - styles = [('','n')[options.style == 'num']] - orders = [('','r')[options.order == 'DESC']] - col_style_orders = sys.argv[6:] - if len(col_style_orders) > 1: - columns.extend([col_style_orders[i] for i in range(0,len(col_style_orders),3)]) - styles.extend([('','n')[col_style_orders[i] == 'num'] for i in range(1,len(col_style_orders),3)]) - orders.extend([('','r')[col_style_orders[i] == 'DESC'] for i in range(2,len(col_style_orders),3)]) - cols = [ '-k%s,%s%s%s'%(columns[i], columns[i], styles[i], orders[i]) for i in range(len(columns)) ] - except Exception, ex: - stop_err('Error parsing input parameters\n' + str(ex)) - - # Launch sort. - cmd = "sort -f -t ' ' %s %s %s" % (' '.join(cols), outputfile, inputfile) - try: - os.system(cmd) - except Exception, ex: - stop_err('Error running sort command\n' + str(ex)) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/sorter.xml --- a/tools/filters/sorter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - data in ascending or descending order - - sorter.py - --input=$input - --out_file1=$out_file1 - --column=$column - --style=$style - --order=$order - #for $col in $column_set: - ${col.other_column} - ${col.other_style} - ${col.other_order} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool sorts the dataset on any number of columns in either ascending or descending order. - -* Numerical sort orders numbers by their magnitude, ignores all characters besides numbers, and evaluates a string of numbers to the value they signify. -* Alphabetical sort is a phonebook type sort based on the conventional order of letters in an alphabet. Each nth letter is compared with the nth letter of other words in the list, starting at the first letter of each word and advancing to the second, third, fourth, and so on, until the order is established. Therefore, in an alphabetical sort, 2 comes after 100 (1 < 2). - ------ - -**Examples** - -The list of numbers 4,17,3,5 collates to 3,4,5,17 by numerical sorting, while it collates to 17,3,4,5 by alphabetical sorting. 
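
Under the hood, sorter.py assembles a single GNU sort invocation with one
"-k col,col" key per requested column, adding "n" for numerical and "r" for
descending keys. The same multi-key ordering can be reproduced in Python by
sorting on the minor keys first and relying on sort stability; a sketch under
that assumption (not the wrapper's actual code; "input.tsv" is a placeholder
file name)::

    def sort_rows(rows, keys):
        # keys: (1-based column, 'num' or 'alpha', 'ASC' or 'DESC'),
        # listed with the primary key first, as in the tool form
        for col, style, order in reversed(keys):
            def keyfunc(fields, c=col - 1, num=(style == 'num')):
                return float(fields[c]) if num else fields[c]
            rows.sort(key=keyfunc, reverse=(order == 'DESC'))
        return rows

    rows = [line.rstrip('\n').split('\t') for line in open('input.tsv')]
    sort_rows(rows, [(1, 'alpha', 'ASC'), (3, 'num', 'ASC'), (6, 'num', 'ASC')])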
- -Sorting the following:: - - Q d 7 II jhu 45 - A kk 4 I h 111 - Pd p 1 ktY WS 113 - A g 10 H ZZ 856 - A edf 4 tw b 234 - BBB rt 10 H ZZ 100 - A rew 10 d b 1111 - C sd 19 YH aa 10 - Hah c 23 ver bb 467 - MN gtr 1 a X 32 - N j 9 a T 205 - BBB rrf 10 b Z 134 - odfr ws 6 Weg dew 201 - C f 3 WW SW 34 - A jhg 4 I b 345 - Pd gf 7 Gthe de 567 - rS hty 90 YY LOp 89 - A g 10 H h 43 - A g 4 I h 500 - -on columns 1 (alpha), 3 (num), and 6 (num) in ascending order will yield:: - - A kk 4 I h 111 - A edf 4 tw b 234 - A jhg 4 I b 345 - A g 4 I h 500 - A g 10 H h 43 - A g 10 H ZZ 856 - A rew 10 d b 1111 - BBB rt 10 H ZZ 100 - BBB rrf 10 b Z 134 - C f 3 WW SW 34 - C sd 19 YH aa 10 - Hah c 23 ver bb 467 - MN gtr 1 a X 32 - N j 9 a T 205 - odfr ws 6 Weg dew 201 - Pd p 1 ktY WS 113 - Pd gf 7 Gthe de 567 - Q d 7 II jhu 45 - rS hty 90 YY LOp 89 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/tailWrapper.pl --- a/tools/filters/tailWrapper.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -#! /usr/bin/perl -w - -use strict; -use warnings; - -# a wrapper for tail for use in galaxy -# lessWrapper.pl [filename] [# lines to show] [output] - -die "Check arguments" unless @ARGV == 3; -die "Line number should be an integer\n" unless $ARGV[1]=~ m/^\d+$/; - -open (OUT, ">$ARGV[2]") or die "Cannot create $ARGV[2]:$!\n"; -open (TAIL, "tail -n $ARGV[1] $ARGV[0]|") or die "Cannot run tail:$!\n"; -while () { - print OUT; -} -close OUT; -close TAIL; - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/tailWrapper.xml --- a/tools/filters/tailWrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - lines from a dataset - tailWrapper.pl $input $lineNum $out_file1 - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs specified number of lines from the **end** of a dataset - ------ - -**Example** - -- Input File:: - - chr7 57134 57154 D17003_CTCF_R7 356 - - chr7 57247 57267 D17003_CTCF_R4 207 + - chr7 57314 57334 D17003_CTCF_R5 269 + - chr7 57341 57361 D17003_CTCF_R7 375 + - chr7 57457 57477 D17003_CTCF_R3 188 + - -- Show last two lines of above file. The result is:: - - chr7 57341 57361 D17003_CTCF_R7 375 + - chr7 57457 57477 D17003_CTCF_R3 188 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/trimmer.py --- a/tools/filters/trimmer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -import sys -import optparse - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - usage = """%prog [options] - -options (listed below) default to 'None' if omitted - """ - parser = optparse.OptionParser(usage=usage) - - parser.add_option( - '-a','--ascii', - dest='ascii', - action='store_true', - default = False, - help='Use ascii codes to defined ignored beginnings instead of raw characters') - - parser.add_option( - '-q','--fastq', - dest='fastq', - action='store_true', - default = False, - help='The input data in fastq format. 
It selected the script skips every even line since they contain sequence ids') - - parser.add_option( - '-i','--ignore', - dest='ignore', - help='A comma separated list on ignored beginnings (e.g., ">,@"), or its ascii codes (e.g., "60,42") if option -a is enabled') - - parser.add_option( - '-s','--start', - dest='start', - default = '0', - help='Trim from beginning to here (1-based)') - - parser.add_option( - '-e','--end', - dest='end', - default = '0', - help='Trim from here to the ned (1-based)') - - parser.add_option( - '-f','--file', - dest='input_txt', - default = False, - help='Name of file to be chopped. STDIN is default') - - parser.add_option( - '-c','--column', - dest='col', - default = '0', - help='Column to chop. If 0 = chop the whole line') - - - options, args = parser.parse_args() - invalid_starts = [] - - if options.input_txt: - infile = open ( options.input_txt, 'r') - else: - infile = sys.stdin - - if options.ignore and options.ignore != "None": - invalid_starts = options.ignore.split(',') - - if options.ascii and options.ignore and options.ignore != "None": - for i, item in enumerate( invalid_starts ): - invalid_starts[i] = chr( int( item ) ) - - col = int( options.col ) - - for i, line in enumerate( infile ): - line = line.rstrip( '\r\n' ) - if line: - - if options.fastq and i % 2 == 0: - print line - continue - - - if line[0] not in invalid_starts: - if col == 0: - if int( options.end ) > 0: - line = line[ int( options.start )-1 : int( options.end ) ] - else: - line = line[ int( options.start )-1 : ] - else: - fields = line.split( '\t' ) - if col-1 > len( fields ): - stop_err('Column %d does not exist. Check input parameters\n' % col) - - if int( options.end ) > 0: - fields[col - 1] = fields[col - 1][ int( options.start )-1 : int( options.end ) ] - else: - fields[col - 1] = fields[col - 1][ int( options.start )-1 : ] - line = '\t'.join(fields) - print line - -if __name__ == "__main__": main() - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/trimmer.xml --- a/tools/filters/trimmer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - leading or trailing characters - - trimmer.py -a -f $input1 -c $col -s $start -e $end -i $ignore $fastq > $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Trims specified number of characters from a dataset or its field (if dataset is tab-delimited). - ------ - -**Example 1** - -Trimming this dataset:: - - 1234567890 - abcdefghijk - -by setting **Trim from the beginning to this position** to *2* and **Remove everything from this position to the end** to *6* will produce:: - - 23456 - bcdef - ------ - -**Example 2** - -Trimming column 2 of this dataset:: - - abcde 12345 fghij 67890 - fghij 67890 abcde 12345 - -by setting **Trim content of this column only** to *2*, **Trim from the beginning to this position** to *2*, and **Remove everything from this position to the end** to *4* will produce:: - - abcde 234 fghij 67890 - fghij 789 abcde 12345 - ------ - -**Trimming FASTQ datasets** - -This tool can be used to trim sequences and quality strings in fastq datasets. This is done by selected *Yes* from the **Is input dataset in fastq format?** dropdown. If set to *Yes*, the tool will skip all even numbered lines (see warning below). 
For example, trimming the last 5 bases of this dataset:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B - -can be done by setting **Remove everything from this position to the end** to 31:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAA - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A3 - -**Note** that headers are skipped. - -.. class:: warningmark - -**WARNING:** This tool will only work on properly formatted fastq datasets where (1) each read and quality string occupy one line and (2) '@' (read header) and "+" (quality header) lines are evenly numbered, as in the above example. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_exon_bed.py --- a/tools/filters/ucsc_gene_bed_to_exon_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,152 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. - -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, codon, intron, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--region", dest="region", default="transcribed", - help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - parser.add_option( "-e", "--exons", action="store_true", dest="exons", - help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed', 'intron', 'codon' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." 
- sys.exit(0) - - print "Region:", options.region+";" - """print "Only overlap with Exons:", - if options.exons: - print "Yes" - else: - print "No" - """ - - # Read table and handle each gene - for line in in_file: - try: - if line[0:1] == "#": - continue - # Parse fields from gene tabls - fields = line.split( '\t' ) - chrom = fields[0] - tx_start = int( fields[1] ) - tx_end = int( fields[2] ) - name = fields[3] - strand = fields[5].replace(" ","_") - cds_start = int( fields[6] ) - cds_end = int( fields[7] ) - - # Determine the subset of the transcribed region we are interested in - if options.region == 'utr3': - if strand == '-': region_start, region_end = tx_start, cds_start - else: region_start, region_end = cds_end, tx_end - elif options.region == 'utr5': - if strand == '-': region_start, region_end = cds_end, tx_end - else: region_start, region_end = tx_start, cds_start - elif options.region == 'coding' or options.region == 'codon': - region_start, region_end = cds_start, cds_end - else: - region_start, region_end = tx_start, tx_end - - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - # options.exons is always TRUE - if options.exons: - exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) - exon_starts = map((lambda x: x + tx_start ), exon_starts) - exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); - - #for Intron regions: - if options.region == 'intron': - i=0 - while i < len(exon_starts)-1: - intron_starts = exon_ends[i] - intron_ends = exon_starts[i+1] - if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) - else: print_tab_sep(out_file, chrom, intron_starts, intron_ends ) - i+=1 - #for non-intron regions: - else: - for start, end in zip( exon_starts, exon_ends ): - start = max( start, region_start ) - end = min( end, region_end ) - if start < end: - if options.region == 'codon': - start += (3 - ((start-region_start)%3))%3 - c_start = start - while c_start+3 <= end: - if strand: - print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, c_start, c_start+3) - c_start += 3 - else: - if strand: - print_tab_sep(out_file, chrom, start, end, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, start, end ) - """ - else: - if options.region == 'codon': - c_start = start - c_end = end - if c_start > c_end: - t = c_start - c_start = c_end - c_end = t - while c_start+3 <= c_end: - if strand: - print_tab_sep(out_file, chrom, c_start, c_start+3, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, c_start, c_start+3) - c_start += 3 - else: - if strand: - print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand ) - else: - print_tab_sep(out_file, chrom, region_start, region_end ) - """ - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_exon_bed.xml --- a/tools/filters/ucsc_gene_bed_to_exon_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - -expander - ucsc_gene_bed_to_exon_bed.py --input=$input1 --output=$out_file1 --region=$region "--exons" - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: warningmark - -This tool works only on a BED file that contains at least 12 fields (see **Example** and **About formats** below). The output will be empty if applied to a BED file with 3 or 6 fields. - ------- - -**What it does** - -BED format can be used to represent a single gene in just one line, which contains the information about exons, coding sequence location (CDS), and positions of untranslated regions (UTRs). This tool *unpacks* this information by converting a single line describing a gene into a collection of lines representing individual exons, introns, UTRs, etc. - -------- - -**Example** - -Extracting **Coding Exons + UTR Exons** from the following two BED lines:: - - chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 - chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 - -will return:: - - chr7 127475281 127475310 NM_000230 0 + - chr7 127485994 127486166 NM_000230 0 + - chr7 127488407 127491632 NM_000230 0 + - chr7 127486011 127486166 D49487 0 + - chr7 127488410 127488900 D49487 0 + - ------- - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and additional optional ones. In the specific case of this tool the following fields must be present:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_intron_bed.py --- a/tools/filters/ucsc_gene_bed_to_intron_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. 
- -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - #parser.add_option( "-r", "--region", dest="region", default="transcribed", - # help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - #parser.add_option( "-e", "--exons", action="store_true", dest="exons", - # help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - #assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." - sys.exit(0) - - #print "Region:", options.region+";" - #print "Only overlap with Exons:", - #if options.exons: - # print "Yes" - #else: - # print "No" - - # Read table and handle each gene - - for line in in_file: - try: - #print ("len: %d", len(line)) - if line[0:1] == "#": - continue - - # Parse fields from gene tabls - fields = line.split( '\t' ) - chrom = fields[0] - tx_start = int( fields[1] ) - tx_end = int( fields[2] ) - name = fields[3] - strand = fields[5].replace(" ","_") - cds_start = int( fields[6] ) - cds_end = int( fields[7] ) - - exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) - exon_starts = map((lambda x: x + tx_start ), exon_starts) - exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); - - i=0 - while i < len(exon_starts)-1: - intron_starts = exon_ends[i] + 1 - intron_ends = exon_starts[i+1] - 1 - if strand: print_tab_sep(out_file, chrom, intron_starts, intron_ends, name, "0", strand ) - else: print_tab_sep(out_file, chrom, intron_starts, intron_ends ) - i+=1 - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_bed_to_intron_bed.xml --- a/tools/filters/ucsc_gene_bed_to_intron_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - -expander - ucsc_gene_bed_to_intron_bed.py --input=$input1 --output=$out_file1 - - - - - - - - - - - - - - - -**Syntax** - -This tool converts a UCSC gene bed format file to a list of bed format lines corresponding to requested features of each gene. - -- **BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. 
It has three required fields and twelve additional optional ones:: - - The first three BED fields (required) are: - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - - The twelve additional BED fields (optional) are: - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. - ------ - -**Example** - -- A UCSC gene bed format file:: - - chr7 127475281 127491632 NM_000230 0 + 127486022 127488767 0 3 29,172,3225, 0,10713,13126 - chr7 127486011 127488900 D49487 0 + 127486022 127488767 0 2 155,490, 0,2399 - -- Converts the above file to a list of bed lines, which has the introns:: - - chr7 127475311 127475993 NM_000230 0 + - chr7 127486167 127488406 NM_000230 0 + - chr7 127486167 127488409 D49487 0 + - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_table_to_intervals.py --- a/tools/filters/ucsc_gene_table_to_intervals.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python - -""" -Read a table dump in the UCSC gene table format and print a tab separated -list of intervals corresponding to requested features of each gene. 
- -usage: ucsc_gene_table_to_intervals.py [options] - -options: - -h, --help show this help message and exit - -rREGION, --region=REGION - Limit to region: one of coding, utr3, utr5, transcribed [default] - -e, --exons Only print intervals overlapping an exon - -i, --input=inputfile input file - -o, --output=outputfile output file -""" - -import optparse, string, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - # Parse command line - parser = optparse.OptionParser( usage="%prog [options] " ) - parser.add_option( "-r", "--region", dest="region", default="transcribed", - help="Limit to region: one of coding, utr3, utr5, transcribed [default]" ) - parser.add_option( "-e", "--exons", action="store_true", dest="exons", - help="Only print intervals overlapping an exon" ) - parser.add_option( "-s", "--strand", action="store_true", dest="strand", - help="Print strand after interval" ) - parser.add_option( "-i", "--input", dest="input", default=None, - help="Input file" ) - parser.add_option( "-o", "--output", dest="output", default=None, - help="Output file" ) - options, args = parser.parse_args() - assert options.region in ( 'coding', 'utr3', 'utr5', 'transcribed' ), "Invalid region argument" - - try: - out_file = open (options.output,"w") - except: - print >> sys.stderr, "Bad output file." - sys.exit(0) - - try: - in_file = open (options.input) - except: - print >> sys.stderr, "Bad input file." - sys.exit(0) - - print "Region:", options.region+";" - print "Only overlap with Exons:", - if options.exons: - print "Yes" - else: - print "No" - - # Read table and handle each gene - for line in in_file: - try: - if line[0:1] == "#": - continue - # Parse fields from gene tabls - fields = line.split( '\t' ) - name = fields[0] - chrom = fields[1] - strand = fields[2].replace(" ","_") - tx_start = int( fields[3] ) - tx_end = int( fields[4] ) - cds_start = int( fields[5] ) - cds_end = int( fields[6] ) - - # Determine the subset of the transcribed region we are interested in - if options.region == 'utr3': - if strand == '-': region_start, region_end = tx_start, cds_start - else: region_start, region_end = cds_end, tx_end - elif options.region == 'utr5': - if strand == '-': region_start, region_end = cds_end, tx_end - else: region_start, region_end = tx_start, cds_start - elif options.region == 'coding': - region_start, region_end = cds_start, cds_end - else: - region_start, region_end = tx_start, tx_end - - # If only interested in exons, print the portion of each exon overlapping - # the region of interest, otherwise print the span of the region - if options.exons: - exon_starts = map( int, fields[8].rstrip( ',\n' ).split( ',' ) ) - exon_ends = map( int, fields[9].rstrip( ',\n' ).split( ',' ) ) - for start, end in zip( exon_starts, exon_ends ): - start = max( start, region_start ) - end = min( end, region_end ) - if start < end: - if strand: print_tab_sep(out_file, chrom, start, end, name, "0", strand ) - else: print_tab_sep(out_file, chrom, start, end ) - else: - if strand: print_tab_sep(out_file, chrom, region_start, region_end, name, "0", strand ) - else: print_tab_sep(out_file, chrom, region_start, region_end ) - except: - continue - -def print_tab_sep(out_file, *args ): - """Print items in `l` to stdout separated by tabs""" - print >>out_file, string.join( [ str( f ) for f in args ], '\t' ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/ucsc_gene_table_to_intervals.xml --- a/tools/filters/ucsc_gene_table_to_intervals.xml Fri Mar 09 19:45:42 2012 
-0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ - -Parse a UCSC Gene Table dump - ucsc_gene_table_to_intervals.py --input=$input1 --output=$out_file1 --region=$region $exon - - - - - - - - - - - - - - - - - - - -Read a table dump in the UCSC gene table format and create a BED file corresponding to the requested feature of each gene. - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/filters/uniq.py --- a/tools/filters/uniq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# Filename: uniq.py -# Author: Ian N. Schenck -# Version: 19/12/2005 -# -# This script accepts an input file, an output file, a column -# delimiter, and a list of columns. The script then grabs unique -# lines based on the columns, and returns those records with a count -# of occurrences of each unique column, inserted before the columns. -# -# This executes the command pipeline: -# cut -f $fields | sort | uniq -c -# -# -i Input file -# -o Output file -# -d Delimiter -# -c Column list (Comma Separated) - -import sys -import re -import string -import commands - -# This function is exceedingly useful, perhaps package for reuse? -def getopts(argv): - opts = {} - while argv: - if argv[0][0] == '-': - opts[argv[0]] = argv[1] - argv = argv[2:] - else: - argv = argv[1:] - return opts - -def main(): - args = sys.argv[1:] - - try: - opts = getopts(args) - except IndexError: - print "Usage:" - print " -i Input file" - print " -o Output file" - print " -c Column list (comma separated)" - print " -d Delimiter:" - print " T Tab" - print " C Comma" - print " D Dash" - print " U Underscore" - print " P Pipe" - print " Dt Dot" - print " Sp Space" - return 0 - - outputfile = opts.get("-o") - if outputfile == None: - print "No output file specified." - return -1 - - inputfile = opts.get("-i") - if inputfile == None: - print "No input file specified." - return -2 - - delim = opts.get("-d") - if delim == None: - print "Field delimiter not specified." - return -3 - - columns = opts.get("-c") - if columns == None or columns == 'None': - print "Columns not specified." - return -4 - - # All inputs have been specified at this point, now validate. - fileRegEx = re.compile("^[A-Za-z0-9./\-_]+$") - columnRegEx = re.compile("([0-9]{1,},?)+") - - if not columnRegEx.match(columns): - print "Illegal column specification." - return -4 - if not fileRegEx.match(outputfile): - print "Illegal output filename." - return -5 - if not fileRegEx.match(inputfile): - print "Illegal input filename." 
- return -6 - - column_list = re.split(",",columns) - columns_for_display = "" - for col in column_list: - columns_for_display += "c"+col+", " - - commandline = "cut " - # Set delimiter - if delim=='C': - commandline += "-d \",\" " - if delim=='D': - commandline += "-d \"-\" " - if delim=='U': - commandline += "-d \"_\" " - if delim=='P': - commandline += "-d \"|\" " - if delim=='Dt': - commandline += "-d \".\" " - if delim=='Sp': - commandline += "-d \" \" " - - # set columns - commandline += "-f " + columns - commandline += " " + inputfile + " | sed s/\ //g | sort | uniq -c | sed s/^\ *// | tr \" \" \"\t\" > " + outputfile - errorcode, stdout = commands.getstatusoutput(commandline) - - print "Count of unique values in " + columns_for_display - return errorcode - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/uniq.xml --- a/tools/filters/uniq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - occurrences of each record - uniq.py -i $input -o $out_file1 -c "$column" -d $delim - - - - - - - - - - - - - - - - - - - - - - - - - - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool counts occurrences of unique values in selected column(s). - -- If multiple columns are selected, counting is performed on each unique group of all values in the selected columns. -- The first column of the resulting dataset will be the count of unique values in the selected column(s) and will be followed by each value. - ------ - -**Example** - -- Input file:: - - chr1 10 100 gene1 - chr1 105 200 gene2 - chr1 205 300 gene3 - chr2 10 100 gene4 - chr2 1000 1900 gene5 - chr3 15 1656 gene6 - chr4 10 1765 gene7 - chr4 10 1765 gene8 - -- Counting unique values in column c1 will result in:: - - 3 chr1 - 2 chr2 - 1 chr3 - 2 chr4 - -- Counting unique values in the grouping of columns c2 and c3 will result in:: - - 2 10 100 - 2 10 1765 - 1 1000 1900 - 1 105 200 - 1 15 1656 - 1 205 300 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wc_gnu.xml --- a/tools/filters/wc_gnu.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ - - of a dataset - - #set $word_to_arg = { 'characters':'m', 'words':'w', 'lines':'l' } - #set $arg_order = [ 'lines', 'words', 'characters' ] - #if not isinstance( $options.value, list ): - #set $args = [ $options.value ] - #else: - #set $args = $options.value - #end if - #if $include_header.value: - echo "#${ "\t".join( [ i for i in $arg_order if i in $args ] ) }" > $out_file1 - && - #end if - wc - #for $option in $args: - -${ word_to_arg[ str(option) ] } - #end for - $input1 | awk '{ print ${ '"\\t"'.join( [ "$%i" % ( i+1 ) for i in range( len( $args ) ) ] ) } }' - >> $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool outputs counts of specified attributes (lines, words, characters) of a dataset. 
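The counting itself is delegated to GNU wc, with awk only reordering the selected columns. As a rough illustration of what is being counted (a standalone Python sketch assuming a hypothetical dataset.txt, not code from this wrapper)::

    # Count lines, words, and characters the way "wc -l -w -m" does
    # for plain ASCII text.
    lines = words = chars = 0
    with open("dataset.txt") as handle:     # hypothetical input dataset
        for line in handle:
            lines += 1
            words += len(line.split())
            chars += len(line)              # newline included, as wc counts it
    print("#lines\twords\tcharacters")
    print("%d\t%d\t%d" % (lines, words, chars))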
- ----- - -**Example Output** - -:: - - #lines words characters - 7499 41376 624971 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wig_to_bigwig.xml --- a/tools/filters/wig_to_bigwig.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - converter - grep -v "^track" $input1 | wigToBigWig stdin $chromInfo $out_file1 - #if $settings.settingsType == "full": - -blockSize=${settings.blockSize} -itemsPerSlot=${settings.itemsPerSlot} ${settings.clip} ${settings.unc} - #else: - -clip - #end if - 2>&1 || echo "Error running wigToBigWig." >&2 - - ucsc_tools - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts wiggle data into bigWig type. - -- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. - - - **BED format** with no declaration line and four columns of data:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: - - variableStep chrom=chrN [span=windowSize] - chromStartA dataValueA - chromStartB dataValueB - - - **fixedStep** single column data; started by a declaration line and followed with data values:: - - fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] - dataValue1 - dataValue2 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wiggle_to_simple.py --- a/tools/filters/wiggle_to_simple.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -""" -Read a wiggle track and print out a series of lines containing -"chrom position score". Ignores track lines, handles bed, variableStep -and fixedStep wiggle lines. -""" -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.wiggle -from galaxy.tools.exception_handling import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - if len( sys.argv ) > 1: - in_file = open( sys.argv[1] ) - else: - in_file = sys.stdin - - if len( sys.argv ) > 2: - out_file = open( sys.argv[2], "w" ) - else: - out_file = sys.stdout - - try: - for fields in bx.wiggle.IntervalReader( UCSCOutWrapper( in_file ) ): - out_file.write( "%s\n" % "\t".join( map( str, fields ) ) ) - except UCSCLimitException: - # Wiggle data was truncated, at the very least need to warn the user. - print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.' - except ValueError, e: - in_file.close() - out_file.close() - stop_err( str( e ) ) - - in_file.close() - out_file.close() - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/filters/wiggle_to_simple.xml --- a/tools/filters/wiggle_to_simple.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ - - converter - wiggle_to_simple.py $input $out_file1 - - - - - - - - - - - - - - - - - -**Syntax** - -This tool converts wiggle data into interval type. - -- **Wiggle format**: The .wig format is line-oriented. Wiggle data is preceded by a UCSC track definition line. Following the track definition line is the track data, which can be entered in three different formats described below. 
- - - **BED format** with no declaration line and four columns of data:: - - chromA chromStartA chromEndA dataValueA - chromB chromStartB chromEndB dataValueB - - - **variableStep** two column data; started by a declaration line and followed with chromosome positions and data values:: - - variableStep chrom=chrN [span=windowSize] - chromStartA dataValueA - chromStartB dataValueB - - - **fixedStep** single column data; started by a declaration line and followed with data values:: - - fixedStep chrom=chrN start=position step=stepInterval [span=windowSize] - dataValue1 - dataValue2 - ----- - -**Example** - -- input wiggle format file:: - - #track type=wiggle_0 name="Bed Format" description="BED format" - chr19 59302000 59302300 -1.0 - chr19 59302300 59302600 -0.75 - chr19 59302600 59302900 -0.50 - chr19 59302900 59303200 -0.25 - chr19 59303200 59303500 0.0 - #track type=wiggle_0 name="variableStep" description="variableStep format" - variableStep chrom=chr19 span=150 - 59304701 10.0 - 59304901 12.5 - 59305401 15.0 - 59305601 17.5 - #track type=wiggle_0 name="fixedStep" description="fixed step" visibility=full - fixedStep chrom=chr19 start=59307401 step=300 span=200 - 1000 - 900 - 800 - 700 - 600 - -- convert the above file to an interval file:: - - chr19 59302000 59302300 + -1.0 - chr19 59302300 59302600 + -0.75 - chr19 59302600 59302900 + -0.5 - chr19 59302900 59303200 + -0.25 - chr19 59303200 59303500 + 0.0 - chr19 59304701 59304851 + 10.0 - chr19 59304901 59305051 + 12.5 - chr19 59305401 59305551 + 15.0 - chr19 59305601 59305751 + 17.5 - chr19 59307701 59307901 + 1000.0 - chr19 59308001 59308201 + 900.0 - chr19 59308301 59308501 + 800.0 - chr19 59308601 59308801 + 700.0 - chr19 59308901 59309101 + 600.0 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/galaxy-loc.tar.gz Binary file tools/galaxy-loc.tar.gz has changed diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/analyze_covariates.xml --- a/tools/gatk/analyze_covariates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - - plot residual error versus covariates -gatk_wrapper.py - --stdout "${output_log}" - --html_report_from_directory "${output_html}" "${output_html.files_path}" - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/AnalyzeCovariates.jar" - -recalFile "${input_recal}" - -outputDir "${output_html.files_path}" - ##-log "${output_log}" - ##-Rscript,--path_to_Rscript path_to_Rscript; on path is good enough - -resources "${GALAXY_DATA_INDEX_DIR}/gatk/R" - #if $analysis_param_type.analysis_param_type_selector == "advanced": - --ignoreQ "${analysis_param_type.ignore_q}" - --numRG "${analysis_param_type.num_read_groups}" - --max_quality_score "${analysis_param_type.max_quality_score}" - --max_histogram_value "${analysis_param_type.max_histogram_value}" - ${analysis_param_type.do_indel_quality} - #end if - ' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates. - - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: AnalyzeCovariates accepts a recal CSV file. - ------- - -**Outputs** - -The output is an HTML file with links to PDF graphs and data files, see http://addlink.here for more details. 
- -------- - -**Settings**:: - - recal_file The input recal csv file to analyze - output_dir The directory in which to output all the plots and intermediate data files - path_to_Rscript The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript - path_to_resources Path to resources folder holding the Sting R scripts. - ignoreQ Ignore bases with reported quality less than this number. - numRG Only process N read groups. Default value: -1 (process all read groups) - max_quality_score The integer value at which to cap the quality scores, default is 50 - max_histogram_value If supplied, this value will be the max value of the histogram plots - do_indel_quality If supplied, this value will be the max value of the histogram plots - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/count_covariates.xml --- a/tools/gatk/count_covariates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,431 +0,0 @@ - - on BAM files - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "CountCovariates" - --num_threads 4 ##hard coded, for now - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --recal_file "${output_recal}" - ${standard_covs} - #if $covariates.value: - #for $cov in $covariates.value: - -cov "${cov}" - #end for - #end if - ' - - #set $snp_dataset_provided = False - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #set $snp_dataset_provided = True - #end if - #set $rod_binding_names = dict() - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'snps': - #set $snp_dataset_provided = True - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" 
"${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if str( $reference_source.reference_source_selector ) == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": - --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" - #end if - #if str( $analysis_param_type.default_platform ) != "default": - --default_platform "${analysis_param_type.default_platform}" - #end if - #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": - --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" - #end if - #if str( $analysis_param_type.force_platform ) != "default": - --force_platform "${analysis_param_type.force_platform}" - #end if - ${analysis_param_type.exception_if_no_tile} - #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": - #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": - --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" - #end if - #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": - --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" - #end if - #end if - --window_size_nqs "${analysis_param_type.window_size_nqs}" - --homopolymer_nback "${analysis_param_type.homopolymer_nback}" - ' - #end if - #if not $snp_dataset_provided: - -p '--run_without_dbsnp_potentially_ruining_quality' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -"This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation." -However, if you do not provide this file, the '--run_without_dbsnp_potentially_ruining_quality' flag will be automatically used, and the command will be allowed to run. 
- -**What it does** - - This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal - operating only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors - and indicative of poor base quality. This walker generates tables based on various user-specified covariates (such - as read group, reported quality score, cycle, and dinucleotide) Since there is a large amount of data one can then - calculate an empirical probability of error given the particular covariates seen at this site, where p(error) = num - mismatches / num observations The output file is a CSV list of (the several covariate values, num observations, num - mismatches, empirical quality score) The first non-comment line of the output file gives the name of the covariates - that were used for this calculation. Note: ReadGroupCovariate and QualityScoreCovariate are required covariates - and will be added for the user regardless of whether or not they were specified Note: This walker is designed to be - used in conjunction with TableRecalibrationWalker. - - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: CountCovariates accepts an aligned BAM input file. - ------- - -**Outputs** - -The output is in CSV format, see http://addlink.here for more details. - -------- - -**Settings**:: - - - default_read_group If a read has no read group then default to the provided String. - default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. - force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. - force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. - window_size_nqs The window size used by MinimumNQSCovariate for its calculation - homopolymer_nback The number of previous bases to look at in HomopolymerCovariate - exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 - solid_recal_mode How should we recalibrate solid bases in whichthe reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) - solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) - recal_file Filename for the input covariates table recalibration .csv file - out The output CSV file - recal_file Filename for the outputted covariates table recalibration file - standard_covs Use the standard set of covariates in addition to the ones listed using the -cov argument - run_without_dbsnp_potentially_ruining_quality If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/gatk_wrapper.py --- a/tools/gatk/gatk_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,97 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -A wrapper script for running the GenomeAnalysisTK.jar commands. 
-""" - -import sys, optparse, os, tempfile, subprocess, shutil -from string import Template - -GALAXY_EXT_TO_GATK_EXT = { 'gatk_interval':'intervals', 'bam_index':'bam.bai', 'gatk_dbsnp':'dbsnp', 'picard_interval_list':'interval_list' } #items not listed here, will use the galaxy extension as-is -GALAXY_EXT_TO_GATK_FILE_TYPE = GALAXY_EXT_TO_GATK_EXT #for now, these are the same, but could be different if needed -DEFAULT_GATK_PREFIX = "gatk_file" -CHUNK_SIZE = 2**20 #1mb - - -def cleanup_before_exit( tmp_dir ): - if tmp_dir and os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) - -def gatk_filename_from_galaxy( galaxy_filename, galaxy_ext, target_dir = None, prefix = None ): - suffix = GALAXY_EXT_TO_GATK_EXT.get( galaxy_ext, galaxy_ext ) - if prefix is None: - prefix = DEFAULT_GATK_PREFIX - if target_dir is None: - target_dir = os.getcwd() - gatk_filename = os.path.join( target_dir, "%s.%s" % ( prefix, suffix ) ) - os.symlink( galaxy_filename, gatk_filename ) - return gatk_filename - -def gatk_filetype_argument_substitution( argument, galaxy_ext ): - return argument % dict( file_type = GALAXY_EXT_TO_GATK_FILE_TYPE.get( galaxy_ext, galaxy_ext ) ) - -def open_file_from_option( filename, mode = 'rb' ): - if filename: - return open( filename, mode = mode ) - return None - -def html_report_from_directory( html_out, dir ): - html_out.write( '\n\nGalaxy - GATK Output\n\n\n
</head>\n<body>\n<p/>\n<ul>\n' ) - for fname in sorted( os.listdir( dir ) ): - html_out.write( '<li><a href="%s">%s</a></li>\n' % ( fname, fname ) ) - html_out.write( '</ul>\n</body>\n</html>
\n\n\n' ) - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to GATK, without any modification.' ) - parser.add_option( '-d', '--dataset', dest='datasets', action='append', type="string", nargs=4, help='"-argument" "original_filename" "galaxy_filetype" "name_prefix"' ) - parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) - parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) - parser.add_option( '', '--html_report_from_directory', dest='html_report_from_directory', action='append', type="string", nargs=2, help='"Target HTML File" "Directory"') - (options, args) = parser.parse_args() - - tmp_dir = tempfile.mkdtemp() - if options.pass_through_options: - cmd = ' '.join( options.pass_through_options ) - else: - cmd = '' - if options.datasets: - for ( dataset_arg, filename, galaxy_ext, prefix ) in options.datasets: - gatk_filename = gatk_filename_from_galaxy( filename, galaxy_ext, target_dir = tmp_dir, prefix = prefix ) - if dataset_arg: - cmd = '%s %s "%s"' % ( cmd, gatk_filetype_argument_substitution( dataset_arg, galaxy_ext ), gatk_filename ) - #set up stdout and stderr output options - stdout = open_file_from_option( options.stdout, mode = 'wb' ) - stderr = open_file_from_option( options.stderr, mode = 'wb' ) - #if no stderr file is specified, we'll use our own - if stderr is None: - stderr = tempfile.NamedTemporaryFile( dir=tmp_dir ) - stderr.close() - stderr = open( stderr.name, 'w+b' ) - - proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir ) - return_code = proc.wait() - - if return_code: - stderr_target = sys.stderr - else: - stderr_target = sys.stdout - stderr.flush() - stderr.seek(0) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - #generate html reports - if options.html_report_from_directory: - for ( html_filename, html_dir ) in options.html_report_from_directory: - html_report_from_directory( open( html_filename, 'wb' ), html_dir ) - - cleanup_before_exit( tmp_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/indel_realigner.xml --- a/tools/gatk/indel_realigner.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,332 +0,0 @@ - - - perform local realignment - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "IndelRealigner" - ##-quiet ##this appears to have no effect...confirmed by gatk programmers - -o "${output_bam}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - -LOD "${lod_threshold}" - ${knowns_only} - ' - - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" 
"${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - -d "-targetIntervals" "${target_intervals}" "${target_intervals.ext}" "gatk_target_intervals" - -p ' - -targetNotSorted ##always resort input intervals - --disable_bam_indexing - ' - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --entropyThreshold "${analysis_param_type.entropy_threshold}" - ${analysis_param_type.simplify_bam} - --maxIsizeForMovement "${analysis_param_type.max_insert_size_for_movement}" - --maxPositionalMoveAllowed "${analysis_param_type.max_positional_move_allowed}" - --maxConsensuses "${analysis_param_type.max_consensuses}" - --maxReadsForConsensuses 
"${analysis_param_type.max_reads_for_consensuses}" - --maxReadsForRealignment "${analysis_param_type.max_reads_for_realignment}" - "${analysis_param_type.no_original_alignment_tags}" - ' - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - Performs local realignment of reads based on misalignments due to the presence of indels. Unlike most mappers, this - walker uses the full alignment context to determine whether an appropriate alternate reference (i.e. indel) exists - and updates SAMRecords accordingly. - ------- - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------- - -**Input formats** - -GenomeAnalysisTK: IndelRealigner accepts an aligned BAM and a list of intervals to realign as input files. - ------- - -**Outputs** - -The output is in the BAM format, see http://addlink.here for more details. - -------- - -**Settings**:: - - targetIntervals intervals file output from RealignerTargetCreator - LODThresholdForCleaning LOD threshold above which the cleaner will clean - entropyThreshold percentage of mismatches at a locus to be considered having high entropy - out Output bam - bam_compression Compression level to use for writing BAM files - disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. - simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier - useOnlyKnownIndels Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references. - maxReadsInMemory max reads allowed to be kept in memory at a time by the SAMFileWriter. Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory). - maxIsizeForMovement maximum insert size of read pairs that we attempt to realign - maxPositionalMoveAllowed maximum positional move in basepairs that a read can be adjusted during realignment - maxConsensuses max alternate consensuses to try (necessary to improve performance in deep coverage) - maxReadsForConsensuses max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage) - maxReadsForRealignment max reads allowed at an interval for realignment; if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is - noOriginalAlignmentTags Don't output the original cigar or alignment start tags for each realigned read in the output bam. - targetIntervalsAreNotSorted This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. 
- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/realigner_target_creator.xml --- a/tools/gatk/realigner_target_creator.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,296 +0,0 @@ - - for use in local realignment - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "RealignerTargetCreator" - -o "${output_interval}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - ' - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( 
$gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --minReadsAtLocus "${analysis_param_type.minReadsAtLocus}" - --windowSize "${analysis_param_type.windowSize}" - --mismatchFraction "${analysis_param_type.mismatchFraction}" - --maxIntervalSize "${analysis_param_type.maxIntervalSize}" - ' - #end if - - -**What it does** - -Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: RealignerTargetCreator accepts an aligned BAM input file. - ------ - -**Outputs** - -The output is in GATK Interval format, see http://addlink.here for more details. - ------- - -**Settings**:: - - windowSize window size for calculating entropy or SNP clusters - mismatchFraction fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1 - minReadsAtLocus minimum reads at a locus to enable using the entropy calculation - maxIntervalSize maximum interval size - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/table_recalibration.xml --- a/tools/gatk/table_recalibration.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,400 +0,0 @@ - - on BAM files - gatk_wrapper.py - --stdout "${output_log}" - -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input" - -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "TableRecalibration" - -o "${output_bam}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --recal_file "${input_recal}" - --disable_bam_indexing - ' - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L"
"${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - #set $rod_binding_names = dict() - #for $rod_binding in $gatk_param_type.rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - #if str( $gatk_param_type.input_dbsnp_rod ) != "None": - -d "-D" "${gatk_param_type.input_dbsnp_rod}" "${gatk_param_type.input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if str( $reference_source.reference_source_selector ) == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set": - --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}" - #end if - #if str( $analysis_param_type.default_platform ) != "default": - --default_platform "${analysis_param_type.default_platform}" - #end if - #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set": - --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}" - #end if - #if str( $analysis_param_type.force_platform ) != "default": - --force_platform "${analysis_param_type.force_platform}" - #end if - ${analysis_param_type.exception_if_no_tile} - #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set": - #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default": - --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" - #end if - #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default": - 
--solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" - #end if - #end if - ${analysis_param_type.simplify_bam} - --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}" - --smoothing "${analysis_param_type.smoothing}" - --max_quality_score "${analysis_param_type.max_quality_score}" - --window_size_nqs "${analysis_param_type.window_size_nqs}" - --homopolymer_nback "${analysis_param_type.homopolymer_nback}" - ${analysis_param_type.do_not_write_original_quals} - ' - #end if - - -**What it does** - - This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For - each base in each read this walker calculates various user-specified covariates (such as read group, reported - quality score, cycle, and dinuc). Using these values as a key in a large hashmap, the walker calculates an empirical - base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file - with these updated (recalibrated) reads. Note: This walker expects as input the recalibration table file generated - previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with - CovariateCounterWalker. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: TableRecalibration accepts aligned BAM and recalibration CSV input files. - ------ - -**Outputs** - -The output is in BAM format, see http://addlink.here for more details. - ------- - -**Settings**:: - - default_read_group If a read has no read group then default to the provided String. - default_platform If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid. - force_read_group If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group. - force_platform If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid. - window_size_nqs The window size used by MinimumNQSCovariate for its calculation - homopolymer_nback The number of previous bases to look at in HomopolymerCovariate - exception_if_no_tile If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1 - solid_recal_mode How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS) - solid_nocall_strategy Defines the behavior of the recalibrator when it encounters no calls in the color space.
Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ) - recal_file Filename for the input covariates table recalibration .csv file - out The output BAM file - bam_compression Compression level to use for writing BAM files - disable_bam_indexing Turn off on-the-fly creation of indices for output BAM files. - simplifyBAM If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier - preserve_qscores_less_than Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases - smoothing Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1 - max_quality_score The integer value at which to cap the quality scores, default=50 - doNotWriteOriginalQuals If true, we will not write the original quality (OQ) tag for each read - - - diff -r c2a356708570 -r 33c067c3ae34 tools/gatk/unified_genotyper.xml --- a/tools/gatk/unified_genotyper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,414 +0,0 @@ - - SNP and indel caller - gatk_wrapper.py - --stdout "${output_log}" - #for $i, $input_bam in enumerate( $reference_source.input_bams ): - -d "-I" "${input_bam.input_bam}" "${input_bam.input_bam.ext}" "gatk_input_${i}" - -d "" "${input_bam.input_bam.metadata.bam_index}" "bam_index" "gatk_input_${i}" ##hardcode galaxy ext type as bam_index - #end for - -p 'java - -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar" - -T "UnifiedGenotyper" - -o "${output_vcf}" - -et "NO_ET" ##ET no phone home - ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout - #if $reference_source.reference_source_selector != "history": - -R "${reference_source.ref_file.fields.path}" - #end if - --standard_min_confidence_threshold_for_calling "${standard_min_confidence_threshold_for_calling}" - --standard_min_confidence_threshold_for_emitting "${standard_min_confidence_threshold_for_emitting}" - ' - #set $rod_binding_names = dict() - #if str( $input_dbsnp_rod ) != "None": - -d "-D" "${input_dbsnp_rod}" "${input_dbsnp_rod.ext}" "dbsnp_rod" - #end if - #for $rod_binding in $rod_bind: - #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom': - #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name - #else - #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector - #end if - #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1 - -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}" - #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ): - -p '--rodToIntervalTrackName "${rod_bind_name}"' - #end if - #end for - - ##start standard gatk options - #if $gatk_param_type.gatk_param_type_selector == "advanced": - #for $sample_metadata in $gatk_param_type.sample_metadata: - -p '--sample_metadata "${sample_metadata.sample_metadata_file}"' - #end for - #for $read_filter in $gatk_param_type.read_filter: - -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}" - ###raise
Exception( str( dir( $read_filter ) ) ) - #for $name, $param in $read_filter.read_filter_type.iteritems(): - #if $name not in [ "__current_case__", "read_filter_type_selector" ]: - --${name} "${param}" - #end if - #end for - ' - #end for - #if str( $gatk_param_type.input_intervals ) != "None": - -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals" - #end if - #if str( $gatk_param_type.input_exclude_intervals ) != "None": - -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals" - #end if - - -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"' - - -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"' - #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE": - -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"' - #end if - -p ' - --baq "${gatk_param_type.baq}" - --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}" - ${gatk_param_type.use_original_qualities} - --defaultBaseQualities "${gatk_param_type.default_base_qualities}" - --validation_strictness "${gatk_param_type.validation_strictness}" - --interval_merging "${gatk_param_type.interval_merging}" - ' - #if str( $gatk_param_type.read_group_black_list ) != "None": - -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list" - #end if - #end if - #if $reference_source.reference_source_selector == "history": - -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input" - #end if - ##end standard gatk options - ##start analysis specific options - #if $analysis_param_type.analysis_param_type_selector == "advanced": - -p ' - --genotype_likelihoods_model "${analysis_param_type.genotype_likelihoods_model}" - --p_nonref_model "${analysis_param_type.p_nonref_model}" - --heterozygosity "${analysis_param_type.heterozygosity}" - --pcr_error_rate "${analysis_param_type.pcr_error_rate}" - --genotyping_mode "${analysis_param_type.genotyping_mode}" - --output_mode "${analysis_param_type.output_mode}" - ${analysis_param_type.noSLOD} - --min_base_quality_score "${analysis_param_type.min_base_quality_score}" - --min_mapping_quality_score "${analysis_param_type.min_mapping_quality_score}" - --max_deletion_fraction "${analysis_param_type.max_deletion_fraction}" - --min_indel_count_for_genotyping "${analysis_param_type.min_indel_count_for_genotyping}" - --indel_heterozygosity "${analysis_param_type.indel_heterozygosity}" - --indelGapContinuationPenalty "${analysis_param_type.indelGapContinuationPenalty}" - --indelGapOpenPenalty "${analysis_param_type.indelGapOpenPenalty}" - --indelHaplotypeSize "${analysis_param_type.indelHaplotypeSize}" - ${analysis_param_type.doContextDependentGapPenalties} - #if $analysis_param_type.annotation.value: - #for $annotation in $analysis_param_type.annotation.value: - --annotation "${annotation}" - #end for - #end if - #if $analysis_param_type.group.value: - #for $group in $analysis_param_type.group.value: - --group "${group}" - #end for - #end if - ' - #end if - -
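Flattened, the template above assembles a single java invocation of the GenomeAnalysisTK jar. A minimal Python sketch of the kind of command line that results; the file names and threshold values are placeholders, not the tool's defaults::

    import subprocess

    # Placeholder inputs; in Galaxy, gatk_wrapper.py wires these in from the form.
    subprocess.check_call([
        "java", "-jar", "shared/jars/gatk/GenomeAnalysisTK.jar",
        "-T", "UnifiedGenotyper",
        "-R", "ref.fa",              # reference FASTA
        "-I", "sample.bam",          # aligned, indexed BAM
        "-o", "calls.vcf",
        "-et", "NO_ET",              # ET no phone home, as in the template
        "--standard_min_confidence_threshold_for_calling", "30.0",
        "--standard_min_confidence_threshold_for_emitting", "10.0",
    ])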
- -**What it does** - - A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - multi-sample data. The user can choose from several different incorporated calculation models. - ------ - -Please cite the website "http://addlink.here" as well as: - -Add citation here 2011. - ------ - -**Input formats** - -GenomeAnalysisTK: UnifiedGenotyper accepts an aligned BAM input file. - ------ - -**Outputs** - -The output is in VCF format, see http://addlink.here for more details. - ------- - -**Settings**:: - - genotype_likelihoods_model Genotype likelihoods calculation model to employ -- BOTH is the default option, while INDEL is also available for calling indels and SNP is available for calling SNPs only (SNP|INDEL|BOTH) - p_nonref_model Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available. (EXACT|GRID_SEARCH) - heterozygosity Heterozygosity value used to compute prior likelihoods for any locus - pcr_error_rate The PCR error rate to be used for computing fragment-based likelihoods - genotyping_mode Specifies how to determine the alternate alleles to use for genotyping (DISCOVERY|GENOTYPE_GIVEN_ALLELES) - output_mode Should we output confident genotypes (i.e. including ref calls) or just the variants? (EMIT_VARIANTS_ONLY|EMIT_ALL_CONFIDENT_SITES|EMIT_ALL_SITES) - standard_min_confidence_threshold_for_calling The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called - standard_min_confidence_threshold_for_emitting The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold) - noSLOD If provided, we will not calculate the SLOD - min_base_quality_score Minimum base quality required to consider a base for calling - min_mapping_quality_score Minimum read mapping quality required to consider a read for calling - max_deletion_fraction Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05] - min_indel_count_for_genotyping Minimum number of consensus indels required to trigger genotyping run - indel_heterozygosity Heterozygosity for indel calling - indelGapContinuationPenalty Indel gap continuation penalty - indelGapOpenPenalty Indel gap open penalty - indelHaplotypeSize Indel haplotype size - doContextDependentGapPenalties Vary gap penalties by context - indel_recal_file Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NOT USE - indelDebug Output indel debug info - out File to which variants should be written - annotation One or more specific annotations to apply to variant calls - group One or more classes/groups of annotations to apply to variant calls - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_indexer.py --- a/tools/genetrack/genetrack_indexer.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" -Wraps genetrack.scripts.tabs2genetrack so the tool can be executed from Galaxy.
- -usage: %prog input output shift -""" - -import sys, shutil, os -from galaxy import eggs -import pkg_resources -pkg_resources.require( "GeneTrack" ) - -from genetrack.scripts import tabs2genetrack -from genetrack import logger - -if __name__ == "__main__": - import os - os.environ[ 'LC_ALL' ] = 'C' - #os.system( 'export' ) - - parser = tabs2genetrack.option_parser() - - options, args = parser.parse_args() - - # uppercase the format - options.format = options.format.upper() - - if options.format not in ('BED', 'GFF'): - sys.stdout = sys.stderr - parser.print_help() - sys.exit(-1) - - logger.disable(options.verbosity) - - # missing file names - if not (options.inpname and options.outname and options.format): - parser.print_help() - sys.exit(-1) - else: - tabs2genetrack.transform(inpname=options.inpname, outname=options.outname,\ - format=options.format, shift=options.shift, index=options.index, options=options) diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_indexer.xml --- a/tools/genetrack/genetrack_indexer.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - - on a BED file - - - genetrack_indexer.py -i $input -o $output -s $shift -v 0 -f BED -x - - -**Help** - -This tool will create a GeneTrack index of the selected BED file. - -**Parameters** - -- **Shift at 5' end** should be used when the location of interest is at a fixed distance from - the 5' end for **all sequenced fragments**! - - For example if the sequenced sample consists of - mono-nucleosomal DNA (146bp) we should expect that - each nucleosome midpoint is located at 73 bp from the 5' end of the fragment. - Therefore we would enter 73 as the shift parameter. Once corrected the reads - on each strand will coincide and indicate the actual midpoints - of the nucleosomes. - - When shifting, the averaging process in GeneTrack is able to correct for longer or shorter - than expected fragment sizes as long as the errors are reasonably random. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_peak_prediction.py --- a/tools/genetrack/genetrack_peak_prediction.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -#!/usr/bin/env python - -""" -Wraps genetrack.scripts.peakpred so the tool can be executed from Galaxy.
- -usage: %prog input output level sigma mode exclusion strand -""" - -import sys -from galaxy import eggs -import pkg_resources -pkg_resources.require( "GeneTrack" ) - -from genetrack.scripts import peakpred -from genetrack import logger - -if __name__ == "__main__": - - parser = peakpred.option_parser() - - options, args = parser.parse_args() - - logger.disable(options.verbosity) - - from genetrack import conf - - # trigger test mode - if options.test: - options.inpname = conf.testdata('test-hdflib-input.gtrack') - options.outname = conf.testdata('predictions.bed') - - # missing input file name - if not (options.inpname and options.outname): - parser.print_help() - else: - print 'Sigma = %s' % options.sigma - print 'Minimum peak = %s' % options.level - print 'Peak-to-peak = %s' % options.exclude - - peakpred.predict(options.inpname, options.outname, options) diff -r c2a356708570 -r 33c067c3ae34 tools/genetrack/genetrack_peak_prediction.xml --- a/tools/genetrack/genetrack_peak_prediction.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - - on GeneTrack index - - - genetrack_peak_prediction.py -i $input -o $output --level=$level --sigma=$sigma --mode=$mode --exclusion=$exclusion --strand=$strand -v 0 -x - - -**Help** - -This tool will generate genome-wide peak predictions from an index file. - -**Parameters** - -- **Smoothing method** the function used to average nearby values - -- **Smoothing value** the factor used in the method - -- **Prediction method** the function used to predict peaks from the smoothed data - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/cdblib.py --- a/tools/genome_diversity/cdblib.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ -#!/usr/bin/env python2.5 - -''' -Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables -that efficiently handle many keys, while remaining space-efficient. - - http://cr.yp.to/cdb.html - -When generated databases are only used with Python code, consider using hash() -rather than djb_hash() for a tidy speedup. -''' - -from _struct import Struct -from itertools import chain - - -def py_djb_hash(s): - '''Return the value of DJB's hash function for the given 8-bit string.''' - h = 5381 - for c in s: - h = (((h << 5) + h) ^ ord(c)) & 0xffffffff - return h - -try: - from _cdblib import djb_hash -except ImportError: - djb_hash = py_djb_hash - -read_2_le4 = Struct('<LL').unpack -write_2_le4 = Struct('<LL').pack - - -class Reader(object): - '''A dictionary-like object for reading a Constant Database accessed - through a string or string-like sequence, such as mmap.mmap().''' - - def __init__(self, data, hashfn=djb_hash): - '''Create an instance reading from a sequence and hash keys using - hashfn.''' - if len(data) < 2048: - raise IOError('CDB too small') - - self.data = data - self.hashfn = hashfn - - self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)] - self.table_start = min(p[0] for p in self.index) - # Assume load factor is 0.5 like official CDB. - self.length = sum(p[1] >> 1 for p in self.index) - - def iteritems(self): - '''Like dict.iteritems().
Items are returned in insertion order.''' - pos = 2048 - while pos < self.table_start: - klen, dlen = read_2_le4(self.data[pos:pos+8]) - pos += 8 - - key = self.data[pos:pos+klen] - pos += klen - - data = self.data[pos:pos+dlen] - pos += dlen - - yield key, data - - def items(self): - '''Like dict.items().''' - return list(self.iteritems()) - - def iterkeys(self): - '''Like dict.iterkeys().''' - return (p[0] for p in self.iteritems()) - __iter__ = iterkeys - - def itervalues(self): - '''Like dict.itervalues().''' - return (p[1] for p in self.iteritems()) - - def keys(self): - '''Like dict.keys().''' - return [p[0] for p in self.iteritems()] - - def values(self): - '''Like dict.values().''' - return [p[1] for p in self.iteritems()] - - def __getitem__(self, key): - '''Like dict.__getitem__().''' - value = self.get(key) - if value is None: - raise KeyError(key) - return value - - def has_key(self, key): - '''Return True if key exists in the database.''' - return self.get(key) is not None - __contains__ = has_key - - def __len__(self): - '''Return the number of records in the database.''' - return self.length - - def gets(self, key): - '''Yield values for key in insertion order.''' - # Truncate to 32 bits and remove sign. - h = self.hashfn(key) & 0xffffffff - start, nslots = self.index[h & 0xff] - - if nslots: - end = start + (nslots << 3) - slot_off = start + (((h >> 8) % nslots) << 3) - - for pos in chain(xrange(slot_off, end, 8), - xrange(start, slot_off, 8)): - rec_h, rec_pos = read_2_le4(self.data[pos:pos+8]) - - if not rec_h: - break - elif rec_h == h: - klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8]) - rec_pos += 8 - - if self.data[rec_pos:rec_pos+klen] == key: - rec_pos += klen - yield self.data[rec_pos:rec_pos+dlen] - - def get(self, key, default=None): - '''Get the first value for key, returning default if missing.''' - # Avoid exception catch when handling default case; much faster. 
- return chain(self.gets(key), (default,)).next() - - def getint(self, key, default=None, base=0): - '''Get the first value for key converted it to an int, returning - default if missing.''' - value = self.get(key, default) - if value is not default: - return int(value, base) - return value - - def getints(self, key, base=0): - '''Yield values for key in insertion order after converting to int.''' - return (int(v, base) for v in self.gets(key)) - - def getstring(self, key, default=None, encoding='utf-8'): - '''Get the first value for key decoded as unicode, returning default if - not found.''' - value = self.get(key, default) - if value is not default: - return value.decode(encoding) - return value - - def getstrings(self, key, encoding='utf-8'): - '''Yield values for key in insertion order after decoding as - unicode.''' - return (v.decode(encoding) for v in self.gets(key)) - - -class Writer(object): - '''Object for building new Constant Databases, and writing them to a - seekable file-like object.''' - - def __init__(self, fp, hashfn=djb_hash): - '''Create an instance writing to a file-like object, using hashfn to - hash keys.''' - self.fp = fp - self.hashfn = hashfn - - fp.write('\x00' * 2048) - self._unordered = [[] for i in xrange(256)] - - def put(self, key, value=''): - '''Write a string key/value pair to the output file.''' - assert type(key) is str and type(value) is str - - pos = self.fp.tell() - self.fp.write(write_2_le4(len(key), len(value))) - self.fp.write(key) - self.fp.write(value) - - h = self.hashfn(key) & 0xffffffff - self._unordered[h & 0xff].append((h, pos)) - - def puts(self, key, values): - '''Write more than one value for the same key to the output file. - Equivalent to calling put() in a loop.''' - for value in values: - self.put(key, value) - - def putint(self, key, value): - '''Write an integer as a base-10 string associated with the given key - to the output file.''' - self.put(key, str(value)) - - def putints(self, key, values): - '''Write zero or more integers for the same key to the output file. - Equivalent to calling putint() in a loop.''' - self.puts(key, (str(value) for value in values)) - - def putstring(self, key, value, encoding='utf-8'): - '''Write a unicode string associated with the given key to the output - file after encoding it as UTF-8 or the given encoding.''' - self.put(key, unicode.encode(value, encoding)) - - def putstrings(self, key, values, encoding='utf-8'): - '''Write zero or more unicode strings to the output file. Equivalent to - calling putstring() in a loop.''' - self.puts(key, (unicode.encode(value, encoding) for value in values)) - - def finalize(self): - '''Write the final hash tables to the output file, and write out its - index. 
The output file remains open upon return.''' - index = [] - for tbl in self._unordered: - length = len(tbl) << 1 - ordered = [(0, 0)] * length - for pair in tbl: - where = (pair[0] >> 8) % length - for i in chain(xrange(where, length), xrange(0, where)): - if not ordered[i][0]: - ordered[i] = pair - break - - index.append((self.fp.tell(), length)) - for pair in ordered: - self.fp.write(write_2_le4(*pair)) - - self.fp.seek(0) - for pair in index: - self.fp.write(write_2_le4(*pair)) - self.fp = None # prevent double finalize() diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_flanking_dna.py --- a/tools/genome_diversity/extract_flanking_dna.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--snps_loc', - type='string', dest='snps_loc', - help='snps .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--output_format', - type="string", dest='output_format', - help='output format, fasta or primer3') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.snps_loc: - raise RuntimeError( 'missing --snps_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.output_format: - raise RuntimeError( 'missing --output_format option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - snpcalls_file = gd.get_filename_from_loc( options.species, options.snps_loc ) - file_root, file_ext = os.path.splitext( snpcalls_file ) - snpcalls_index_file = file_root + ".cdb" - snpcalls = gd.SnpcallsFile( data_file=snpcalls_file, index_file=snpcalls_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - flanking_dna = snpcalls.get_flanking_dna( sequence=seq, position=pos, format=options.output_format ) - if flanking_dna: - out_fh.write( flanking_dna ) - - out_fh.close() - -if __name__ == "__main__": -
main() - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_flanking_dna.xml --- a/tools/genome_diversity/extract_flanking_dna.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ - - DNA flanking chosen SNPs - - - extract_flanking_dna.py "--input=$input" "--output=$output" "--snps_loc=${GALAXY_DATA_INDEX_DIR}/gd.snps.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--output_format=$output_format" - - -**What it does** - - It reports a DNA segment containing each SNP, with up to 200 nucleotides on - either side of the SNP position, which is indicated by "n". Fewer nucleotides - are reported if the SNP is near an end of the assembled genome fragment. - ----- - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc.
- -- output file:: - - > chr2_75111355_75112576 314 A C - TATCTTCATTTTTATTATAGACTCTCTGAACCAATTTGCCCTGAGGCAGACTTTTTAAAGTACTGTGTAATGTATGAAGTCCTTCTGCTCAAGCAAATCATTGGCATGAAAACAGTTGCAAACTTATTGTGAGAGAAGAGTCCAAGAGTTTTAACAGTCTGTAAGTATATAGCCTGTGAGTTTGATTTCCTTCTTGTTTTTnTTCCAGAAACATGATCAGGGGCAAGTTCTATTGGATATAGTCTTCAAGCATCTTGATTTGACTGAGCGTGACTATTTTGGTTTGCAGTTGACTGACGATTCCACTGATAACCCAGTAAGTTTAAGCTGTTGTCTTTCATTGTCATTGCAATTTTTCTGTCTTTATACTAGGTCCTTTCTGATTTACATTGTTCACTGATT - > chr8_93901796_93905612 2471 A C - GCTGCCGCTGGATTTACTTCTGCTTGGGTCGAGAGCGGGCTGGATGGGTGAAGAGTGGGCTCCCCGGCCCCTGACCAGGCAGGTGCAGACAAGTCGGAAGAAGGCCCGCCGCATCTCCTTGCTGGCCAGCGTGTAGATGACGGGGTTCATGGCAGAGTTGAGCACGGCCAGCACGATGAACCACTGGGCCTTGAACAGGATnGCGCACTCCTTCACCTTGCAGGCCACATCCACAAGGAAAAGGATGAAGAGTGGGGACCAGCAGGCGATGAACACGCTCACCACGATCACCACGGTCCGCAGCAGGGCCATGGACCGCTCTGAGTTGTGCGGGCTGGCCACCCTGCGGCTGCTGGACTTCACCAGGAAGTAGATGCGTGCGTACAGGATCACGATGGTCAC - > chr10_7434473_7435447 524 T C - ATTATTAACAGAAACATTTCTTTTTCATTACCCAGGGGTTACACTGGTCGTTGATGTTAATCAGTTTTTGGAGAAGGAGAAGCAAAGTGATATTTTGTCTGTTCTGAAGCCTGCCGTTGGTAATACAAATGACGTAATCCCTGAATGTGCTGACAGGTACCATGACGCCCTGGCAAAAGCAAAAGAGCAAAAATCTAGAAGnGGTAAGCATCTTCACTGTTTAGCACAAATTAAATAGCACTTTGAATATGATGATTTCTGTGGTATTGTGTTATCTTACTTTTGAGACAAATAATCGCTTTCAAATGAATATTTCTGAATGTTTGTCATCTCTGGCAAGGAAATTTTTTAGTGTTTCTTTTCCTTTTTTGTCTTTTGGAAATCTGTGATTAACTTGGTGGC - > chr14_80021455_80022064 138 G A - ACCCAGGGATCAAACCCAGGTCTCCCGCATTGCAGGCGGATTCTTTACTGTCTGAGCCTCCAGGGAAGCCCTCGGGGCTGAAGGGATGGTTATGAAGGTGAGAAACAGGGGCCACCTGTCCCCAAGGTACCTTGCGACnTGCCATCTGCGCTCCACCAGTAAATGGACGTCTTCGATCCTTCTGTTGTTGGCGTAGTGCAAACGTTTGGGAAGGTGCTGTTTCAAGTAAGGCTTAAAGTGCTGGTCTGGTTTTTTACACTGAAATATAAATGGACATTGGATTTTGCAATGGAGAGTCTTCTAGAAGAGTCCAAGACATTCTCTCCAGAAAGCTGAAGG - > chr15_64470252_64471048 89 G A - TGTGTGTGTGTGTGTGTGTGTGTGCCTGTGTCTGTACATGCACACCACGTGGCCTCACCCAGTGCCCTCAGCTCCATGGTGATGTCCACnTAGCCGTGCTCCGCGCTGTAGTACATGGCCTCCTGGAGGGCCTTGGTGCGCGTCCGGCTCAGGCGCATGGGCCCCTCGCTGCCGCTGCCCTGGCTGGATGCATCGCTCTCTTCCACGCCCTCAGCCAGGATCTCCTCCAGGGACAGCACATCTGCTTTGGCCTGCTGTGGCTGAGTCAGGAGCTTCCTCAGGACGTTCCT - etc. 
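Each record in the output above pairs a "> scaffold position allele1 allele2" header with a sequence line in which the SNP base is masked as "n". A small illustrative Python sketch of walking such a file; the parse_records name and the one-sequence-line-per-header assumption are mine, not part of the tool::

    def parse_records(path):
        # Yield (scaffold, position, alleles, snp_offset, sequence) tuples from
        # the fasta-style flanking-DNA output; snp_offset is the index of the
        # masked SNP base ("n") within the sequence line.
        with open(path) as fh:
            header = None
            for line in fh:
                line = line.strip()
                if line.startswith('>'):
                    header = line[1:].split()
                elif header:
                    scaffold, pos, a1, a2 = header[:4]
                    yield scaffold, int(pos), (a1, a2), line.find('n'), line
                    header = None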
- - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_primers.py --- a/tools/genome_diversity/extract_primers.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - while snps.next(): - seq, pos = snps.get_seq_pos() - primer = primers.get_entry( seq, pos ) - if primer: - out_fh.write( primer ) - - out_fh.close() - -if __name__ == "__main__": - main() - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/extract_primers.xml --- a/tools/genome_diversity/extract_primers.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - for selected SNPs - - - extract_primers.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - - -**What it does** - - This tool extracts primers for SNPs in the dataset using the Primer3 program.
- The first line of output for a given SNP reports the name of the assembled - contig, the SNP's position in the contig, the two variant nucleotides, and - Primer3's "pair penalty". The next line, if not blank, names restriction - enzymes (from the user-adjustable list) that differentially cut at that - site, but do not cut at any other position between and including the - primer positions. The next lines show the SNP's flanking regions, with - the SNP position indicated by "n", including the primer positions and an - additional 3 nucleotides. - ----- - -**Example** - -- input file:: - - chr5_30800874_30802049 734 G A chr5 30801606 A 24 0 99 4 11 97 Y 496 0.502 0.033 0.215 6 - chr8_55117827_55119487 994 A G chr8 55118815 G 25 0 102 4 11 96 Y 22 0.502 0.025 2.365 1 - chr9_100484836_100485311 355 C T chr9 100485200 T 27 0 108 6 17 100 Y 190 0.512 0.880 2.733 4 - chr12_3635530_3637738 2101 T C chr12 3637630 T 25 0 102 4 13 93 Y 169 0.554 0.024 0.366 4 - -- output file:: - - chr5_30800874_30802049 734 G A 0.352964 - BglII,MboI,Sau3AI,Tru9I,XhoII - 1 CTGAAGGTGAGCAGGATTCAGGAGACAGAAAACAAAGCCCAGGCCTGCCCAAGGTGGAAA - >>>>>>>>>>>>>>>>>>>> - - 61 AGTCTAACAACTCGCCCTCTGCTTAnATCTGAGACTCACAGGGATAATAACACACTTGGT - - - 121 CAAGGAATAAACTAGATATTATTCACTCCTCTAGAAGGCTGCCAGGAAAATTGCCTGACT - <<<<<<< - - 181 TGAACCTTGGCTCTGA - <<<<<<<<<<<<< - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/genome_diversity.py --- a/tools/genome_diversity/genome_diversity.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,266 +0,0 @@ -#!/usr/bin/env python2.5 - -import sys -import cdblib - -def _openfile( filename=None, mode='r' ): - try: - fh = open( filename, mode ) - except IOError, err: - raise RuntimeError( "can't open file: %s\n" % str( err ) ) - return fh - -def get_filename_from_loc( species=None, filename=None ): - fh = _openfile( filename ) - for line in fh: - if line and not line.startswith( '#' ): - line = line.rstrip( '\r\n' ) - if line: - elems = line.split( '\t' ) - if len( elems ) >= 2 and elems[0] == species: - return elems[1] - return None - - -class SnpFile( object ): - def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ): - self.filename = filename - self.fh = _openfile( filename ) - self.seq_col = seq_col - self.pos_col = pos_col - self.ref_seq_col = ref_seq_col - self.ref_pos_col = ref_pos_col - self.elems = None - self.line = None - self.comments = [] - - def next( self ): - while self.fh: - try: - self.line = self.fh.next() - except StopIteration: - self.line = None - self.elems = None - return None - if self.line: - self.line = self.line.rstrip( '\r\n' ) - if self.line: - if self.line.startswith( '#' ): - self.comments.append( self.line ) - else: - self.elems = self.line.split( '\t' ) - return 1 - - def get_seq_pos( self ): - if self.elems: - return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ] - else: - return None, None - - def get_ref_seq_pos( self ): - if self.elems: - return self.elems[ self.ref_seq_col - 1 ], self.elems[ self.ref_pos_col - 1 ] - else: - return None, None - - -class IndexedFile( object ): - - def __init__( self, data_file=None, index_file=None ): - self.data_file = data_file - self.index_file = index_file - self.data_fh = _openfile( data_file ) - self.index_fh = _openfile( index_file ) - self._reader = cdblib.Reader( self.index_fh.read(), hash ) - - def get_indexed_line( self, key=None ): - line = None - if key in self._reader: - offset = self._reader.getint( key ) -
self.data_fh.seek( offset ) - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'index file out of sync for %s' % key ) - return line - -class PrimersFile( IndexedFile ): - def get_primer_header( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - header = self.get_indexed_line( key ) - if header: - if header.startswith( '>' ): - elems = header.split() - if len( elems ) < 3: - raise RuntimeError( 'short primers header for %s' % key ) - if sequence != elems[1] or str( position ) != elems[2]: - raise RuntimeError( 'primers index for %s finds %s %s' % ( key, elems[1], elems[2] ) ) - else: - raise RuntimeError( 'primers index out of sync for %s' % key ) - return header - - def get_entry( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - if entry: - while self.data_fh: - try: - line = self.data_fh.next() - except StopIteration: - break - if line.startswith( '>' ): - break - entry += line - return entry - - def get_enzymes( self, sequence=None, position=None ): - entry = self.get_primer_header( sequence, position ) - enzyme_list = [] - if entry: - try: - line = self.data_fh.next() - except StopIteration: - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - if line.startswith( '>' ): - raise RuntimeError( 'primers entry for %s %s is truncated' % ( str( sequence ), str( position ) ) ) - line = line.rstrip( '\r\n' ) - if line: - enzymes = line.split( ',' ) - for enzyme in enzymes: - enzyme = enzyme.strip() - if enzyme: - enzyme_list.append( enzyme ) - return enzyme_list - -class SnpcallsFile( IndexedFile ): - def get_snp_seq( self, sequence=None, position=None ): - key = "%s %s" % ( str( sequence ), str( position ) ) - line = self.get_indexed_line( key ) - if line: - elems = line.split( '\t' ) - if len( elems ) < 3: - raise RuntimeError( 'short snpcalls line for %s' % key ) - if sequence != elems[0] or str( position ) != elems[1]: - raise RuntimeError( 'snpcalls index for %s finds %s %s' % ( key, elems[0], elems[1] ) ) - return elems[2] - else: - return None - - def get_flanking_dna( self, sequence=None, position=None, format='fasta' ): - if format != 'fasta' and format != 'primer3': - raise RuntimeError( 'invalid format for flanking dna: %s' % str( format ) ) - seq = self.get_snp_seq( sequence, position ) - if seq: - p = seq.find('[') - if p == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing left bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q = seq.find(']', p + 1) - if q == -1: - raise RuntimeError( 'snpcalls entry for %s %s missing right bracket: %s' % ( str( sequence ), str( position ), seq ) ) - q += 1 - - if format == 'fasta': - flanking_seq = '> ' - else: - flanking_seq = 'SEQUENCE_ID=' - - flanking_seq += "%s %s %s %s\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] ) - - if format == 'primer3': - flanking_seq += 'SEQUENCE_TEMPLATE=' - - flanking_seq += "%sn%s\n" % ( seq[0:p], seq[q:] ) - - if format == 'primer3': - flanking_seq += "SEQUENCE_TARGET=%d,11\n=\n" % ( p - 5 ) - - return flanking_seq - else: - return None - - - -class LocationFile( object ): - def __init__(self, filename): - self.build_map(filename) - - def build_map(self, filename): - self.map = {} - self.open_file(filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - self.map[ elems[0].strip() ] = elems[1].strip() - self.close_file() - - def read_lines(self): - for line in self.fh: - if
not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening location file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def loc_file( self, key ): - if key in self.map: - return self.map[key] - else: - print >> sys.stderr, "'%s' does not appear in location file '%s'" % (key, self.filename) - sys.exit(1) - -class ChrLens( object ): - def __init__( self, location_file, species ): - self.chrlen_loc = LocationFile( location_file ) - self.chrlen_filename = self.chrlen_loc.loc_file( species ) - self.build_map() - - def build_map(self): - self.map = {} - self.open_file(self.chrlen_filename) - for line in self.read_lines(): - elems = line.split('\t', 1) - if len(elems) == 2: - chrom = elems[0].strip() - chrom_len_text = elems[1].strip() - try: - chrom_len = int( chrom_len_text ) - except ValueError: - print >> sys.stderr, "Bad length '%s' for chromosome '%s' in '%s'" % (chrom_len_text, chrom, self.chrlen_filename) - sys.exit(1) - self.map[ chrom ] = chrom_len - self.close_file() - - def read_lines(self): - for line in self.fh: - if not line.startswith('#'): - line = line.rstrip('\r\n') - yield line - - def open_file(self, filename): - self.filename = filename - try: - self.fh = open(filename, 'r') - except IOError, err: - print >> sys.stderr, "Error opening chromosome length file '%s': %s" % (filename, str(err)) - sys.exit(1) - - def close_file(self): - self.fh.close() - - def length( self, key ): - if key in self.map: - return self.map[key] - else: - return None - - def __iter__( self ): - for chrom in self.map: - yield chrom - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_restriction_enzymes.py --- a/tools/genome_diversity/select_restriction_enzymes.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,103 +0,0 @@ -#!/usr/bin/env python2.5 - -import os -import sys -import traceback # used by the exception handler in decorated_main below -from optparse import OptionParser -import genome_diversity as gd - -def main_function( parse_arguments=None ): - if parse_arguments is None: - parse_arguments = lambda arguments: ( None, arguments ) - def main_decorator( to_decorate ): - def decorated_main( arguments=None ): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments( arguments ) - rc = 1 - try: - rc = to_decorate( options, arguments ) - except Exception, err: - sys.stderr.write( 'ERROR: %s\n' % str( err ) ) - traceback.print_exc() - finally: - sys.exit( rc ) - return decorated_main - return main_decorator - -def parse_arguments( arguments ): - parser = OptionParser() - parser.add_option('--input', - type='string', dest='input', - help='file of selected SNPs') - parser.add_option('--output', - type='string', dest='output', - help='output file') - parser.add_option('--primers_loc', - type='string', dest='primers_loc', - help='primers .loc file') - parser.add_option('--scaffold_col', - type="int", dest='scaffold_col', - help='scaffold column in the input file') - parser.add_option('--pos_col', - type="int", dest='pos_col', - help='position column in the input file') - parser.add_option('--enzyme_list', - type="string", dest='enzyme_list_string', - help='comma separated list of enzymes') - parser.add_option('--species', - type="string", dest='species', - help='species') - return parser.parse_args( arguments[1:] ) - - -@main_function( parse_arguments ) -def main( options, arguments ): - if not
options.input: - raise RuntimeError( 'missing --input option' ) - if not options.output: - raise RuntimeError( 'missing --output option' ) - if not options.primers_loc: - raise RuntimeError( 'missing --primers_loc option' ) - if not options.scaffold_col: - raise RuntimeError( 'missing --scaffold_col option' ) - if not options.pos_col: - raise RuntimeError( 'missing --pos_col option' ) - if not options.enzyme_list_string: - raise RuntimeError( 'missing --enzyme_list option' ) - if not options.species: - raise RuntimeError( 'missing --species option' ) - - snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) ) - - out_fh = gd._openfile( options.output, 'w' ) - - enzyme_dict = {} - for enzyme in options.enzyme_list_string.split( ',' ): - enzyme = enzyme.strip() - if enzyme: - enzyme_dict[enzyme] = 1 - - primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc ) - file_root, file_ext = os.path.splitext( primer_data_file ) - primer_index_file = file_root + ".cdb" - primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file ) - - comments_printed = False - - while snps.next(): - seq, pos = snps.get_seq_pos() - enzyme_list = primers.get_enzymes( seq, pos ) - for enzyme in enzyme_list: - if enzyme in enzyme_dict: - if not comments_printed: - for comment in snps.comments: - out_fh.write( "%s\n" % comment ) - comments_printed = True - out_fh.write( "%s\n" % snps.line ) - break - - out_fh.close() - -if __name__ == "__main__": - main() - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_restriction_enzymes.xml --- a/tools/genome_diversity/select_restriction_enzymes.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ - - a set of restriction enzymes - - - select_restriction_enzymes.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc" - #if $override_metadata.choice == "0": - "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}" - #else - "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species" - #end if - "--enzyme_list=$enzymes" - - -**What it does** - - It selects the SNPs that are differentially cut by at least one of the - specified restriction enzymes. The enzymes are required to cut the amplified - segment (for the specified PCR primers) only at the SNP.
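At its core, the selection loop above is a set-membership test over the enzymes that the primer index reports for each SNP. A stripped-down sketch of just that logic; the enzymes_for callable is a hypothetical stand-in for gd.PrimersFile.get_enzymes::

    # chosen mirrors the --enzyme_list option; enzymes_for(seq, pos) stands in
    # for gd.PrimersFile.get_enzymes and returns a list of enzyme names.
    def select_lines(snp_lines, enzymes_for, chosen):
        for line in snp_lines:
            elems = line.split('\t')
            seq, pos = elems[0], elems[1]  # scaffold/position columns
            if any(enzyme in chosen for enzyme in enzymes_for(seq, pos)):
                yield line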
- ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_snps.py --- a/tools/genome_diversity/select_snps.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -import sys -import math -from optparse import OptionParser -import genome_diversity as gd - -def main_function(parse_arguments=None): - if parse_arguments is None: - parse_arguments = lambda arguments: (None, arguments) - def main_decorator(to_decorate): - def decorated_main(arguments=None): - if arguments is None: - arguments = sys.argv - options, arguments = parse_arguments(arguments) - sys.exit(to_decorate(options, arguments)) - return decorated_main - return main_decorator - -def parse_arguments(arguments): - parser = OptionParser() - parser.add_option('--input', dest='input') - parser.add_option('--output', dest='output') - parser.add_option('--chrlens_loc', dest='chrlens_loc') - parser.add_option('--num_snps', dest='num_snps') - parser.add_option('--ref_chrom_col', dest='ref_chrom_col') - parser.add_option('--ref_pos_col', dest='ref_pos_col') - parser.add_option('--species', dest='species') - return parser.parse_args(arguments[1:]) - -@main_function(parse_arguments) -def main(options, arguments): - - ref_chrom_idx = to_int( options.ref_chrom_col ) -1 - ref_pos_idx = to_int( options.ref_pos_col ) -1 - - if (ref_chrom_idx < 1) or (ref_pos_idx < 1) or (ref_chrom_idx == ref_pos_idx): - print >> sys.stderr, "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset." 
- sys.exit(1) - - chrlens = gd.ChrLens( options.chrlens_loc, options.species ) - - total_len = 0 - for chrom in chrlens: - total_len += chrlens.length(chrom) - - total_requested = int( options.num_snps ) - lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx ) - selected = select_snps( data, total_len, total_requested ) - out_data = fix_selection_and_order_like_input(data, selected, total_requested) - write_selected_snps( options.output, out_data, lines, comments ) - -def to_int( value ): - try: - int_value = int( value ) - except ValueError: - int_value = 0 - return int_value - -def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ): - fh = open( filename, 'r' ) - if (chrom_idx >= pos_idx): - needed = chrom_idx + 1 - else: - needed = pos_idx + 1 - lines = [] - data = [] - comments = [] - line_idx = 0 - line_num = 0 - for line in fh: - line_num += 1 - line = line.rstrip('\r\n') - if line: - if line.startswith('#'): - comments.append(line) - else: - elems = line.split('\t') - if len(elems) >= needed: - chrom = elems[chrom_idx] - try: - pos = int(elems[pos_idx]) - except ValueError: - sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) ) - sys.exit(1) - lines.append(line) - chrom_sort = chrom.lstrip('chr') - data.append( [chrom_sort, chrom, pos, line_num, line_idx] ) - line_idx += 1 - fh.close() - data = sorted( data, key=lambda x: (x[0], x[2]) ) - return lines, data, comments - -def select_snps( data, total_len, requested ): - old_chrom = None - next_print = 0 - selected = [] - space = total_len / requested - for data_idx, datum in enumerate( data ): - chrom = datum[1] - pos = datum[2] - if chrom != old_chrom: - old_chrom = chrom - next_print = 0 - if pos >= next_print: - selected.append(data_idx) - next_print += space - return selected - -def fix_selection_and_order_like_input(data, selected, requested): - total_selected = len( selected ) - a = float( total_selected ) / requested - b = a / 2 - - idx_list = [] - for i in range( requested ): - idx = int( math.ceil( i * a + b ) - 1 ) - idx_list.append( idx ) - - out_data = [] - - for i, data_idx in enumerate(selected): - if total_selected > requested: - if i in idx_list: - out_data.append(data[data_idx]) - else: - out_data.append(data[data_idx]) - - out_data = sorted( out_data, key=lambda x: x[3] ) - - return out_data - -def write_selected_snps( filename, data, lines, comments ): - fh = open( filename, 'w' ) - - for comment in comments: - fh.write("%s\n" % comment ) - - for datum in data: - line_idx = datum[4] - fh.write("%s\n" % lines[line_idx]) - - fh.close() - -if __name__ == "__main__": - main() - - diff -r c2a356708570 -r 33c067c3ae34 tools/genome_diversity/select_snps.xml --- a/tools/genome_diversity/select_snps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - a specified number of SNPs - - - select_snps.py "--input=$input" "--output=$output" "--chrlens_loc=${GALAXY_DATA_INDEX_DIR}/gd.chrlens.loc" "--num_snps=$num_snps" - #if $override_metadata.choice == "0": - "--ref_chrom_col=${input.metadata.ref}" "--ref_pos_col=${input.metadata.rPos}" "--species=${input.metadata.species}" - #else - "--ref_chrom_col=$ref_col" "--ref_pos_col=$rpos_col" "--species=$species" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - - It attempts to select a specified number of SNPs from the dataset, making them - approximately 
uniformly spaced relative to the reference genome. The number - actually selected may be slightly more than the specified number. - ------ - -**Example** - -- input file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr18_50154905_50155664 304 A G Y C chr18 50155208 A Y 4 2 17 5 1 22 Y 8 0.022 0.996 0.128 0 - chr18_57379354_57380496 315 C T V V chr18 57379669 G V 11 0 60 9 6 62 Y 726 0.118 0.048 0.014 1 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - chr19_39866997_39874915 3117 C T P P chr19 39870110 C P 3 7 65 14 2 32 Y 6 0.321 0.911 0.462 4 - etc. - -- output file:: - - chr2_75111355_75112576 314 A C L F chr2 75111676 C F 15 4 53 2 9 48 Y 96 0.369 0.355 0.396 0 - chr8_93901796_93905612 2471 A C A A chr8 93904264 A A 8 0 51 10 2 14 Y 961 0.016 0.534 0.114 2 - chr10_7434473_7435447 524 T C S S chr10 7435005 T S 11 5 90 14 0 69 Y 626 0.066 0.406 0.727 0 - chr14_80021455_80022064 138 G A H H chr14 80021593 G H 14 0 69 9 6 124 Y 377 0.118 0.997 0.195 1 - chr15_64470252_64471048 89 G A Y Y chr15 64470341 G Y 5 6 109 14 0 69 Y 312 0.247 0.998 0.393 0 - chr18_48070585_48071386 514 C T E K chr18 48071100 T K 7 7 46 14 0 69 Y 2 0.200 0.032 0.163 0 - chr19_14240610_14242055 232 C T A V chr19 14240840 C A 18 8 56 15 5 42 Y 73 0.003 0.153 0.835 0 - etc. - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/BEAM2_wrapper.sh --- a/tools/human_genome_variation/BEAM2_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -#!/usr/bin/env bash -# -# Galaxy wrapper for Yu Zhang's BEAM2 adds two new options -# significance=foo renames significance.txt to foo after BEAM2 is run -# posterior=bar renames posterior.txt to bar after BEAM2 is run -# - -set -e - -export PATH=$PATH:$(dirname $0) - -## options -significance= -posterior= -new_args= -map= -ped= - -TFILE="/tmp/BEAM2.$$.tmp" - -## separate significance and posterior arguments from arguments to BEAM2 -until [ $# -eq 0 ] -do - case $1 in - significance=*) - significance=${1#significance=} - ;; - posterior=*) - posterior=${1#posterior=} - ;; - map=*) - map=${1#map=} - ;; - ped=*) - ped=${1#ped=} - ;; - *) - if [ -z "$new_args" ]; then - new_args=$1 - else - new_args="$new_args $1" - fi - ;; - esac - - shift -done - -## convert input for use with BEAM2 -lped_to_geno.pl $map $ped > $TFILE -if [ $? -ne 0 ]; then - echo "failed: lped_to_geno.pl $map $ped > $TFILE" - exit 1 -fi - -## run BEAM2 -BEAM2 $TFILE $new_args 1>/dev/null -if [ $? -ne 0 ]; then - echo "failed: BEAM2 $TFILE $new_args" - exit 1 -fi - -mergeSnps.pl significance.txt $TFILE -if [ $? 
-ne 0 ]; then - echo "failed: mergeSnps.pl significance.txt $TFILE" - exit 1 -fi - -## move output files -mv significance.txt $significance -mv posterior.txt $posterior - -## cleanup -rm -f $TFILE - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/beam.xml --- a/tools/human_genome_variation/beam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ - - significant single- and multi-locus SNP associations in case-control studies - - - BEAM2_wrapper.sh map=${input.extra_files_path}/${input.metadata.base_name}.map ped=${input.extra_files_path}/${input.metadata.base_name}.ped $burnin $mcmc $pvalue significance=$significance posterior=$posterior - - - - - - - - - - - - - - - - beam - mv - rm - - - - - -.. class:: infomark - -This tool can take a long time to run, depending on the number of SNPs, the -sample size, and the number of MCMC steps specified. If you have hundreds -of thousands of SNPs, it may take over a day. The main tasks that slow down -this tool are searching for interactions and dynamically partitioning the -SNPs into blocks. Optimization is certainly possible, but hasn't been done -yet. **If your only interest is to detect SNPs with primary effects (i.e., -single-SNP associations), please use the GPASS tool instead.** - ------ - -**Dataset formats** - -The input dataset must be in lped_ format. The output datasets are both tabular_. -(`Dataset missing?`_) - -.. _lped: ./static/formatHelp.html#lped -.. _tabular: ./static/formatHelp.html#tabular -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -BEAM (Bayesian Epistasis Association Mapping) uses a Markov Chain Monte Carlo (MCMC) method to infer SNP block structures and detect both single-marker -and interaction effects from case-control SNP data. -This tool also partitions SNPs into blocks based on linkage disequilibrium (LD). The method utilized is Bayesian, so the outputs are posterior probabilities of association, along with block partitions. An advantage of this method is that it provides uncertainty measures for the associations and block partitions, and it scales well from small to large sample sizes. It is powerful in detecting gene-gene interactions, although slow for large datasets. - ------ - -**Example** - -- input map file:: - - 1 rs0 0 738547 - 1 rs1 0 5597094 - 1 rs2 0 9424115 - etc. - -- input ped file:: - - 1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A - 1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G - etc. - -- first output file, significance.txt:: - - ID chr position results - rs0 chr1 738547 10 20 score= 45.101397 , df= 8 , p= 0.000431 , N=1225 - -- second output file, posterior.txt:: - - id: chr position marginal + interaction = total posterior - 0: 1 738547 0.0000 + 0.0000 = 0.0000 - 1: 1 5597094 0.0000 + 0.0000 = 0.0000 - 2: 1 9424115 0.0000 + 0.0000 = 0.0000 - 3: 1 13879818 0.0000 + 0.0000 = 0.0000 - 4: 1 13934751 0.0000 + 0.0000 = 0.0000 - 5: 1 16803491 0.0000 + 0.0000 = 0.0000 - 6: 1 17236854 0.0000 + 0.0000 = 0.0000 - 7: 1 18445387 0.0000 + 0.0000 = 0.0000 - 8: 1 21222571 0.0000 + 0.0000 = 0.0000 - etc. 
- - id: chr position block_boundary | allele counts in cases and controls - 0: 1 738547 1.000 | 156 93 251 | 169 83 248 - 1: 1 5597094 1.000 | 323 19 158 | 328 16 156 - 2: 1 9424115 1.000 | 366 6 128 | 369 11 120 - 3: 1 13879818 1.000 | 252 31 217 | 278 32 190 - 4: 1 13934751 1.000 | 246 64 190 | 224 58 218 - 5: 1 16803491 1.000 | 91 160 249 | 91 174 235 - 6: 1 17236854 1.000 | 252 43 205 | 249 44 207 - 7: 1 18445387 1.000 | 205 66 229 | 217 56 227 - 8: 1 21222571 1.000 | 353 9 138 | 352 8 140 - etc. - - The "id" field is an internally used index. - ------ - -**References** - -Zhang Y, Liu JS. (2007) -Bayesian inference of epistatic interactions in case-control studies. -Nat Genet. 39(9):1167-73. Epub 2007 Aug 26. - -Zhang Y, Zhang J, Liu JS. (2010) -Block-based bayesian epistasis association mapping with application to WTCCC type 1 diabetes data. -Submitted. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ctd.pl --- a/tools/human_genome_variation/ctd.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -use LWP::UserAgent; -require HTTP::Cookies; - -####################################################### -# ctd.pl -# Submit a batch query to CTD and fetch results into galaxy history -# usage: ctd.pl inFile idCol inputType resultType actionType outFile -####################################################### - -if (!@ARGV or scalar @ARGV != 6) { - print "usage: ctd.pl inFile idCol inputType resultType actionType outFile\n"; - exit; -} - -my $in = shift @ARGV; -my $col = shift @ARGV; -if ($col < 1) { - print "The column number is with a 1 start\n"; - exit 1; -} -my $type = shift @ARGV; -my $resType = shift @ARGV; -my $actType = shift @ARGV; -my $out = shift @ARGV; - -my @data; -open(FH, $in) or die "Couldn't open $in, $!\n"; -while () { - chomp; - my @f = split(/\t/); - if (scalar @f < $col) { - print "ERROR the requested column is not in the file $col\n"; - exit 1; - } - push(@data, $f[$col-1]); -} -close FH or die "Couldn't close $in, $!\n"; - -my $url = 'http://ctd.mdibl.org/tools/batchQuery.go'; -#my $url = 'http://globin.bx.psu.edu/cgi-bin/print-query'; -my $d = join("\n", @data); -#list maintains order, where hash doesn't -#order matters at ctd -#to use input file (gives error can't find file) -#my @form = ('inputType', $type, 'inputTerms', '', 'report', $resType, - #'queryFile', [$in, ''], 'queryFileColumn', $col, 'format', 'tsv', 'action', 'Submit'); -my @form = ('inputType', $type, 'inputTerms', $d, 'report', $resType, - 'queryFile', '', 'format', 'tsv', 'action', 'Submit'); -if ($resType eq 'cgixns') { #only add if this type - push(@form, 'actionTypes', $actType); -} -my $ua = LWP::UserAgent->new; -$ua->cookie_jar(HTTP::Cookies->new( () )); -$ua->agent('Mozilla/5.0'); -my $page = $ua->post($url, \@form, 'Content_Type'=>'form-data'); -if ($page->is_success) { - open(FH, ">", $out) or die "Couldn't open $out, $!\n"; - print FH "#"; - print FH $page->content, "\n"; - close FH or die "Couldn't close $out, $!\n"; -}else { - print "ERROR failed to get page from CTD, ", $page->status_line, "\n"; - print $page->content, "\n"; - my $req = $page->request(); - print "Requested \n"; - foreach my $k(keys %$req) { - if ($k eq '_headers') { - my $t = $req->{$k}; - foreach my $k2 (keys %$t) { print "$k2 => $t->{$k2}\n"; } - }else { print "$k => $req->{$k}\n"; } - } - exit 1; -} -exit; - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ctd.xml --- 
a/tools/human_genome_variation/ctd.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ - - analysis of chemicals, diseases, or genes - - - ctd.pl $input $numerical_column $inType.inputType - #if $inType.inputType == "disease" - $inType.report ANY - #else if $inType.reportType.report == "cgixns" - $inType.reportType.report $inType.reportType.actType - #else - $inType.reportType.report ANY - #end if - $out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool extracts data related to the provided list of identifiers -from the Comparative Toxicogenomics Database (CTD). The fields -extracted vary with the type of data requested; the first row -of the output identifies the columns. - -For the curated chemical-gene interactions, you can also choose the -interaction type from the search-and-select box. The choices that -start with '-' are a subset of a choice above them; you can chose -either the general interaction type or a more specific one. - -Website: http://ctd.mdibl.org/ - ------ - -**Examples** - -- input data file: - HBB - -- select Column = c1, Identifier type = Genes, and Data to extract = All disease relationships - -- output file:: - - #Input GeneSymbol GeneName GeneID DiseaseName DiseaseID GeneDiseaseRelation OmimIDs PubMedIDs - hbb HBB hemoglobin, beta 3043 Abnormalities, Drug-Induced MESH:D000014 inferred via Ethanol 17676605|18926900 - hbb HBB hemoglobin, beta 3043 Abnormalities, Drug-Induced MESH:D000014 inferred via Valproic Acid 8875741 - etc. - -Another example: - -- same input file: - HBB - -- select Column = c1, Identifier type = Genes, Data to extract = Curated chemical-gene interactions, and Interaction type = ANY - -- output file:: - - #Input GeneSymbol GeneName GeneID ChemicalName ChemicalID CasRN Organism OrganismID Interaction InteractionTypes PubMedIDs - hbb HBB hemoglobin, beta 3043 1-nitronaphthalene C016614 86-57-7 Macaca mulatta 9544 1-nitronaphthalene metabolite binds to HBB protein binding 16453347 - hbb HBB hemoglobin, beta 3043 2,6-diisocyanatotoluene C026942 91-08-7 Cavia porcellus 10141 2,6-diisocyanatotoluene binds to HBB protein binding 8728499 - etc. - ------ - -**Reference** - -Davis AP, Murphy CG, Saraceni-Richards CA, Rosenstein MC, Wiegers TC, Mattingly CJ. (2009) -Comparative Toxicogenomics Database: a knowledgebase and discovery tool for -chemical-gene-disease networks. -Nucleic Acids Res. 37(Database issue):D786-92. Epub 2008 Sep 9. 
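For reference, the batch query that ctd.pl submits can be reproduced outside Galaxy. The sketch below is an approximation in Python: the endpoint and field names are the ones hard-coded in ctd.pl, but the identifier and report values are illustrative, and ctd.pl actually posts multipart form data rather than a urlencoded body, so details may need adjusting::

    import urllib.parse
    import urllib.request

    # Field names as in ctd.pl; values shown here are examples only.
    form = [
        ("inputType", "gene"),
        ("inputTerms", "HBB"),     # newline-separated identifiers
        ("report", "cgixns"),      # curated chemical-gene interactions
        ("actionTypes", "ANY"),    # ctd.pl sends this only for the cgixns report
        ("queryFile", ""),
        ("format", "tsv"),
        ("action", "Submit"),
    ]
    data = urllib.parse.urlencode(form).encode()
    reply = urllib.request.urlopen("http://ctd.mdibl.org/tools/batchQuery.go", data)
    print(reply.read()[:200])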
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl --- a/tools/human_genome_variation/disease_ontology_gene_fuzzy_selector.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -################################################################## -# Select genes that are associated with the diseases listed in the -# disease ontology. -# ontology: http://do-wiki.nubic.northwestern.edu/index.php/Main_Page -# gene associations by FunDO: http://projects.bioinformatics.northwestern.edu/do_rif/ -# Sept 2010, switch to doLite -# input: build outfile sourceFileLoc.loc term or partial term -################################################################## - -if (!@ARGV or @ARGV < 3) { - print "usage: disease_ontology_gene_selector.pl build outfile.txt sourceFile.loc [list of terms]\n"; - exit; -} - -my $build = shift @ARGV; -my $out = shift @ARGV; -my $in = shift @ARGV; -my $term = shift @ARGV; -$term =~ s/^'//; #remove quotes protecting from shell -$term =~ s/'$//; -my $data; -open(LOC, $in) or die "Couldn't open $in, $!\n"; -while () { - chomp; - if (/^\s*#/) { next; } - my @f = split(/\t/); - if ($f[0] eq $build) { - if ($f[1] eq 'disease associated genes') { - $data = $f[2]; - } - } -} -close LOC or die "Couldn't close $in, $!\n"; -if (!$data) { - print "Error $build not found in $in\n"; - exit; -} -if (!defined $term) { - print "No disease term entered\n"; - exit; -} - -#start with just fuzzy term matches -open(OUT, ">", $out) or die "Couldn't open $out, $!\n"; -open(FH, $data) or die "Couldn't open data file $data, $!\n"; -$term =~ s/\s+/|/g; #use OR between words -while () { - chomp; - my @f = split(/\t/); #chrom start end strand geneName geneID disease - if ($f[6] =~ /($term)/i) { - print OUT join("\t", @f), "\n"; - }elsif ($term eq 'disease') { #print all with disease - print OUT join("\t", @f), "\n"; - } -} -close FH or die "Couldn't close data file $data, $!\n"; -close OUT or die "Couldn't close $out, $!\n"; - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/freebayes.xml --- a/tools/human_genome_variation/freebayes.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ - - - - freebayes - - Bayesian genetic variant detector - - ln -s $reference localref.fa; - ln -s $bamfile localbam.bam; - samtools faidx localref.fa; - samtools sort localbam.bam localbam.bam; - samtools index localbam.bam; - freebayes --fasta-reference localref.fa localbam.bam --vcf $output - #if $params.source_select == "full": - $params.showRefRepeats - -T $params.theta - -p $params.ploidy - $params.pooled - $params.mnps - $params.nosnps - -n $params.bestAlleles - $params.allAlleles - $params.duplicateReads - -M $params.refMapQuality - $params.ignoreRefAllele - $params.haploidReference - -m $params.minMapQuality - -q $params.minBaseQuality - $params.noFilters - -x $params.indelExclusionWindow - - -V $params.diffusionPriorScalar - -W $params.postIntegBandwidth - -Y $params.postIntegBanddepth - -F $params.minAltFraction - -C $params.minAltCount - -G $params.minAltTotal - --min-coverage $params.minCoverage - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool uses Freebayes to call SNPS given a reference sequence and a BAM alignment file. 
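The wrapper's preparation steps are worth spelling out: index the reference, coordinate-sort and index the BAM, then run the caller. A rough Python equivalent of those steps, assuming samtools of that era and freebayes are on $PATH and using illustrative file names::

    import subprocess

    ref, bam, vcf = "localref.fa", "localbam.bam", "out.vcf"   # illustrative names

    for cmd in (
        ["samtools", "faidx", ref],                 # writes localref.fa.fai
        ["samtools", "sort", bam, "sorted"],        # old-style sort: writes sorted.bam
        ["samtools", "index", "sorted.bam"],
        ["freebayes", "--fasta-reference", ref, "sorted.bam", "--vcf", vcf],
    ):
        subprocess.run(cmd, check=True)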
- - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/funDo.xml --- a/tools/human_genome_variation/funDo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - human genes associated with disease terms - - - disease_ontology_gene_fuzzy_selector.pl $build $out_file1 ${GALAXY_DATA_INDEX_DIR}/funDo.loc '$term' - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -There is no input dataset. The output is in interval_ format. - -.. _interval: ./static/formatHelp.html#interval - ------ - -**What it does** - -This tool searches the disease-term field of the DOLite mappings -used by the FunDO project and returns a set of genes that -are associated with terms matching the specified pattern. (This is the -reverse of what FunDO's own server does.) - -The search is case insensitive, and selects terms that contain any of -the given words, either exactly or within a longer word (e.g. "nemia" -selects not only "anemia", but also "hyperglycinemia", "tyrosinemias", -and many other things). Multiple words should be separated by spaces, -not commas. As a special case, entering the word "disease" returns all -genes associated with any disease, even if that word does not actually -appear in the term field. - -Website: http://django.nubic.northwestern.edu/fundo/ - ------ - -**Example** - -Typing:: - - carcinoma - -results in:: - - 1. 2. 3. 4. 5. 6. 7. - chr11 89507465 89565427 + NAALAD2 10003 Adenocarcinoma - chr15 50189113 50192264 - BCL2L10 10017 Carcinoma - chr7 150535855 150555250 - ABCF2 10061 Clear cell carcinoma - chr7 150540508 150555250 - ABCF2 10061 Clear cell carcinoma - chr10 134925911 134940397 - ADAM8 101 Adenocarcinoma - chr10 134925911 134940397 - ADAM8 101 Adenocarcinoma - etc. - -where the column contents are as follows:: - - 1. chromosome name - 2. start position of the gene - 3. end position of the gene - 4. strand - 4. gene name - 6. Entrez Gene ID - 7. disease term - ------ - -**References** - -Du P, Feng G, Flatow J, Song J, Holko M, Kibbe WA, Lin SM. (2009) -From disease ontology to disease-ontology lite: statistical methods to adapt a general-purpose -ontology for the test of gene-ontology associations. -Bioinformatics. 25(12):i63-8. - -Osborne JD, Flatow J, Holko M, Lin SM, Kibbe WA, Zhu LJ, Danila MI, Feng G, Chisholm RL. (2009) -Annotating the human genome with Disease Ontology. -BMC Genomics. 10 Suppl 1:S6. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/gpass.pl --- a/tools/human_genome_variation/gpass.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use File::Basename; -use File::Temp qw/ tempfile /; - -$ENV{'PATH'} .= ':' . 
dirname($0); - -#this is a wrapper for gpass that converts a linkage pedigree file to input -#for this program - -my($map, $ped, $out, $fdr) = @ARGV; - -if (!$map or !$ped or !$out or !$fdr) { die "missing args\n"; } - -my($fh, $name) = tempfile(); -#by default this file is removed when these variable go out of scope -print $fh "map=$map ped=$ped\n"; -close $fh; #converter will overwrite, just keep name - -#run converter -system("lped_to_geno.pl $map $ped > $name") == 0 - or die "system lped_to_geno.pl $map $ped > $name failed\n"; - -#system("cp $name tmp.middle"); - -#run GPASS -system("gpass $name -o $out -fdr $fdr 1>/dev/null") == 0 - or die "system gpass $name -o $out -fdr $fdr, failed\n"; - -#merge SNP data with results -merge(); - -exit; - -######################################## - -#merge the input and output files so have SNP data with result -sub merge { - open(FH, $out) or die "Couldn't open $out, $!\n"; - my %res; - my @ind; - while () { - chomp; - my $line = $_; - if ($line =~ /^(\d+)/) { $res{$1} = $line; push(@ind, $1); } - else { $res{'index'} = $line; } - } - close FH; - if (!@ind) { return; } #no results, leave alone - @ind = sort { $a <=> $b } @ind; - $res{'index'} =~ s/Index/#ID\tchr\tposition/; - #read input file to get SNP data - open(FH, $name) or die "Couldn't open $name, $!\n"; - my $i = 0; #index is 0 based not counting header line - my $c = shift @ind; - while () { - chomp; - if (/^ID/) { next; } - my @f = split(/\s+/); - if ($i == $c) { - $res{$i} =~ s/^$i/$f[0]\t$f[1]\t$f[2]/; - if (!@ind) { last; } - $c = shift @ind; - } - $i++; - } - close FH; - #now reprint results with SNP data included - open(FH, ">", $out) or die "Couldn't write to $out, $!\n"; - print FH $res{'index'}, "\n"; - delete $res{'index'}; - foreach $i (keys %res) { - print FH $res{$i}, "\n"; - } - close FH; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/gpass.xml --- a/tools/human_genome_variation/gpass.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ - - significant single-SNP associations in case-control studies - - - gpass.pl ${input1.extra_files_path}/${input1.metadata.base_name}.map ${input1.extra_files_path}/${input1.metadata.base_name}.ped $output $fdr - - - - - - - - - - - - - gpass - - - - - -**Dataset formats** - -The input dataset must be in lped_ format, and the output is tabular_. -(`Dataset missing?`_) - -.. _lped: ./static/formatHelp.html#lped -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -GPASS (Genome-wide Poisson Approximation for Statistical Significance) -detects significant single-SNP associations in case-control studies at a user-specified FDR. Unlike previous methods, this tool can accurately approximate the genome-wide significance and FDR of SNP associations, while adjusting for millions of multiple comparisons, within seconds or minutes. - -The program has two main functionalities: - -1. Detect significant single-SNP associations at a user-specified false - discovery rate (FDR). - - *Note*: a "typical" definition of FDR could be - FDR = E(# of false positive SNPs / # of significant SNPs) - - This definition however is very inappropriate for association mapping, since SNPs are - highly correlated. Our FDR is - defined differently to account for SNP correlations, and thus will obtain - a proper FDR in terms of "proportion of false positive loci". - -2. 
Approximate the significance of a list of candidate SNPs, adjusting for - multiple comparisons. If you have isolated a few SNPs of interest and want - to know their significance in a GWAS, you can supply the GWAS data and let - the program specifically test those SNPs. - - -*Also note*: the number of SNPs in a study cannot be both too small and at the same -time too clustered in a local region. A few hundreds of SNPs, or tens of SNPs -spread in different regions, will be fine. The sample size cannot be too small -either; around 100 or more individuals (case + control combined) will be fine. -Otherwise use permutation. - ------ - -**Example** - -- input map file:: - - 1 rs0 0 738547 - 1 rs1 0 5597094 - 1 rs2 0 9424115 - etc. - -- input ped file:: - - 1 1 0 0 1 1 G G A A A A A A A A A G A A G G G G A A G G G G G G A A A A A G A A G G A G A G A A G G A A G G A A G G A G A A G G A A G G A A A G A G G G A G G G G G A A A G A A G G G G G G G G A G A A A A A A A A - 1 1 0 0 1 1 G G A G G G A A A A A G A A G G G G G G A A G G A G A G G G G G A G G G A G A A G G A G G G A A G G G G A G A G G G A G A A A A G G G G A G A G G G A G A A A A A G G G A G G G A G G G G G A A G G A G - etc. - -- output dataset, showing significant SNPs and their p-values and FDR:: - - #ID chr position Statistics adj-Pvalue FDR - rs35 chr1 136606952 4.890849 0.991562 0.682138 - rs36 chr1 137748344 4.931934 0.991562 0.795827 - rs44 chr2 14423047 7.712832 0.665086 0.218776 - etc. - ------ - -**Reference** - -Zhang Y, Liu JS. (2010) -Fast and accurate significance approximation for genome-wide association studies. -Submitted. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/hilbertvis.sh --- a/tools/human_genome_variation/hilbertvis.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -#!/usr/bin/env bash - -input_file="$1" -output_file="$2" -chromInfo_file="$3" -chrom="$4" -score_col="$5" -hilbert_curve_level="$6" -summarization_mode="$7" -chrom_col="$8" -start_col="$9" -end_col="${10}" -strand_col="${11}" - -## use first sequence if chrom filed is empty -if [ -z "$chrom" ]; then - chrom=$( head -n 1 "$input_file" | cut -f$chrom_col ) -fi - -## get sequence length -if [ ! 
-r "$chromInfo_file" ]; then - echo "Unable to read chromInfo_file $chromInfo_file" 1>&2 - exit 1 -fi - -chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file ) - -## error if we can't find the chrom_len -if [ -z "$chrom_len" ]; then - echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2 - exit 1 -fi - -## make sure chrom_len is positive -if [ $chrom_len -le 0 ]; then - echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2 - exit 1 -fi - -## modify R script depending on the inclusion of a score column, strand information -input_cols="\$${start_col}, \$${end_col}" -col_types='beg=0, end=0, strand=""' - -# if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand -if [ $strand_col -ne 0 ]; then - input_cols="${input_cols}, \$${strand_col}" -else - input_cols="${input_cols}, \\\"+\\\"" -fi - -# set plot value (either from data or use a constant value) -if [ $score_col -eq -1 ]; then - value=1 -else - input_cols="${input_cols}, \$${score_col}" - col_types="${col_types}, score=0" - value='chunk$score[i]' -fi - -R --vanilla &> /dev/null < - visualization of genomic data with the Hilbert curve - - - hilbertvis.sh $input $output $chromInfo "$chrom" $plot_value.score_col $level $mode - #if isinstance( $input.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__) - 1 4 5 7 - #else - ${input.metadata.chromCol} ${input.metadata.startCol} ${input.metadata.endCol} ${input.metadata.strandCol} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input format is interval_, and the output is an image in PDF format. -(`Dataset missing?`_) - -.. _interval: ./static/formatHelp.html#interval -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -HilbertVis uses the Hilbert space-filling curve to visualize the structure of -position-dependent data. It maps the traditional one-dimensional line -visualization onto a two-dimensional square. For example, here is a diagram -showing the path of a level-2 Hilbert curve. - -.. image:: ./static/images/hilbertvisDiagram.png - -The shade of each pixel represents the value for the corresponding bin of -consecutive genomic positions, calculated according to the specified -summarization mode. The pixels are arranged so that bins that are close -to each other on the data vector are represented by pixels that are close -to each other in the plot. In particular, adjacent bins are mapped to -adjacent pixels. Hence, dark spots in a figure represent a peak; the area -of the spot in the two-dimensional plot is proportional to the width of the -peak in the one-dimensional data, and the darkness of the spot corresponds to -the height of the peak. - -The input file is in interval format, and typically contains a column with -scores or other numbers, such as conservation scores, SNP density, the -coverage of aligned reads from ChIP-Seq data, etc. - -Website: http://www.ebi.ac.uk/huber-srv/hilbert/ - ------ - -**Examples** - -Here are some examples from the HilbertVis homepage, using ChIP-Seq data. - -.. image:: ./static/images/hilbertvis1.png - ------ - -.. image:: ./static/images/hilbertvis2.png - ------ - -**Reference** - -Anders S. (2009) -Visualization of genomic data with the Hilbert curve. -Bioinformatics. 25(10):1231-5. Epub 2009 Mar 17. 
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ldtools.xml --- a/tools/human_genome_variation/ldtools.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ - - linkage disequilibrium and tag SNPs - - - ldtools_wrapper.sh rsquare=$rsquare freq=$freq input=$input output=$output - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool can be used to analyze the patterns of linkage disequilibrium -(LD) between polymorphic sites in a locus. SNPs are grouped based on the -threshold level of LD as measured by r\ :sup:`2` (regardless of genomic -position), and a representative "tag SNP" is reported for each group. -The other SNPs in the group are in LD with the tag SNP, but not necessarily -with each other. - -The underlying algorithm is the same as the one used in ldSelect (Carlson -et al. 2004). However, this tool is implemented to be much faster and more -efficient than ldSelect. - -The input is a tabular file with genotype information for each individual -at each SNP site, in exactly four columns: site ID, sample ID, and the -two allele nucleotides. - ------ - -**Example** - -- input file:: - - rs2334386 NA20364 G T - rs2334386 NA20363 G G - rs2334386 NA20360 G G - rs2334386 NA20359 G G - rs2334386 NA20358 G G - rs2334386 NA20356 G G - rs2334386 NA20357 G G - rs2334386 NA20350 G G - rs2334386 NA20349 G G - rs2334386 NA20348 G G - rs2334386 NA20347 G G - rs2334386 NA20346 G G - rs2334386 NA20345 G G - rs2334386 NA20344 G G - rs2334386 NA20342 G G - etc. - -- output file:: - - rs2238748 rs2793064,rs6518516,rs6518517,rs2283641,rs5993533,rs715590,rs2072123,rs2105421,rs2800954,rs1557847,rs807750,rs807753,rs5993488,rs8138035,rs2800980,rs2525079,rs5992353,rs712966,rs2525036,rs807743,rs1034727,rs807744,rs2074003 - rs2871023 rs1210715,rs1210711,rs5748189,rs1210709,rs3788298,rs7284649,rs9306217,rs9604954,rs1210703,rs5748179,rs5746727,rs5748190,rs5993603,rs2238766,rs885981,rs2238763,rs5748165,rs9605996,rs9606001,rs5992398 - rs7292006 rs13447232,rs5993665,rs2073733,rs1057457,rs756658,rs5992395,rs2073760,rs739369,rs9606017,rs739370,rs4493360,rs2073736 - rs2518840 rs1061325,rs2283646,rs362148,rs1340958,rs361956,rs361991,rs2073754,rs2040771,rs2073740,rs2282684 - rs2073775 rs10160,rs2800981,rs807751,rs5993492,rs2189490,rs5747997,rs2238743 - rs5747263 rs12159924,rs2300688,rs4239846,rs3747025,rs3747024,rs3747023,rs2300691 - rs433576 rs9605439,rs1109052,rs400509,rs401099,rs396012,rs410456,rs385105 - rs2106145 rs5748131,rs2013516,rs1210684,rs1210685,rs2238767,rs2277837 - rs2587082 rs2257083,rs2109659,rs2587081,rs5747306,rs2535704,rs2535694 - rs807667 rs2800974,rs756651,rs762523,rs2800973,rs1018764 - rs2518866 rs1206542,rs807467,rs807464,rs807462,rs712950 - rs1110661 rs1110660,rs7286607,rs1110659,rs5992917,rs1110662 - rs759076 rs5748760,rs5748755,rs5748752,rs4819925,rs933461 - rs5746487 rs5992895,rs2034113,rs2075455,rs1867353 - rs5748212 rs5746736,rs4141527,rs5748147,rs5748202 - etc. - ------ - -**Reference** - -Carlson CS, Eberle MA, Rieder MJ, Yi Q, Kruglyak L, Nickerson DA. (2004) -Selecting a maximally informative set of single-nucleotide polymorphisms for -association analyses using linkage disequilibrium. -Am J Hum Genet. 74(1):106-20. Epub 2003 Dec 15. 
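The grouping described above is essentially a greedy set cover over the LD neighborhoods: repeatedly pick the SNP whose neighborhood covers the most not-yet-tagged SNPs and report it as the tag. The sketch below illustrates that idea only; it is not the senatag implementation, which additionally honours excluded/required SNP lists::

    # neighborhood: SNP -> set of SNPs in LD with it (r^2 above threshold)
    neighborhood = {
        "rs1": {"rs2", "rs3"},
        "rs2": {"rs1"},
        "rs3": {"rs1"},
        "rs4": set(),
    }

    untagged = set(neighborhood)
    tags = {}
    while untagged:
        # pick the SNP covering the most untagged sites (itself included)
        best = max(untagged, key=lambda s: len((neighborhood[s] | {s}) & untagged))
        group = (neighborhood[best] | {best}) & untagged
        tags[best] = sorted(group - {best})
        untagged -= group
    print(tags)   # {'rs1': ['rs2', 'rs3'], 'rs4': []}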
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/ldtools_wrapper.sh
--- a/tools/human_genome_variation/ldtools_wrapper.sh Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,64 +0,0 @@
-#!/usr/bin/env bash
-#
-# Galaxy wrapper for Aakrosh Ratan's ldtools
-#
-set -e
-
-export PATH=$PATH:$(dirname $0)
-
-## pagetag options
-input=
-rsquare=0.64
-freq=0.00
-sample=###
-
-## senatag options
-excluded=###
-required=###
-output=
-
-until [ $# -eq 0 ]
-do
-    case $1 in
-        rsquare=*)
-            rsquare=${1#rsquare=}
-            ;;
-        freq=*)
-            freq=${1#freq=}
-            ;;
-        input=*)
-            input=${1#input=}
-            ;;
-        output=*)
-            output=${1#output=}
-            ;;
-        *)
-            if [ -z "$new_args" ]; then
-                new_args=$1
-            else
-                new_args="$new_args $1"
-            fi
-            ;;
-    esac
-
-    shift
-done
-
-## run pagetag
-pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt &> /dev/null
-if [ $? -ne 0 ]; then
-    echo "failed: pagetag.py --rsquare $rsquare --freq $freq $input snps.txt neighborhood.txt"
-    exit 1
-fi
-
-## run senatag
-senatag.py neighborhood.txt snps.txt > $output 2> /dev/null
-if [ $? -ne 0 ]; then
-    echo "failed: senatag.py neighborhood.txt snps.txt"
-    exit 1
-fi
-
-## cleanup
-rm -f snps.txt neighborhood.txt
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToDavid.pl
--- a/tools/human_genome_variation/linkToDavid.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,59 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-###################################################
-# linkToDavid.pl
-# Generates a link to DAVID for a list of gene IDs.
-###################################################
-
-if (!@ARGV or scalar @ARGV != 4) {
-    print "usage: linkToDavid.pl infile.tab 1basedCol idType outfile\n";
-    exit 1;
-}
-
-my $in   = shift @ARGV;
-my $col  = shift @ARGV;
-my $type = shift @ARGV;
-my $out  = shift @ARGV;
-
-if ($col < 1) {
-    print "ERROR the column number should use 1-based counting\n";
-    exit 1;
-}
-my @gene;
-open(FH, $in) or die "Couldn't open $in, $!\n";
-while (<FH>) {
-    chomp;
-    my @f = split(/\t/);
-    if (scalar @f < $col) {
-        print "ERROR there is no column $col in $in\n";
-        exit 1;
-    }
-    if ($f[$col-1]) { push(@gene, $f[$col-1]); }
-}
-close FH or die "Couldn't close $in, $!\n";
-
-if (scalar @gene > 400) {
-    print "ERROR DAVID only allows 400 genes submitted via a link\n";
-    exit 1;
-}
-
-my $link = 'http://david.abcc.ncifcrf.gov/api.jsp?type=TYPE&ids=GENELIST&tool=summary';
-
-my $g = join(",", @gene);
-$link =~ s/GENELIST/$g/;
-$link =~ s/TYPE/$type/;
-#print output (HTML markup reconstructed; the original tags were stripped in extraction)
-if (length $link > 2048) {
-    print "ERROR too many genes to fit in URL, please select a smaller set\n";
-    exit;
-}
-open(FH, ">", $out) or die "Couldn't open $out, $!\n";
-print FH '<html><head><title>DAVID link</title></head><body>', "\n",
-    '<a href="', $link, '" target="_blank">click here to send list of identifiers to DAVID</a>', "\n",
-    '</body></html>', "\n";
-close FH or die "Couldn't close $out, $!\n";
-
-exit;
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToDavid.xml
--- a/tools/human_genome_variation/linkToDavid.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-
- functional annotation for a list of genes
-
-
- linkToDavid.pl $input $numerical_column $type $out_file1
-
- [tool interface XML (params, outputs) lost in extraction]
-
- .. class:: infomark
-
-The list is limited to 400 IDs.
-
------
-
-**Dataset formats**
-
-The input dataset is in tabular_ format. The output dataset is html_ with
-a link to the DAVID website as described below.
-(`Dataset missing?`_)
-
-.. _tabular: ./static/formatHelp.html#tab
-.. _html: ./static/formatHelp.html#html
-.. _Dataset missing?: ./static/formatHelp.html
-
------
-
-**What it does**
-
-This tool creates a link to the Database for Annotation,
-Visualization, and Integrated Discovery (DAVID) website at NIH,
-sending a list of IDs from the selected column of a tabular
-Galaxy dataset. To follow the created link, click on the
-eye icon once the Galaxy tool has finished running.
-
-DAVID provides a comprehensive set of functional annotation tools
-to help investigators discover biological meaning behind large
-lists of genes.
-
------
-
-**References**
-
-Huang DW, Sherman BT, Lempicki RA. (2009) Systematic and integrative analysis
-of large gene lists using DAVID bioinformatics resources.
-Nat Protoc. 4(1):44-57.
-
-Dennis G, Sherman BT, Hosack DA, Yang J, Gao W, Lane HC, Lempicki RA. (2003)
-DAVID: database for annotation, visualization, and integrated discovery.
-Genome Biol. 4(5):P3. Epub 2003 Apr 3.
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToGProfile.pl
--- a/tools/human_genome_variation/linkToGProfile.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-###################################################
-# linkToGProfile.pl
-# Generates a link to g:Profiler for a list of gene IDs.
-# g:Profiler: a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
-###################################################
-
-if (!@ARGV or scalar @ARGV != 4) {
-    print "usage: linkToGProfile.pl infile.tab 1basedCol idType outfile\n";
-    exit 1;
-}
-
-my $in   = shift @ARGV;
-my $col  = shift @ARGV;
-my $type = shift @ARGV;
-my $out  = shift @ARGV;
-
-if ($col < 1) {
-    print "ERROR the column number should use 1-based counting\n";
-    exit 1;
-}
-my @gene;
-open(FH, $in) or die "Couldn't open $in, $!\n";
-while (<FH>) {
-    chomp;
-    my @f = split(/\t/);
-    if (scalar @f < $col) {
-        print "ERROR there is no column $col in $in\n";
-        exit 1;
-    }
-    if ($f[$col-1]) { push(@gene, $f[$col-1]); }
-}
-close FH or die "Couldn't close $in, $!\n";
-
-my $link = 'http://biit.cs.ut.ee/gprofiler/index.cgi?organism=hsapiens&query=GENELIST&r_chr=1&r_start=start&r_end=end&analytical=1&domain_size_type=annotated&term=&significant=1&sort_by_structure=1&user_thr=1.00&output=png&prefix=TYPE';
-$link =~ s/TYPE/$type/;
-my $g = join("+", @gene);
-$link =~ s/GENELIST/$g/;
-#print output (HTML markup reconstructed; the original tags were stripped in extraction)
-if (length $link > 2048) {
-    print "ERROR too many genes to fit in URL, please select a smaller set\n";
-    exit;
-}
-open(FH, ">", $out) or die "Couldn't open $out, $!\n";
-print FH '<html><head><title>g:Profiler link</title></head><body>', "\n",
-    '<a href="', $link, '" target="_blank">click here to send list of identifiers to g:Profiler</a>', "\n",
-    '</body></html>', "\n";
-close FH or die "Couldn't close $out, $!\n";
-
-#also consider a link that prints text that could be pulled back into Galaxy?
-exit;
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/linkToGProfile.xml
--- a/tools/human_genome_variation/linkToGProfile.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-
- tools for functional profiling of gene lists
-
-
- linkToGProfile.pl $input $numerical_column $type $out_file1
-
- [tool interface XML (params, outputs) lost in extraction]
-
-**Dataset formats**
-
-The input dataset is tabular_ with a column of identifiers.
-The output dataset is html_ with a link to g:Profiler. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _html: ./static/formatHelp.html#html -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool creates a link to the g:GOSt tool (Gene Group Functional -Profiling), which is part of the g:Profiler site at the University -of Tartu in Estonia. g:GOSt retrieves the most significant Gene -Ontology (GO) terms, KEGG and REACTOME pathways, and TRANSFAC motifs -for a user-specified group of genes, proteins, or microarray probes. -g:GOSt also allows analysis of ranked or ordered lists of genes, -visual browsing of GO graph structure, interactive visualization of -retrieved results, and many other features. Multiple testing -corrections are applied to extract only statistically important -results. - -The g:GOSt form is pre-filled with gene, protein, or microarray probe -IDs from the selected column of a tabular Galaxy dataset. To follow -the created link, click on the eye icon when the Galaxy tool has -finished running. Once at the g:Profiler site, scroll down to see -the g:GOSt results. You can also adjust the options in the g:GOSt -form to your liking, or use the row of links between the form and -the results to run other g:Profiler tools using the same list of IDs. - ------ - -**Reference** - -Reimand J, Kull M, Peterson H, Hansen J, Vilo J. (2007) g:Profiler -- a web-based -toolset for functional profiling of gene lists from large-scale experiments. -Nucleic Acids Res. 35(Web Server issue):W193-200. Epub 2007 May 3. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lped_to_geno.pl --- a/tools/human_genome_variation/lped_to_geno.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#convert from a MAP and PED file to a genotype file -#assumes not many SNPs but lots of individuals -# transposed formats are used when lots of SNPs (TPED, TFAM) - -if (!@ARGV or scalar @ARGV ne 2) { - print "usage: lped_to_geno.pl infile.map infile.ped > outfile\n"; - exit; -} - -my $map = shift @ARGV; -my $ped = shift @ARGV; - -my @snp; #array to hold SNPs from map file -open(FH, $map) or die "Couldn't open $map, $!\n"; -while () { - chomp; - my @f = split(/\s+/); #3 or 4 columns - #chrom ID [distance|morgans] position - if (!exists $f[3]) { $f[3] = $f[2]; } #only 3 columns - #have to leave in so know which to skip later - #if ($f[3] < 0) { next; } #way of excluding SNPs - #if ($f[0] eq '0') { next; } #unplaced SNP - $f[0] = "chr$f[0]"; - push(@snp, "$f[0]:$f[3]:$f[1]"); -} -close FH or die "Couldn't finish $map, $!\n"; - -#rows are individuals, columns are SNPs (7 & up) -#need to print row per SNP -my @allele; #alleles to go with @snp -my @pheno; #marker for phenotype -open(FH, $ped) or die "Couldn't open $ped, $!\n"; -while () { - chomp; - my @f = split(/\s+/); - if (!defined $f[5]) { die "ERROR undefined phenotype $f[0] $f[1] $f[2] $f[3] $f[4]\n"; } - push(@pheno, $f[5]); - my $j = 0; - for(my $i = 6; $i< $#f; $i+=2) { - if (!$allele[$j]) { $allele[$j] = ''; } - #can be ACTG or 1234 (for haploview etc) or 0 for missing - if ($f[$i] eq '1') { $f[$i] = 'A'; } - elsif ($f[$i] eq '2') { $f[$i] = 'C'; } - elsif ($f[$i] eq '3') { $f[$i] = 'G'; } - elsif ($f[$i] eq '4') { $f[$i] = 'T'; } - if ($f[$i+1] eq '1') { $f[$i+1] = 'A'; } - elsif ($f[$i+1] eq '2') { $f[$i+1] = 'C'; } - elsif ($f[$i+1] eq '3') { $f[$i+1] = 'G'; } - elsif ($f[$i+1] eq '4') { 
$f[$i+1] = 'T'; } - $f[$i] = uc($f[$i]); - $f[$i+1] = uc($f[$i+1]); - $allele[$j] .= " $f[$i]$f[$i+1]"; - $j++; - } -} -close FH or die "Couldn't close $ped, $!\n"; - -print "ID Chr Pos"; -foreach (@pheno) { if ($_ > 0) { print " ", $_ - 1; }} #go from 1/2 to 0/1 -print "\n"; -for(my $i =0; $i <= $#snp; $i++) { #foreach snp - $allele[$i] =~ /(\w)/; - my $nt = $1; - my $j = 0; - my @t = split(/:/, $snp[$i]); - if ($t[0] eq 'chr0' or $t[1] < 0) { next; } #skip this SNP - if ($t[0] eq 'chrX') { $t[0] = 'chr23'; } - elsif ($t[0] eq 'chrY') { $t[0] = 'chr24'; } - elsif ($t[0] eq 'chrXY') { $t[0] = 'chr23'; } - elsif ($t[0] eq 'chrMT') { $t[0] = 'chr25'; } - print "$t[2] $t[0] $t[1]"; - $allele[$i] =~ s/^\s+//; - foreach my $p (split(/ +/, $allele[$i])) { - if ($pheno[$j] > 0) { #pheno 0 or -9 skip - #change AA BB AB to 2 0 1 - if ($p eq "$nt$nt") { print " 2"; } - elsif ($p =~ /$nt/) { print " 1"; } - else { print " 0"; } - } - $j++; - } - print "\n"; -} - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lps.xml --- a/tools/human_genome_variation/lps.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,304 +0,0 @@ - - LASSO-Patternsearch algorithm - - - lps_tool_wrapper.sh $lambda_fac $input_file $label_column $output_file $log_file - Initialization 0 - #if $advanced.options == "true": - Sample $advanced.sample - Verbosity $advanced.verbosity - Standardize $advanced.standardize - initialLambda $advanced.initialLambda - #if $advanced.continuation.continuation == "1": - Continuation $advanced.continuation.continuation - continuationSteps $advanced.continuation.continuationSteps - accurateIntermediates $advanced.continuation.accurateIntermediates - #end if - printFreq $advanced.printFreq - #if $advanced.newton.newton == "1": - Newton $advanced.newton.newton - NewtonThreshold $advanced.newton.newtonThreshold - #end if - HessianSampleFraction $advanced.hessianSampleFraction - BB 0 - Monotone 0 - FullGradient $advanced.fullGradient - GradientFraction $advanced.gradientFraction - InitialAlpha $advanced.initialAlpha - AlphaIncrease $advanced.alphaIncrease - AlphaDecrease $advanced.alphaDecrease - AlphaMax $advanced.alphaMax - c1 $advanced.c1 - MaxIter $advanced.maxIter - StopTol $advanced.stopTol - IntermediateTol $advanced.intermediateTol - FinalOnly $advanced.finalOnly - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - lps_tool - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input and output datasets are tabular_. The columns are described below. -There is a second output dataset (a log) that is in text_ format. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _text: ./static/formatHelp.html#text -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -The LASSO-Patternsearch algorithm fits your dataset to an L1-regularized -logistic regression model. A benefit of using L1-regularization is -that it typically yields a weight vector with relatively few non-zero -coefficients. - -For example, say you have a dataset containing M rows (subjects) -and N columns (attributes) where one of these N attributes is binary, -indicating whether or not the subject has some property of interest P. -In simple terms, LPS calculates a weight for each of the other attributes -in your dataset. 
This weight indicates how "relevant" that attribute -is for predicting whether or not a given subject has property P. -The L1-regularization causes most of these weights to be equal to zero, -which means LPS will find a "small" subset of the remaining N-1 attributes -in your dataset that can be used to predict P. - -In other words, LPS can be used for feature selection. - -The input dataset is tabular, and must contain a label column which -indicates whether or not a given row has property P. In the current -version of this tool, P must be encoded using +1 and -1. The Lambda_fac -parameter ranges from 0 to 1, and controls how sparse the weight -vector will be. At the low end, when Lambda_fac = 0, there will be -no regularization. At the high end, when Lambda_fac = 1, there will be -"too much" regularization, and all of the weights will equal zero. - -The LPS tool creates two output datasets. The first, called the results -file, is a tabular dataset containing one column of weights for each -value of the regularization parameter lambda that was tried. The weight -columns are in order from left to right by decreasing values of lambda. -The first N-1 rows in each column are the weights for the N-1 attributes -in your input dataset. The final row is a constant, the intercept. - -Let **x** be a row from your input dataset and let **b** be a column -from the results file. To compute the probability that row **x** has -a label value of +1: - - Probability(row **x** has label value = +1) = 1 / [1 + exp{**x** \* **b**\[1..N-1\] + **b**\[N\]}] - -where **x** \* **b**\[1..N-1\] represents matrix multiplication. - -The second output dataset, called the log file, is a text file which -contains additional data about the fitted L1-regularized logistic -regression model. These data include the number of features, the -computed value of lambda_max, the actual values of lambda used, the -optimal values of the log-likelihood and regularized log-likelihood -functions, the number of non-zeros, and the number of iterations. - -Website: http://pages.cs.wisc.edu/~swright/LPS/ - ------ - -**Example** - -- input file:: - - +1 1 0 0 0 0 1 0 1 1 ... - +1 1 1 1 0 0 1 0 1 1 ... - +1 1 0 1 0 1 0 1 0 1 ... - etc. - -- output results file:: - - 0 - 0 - 0 - 0 - 0.025541 - etc. - -- output log file:: - - Data set has 100 vectors with 50 features. - calculateLambdaMax: n=50, m=100, m+=50, m-=50 - computed value of lambda_max: 5.0000e-01 - - lambda=2.96e-02 solution: - optimal log-likelihood function value: 6.46e-01 - optimal *regularized* log-likelihood function value: 6.79e-01 - number of nonzeros at the optimum: 5 - number of iterations required: 43 - etc. - ------ - -**References** - -Koh K, Kim S-J, Boyd S. (2007) -An interior-point method for large-scale l1-regularized logistic regression. -Journal of Machine Learning Research. 8:1519-1555. - -Shi W, Wahba G, Wright S, Lee K, Klein R, Klein B. (2008) -LASSO-Patternsearch algorithm with application to ophthalmology and genomic data. -Stat Interface. 1(1):137-153. - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/lps_tool_wrapper.sh --- a/tools/human_genome_variation/lps_tool_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# script for execution of deployed applications -# -# Sets up the MCR environment for the current $ARCH and executes -# the specified command. 
-# - -export PATH=$PATH:$(dirname $0) - -MCRROOT=${MCRROOT:-/galaxy/software/linux2.6-x86_64/bin/MCR-7.11/v711} -MWE_ARCH=glnxa64 - -if [ "$MWE_ARCH" = "sol64" ] ; then - LD_LIBRARY_PATH=.:/usr/lib/lwp:${MCRROOT}/runtime/glnxa64 -else - LD_LIBRARY_PATH=.:${MCRROOT}/runtime/glnxa64 -fi - -LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64 -LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/sys/os/glnxa64 - -if [ "$MWE_ARCH" = "maci" -o "$MWE_ARCH" = "maci64" ]; then - DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/System/Library/Frameworks/JavaVM.framework/JavaVM:/System/Library/Frameworks/JavaVM.framework/Libraries -else - MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64 - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/client - LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE} -fi - -XAPPLRESDIR=${MCRROOT}/X11/app-defaults - -export LD_LIBRARY_PATH XAPPLRESDIR - -lps_tool $* - -exit 0 diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/mergeSnps.pl --- a/tools/human_genome_variation/mergeSnps.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#this merges the significance output with the SNPs so users get more than an index - -my($out, $snp) = @ARGV; - -if (!$out or !$snp) { die "missing args\n"; } - -#merge SNP data with results -merge(); - -exit; - -######################################## - -#merge the input and output files so have SNP data with result -sub merge { - open(FH, $out) or die "Couldn't open $out, $!\n"; - my %res; - my @ind; - while () { - chomp; - my $line = $_; - #0: 10 score= 14.224153 , df= 2 , p= 0.040760 , N=50 - if ($line =~ /^(\d+):\s+(.*)/) { $res{$1} = $2; push(@ind, $1); } - } - close FH; - if (!@ind) { return; } #no results, leave alone - @ind = sort { $a <=> $b } @ind; - #read input file to get SNP data - open(FH, $snp) or die "Couldn't open $snp, $!\n"; - my $i = 0; #0 based, not counting ID line - my $c = shift @ind; - while () { - chomp; - if (/^ID/) { next; } - my @f = split(/\s+/); - if ($i == $c) { - $res{$i} = "$f[0]\t$f[1]\t$f[2]\t$res{$i}"; - if (!@ind) { last; } - $c = shift @ind; - } - $i++; - } - close FH; - #now reprint results with SNP data included - open(FH, ">", $out) or die "Couldn't write to $out, $!\n"; - print FH "ID\tchr\tposition\tresults\n"; - foreach $i (keys %res) { - print FH $res{$i}, "\n"; - } - close FH; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pagetag.py --- a/tools/human_genome_variation/pagetag.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,297 +0,0 @@ -#!/usr/bin/env python - -""" -This accepts as input a file of the following format: - - Site Sample Allele1 Allele2 - -for example: - - 000834 D001 G G - 000834 D002 G G - 000834 D003 G G - 000834 D004 G G - 000834 D005 N N - 000834 E001 G G - 000834 E002 G G - 000834 E003 G G - 000834 E004 G G - 000834 E005 G G - 000963 D001 T T - 000963 D002 T T - 000963 D003 T T - 000963 D004 T T - 000963 D005 N N - 000963 E001 T T - 000963 E002 N N - 000963 E003 G T - 000963 E004 G G - 000963 E005 G T - -and a rsquare threshold and outputs two files: - -a) a file of input snps (one on each line). 
A SNP is identified by the "Site" -column in the input file - -b) a file where each line has the following: - SNP list -where SNP is one of the SNPs and the "list" is a comma separated list of SNPs -that exceed the rsquare threshold with the first SNP. -""" - -from sys import argv, stderr, exit -from getopt import getopt, GetoptError - -__author__ = "Aakrosh Ratan" -__email__ = "ratan@bx.psu.edu" - -# do we want the debug information to be printed? -debug_flag = False - -# denote different combos of alleles in code -HOMC = str(1) -HOMR = str(2) -HETE = str(3) -OTHER = str(4) - -indexcalculator = {(HOMC,HOMC) : 0, - (HOMC,HOMR) : 1, - (HOMC,HETE) : 2, - (HOMR,HOMC) : 3, - (HOMR,HOMR) : 4, - (HOMR,HETE) : 5, - (HETE,HOMC) : 6, - (HETE,HOMR) : 7, - (HETE,HETE) : 8} - -def read_inputfile(filename, samples): - input = {} - - file = open(filename, "r") - - for line in file: - position,sample,allele1,allele2 = line.split() - - # if the user specified a list of samples, then only use those samples - if samples != None and sample not in samples: continue - - if position in input: - v = input[position] - v[sample] = (allele1,allele2) - else: - v = {sample : (allele1, allele2)} - input[position] = v - - file.close() - return input - -def annotate_locus(input, minorallelefrequency, snpsfile): - locus = {} - for k,v in input.items(): - genotypes = [x for x in v.values()] - alleles = [y for x in genotypes for y in x] - alleleset = list(set(alleles)) - alleleset = list(set(alleles) - set(["N","X"])) - - if len(alleleset) == 2: - genotypevec = "" - num1 = len([x for x in alleles if x == alleleset[0]]) - num2 = len([x for x in alleles if x == alleleset[1]]) - - if num1 > num2: - major = alleleset[0] - minor = alleleset[1] - minorfreq = (num2 * 1.0)/(num1 + num2) - else: - major = alleleset[1] - minor = alleleset[0] - minorfreq = (num1 * 1.0)/(num1 + num2) - - if minorfreq < minorallelefrequency: continue - - for gen in genotypes: - if gen == (major,major): - genotypevec += HOMC - elif gen == (minor,minor): - genotypevec += HOMR - elif gen == (major, minor) or gen == (minor, major): - genotypevec += HETE - else: - genotypevec += OTHER - - locus[k] = genotypevec,minorfreq - elif len(alleleset) > 2: - print >> snpsfile, k - return locus - -def calculateLD(loci, rsqthreshold): - snps = list(loci) - rsquare = {} - - for index,loc1 in enumerate(snps): - for loc2 in snps[index + 1:]: - matrix = [0]*9 - - vec1 = loci[loc1][0] - vec2 = loci[loc2][0] - - for gen in zip(vec1,vec2): - if gen[0] == OTHER or gen[1] == OTHER: continue - matrix[indexcalculator[gen]] += 1 - - n = sum(matrix) - x11 = 2*matrix[0] + matrix[2] + matrix[6] - x12 = 2*matrix[1] + matrix[2] + matrix[7] - x21 = 2*matrix[3] + matrix[6] + matrix[5] - x22 = 2*matrix[4] + matrix[6] + matrix[5] - - p = (x11 + x12 + matrix[8] * 1.0) / (2 * n) - q = (x11 + x21 + matrix[8] * 1.0) / (2 * n) - - p11 = p * q - - oldp11 = p11 - range = 0.0 - converged = False - convergentcounter = 0 - if p11 > 0.0: - while converged == False and convergentcounter < 100: - if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0: - num = matrix[8] * p11 * (1.0 - p - q + p11) - den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11) - p11 = (x11 + (num/den))/(2.0*n) - range = p11/oldp11 - if range >= 0.9999 and range <= 1.001: - converged = True - oldp11 = p11 - convergentcounter += 1 - else: - converged = True - - dvalue = 0.0 - if converged == True: - dvalue = p11 - (p * q) - - if dvalue != 0.0: - rsq = (dvalue**2)/(p*q*(1-p)*(1-q)) - if rsq >= rsqthreshold: - rsquare["%s %s" % 
(loc1,loc2)] = rsq
-
-    return rsquare
-
-def main(inputfile, snpsfile, neighborhoodfile, \
-         rsquare, minorallelefrequency, samples):
-    # read the input file
-    input = read_inputfile(inputfile, samples)
-    print >> stderr, "Read %d locations" % len(input)
-
-    # open the snpsfile to print
-    file = open(snpsfile, "w")
-
-    # annotate the inputs, remove the abnormal loci (which do not have 2 alleles)
-    # and add the major and minor alleles to each locus
-    loci = annotate_locus(input, minorallelefrequency, file)
-    print >> stderr, "Kept %d interesting locations" % len(loci)
-
-    # print all the interesting loci as candidate snps
-    for k in loci.keys(): print >> file, k
-    file.close()
-    print >> stderr, "Finished creating the snpsfile"
-
-    # calculate the LD values and store them if they exceed the threshold
-    lds = calculateLD(loci, rsquare)
-    print >> stderr, "Calculated all the LD values"
-
-    # create a list of SNPs
-    snps = {}
-    ldvals = {}
-    for k,v in lds.items():
-        s1,s2 = k.split()
-        if s1 in snps: snps[s1].append(s2)
-        else         : snps[s1] = [s2]
-        if s2 in snps: snps[s2].append(s1)
-        else         : snps[s2] = [s1]
-
-        if s1 in ldvals: ldvals[s1].append(str(v))
-        else           : ldvals[s1] = [str(v)]
-        if s2 in ldvals: ldvals[s2].append(str(v))
-        else           : ldvals[s2] = [str(v)]
-
-    # print the snps to the output file
-    file = open(neighborhoodfile, "w")
-
-    for k,v in snps.items():
-        ldv = ldvals[k]
-        if debug_flag == True:
-            print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
-        else:
-            print >> file, "%s\t%s" % (k, ",".join(v))
-
-    file.close()
-
-
-def read_list(filename):
-    file = open(filename, "r")
-    list = {}
-
-    for line in file:
-        list[line.strip()] = 1
-
-    file.close()
-    return list
-
-def usage():
-    f = stderr
-    print >> f, "usage:"
-    print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
-    print >> f, "where input.txt is the prettybase file"
-    print >> f, "where snps.txt is the first output file with the snps"
-    print >> f, "where neighborhood.txt is the output neighborhood file"
-    print >> f, "where the options are:"
-    print >> f, "-h,--help : print usage and quit"
-    print >> f, "-d,--debug: print debug information"
-    print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
-    print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
-    print >> f, "-s,--sample : a list of samples to be clustered"
-
-if __name__ == "__main__":
-    try:
-        opts, args = getopt(argv[1:], "hds:r:f:",\
-                     ["help", "debug", "rsquare=","freq=", "sample="])
-    except GetoptError, err:
-        print str(err)
-        usage()
-        exit(2)
-
-    rsquare = 0.64
-    minorallelefrequency = 0.0
-    samples = None
-
-    for o, a in opts:
-        if o in ("-h", "--help"):
-            usage()
-            exit()
-        elif o in ("-d", "--debug"):
-            debug_flag = True
-        elif o in ("-r", "--rsquare"):
-            rsquare = float(a)
-        elif o in ("-f", "--freq"):
-            minorallelefrequency = float(a)
-        elif o in ("-s", "--sample"):
-            samples = read_list(a)
-        else:
-            assert False, "unhandled option"
-
-    if rsquare < 0.00 or rsquare > 1.00:
-        print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
-        exit(3)
-
-    if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
-        print >> stderr, "input value of MAF should be in [0.00, 0.50]"
-        exit(4)
-
-    if len(args) != 3:
-        usage()
-        exit(5)
-
-    main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pass.xml
--- a/tools/human_genome_variation/pass.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,126 +0,0 @@
-
-
significant transcription factor binding sites from ChIP data - - - pass_wrapper.sh "$input" "$min_window" "$max_window" "$false_num" "$output" - - - - - - - - - - - - - - - pass - sed - - - - - -**Dataset formats** - -The input is in GFF_ format, and the output is tabular_. -(`Dataset missing?`_) - -.. _GFF: ./static/formatHelp.html#gff -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -PASS (Poisson Approximation for Statistical Significance) detects -significant transcription factor binding sites in the genome from -ChIP data. This is probably the only peak-calling method that -accurately controls the false-positive rate and FDR in ChIP data, -which is important given the huge discrepancy in results obtained -from different peak-calling algorithms. At the same time, this -method achieves a similar or better power than previous methods. - - - ------ - -**Hints** - -- ChIP-Seq data: - - If the data is from ChIP-Seq, you need to convert the ChIP-Seq values - into z-scores before using this program. It is also recommended that - you group read counts within a neighborhood together, e.g. in tiled - windows of 30bp. In this way, the ChIP-Seq data will resemble - ChIP-chip data in format. - -- Choosing window size options: - - The window size is related to the probe tiling density. For example, - if the probes are tiled at every 100bp, then setting the smallest - window = 2 and largest window = 6 is appropriate, because the DNA - fragment size is around 300-500bp. - ------ - -**Example** - -- input file:: - - chr7 Nimblegen ID 40307603 40307652 1.668944 . . . - chr7 Nimblegen ID 40307703 40307752 0.8041307 . . . - chr7 Nimblegen ID 40307808 40307865 -1.089931 . . . - chr7 Nimblegen ID 40307920 40307969 1.055044 . . . - chr7 Nimblegen ID 40308005 40308068 2.447853 . . . - chr7 Nimblegen ID 40308125 40308174 0.1638694 . . . - chr7 Nimblegen ID 40308223 40308275 -0.04796628 . . . - chr7 Nimblegen ID 40308318 40308367 0.9335709 . . . - chr7 Nimblegen ID 40308526 40308584 0.5143972 . . . - chr7 Nimblegen ID 40308611 40308660 -1.089931 . . . - etc. - - In GFF, a value of dot '.' is used to mean "not applicable". - -- output file:: - - ID Chr Start End WinSz PeakValue # of FPs FDR - 1 chr7 40310931 40311266 4 1.663446 0.248817 0.248817 - ------ - -**References** - -Zhang Y. (2008) -Poisson approximation for significance in genome-wide ChIP-chip tiling arrays. -Bioinformatics. 24(24):2825-31. Epub 2008 Oct 25. - -Chen KB, Zhang Y. (2010) -A varying threshold method for ChIP peak calling using multiple sources of information. -Submitted. 
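-
------
-
-**A sketch of the ChIP-Seq hint**
-
-The z-score conversion suggested under **Hints** can be illustrated in a few
-lines. The sketch below is not part of PASS; the per-base "chrom TAB pos TAB
-count" input layout and the 30bp window size are assumptions made for the
-example. It bins read counts into tiled windows and emits GFF-like rows of
-z-scores::
-
-    #!/usr/bin/env python
-    # Hypothetical preprocessing: sum per-base ChIP-Seq read counts into
-    # 30bp windows and z-score the sums so the data resembles ChIP-chip.
-    import sys, math
-
-    WINDOW = 30
-    counts = {}  # (chrom, window index) -> summed read count
-    for row in open(sys.argv[1]):  # assumed format: chrom TAB pos TAB count
-        chrom, pos, count = row.split()
-        key = (chrom, int(pos) // WINDOW)
-        counts[key] = counts.get(key, 0) + int(count)
-
-    vals = counts.values()
-    mean = sum(vals) / float(len(vals))
-    sd = math.sqrt(sum((v - mean) ** 2 for v in vals) / len(vals)) or 1.0
-    for (chrom, win), v in sorted(counts.items()):
-        start, end = win * WINDOW + 1, (win + 1) * WINDOW
-        print "%s\tbinned\tID\t%d\t%d\t%f\t.\t.\t." % (chrom, start, end, (v - mean) / sd)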
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/pass_wrapper.sh
--- a/tools/human_genome_variation/pass_wrapper.sh Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-
-export PATH=$PATH:$(dirname $0)
-
-input=$1
-min_window=$2
-max_window=$3
-false_num=$4
-output=$5
-
-pass "$input" "$min_window" "$max_window" "$false_num" "$output" >/dev/null
-sed -i -e 's/\t\t*/\t/g' "$output"
-
diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/senatag.py
--- a/tools/human_genome_variation/senatag.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,243 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This tool takes the following pair of files as input:
-a) input_snp : A file with identifiers for SNPs (one on each line)
-b) ldfile : A file where each line has the following
-    snp list
-    where "snp" is an identifier for one SNP and the "list" is a
-    comma-separated list of all the other snps that are in LD with
-    it (as per some threshold of rsquare)
-
-The output is a set of tag SNPs for the given datasets.
-
-The algorithm is as follows:
-
-a) Construct a graph for each population, where each node is a SNP and two nodes
-are connected by an edge iff they are in LD.
-b) For each SNP, count the number of connected nodes that have not yet
-been visited.
-c) Find the SNP with the highest count and assign it to be a tag SNP.
-d) Mark that SNP and all the snps connected to it as "visited". This should be
-done for each population.
-e) Repeat steps b-d until all SNPs in all populations have been visited.
-"""
-
-from sys import argv, stderr, exit
-from getopt import getopt, GetoptError
-
-import os
-import heapq
-
-__author__ = "Aakrosh Ratan"
-__email__ = "ratan@bx.psu.edu"
-
-# do we want the debug information to be printed?
-debug_flag = False
-
-class node:
-    def __init__(self, name):
-        self.name = name
-        self.edges = []
-        self.visited = False
-
-    # return the number of nodes connected to this node that have yet to be
-    # visited
-    def num_not_visited(self):
-        num = 0
-        for n in self.edges:
-            if n.visited == False: num += 1
-        return num
-
-    def __cmp__(self, other):
-        return other.num_not_visited() - self.num_not_visited()
-
-    def __str__(self):
-        return self.name
-
-class graph:
-    def __init__(self):
-        self.nodes = {}
-
-    def __str__(self):
-        string = ""
-        for n1 in self.nodes.values():
-            n2s = [x.name for x in n1.edges]
-            string += "%s %s\n" % (n1.name, ",".join(n2s))
-        return string[:-1]
-
-    def add_node(self, n):
-        self.nodes[n.name] = n
-
-    def add_edges(self, n1, n2):
-        assert n1.name in self.nodes
-        assert n2.name in self.nodes
-        n1.edges.append(n2)
-        n2.edges.append(n1)
-
-    def check_graph(self):
-        for n in self.nodes.values():
-            ms = [x for x in n.edges]
-            for m in ms:
-                if n not in m.edges:
-                    print >> stderr, "check : %s - %s" % (n,m)
-
-def construct_graph(ldfile, snpfile):
-    # construct the initial graph.
add all the SNPs as nodes - g = graph() - file = open(snpfile, "r") - - for line in file: - # ignore empty lines and add the remainder to the graph - if len(line.strip()) == 0: continue - n = node(line.strip()) - g.add_node(n) - - file.close() - print >> stderr, "Added %d nodes to a graph" % len(g.nodes) - - # now add all the edges - file = open(ldfile, "r") - - for line in file: - tokens = line.split() - assert len(tokens) == 2 - - # if this node is in the graph, then we need to construct an edge from - # this node to all the nodes which are highly related to it - if tokens[0] in g.nodes: - n1 = g.nodes[tokens[0]] - n2s = [g.nodes[x] for x in tokens[1].split(",")] - - for n2 in n2s: - g.add_edges(n1, n2) - - file.close() - print >> stderr, "Added all edges to the graph" - - return g - -def check_output(g, tagsnps): - # find all the nodes in the graph - allsnps = [x.name for x in g.nodes.values()] - - # find the nodes that are covered by our tagsnps - mysnps = [x.name for x in tagsnps] - - for n in tagsnps: - for m in n.edges: - mysnps.append(m.name) - - mysnps = list(set(mysnps)) - - if set(allsnps) != set(mysnps): - diff = list(set(allsnps) - set(mysnps)) - print >> stderr, "%s are not covered" % ",".join(diff) - -def main(ldfile, snpsfile, required, excluded): - # construct the graph - g = construct_graph(ldfile, snpsfile) - if debug_flag == True: g.check_graph() - - tagsnps = [] - neighbors = {} - - # take care of the SNPs that are required to be TagSNPs - for s in required: - t = g.nodes[s] - - t.visited = True - ns = [] - - for n in t.edges: - if n.visited == False: ns.append(n.name) - n.visited = True - - tagsnps.append(t) - neighbors[t.name] = list(set(ns)) - - # find the tag SNPs for this graph - data = [x for x in g.nodes.values()] - heapq.heapify(data) - - while data: - s = heapq.heappop(data) - - if s.visited == True or s.name in excluded: continue - - s.visited = True - ns = [] - - for n in s.edges: - if n.visited == False: ns.append(n.name) - n.visited = True - - tagsnps.append(s) - neighbors[s.name] = list(set(ns)) - - heapq.heapify(data) - - for s in tagsnps: - if len(neighbors[s.name]) > 0: - print "%s\t%s" % (s, ",".join(neighbors[s.name])) - continue - print s - - if debug_flag == True: check_output(g, tagsnps) - -def read_list(filename): - assert os.path.exists(filename) == True - file = open(filename, "r") - list = {} - - for line in file: - list[line.strip()] = 1 - - file.close() - return list - -def usage(): - f = stderr - print >> f, "usage:" - print >> f, "senatag [options] neighborhood.txt inputsnps.txt" - print >> f, "where inputsnps.txt is a file of snps from one population" - print >> f, "where neighborhood.txt is neighborhood details for the pop." 
- print >> f, "where the options are:" - print >> f, "-h,--help : print usage and quit" - print >> f, "-d,--debug: print debug information" - print >> f, "-e,--excluded : file with names of SNPs that cannot be TagSNPs" - print >> f, "-r,--required : file with names of SNPs that should be TagSNPs" - -if __name__ == "__main__": - try: - opts, args = getopt(argv[1:], "hdr:e:",\ - ["help", "debug", "required=", "excluded="]) - except GetoptError, err: - print str(err) - usage() - exit(2) - - required = {} - excluded = {} - - for o, a in opts: - if o in ("-h", "--help"): - usage() - exit() - elif o in ("-d", "--debug"): - debug_flag = True - elif o in ("-r", "--required"): - required = read_list(a) - elif o in ("-e", "--excluded"): - excluded = read_list(a) - else: - assert False, "unhandled option" - - if len(args) != 2: - usage() - exit(3) - - assert os.path.exists(args[0]) == True - assert os.path.exists(args[1]) == True - - main(args[0], args[1], required, excluded) diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/sift.xml --- a/tools/human_genome_variation/sift.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,174 +0,0 @@ - - predictions of functional sites - - - sift_variants_wrapper.sh "$input" "$output" "${input.metadata.dbkey}" "${GALAXY_DATA_INDEX_DIR}/sift_db.loc" "$chrom_col" "$pos_col" "$base" "$allele_col" "$strand_source.strand_col" "$comment_source.comment_col" "$output_opts" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - awk - rm - sed - - - - - - - - - - - - - - - - - -.. class:: warningmark - -This currently works only for builds hg18 or hg19. - ------ - -**Dataset formats** - -The input and output datasets are tabular_. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -SIFT predicts whether an amino-acid substitution affects protein function, -based on sequence homology and the physical properties of amino acids. -SIFT can be applied to naturally occurring non-synonymous polymorphisms -and laboratory-induced missense mutations. This tool uses SQLite databases -containing pre-computed SIFT scores and annotations for all possible nucleotide -substitutions at each position in the human exome. Allele frequency data -are from the HapMap frequency database, and additional transcript and -gene-level data are from Ensembl BioMart. - -The input dataset must contain columns for the chromosome, position, and -alleles. The alleles must be two nucleotides separated by '/', -usually the reference allele and the allele of interest. -The strand must either be in another column or all the same. -The output contains a standard set of columns plus the additional ones that -have been selected from the list above. - -Website: http://sift.jcvi.org/ - ------ - -**Example** - -- input file:: - - chr3 81780820 + T/C - chr2 230341630 + G/A - chr2 43881517 + A/T - chr2 43857514 + T/C - chr6 88375602 + G/A - chr22 29307353 - T/A - chr10 115912482 - G/T - chr10 115900918 - C/T - chr16 69875502 + G/T - etc. 
- -- output file:: - - #Chrom Position Strand Allele Codons Transcript ID Protein ID Substitution Region dbSNP ID SNP Type Prediction Score Median Info Num seqs at position User Comment - chr3 81780820 + T/C AGA-gGA ENST00000264326 ENSP00000264326 R190G EXON CDS rs2229519:C Nonsynonymous DAMAGING 0.04 3.06 149 - chr2 230341630 + G/T - ENST00000389045 ENSP00000373697 NA EXON CDS rs1803846:A Unknown Not scored NA NA NA - chr2 43881517 + A/T ATA-tTA ENST00000260605 ENSP00000260605 I230L EXON CDS rs11556157:T Nonsynonymous TOLERATED 0.47 3.19 7 - chr2 43857514 + T/C TTT-TcT ENST00000260605 ENSP00000260605 F33S EXON CDS rs2288709:C Nonsynonymous TOLERATED 0.61 3.33 6 - chr6 88375602 + G/A GTT-aTT ENST00000257789 ENSP00000257789 V217I EXON CDS rs2307389:A Nonsynonymous TOLERATED 0.75 3.17 13 - chr22 29307353 + T/A ACC-tCC ENST00000335214 ENSP00000334612 T264S EXON CDS rs42942:A Nonsynonymous TOLERATED 0.4 3.14 23 - chr10 115912482 + C/A CGA-CtA ENST00000369285 ENSP00000358291 R179L EXON CDS rs12782946:T Nonsynonymous TOLERATED 0.06 4.32 2 - chr10 115900918 + G/A CAA-tAA ENST00000369287 ENSP00000358293 Q271* EXON CDS rs7095762:T Nonsynonymous N/A N/A N/A N/A - chr16 69875502 + G/T ACA-AaA ENST00000338099 ENSP00000337512 T608K EXON CDS rs3096381:T Nonsynonymous TOLERATED 0.12 3.41 3 - etc. - ------ - -**References** - -Ng PC, Henikoff S. (2001) Predicting deleterious amino acid substitutions. -Genome Res. 11(5):863-74. - -Ng PC, Henikoff S. (2002) Accounting for human polymorphisms predicted to affect protein function. -Genome Res. 12(3):436-46. - -Ng PC, Henikoff S. (2003) SIFT: Predicting amino acid changes that affect protein function. -Nucleic Acids Res. 31(13):3812-4. - -Kumar P, Henikoff S, Ng PC. (2009) Predicting the effects of coding non-synonymous variants -on protein function using the SIFT algorithm. -Nat Protoc. 4(7):1073-81. Epub 2009 Jun 25. 
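-
------
-
-**Checking the input columns**
-
-Because the wrapper maps user-chosen column numbers onto the chromosome,
-position, strand and allele fields, a quick pre-flight scan can catch
-malformed rows before submission. The sketch below is an illustration only,
-not part of SIFT or of this wrapper, and it assumes the fixed four-column
-order of the example above::
-
-    #!/usr/bin/env python
-    # Hypothetical validator for rows like: chr3 TAB 81780820 TAB + TAB T/C
-    import re, sys
-
-    row = re.compile(r'^chr[0-9XYM]+\t\d+\t[+-]\t[ACGT]/[ACGT]$')
-    for num, line in enumerate(open(sys.argv[1]), 1):
-        if not row.match(line.rstrip('\n')):
-            print >> sys.stderr, "line %d is malformed: %s" % (num, line.rstrip())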
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/sift_variants_wrapper.sh --- a/tools/human_genome_variation/sift_variants_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -#!/usr/bin/env bash - -input_file=$1 -output_file=$2 -org=$3 -db_loc=$4 -chrom_col=$5 -pos_col=$6 -base=$7 -allele_col=$8 -strand_col=$9 -comment_col=${10} -output_opts=${11} - -working_dir=$PWD -sift_input="$working_dir/sift_input.txt" -sift_output="$working_dir/sift_output.txt" - -################################################################################ -## make sure input file column selections are mutually exclusive ## -################################################################################ -ERROR=0 -declare -a col_use - -function check_col () { - local col=$1 - local use=$2 - local int=$3 - - if [ -n "${col//[0-9]}" ]; then - if [ $int -eq 1 ]; then - echo "ERROR: invalid value for $use column: $col" 1>&2 - ERROR=1 - fi - return - fi - - local cur=${col_use[$col]} - if [ -n "$cur" ]; then - echo "ERROR: $use column is the same as $cur column" 1>&2 - col_use[$col]="${cur},$use" - ERROR=1 - else - col_use[$col]=$use - fi -} - -check_col $chrom_col 'chromosome' 1 -check_col $pos_col 'position' 1 -check_col $allele_col 'allele' 1 -check_col $strand_col 'strand' 0 -check_col $comment_col 'comment' 0 - -if [ $ERROR -ne 0 ]; then - exit 1 -fi - -################################################################################ -## get/check the db directory from the argument org,db_loc ## -################################################################################ -db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc ) - -if [ -z "$db_dir" ]; then - echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2 - exit 1 -fi - -if [ ! -d "$db_dir" ]; then - echo "Can't access SIFT database directory \"$db_dir\"" 1>&2 - exit 1 -fi - -################################################################################ -## create input file for SIFT_exome_nssnvs.pl ## -################################################################################ -if [ ! 
-r "$input_file" ]; then - echo "Can't read input file \"$input_file\"" 1>&2 - exit 1 -fi - -if [ $base -eq 0 ]; then - beg_col="$pos_col" - end_col="$pos_col + 1" - pos_adj='$2 = $2 - 1' -else - beg_col="$pos_col - 1" - end_col="$pos_col" - pos_adj='' -fi - -strand_cvt='' -if [ \( "$strand_col" = "+" \) ]; then - strand='"1"' -elif [ \( "$strand_col" = "-" \) ]; then - strand='"-1"' -else - strand="\$$strand_col" - strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}' -fi - -print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"'' -if [ "$comment_col" != "-" ]; then - print_row=''"${print_row}"', $'"${comment_col}"'' -fi - -awk ' -BEGIN {FS="\t";OFS=","} -$'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)} -{ - '"${strand_cvt}"' - '"${print_row}"' -} -' "$input_file" > "$sift_input" - -################################################################################ -## run SIFT_exome_nssnvs.pl command line program ## -################################################################################ -if [ "$output_opts" = "None" ]; then - output_opts="" -else - output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' ) - output_opts="-$output_opts 1" -fi - -SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output" -if [ $? -ne 0 ]; then - echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts" - exit 1 -fi - -################################################################################ -## locate the SIFT_exome_nssnvs.pl output file ## -################################################################################ -sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" ) - -if [ -z "$sift_pid" ]; then - echo "Can't find SIFT pid in \"$sift_output\"" 1>&2 - exit 1 -fi - -sift_outdir="$working_dir/$sift_pid" -if [ ! -d "$sift_outdir" ]; then - echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2 - exit 1 -fi - -sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv" -if [ ! 
-r "$sift_outfile" ]; then - echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2 - exit 1 -fi - -################################################################################ -## create galaxy output file ## -################################################################################ -awk ' -BEGIN {FS="\t";OFS="\t"} -NR == 1 { - $12 = "Num seqs at position" - $1 = "Chrom\tPosition\tStrand\tAllele" - print -} -NR != 1 { - $1 = "chr" $1 - gsub(/,/, "\t", $1) - print -} -' "$sift_outfile" | awk ' -BEGIN {FS="\t";OFS="\t"} -NR == 1 { - print "#" $0 -} -NR != 1 { - if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"} - '"${pos_adj}"' - print -} -' > "$output_file" - -################################################################################ -## cleanup ## -################################################################################ -rm -rf "$sift_outdir" "$sift_input" "$sift_output" - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/snpFreq.xml --- a/tools/human_genome_variation/snpFreq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ - - significant SNPs in case-control data - - - snpFreq2.pl $input $group1_1 $group1_2 $group1_3 $group2_1 $group2_2 $group2_3 0.05 $output - - - - - - - - - - - - - - - - - - R - - - - - - - - - - - - - - - - - -**Dataset formats** - -The input is tabular_, with six columns of allele counts. The output is also tabular, -and includes all of the input data plus the additional columns described below. -(`Dataset missing?`_) - -.. _tabular: ./static/formatHelp.html#tab -.. _Dataset missing?: ./static/formatHelp.html - ------ - -**What it does** - -This tool performs a basic analysis of bi-allelic SNPs in case-control -data, using the R statistical environment and Fisher's exact test to -identify SNPs with a significant difference in the allele frequencies -between the two groups. R's "qvalue" package is used to correct for -multiple testing. - -The input file includes counts for each allele combination (AA aa Aa) -for each group at each SNP position. The assignment of codes (1 2 3) -to these genotypes is arbitrary, as long as it is consistent for both -groups. Any other input columns are ignored in the computation, but -are copied to the output. The output appends eight additional columns, -namely the minimum expected counts of the three genotypes for each -group, the p-value, and the q-value. - ------ - -**Example** - -- input file:: - - chr1 210 211 38 4 15 56 0 1 x - chr1 228 229 55 0 2 56 0 1 x - chr1 230 231 46 0 11 55 0 2 x - chr1 234 235 43 0 14 55 0 2 x - chr1 236 237 55 0 2 13 10 34 x - chr1 437 438 55 0 2 46 0 11 x - chr1 439 440 56 0 1 55 0 2 x - chr1 449 450 56 0 1 13 20 24 x - chr1 518 519 56 0 1 38 4 15 x - -Here the group 1 genotype counts are in columns 4 - 6, while those -for group 2 are in columns 7 - 9. - -Note that the "x" column has no meaning. It was added to this example -to show that extra columns can be included, and to make it easier -to see where the new columns are appended in the output. 
- -- output file:: - - chr1 210 211 38 4 15 56 0 1 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - chr1 228 229 55 0 2 56 0 1 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 230 231 46 0 11 55 0 2 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 234 235 43 0 14 55 0 2 x 49 0 8 49 0 8 0.00210854461554067 0.000739840215979182 - chr1 236 237 55 0 2 13 10 34 x 34 5 18 34 5 18 6.14613878554783e-17 4.31307984950725e-17 - chr1 437 438 55 0 2 46 0 11 x 50.5 0 6.5 50.5 0 6.5 0.0155644201009862 0.00409590002657532 - chr1 439 440 56 0 1 55 0 2 x 55.5 0 1.5 55.5 0 1.5 1 0.210526315789474 - chr1 449 450 56 0 1 13 20 24 x 34.5 10 12.5 34.5 10 12.5 2.25757007974134e-18 2.37638955762246e-18 - chr1 518 519 56 0 1 38 4 15 x 47 2 8 47 2 8 1.50219088598917e-05 6.32501425679652e-06 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/human_genome_variation/snpFreq2.pl --- a/tools/human_genome_variation/snpFreq2.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; - -#expected input: path to file, cols of counts (2 sets of 3), threshold -if (!@ARGV or scalar @ARGV != 9) { - print "usage snpFreq.pl /path/to/snps.txt <6 column numbers(1 based) with counts for alleles, first one group then another> #threshold outfile\n"; - exit 1; -} - -#get and verify inputs -my $file = shift @ARGV; -my $a1 = shift @ARGV; -if ($a1 =~ /\D/ or $a1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n"; - exit 1; -} -my $a2 = shift @ARGV; -if ($a2 =~ /\D/ or $a2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n"; - exit 1; -} -my $a3 = shift @ARGV; -if ($a3 =~ /\D/ or $a3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n"; - exit 1; -} -my $b1 = shift @ARGV; -if ($b1 =~ /\D/ or $b1 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n"; - exit 1; -} -my $b2 = shift @ARGV; -if ($b2 =~ /\D/ or $b2 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n"; - exit 1; -} -my $b3 = shift @ARGV; -if ($b3 =~ /\D/ or $b3 < 1) { - print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n"; - exit 1; -} -my $thresh = shift @ARGV; -if ($thresh !~ /^\d*\.?\d+$/) { - print "Error the threshold must be a number. 
Got $thresh\n"; - exit 1; -}elsif ($thresh > .3) { - print "Error the threshold can not be greater than 0.3 got $thresh\n"; - exit 1; -} -my $outfile = shift @ARGV; - -#run a fishers exact test (using R) on whole table -my $cmd = qq|options(warn=-1) - tab <- read.table('$file', sep="\t") - size <- length(tab[,1]) - width <- length(tab[1,]) - x <- 1:size - y <- matrix(data=0, nr=size, nc=6) - for(i in 1:size) { - m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2) - t <- fisher.test(m) - x[i] <- t\$p.value - if (x[i] >= 1) { - x[i] <- .999 - } - n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3]) - y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n - y[i,1] <- round(y[i,1],3) - y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n - y[i,2] <- round(y[i,2],3) - y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n - y[i,3] <- round(y[i,3],3) - n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) - y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n - y[i,4] <- round(y[i,4],3) - y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n - y[i,5] <- round(y[i,5],3) - y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n - y[i,6] <- round(y[i,6],3) - }|; - #results <- data.frame(tab[1:size,1:width], x[1:size]) - #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - #q()|; - -my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue)) - qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE) - q <- qobj\$qvalues - results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -#for TESTING -my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size]) - write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") - q()|; - -open(FT, "| R --slave --vanilla") - or die "Couldn't call fisher.text, $!\n"; -print FT $cmd, "\n"; #fisher test -print FT $cmd2, "\n"; #qvalues and results -#print FT $pr, "\n"; -close FT or die "Couldn't finish fisher.test, $!\n"; - -exit; diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_branch_lengths_wrapper.py --- a/tools/hyphy/hyphy_branch_lengths_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#Dan Blankenberg -#takes commandline tree def and input multiple fasta alignment file and runs the branch length ananlysis -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename = os.path.abspath(sys.argv[2].strip()) -tree_contents = sys.argv[3].strip() -nuc_model = sys.argv[4].strip() -base_freq = sys.argv[5].strip() -model_options = sys.argv[6].strip() - -#Set up Temporary files for hyphy run -#set up tree file -tree_filename = hyphy_util.get_filled_temp_filename(tree_contents) - -#Guess if this is a single or multiple FASTA input file -found_blank = False -is_multiple = False -for line in open(input_filename): - line = line.strip() - if line == "": found_blank = True - elif line.startswith(">") and 
found_blank: - is_multiple = True - break - else: found_blank = False - -#set up BranchLengths file -BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengths) -if is_multiple: - os.unlink(BranchLengths_filename) - BranchLengths_filename = hyphy_util.get_filled_temp_filename(hyphy_util.BranchLengthsMF) - print "Multiple Alignment Analyses" -else: print "Single Alignment Analyses" - -#setup Config file -config_filename = hyphy_util.get_branch_lengths_config_filename(input_filename, nuc_model, model_options, base_freq, tree_filename, output_filename, BranchLengths_filename) - -#Run Hyphy -hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename) -hyphy = os.popen(hyphy_cmd, 'r') -#print hyphy.read() -hyphy.close() - -#remove temporary files -os.unlink(BranchLengths_filename) -os.unlink(tree_filename) -os.unlink(config_filename) diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_branch_lengths_wrapper.xml --- a/tools/hyphy/hyphy_branch_lengths_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - - - Estimation - - hyphy_branch_lengths_wrapper.py $input1 $out_file1 "$tree" "$model" "$base_freq" "Global" ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool takes a single or multiple FASTA alignment file and estimates branch lengths using HYPHY_, a maximum likelihood analyses package. - -For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like this:: - - >hg17.chr7(+):26907301-26907310|hg17_0 - GTGGGAGGT - >panTro1.chr6(+):28037319-28037328|panTro1_0 - GTGGGAGGT - >mm5.chr6(+):52104022-52104031|mm5_0 - GTGGGAGGT - >rn3.chr4(+):80734395-80734404|rn3_0 - GTGGGAGGT - >canFam1.chr14(+):42826409-42826418|canFam1_0 - GTGGGAGGT - - >hg17.chr7(+):26907310-26907326|hg17_1 - AGTCAGAGTGTCTGAG - >panTro1.chr6(+):28037328-28037344|panTro1_1 - AGTCAGAGTGTCTGAG - >mm5.chr6(+):52104031-52104047|mm5_1 - AGTCAGAGTGTCTGAG - >rn3.chr4(+):80734404-80734420|rn3_1 - AGTCAGAGTATCTGAG - >canFam1.chr14(+):42826418-42826434|canFam1_1 - AGTCAGAGTGTCTGAG - - >hg17.chr7(+):26907326-26907338|hg17_2 - GTAGAAGACCCC - >panTro1.chr6(+):28037344-28037356|panTro1_2 - GTAGAAGACCCC - >mm5.chr6(+):52104047-52104059|mm5_2 - GTAGACGATGCC - >rn3.chr4(+):80734420-80734432|rn3_2 - GTAGATGATGCG - >canFam1.chr14(+):42826434-42826446|canFam1_2 - GTAGAAGACCCC - - >hg17.chr7(+):26907338-26907654|hg17_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >panTro1.chr6(+):28037356-28037672|panTro1_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >mm5.chr6(+):52104059-52104375|mm5_3 - 
GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - >rn3.chr4(+):80734432-80734748|rn3_3 - GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC - >canFam1.chr14(+):42826446-42826762|canFam1_3 - GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - - - -.. _HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_dnds_wrapper.py --- a/tools/hyphy/hyphy_dnds_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -#Guru -#takes fasta alignments, a distance metric and builds neighbor joining trees -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename = os.path.abspath(sys.argv[2].strip()) -tree_contents = sys.argv[3].strip() -nuc_model = sys.argv[4].strip() -analysis = sys.argv[5].strip() - -if tree_contents == "": - print >> sys.stderr, "Please specify a valid tree definition." 
-    sys.exit(1)
-
-tree_filename = hyphy_util.get_filled_temp_filename(tree_contents)
-
-if analysis == "local":
-    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleLocalFitter)
-else:
-    fitter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.SimpleGlobalFitter)
-
-tabwriter_filename = hyphy_util.get_filled_temp_filename(hyphy_util.TabWriter)
-FastaReader_filename = hyphy_util.get_filled_temp_filename(hyphy_util.FastaReader)
-#setup Config file
-config_filename = hyphy_util.get_dnds_config_filename(fitter_filename, tabwriter_filename, "Universal", tree_filename, input_filename, nuc_model, output_filename, FastaReader_filename)
-
-#Run Hyphy
-hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename)
-hyphy = os.popen(hyphy_cmd, 'r')
-#print hyphy.read()
-hyphy.close()
-
-#remove temporary files
-os.unlink(fitter_filename)
-os.unlink(tabwriter_filename)
-os.unlink(tree_filename)
-os.unlink(FastaReader_filename)
-os.unlink(config_filename)
-
-if nuc_model == "000000":
-    model = "F81"
-elif nuc_model == "010010":
-    model = "HKY85"
-else:
-    model = "REV"
-
-print "Analysis: %s; Model: %s; Tree: %s" %(analysis, model, tree_contents)
diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_dnds_wrapper.xml
--- a/tools/hyphy/hyphy_dnds_wrapper.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-
-
-
-    Estimation
-
-    hyphy_dnds_wrapper.py $input1 $out_file1 "$tree" "$model" $analysis ${GALAXY_DATA_INDEX_DIR}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool takes a FASTA alignment file and estimates dN/dS ratio using HYPHY_, a maximum likelihood analysis package.
-
------
-
-.. class:: warningmark
-
-The tool returns an error message if no tree definition or an invalid tree definition is supplied.
-Any blocks not containing as many species as mentioned in the tree definition will be omitted from the output.
-
------
-
-For the tree definition, you only need to specify the species build names. For example, you could use the tree *((hg17,panTro1),(mm5,rn3),canFam1)*, if your FASTA file looks like the example below.
You may also use **Neighbor Joining Tree Builder** tool to obtain the tree definition:: - - >hg17.chr7(+):26907301-26907310|hg17_0 - GTGGGAGGT - >panTro1.chr6(+):28037319-28037328|panTro1_0 - GTGGGAGGT - >mm5.chr6(+):52104022-52104031|mm5_0 - GTGGGAGGT - >rn3.chr4(+):80734395-80734404|rn3_0 - GTGGGAGGT - >canFam1.chr14(+):42826409-42826418|canFam1_0 - GTGGGAGGT - - >hg17.chr7(+):26907310-26907326|hg17_1 - AGTCAGAGTGTCTGAG - >panTro1.chr6(+):28037328-28037344|panTro1_1 - AGTCAGAGTGTCTGAG - >mm5.chr6(+):52104031-52104047|mm5_1 - AGTCAGAGTGTCTGAG - >rn3.chr4(+):80734404-80734420|rn3_1 - AGTCAGAGTATCTGAG - >canFam1.chr14(+):42826418-42826434|canFam1_1 - AGTCAGAGTGTCTGAG - - >hg17.chr7(+):26907326-26907338|hg17_2 - GTAGAAGACCCC - >panTro1.chr6(+):28037344-28037356|panTro1_2 - GTAGAAGACCCC - >mm5.chr6(+):52104047-52104059|mm5_2 - GTAGACGATGCC - >rn3.chr4(+):80734420-80734432|rn3_2 - GTAGATGATGCG - >canFam1.chr14(+):42826434-42826446|canFam1_2 - GTAGAAGACCCC - - >hg17.chr7(+):26907338-26907654|hg17_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >panTro1.chr6(+):28037356-28037672|panTro1_3 - GGGGAAGGAACGCAGGGCGAAGAGCTGGACTTCTCTGAGGAT---TCCTCGGCCTTCTCGT-----CGTTTCCTGG----CGGGGTGGCCGGAGAGATGGGCAAGAGACCCTCCTTCTCACGTTTCTTTTGCTTCATTCGGCGGTTCTGGAACCAGATCTTCACTTGGGTCTCGTTGAGCTGCAGGGATGCAGCGATCTCCACCCTGCGGGCGCGCGTCAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGTTCCGTGAGCTGCTTGGTAGTGAAGTTGGTGCGCACCGCGTTGGGTTGACCCAGGTAGCCGTACTCTCCAACTTTCC - >mm5.chr6(+):52104059-52104375|mm5_3 - GGAGAAGGGGCACTGGGCGAGGGGCTAGATTTCTCAGATGAT---TCTTCCGTTTTCTCAT-----CGCTGCCAGG----AGGAGTGGCAGGGGAGATGGGCAGGAGCCCCTCCTTCTCACGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGCTGTAGGGACGCGGCAATCTCCACCCTGCGCGCTCGTGTAAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAATTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - >rn3.chr4(+):80734432-80734748|rn3_3 - GGAGAAGGGGCGCTGGGCGAGGAGCTGGATTTCTCAGATGAT---TCTTCAGTTTTCTCAT-----CGCTTCCAGG----AGGGGTGGCGGGTGAAATGGGCAAGAGCCCCTCTTTCTCGCGCTTCTTCTGCTTCATGCGGCGATTCTGGAACCAGATCTTCACCTGGGTCTCATTGAGTTGCAGGGACGCGGCTATCTCCACCCTGCGGGCTCTTGTTAGGTACTTGTTGAAGTGGAACTCCTTCTCCAGCTCTGTGAGCTGCTTGGTGGTGAAGTTGGTGCGCACTGCGTTGGGTTGACCCACGTAGCCATACTCTCCAACTTTCC - >canFam1.chr14(+):42826446-42826762|canFam1_3 - GGAGACGGAATGCAGGGCGAGGAGCTGGATTTCTCTGAAGAT---TCCTCCGCCTTCTCCT-----CACTTCCTGG----CGGGGTGGCAGGGGAGATGGGCAAAAGGCCCTCTTTCTCTCGTTTCTTCTGCTTCATCCGGCGGTTCTGGAACCAGATCTTCACCTGGGTCTCGTTGAGCTGCAGGGATGCTGCGATCTCCACCCTGCGGGCGCGGGTCAGATACTTATTGAAGTGGAACTCCTTTTCCAGCTCGGTGAGCTGCTTGGTGGTGAAGTTGGTACGCACTGCATTCGGTTGACCCACGTAGCCGTACTCTCCAACTTTCC - - - -.. 
_HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_nj_tree_wrapper.py --- a/tools/hyphy/hyphy_nj_tree_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#Dan Blankenberg -#takes fasta alignments, a distance metric and builds neighbor joining trees -import os, sys -from galaxy import eggs -from galaxy.tools.util import hyphy_util - -#Retrieve hyphy path, this will need to be the same across the cluster -tool_data = sys.argv.pop() -HYPHY_PATH = os.path.join( tool_data, "HYPHY" ) -HYPHY_EXECUTABLE = os.path.join( HYPHY_PATH, "HYPHY" ) - -#Read command line arguments -input_filename = os.path.abspath(sys.argv[1].strip()) -output_filename1 = os.path.abspath(sys.argv[2].strip()) -output_filename2 = os.path.abspath(sys.argv[3].strip()) -distance_metric = sys.argv[4].strip() -temp_ps_filename = hyphy_util.get_filled_temp_filename("") - -#Guess if this is a single or multiple FASTA input file -found_blank = False -is_multiple = False -for line in open(input_filename): - line = line.strip() - if line == "": found_blank = True - elif line.startswith(">") and found_blank: - is_multiple = True - break - else: found_blank = False - -NJ_tree_shared_ibf = hyphy_util.get_filled_temp_filename(hyphy_util.NJ_tree_shared_ibf) - -#set up NJ_tree file -NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_tree(NJ_tree_shared_ibf)) -#setup Config file -config_filename = hyphy_util.get_nj_tree_config_filename(input_filename, distance_metric, output_filename1, temp_ps_filename, NJ_tree_filename) -if is_multiple: - os.unlink(NJ_tree_filename) - os.unlink(config_filename) - NJ_tree_filename = hyphy_util.get_filled_temp_filename(hyphy_util.get_NJ_treeMF(NJ_tree_shared_ibf)) - config_filename = hyphy_util.get_nj_treeMF_config_filename(input_filename, output_filename1, temp_ps_filename, distance_metric, NJ_tree_filename) - print "Multiple Alignment Analyses" -else: print "Single Alignment Analyses" - - -#Run Hyphy -hyphy_cmd = "%s BASEPATH=%s USEPATH=/dev/null %s" % (HYPHY_EXECUTABLE, HYPHY_PATH, config_filename) -hyphy = os.popen(hyphy_cmd, 'r') -#print hyphy.read() -hyphy.close() - -#remove temporary files -os.unlink(NJ_tree_filename) -os.unlink(config_filename) - - -#Convert PS to PDF -if os.path.getsize(temp_ps_filename)>0: temp = os.popen("ps2pdf %s %s" % (temp_ps_filename, output_filename2), 'r').close() -os.unlink(temp_ps_filename) diff -r c2a356708570 -r 33c067c3ae34 tools/hyphy/hyphy_nj_tree_wrapper.xml --- a/tools/hyphy/hyphy_nj_tree_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ - - - - Builder - - hyphy_nj_tree_wrapper.py $input1 $out_file1 $out_file2 $distance_metric ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - ps2pdf - - - - - - - - - - -This tool takes a single or multiple FASTA alignment file and builds Neighbor Joining Trees using HYPHY_, a maximum likelihood analyses package. - -.. _HYPHY: http://www.hyphy.org - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/abyss.xml --- a/tools/ilmn_pacbio/abyss.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ - - Short-read de Bruijn assembly - - quake_wrapper.py -k $k -r $input1 -p 8 > $output1 - - - - - - - - - - -**What it does** - -TBD. 
Calls ABySS assembler - -**Parameter list** - -k - -**Output** - -Corrected reads - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/assembly_stats.py --- a/tools/ilmn_pacbio/assembly_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -#Copyright (c) 2011, Pacific Biosciences of California, Inc. -# -#All rights reserved. -# -#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -# -#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY -#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# -import sys, os -from optparse import OptionParser -from galaxy import eggs -import pkg_resources -pkg_resources.require( 'bx-python' ) -from bx.seq.fasta import FastaReader - -def getStats( fastaFile, genomeLength, minContigLength ): - lengths = [] - stats = { "Num" : 0, - "Sum" : 0, - "Max" : 0, - "Avg" : 0, - "N50" : 0, - "99%" : 0 } - fasta_reader = FastaReader( open( fastaFile, 'rb' ) ) - while True: - seq = fasta_reader.next() - if not seq: - break - if seq.length < minContigLength: - continue - lengths.append( seq.length ) - if lengths: - stats[ 'Num' ] = len( lengths ) - stats[ 'Sum' ] = sum( lengths ) - stats[ 'Max' ] = max( lengths ) - stats[ 'Avg' ] = int( sum( lengths ) / float( len( lengths ) ) ) - stats[ 'N50' ] = 0 - stats[ '99%' ] = 0 - if genomeLength == 0: - genomeLength = sum( lengths ) - lengths.sort() - lengths.reverse() - lenSum = 0 - stats[ "99%" ] = len( lengths ) - for idx, length in enumerate( lengths ): - lenSum += length - if ( lenSum > genomeLength / 2 ): - stats[ "N50" ] = length - break - lenSum = 0 - for idx, length in enumerate( lengths ): - lenSum += length - if lenSum > genomeLength * 0.99: - stats[ "99%" ] = idx + 1 - break - return stats - -def __main__(): - #Parse Command Line - usage = 'Usage: %prog input output --minContigLength' - parser = OptionParser( usage=usage ) - parser.add_option( "--minContigLength", dest="minContigLength", help="Minimum length of contigs to analyze" ) - parser.add_option( "--genomeLength", dest="genomeLength", help="Length of genome for which to calculate N50s" ) - parser.set_defaults( minContigLength=0, genomeLength=0 ) - options, args = parser.parse_args() - input_fasta_file = args[ 0 ] - output_tabular_file = args[ 1 ] - statKeys = "Num Sum Max Avg N50 99%".split( " " ) - stats = getStats( input_fasta_file, int( options.genomeLength ), int( options.minContigLength ) ) - fout = open( output_tabular_file, "w" ) - fout.write( "%s\n" % "\t".join( map( lambda key: str( stats[ key ] ), statKeys ) ) ) - fout.close() - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/assembly_stats.xml --- a/tools/ilmn_pacbio/assembly_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ - - Calculate common measures of assembly quality - - assembly_stats.py $input1 $output1 --minContigLength=${minLength} - - - - - - - - - - - - - - - - - -**What it does** - -Reports standard measures of *de novo* assembly quality such as number of contigs, sum of contigs, mean contig length, and N50. - -**Parameter list** - -Minimum length - Only include contigs of this size or greater for calculating statistics. - -**Output** - -Num contigs - Total number of contigs in the assembly - -Sum of contig lengths - Total sum of contig lengths - -Maximum contig length - Maximum of the contig lengths - -Mean contig length - Average contig length - -N50 - Contig length at which 50% of the assembly is contained in contigs of this size or greater. - -99% - Number of contigs accounting for 99% of the observed assembly. 
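-
-To make the N50 and 99% definitions above concrete, here is a small worked
-example. It is not part of the tool; it uses the common convention that N50
-is reached when the running sum first covers at least half the total, whereas
-the code above uses a strict greater-than at the boundary::
-
-    # Toy assembly: contigs of 400, 300, 200 and 100 bases (total 1000).
-    lengths = sorted([400, 300, 200, 100], reverse=True)
-    total = sum(lengths)
-
-    running = 0
-    for n50 in lengths:           # N50: walk down the sorted lengths
-        running += n50            # until half the assembly is covered
-        if 2 * running >= total:
-            break                 # 400 + 300 = 700 >= 500, so N50 = 300
-
-    running = 0
-    for rank, length in enumerate(lengths):
-        running += length         # 99%: contigs needed to reach 990 bases
-        if running >= 0.99 * total:
-            break
-    n99 = rank + 1                # here all 4 contigs are needed
-
-    print "N50 = %d, 99%% = %d" % (n50, n99)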
- - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/cov_model.py --- a/tools/ilmn_pacbio/cov_model.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,238 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser, SUPPRESS_HELP -import os, random, quake - -############################################################ -# cov_model.py -# -# Given a file of kmer counts, reports the cutoff to use -# to separate trusted/untrusted kmers. -############################################################ - -############################################################ -# main -############################################################ -def main(): - usage = 'usage: %prog [options] ' - parser = OptionParser(usage) - parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') - parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]') - parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]') - # help='Model kmer coverage as a function of GC content of kmers [default: %default]' - parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) - (options, args) = parser.parse_args() - - if len(args) != 1: - parser.error('Must provide kmers counts file') - else: - ctsf = args[0] - - if options.count_kmers: - model_cutoff(ctsf, options.ratio) - print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() - - else: - if options.model_gc: - model_q_gc_cutoffs(ctsf, 25000, options.ratio) - else: - model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample) - print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() - - -############################################################ -# model_cutoff -# -# Make a histogram of kmers to give to R to learn the cutoff -############################################################ -def model_cutoff(ctsf, ratio): - # make kmer histogram - cov_max = 0 - for line in open(ctsf): - cov = int(line.split()[1]) - if cov > cov_max: - cov_max = cov - - kmer_hist = [0]*cov_max - for line in open(ctsf): - cov = int(line.split()[1]) - kmer_hist[cov-1] += 1 - - cov_out = open('kmers.hist', 'w') - for cov in range(0,cov_max): - if kmer_hist[cov]: - print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov]) - cov_out.close() - - os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir)) - - -############################################################ -# model_q_cutoff -# -# Sample kmers to give to R to learn the cutoff -# 'div100' is necessary when the number of kmers is too -# large for random.sample, so we only consider every 100th -# kmer. 
-############################################################ -def model_q_cutoff(ctsf, sample, ratio, no_sample=False): - if not no_sample: - # count number of kmer coverages - num_covs = 0 - for line in open(ctsf): - num_covs += 1 - - # choose random kmer coverages - div100 = False - if sample >= num_covs: - rand_covs = range(num_covs) - else: - if num_covs > 1000000000: - div100 = True - rand_covs = random.sample(xrange(num_covs/100), sample) - else: - rand_covs = random.sample(xrange(num_covs), sample) - rand_covs.sort() - - # print to file - out = open('kmers.txt', 'w') - kmer_i = 0 - rand_i = 0 - for line in open(ctsf): - if div100: - if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]: - print >> out, line.split()[1] - rand_i += 1 - if rand_i >= sample: - break - else: - if kmer_i == rand_covs[rand_i]: - print >> out, line.split()[1] - rand_i += 1 - if rand_i >= sample: - break - kmer_i += 1 - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir)) - - -############################################################ -# model_q_gc_cutoffs -# -# Sample kmers to give to R to learn the cutoff for each -# GC value -############################################################ -def model_q_gc_cutoffs(ctsf, sample, ratio): - # count number of kmer coverages at each at - k = len(open(ctsf).readline().split()[0]) - num_covs_at = [0]*(k+1) - for line in open(ctsf): - kmer = line.split()[0] - num_covs_at[count_at(kmer)] += 1 - - # for each AT bin - at_cutoffs = [] - for at in range(1,k): - # sample covs - if sample >= num_covs_at[at]: - rand_covs = range(num_covs_at[at]) - else: - rand_covs = random.sample(xrange(num_covs_at[at]), sample) - rand_covs.sort() - - # print to file - out = open('kmers.txt', 'w') - kmer_i = 0 - rand_i = 0 - for line in open(ctsf): - (kmer,cov) = line.split() - if count_at(kmer) == at: - if kmer_i == rand_covs[rand_i]: - print >> out, cov - rand_i += 1 - if rand_i >= sample: - break - kmer_i += 1 - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) - - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - if at in [1,k-1]: # setting extremes to next closests - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - - os.system('mv kmers.txt kmers.at%d.txt' % at) - os.system('mv cutoff.txt cutoff.at%d.txt' % at) - - out = open('cutoffs.gc.txt','w') - print >> out, '\n'.join(at_cutoffs) - out.close() - - -############################################################ -# model_q_gc_cutoffs_bigmem -# -# Sample kmers to give to R to learn the cutoff for each -# GC value -############################################################ -def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio): - # input coverages - k = 0 - for line in open(ctsf): - (kmer,cov) = line.split() - if k == 0: - k = len(kmer) - at_covs = ['']*(k+1) - else: - at = count_at(kmer) - if at_covs[at]: - at_covs[at].append(cov) - else: - at_covs[at] = [cov] - - for at in range(1,k): - print '%d %d' % (at,len(at_covs[at])) - - # for each AT bin - at_cutoffs = [] - for at in range(1,k): - # sample covs - if sample >= len(at_covs[at]): - rand_covs = at_covs[at] - else: - rand_covs = random.sample(at_covs[at], sample) - - # print to file - out = open('kmers.txt', 'w') - for rc in rand_covs: - print >> out, rc - out.close() - - os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) - - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - if at in 
[1,k-1]: # setting extremes to next closests - at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) - - os.system('mv kmers.txt kmers.at%d.txt' % at) - os.system('mv cutoff.txt cutoff.at%d.txt' % at) - - out = open('cutoffs.gc.txt','w') - print >> out, '\n'.join(at_cutoffs) - out.close() - - -############################################################ -# count_at -# -# Count A's and T's in the given sequence -############################################################ -def count_at(seq): - return len([nt for nt in seq if nt in ['A','T']]) - - -############################################################ -# __main__ -############################################################ -if __name__ == '__main__': - main() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake.py --- a/tools/ilmn_pacbio/quake.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -#!/usr/bin/env python -from optparse import OptionParser, SUPPRESS_HELP -import os, random, sys -import cov_model - -############################################################ -# quake.py -# -# Launch pipeline to correct errors in Illumina sequencing -# reads. -############################################################ - -#r_dir = '/nfshomes/dakelley/research/error_correction/bin' -quake_dir = os.path.abspath(os.path.dirname(sys.argv[0])) - -############################################################ -# main -############################################################ -def main(): - usage = 'usage: %prog [options]' - parser = OptionParser(usage) - parser.add_option('-r', dest='readsf', help='Fastq file of reads') - parser.add_option('-f', dest='reads_listf', help='File containing fastq file names, one per line or two per line for paired end reads.') - parser.add_option('-k', dest='k', type='int', help='Size of k-mers to correct') - parser.add_option('-p', dest='proc', type='int', default=4, help='Number of processes [default: %default]') - parser.add_option('-q', dest='quality_scale', type='int', default=-1, help='Quality value ascii scale, generally 64 or 33. If not specified, it will guess.') - parser.add_option('--no_count', dest='no_count', action='store_true', default=False, help='Kmers are already counted and in expected file [reads file].qcts or [reads file].cts [default: %default]') - parser.add_option('--no_cut', dest='no_cut', action='store_true', default=False, help='Coverage model is optimized and cutoff was printed to expected file cutoff.txt [default: %default]') - parser.add_option('--int', dest='counted_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') - parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff. Generally set between 10-1000 with lower numbers suggesting a lower threshold. [default: %default]') - # help='Model kmer coverage as a function of GC content of kmers [default: %default]' - parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) - parser.add_option('--headers', action='store_true', default=False, help='Output original read headers (i.e. 
pass --headers to correct)' ) - (options, args) = parser.parse_args() - - if not options.readsf and not options.reads_listf: - parser.error('Must provide fastq file of reads with -r or file with list of fastq files of reads with -f') - if not options.k: - parser.error('Must provide k-mer size with -k') - if options.quality_scale == -1: - options.quality_scale = guess_quality_scale(options.readsf, options.reads_listf) - - if options.counted_kmers: - cts_suf = 'cts' - else: - cts_suf = 'qcts' - if options.readsf: - ctsf = '%s.%s' % (os.path.splitext( os.path.split(options.readsf)[1] )[0], cts_suf) - reads_str = '-r %s' % options.readsf - else: - ctsf = '%s.%s' % (os.path.split(options.reads_listf)[1], cts_suf) - reads_str = '-f %s' % options.reads_listf - - if not options.no_count and not options.no_cut: - count_kmers(options.readsf, options.reads_listf, options.k, ctsf, options.quality_scale) - - if not options.no_cut: - # model coverage - if options.counted_kmers: - cov_model.model_cutoff(ctsf, options.ratio) - else: - if options.model_gc: - cov_model.model_q_gc_cutoffs(ctsf, 10000, options.ratio) - else: - cov_model.model_q_cutoff(ctsf, 25000, options.ratio) - - - if options.model_gc: - # run correct C++ code - os.system('%s/correct %s -k %d -m %s -a cutoffs.gc.txt -p %d -q %d' % (quake_dir,reads_str, options.k, ctsf, options.proc, options.quality_scale)) - - else: - cutoff = open('cutoff.txt').readline().rstrip() - - # run correct C++ code - headers = '--headers' if options.headers else '' - os.system('%s/correct %s %s -k %d -m %s -c %s -p %d -q %d' % (quake_dir,headers, reads_str, options.k, ctsf, cutoff, options.proc, options.quality_scale)) - - -################################################################################ -# guess_quality_scale -# Guess at ascii scale of quality values by examining -# a bunch of reads and looking for quality values < 64, -# in which case we set it to 33. 
-################################################################################ -def guess_quality_scale(readsf, reads_listf): - reads_to_check = 1000 - if not readsf: - readsf = open(reads_listf).readline().split()[0] - - fqf = open(readsf) - reads_checked = 0 - header = fqf.readline() - while header and reads_checked < reads_to_check: - seq = fqf.readline() - mid = fqf.readline() - qual = fqf.readline().rstrip() - reads_checked += 1 - for q in qual: - if ord(q) < 64: - print 'Guessing quality values are on ascii 33 scale' - return 33 - header = fqf.readline() - - print 'Guessing quality values are on ascii 64 scale' - return 64 - - - -############################################################ -# count_kmers -# -# Count kmers in the reads file using AMOS count-kmers or -# count-qmers -############################################################ -def count_kmers(readsf, reads_listf, k, ctsf, quality_scale): - # find files - fq_files = [] - if readsf: - fq_files.append(readsf) - else: - for line in open(reads_listf): - for fqf in line.split(): - fq_files.append(fqf) - - if ctsf[-4:] == 'qcts': - os.system('cat %s | %s/count-qmers -k %d -q %d > %s' % (' '.join(fq_files), quake_dir, k, quality_scale, ctsf)) - else: - os.system('cat %s | %s/count-kmers -k %d > %s' % (' '.join(fq_files), quake_dir, k, ctsf)) - - -############################################################ -# __main__ -############################################################ -if __name__ == '__main__': - main() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake.xml --- a/tools/ilmn_pacbio/quake.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ - - Quality-aware error correction - - quake_wrapper.py --default_cutoff=10 --headers -k $k -f $fofnfile -p 12 > $output1 - - - - - - - -${input1.file_name} - - - - - - - -**What it does** - -Applies the Quake_ algorithm for quality-aware correction of -substitution error in short reads. - -Kelley DR, Schatz MC, Salzberg SL. -"Quake: quality-aware detection and correction of sequencing errors." -*Genome Biol.* 2010;11(11):R116. - -.. _Quake: http://www.cbcb.umd.edu/software/quake - -**Parameter list** - -k - k-mer size for detecting spurious k-mers versus true k-mers from - the genome. Recommendations for choosing a value of k can be found - here_. - -.. _here: http://www.cbcb.umd.edu/software/quake/faq.html - -**Output** - -A FASTQ file of corrected and trimmed reads. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake_pe.xml --- a/tools/ilmn_pacbio/quake_pe.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - Quality-aware error correction for paired-end reads - - quake_wrapper.py --default_cutoff=$cutoff --headers -k $k -f $fofnfile -p 12 --output=$output1,$output2 - - - - - - - - - ${input1.file_name} ${input2.file_name} - - - - - - - - -**What it does** - -Applies the Quake_ algorithm for quality-aware correction of -substitution error in short reads. This form of the tool is customized -for correcting paired-end reads. - -Kelley DR, Schatz MC, Salzberg SL. -"Quake: quality-aware detection and correction of sequencing errors." -*Genome Biol.* 2010;11(11):R116. - -.. _Quake: http://www.cbcb.umd.edu/software/quake - -**Parameter list** - -K-mer size - k-mer size for detecting spurious k-mers versus true k-mers from - the genome. Recommendations for choosing a value of k can be found - here_. 
- -Default coverage cutoff - If the appropriate coverage cutoff cannot be found, then Quake can be - forced to proceed anyway with the supplied cutoff. In this case, - the optimal cutoff can be estimated by examining - the k-mer coverage histogram by eye. - -.. _here: http://www.cbcb.umd.edu/software/quake/faq.html - -**Output** - -A FASTQ file of corrected and trimmed reads. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/quake_wrapper.py --- a/tools/ilmn_pacbio/quake_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -#!/usr/bin/python -# -# Copyright (c) 2011, Pacific Biosciences of California, Inc. -# -# All rights reserved. -# -#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -# * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -# * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. -# -#THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY -#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
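The wrapper below implements the "Default coverage cutoff" behavior described above: when Quake fails while determining a cutoff, the supplied value is written to cutoff.txt and Quake is re-run with counting and cutoff optimization disabled. A minimal sketch of that retry pattern in modern Python (the function name and arguments are illustrative, not the wrapper's actual interface)::

    import subprocess

    def run_with_default_cutoff(quake_args, default_cutoff):
        # First attempt: let Quake determine the coverage cutoff itself.
        result = subprocess.run(['quake.py'] + quake_args,
                                capture_output=True, text=True)
        if result.returncode == 0:
            return result
        # If cutoff determination failed, force the supplied cutoff and
        # re-run, skipping k-mer counting and cutoff optimization.
        if 'cutoff.txt' in result.stdout + result.stderr and default_cutoff > 0:
            with open('cutoff.txt', 'w') as out:
                out.write('%d\n' % default_cutoff)
            return subprocess.run(['quake.py', '--no_count', '--no_cut'] + quake_args,
                                  capture_output=True, text=True)
        return result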
-# -import sys -import os -import subprocess - -QUAKE_EXE = os.path.join( os.path.dirname(os.path.abspath(sys.argv[0])), 'quake.py' ) -cmdLine = sys.argv -cmdLine.pop(0) - -# -# horribly not robust, but it was a pain to rewrite everything with -# optparse -# -j = -1 -cut = 0 -for i,arg in enumerate(cmdLine): - if '--default_cutoff' in arg: - j = i - cut = int(arg.split('=')[1]) -if j>=0: - cmdLine = cmdLine[:j] + cmdLine[j+1:] - -j = -1 -output='' -for i,arg in enumerate(cmdLine): - if '--output' in arg: - j = i - output = arg.split('=')[1] -if j>=0: - cmdLine = cmdLine[:j] + cmdLine[j+1:] - -def backticks( cmd, merge_stderr=True ): - """ - Simulates the perl backticks (``) command with error-handling support - Returns ( command output as sequence of strings, error code, error message ) - """ - if merge_stderr: - _stderr = subprocess.STDOUT - else: - _stderr = subprocess.PIPE - - p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=_stderr, - close_fds=True ) - - out = [ l[:-1] for l in p.stdout.readlines() ] - - p.stdout.close() - if not merge_stderr: - p.stderr.close() - - # need to allow process to terminate - p.wait() - - errCode = p.returncode and p.returncode or 0 - if p.returncode>0: - errorMessage = os.linesep.join(out) - output = [] - else: - errorMessage = '' - output = out - - return output, errCode, errorMessage - -def to_stdout(): - def toCorFastq(f): - stem, ext = os.path.splitext( os.path.basename(f) ) - dir = os.path.dirname(f) - corFastq = os.path.join(dir,'%s.cor%s' % (stem,ext) ) - if not os.path.exists(corFastq): - print >>sys.stderr, "Can't find path %s" % corFastq - sys.exit(1) - return corFastq - if '-r' in cmdLine: - fastqFile = cmdLine[ cmdLine.index('-r')+1 ] - corFastq = toCorFastq(fastqFile) - infile = open( corFastq, 'r' ) - for line in infile: - sys.stdout.write( line ) - infile.close() - else: - fofnFile = cmdLine[ cmdLine.index('-f')+1 ] - infile = open(fofnFile,'r') - for line in infile: - line = line.strip() - if len(line)>0: - fastqFiles = line.split() - break - infile.close() - outs = output.split(',') - for o,f in zip(outs,fastqFiles): - cf = toCorFastq(f) - os.system( 'cp %s %s' % ( cf, o ) ) - -def run(): - cmd = '%s %s' % ( QUAKE_EXE, " ".join(cmdLine) ) - output, errCode, errMsg = backticks( cmd ) - - if errCode==0: - to_stdout() - else: - # if Quake exits with an error in cutoff determination we - # can force correction if requested - if 'cutoff.txt' in errMsg and cut>0: - outfile = open( 'cutoff.txt', 'w' ) - print >>outfile, str(cut) - outfile.close() - cmd = '%s --no_count --no_cut %s' % ( QUAKE_EXE, " ".join(cmdLine) ) - output, errCode, errMsg = backticks( cmd ) - if errCode==0: - to_stdout() - else: - print >>sys.stderr, errMsg - sys.exit(1) - -if __name__=='__main__': run() diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe.py --- a/tools/ilmn_pacbio/smrtpipe.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#!/usr/bin/env python -# EASY-INSTALL-SCRIPT: 'pbpy==0.1','smrtpipe.py' -__requires__ = 'pbpy==0.1' -import pkg_resources -pkg_resources.run_script('pbpy==0.1', 'smrtpipe.py') diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_filter.xml --- a/tools/ilmn_pacbio/smrtpipe_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,67 +0,0 @@ - - Produce filtered reads from a set of PacBio primary analysis outputs. 
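The backticks() helper in the wrapper above shells out, optionally merges stderr into stdout, and returns (output lines, error code, error message). A rough modern equivalent of that contract, shown only as a sketch rather than the shipped code::

    import subprocess

    def backticks(cmd, merge_stderr=True):
        # Returns (output lines, error code, error message), like the
        # helper above, using subprocess.run instead of Popen.
        stderr = subprocess.STDOUT if merge_stderr else subprocess.PIPE
        p = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                           stderr=stderr, text=True)
        lines = p.stdout.splitlines()
        if p.returncode > 0:
            return [], p.returncode, '\n'.join(lines)
        return lines, 0, ''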
- - smrtpipe_galaxy.py --output=data/filtered_subreads.fasta --galaxy_output=${outfile} ${iniFile} - - - - - - - - - - - - - - - - - - - - - -[input] -#if $source.input_source=="history": -#for $l in open($source.input1.file_name,'r'): -$l -#end for -#else -#for $p in $source.inputFiles -${p.path} -#end for -#end if - -[S_Filter] -filters=MinRL=${minimum_readlength},MinReadScore=${minimum_readscore} - - - - - - - -**What it does** - -Filters PacBio bas.h5 files and produces a FASTA file of filtered subreads. - -In PacBio SMRT sequencing, the template format is a SMRTbell: a circular -molecule with adapters at two locations in the circle. The subreads are the -portions of the read between adapters. - -**Parameter list** - -Minimum readlength - Only keep reads from ZMWs that produced this many bases or more. - -Minimum read quality - Only keep reads with overall quality scores of this value or more. The read quality score is a *de novo* prediction of the accuracy of the read. - -**Output** - -FASTA file of filtered reads. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_galaxy.py --- a/tools/ilmn_pacbio/smrtpipe_galaxy.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,265 +0,0 @@ -#!/usr/bin/python -import sys -import os -import subprocess -import optparse as op -import xml.etree.cElementTree as et - -TRACE=False -# -# Turn on tracing to dump out __input__.xml and __settings__.xml somewhere -# -#TRACE=True -#TRACE_PATH='/home/UNIXHOME/jsorenson' - -class SmrtpipeGalaxy: - """Wrapper for running smrtpipe under galaxy""" - def __init__( self, argv ): - self.__parseOptions( argv ) - - def __parseOptions( self, argv ): - usage = 'Usage: %prog [--help] [options] smrtpipe.ini' - parser = op.OptionParser( usage=usage, description=SmrtpipeGalaxy.__doc__ ) - parser.add_option( "--output", - help="Designate a file generated by smrtpipe as the expected output for galaxy" ) - parser.add_option( "--nproc", type="int", - help="Number of processes to use (-D NPROC)" ) - parser.add_option( "--galaxy_output", - help="File name provided by galaxy where output should be placed" ) - parser.add_option( "--dry_run", action="store_true", - help="Create auxiliary XML files and exit" ) - parser.add_option( "--dat_extension", - help="Soft link .dat files to have this extension (some pipelines require certain extensions)" ) - - parser.set_defaults( output=None, dry_run=False, galaxy_output=None, - dat_extension=None, nproc=0 ) - self.options, self.args = parser.parse_args( argv ) - - if len(self.args)!=2: - parser.error( 'Expected 1 argument' ) - - self.configFile = self.args[1] - - def __parseConfig( self ): - infile = open( self.configFile, 'r' ) - section = None - sections = [] - for line in infile: - l = line.strip() - if len(l)==0 or line.startswith('#'): - continue - if l.startswith('[') and l.endswith(']'): - section = section_factory( l[1:-1] ) - sections.append(section) - continue - if section is None: - continue - if '=' in l: - section.addParameterLine(l) - else: - section.addLine(l) - infile.close() - return sections - - def transferOutput( self ): - if not self.options.output or not self.options.galaxy_output: - return True, '' - if not os.path.exists(self.options.output): - return False, "Can't find file %s (job error?)" % self.options.output - os.system( 'cp %s %s' % (self.options.output, self.options.galaxy_output )) - return True, '' - - def run( self ): - if not os.path.exists( self.configFile ): - print >>sys.stderr, "Can't find config file %s" 
% self.configFile - return 1 - - sections = self.__parseConfig() - - if len(sections)==0: - print >>sys.stderr, "No sections found in %s" % self.configFile - return 1 - if sections[0].name != 'input': - print >>sys.stderr, "No [input] section found in %s" % self.configFile - return 1 - - INPUT_FILE = '__input__.xml' - SETTINGS_FILE = '__settings__.xml' - - sections[0].softLinkDats( self.options.dat_extension ) - inputXml = sections[0].makeXmlElement() - write_xml_to_file( INPUT_FILE, inputXml ) - if TRACE: - write_xml_to_file( os.path.join(TRACE_PATH,INPUT_FILE), inputXml ) - - settings = et.Element( 'smrtpipeSettings' ) - for s in sections[1:]: - s.makeXmlElement( settings ) - - write_xml_to_file( SETTINGS_FILE, settings ) - if TRACE: - write_xml_to_file( os.path.join(TRACE_PATH,SETTINGS_FILE), settings ) - - nproc = '-D NPROC=%d' % self.options.nproc if self.options.nproc>0 else '' - cmd = 'smrtpipe.py %s --params=%s xml:%s > smrtpipe.err 2>&1' % \ - ( nproc, SETTINGS_FILE, INPUT_FILE ) - - if self.options.dry_run: - print 'Command to run:' - print cmd - return 0 - - out, errCode, errMsg = backticks( cmd ) - if errCode!=0: - print >>sys.stderr, "error while running: %s" % cmd - print >>sys.stderr, errMsg - if os.path.exists('log/smrtpipe.log'): - print >>sys.stderr, 'Log:' - infile = open('log/smrtpipe.log','r') - for line in infile: sys.stderr.write(line) - infile.close() - return errCode - - success, errMsg = self.transferOutput() - if not success: - print >>sys.stderr, errMsg - return 1 - - return 0 - -def write_xml_to_file( fileName, root ): - outfile = open( fileName, 'w' ) - outfile.write( '\n' ) - outfile.write( et.tostring(root) + '\n' ) - outfile.close() - -def section_factory( name ): - if name=='input': - return InputSection(name) - else: - return Section(name) - -class Section: - def __init__( self, name ): - self._name = name - self._lines = [] - self._vars = {} - - @property - def name(self): - return self._name - - def addLine( self, line ): - self._lines.append(line) - - def addParameterLine( self, line ): - self.addLine(line) - i = line.find( '=' ) - key = line[:i].strip() - value = line[i+1:].strip() - self._vars[key] = value - - def makeXmlElement( self, settings ): - if self._name=='global': - root = et.SubElement( settings, "protocol", {'name':'generic'} ) - else: - root = et.SubElement( settings, "module", {'name':self._name} ) - for k,v in self._vars.iteritems(): - param = et.SubElement( root, 'param', {'name':k} ) - val = et.SubElement( param, 'value' ) - val.text = v - return None - - def __str__( self ): - "for debugging" - buffer = [ 'S { name=' ] - buffer.append(self._name) - buffer.append('; lines=%s' % ','.join(self._lines) ) - for k,v in self._vars.iteritems(): - buffer.append('; %s=%s' % (k,v) ) - buffer.append(' }') - return ''.join(buffer) - -class InputSection( Section ): - def __init__( self, name ): - Section.__init__(self,name) - - def softLinkDats( self, newExtension ): - if not newExtension: - return - newLines = [] - for l in self._lines: - if ':' in l: - protocol = l[:l.find(':')+1] - file = l[l.find(':')+1:] - else: - protocol = '' - file = l - if os.path.exists(file) and file.endswith('.dat'): - newFile = '%s.%s' % ( file, newExtension ) - if not os.path.exists(newFile): - os.system( 'ln -s %s %s' % ( file, newFile ) ) - newLines.append(protocol+newFile) - else: - newLines.append(l) - self._lines = newLines - - def makeXmlElement( self, parent=None ): - root = et.Element( "pacbioAnalysisInputs" ) - data = et.SubElement( root, 'dataReferences' )
- iRef = 0 - for l in self._lines: - def add(x,iRef): - if len(x)==0: return iRef - node = et.SubElement( data, 'url' ) - if ':' in x: - node.attrib[ 'ref' ] = x - else: - node.attrib[ 'ref' ] = 'run:0000000-%04d' % iRef - node2 = et.SubElement( node, 'location' ) - node2.text = x - return iRef+1 - if l.endswith('fofn') and os.path.exists(l): - infile = open(l,'r') - for j,line in enumerate(infile): iRef=add(line.strip(),iRef) - infile.close() - else: - iRef=add(l,iRef) - return root - -def backticks( cmd, merge_stderr=True ): - """ - Simulates the perl backticks (``) command with error-handling support - Returns ( command output as sequence of strings, error code, error message ) - """ - if merge_stderr: - _stderr = subprocess.STDOUT - else: - _stderr = subprocess.PIPE - - p = subprocess.Popen( cmd, shell=True, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=_stderr, - close_fds=True ) - - out = [ l[:-1] for l in p.stdout.readlines() ] - - p.stdout.close() - if not merge_stderr: - p.stderr.close() - - # need to allow process to terminate - p.wait() - - errCode = p.returncode and p.returncode or 0 - if p.returncode>0: - errorMessage = os.linesep.join(out) - output = [] - else: - errorMessage = '' - output = out - - return output, errCode, errorMessage - -if __name__=='__main__': - app = SmrtpipeGalaxy( sys.argv ) - sys.exit( app.run() ) diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/smrtpipe_hybrid.xml --- a/tools/ilmn_pacbio/smrtpipe_hybrid.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - Assemble contigs from a set of contigs and PacBio reads. - - smrtpipe_galaxy.py --nproc=24 --dat_extension=fasta --output=data/scaffold.fasta --galaxy_output=${outfile} ${iniFile} - - - - - - - - - -[input] -assembled_contigs:${contigs} -file:${reads} - -[HybridAssembly] -instrumentModel=RS -cleanup=False -untangler=pacbio -#set $schedule2 = $schedule.replace('X',';') -paramSchedule=${schedule2} -dontFillin=False -longReadsAsStrobe=True -exactQueryIds=True -rm4Opts=-minMatch 7 -minFrac 0.1 -minPctIdentity 65 -bestn 10 -noSplitSubreads -numberProcesses=16 -cluster=False -minRepeatLength=100000 - - - - - - - -**What it does** - -The AHA assembly algorithm is an AMOS_-based pipeline -for finishing bacterial-sized -genomes using draft contigs and PacBio reads. - -.. _AMOS: http://sourceforge.net/apps/mediawiki/amos - -**Parameter list** - -Parameter schedule - The parameter schedule is a semicolon-delimited list of triples. Each triple represents an iteration of hybrid assembly (alignment/scaffolding/gap-filling). The three parameters for each iteration are the Z-score, number of reads required to define a link, and the minimum length of subreads used in links. - -**Output** - -FASTA file containing scaffolded and gap-filled contigs resulting from the -hybrid assembly.
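The Galaxy form passes the schedule with "X" in place of ";" (which is shell-unsafe), and the #set line in the config above converts it back. A small sketch of parsing such a schedule into (Z-score, reads per link, minimum subread length) triples; the separator between the elements of a triple is an assumption here, not taken from the tool::

    def parse_schedule(schedule):
        # Assumes ';'-separated triples with ','-separated elements,
        # e.g. "6,3,75;6,2,75" -> [(6.0, 3, 75), (6.0, 2, 75)].
        triples = []
        for item in schedule.split(';'):
            z_score, reads_per_link, min_subread_len = item.split(',')
            triples.append((float(z_score), int(reads_per_link),
                            int(min_subread_len)))
        return triples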
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/ilmn_pacbio/soap_denovo.xml --- a/tools/ilmn_pacbio/soap_denovo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Short-read de novo assembly - - - SOAPdenovo-127mer all -s ${soap_config} -o assembly -K ${k} -p 24 -d -D -R - - - - - - - - - - - - - - - - - - - - max_rd_len=105 -[LIB] -#if $inputs.read_type == "single" -q=${inputs.input1.file_name} -#else -avg_ins=${inputs.d} -asm_flags=3 -reverse_seq=0 -q1=${inputs.input1.file_name} -q2=${inputs.input2.file_name} -#end if - - - - - - - -**What it does** - -Runs SOAPdenovo_ to generate a genome assembly -using single-fragment or paired-end short reads. - -Li R, Zhu H, Ruan J, Qian W, Fang X, Shi Z, Li Y, Li S, Shan G, Kristiansen K, Li S, Yang H, Wang J, Wang J. -"De novo assembly of human genomes with massively parallel short read sequencing." -*Genome Res.* 2010 Feb;20(2):265-72. -.. _SOAPdenovo: http://soap.genomics.org.cn/soapdenovo.html - -**Parameter list** - -k - k-mer size for constructing the de Bruijn graph. The appropriate size of k is genome and data set dependent, but a good starting choice might be 75% of the read length. - -Insert size - For paired-end libraries, the expected insert size. - -**Output** - -assembly - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_analysis.py --- a/tools/indels/indel_analysis.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,227 +0,0 @@ -#!/usr/bin/env python - -""" -Given an input SAM file, provides analysis of the indels. - -usage: %prog [options] - -i, --input=i: The SAM file to analyze - -t, --threshold=t: The deletion frequency threshold - -I, --out_ins=I: The interval output file showing insertions - -D, --out_del=D: The interval output file showing deletions -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def add_to_mis_matches( mis_matches, pos, bases ): - """ - Adds the bases and counts to the mis_matches dict - """ - for j, base in enumerate( bases ): - try: - mis_matches[ pos + j ][ base ] += 1 - except KeyError: - try: - mis_matches[ pos + j ][ base ] = 1 - except KeyError: - mis_matches[ pos + j ] = { base: 1 } - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output files - out_ins = open( options.out_ins, 'wb' ) - out_del = open( options.out_del, 'wb' ) - # patterns - pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - # for tracking occurrences at each pos of ref - mis_matches = {} - indels = {} - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate( open( options.input, 'rb' ) ): - if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - chrom = split_line[2].strip() - pos = int( split_line[3].strip() ) - cigar = split_line[5].strip() - bases = split_line[9].strip() - # if not an indel or match, exit - if chrom == '*': - continue - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't
process - else: - multi_indel_lines += 1 - # get matching parts for the indel or full match if matching - else: - if not mis_matches.has_key( chrom ): - mis_matches[ chrom ] = {} - indels[ chrom ] = { 'D': {}, 'I': {} } - parts = m.groupdict() - if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ): - match = parts - # see if matches meet filter requirements - if match: - # match/mismatch - if parts[ 'match_width' ]: - add_to_mis_matches( mis_matches[ chrom ], pos, bases ) - # indel - else: - # pieces of CIGAR string - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - right = int( match[ 'rmatch' ] ) - left_bases = bases[ : left ] - if match[ 'ins_del' ] == 'I': - middle_bases = bases[ left : left + middle ] - else: - middle_bases = '' - right_bases = bases[ -right : ] - start = pos + left - # add data to ref_pos dict for match/mismatch bases on left and on right - add_to_mis_matches( mis_matches[ chrom ], pos, left_bases ) - if match[ 'ins_del' ] == 'I': - add_to_mis_matches( mis_matches[ chrom ], start, right_bases ) - else: - add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases ) - # for insertions, count instances of particular inserted bases - if match[ 'ins_del' ] == 'I': - if indels[ chrom ][ 'I' ].has_key( start ): - try: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1 - except KeyError: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1 - else: - indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 } - # for deletions, count number of deletions bases - else: - if indels[ chrom ][ 'D' ].has_key( start ): - try: - indels[ chrom ][ 'D' ][ start ][ middle ] += 1 - except KeyError: - indels[ chrom ][ 'D' ][ start ][ middle ] = 1 - else: - indels[ chrom ][ 'D' ][ start ] = { middle: 1 } - # compute deletion frequencies and insertion frequencies for checking against threshold - freqs = {} - ins_freqs = {} - chroms = mis_matches.keys() - chroms.sort() - for chrom in chroms: - freqs[ chrom ] = {} - ins_freqs[ chrom ] = {} - poses = mis_matches[ chrom ].keys() - poses.extend( indels[ chrom ][ 'D' ].keys() ) - poses.extend( indels[ chrom ][ 'I' ].keys() ) - poses = list( set( poses ) ) - for pos in poses: - # all reads touching this particular position - freqs[ chrom ][ pos ] = {} - sum_counts = 0.0 - sum_counts_end = 0.0 - # get basic counts (match/mismatch) - try: - sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) ) - except KeyError: - pass - # add deletions also touching this position - try: - sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) - except KeyError: - pass - freqs[ chrom ][ pos ][ 'total' ] = sum_counts - # calculate actual frequencies - # deletions - # frequencies for deletions - try: - for d in indels[ chrom ][ 'D' ][ pos ].keys(): - freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts - except KeyError: - pass - # frequencies for matches/mismatches - try: - for base in mis_matches[ chrom ][ pos ].keys(): - try: - prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts - freqs[ chrom ][ pos ][ base ] = prop - except ZeroDivisionError: - freqs[ chrom ][ pos ][ base ] = 0.0 - except KeyError: - pass - # insertions - try: - for bases in indels[ chrom ][ 'I' ][ pos ].keys(): - prop_start = 
indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts ) - try: - prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end ) - except ZeroDivisionError: - prop_end = 0.0 - try: - ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ] - except KeyError: - ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] } - except KeyError, e: - pass - # output to files if meet threshold requirement - threshold = float( options.threshold ) - #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' ) - #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' ) - for chrom in chroms: - # deletions file - poses = indels[ chrom ][ 'D' ].keys() - poses.sort() - for pos in poses: - start = pos - dels = indels[ chrom ][ 'D' ][ start ].keys() - dels.sort() - for d in dels: - end = start + d - prop = freqs[ chrom ][ start ][ d ] - if prop > threshold : - out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) - # insertions file - poses = indels[ chrom ][ 'I' ].keys() - poses.sort() - for pos in poses: - start = pos - end = pos + 1 - ins_bases = indels[ chrom ][ 'I' ][ start ].keys() - ins_bases.sort() - for bases in ins_bases: - prop_start = ins_freqs[ chrom ][ start ][ bases ][0] - prop_end = ins_freqs[ chrom ][ start ][ bases ][1] - if prop_start > threshold or prop_end > threshold: - out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) ) - # close out files - out_del.close() - out_ins.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_analysis.xml --- a/tools/indels/indel_analysis.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ - - - - indel_analysis.py - --input=$input1 - --threshold=$threshold - --out_ins=$out_ins - --out_del=$out_del - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Given an input SAM file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurrence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch.
For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT"):: - - CHROM POS CIGAR SEQ - ref 3 2M1I3M TACTTC - ref 1 2M1D3M ACGCT - ref 4 4M2I3M GTTCAAGAT - ref 2 2M2D3M CTCCG - ref 1 3M1D4M AACCTGG - ref 6 3M1I2M TTCAAT - ref 5 3M1I3M CTCTGTT - ref 7 4M CTAT - ref 5 5M CGCTA - ref 3 2M1D2M TGCC - -The following totals would be calculated (this is an intermediate step and not output):: - - ------------------------------------------------------------------------------------------------------- - POS BASE NUMREADS DELPROPCALC DELPROP INSPROPSTARTCALC INSSTARTPROP INSPROPENDCALC INSENDPROP - ------------------------------------------------------------------------------------------------------- - 1 A 2 2/2 1.00 --- --- --- --- - 2 A 1 1/3 0.33 --- --- --- --- - C 2 2/3 0.67 --- --- --- --- - 3 C 1 1/5 0.20 --- --- --- --- - T 3 3/5 0.60 --- --- --- --- - - 1 1/5 0.20 --- --- --- --- - 4 A 1 1/6 0.17 --- --- --- --- - G 3 3/6 0.50 --- --- --- --- - - 1 1/6 0.17 --- --- --- --- - -- 1 1/6 0.17 --- --- --- --- - 5 C 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - - 1 1/7 0.14 --- --- --- --- - +C 1 --- --- 1/7 0.14 1/9 0.11 - 6 C 2 2/9 0.22 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 6 6/9 0.67 --- --- --- --- - 7 C 7 7/9 0.78 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 1 1/9 0.11 --- --- --- --- - 8 C 1 1/7 0.14 --- --- --- --- - G 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - +T 1 --- --- 1/8 0.13 1/6 0.17 - +AA 1 --- --- 1/8 0.13 1/6 0.17 - 9 A 4 4/5 0.80 --- --- --- --- - T 1 1/5 0.20 --- --- --- --- - +A 1 --- --- 1/6 0.17 1/5 0.20 - 10 T 4 4/4 1.00 --- --- --- --- - -The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 6 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17. - -Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA). - -These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output.
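The arithmetic above can be restated compactly: a deletion's (or base's) proportion is its count divided by the basic total at that position, while an insertion's proportion adds its own count to the denominator, because insertions are excluded from the basic total. A small sketch with hypothetical helper names, not the tool's actual functions::

    def basic_total(base_counts, del_counts):
        # Matches/mismatches plus deletions at one position; insertions excluded.
        return sum(base_counts.values()) + sum(del_counts.values())

    def deletion_prop(count, base_counts, del_counts):
        return count / float(basic_total(base_counts, del_counts))

    def insertion_prop(ins_count, base_counts, del_counts):
        # The insertion's own count joins the denominator.
        total = basic_total(base_counts, del_counts)
        return ins_count / float(ins_count + total)

    # Position 8 of the table above: 7 matches/mismatches, no deletions,
    # one "+T" insertion -> 1 / (1 + 7) = 0.125, shown as 0.13.
    assert abs(insertion_prop(1, {'C': 1, 'G': 4, 'T': 2}, {}) - 0.125) < 1e-12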
- - -**Output format** - -The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position. - -Columns in the deletions file:: - - Column Description - ----------------------------- --------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position - 4 Coverage Number of reads containing this exact deletion - 5 Frequency Percentage Frequency of this exact deletion (a 2-base and a 1-base deletion at the same position are counted separately, for instance), as percentage (%) - -Columns in the insertions file:: - - Column Description - ------------------------ ----------------------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position (always Start + 1 for insertions) - 4 Inserted Base(s) The exact base(s) inserted at Start position - 5 Coverage Number of reads containing this exact insertion - 6 Freq. Perc. at Start Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%) - 7 Freq. Perc. at End Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%) - -Before using this tool, you may want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required. - - ------ - -**Example** - -If you set the threshold to 0.0 and have the following SAM file:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,.
XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - -The following will be produced (deletions file followed by insertions file):: - - chr1 11 13 1 100.00 - chr1 21 22 1 25.00 - chr1 21 23 1 25.00 - chrM 16 18 1 9.09 - chrM 19 20 1 8.33 - chrM 21 22 1 9.09 - chrM 22 23 1 9.09 - - chr2 18 19 AA 1 50.00 50.00 - chr2 28 29 CCC 1 50.00 50.00 - chrM 11 12 TTT 1 9.09 9.09 - chrM 13 14 C 1 9.09 9.09 - chrM 13 14 T 1 9.09 9.09 - chrM 19 20 T 1 7.69 8.33 - chrM 21 22 GA 1 8.33 8.33 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_sam2interval.py --- a/tools/indels/indel_sam2interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. 
- -usage: %prog [options] - -i, --input=i: The input SAM file - -u, --include_base=u: Whether or not to include the base for insertions - -c, --collapse=c: Whether to collapse multiple occurrences of a location with counts shown - -o, --int_out=o: The interval output file for the converted SAM file - -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file - -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - # open up output files - output = open( options.int_out, 'wb' ) - if options.bed_ins_out != 'None': - output_bed_ins = open( options.bed_ins_out, 'wb' ) - else: - output_bed_ins = None - if options.bed_del_out != 'None': - output_bed_del = open( options.bed_del_out, 'wb' ) - else: - output_bed_del = None - - # the pattern to match, assuming just one indel per cigar string - pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - - # go through all lines in input file - out_data = {} - multi_indel_lines = 0 - for line in open( options.input, 'rb' ): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - if len( split_line ) < 12: - continue - # grab relevant pieces - cigar = split_line[5].strip() - pos = int( split_line[3] ) - chr = split_line[2] - base_string = split_line[9] - # parse cigar string - m = pat_indel.match( cigar ) - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - continue - else: - match = m.groupdict() - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - middle_type = match[ 'ins_del' ] - bases = base_string[ left : left + middle ] - # calculate start and end positions, and output to insertion or deletion file - start = left + pos - if middle_type == 'D': - end = start + middle - data = [ chr, start, end, 'D' ] - if options.include_base == "true": - data.append( '-' ) - else: - end = start + 1 - data = [ chr, start, end, 'I' ] - if options.include_base == "true": - data.append( bases ) - location = '\t'.join( [ '%s' % d for d in data ] ) - try: - out_data[ location ] += 1 - except KeyError: - out_data[ location ] = 1 - # output to interval file - # get all locations and sort - locations = out_data.keys() - locations.sort( numeric_sort ) - last_line = '' - # output each location, either with
counts or each occurrence - for loc in locations: - sp_loc = loc.split( '\t' ) - cur_line = '\t'.join( sp_loc[:3] ) - if options.collapse == 'true': - output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: - output_bed_ins.write( '%s\n' % cur_line ) - last_line = cur_line - else: - for i in range( out_data[ loc ] ): - output.write( '%s\n' % loc ) - if output_bed_del or output_bed_ins: - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I': - output_bed_ins.write( '%s\n' % cur_line ) - - # cleanup, close files - if output_bed_ins: - output_bed_ins.close() - if output_bed_del: - output_bed_del.close() - output.close() - - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_sam2interval.xml --- a/tools/indels/indel_sam2interval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - from SAM - - indel_sam2interval.py - --input=$input1 - --include_base=$include_base - --collapse=$collapse - --int_out=$output1 - #if $ins_out.include_ins_out == "true" - --bed_ins_out=$output2 - #else - --bed_ins_out="None" - #end if - #if $del_out.include_del_out == "true" - --bed_del_out=$output3 - #else - --bed_del_out="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - ins_out[ "include_ins_out" ] == "true" - - - del_out[ "include_del_out" ] == "true" - - - - - - - - - - - - - - - - -**What it does** - -Given a SAM file containing indels, converts these to an interval file with a column indicating whether it is an insertion or a deletion, and then also can create a BED file for each type (one for insertions, one for deletions). The interval file can be combined with other like files to create a table useful for analysis with the Indel Analysis Table tool. The BED files can be useful for visualizing the reads. - ------ - -**Example** - -Suppose you have the following mapping results:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. 
XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# - r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r13452 16 chrM 13 0 3M1D11M * 0 0 TACGTCACTCATCA IIIABCCCICCCCI XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - - -The following three files will be produced (Interval, Insertions BED and Deletions BED):: - - chr1 11 13 D - 1 - chr1 21 22 D - 1 - chr1 21 23 D - 1 - chr2 18 19 I AA 1 - chr2 28 29 I CCC 1 - chrM 11 12 I TTT 1 - chrM 13 14 I C 1 - chrM 13 14 I T 1 - chrM 16 17 D - 1 - chrM 16 18 D - 1 - chrM 19 20 D - 1 - chrM 19 20 I T 1 - chrM 21 22 D - 1 - chrM 21 22 I GA 1 - chrM 22 23 D - 1 - - chr2 18 19 - chr2 28 29 - chrM 11 12 - chrM 13 14 - chrM 13 14 - chrM 19 20 - chrM 21 22 - - chr1 11 13 - chr1 21 22 - chr1 21 23 - chrM 16 17 - chrM 16 18 - chrM 19 20 - chrM 21 22 - chrM 22 23 - -For more information on SAM, please consult the `SAM format description`__. - -.. 
__: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_table.py --- a/tools/indels/indel_table.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -#!/usr/bin/env python - -""" -Combines several interval files containing indels with counts. All input files need to have the same number of columns. - -usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] - -1, --input1=1: The first input file - -s, --sum1=s: Whether or not to include the totals from first file in overall total - -2, --input2=2: The second input file - -S, --sum2=S: Whether or not to include the totals from second file in overall total - -o, --output=o: The interval output file for the combined files -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - # Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - inputs = [ options.input1, options.input2 ] - includes = [ options.sum1, options.sum2 ] - inputs.extend( [ a for i, a in enumerate( args ) if i % 2 == 0 ] ) - includes.extend( [ a for i, a in enumerate( args ) if i % 2 == 1 ] ) - num_cols = 0 - counts = {} - # read in data from all files and get total counts - try: - for i, input in enumerate( inputs ): - for line in open( input, 'rb' ): - sp_line = line.strip().split( '\t' ) - # set num_cols on first pass - if num_cols == 0: - if len( sp_line ) < 4: - raise Exception, 'There need to be at least 4 columns in the file: Chrom, Start, End, and Count' - num_cols = len( sp_line ) - # deal with differing number of columns - elif len( sp_line ) != num_cols: - raise Exception, 'All of the files need to have the same number of columns (current %s != %s of first line)' % ( len( sp_line ), num_cols ) - # get actual counts for each indel - indel = '\t'.join( sp_line[:-1] ) - try: - count = int( sp_line[-1] ) - except ValueError, e: - raise Exception, 'The last column of each file must be numeric, with the count of the number of instances of that indel: %s' % str( e ) - # total across all included files - if includes[i] == "true": - try: - counts[ indel ]['tot'] += count - except ( IndexError, KeyError ): - counts[ indel ] = { 'tot': count } - # counts for ith file - counts[ indel ][i] = count - except Exception, e: - stop_err( 'Failed to read all input files:\n%s' % str( e ) ) - # output combined results to table file - try: - output = open( options.output, 'wb' ) - count_keys = counts.keys() - count_keys.sort( numeric_sort ) - for indel in count_keys: - count_out = [ str( counts[ indel ][ 'tot' ] ) ] - for i in range( len( inputs ) ): - try: - count_out.append( 
str( counts[ indel ][i] ) ) - except KeyError: - count_out.append( '0' ) - output.write( '%s\t%s\n' % ( indel, '\t'.join( count_out ) ) ) - output.close() - except Exception, e: - stop_err( 'Failed to output data: %s' % str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/indel_table.xml --- a/tools/indels/indel_table.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - for combining indel interval data - - indel_table.py - --input1=$input1 - --sum1=$sum1 - --input2=$input2 - --sum2=$sum2 - --output=$output1 - #for $i in $inputs - ${i.input} - ${i.sum} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Creates a table allowing for analysis and comparison of indel data. Combines any number of interval files that have been produced by the tool that converts indel SAM data to interval format. Includes overall total counts for all or some files. The tool has the option to not include a given file's counts in the total column. This could be useful for combined data if the counts for certain indels might be included more than once. - -The exact columns of the output will depend on the columns of the input. Here is the detailed specification of the output columns:: - - Column Description - ------------------------------- ---------------------------------------------------------------------------------- - 1 ... m "Indel" All the "indel" columns, which contain the info that will be checked for equality - m + 1 Total Occurrences Total number of occurrences of this indel across all (included) files - m + 2 Occurrences for File 1 Number of occurrences of this indel for first file - m + 3 Occurrences for File 2 Number of occurrences of this indel for second file - [m + ...] [...] [Number of occurrences of this indel for ... file] - -The most likely columns would be from the output of the Convert SAM to Interval/BED tool, so: Chromosome, Start position, End position, I/D (Insertion/Deletion), -/<base(s)> (Deletion/Inserted base(s)), Total Occurrences (across files), Occurrences for File 1, Occurrences for File 2, etc. See below for an example. 
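One implementation note before the example: rows are ordered with the cmp-style numeric_sort comparator shown in the script above, which compares space-separated pieces numerically when both are digits and as text otherwise. Python 3 removed cmp-style sorting, so an equivalent key-based sketch (empty-string edge cases differ slightly; this is not the shipped code) is::

    def numeric_key(text):
        # Digit pieces compare numerically and sort before text pieces,
        # mirroring the numeric_sort comparator used by these tools.
        return [(0, int(p), '') if p.isdigit() else (1, 0, p)
                for p in text.split()]

    # locations.sort(key=numeric_key) keeps "chrM 99 ..." ahead of "chrM 100 ..."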
- - ------ - -**Example** - -Suppose you have the following 4 files:: - - chrM 300 301 D - 6 - chrM 303 304 D - 19 - chrM 359 360 D - 1 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - - chrM 410 411 D - 1 - chrM 714 715 D - 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 1 - chrM 1296 1297 D - 1 - - chrM 300 301 D - 8 - chrM 525 526 D - 1 - chrM 958 959 D - 1 - chrM 995 996 D - 3 - chrM 1168 1169 I C 1 - chrM 1296 1297 D - 1 - - chrM 303 304 D - 22 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - chrM 714 715 D - 1 - chrM 753 754 I A 1 - chrM 1168 1169 I A 1 - -and the fifth file:: - - chrM 303 304 D - 22 - chrM 410 411 D - 2 - chrM 435 436 D - 1 - chrM 714 715 D - 2 - chrM 753 754 I A 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 2 - chrM 1296 1297 D - 1 - -The following will be produced if you include the first four files in the sum, but not the fifth:: - - chrM 300 301 D - 14 6 0 8 0 0 - chrM 303 304 D - 41 19 0 0 22 22 - chrM 359 360 D - 1 1 0 0 0 0 - chrM 410 411 D - 3 1 1 0 1 2 - chrM 435 436 D - 2 1 0 0 1 2 - chrM 525 526 D - 1 0 0 1 0 0 - chrM 714 715 D - 2 0 1 0 1 2 - chrM 753 754 I A 1 0 0 0 1 1 - chrM 958 959 D - 1 0 0 1 0 0 - chrM 995 996 D - 3 0 0 3 0 0 - chrM 995 997 D - 1 0 1 0 0 1 - chrM 1168 1169 I A 2 0 1 0 1 2 - chrM 1168 1169 I C 1 0 0 1 0 0 - chrM 1296 1297 D - 2 0 1 1 0 1 - -The first numeric column is the total of the next four columns, but does not include the fifth. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/indels/sam_indel_filter.py --- a/tools/indels/sam_indel_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. - -usage: %prog [options] - -i, --input=i: Input SAM file to be filtered - -q, --quality_threshold=q: Minimum quality value for adjacent bases - -a, --adjacent_bases=a: Number of adjacent bases on each side to check qualities - -o, --output=o: Filtered output SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output file - output = open( options.output, 'wb' ) - # patterns - pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - try: - qual_thresh = int( options.quality_threshold ) - if qual_thresh < 0 or qual_thresh > 93: - raise ValueError - except ValueError: - stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) - try: - adj_bases = int( options.adjacent_bases ) - if adj_bases < 1: - raise ValueError - except ValueError: - stop_err( 'The number of adjacent bases should be an integer of at least 1.'
) - # record lines skipped because of more than one indel - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate(open( options.input, 'rb' )): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - cigar = split_line[5].strip() - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # if unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - # otherwise get matching parts - else: - match = m.groupdict() - # process for indels - if match: - left = int( match[ 'lmatch' ] ) - right = int( match[ 'rmatch' ] ) - if match[ 'ins_del' ] == 'I': - middle = int( match[ 'ins_del_width' ] ) - else: - middle = 0 - # if there are enough adjacent bases to check, then do so - if left >= adj_bases and right >= adj_bases: - quals = split_line[10] - eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] - qual_thresh_met = True - for q in eligible_quals: - if ord( q ) - 33 < qual_thresh: - qual_thresh_met = False - break - # if filter reqs met, output line - if qual_thresh_met: - output.write( line ) - # close out file - output.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/indels/sam_indel_filter.xml --- a/tools/indels/sam_indel_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ - - for SAM - - sam_indel_filter.py - --input=$input1 - --quality_threshold=$quality_threshold - --adjacent_bases=$adjacent_bases - --output=$out_file1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Allows extracting indels from SAM produced by BWA. Currently it can handle SAM with alignments that have only one insertion or one deletion, and will skip that alignment if it encounters one with more than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. 
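The per-alignment test described above can be distilled into a few lines. A sketch of the same logic as the script (assumes Sanger-encoded qualities, ASCII offset 33)::

    import re

    # exactly one indel flanked by match/mismatch runs, e.g. 5M3I5M or 4M2D10M
    pat = re.compile(r'^(?P<lmatch>\d+)M(?P<width>\d+)(?P<op>[ID])(?P<rmatch>\d+)M$')

    def keep_alignment(cigar, quals, adj_bases, qual_thresh):
        m = pat.match(cigar)
        if not m:
            return False  # no indel, or more than one indel
        left, right = int(m.group('lmatch')), int(m.group('rmatch'))
        # insertions consume read bases and so widen the quality window
        middle = int(m.group('width')) if m.group('op') == 'I' else 0
        if left < adj_bases or right < adj_bases:
            return False
        window = quals[left - adj_bases : left + middle + adj_bases]
        return all(ord(q) - 33 >= qual_thresh for q in window)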
- ------ - -**Example** - -Suppose you have the following:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r770 181 ref 116 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### - r1945 177 ref 41710908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r3671 117 ref 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## - r3671 153 ref 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3824 117 ref 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ - r3824 153 ref 80324999 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4795 81 ref 26739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r4795 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5377 170 ref 59090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -To select only alignments with indels, you need to determine the minimum quality you want the adjacent bases to have, as well as the number of adjacent bases to check. If you set the quality threshold to 47 and the number of bases to check to 2, you will get the following output:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -For more information on SAM, please consult the `SAM format description`__. - -.. 
__: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/genebed_maf_to_fasta.xml --- a/tools/maf/genebed_maf_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ - - given a set of coding exon intervals - - #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #else #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --mafSourceType=$maf_source_type.maf_source --geneBED --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #end if# --overwrite_with_gaps=$overwrite_with_gaps - - - - - value.metadata.columns >= 12 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - in aligning species - - - - - - - - - - - - - - - - -**What it does** - -The coding sequence of genes are usually composed of several coding exons. Each of these coding exons is an individual genomic region, which when concatenated with each other constitutes the coding sequence. A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of gene-based intervals, in the Gene BED format. For every interval it performs the following: - - * finds all MAF blocks that overlap the coding regions; - * sorts MAF blocks by alignment score; - * stitches blocks together and resolves overlaps based on alignment score; - * outputs alignments in FASTA format. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf.py --- a/tools/maf/interval2maf.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -#!/usr/bin/env python - -""" -Reads a list of intervals and a maf. Produces a new maf containing the -blocks or parts of blocks in the original that overlapped the intervals. - -If a MAF file, not UID, is provided the MAF file is indexed before being processed. - -NOTE: If two intervals overlap the same block it will be written twice. 
- -usage: %prog maf_file [options] - -d, --dbkey=d: Database key, ie hg17 - -c, --chromCol=c: Column of Chr - -s, --startCol=s: Column of Start - -e, --endCol=e: Column of End - -S, --strandCol=S: Column of Strand - -t, --mafType=t: Type of MAF source to use - -m, --mafFile=m: Path of source MAF file, if not using cached version - -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version - -i, --interval_file=i: Input interval file - -o, --output_file=o: Output MAF file - -p, --species=p: Species to include in output - -P, --split_blocks_by_species=P: Split blocks by species - -r, --remove_all_gap_columns=r: Remove all Gap columns - -l, --indexLocation=l: Override default maf_index.loc file - -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc ) -""" - -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -import bx.align.maf -import bx.intervals.io -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - index = index_filename = None - mincols = 0 - - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - if options.dbkey: dbkey = options.dbkey - else: dbkey = None - if dbkey in [None, "?"]: - maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) - - species = maf_utilities.parse_species_option( options.species ) - - if options.chromCol: chromCol = int( options.chromCol ) - 1 - else: - maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.startCol: startCol = int( options.startCol ) - 1 - else: - maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.endCol: endCol = int( options.endCol ) - 1 - else: - maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.strandCol: strandCol = int( options.strandCol ) - 1 - else: - strandCol = -1 - - if options.interval_file: interval_file = options.interval_file - else: - maf_utilities.tool_fail( "Input interval file has not been specified." ) - - if options.output_file: output_file = options.output_file - else: - maf_utilities.tool_fail( "Output file has not been specified." ) - - split_blocks_by_species = remove_all_gap_columns = False - if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species': - split_blocks_by_species = True - if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns': - remove_all_gap_columns = True - else: - remove_all_gap_columns = True - #Finish parsing command line - - #Open indexed access to MAFs - if options.mafType: - if options.indexLocation: - index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation ) - else: - index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile ) - if index is None: - maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." 
% ( options.mafType ) ) - elif options.mafFile: - index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] ) - if index is None: - maf_utilities.tool_fail( "Your MAF file appears to be malformed." ) - else: - maf_utilities.tool_fail( "Desired source MAF type has not been specified." ) - - #Create MAF writer - out = bx.align.maf.Writer( open(output_file, "w") ) - - #Iterate over input regions - num_blocks = 0 - num_regions = None - for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chromCol, start_col = startCol, end_col = endCol, strand_col = strandCol, fix_strand = True, return_header = False, return_comments = False ) ): - src = maf_utilities.src_merge( dbkey, region.chrom ) - for block in index.get_as_iterator( src, region.start, region.end ): - if split_blocks_by_species: - blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ] - else: - blocks = [ block ] - for block in blocks: - block = maf_utilities.chop_block_by_region( block, src, region ) - if block is not None: - if species is not None: - block = block.limit_to_species( species ) - block = maf_utilities.orient_block_by_region( block, src, region ) - if remove_all_gap_columns: - block.remove_all_gap_columns() - out.write( block ) - num_blocks += 1 - - #Close output MAF - out.close() - - #remove index file if created during run - maf_utilities.remove_temp_index_file( index_filename ) - - if num_blocks: - print "%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) ) - elif num_regions is not None: - print "No MAF blocks could be extracted for %i regions." % ( num_regions + 1 ) - else: - print "No valid regions have been provided." 
- -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf.xml --- a/tools/maf/interval2maf.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,294 +0,0 @@ - - given a set of genomic intervals - - #if $maf_source_type.maf_source == "user" #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafFile=$maf_source_type.mafFile --mafIndex=$maf_source_type.mafFile.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species - #else #interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$maf_source_type.mafType --interval_file=$input1 --output_file=$out_file1 --mafIndexFile=${GALAXY_DATA_INDEX_DIR}/maf_index.loc --species=$maf_source_type.species - #end if# --split_blocks_by_species=$split_blocks_by_species_selector.split_blocks_by_species - #if $split_blocks_by_species_selector.split_blocks_by_species == "split_blocks_by_species"# - --remove_all_gap_columns=$split_blocks_by_species_selector.remove_all_gap_columns - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes genomic coordinates, superimposes them on multiple alignments (in MAF format) stored on the Galaxy site or from your history, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. - ------ - -**Example** - -Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond boundaries of the interval: - -.. image:: ./static/images/maf_icons/interval2maf.png - -------- - -**Split blocks by species** - -This option examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented. - -The interface for this option has two inputs: - - * **MAF file to split**. Choose multiple alignments from history to be split by species. - * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed. 
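Conceptually, the split is a Cartesian product over each species' rows within a block; the tool itself delegates to maf_utilities.iter_blocks_split_by_species. A minimal sketch of the idea (illustrative names only)::

    from itertools import product

    # rows_by_species: dict mapping a species name to the list of its
    # alignment rows in one block. Each yielded combination is the row
    # set of one new block, one row per species.
    def split_block(rows_by_species):
        names = sorted(rows_by_species)
        for combo in product(*(rows_by_species[n] for n in names)):
            yield dict(zip(names, combo))

In Example 1 below, species1 contributes 4 rows, species2 contributes 3, and species3 contributes 1, giving the 4 x 3 x 1 = 12 blocks shown.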
- - - -**Example 1**: **Collapse empty alignment columns is Yes**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG - - - -**Example 2**: **Collapse empty alignment columns is No**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s 
species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that some columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 
68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval2maf_pairwise.xml --- a/tools/maf/interval2maf_pairwise.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ - - given a set of genomic intervals - interval2maf.py --dbkey=${input1.dbkey} --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafType=$mafType --interval_file=$input1 --output_file=$out_file1 --indexLocation=${GALAXY_DATA_INDEX_DIR}/maf_pairwise.loc - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes genomic coordinates, superimposes them on pairwise alignments (in MAF format) stored on the Galaxy site, and excises alignment blocks corresponding to each set of coordinates. Alignment blocks that extend past START and/or END positions of an interval are trimmed. Note that a single genomic interval may correspond to two or more alignment blocks. - ------ - -**Example** - -Here a single interval is superimposed on three MAF blocks. Blocks 1 and 3 are trimmed because they extend beyond boundaries of the interval: - -.. image:: ./static/images/maf_icons/interval2maf.png - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval_maf_to_merged_fasta.py --- a/tools/maf/interval_maf_to_merged_fasta.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,196 +0,0 @@ -#!/usr/bin/env python - -""" -Reads an interval or gene BED and a MAF Source. -Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates. - -Alignment blocks are layered on top of each other based upon score. - -usage: %prog maf_file [options] - -d, --dbkey=d: Database key, ie hg17 - -c, --chromCol=c: Column of Chr - -s, --startCol=s: Column of Start - -e, --endCol=e: Column of End - -S, --strandCol=S: Column of Strand - -G, --geneBED: Input is a Gene BED file, process and join exons as one region - -t, --mafSourceType=t: Type of MAF source to use - -m, --mafSource=m: Path of source MAF file, if not using cached version - -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version - -i, --interval_file=i: Input interval file - -o, --output_file=o: Output FASTA file - -p, --species=p: Species to include in output - -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. - -z, --mafIndexFileDir=z: Directory of local maf_index.loc file - -usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR -""" - -#Dan Blankenberg -from galaxy import eggs -from galaxy.tools.util import maf_utilities -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -import bx.intervals.io -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def __main__(): - - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - mincols = 0 - strand_col = -1 - - if options.dbkey: - primary_species = options.dbkey - else: - primary_species = None - if primary_species in [None, "?", "None"]: - stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) - - include_primary = True - secondary_species = maf_utilities.parse_species_option( options.species ) - if secondary_species: - species = list( secondary_species ) # make copy of species list - if primary_species in secondary_species: - secondary_species.remove( primary_species ) - else: - include_primary = False - else: - species = None - - if options.interval_file: - interval_file = options.interval_file - else: - stop_err( "Input interval file has not been specified." ) - - if options.output_file: - output_file = options.output_file - else: - stop_err( "Output file has not been specified." ) - - if not options.geneBED: - if options.chromCol: - chr_col = int( options.chromCol ) - 1 - else: - stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.startCol: - start_col = int( options.startCol ) - 1 - else: - stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) - - if options.endCol: - end_col = int( options.endCol ) - 1 - else: - stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." 
) - - if options.strandCol: - strand_col = int( options.strandCol ) - 1 - - mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir - - overwrite_with_gaps = True - if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': - overwrite_with_gaps = False - - #Finish parsing command line - - #get index for mafs based on type - index = index_filename = None - #using specified uid for locally cached - if options.mafSourceType.lower() in ["cached"]: - index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile ) - if index is None: - stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) ) - elif options.mafSourceType.lower() in ["user"]: - #index maf for use here, need to remove index_file when finished - index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] ) - if index is None: - stop_err( "Your MAF file appears to be malformed." ) - else: - stop_err( "Invalid MAF source type specified." ) - - #open output file - output = open( output_file, "w" ) - - if options.geneBED: - region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() ) - else: - region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) ) - - #Step through intervals - regions_extracted = 0 - line_count = 0 - for line_count, line in region_enumerator: - try: - if options.geneBED: #Process as Gene BED - try: - starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) - #create spliced alignment object - alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) - primary_name = secondary_name = fields[3] - alignment_strand = fields[5] - except Exception, e: - print "Error loading exon positions from input line %i: %s" % ( line_count, e ) - continue - else: #Process as standard intervals - try: - #create spliced alignment object - alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) - primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) - secondary_name = "" - alignment_strand = line.strand - except Exception, e: - print "Error loading region positions from input line %i: %s" % ( line_count, e ) - continue - - #Write alignment to output file - #Output primary species first, if requested - if include_primary: - output.write( ">%s.%s\n" %( primary_species, primary_name ) ) - if alignment_strand == "-": - output.write( alignment.get_sequence_reverse_complement( primary_species ) ) - else: - output.write( alignment.get_sequence( primary_species ) ) - output.write( "\n" ) - #Output all remaining species - for spec in secondary_species or alignment.get_species_names( skip = primary_species ): - if secondary_name: - output.write( ">%s.%s\n" % ( spec, secondary_name ) ) - else: - output.write( ">%s\n" % ( spec ) ) - if alignment_strand == "-": - output.write( alignment.get_sequence_reverse_complement( spec ) ) - else: - output.write( alignment.get_sequence( spec ) ) - output.write( "\n" ) - - output.write( "\n" ) - - regions_extracted += 
1 - - except Exception, e: - print "Unexpected error from input line %i: %s" % ( line_count, e ) - continue - - #close output file - output.close() - - #remove index file if created during run - maf_utilities.remove_temp_index_file( index_filename ) - - #Print message about success for user - if regions_extracted > 0: - print "%i regions were processed successfully." % ( regions_extracted ) - else: - print "No regions were processed successfully." - if line_count > 0 and options.geneBED: - print "This tool requires your input file to conform to the 12 column BED standard." - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/interval_maf_to_merged_fasta.xml --- a/tools/maf/interval_maf_to_merged_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ - - given a set of genomic intervals - - #if $maf_source_type.maf_source == "user" #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_file --mafIndex=$maf_source_type.maf_file.metadata.maf_index --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #else #interval_maf_to_merged_fasta.py --dbkey=$dbkey --species=$maf_source_type.species --mafSource=$maf_source_type.maf_identifier --interval_file=$input1 --output_file=$out_file1 --chromCol=${input1.metadata.chromCol} --startCol=${input1.metadata.startCol} --endCol=${input1.metadata.endCol} --strandCol=${input1.metadata.strandCol} --mafSourceType=$maf_source_type.maf_source --mafIndexFileDir=${GALAXY_DATA_INDEX_DIR} - #end if# --overwrite_with_gaps=$overwrite_with_gaps - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -A single genomic region can be covered by multiple alignment blocks. In many cases it is desirable to stitch these alignment blocks together. This tool accepts a list of genomic intervals. For every interval it performs the following: - - * finds all MAF blocks that overlap the interval; - * sorts MAF blocks by alignment score; - * stitches blocks together and resolves overlaps based on alignment score; - * outputs alignments in FASTA format. - ------- - -**Example** - -Here three MAF blocks overlapping a single interval are stitched together. Space between blocks 2 and 3 is filled with gaps: - -.. image:: ./static/images/maf_icons/stitchMaf.png - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_by_block_number.py --- a/tools/maf/maf_by_block_number.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Reads a list of block numbers and a maf. Produces a new maf containing the -blocks specified by number. 
-""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from galaxy.tools.util import maf_utilities -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_block_filename = sys.argv[1].strip() - input_maf_filename = sys.argv[2].strip() - output_filename1 = sys.argv[3].strip() - block_col = int( sys.argv[4].strip() ) - 1 - if block_col < 0: - print >> sys.stderr, "Invalid column specified" - sys.exit(0) - species = maf_utilities.parse_species_option( sys.argv[5].strip() ) - - maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) - #we want to maintain order of block file and write blocks as many times as they are listed - failed_lines = [] - for ctr, line in enumerate( open( input_block_filename, 'r' ) ): - try: - block_wanted = int( line.split( "\t" )[block_col].strip() ) - except: - failed_lines.append( str( ctr ) ) - continue - try: - for count, block in enumerate( bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) ): - if count == block_wanted: - if species: - block = block.limit_to_species( species ) - maf_writer.write( block ) - break - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - if len( failed_lines ) > 0: print "Failed to extract from %i lines (%s)." % ( len( failed_lines ), ",".join( failed_lines ) ) -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_by_block_number.xml --- a/tools/maf/maf_by_block_number.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ - - given a set of block numbers and a MAF file - maf_by_block_number.py $input1 $input2 $out_file1 $block_col $species - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a list of block numbers, one per line, and extracts the corresponding MAF blocks from the provided file. Block numbers start at 0. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_filter.py --- a/tools/maf/maf_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -#Dan Blankenberg -#Filters a MAF file according to the provided code file, which is generated in maf_filter.xml -#Also allows filtering by number of columns in a block, and limiting output species -import sys, os, shutil -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities - -def main(): - #Read command line arguments - try: - script_file = sys.argv.pop( 1 ) - maf_file = sys.argv.pop( 1 ) - out_file = sys.argv.pop( 1 ) - additional_files_path = sys.argv.pop( 1 ) - species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) - min_size = int( sys.argv.pop( 1 ) ) - max_size = int( sys.argv.pop( 1 ) ) - if max_size < 1: max_size = sys.maxint - min_species_per_block = int( sys.argv.pop( 1 ) ) - exclude_incomplete_blocks = int( sys.argv.pop( 1 ) ) - if species: - num_species = len( species ) - else: - num_species = len( sys.argv.pop( 1 ).split( ',') ) - except: - print >>sys.stderr, "One or more arguments are missing.\nUsage: maf_filter.py maf_filter_file input_maf output_maf path_to_save_debug species_to_keep" - sys.exit() - - #Open input and output MAF files - try: - maf_reader = bx.align.maf.Reader( open( maf_file,'r' ) ) - maf_writer = bx.align.maf.Writer( open( out_file,'w' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - - #Save script file for debugging/verification info later - os.mkdir( additional_files_path ) - shutil.copy( script_file, os.path.join( additional_files_path, 'debug.txt' ) ) - - #Loop through blocks, running filter on each - #'maf_block' and 'ret_val' are used/shared in the provided code file - #'ret_val' should be set to True if the block is to be kept - i = -1 - blocks_kept = 0 - for i, maf_block in enumerate( maf_reader ): - if min_size <= maf_block.text_size <= max_size: - local = {'maf_block':maf_block, 'ret_val':False} - execfile( script_file, {}, local ) - if local['ret_val']: - #Species limiting must be done after filters as filters could be run on non-requested output species - if species: - maf_block = maf_block.limit_to_species( species ) - if len( maf_block.components ) >= min_species_per_block and ( not exclude_incomplete_blocks or len( maf_block.components ) >= num_species ): - maf_writer.write( maf_block ) - blocks_kept += 1 - maf_writer.close() - maf_reader.close() - if i < 0: print "Your file contains no valid maf_blocks." - else: print 'Kept %s of %s blocks (%.2f%%).' 
% ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_filter.xml --- a/tools/maf/maf_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - by specified attributes - maf_filter.py $maf_filter_file $input1 $out_file1 $out_file1.files_path $species $min_size $max_size $min_species_per_block $exclude_incomplete_blocks ${input1.metadata.species} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -#set $is_isnot_valid = {"==":"==", "!=":"!=", "in":"in", "not in":"not in"} -def maf_block_pass_filter( maf_block ): -#for $maf_filter in $maf_filters: -#if $len( $maf_filter['species1_attributes']['filter_condition'] ) == 0: -#continue -#end if - primary_component = maf_block.get_component_by_src_start( """$maf_filter['species1'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ) - if primary_component is not None: -#if $maf_filter['species1_attributes']['species1_attribute_type'] == 'attribute_chr': - if primary_component.src.split( "." )[-1] $is_isnot_valid.get( $maf_filter['species1_attributes']['species1_is_isnot'].value.strip(), 'is in' ) """$maf_filter['species1_attributes']['species1_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ).split( "," ): -#else - if primary_component.strand $is_isnot_valid.get( $maf_filter['species1_attributes']['species1_is_isnot'].value.strip(), '==' ) """$maf_filter['species1_attributes']['species1_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ): -#end if -#for $filter_condition in $maf_filter['species1_attributes']['filter_condition']: - secondary_component = maf_block.get_component_by_src_start( """$filter_condition['species2'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ) -#if $filter_condition['species2_attributes']['species2_attribute_type'] == 'attribute_chr': - if secondary_component is not None: - if not ( secondary_component.src.split( "." )[-1] $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), 'is in' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ).split( "," ) ): - return False -#else: - if secondary_component is not None: - if not ( secondary_component.strand $is_isnot_valid.get( $filter_condition['species2_attributes']['species2_is_isnot'].value.strip(), '==' ) """$filter_condition['species2_attributes']['species2_attribute'].value.encode( 'string_escape' )""".decode( 'string_escape' ) ): - return False -#end if -#end for -#end for - return True -ret_val = maf_block_pass_filter( maf_block ) - - - - - - - -This tool allows you to build complex filters to be applied to each alignment block of a MAF file. You can define restraints on species based upon chromosome and strand. You can specify comma separated lists of chromosomes where appropriate. - -.. class:: infomark - -For example, this tool is useful to restrict a set of alignments to only those blocks which contain alignments between chromosomes that are considered homologous. - ------ - -.. class:: warningmark - -If a species is not found in a particular block, all filters on that species are ignored. 
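In rough terms (a simplified, single-species flattening of the generated two-species conditions; all names are illustrative), each block is tested like this::

    # block: dict mapping a species name to its (chromosome, strand).
    # Each filter names a species, an optional set of allowed chromosomes,
    # and an optional required strand; filters on absent species are
    # ignored, per the warning above.
    def block_passes(block, filters):
        for f in filters:
            comp = block.get(f['species'])
            if comp is None:
                continue
            chrom, strand = comp
            if f.get('chroms') is not None and chrom not in f['chroms']:
                return False
            if f.get('strand') is not None and strand != f['strand']:
                return False
        return True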
- ------ - -This tool allows the user to remove any undesired species from a MAF file. If no species are specified then all species will be kept. If species are specified, columns which contain only gaps are removed. The options for this are: - - * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks, which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example. - - * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned. - ------ - -You can also provide a size range and limit your output to the MAF blocks which fall within the specified range. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_size.py --- a/tools/maf/maf_limit_size.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Removes blocks that fall outside of specified size range. -""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - input_maf_filename = sys.argv[1].strip() - output_filename1 = sys.argv[2].strip() - min_size = int( sys.argv[3].strip() ) - max_size = int( sys.argv[4].strip() ) - if max_size < 1: max_size = sys.maxint - maf_writer = bx.align.maf.Writer( open( output_filename1, 'w' ) ) - try: - maf_reader = bx.align.maf.Reader( open( input_maf_filename, 'r' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - - blocks_kept = 0 - i = 0 - for i, m in enumerate( maf_reader ): - if min_size <= m.text_size <= max_size: - maf_writer.write( m ) - blocks_kept += 1 - print 'Kept %s of %s blocks (%.2f%%).' % ( blocks_kept, i + 1, float( blocks_kept ) / float( i + 1 ) * 100.0 ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_size.xml --- a/tools/maf/maf_limit_size.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - by Size - maf_limit_size.py $input1 $out_file1 $min_size $max_size - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and a size range and extracts the MAF blocks which fall within the specified range. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_to_species.py --- a/tools/maf/maf_limit_to_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf file and write out a new maf with only blocks having the -required species, after dropping any other species and removing -columns containing only gaps. 
- -usage: %prog species,species2,... input_maf output_maf allow_partial min_species_per_block -""" -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - - species = maf_utilities.parse_species_option( sys.argv[1] ) - if species: - spec_len = len( species ) - else: - spec_len = 0 - try: - maf_reader = bx.align.maf.Reader( open( sys.argv[2],'r' ) ) - maf_writer = bx.align.maf.Writer( open( sys.argv[3],'w' ) ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - allow_partial = False - if int( sys.argv[4] ): allow_partial = True - min_species_per_block = int( sys.argv[5] ) - - maf_blocks_kept = 0 - for m in maf_reader: - if species: - m = m.limit_to_species( species ) - m.remove_all_gap_columns() - spec_in_block_len = len( maf_utilities.get_species_in_block( m ) ) - if ( not species or allow_partial or spec_in_block_len == spec_len ) and spec_in_block_len > min_species_per_block: - maf_writer.write( m ) - maf_blocks_kept += 1 - - maf_reader.close() - maf_writer.close() - - if species: - print "Restricted to species: %s." % ", ".join( species ) - print "%i MAF blocks have been kept." % maf_blocks_kept - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_limit_to_species.xml --- a/tools/maf/maf_limit_to_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ - - by Species - maf_limit_to_species.py $species $input1 $out_file1 $allow_partial $min_species - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows the user to remove any undesired species from a MAF file. Columns which contain only gaps are removed. The options for this tool are: - - * **Exclude blocks which have missing species** - suppose you want to restrict an 8-way alignment to human, mouse, and rat. The tool will first remove all other species. Next, if this option is set to **YES** the tool WILL NOT return MAF blocks, which do not include human, mouse, or rat. This means that all alignment blocks returned by the tool will have exactly three sequences in this example. - - * **Exclude blocks which have only one species** - if this option is set to **YES** all single sequence alignment blocks WILL NOT be returned. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_reverse_complement.py --- a/tools/maf/maf_reverse_complement.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" -Reads a MAF file. Produces a MAF file containing -the reverse complement for each block in the source file. 
- -usage: %prog input_maf_file output_maf_file -""" -#Dan Blankenberg -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf -from galaxy.tools.util import maf_utilities -import sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - #Parse Command Line - input_file = sys.argv.pop( 1 ) - output_file = sys.argv.pop( 1 ) - species = maf_utilities.parse_species_option( sys.argv.pop( 1 ) ) - - try: - maf_writer = bx.align.maf.Writer( open( output_file, 'w' ) ) - except: - print >>sys.stderr, "Unable to open output file" - sys.exit() - try: - count = 0 - for maf in bx.align.maf.Reader( open( input_file ) ): - maf = maf.reverse_complement() - if species: - maf = maf.limit_to_species( species ) - maf_writer.write( maf ) - count += 1 - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - print "%i blocks were reverse complemented." % count - maf_writer.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_reverse_complement.xml --- a/tools/maf/maf_reverse_complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ - - a MAF file - maf_reverse_complement.py $input1 $out_file1 $species - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and creates a new MAF file, where each block has been reverse complemented. - -**Example** - -This MAF Block:: - - a score=8157.000000 - s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG - -becomes:: - - a score=8157.000000 - s hg17.chr7 31156555 58 - 158628139 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAATGAATAAACCACAAATT - s panTro1.chr6 31691510 58 - 161576975 CCTCTTCCACTATAGACCTCCTTAAACAAAATAATGAAAAACGAATAAACCACAAATT - s mm5.chr6 120816549 54 - 149721531 CCTCTTCCACTGAGGAATTTCTTTTTTTAAATGATGAGCAATCAATGAAACG----TT - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_split_by_species.py --- a/tools/maf/maf_split_by_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and split blocks by unique species combinations -""" -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities -from galaxy.util import string_as_bool - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - maf_reader = maf.Reader( open( sys.argv[1] ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening MAF: %s" % e ) - try: - out = maf.Writer( open( sys.argv[2], "w" ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - try: - collapse_columns = string_as_bool( sys.argv[3] ) - except Exception, e: - maf_utilities.tool_fail( "Error determining collapse columns value: %s" % e ) - - start_count = 0 - end_count = 0 - for start_count, start_block in enumerate( maf_reader ): - for block in maf_utilities.iter_blocks_split_by_species( start_block ): - if collapse_columns: - block.remove_all_gap_columns() - out.write( block ) - end_count += 1 - out.close() - - if end_count: - print "%i alignment blocks created from %i original blocks." % ( end_count, start_count + 1 ) - else: - print "No alignment blocks were created." - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_split_by_species.xml --- a/tools/maf/maf_split_by_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ - - by Species - maf_split_by_species.py $input1 $out_file1 $collapse_columns - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool examines each MAF block for multiple occurrences of a species in a single block. When this occurs, a block is split into multiple blocks where every combination of one sequence per species per block is represented. - -The interface for this tool has two inputs: - - * **MAF file to split**. Choose multiple alignments from history to be split by species. - * **Collapse empty alignment columns**. Should alignment columns containing only gaps in the new blocks be removed?
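A rough sketch of the split-and-collapse loop at the heart of this tool, for running the same operation outside Galaxy (the file names are placeholders, and bx-python plus Galaxy's maf_utilities module are assumed to be importable)::

    import bx.align.maf
    from galaxy.tools.util import maf_utilities

    reader = bx.align.maf.Reader( open( "input.maf" ) )
    writer = bx.align.maf.Writer( open( "split.maf", "w" ) )
    for block in reader:
        # one output block per unique combination of one sequence per species
        for split_block in maf_utilities.iter_blocks_split_by_species( block ):
            # collapse gap-only columns, as when "Collapse empty alignment columns" is Yes
            split_block.remove_all_gap_columns()
            writer.write( split_block )
    writer.close()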
- ------ - -**Example 1**: **Collapse empty alignment columns is Yes**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that no columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT-GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 
ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC--GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC-GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC-GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGCAG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC---AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC---AG - ------ - -**Example 2**: **Collapse empty alignment columns is No**: - -For the following alignment:: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s 
species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -the tool will create **a single** history item containing 12 alignment blocks (notice that some columns contain only gaps):: - - ##maf version=1 - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 85 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723125 83 - 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCT--GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTCGTCCTCAG - s species3.chr3 
68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 85 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984545 83 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTT--GTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTCCTCAG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 + 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTT------AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - - a score=2047408.0 - s species1.chr1 147984645 79 - 245522847 ATGGCGTCGGCCTCCTCCGGGCCGTCGTC---GGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTTGTC---AG - s species2.chr1 129723925 79 + 229575298 ATGGCGTCGGCCTCCTCCGGGCCGTCGTCTTCGGTCGGTTTTTCATCCTTTGATCCCGCGGTCCCTTCCTGTACCTC------AG - s species3.chr3 68255714 76 - 258222147 ATGGCGTCCGCCTCCTCAGGGCCAGCGGC---GGCGGGGTTTTCACCCCTTGATTCCGGGGTCCCTGCCGGTACCGC------AG - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_stats.py --- a/tools/maf/maf_stats.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg -""" -Reads a list of intervals and a maf. Outputs a new set of intervals with statistics appended. 
-""" - -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.intervals.io -from bx.bitset import BitSet -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - maf_source_type = sys.argv.pop( 1 ) - input_maf_filename = sys.argv[1].strip() - input_interval_filename = sys.argv[2].strip() - output_filename = sys.argv[3].strip() - dbkey = sys.argv[4].strip() - try: - chr_col = int( sys.argv[5].strip() ) - 1 - start_col = int( sys.argv[6].strip() ) - 1 - end_col = int( sys.argv[7].strip() ) - 1 - except: - print >>sys.stderr, "You appear to be missing metadata. You can specify your metadata by clicking on the pencil icon associated with your interval file." - sys.exit() - summary = sys.argv[8].strip() - if summary.lower() == "true": summary = True - else: summary = False - - mafIndexFile = "%s/maf_index.loc" % sys.argv[9] - try: - maf_index_filename = sys.argv[10].strip() - except: - maf_index_filename = None - index = index_filename = None - if maf_source_type == "user": - #index maf for use here - index, index_filename = maf_utilities.open_or_build_maf_index( input_maf_filename, maf_index_filename, species = [dbkey] ) - if index is None: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - elif maf_source_type == "cached": - #access existing indexes - index = maf_utilities.maf_index_by_uid( input_maf_filename, mafIndexFile ) - if index is None: - print >> sys.stderr, "The MAF source specified (%s) appears to be invalid." % ( input_maf_filename ) - sys.exit() - else: - print >>sys.stdout, 'Invalid source type specified: %s' % maf_source_type - sys.exit() - - out = open(output_filename, 'w') - - num_region = None - species_summary = {} - total_length = 0 - #loop through interval file - for num_region, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( input_interval_filename, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ) ): - src = "%s.%s" % ( dbkey, region.chrom ) - region_length = region.end - region.start - total_length += region_length - coverage = { dbkey: BitSet( region_length ) } - - - for block in index.get_as_iterator( src, region.start, region.end ): - for spec in maf_utilities.get_species_in_block( block ): - if spec not in coverage: coverage[spec] = BitSet( region_length ) - for block in maf_utilities.iter_blocks_split_by_species( block ): - if maf_utilities.component_overlaps_region( block.get_component_by_src( src ), region ): - #need to chop and orient the block - block = maf_utilities.orient_block_by_region( maf_utilities.chop_block_by_region( block, src, region ), src, region, force_strand = '+' ) - start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start ) - for i in range( len( alignment[dbkey] ) ): - for spec, text in alignment.items(): - if text[i] != '-': - coverage[spec].set( start_offset + i ) - if summary: - #record summary - for key in coverage.keys(): - if key not in species_summary: species_summary[key] = 0 - species_summary[key] = species_summary[key] + coverage[key].count_range() - else: - #print coverage for interval - coverage_sum = coverage[dbkey].count_range() - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), dbkey, coverage_sum, region_length - coverage_sum ) ) - keys = coverage.keys() - keys.remove( dbkey ) - keys.sort() - for key in keys: - coverage_sum = 
coverage[key].count_range() - out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), key, coverage_sum, region_length - coverage_sum ) ) - if summary: - out.write( "#species\tnucleotides\tcoverage\n" ) - for spec in species_summary: - out.write( "%s\t%s\t%.4f\n" % ( spec, species_summary[spec], float( species_summary[spec] ) / total_length ) ) - out.close() - if num_region is not None: - print "%i regions were processed with a total length of %i." % ( num_region + 1, total_length ) - maf_utilities.remove_temp_index_file( index_filename ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_stats.xml --- a/tools/maf/maf_stats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ - - Alignment coverage information - - maf_stats.py - #if $maf_source_type.maf_source == "user": - $maf_source_type.maf_source $input2 $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary - #else: - $maf_source_type.maf_source $maf_source_type.mafType $input1 $out_file1 $dbkey ${input1.metadata.chromCol} ${input1.metadata.startCol} ${input1.metadata.endCol} $summary - #end if - ${GALAXY_DATA_INDEX_DIR} - #if $maf_source_type.maf_source == "user": - $input2.metadata.maf_index - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - numpy - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes a MAF file and an interval file and reports coverage information for each interval, by species. -If a column does not exist in the reference genome, it is not included in the output. - -Consider the interval: "chrX 1000 1100 myInterval" - Let's suppose we want to do stats on three-way alignments for H, M, and R. The results look like this: - - chrX 1000 1100 myInterval H XXX YYY - - chrX 1000 1100 myInterval M XXX YYY - - chrX 1000 1100 myInterval R XXX YYY - - - where XXX and YYY are: - - XXX = number of nucleotides - - YYY = number of gaps - ----- - -Alternatively, you can request only summary information for a set of intervals: - - ======== =========== ======== - #species nucleotides coverage - ======== =========== ======== - hg18 30639 0.2372 - rheMac2 7524 0.0582 - panTro2 30390 0.2353 - ======== =========== ======== - - where **coverage** is the number of nucleotides divided by the total length of the provided intervals. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_thread_for_species.py --- a/tools/maf/maf_thread_for_species.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf file and write out a new maf with only blocks having all of -the passed in species, after dropping any other species and removing columns -containing only gaps. This will attempt to fuse together any blocks -which are adjacent after the unwanted species have been dropped.
- -usage: %prog input_maf output_maf species1,species2 -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -from bx.align.tools.thread import * -from bx.align.tools.fuse import * - -def main(): - input_file = sys.argv.pop( 1 ) - output_file = sys.argv.pop( 1 ) - species = sys.argv.pop( 1 ).split( ',' ) - - try: - maf_reader = bx.align.maf.Reader( open( input_file ) ) - except: - print >> sys.stderr, "Unable to open source MAF file" - sys.exit() - try: - maf_writer = FusingAlignmentWriter( bx.align.maf.Writer( open( output_file, 'w' ) ) ) - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - try: - for m in maf_reader: - new_components = m.components - if species != ['None']: - new_components = get_components_for_species( m, species ) - if new_components: - remove_all_gap_columns( new_components ) - m.components = new_components - m.score = 0.0 - maf_writer.write( m ) - except Exception, e: - print >> sys.stderr, "Error stepping through MAF file: %s" % e - sys.exit() - maf_reader.close() - maf_writer.close() - - print "Restricted to species: %s." % ", ".join( species ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_thread_for_species.xml --- a/tools/maf/maf_thread_for_species.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - by Species - maf_thread_for_species.py $input1 $out_file1 $species - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows the user to merge blocks from a MAF file that are adjacent in each specified species. Columns which contain only gaps are removed. Species which are not desired are removed from the output.
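A minimal sketch of the same thread-and-fuse pipeline described above (file and species names are placeholders; the helper names come from the same bx.align.tools star imports the script above uses)::

    import bx.align.maf
    from bx.align.tools.thread import *
    from bx.align.tools.fuse import *

    reader = bx.align.maf.Reader( open( "input.maf" ) )
    # the fusing writer merges blocks that become adjacent once unwanted species are dropped
    writer = FusingAlignmentWriter( bx.align.maf.Writer( open( "threaded.maf", "w" ) ) )
    for block in reader:
        components = get_components_for_species( block, [ "hg17", "panTro1" ] )
        if components:
            remove_all_gap_columns( components )
            block.components = components
            block.score = 0.0  # the original score no longer applies to the filtered block
            writer.write( block )
    reader.close()
    writer.close()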
- -**Example** - -Specifying the desired species as hg17 and panTro1 with this MAF file:: - - ##maf version=1 - a score=60426.000000 - s hg17.chr7 127471195 331 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA - s panTro1.chr6 129885076 331 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCC-------------------------------AAATACT-GCCACTGATGTCCTG-----ATGGAGGTA-------TGAA-------------------AACATCCACTAA - s mm5.chr6 28904571 357 + 149721531 CTCCACTCTCGTTTGCTGTT----------------CTGTCACCATGGAAACAAA-CGAGGGTGGTCCAGTTACTATCTTGACTGCAGCTGGCAGTCAGTT-GCCACT-----CAGGAATAAGGCTATGCCATT-GATCCACTGAACCGTGATCTGGAAACCTGGCTGTTGTTT-------CAAGCCTTGGGGCCAGTTTGCGGTGTTACTCATGA--CTCTAAGATCGTGTGCTTG----CTGCAGGAAGAGACAGCAAGGGGGTTACATTTAAAAAGCCCCCAGTTTAGCTATAGGCAGGCCAACAGGTGTAAAAATACTCACTAGTAATGGGCTGAACTCATGGAGGTAGCATTAGTGAGACACTGTAACTGTTTTTTTAAAAATCACTAA - s rn3.chr4 56178191 282 + 187371129 CTTCACTCTCATTTGCTGTT----------------CTGTCACTATGGAGACAAACACAGGCTAGCCCAGTTACTATCTTGATCACAGCAGCT-GTCAGCTAGCTGCCACTCACAGGAATAAGGCCATACCATT-GATCCACTGAACCTTGATCTAGGAATTTGGC----------------------TGGGGCCAGTTTGCGGTGTCACTCATGA--CTCTAAGATTGTGTGTTTG----CTCCAGGAAGAGACGGCAAGAGGATTACCTTTAAAAGGTTC---------------------------------GGAGTCTAGCTGTAGACAGCCCA-----ATG--GGTA-------TAAC-------------------AATACTCACTAA - - a score=8157.000000 - s hg17.chr7 127471526 58 + 158628139 AATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885407 58 + 161576975 AATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s mm5.chr6 28904928 54 + 149721531 AA----CGTTTCATTGATTGCTCATCATTTAAAAAAAGAAATTCCTCAGTGGAAGAGG - -results in:: - - ##maf version=1 - a score=0.0 - s hg17.chr7 127471195 389 + 158628139 gtttgccatcttttgctgctctagggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTAAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCATTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - s panTro1.chr6 129885076 389 + 161576975 gtttgccatcttttgctgctcttgggaatccagcagctgtcaccatgtaaacaagcccaggctagaccaGTTACCCTCATCATCTTAGCTGATAGCCAGCCAGCCACCACAGGCAtgagtcaggccatattgctggacccacagaattatgagctaaataaatagtcttgggttaagccactaagttttaggcatagtgtgttatgtaTCTCACAAACATATAAGACTGTGTGTTTGTTGACTGGAGGAAGAGATGCTATAAAGACCACCTTTTGAAACTTCCCAAATACTGCCACTGATGTCCTGATGGAGGTATGAAAACATCCACTAAAATTTGTGGTTTATTCGTTTTTCATTATTTTGTTTAAGGAGGTCTATAGTGGAAGAGG - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed.py --- a/tools/maf/maf_to_bed.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output intervals for specified list of species. -""" -import sys, os, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - - input_filename = sys.argv[1] - output_filename = sys.argv[2] - #where to store files that become additional output - database_tmp_dir = sys.argv[5] - - species = sys.argv[3].split(',') - partial = sys.argv[4] - out_files = {} - primary_spec = None - - if "None" in species: - species = {} - try: - for i, m in enumerate( maf.Reader( open( input_filename, 'r' ) ) ): - for c in m.components: - spec,chrom = maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - species[spec] = "" - species = species.keys() - except: - print >>sys.stderr, "Invalid MAF file specified" - return - - if "?" in species: - print >>sys.stderr, "Invalid dbkey specified" - return - - - for i in range( 0, len( species ) ): - spec = species[i] - if i == 0: - out_files[spec] = open( output_filename, 'w' ) - primary_spec = spec - else: - out_files[spec] = tempfile.NamedTemporaryFile( mode = 'w', dir = database_tmp_dir, suffix = '.maf_to_bed' ) - filename = out_files[spec].name - out_files[spec].close() - out_files[spec] = open( filename, 'w' ) - num_species = len( species ) - - print "Restricted to species:", ",".join( species ) - - file_in = open( input_filename, 'r' ) - maf_reader = maf.Reader( file_in ) - - block_num = -1 - - for i, m in enumerate( maf_reader ): - block_num += 1 - if "None" not in species: - m = m.limit_to_species( species ) - l = m.components - if len(l) < num_species and partial == "partial_disallowed": continue - for c in l: - spec,chrom = maf.src_split( c.src ) - if not spec or not chrom: - spec = chrom = c.src - if spec not in out_files.keys(): - out_files[spec] = tempfile.NamedTemporaryFile( mode='w', dir = database_tmp_dir, suffix = '.maf_to_bed' ) - filename = out_files[spec].name - out_files[spec].close() - out_files[spec] = open( filename, 'w' ) - - if c.strand == "-": - out_files[spec].write( chrom + "\t" + str( c.src_size - c.end ) + "\t" + str( c.src_size - c.start ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" ) - else: - out_files[spec].write( chrom + "\t" + str( c.start ) + "\t" + str( c.end ) + "\t" + spec + "_" + str( block_num ) + "\t" + "0\t" + c.strand + "\n" ) - - file_in.close() - for file_out in out_files.keys(): - out_files[file_out].close() - - for spec in out_files.keys(): - if spec != primary_spec: - print "#FILE\t" + spec + "\t" + os.path.join( database_tmp_dir, os.path.split( out_files[spec].name )[1] ) - else: - print "#FILE1\t" + spec + "\t" + out_files[spec].name - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed.xml --- a/tools/maf/maf_to_bed.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ - - Converts a MAF formatted file to the BED format - maf_to_bed.py $input1 $out_file1 $species $complete_blocks $__new_file_path__ - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts every MAF block to an interval line (in BED format; scroll down for description of MAF 
and BED formats) describing the position of that alignment block within a corresponding genome. - -The interface for this tool contains two pages (steps): - - * **Step 1 of 2**. Choose multiple alignments from history to be converted to BED format. - * **Step 2 of 2**. Choose species from the alignment to be included in the output and specify how to deal with alignment blocks that lack one or more species: - - * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (only the reference genome, shown in **bold**, is selected by default). If you select more than one species, then more than one history item will be created. - * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within the **Choose species** menu and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below). - - ------ - -**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **a single** history item containing the following (**note** that field 4 is added to the output and is numbered iteratively: hg18_0, hg18_1 etc.):: - - chr20 56827368 56827443 hg18_0 0 + - chr20 56827443 56827480 hg18_1 0 + - ------ - -**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **two** history items (one for hg18 and one for mm8) containing the
following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8): - -History item **1** (for hg18):: - - chr20 56827368 56827443 hg18_0 0 + - -History item **2** (for mm8):: - - chr2 173910832 173910893 mm8_0 0 + - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and a number of additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -Additional (optional) fields are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_bed_code.py --- a/tools/maf/maf_to_bed_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy import datatypes, config, jobs -from shutil import move - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - output_data = out_data.items()[0][1] - history = output_data.history - if history == None: - print "unknown history!" 
- return - new_stdout = "" - split_stdout = stdout.split("\n") - basic_name = output_data.name - output_data_list = [] - for line in split_stdout: - if line.startswith("#FILE1"): - fields = line.split("\t") - dbkey = fields[1] - filepath = fields[2] - output_data.dbkey = dbkey - output_data.name = basic_name + " (" + dbkey + ")" - app.model.context.add( output_data ) - app.model.context.flush() - output_data_list.append(output_data) - elif line.startswith("#FILE"): - fields = line.split("\t") - dbkey = fields[1] - filepath = fields[2] - newdata = app.model.HistoryDatasetAssociation( create_dataset = True, sa_session = app.model.context ) - newdata.set_size() - newdata.extension = "bed" - newdata.name = basic_name + " (" + dbkey + ")" - app.model.context.add( newdata ) - app.model.context.flush() - history.add_dataset( newdata ) - app.security_agent.copy_dataset_permissions( output_data.dataset, newdata.dataset ) - app.model.context.add( history ) - app.model.context.flush() - try: - move(filepath,newdata.file_name) - newdata.info = newdata.name - newdata.state = newdata.states.OK - except: - newdata.info = "The requested file is missing from the system." - newdata.state = newdata.states.ERROR - newdata.dbkey = dbkey - newdata.init_meta() - newdata.set_meta() - newdata.set_peek() - app.model.context.flush() - output_data_list.append(newdata) - else: - new_stdout = new_stdout + line - for data in output_data_list: - if data.state == data.states.OK: - data.info = new_stdout - app.model.context.add( data ) - app.model.context.flush() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta.xml --- a/tools/maf/maf_to_fasta.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,199 +0,0 @@ - - Converts a MAF formatted file to FASTA format - - #if $fasta_target_type.fasta_type == "multiple" #maf_to_fasta_multiple_sets.py $input1 $out_file1 $fasta_target_type.species $fasta_target_type.complete_blocks - #else #maf_to_fasta_concat.py $fasta_target_type.species $input1 $out_file1 - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Types of MAF to FASTA conversion** - - * **Multiple Blocks** converts a single MAF block to a single FASTA block. For example, if you have 6 MAF blocks, they will be converted to 6 FASTA blocks. - * **One Sequence per Species** converts MAF blocks to a single aggregated FASTA block. For example, if you have 6 MAF blocks, they will be converted and concatenated into a single FASTA block. - -------- - -**What it does** - -This tool converts MAF blocks to FASTA format and concatenates them into a single FASTA block or outputs multiple FASTA blocks separated by empty lines. - -The interface for this tool contains two pages (steps): - - * **Step 1 of 2**. Choose multiple alignments from history to be converted to FASTA format. - * **Step 2 of 2**. Choose the type of output as well as the species from the alignment to be included in the output. - - Multiple Block output has additional options: - - * **Choose species** - the tool reads the alignment provided during Step 1 and generates a list of species contained within that alignment. Using checkboxes you can specify taxa to be included in the output (all species are selected by default). 
- * **Choose to include/exclude blocks with missing species** - if an alignment block does not contain any one of the species you selected within **Choose species** menu and this option is set to **exclude blocks with missing species**, then such a block **will not** be included in the output (see **Example 2** below). For example, if you want to extract human, mouse, and rat from a series of alignments and one of the blocks does not contain mouse sequence, then this block will not be converted to FASTA and will not be returned. - - ------ - -**Example 1**: - -In the concatenated approach, the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to (**note** that because mm8 (mouse) and canFam2 (dog) are absent from the second block, they are replaced with gaps after concatenation):: - - >canFam2 - CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C------------------------------------- - >hg18 - GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >mm8 - AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC-------------------------------------------- - >panTro2 - GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC-ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >rheMac2 - GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA-------ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - ------- - -**Example 2a**: Multiple Block Approach **Include all species** and **include blocks with missing species**: - -The following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to:: - - >hg18.chr20(+):56827368-56827443|hg18_0 - GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - 
>panTro2.chr20(+):56528685-56528760|panTro2_0 - GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - >rheMac2.chr10(-):89144112-89144181|rheMac2_0 - GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - >mm8.chr2(+):173910832-173910893|mm8_0 - AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - >canFam2.chr24(+):46551822-46551889|canFam2_0 - CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - >hg18.chr20(+):56827443-56827480|hg18_1 - ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >panTro2.chr20(+):56528760-56528797|panTro2_1 - ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - >rheMac2.chr10(-):89144181-89144218|rheMac2_1 - ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - ------ - -**Example 2b**: Multiple Block Approach **Include hg18 and mm8** and **exclude blocks with missing species**: - -The following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -will be converted to (**note** that the second MAF block, which does not have mm8, is not included in the output):: - - >hg18.chr20(+):56827368-56827443|hg18_0 - GACAGGGTGCATCTGGGAGGGCCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC - >mm8.chr2(+):173910832-173910893|mm8_0 - AGAAGGATCCACCT---------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------ - ------- - -.. class:: infomark - -**About formats** - - **MAF format** multiple alignment format file. This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. 
<http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta_concat.py --- a/tools/maf/maf_to_fasta_concat.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output a single block fasta file, concatenating blocks - -usage: %prog species1,species2 maf_file out_file -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - species = maf_utilities.parse_species_option( sys.argv[1] ) - except Exception, e: - maf_utilities.tool_fail( "Error determining species value: %s" % e ) - try: - input_filename = sys.argv[2] - except Exception, e: - maf_utilities.tool_fail( "Error reading MAF filename: %s" % e ) - try: - file_out = open( sys.argv[3], 'w' ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - - if species: - print "Restricted to species: %s" % ', '.join( species ) - else: - print "Not restricted to species." - - if not species: - try: - species = maf_utilities.get_species_in_maf( input_filename ) - except Exception, e: - maf_utilities.tool_fail( "Error determining species in input MAF: %s" % e ) - - for spec in species: - file_out.write( ">" + spec + "\n" ) - try: - for start_block in maf.Reader( open( input_filename, 'r' ) ): - for block in maf_utilities.iter_blocks_split_by_species( start_block ): - block.remove_all_gap_columns() #remove extra gaps - component = block.get_component_by_src_start( spec ) #blocks only have one occurrence of a particular species, so this is safe - if component: - file_out.write( component.text ) - else: - file_out.write( "-" * block.text_size ) - except Exception, e: - maf_utilities.tool_fail( "Your MAF file appears to be malformed: %s" % e ) - file_out.write( "\n" ) - file_out.close() - - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_fasta_multiple_sets.py --- a/tools/maf/maf_to_fasta_multiple_sets.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output a multiple block fasta file. -""" -#Dan Blankenberg -import sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - try: - maf_reader = maf.Reader( open( sys.argv[1] ) ) - except Exception, e: - maf_utilities.tool_fail( "Error opening input MAF: %s" % e ) - try: - file_out = open( sys.argv[2], 'w' ) - except Exception, e: - maf_utilities.tool_fail( "Error opening file for output: %s" % e ) - try: - species = maf_utilities.parse_species_option( sys.argv[3] ) - if species: - num_species = len( species ) - else: - num_species = 0 - except Exception, e: - maf_utilities.tool_fail( "Error determining species value: %s" % e ) - try: - partial = sys.argv[4] - except Exception, e: - maf_utilities.tool_fail( "Error determining keep partial value: %s" % e ) - - if species: - print "Restricted to species: %s" % ', '.join( species ) - else: - print "Not restricted to species."
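- # main loop: limit each block to the chosen species, skip blocks missing a requested species when partial output is disallowed, and write one FASTA record per component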
- - for block_num, block in enumerate( maf_reader ): - if species: - block = block.limit_to_species( species ) - if len( maf_utilities.get_species_in_block( block ) ) < num_species and partial == "partial_disallowed": continue - spec_counts = {} - for component in block.components: - spec, chrom = maf_utilities.src_split( component.src ) - if spec not in spec_counts: - spec_counts[ spec ] = 0 - else: - spec_counts[ spec ] += 1 - file_out.write( "%s\n" % maf_utilities.get_fasta_header( component, { 'block_index' : block_num, 'species' : spec, 'sequence_index' : spec_counts[ spec ] }, suffix = "%s_%i_%i" % ( spec, block_num, spec_counts[ spec ] ) ) ) - file_out.write( "%s\n" % component.text ) - file_out.write( "\n" ) - file_out.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_interval.py --- a/tools/maf/maf_to_interval.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -""" -Read a maf and output intervals for specified list of species. -""" -import sys, os -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.align import maf -from galaxy.tools.util import maf_utilities - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] - output_id = sys.argv[3] - #where to store files that become additional output - database_tmp_dir = sys.argv[4] - primary_spec = sys.argv[5] - species = sys.argv[6].split( ',' ) - all_species = sys.argv[7].split( ',' ) - partial = sys.argv[8] - keep_gaps = sys.argv[9] - out_files = {} - - if "None" in species: - species = [] - - if primary_spec not in species: - species.append( primary_spec ) - if primary_spec not in all_species: - all_species.append( primary_spec ) - - all_species.sort() - for spec in species: - if spec == primary_spec: - out_files[ spec ] = open( output_filename, 'wb+' ) - else: - out_files[ spec ] = open( os.path.join( database_tmp_dir, 'primary_%s_%s_visible_interval_%s' % ( output_id, spec, spec ) ), 'wb+' ) - out_files[ spec ].write( '#chrom\tstart\tend\tstrand\tscore\tname\t%s\n' % ( '\t'.join( all_species ) ) ) - num_species = len( all_species ) - - file_in = open( input_filename, 'r' ) - maf_reader = maf.Reader( file_in ) - - for i, m in enumerate( maf_reader ): - for j, block in enumerate( maf_utilities.iter_blocks_split_by_species( m ) ): - if len( block.components ) < num_species and partial == "partial_disallowed": continue - sequences = {} - for c in block.components: - spec, chrom = maf_utilities.src_split( c.src ) - if keep_gaps == 'remove_gaps': - sequences[ spec ] = c.text.replace( '-', '' ) - else: - sequences[ spec ] = c.text - sequences = '\t'.join( [ sequences.get( spec, '' ) for spec in all_species ] ) - for spec in species: - c = block.get_component_by_src_start( spec ) - if c is not None: - spec2, chrom = maf_utilities.src_split( c.src ) - assert spec2 == spec, Exception( 'Species name inconsistency found in component: %s != %s' % ( spec, spec2 ) ) - out_files[ spec ].write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( chrom, c.forward_strand_start, c.forward_strand_end, c.strand, m.score, "%s_%s_%s" % ( spec, i, j ), sequences ) ) - file_in.close() - for file_out in out_files.values(): - file_out.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/maf_to_interval.xml --- a/tools/maf/maf_to_interval.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00
1970 +0000 @@ -1,133 +0,0 @@ - - Converts a MAF formatted file to the Interval format - maf_to_interval.py $input1 $out_file1 $out_file1.id $__new_file_path__ $input1.dbkey $species $input1.metadata.species $complete_blocks $remove_gaps - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts every MAF block to a set of genomic intervals describing the position of that alignment block within a corresponding genome. Sequences from aligning species are also included in the output. - -The interface for this tool contains several options: - - * **MAF file to convert**. Choose multiple alignments from history to be converted to the interval format. - * **Choose species**. Choose additional species from the alignment to be included in the output. - * **Exclude blocks which have a species missing**. If an alignment block does not contain any one of the species found in the alignment set and this option is set to **exclude blocks with missing species**, then coordinates of such a block **will not** be included in the output (see **Example 2** below). - * **Remove Gap characters from sequences**. Gaps can be removed from sequences before they are output. - - ------ - -**Example 1**: **Include only reference genome** (hg18 in this case) and **include blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **a single** history item containing the following (**note** the name field is numbered iteratively: hg18_0_0, hg18_1_0 etc.
where the first number is the block number and the second number is the iteration through the block (if a species appears twice in a block, that interval will be repeated) and sequences for each species are included in the order specified in the header: the field is left empty when no sequence is available for that species):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr20 56827368 56827443 + 68686.0 hg18_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - chr20 56827443 56827480 + 10289.0 hg18_1_0 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - - ------ - -**Example 2**: **Include hg18 and mm8** and **exclude blocks with missing species**: - -For the following alignment:: - - ##maf version=1 - a score=68686.000000 - s hg18.chr20 56827368 75 + 62435964 GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s panTro2.chr20 56528685 75 + 62293572 GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- - s rheMac2.chr10 89144112 69 - 94855758 GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - s mm8.chr2 173910832 61 + 181976762 AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- - s canFam2.chr24 46551822 67 + 50763139 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C - - a score=10289.000000 - s hg18.chr20 56827443 37 + 62435964 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s panTro2.chr20 56528760 37 + 62293572 ATGTGCAGAAAATGTGATACAGAAACCTGCAGAGCAG - s rheMac2.chr10 89144181 37 - 94855758 ATGTGCGGAAAATGTGATACAGAAACCTGCAGAGCAG - -the tool will create **two** history items (one for hg18 and one for mm8) containing the following (**note** that both history items contain only one line describing the first alignment block. The second MAF block is not included in the output because it does not contain mm8): - -History item **1** (for hg18):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr20 56827368 56827443 + 68686.0 hg18_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - - -History item **2** (for mm8):: - - #chrom start end strand score name canFam2 hg18 mm8 panTro2 rheMac2 - chr2 173910832 173910893 + 68686.0 mm8_0_0 CG------GCGTCTGTAAGGGGCCACCGCCCGGCCTGTG-CTCAAAGCTACAAATGACTCAACTCCCAACCGA------C GACAGGGTGCATCTGGGAGGG---CCTGCCGGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- AGAAGGATCCACCT------------TGCTGGGCCTCTGCTCCAGCAAGACCCACCTCCCAACTCAAATGCCC------- GACAGGGTGCATCTGAGAGGG---CCTGCCAGGCCTTTA-TTCAACACTAGATACGCCCCATCTCCAATTCTAATGGAC- GACAGGGTGCATCTGAGAGGG---CCTGCTGGGCCTTTG-TTCAAAACTAGATATGCCCCAACTCCAATTCTA------- - - -------- - -.. class:: infomark - -**About formats** - -**MAF format** multiple alignment format file. 
This format stores multiple alignments at the DNA level between entire genomes. - - - The .maf format is line-oriented. Each multiple alignment ends with a blank line. - - Each sequence in an alignment is on a single line. - - Lines starting with # are considered to be comments. - - Each multiple alignment is in a separate paragraph that begins with an "a" line and contains an "s" line for each sequence in the multiple alignment. - - Some MAF files may contain two optional line types: - - - An "i" line containing information about what is in the aligned species DNA before and after the immediately preceding "s" line; - - An "e" line containing information about the size of the gap between the alignments that span the current block. - ------- - -**Citation** - -If you use this tool, please cite `Blankenberg D, Taylor J, Nekrutenko A; The Galaxy Team. Making whole genome multiple alignments usable for biologists. Bioinformatics. 2011 Sep 1;27(17):2426-2428. <http://www.ncbi.nlm.nih.gov/pubmed/21775304>`_ - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/maf/vcf_to_maf_customtrack.py --- a/tools/maf/vcf_to_maf_customtrack.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -#Dan Blankenberg -from optparse import OptionParser -import sys -import galaxy_utils.sequence.vcf - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -import bx.align.maf - -UNKNOWN_NUCLEOTIDE = '*' - -class PopulationVCFParser( object ): - def __init__( self, reader, name ): - self.reader = reader - self.name = name - self.counter = 0 - def next( self ): - rval = [] - vc = self.reader.next() - for i, allele in enumerate( vc.alt ): - rval.append( ( '%s_%i.%i' % ( self.name, i + 1, self.counter + 1 ), allele ) ) - self.counter += 1 - return ( vc, rval ) - def __iter__( self ): - while True: - yield self.next() - -class SampleVCFParser( object ): - def __init__( self, reader ): - self.reader = reader - self.counter = 0 - def next( self ): - rval = [] - vc = self.reader.next() - alleles = [ vc.ref ] + vc.alt - - if 'GT' in vc.format: - gt_index = vc.format.index( 'GT' ) - for sample_name, sample_value in zip( vc.sample_names, vc.sample_values ): - gt_indexes = [] - for i in sample_value[ gt_index ].replace( '|', '/' ).replace( '\\', '/' ).split( '/' ): #Do we need to consider phase here? 
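- # non-numeric genotype calls (e.g. '.') are appended as None and skipped below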
- try: - gt_indexes.append( int( i ) ) - except: - gt_indexes.append( None ) - for i, allele_i in enumerate( gt_indexes ): - if allele_i is not None: - rval.append( ( '%s_%i.%i' % ( sample_name, i + 1, self.counter + 1 ), alleles[ allele_i ] ) ) - self.counter += 1 - return ( vc, rval ) - def __iter__( self ): - while True: - yield self.next() - -def main(): - usage = "usage: %prog [options] output_file dbkey inputfile pop_name" - parser = OptionParser( usage=usage ) - parser.add_option( "-p", "--population", action="store_true", dest="population", default=False, help="Create MAF on a per population basis") - parser.add_option( "-s", "--sample", action="store_true", dest="sample", default=False, help="Create MAF on a per sample basis") - parser.add_option( "-n", "--name", dest="name", default='Unknown Custom Track', help="Name for Custom Track") - parser.add_option( "-g", "--galaxy", action="store_true", dest="galaxy", default=False, help="Tool is being executed by Galaxy (adds extra error messaging).") - - - ( options, args ) = parser.parse_args() - - if len ( args ) < 3: - if options.galaxy: - print >>sys.stderr, "It appears that you forgot to specify an input VCF file, click 'Add new VCF...' to add at least one input.\n" - parser.error( "Need to specify an output file, a dbkey and at least one input file" ) - - if not ( options.population ^ options.sample ): - parser.error( 'You must specify either a per population conversion or a per sample conversion, but not both' ) - - out = open( args.pop(0), 'wb' ) - out.write( 'track name="%s" visibility=pack\n' % options.name.replace( "\"", "'" ) ) - - maf_writer = bx.align.maf.Writer( out ) - - dbkey = args.pop(0) - - vcf_files = [] - if options.population: - i = 0 - while args: - filename = args.pop( 0 ) - pop_name = args.pop( 0 ).replace( ' ', '_' ) - if not pop_name: - pop_name = 'population_%i' % ( i + 1 ) - vcf_files.append( PopulationVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ), pop_name ) ) - i += 1 - else: - while args: - filename = args.pop( 0 ) - vcf_files.append( SampleVCFParser( galaxy_utils.sequence.vcf.Reader( open( filename ) ) ) ) - - non_spec_skipped = 0 - for vcf_file in vcf_files: - for vc, variants in vcf_file: - num_ins = 0 - num_dels = 0 - for variant_name, variant_text in variants: - if 'D' in variant_text: - num_dels = max( num_dels, int( variant_text[1:] ) ) - elif 'I' in variant_text: - num_ins = max( num_ins, len( variant_text ) - 1 ) - - alignment = bx.align.maf.Alignment() - ref_text = vc.ref + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) - start_pos = vc.pos - 1 - if num_dels and start_pos: - ref_text = UNKNOWN_NUCLEOTIDE + ref_text - start_pos -= 1 - alignment.add_component( bx.align.maf.Component( src='%s.%s%s' % ( - dbkey, ("chr" if not vc.chrom.startswith("chr") else ""), vc.chrom ), - start = start_pos, size = len( ref_text.replace( '-', '' ) ), - strand = '+', src_size = start_pos + len( ref_text ), - text = ref_text ) ) - for variant_name, variant_text in variants: - #FIXME: - ## skip non-spec. compliant data, see: http://1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 for format spec - ## this check is due to data having indels not represented in the published format spec, - ## e.g.
1000 genomes pilot 1 indel data: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/pilot_data/release/2010_03/pilot1/indels/CEU.SRP000031.2010_03.indels.sites.vcf.gz - if variant_text and variant_text[0] in [ '-', '+' ]: - non_spec_skipped += 1 - continue - - #do we need a left padding unknown nucleotide (do we have deletions)? - if num_dels and start_pos: - var_text = UNKNOWN_NUCLEOTIDE - else: - var_text = '' - if 'D' in variant_text: - cur_num_del = int( variant_text[1:] ) - pre_del = min( len( vc.ref ), cur_num_del ) - post_del = cur_num_del - pre_del - var_text = var_text + '-' * pre_del + '-' * num_ins + '-' * post_del - var_text = var_text + UNKNOWN_NUCLEOTIDE * ( len( ref_text ) - len( var_text ) ) - elif 'I' in variant_text: - cur_num_ins = len( variant_text ) - 1 - var_text = var_text + vc.ref + variant_text[1:] + '-' * ( num_ins - cur_num_ins ) + UNKNOWN_NUCLEOTIDE * max( 0, ( num_dels - 1 ) ) - else: - var_text = var_text + variant_text + '-' * num_ins + UNKNOWN_NUCLEOTIDE * ( num_dels - len( vc.ref ) ) - alignment.add_component( bx.align.maf.Component( src=variant_name, start = 0, size = len( var_text.replace( '-', '' ) ), strand = '+', src_size = len( var_text.replace( '-', '' ) ), text = var_text ) ) - maf_writer.write( alignment ) - - maf_writer.close() - - if non_spec_skipped: - print 'Skipped %i non-specification compliant indels.' % non_spec_skipped - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/maf/vcf_to_maf_customtrack.xml --- a/tools/maf/vcf_to_maf_customtrack.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - for display at UCSC - vcf_to_maf_customtrack.py '$out_file1' - #if $vcf_source_type.vcf_file - '${vcf_source_type.vcf_file[0].vcf_input.dbkey}' - #else - '?' - #end if - ${vcf_source_type.vcf_source} -n '$track_name' - #for $vcf_repeat in $vcf_source_type.vcf_file - '${vcf_repeat.vcf_input}' - #if $vcf_source_type.vcf_source == '-p' - '${vcf_repeat.population_name}' - #end if - #end for - -g - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool converts a Variant Call Format (VCF) file into a Multiple Alignment Format (MAF) custom track file suitable for display at genome browsers. - -This file should be used for display purposes only (e.g. as a UCSC Custom Track). Performing an analysis using the output created by this tool as input is not recommended; the source VCF file should be used when performing an analysis. - -*Unknown nucleotides* are represented as '*' as required to allow the display to draw properly; these include e.g. reference bases which appear before a deletion and are not available without querying the original reference sequence.
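For orientation, the command line built for the example below would look as follows (population mode; file names are hypothetical, options and argument order as in main() above)::

    python vcf_to_maf_customtrack.py -p -n 'Galaxy Custom Track' out_track.maf hg18 input.vcf CHB+JPT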
- -**Example** - -Starting with a VCF:: - - ##fileformat=VCFv3.3 - ##fileDate=20090805 - ##source=myImputationProgramV3.1 - ##reference=1000GenomesPilot-NCBI36 - ##phasing=partial - ##INFO=NS,1,Integer,"Number of Samples With Data" - ##INFO=DP,1,Integer,"Total Depth" - ##INFO=AF,-1,Float,"Allele Frequency" - ##INFO=AA,1,String,"Ancestral Allele" - ##INFO=DB,0,Flag,"dbSNP membership, build 129" - ##INFO=H2,0,Flag,"HapMap2 membership" - ##FILTER=q10,"Quality below 10" - ##FILTER=s50,"Less than 50% of samples have data" - ##FORMAT=GT,1,String,"Genotype" - ##FORMAT=GQ,1,Integer,"Genotype Quality" - ##FORMAT=DP,1,Integer,"Read Depth" - ##FORMAT=HQ,2,Integer,"Haplotype Quality" - #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 - 20 14370 rs6054257 G A 29 0 NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:-1,-1 - 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:-1,-1 - 20 1110696 rs6040355 A G,T 67 0 NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:-1,-1 - 20 1230237 . T . 47 0 NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:-1,-1 - 20 1234567 microsat1 G D4,IGA 50 0 NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 - - - - -Under the following conditions: **VCF Source type:** *Per Population (file)*, **Name for this population:** *CHB+JPT* -Results in the following MAF custom track:: - - track name="Galaxy Custom Track" visibility=pack - ##maf version=1 - a score=0 - s hg18.chr20 14369 1 + 14370 G - s CHB+JPT_1.1 0 1 + 1 A - - a score=0 - s hg18.chr20 17329 1 + 17330 T - s CHB+JPT_1.2 0 1 + 1 A - - a score=0 - s hg18.chr20 1110695 1 + 1110696 A - s CHB+JPT_1.3 0 1 + 1 G - s CHB+JPT_2.3 0 1 + 1 T - - a score=0 - s hg18.chr20 1230236 1 + 1230237 T - s CHB+JPT_1.4 0 1 + 1 . 
- - a score=0 - s hg18.chr20 1234565 5 + 1234572 *G--*** - s CHB+JPT_1.5 0 1 + 1 *------ - s CHB+JPT_2.5 0 7 + 7 *GGA*** - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/meme/._meme.xml Binary file tools/meme/._meme.xml has changed diff -r c2a356708570 -r 33c067c3ae34 tools/meme/fimo.xml --- a/tools/meme/fimo.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ - - - Find Individual Motif Occurrences - fimo_wrapper.py 'fimo --o "${$html_outfile.files_path}" --verbosity "1" - - #if str( $options_type.options_type_selector ) == 'advanced': - --max-seq-length "${options_type.max_seq_length}" - --max-stored-scores "${options_type.max_stored_scores }" - --motif-pseudo "${options_type.motif_pseudo}" - ${options_type.norc} - --output-pthresh "${options_type.output_pthresh}" - - - #for $motif in $options_type.motifs: - --motif "${motif.motif}" - #end for - - #if str( $options_type.bgfile_type.bgfile_type_selector ) == 'motif-file': - --bgfile "motif-file" - #elif str( $options_type.bgfile_type.bgfile_type_selector ) == 'motif-file': - --bgfile "${options_type.bgfile_type.bgfile}" - #end if - - #if str( $options_type.qvalue_type.qvalue_type_selector ) == 'no-qvalue': - --no-qvalue - #else: - --output-qthresh "${options_type.qvalue_type.output_qthresh}" - #end if - #end if - - "${input_motifs}" - - #if str( $fasta_type.fasta_type_selector ) == 'history': - "${fasta_type.input_database}" - #else: - "${ filter( lambda x: str( x[0] ) == str( $fasta_type.input_database ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][3] }" - #end if - - ' - - '${html_outfile.files_path}' - - '${html_outfile}' - - '${interval_outfile}' - - '${txt_outfile}' - - '${xml_outfile}' - - '${gff_outfile}' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value == True - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -.. class:: infomark - -**To cite FIMO:** -`Grant CE, Bailey TL, Noble WS. FIMO: scanning for occurrences of a given motif. Bioinformatics. 2011 Apr 1;27(7):1017-8. <http://www.ncbi.nlm.nih.gov/pubmed/21330290>`_ - - -For detailed information on FIMO, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/fimo-intro.html -.. _license: http://meme.nbcr.net/meme/COPYRIGHT.html - - - diff -r c2a356708570 -r 33c067c3ae34 tools/meme/fimo_wrapper.py --- a/tools/meme/fimo_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -Read text output from FIMO and create an interval file. 
-""" -import sys, tempfile, subprocess, shutil, os -from galaxy_utils.sequence.transform import DNA_reverse_complement - -buffsize = 1048576 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - assert len( sys.argv ) == 8, "Wrong number of arguments" - sys.argv.pop(0) - fimo_cmd = sys.argv.pop(0) - html_path = sys.argv.pop(0) - html_out = sys.argv.pop(0) - interval_out = sys.argv.pop(0) - txt_out = sys.argv.pop(0) - xml_out = sys.argv.pop(0) - gff_out = sys.argv.pop(0) - - #run fimo - try: - tmp_stderr = tempfile.NamedTemporaryFile() - #tmp_stderr = open( tmp_filename, 'wb' ) - proc = subprocess.Popen( args=fimo_cmd, shell=True, stderr=tmp_stderr ) - returncode = proc.wait() - #tmp_stderr.close() - # get stderr, allowing for case where it's very large - #tmp_stderr = open( tmp, 'rb' ) - tmp_stderr.seek(0) - stderr = '' - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - - if returncode != 0: - raise Exception, stderr - except Exception, e: - raise Exception, 'Error running FIMO:\n' + str( e ) - - shutil.move( os.path.join( html_path, 'fimo.txt' ), txt_out ) - shutil.move( os.path.join( html_path, 'fimo.gff' ), gff_out ) - shutil.move( os.path.join( html_path, 'fimo.xml' ), xml_out ) - shutil.move( os.path.join( html_path, 'fimo.html' ), html_out ) - - out_file = open( interval_out, 'wb' ) - out_file.write( "#%s\n" % "\t".join( ( "chr", "start", "end", "pattern name", "score", "strand", "matched sequence", "p-value", "q-value" ) ) ) - for line in open( txt_out ): - if line.startswith( '#' ): continue - fields = line.rstrip( "\n\r" ).split( "\t" ) - start, end = int( fields[2] ), int( fields[3] ) - sequence = fields[7] - if start > end: - start, end = end, start #flip start and end, and set strand - strand = "-" - sequence = DNA_reverse_complement( sequence ) #we want sequences relative to strand; FIMO always provides + stranded sequence - else: - strand = "+" - start -= 1 #make 0-based start position - out_file.write( "%s\n" % "\t".join( [ fields[1], str( start ), str( end ), fields[0], fields[4], strand, sequence, fields[5], fields[6] ] ) ) - out_file.close() - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/meme/meme.xml --- a/tools/meme/meme.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,349 +0,0 @@ - - meme - - Multiple Em for Motif Elicitation - meme "$input1" -o "${html_outfile.files_path}" - -nostatus - - ##-p 8 ##number of processors - - #if str( $options_type.options_type_selector ) == 'advanced': - -sf "${ str( $options_type.sf ).replace( ' ', '_' ) }" - -${options_type.alphabet_type.alphabet_type_selector} - -mod "${options_type.mod_type.mod_type_selector}" - -nmotifs "${options_type.nmotifs}" - -wnsites "${options_type.wnsites}" - -maxsize "${options_type.maxsize}" - - #if $options_type.evt < float('inf'): - -evt "${options_type.evt}" - #end if - - #if str( $options_type.mod_type.mod_type_selector ) != 'oops': - #if str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'nsites': - -nsites "${options_type.mod_type.motif_occurrence_type.nsites}" - #elif str( $options_type.mod_type.motif_occurrence_type.motif_occurrence_type_selector ) == 'min_max_sites': - -minsites "${options_type.mod_type.motif_occurrence_type.minsites}" -maxsites "${options_type.mod_type.motif_occurrence_type.maxsites}" - #end if - #end if - - #if str( 
$options_type.motif_width_type.motif_width_type_selector ) == 'exact': - -w "${options_type.motif_width_type.width}" - #else - -minw "${options_type.motif_width_type.minw}" -maxw "${options_type.motif_width_type.maxw}" - #end if - - #if str( $options_type.motif_trim_type.motif_trim_type_selector ) == 'nomatrim': - -nomatrim - #else - -wg "${options_type.motif_trim_type.wg}" -ws "${options_type.motif_trim_type.ws}" ${options_type.motif_trim_type.noendgaps} - #end if - - #if str( $options_type.bfile ) != 'None': - -bfile "${options_type.bfile}" - #end if - - #if str( $options_type.pspfile ) != 'None': - -psp "${options_type.pspfile}" - #end if - - #if str( $options_type.alphabet_type.alphabet_type_selector ) == "dna": - ${options_type.alphabet_type.revcomp} ${options_type.alphabet_type.pal} - #end if - - -maxiter "${options_type.maxiter}" -distance "${options_type.distance}" - - -prior "${options_type.alphabet_type.prior_type.prior_type_selector}" - #if str( $options_type.alphabet_type.prior_type.prior_type_selector ) != 'addone': - -b "${options_type.alphabet_type.prior_type.prior_b}" - #if str( $options_type.alphabet_type.prior_type.plib ) != 'None': - -plib "${options_type.alphabet_type.prior_type.plib}" - #end if - #end if - - #if str( $options_type.alphabet_type.spmap_type.spmap_type_selector ) == 'cons': - -cons "${options_type.alphabet_type.spmap_type.cons}" - #else - -spmap "${options_type.alphabet_type.spmap_type.spmap_type_selector}" - -spfuzz "${options_type.alphabet_type.spmap_type.spfuzz}" - #end if - - #if str( $options_type.branching_type.branching_type_selector ) == 'x_branch': - -x_branch -bfactor "${options_type.branching_type.bfactor}" -heapsize "${options_type.branching_type.heapsize}" - #end if - - ##-maxsize "1000000" ##remove hardcoded maxsize? should increase number of processors instead - - #end if - - 2>&1 || echo "Error running MEME." - - - && mv ${html_outfile.files_path}/meme.html ${html_outfile} - - && mv ${html_outfile.files_path}/meme.txt ${txt_outfile} - - && mv ${html_outfile.files_path}/meme.xml ${xml_outfile} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value == True - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**WARNING: This tool is only available for non-commercial use. Use for educational, research and non-profit purposes is permitted. Before using, be sure to review, agree, and comply with the license.** - -If you want to specify sequence weights, you must include them at the top of your input FASTA file. - -.. class:: infomark - -**To cite MEME:** -Timothy L. Bailey and Charles Elkan, "Fitting a mixture model by expectation maximization to discover motifs in biopolymers", Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology, pp. 28-36, AAAI Press, Menlo Park, California, 1994. - - -For detailed information on MEME, click here_. To view the license_. - -.. _here: http://meme.nbcr.net/meme/meme-intro.html -.. 
_license: http://meme.nbcr.net/meme/COPYRIGHT.html - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_coverage_report.py --- a/tools/metag_tools/blat_coverage_report.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -#!/usr/bin/env python - -import os, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def reverse_complement(s): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def __main__(): - nuc_index = {'a':0,'t':1,'c':2,'g':3} - diff_hash = {} # key = (chrom, index) - infile = sys.argv[1] - outfile = sys.argv[2] - invalid_lines = 0 - invalid_chars = 0 - data_id = '' - data_seq = '' - - for i, line in enumerate( open( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - fields = line.split() - if len(fields) != 23: # standard number of pslx columns - invalid_lines += 1 - continue - if not fields[0].isdigit(): - invalid_lines += 1 - continue - read_id = fields[9] - chrom = fields[13] - try: - block_count = int(fields[17]) - except: - invalid_lines += 1 - continue - block_size = fields[18].split(',') - read_start = fields[19].split(',') - chrom_start = fields[20].split(',') - read_seq = fields[21].split(',') - chrom_seq = fields[22].split(',') - - for j in range(block_count): - try: - this_block_size = int(block_size[j]) - this_read_start = int(read_start[j]) - this_chrom_start = int(chrom_start[j]) - except: - invalid_lines += 1 - break - this_read_seq = read_seq[j] - this_chrom_seq = chrom_seq[j] - - if not this_read_seq.isalpha(): - continue - if not this_chrom_seq.isalpha(): - continue - - # brute force to check coverage - for k in range(this_block_size): - cur_index = this_chrom_start+k - sub_a = this_read_seq[k:(k+1)].lower() - sub_b = this_chrom_seq[k:(k+1)].lower() - if not diff_hash.has_key((chrom, cur_index)): - try: - diff_hash[(chrom, cur_index)] = [0,0,0,0,sub_b.upper()] # a, t, c, g, ref. nuc. - except Exception, e: - stop_err( str( e ) ) - if sub_a in ['a','t','c','g']: - diff_hash[(chrom, cur_index)][nuc_index[(sub_a)]] += 1 - else: - invalid_chars += 1 - - outputfh = open(outfile, 'w') - outputfh.write( "##title\tlocation\tref.\tcov.\tA\tT\tC\tG\n" ) - keys = diff_hash.keys() - keys.sort() - for i in keys: - (chrom, location) = i - sum = diff_hash[ (i) ][ 0 ] + diff_hash[ ( i ) ][ 1 ] + diff_hash[ ( i ) ][ 2 ] + diff_hash[ ( i ) ][ 3 ] # did not include N's - if sum == 0: - continue - ratio_A = diff_hash[ ( i ) ][ 0 ] * 100.0 / sum - ratio_T = diff_hash[ ( i ) ][ 1 ] * 100.0 / sum - ratio_C = diff_hash[ ( i ) ][ 2 ] * 100.0 / sum - ratio_G = diff_hash[ ( i ) ][ 3 ] * 100.0 / sum - (title_head, title_tail) = os.path.split(chrom) - result = "%s\t%s\t%s\t%d\tA(%0.0f)\tT(%0.0f)\tC(%0.0f)\tG(%0.0f)\n" % ( title_tail, location, diff_hash[(i)][4], sum, ratio_A, ratio_T, ratio_C, ratio_G ) - outputfh.write(result) - outputfh.close() - - if invalid_lines: - print 'Skipped %d invalid lines. ' % ( invalid_lines ) - if invalid_chars: - print 'Skipped %d invalid characters in the alignment.
' % (invalid_chars) - -if __name__ == '__main__': __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_coverage_report.xml --- a/tools/metag_tools/blat_coverage_report.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - the percentage of reads supporting each nucleotide at each location - blat_coverage_report.py $input1 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - -**IMPORTANT**. Only works for BLAT **standard** or **pslx** output formats (hint: to output pslx format, add **-out=pslx** in the command). - ------ - -**What it does** - - The tool will generate a table of 8 columns as follows: - -- 1st column: chromosome id. - -- 2nd column: chromosome location. - -- 3rd column: the nucleotide from reference genome at the chromosome location (2nd column). - -- 4th column: total coverage of the reads (number of reads that were mapped to the chromosome location). - -- 5th column: percentage of reads that support nucleotide **A** at this location. - -- 6th column: percentage of reads that support nucleotide **T** at this location. - -- 7th column: percentage of reads that support nucleotide **C** at this location. - -- 8th column: percentage of reads that support nucleotide **G** at this location. - - ------ - -**Example** - -- The BLAT pslx results look like the following (tab separated with sequence at the end):: - - 30 0 0 0 0 0 0 0 + seq0 30 0 30 chr 4639675 4549207 4549237 1 30, 0, 4549207, cggacagcgccgccaccaacaaagccacca, cggacagcgccgccaccaacaaagccacca, - 30 0 0 0 0 0 0 0 + seq1 30 0 30 chr 4639675 614777 614807 1 30, 0, 614777, aaaacaccggatgctccggcgctggcagat, aaaacaccggatgctccggcgctggcagat, - 28 1 0 0 0 0 0 0 + seq2 30 0 29 chr 4639675 3289283 3289312 1 29, 0, 3289283, tttgcttttagtacaccggattcagaacc, tttgctttcagtacaccggattcagaacc, - 30 0 0 0 0 0 0 0 + seq4 30 0 30 chr 4639675 2665584 2665614 1 30, 0, 2665584, cacgctacgtgcgcccccgcccagaaggcg, cacgctacgtgcgcccccgcccagaaggcg, - - The 14th column is the chromosome id, and the 16th and 17th columns show the chromosome start and end locations to which the reads were mapped. - -- The report shows the overall coverage of reads on each chromosome location (partial result):: - - +-------+----------+------+------+--------+------+--------+------+ - | title | location | ref. | cov. | A | T | C | G | - +-------+----------+------+------+--------+------+--------+------+ - | chr | 614777 | A | 1 | A(100) | T(0) | C(0) | G(0) | - | chr | 614778 | A | 1 | A(100) | T(0) | C(0) | G(0) | - | chr | 614779 | A | 1 | A(100) | T(0) | C(0) | G(0) | - +-------+----------+------+------+--------+------+--------+------+ - ------ - -**Reference** - - **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664.
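The per-location percentages in the table above are plain vote fractions over the covered reads; a minimal standalone sketch of the same arithmetic (made-up counts; N's excluded, as in the script above)::

    counts = { 'A': 1, 'T': 0, 'C': 0, 'G': 0 }  # reads observed at one location
    total = counts['A'] + counts['T'] + counts['C'] + counts['G']
    if total:
        ratios = dict( ( nuc, 100.0 * n / total ) for nuc, n in counts.items() )
        # -> A(100) T(0) C(0) G(0), matching the 614777 row above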
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_mapping.py --- a/tools/metag_tools/blat_mapping.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -import os, sys - -assert sys.version_info[:2] >= ( 2, 4 ) - -def reverse_complement(s): - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def __main__(): - nuc_index = {'a':0,'t':1,'c':2,'g':3,'n':4} - coverage = {} # key = (chrom, index) - invalid_lines = 0 - invalid_chrom = 0 - infile = sys.argv[1] - outfile = sys.argv[2] - - for i, line in enumerate( open( infile ) ): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): - continue - fields = line.split() - if len(fields) < 21: # standard number of pslx columns - invalid_lines += 1 - continue - if not fields[0].isdigit(): - invalid_lines += 1 - continue - chrom = fields[13] - if not chrom.startswith( 'chr' ): - invalid_lines += 1 - invalid_chrom += 1 - continue - try: - block_count = int(fields[17]) - except: - invalid_lines += 1 - continue - block_size = fields[18].split(',') - chrom_start = fields[20].split(',') - - for j in range( block_count ): - try: - this_block_size = int(block_size[j]) - this_chrom_start = int(chrom_start[j]) - except: - invalid_lines += 1 - break - # brute force coverage - for k in range( this_block_size ): - cur_index = this_chrom_start + k - if coverage.has_key( ( chrom, cur_index ) ): - coverage[(chrom, cur_index)] += 1 - else: - coverage[(chrom, cur_index)] = 1 - - # generate an index file - outputfh = open(outfile, 'w') - keys = coverage.keys() - keys.sort() - previous_chrom = '' - for i in keys: - (chrom, location) = i - sum = coverage[(i)] - if chrom != previous_chrom: - outputfh.write( 'variableStep chrom=%s\n' % ( chrom ) ) - previous_chrom = chrom - outputfh.write( "%s\t%s\n" % ( location, sum ) ) - outputfh.close() - - if invalid_lines: - invalid_msg = "Skipped %d invalid lines" % invalid_lines - if invalid_chrom: - invalid_msg += ", including %d lines whose chrom id does not begin with 'chr' (required to map correctly to the UCSC Genome Browser). " % invalid_chrom - print invalid_msg - -if __name__ == '__main__': __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_mapping.xml --- a/tools/metag_tools/blat_mapping.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ - - in wiggle format - blat_mapping.py $input1 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - - To generate acceptable files, please use the alignment program **BLAT** with the option **-out=pslx**. - -.. class:: warningmark - - Please edit the database information by clicking on the pencil icon next to your dataset. Select the corresponding genome build. - ------ - -**What it does** - - This tool takes **BLAT pslx** output and returns a wig-like file showing the number of reads (coverage) mapped at each chromosome location. Use the **Graph/Display Data --> Build custom track** tool to show the coverage mapping in the UCSC Genome Browser. - ------ - -**Example** - - Showing read coverage on human chromosome 22 (partial result) in a UCSC Genome Browser Custom Track: - - ..
image:: ./static/images/blat_mapping_example.png :width: 600 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_wrapper.py --- a/tools/metag_tools/blat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ): - nib_file = "%s/alignseq.loc" % GALAXY_DATA_INDEX_DIR - nib_path = '' - nibs = {} - for i, line in enumerate( file( nib_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[0] == 'seq': - nibs[( fields[1] )] = fields[2] - if nibs.has_key( dbkey ): - nib_path = nibs[( dbkey )] - return nib_path - -def check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ): - twobit_file = "%s/twobit.loc" % GALAXY_DATA_INDEX_DIR - twobit_path = '' - twobits = {} - for i, line in enumerate( file( twobit_file ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( "#" ): - fields = line.split( '\t' ) - if len( fields ) < 2: - continue - twobits[( fields[0] )] = fields[1] - if twobits.has_key( dbkey ): - twobit_path = twobits[( dbkey )] - return twobit_path - -def __main__(): - # I/O - source_format = sys.argv[1] # 0: dbkey; 1: upload file - target_file = sys.argv[2] - query_file = sys.argv[3] - output_file = sys.argv[4] - min_iden = sys.argv[5] - tile_size = sys.argv[6] - one_off = sys.argv[7] - - try: - float(min_iden) - except: - stop_err('Invalid value for minimal identity.') - - try: - test = int(tile_size) - assert test >= 6 and test <= 18 - except: - stop_err('Invalid value for tile size. DNA word size must be between 6 and 18.') - - try: - test = int(one_off) - assert test >= 0 and test <= int(tile_size) - except: - stop_err('Invalid value for mismatch numbers in the word') - - GALAXY_DATA_INDEX_DIR = sys.argv[8] - - all_files = [] - if source_format == '0': - # check target genome - dbkey = target_file - nib_path = check_nib_file( dbkey, GALAXY_DATA_INDEX_DIR ) - twobit_path = check_twobit_file( dbkey, GALAXY_DATA_INDEX_DIR ) - if not os.path.exists( nib_path ) and not os.path.exists( twobit_path ): - stop_err("No sequences are available for %s, request them by reporting this error."
% dbkey) - - # check the query file, see whether all of them are legitimate sequence - if nib_path and os.path.isdir( nib_path ): - compress_files = os.listdir(nib_path) - target_path = nib_path - elif twobit_path: - compress_files = [twobit_path] - target_path = "" - else: - stop_err("Requested genome build has no available sequence.") - - for file in compress_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - all_files = [target_file] - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "blat %s %s %s -oneOff=%s -tileSize=%s -minIdentity=%s -mask=lower -noHead -out=pslx 2>&1" % ( detail_file_path, query_file, output_tempfile, one_off, tile_size, min_iden ) - os.system( command ) - os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) - os.remove( output_tempfile ) - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/blat_wrapper.xml --- a/tools/metag_tools/blat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ - - compare sequencing reads against UCSC genome builds - - #if $source.source_select=="database" #blat_wrapper.py 0 $source.dbkey $input_query $output1 $iden $tile_size $one_off - #else #blat_wrapper.py 1 $source.input_target $input_query $output1 $iden $tile_size $one_off - #end if# ${GALAXY_DATA_INDEX_DIR} - - - - - - - - - - - - - - - - - - - - - - - - blat - - - - - - - - - - - - - - -.. class:: warningmark - -Using a smaller word size (*Minimal Size of Exact Match*) will increase the computational time. - -.. class:: warningmark - -Using a larger mismatch number (*Number of Mismatch in the Word*) will increase the computational time. - ------ - -**What it does** - -This tool currently uses the **BLAT** alignment program. Your short reads file is searched against a genome build or another uploaded file. - ------ - -**Example** - -- Input a multiple fasta file:: - - >seq1 - TGGTAATGGTGGTTTTTTTTTTTTTTTTTTATTTTT - -- Use the default settings: - - - alignment identity must be higher than or equal to 90%. - - - minimal size of exact match to trigger an alignment is 11. - - - allow 0 mismatches in the above exact match size. - -- Search against ce2 (C. elegans March 2004), partial result:: - - 25 1 0 0 0 0 0 0 + seq1 36 10 36 chrI 15080483 9704438 9704464 1 26, 10, 9704438, ggttttttttttttttttttattttt, ggtttttttttttttttttttttttt, - 27 0 0 0 0 0 1 32 + seq1 36 9 36 chrI 15080483 1302536 1302595 2 21,6, 9,30, 1302536,1302589, tggtttttttttttttttttt,attttt, tggtttttttttttttttttt,attttt, - ------ - -**Parameters** - -- *Minimal Identity* (**-minIdentity**) : In percent, the minimum sequence identity between the query and target alignment. Default is 90. - -- *Minimal Size of Exact Match* (**-tileSize**) : The size of a match that will trigger an alignment. Default is 11. Usually between 8 and 12. Must be between 6 and 18. - -- *Number of Mismatch in the Word* (**-oneOff**) : The number of mismatches allowed in the word (tile size) and still triggers an alignment. Default is 0. - ------ - -**Reference** - - **BLAT**: Kent, W James, BLAT--the BLAST-like alignment tool. (2002) Genome Research:12(4) 656-664. 
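For reference, the BLAT command assembled by blat_wrapper.py above has the following shape (file names hypothetical; flags and defaults exactly as in the wrapper and the parameter list above)::

    blat hg18.2bit reads.fa out.pslx -oneOff=0 -tileSize=11 -minIdentity=90 -mask=lower -noHead -out=pslx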
- - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/convert_SOLiD_color2nuc.py --- a/tools/metag_tools/convert_SOLiD_color2nuc.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python -""" -convert SOLiD color-space data to nucleotide sequence -example: T011213122200221123032111221021210131332222101 - TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT -""" - -import sys, os - -def stop_err(msg): - - sys.stderr.write(msg) - sys.stderr.write('\n') - sys.exit() - -def color2base(color_seq): - - first_nuc = ['A','C','G','T'] - code_matrix = {} - code_matrix['0'] = ['A','C','G','T'] - code_matrix['1'] = ['C','A','T','G'] - code_matrix['2'] = ['G','T','A','C'] - code_matrix['3'] = ['T','G','C','A'] - - overlap_nuc = '' - nuc_seq = '' - - seq_prefix = prefix = color_seq[0].upper() - color_seq = color_seq[1:] - - if not (seq_prefix in first_nuc): - stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' %seq_prefix ) - - for code in color_seq: - - if not (code in ['0','1','2','3']): - stop_err('Expect digits (0, 1, 2, 3) in the color-coding data. The file contains characters outside this set.\nThe file contains a %s' %code) - - second_nuc = code_matrix[code] - overlap_nuc = second_nuc[first_nuc.index(prefix)] - nuc_seq += overlap_nuc - prefix = overlap_nuc - - return seq_prefix, nuc_seq - -def __main__(): - - infilename = sys.argv[1] - keep_prefix = sys.argv[2].lower() - outfilename = sys.argv[3] - - outfile = open(outfilename,'w') - - prefix = '' - color_seq = '' - for i, line in enumerate(file(infilename)): - line = line.rstrip('\r\n') - - if not line: continue - if line.startswith("#"): continue - - if line.startswith(">"): - - if color_seq: - prefix, nuc_seq = color2base(color_seq) - - if keep_prefix == 'yes': - nuc_seq = prefix + nuc_seq - - outfile.write(title+'\n') - outfile.write(nuc_seq+'\n') - - title = line - color_seq = '' - else: - color_seq += line - - if color_seq: - prefix, nuc_seq = color2base(color_seq) - - if keep_prefix == 'yes': - nuc_seq = prefix + nuc_seq - - outfile.write(title+'\n') - outfile.write(nuc_seq+'\n') - - outfile.close() - -if __name__=='__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/convert_SOLiD_color2nuc.xml --- a/tools/metag_tools/convert_SOLiD_color2nuc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ - - to Nucleotides -convert_SOLiD_color2nuc.py $input1 $input2 $output1 - - - - - - - - - - - - - - -.. class:: warningmark - -The tool was designed for color space files generated by an ABI SOLiD sequencer. The file format must be fasta-like: the title starts with a ">" character, and each color space sequence starts with a leading nucleotide. - ------ - -**What it does** - -This tool converts a color space sequence to nucleotides. The leading character must be a nucleotide: A, C, G, or T.
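The decoding is a single walk along the read, each digit selecting the next base from the previous one; a minimal standalone sketch using the same transition matrix as color2base above, reproducing the docstring example::

    first_nuc = ['A', 'C', 'G', 'T']
    code_matrix = { '0': ['A', 'C', 'G', 'T'], '1': ['C', 'A', 'T', 'G'],
                    '2': ['G', 'T', 'A', 'C'], '3': ['T', 'G', 'C', 'A'] }
    prev, nuc_seq = 'T', ''  # leading nucleotide of the read T011213...
    for code in '011213122200221123032111221021210131332222101':
        prev = code_matrix[code][first_nuc.index(prev)]
        nuc_seq += prev
    # 'T' + nuc_seq == 'TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT'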
- ------ - -**Example** - -- If the color space file looks like this:: - - >seq1 - A013 - >seq2 - T011213122200221123032111221021210131332222101 - -- If you would like to **keep** the leading nucleotide:: - - >seq1 - AACG - >seq2 - TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT - -- If you **do not want to keep** the leading nucleotide (the length of the nucleotide sequence will be one less than the color-space sequence):: - - >seq1 - ACG - >seq2 - TGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT - ------ - -**ABI SOLiD Color Coding Alignment matrix** - - Each di-nucleotide is represented by a single digit: 0 to 3. The matrix is symmetric, thus the leading nucleotide is necessary to determine the sequence (otherwise there are four possibilities). - - - .. image:: ./static/images/dualcolorcode.png - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/fastqsolexa_to_fasta_qual.py --- a/tools/metag_tools/fastqsolexa_to_fasta_qual.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -#!/usr/bin/env python - -""" -convert fastqsolexa file to separate sequence and quality files. - -assume each sequence and quality score are contained in one line -the order should be: -1st line: @title_of_seq -2nd line: nucleotides -3rd line: +title_of_qualityscore (might be skipped) -4th line: quality scores -(in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.) - -Usage: -%python fastqsolexa_to_fasta_qual.py -""" - -import sys, os -from math import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s" % msg ) - sys.exit() - -def __main__(): - infile_name = sys.argv[1] - outfile_seq = open( sys.argv[2], 'w' ) - outfile_score = open( sys.argv[3], 'w' ) - datatype = sys.argv[4] - seq_title_startswith = '' - qual_title_startswith = '' - default_coding_value = 64 - fastq_block_lines = 0 - - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip() - if not line or line.startswith( '#' ): - continue - fastq_block_lines = ( fastq_block_lines + 1 ) % 4 - line_startswith = line[0:1] - if fastq_block_lines == 1: - # first line is @title_of_seq - if not seq_title_startswith: - seq_title_startswith = line_startswith - if line_startswith != seq_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - read_title = line[1:] - outfile_seq.write( '>%s\n' % line[1:] ) - elif fastq_block_lines == 2: - # second line is nucleotides - read_length = len( line ) - outfile_seq.write( '%s\n' % line ) - elif fastq_block_lines == 3: - # third line is +title_of_qualityscore ( might be skipped ) - if not qual_title_startswith: - qual_title_startswith = line_startswith - if line_startswith != qual_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - quality_title = line[1:] - if quality_title and read_title != quality_title: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differs from score title "%s".' % ( i + 1, read_title, quality_title ) ) - if not quality_title: - outfile_score.write( '>%s\n' % read_title ) - else: - outfile_score.write( '>%s\n' % line[1:] ) - else: - # fourth line is quality scores - qual = '' - fastq_integer = True - # peek: ascii or digits?
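- # a first token that parses as an int means digit-encoded scores; otherwise the line is treated as ASCII-encoded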
- val = line.split()[0] - try: - check = int( val ) - fastq_integer = True - except: - fastq_integer = False - - if fastq_integer: - # digits - qual = line - else: - # ascii - quality_score_length = len( line ) - if quality_score_length == read_length + 1: - # first char is qual_score_startswith - qual_score_startswith = ord( line[0:1] ) - line = line[1:] - elif quality_score_length == read_length: - qual_score_startswith = default_coding_value - else: - stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) ) - for j, char in enumerate( line ): - score = ord( char ) - qual_score_startswith # 64 - qual = "%s%s " % ( qual, str( score ) ) - outfile_score.write( '%s\n' % qual ) - - outfile_seq.close() - outfile_score.close() - -if __name__ == "__main__": __main__() - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/fastqsolexa_to_fasta_qual.xml --- a/tools/metag_tools/fastqsolexa_to_fasta_qual.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ - - extracts sequences and quality scores from FASTQSOLEXA data - fastqsolexa_to_fasta_qual.py $input1 $output1 $output2 $input1.extension - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64. - ------ - -**What it does** - -This tool extracts sequences and quality scores from FASTQ data ( Solexa variant ), producing a FASTA dataset and a QUAL dataset. - ------ - -**Example1** - -- Converting the following Solexa fastq data:: - - @seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhPW@hhhhhh - @seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - +seq2 - hhhhhhhhhhhhhhYhhahhhhWhAhFhSIJGChO - -- will extract the following sequences:: - - >seq1 - GACAGCTTGGTTTTTAGTGAGTTGTTCCTTTCTTT - >seq2 - GCAATGACGGCAGCAATAAACTCAACAGGTGCTGG - -- and quality scores:: - - >seq1 - 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 16 23 0 40 40 40 40 40 40 - >seq2 - 40 40 40 40 40 40 40 40 40 40 40 40 40 40 25 40 40 33 40 40 40 40 23 40 1 40 6 40 19 9 10 7 3 40 15 - -**Example2** - -- Converting the following Solexa fastq data:: - - @HANNIBAL_1_FC302VTAAXX:2:1:228:167 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - +HANNIBAL_1_FC302VTAAXX:2:1:228:167 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - @HANNIBAL_1_FC302VTAAXX:2:1:156:340 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - +HANNIBAL_1_FC302VTAAXX:2:1:156:340 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - -- will extract the following sequences:: - - >HANNIBAL_1_FC302VTAAXX:2:1:228:167 - GAATTGATCAGGACATAGGACAACTGTAGGCACCAT - >HANNIBAL_1_FC302VTAAXX:2:1:156:340 - GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG - -- and quality scores:: - - >HANNIBAL_1_FC302VTAAXX:2:1:228:167 - 40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 - >HANNIBAL_1_FC302VTAAXX:2:1:156:340 - 40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/mapping_to_ucsc.py --- a/tools/metag_tools/mapping_to_ucsc.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,204 +0,0 @@ -#!/usr/bin/env python - -from 
galaxy import eggs -import sys, tempfile, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - out_fname = sys.argv[1] - in_fname = sys.argv[2] - chr_col = int(sys.argv[3])-1 - coord_col = int(sys.argv[4])-1 - track_type = sys.argv[5] - if track_type == 'coverage' or track_type == 'both': - coverage_col = int(sys.argv[6])-1 - cname = sys.argv[7] - cdescription = sys.argv[8] - ccolor = sys.argv[9].replace('-',',') - cvisibility = sys.argv[10] - if track_type == 'snp' or track_type == 'both': - if track_type == 'both': - j = 5 - else: - j = 0 - #sname = sys.argv[7+j] - sdescription = sys.argv[6+j] - svisibility = sys.argv[7+j] - #ref_col = int(sys.argv[10+j])-1 - read_col = int(sys.argv[8+j])-1 - - - # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically) - sorted_infile = tempfile.NamedTemporaryFile() - try: - os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname)) - except Exception, exc: - stop_err( 'Initialization error -> %s' %str(exc) ) - - #generate chr list - sorted_infile.seek(0) - chr_vals = [] - for line in file( sorted_infile.name ): - line = line.strip() - if not(line): - continue - try: - fields = line.split('\t') - chr = fields[chr_col] - if chr not in chr_vals: - chr_vals.append(chr) - except: - pass - if not(chr_vals): - stop_err("Skipped all lines as invalid.") - - if track_type == 'coverage' or track_type == 'both': - if track_type == 'coverage': - fout = open( out_fname, "w" ) - else: - fout = tempfile.NamedTemporaryFile() - fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( cname, cdescription, ccolor, cvisibility )) - if track_type == 'snp' or track_type == 'both': - fout_a = tempfile.NamedTemporaryFile() - fout_t = tempfile.NamedTemporaryFile() - fout_g = tempfile.NamedTemporaryFile() - fout_c = tempfile.NamedTemporaryFile() - fout_ref = tempfile.NamedTemporaryFile() - - fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track A", sdescription, '255,0,0', svisibility )) - fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track T", sdescription, '0,255,0', svisibility )) - fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track G", sdescription, '0,0,255', svisibility )) - fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \ - % ( "Track C", sdescription, '255,0,255', svisibility )) - - - sorted_infile.seek(0) - for line in file( sorted_infile.name ): - line = line.strip() - if not(line): - continue - try: - fields = line.split('\t') - chr = fields[chr_col] - start = int(fields[coord_col]) - assert start > 0 - except: - continue - try: - ind = chr_vals.index(chr) #encountered chr for the 1st time - del chr_vals[ind] - prev_start = '' - header = "variableStep chrom=%s\n" %(chr) - if track_type == 'coverage' or track_type == 'both': - coverage = int(fields[coverage_col]) - line1 = "%s\t%s\n" %(start,coverage) - fout.write("%s%s" %(header,line1)) - if track_type == 'snp' or track_type == 'both': - a = t = g = c = 0 - fout_a.write("%s" %(header)) - fout_t.write("%s" %(header)) - fout_g.write("%s" %(header)) - fout_c.write("%s" %(header)) - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind
== 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - prev_start = start - except ValueError: - if start != prev_start: - if track_type == 'coverage' or track_type == 'both': - coverage = int(fields[coverage_col]) - fout.write("%s\t%s\n" %(start,coverage)) - if track_type == 'snp' or track_type == 'both': - if a: - fout_a.write("%s\t%s\n" %(prev_start,a)) - if t: - fout_t.write("%s\t%s\n" %(prev_start,t)) - if g: - fout_g.write("%s\t%s\n" %(prev_start,g)) - if c: - fout_c.write("%s\t%s\n" %(prev_start,c)) - a = t = g = c = 0 - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind == 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - prev_start = start - else: - if track_type == 'snp' or track_type == 'both': - try: - #ref_nt = fields[ref_col].capitalize() - read_nt = fields[read_col].capitalize() - try: - nt_ind = ['A','T','G','C'].index(read_nt) - if nt_ind == 0: - a+=1 - elif nt_ind == 1: - t+=1 - elif nt_ind == 2: - g+=1 - else: - c+=1 - except ValueError: - pass - except: - pass - - if track_type == 'snp' or track_type == 'both': - if a: - fout_a.write("%s\t%s\n" %(prev_start,a)) - if t: - fout_t.write("%s\t%s\n" %(prev_start,t)) - if g: - fout_g.write("%s\t%s\n" %(prev_start,g)) - if c: - fout_c.write("%s\t%s\n" %(prev_start,c)) - - fout_a.seek(0) - fout_g.seek(0) - fout_t.seek(0) - fout_c.seek(0) - - if track_type == 'snp': - os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) - elif track_type == 'both': - fout.seek(0) - os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname)) -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/mapping_to_ucsc.xml --- a/tools/metag_tools/mapping_to_ucsc.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ - - as UCSC custom track - - mapping_to_ucsc.py - $out_file1 - $input - $chr_col - $coord_col - $track.track_type - #if $track.track_type == "coverage" or $track.track_type == "both" - $track.coverage_col - "${track.cname}" - "${track.cdescription}" - "${track.ccolor}" - "${track.cvisibility}" - #end if - #if $track.track_type == "snp" or $track.track_type == "both" - "${track.sdescription}" - "${track.svisibility}" - $track.col2 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This tool turns mapping data generated by short read mappers into a format that can be displayed in the UCSC genome browser as a custom track. - ------ - -.. class:: warningmark - -**Note** - -This tool requires the mapping data to contain at least the following information: - -chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage). 
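mapping_to_ucsc.py above first sorts the input by chromosome (alphabetically) and coordinate (numerically) before aggregating; the equivalent shell step, assuming the chromosome and coordinate columns are 1 and 2::

    sort -k 1,1 -k 2n -o mapping_sorted.txt mapping.txt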
- ------ - -**Example** - -For the following Mapping data:: - - #chr g_start read_id read_coord g_nt read_nt qual read_coverage - chrM 1 1:29:1672:1127/1 11 G G 40 134 - chrM 1 1:32:93:933/1 4 G A 40 134 - chrM 1 1:34:116:2032/1 11 G A 40 134 - chrM 1 1:39:207:964/1 1 G G 40 134 - chrM 2 1:3:359:848/1 1 G C 40 234 - chrM 2 1:40:1435:1013/1 1 G G 40 234 - chrM 3 1:40:730:972/1 9 G G 40 334 - chrM 4 1:42:1712:921/2 31 G T 35 434 - chrM 4 1:44:1649:493/1 4 G G 40 434 - -running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate:: - - track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1 - variableStep chrom=chrM - 1 134 - 2 234 - 3 334 - 4 434 - track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1 - variableStep chrom=chrM - 1 2 - track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1 - variableStep chrom=chrM - 4 1 - track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1 - variableStep chrom=chrM - 1 2 - 2 1 - 3 1 - 4 1 - track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1 - variableStep chrom=chrM - 2 1 - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_wrapper.py --- a/tools/metag_tools/megablast_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#!/usr/bin/env python -""" -run megablast for metagenomics data - -usage: %prog [options] - -d, --db_build=d: The database to use - -i, --input=i: Input FASTQ candidate file - -w, --word_size=w: Size of best perfect match - -c, --identity_cutoff=c: Report hits at or above this identity - -e, --eval_cutoff=e: Expectation value cutoff - -f, --filter_query=f: Filter out low complexity regions - -x, --index_dir=x: Data index directory - -o, --output=o: Output file - -usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file -""" - -import os, subprocess, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - query_filename = options.input.strip() - output_filename = options.output.strip() - mega_word_size = options.word_size # -W - mega_iden_cutoff = options.identity_cutoff # -p - mega_evalue_cutoff = options.eval_cutoff # -e - mega_temp_output = tempfile.NamedTemporaryFile().name - GALAXY_DATA_INDEX_DIR = options.index_dir - DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR - - # megablast parameters - try: - int( mega_word_size ) - except: - stop_err( 'Invalid value for word size' ) - try: - float( mega_iden_cutoff ) - except: - stop_err( 'Invalid value for identity cut-off' ) - try: - float( mega_evalue_cutoff ) - except: - stop_err( 'Invalid value for Expectation value' ) - - if not os.path.exists( os.path.split( options.db_build )[0] ): - stop_err( 'Cannot locate the target database directory. Please check your location file.' 
) - - # arguments for megablast - megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null" \ - % ( options.db_build, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, options.filter_query ) - - print megablast_command - - tmp = tempfile.NamedTemporaryFile().name - try: - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - if os.path.exists( tmp ): - os.unlink( tmp ) - except Exception, e: - if os.path.exists( mega_temp_output ): - os.unlink( mega_temp_output ) - if os.path.exists( tmp ): - os.unlink( tmp ) - stop_err( 'Error indexing reference sequence. ' + str( e ) ) - - output = open( output_filename, 'w' ) - invalid_lines = 0 - for i, line in enumerate( file( mega_temp_output ) ): - line = line.rstrip( '\r\n' ) - fields = line.split() - try: - # get gi and length of that gi seq - gi, gi_len = fields[1].split( '_' ) - # convert the last column (causing problem in filter tool) to float - fields[-1] = float( fields[-1] ) - new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] ) - except: - new_line = line - invalid_lines += 1 - output.write( "%s\n" % new_line ) - output.close() - - if os.path.exists( mega_temp_output ): - os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of - - if invalid_lines: - print "Unable to parse %d lines. Keep the default format." % invalid_lines - - # megablast generates a file called error.log, if empty, delete it, if not, show the contents - if os.path.exists( './error.log' ): - for i, line in enumerate( file( './error.log' ) ): - line = line.rstrip( '\r\n' ) - print line - os.remove( './error.log' ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_wrapper.xml --- a/tools/metag_tools/megablast_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - compare short reads against htgs, nt, and wgs databases - - megablast_wrapper.py - --db_build="${ filter( lambda x: str( x[0] ) == str( $source_select ), $__app__.tool_data_tables[ 'blastdb' ].get_fields() )[0][-1] }" - --input=$input_query - --word_size=$word_size - --identity_cutoff=$iden_cutoff - --eval_cutoff=$evalue_cutoff - --filter_query=$filter_query - --index_dir=${GALAXY_DATA_INDEX_DIR} - --output=$output1 - - - - - - - - - - - - - - - - - - - - - - megablast - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take substantial amount of time. For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -This tool runs **megablast** (for information about megablast, please see the reference below) a high performance nucleotide local aligner developed by Webb Miller and colleagues. - ------ - -**Output format** - -Output of this tool contains 13 columns delimited by Tabs: - -1. Id of your sequence -2. GI of the database hit -3. Length of the database hit -4. % identity -5. Alignment length -6. # mismatches -7. 
# gaps -8. Start position in your sequence -9. End position in your sequence -10. Start position in database hit -11. End position in database hit -12. E-value -13. Bit score - -------- - -**Reference** - -Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_xml_parser.py --- a/tools/metag_tools/megablast_xml_parser.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python - -import sys, os, re - -if sys.version_info[:2] >= ( 2, 5 ): - import xml.etree.cElementTree as ElementTree -else: - from galaxy import eggs - import pkg_resources; pkg_resources.require( "elementtree" ) - from elementtree import ElementTree - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - source = sys.argv[1] - hspTags = [ - "Hsp_bit-score", - "Hsp_evalue", - "Hsp_query-from", - "Hsp_query-to", - "Hsp_hit-from", - "Hsp_hit-to", - "Hsp_query-frame", - "Hsp_hit-frame", - "Hsp_identity", - "Hsp_align-len", - "Hsp_qseq", - "Hsp_hseq", - "Hsp_midline" - ] - hspData = [] - - # get an iterable - try: - context = ElementTree.iterparse( source, events=( "start", "end" ) ) - except: - stop_err( "Invalid data format." ) - # turn it into an iterator - context = iter( context ) - # get the root element - try: - event, root = context.next() - except: - stop_err( "Invalid data format." ) - - outfile = open( sys.argv[2], 'w' ) - try: - for event, elem in context: - # for every <Iteration> tag - if event == "end" and elem.tag == "Iteration": - query = elem.findtext( "Iteration_query-def" ) - qLen = elem.findtext( "Iteration_query-len" ) - # for every <Hit> within <Iteration_hits> - for hit in elem.findall( "Iteration_hits/Hit" ): - subject = hit.findtext( "Hit_id" ) - if re.search( '^gi', subject ): - subject = subject.split('|')[1] - sLen = hit.findtext( "Hit_len" ) - # for every <Hsp> within <Hit_hsps> - for hsp in hit.findall( "Hit_hsps/Hsp" ): - outfile.write( "%s\t%s\t%s\t%s" % ( query, qLen, subject, sLen ) ) - for tag in hspTags: - outfile.write("\t%s" %(hsp.findtext( tag ))) - #hspData.append( hsp.findtext( tag ) ) - #hspData = [] - outfile.write('\n') - # prevent ElementTree from growing a large data structure - root.clear() - elem.clear() - except: - outfile.close() - stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] ) - - outfile.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/megablast_xml_parser.xml --- a/tools/metag_tools/megablast_xml_parser.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ - - -megablast_xml_parser.py $input1 $output1 - - - - - - - - - - - - - - -**What it does** - -This tool processes the XML output of any NCBI blast tool (if you run your own blast jobs, the XML output can be generated with the **-m 7** option). - ------ - -**Output fields** - -This tool returns tab-delimited output with the following fields:: - - Description Example - ----------------------------------------- ----------------- - - 1. Name of the query sequence Seq1 - 2. Length of the query sequence 30 - 3. Name of target sequence gnl|BL_ORD_ID|0 - 4. Length of target sequence 5528445 - 5. Alignment bit score 59.96 - 6. E-value 8.38112e-11 - 7. Start of alignment within query 1 - 8. End of alignment within query 30 - 9. Start of alignment within target 5436010 - 10. End of alignment within target 5436039 - 11. 
Query frame 1 - 12. Target frame 1 - 13. Number of identical bases within 29 - the alignment - 14. Alignment length 30 - 15. Aligned portion (sequence) of query CGGACAGCGCCGCCACCAACAAAGCCACCA - 16. Aligned portion (sequence) of target CGGACAGCGCCGCCACCAACAAAGCCATCA - 17. Midline indicating positions of ||||||||||||||||||||||||||| || - matches within the alignment - ------- - -.. class:: infomark - -Note that this form of output does not contain an alignment identity value. However, it can be computed by dividing the number of identical bases within the alignment (Field 13) by the alignment length (Field 14) using the *Text Manipulation->Compute* tool - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmap_wrapper.py --- a/tools/metag_tools/rmap_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= (2, 4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # I/O - target_path = sys.argv[1] - infile = sys.argv[2] - read_len = sys.argv[3] # -w - align_len = sys.argv[4] # -h - mismatch = sys.argv[5] # -m - output_file = sys.argv[6] - - # first guess the read length - guess_read_len = 0 - seq = '' - for i, line in enumerate(open(infile)): - line = line.rstrip('\r\n') - if line.startswith('>'): - if seq: - guess_read_len = len(seq) - break - else: - seq += line - - try: - test = int(read_len) - if test == 0: - read_len = str(guess_read_len) - else: - assert test >= 20 and test <= 64 - except: - stop_err('Invalid value for read length. Must be between 20 and 64.') - - try: - int(align_len) - except: - stop_err('Invalid value for minimal length of a hit.') - - try: - int(mismatch) - #assert test >= 0 and test <= int(0.1*int(read_len)) - except: - stop_err('Invalid value for mismatch numbers in an alignment.') - - all_files = [] - if os.path.isdir(target_path): - - # check target genome - fa_files = os.listdir(target_path) - - for file in fa_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - stop_err("No sequences for %s are available for search, please report this error." %(target_path)) - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "rmap -h %s -w %s -m %s -c %s %s -o %s 2>&1" % ( align_len, read_len, mismatch, detail_file_path, infile, output_tempfile ) - #print command - try: - os.system( command ) - except Exception, e: - stop_err( str( e ) ) - - try: - os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) - except Exception, e: - stop_err( str( e ) ) - - try: - os.remove( output_tempfile ) - except: - pass - - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmap_wrapper.xml --- a/tools/metag_tools/rmap_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ - - for Solexa Short Reads Alignment - - #if $trim.choice=="No": #rmap_wrapper.py $database $input_seq 0 $align_len $mismatch $output1 - #else: #rmap_wrapper.py $database $input_seq $trim.read_len $align_len $mismatch $output1 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmap - - - - -.. class:: warningmark - - RMAP was developed for **Solexa** reads. - -.. class:: infomark - -**TIP**. 
The tool will guess the length of the reads, however, if you select to trim the reads, the *Reads length* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. - ------ - -**What it does** - -This tool runs **rmap** (for more information, please see the reference below), mapping Solexa reads onto a genome build. - ------ - -**Parameters** - -- *Minimal Length of a Hit* (**-h**) : this is the seed length or the minimal exact match length -- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment -- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. - ------ - -**Reference** - - **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmapq_wrapper.py --- a/tools/metag_tools/rmapq_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -#!/usr/bin/env python - -import os, sys, tempfile - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # I/O - target_path = sys.argv[1] - infile = sys.argv[2] - scorefile = sys.argv[3] - high_score = sys.argv[4] # -q - high_len = sys.argv[5] # -M - read_len = sys.argv[6] # -w - align_len = sys.argv[7] # -h - mismatch = sys.argv[8] # -m - output_file = sys.argv[9] - - try: - float(high_score) - except: - stop_err('Invalid value for minimal quality score.') - - try: - int(high_len) - except: - stop_err('Invalid value for minimal high quality bases.') - - # first guess the read length - guess_read_len = 0 - seq = '' - for i, line in enumerate(open(infile)): - line = line.rstrip('\r\n') - if line.startswith('>'): - if seq: - guess_read_len = len(seq) - break - else: - seq += line - - try: - test = int(read_len) - if test == 0: - read_len = str(guess_read_len) - else: - assert test >= 20 and test <= 64 - except: - stop_err('Invalid value for read length. Must be between 20 and 64.') - - - try: - int(align_len) - except: - stop_err('Invalid value for minimal length of a hit.') - - try: - int(mismatch) - except: - stop_err('Invalid value for mismatch numbers in an alignment.') - - all_files = [] - if os.path.isdir(target_path): - # check target genome - fa_files = os.listdir(target_path) - - for file in fa_files: - file = "%s/%s" % ( target_path, file ) - file = os.path.normpath(file) - all_files.append(file) - else: - stop_err("No sequences for %s are available for search, please report this error." 
%(target_path)) - - for detail_file_path in all_files: - output_tempfile = tempfile.NamedTemporaryFile().name - command = "rmapq -q %s -M %s -h %s -w %s -m %s -Q %s -c %s %s -o %s 2>&1" % ( high_score, high_len, align_len, read_len, mismatch, scorefile, detail_file_path, infile, output_tempfile ) - #print command - try: - os.system( command ) - except Exception, e: - stop_err( str( e ) ) - - try: - assert os.system( 'cat %s >> %s' % ( output_tempfile, output_file ) ) == 0 - except Exception, e: - stop_err( str( e ) ) - - try: - os.remove( output_tempfile ) - except: - pass - - -if __name__ == '__main__': __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/rmapq_wrapper.xml --- a/tools/metag_tools/rmapq_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ - - for Solexa Short Reads Alignment with Quality Scores - - #if $trim.choice=="No": #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len 0 $align_len $mismatch $output1 - #else: #rmapq_wrapper.py $database $input_seq $input_score $high_score $high_len $trim.read_len $align_len $mismatch $output1 - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapq - - - - -.. class:: warningmark - - RMAPQ was developed for **Solexa** reads. - -.. class:: infomark - -**TIP**. The tool will guess the length of the reads, however, if you select to trim the reads, the *Maximal Length of the Reads* must be between 20 and 64. Reads with lengths longer than the specified value will be trimmed at the 3'end. - ------ - -**What it does** - -This tool runs **rmapq** (for more information, please see the reference below), searching against a genome build with sequence qualities. - ------ - -**Parameters** - -- *Minimal High-quality Bases* (**-M**): the minimal length of the high quality score bases -- *Minimum Score for High-quality Base* (**-q**) : the minimal quality score -- *Minimal Length of a Hit* (**-h**) : the minimal length of an exact match or seed -- *Number of Mismatches Allowed* (**-m**) : the maximal number of mismatches allowed in an alignment -- *Read Length* (**-w**) : maximal length of the reads; reads longer than the threshold will be truncated at 3' end. - ------ - -**Reference** - - **RMAP** is developed by Dr. Andrew D Smith and Dr. Zhenyu Xuan at the Cold Spring Harbor Laboratory. Please see http://rulai.cshl.edu/rmap/ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_high_quality_length.py --- a/tools/metag_tools/short_reads_figure_high_quality_length.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,165 +0,0 @@ -#!/usr/bin/env python - -import os, sys, math, tempfile, zipfile, re -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def unzip( filename ): - zip_file = zipfile.ZipFile( filename, 'r' ) - tmpfilename = tempfile.NamedTemporaryFile().name - for name in zip_file.namelist(): - file( tmpfilename, 'a' ).write( zip_file.read( name ) ) - zip_file.close() - return tmpfilename - -def __main__(): - infile_score_name = sys.argv[1].strip() - outfile_R_name = sys.argv[2].strip() - - try: - score_threshold = int( sys.argv[3].strip() ) - except: - stop_err( 'Threshold for quality score must be numerical.' 
) - - infile_is_zipped = False - if zipfile.is_zipfile( infile_score_name ): - infile_is_zipped = True - infile_name = unzip( infile_score_name ) - else: - infile_name = infile_score_name - - # detect whether it's tabular or fasta format - seq_method = None - data_type = None - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' ) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - cont_high_quality = [] - invalid_lines = 0 - invalid_scores = 0 - if seq_method == 'solexa': - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - locs = line.split( '\t' ) - for j, base in enumerate( locs ): - nuc_errors = base.split() - try: - nuc_errors[0] = int( nuc_errors[0] ) - nuc_errors[1] = int( nuc_errors[1] ) - nuc_errors[2] = int( nuc_errors[2] ) - nuc_errors[3] = int( nuc_errors[3] ) - big = max( nuc_errors ) - except: - invalid_scores += 1 - big = 0 - if j == 0: - cont_high_quality.append(1) - else: - if big >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - else: # seq_method == '454' - tmp_score = '' - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( tmp_score ) > 0: - locs = tmp_score.split() - for j, base in enumerate( locs ): - try: - base = int( base ) - except: - invalid_scores += 1 - base = 0 - if j == 0: - cont_high_quality.append(1) - else: - if base >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - tmp_score = '' - else: - tmp_score = "%s %s" % ( tmp_score, line ) - if len( tmp_score ) > 0: - locs = tmp_score.split() - for j, base in enumerate( locs ): - try: - base = int( base ) - except: - invalid_scores += 1 - base = 0 - if j == 0: - cont_high_quality.append(1) - else: - if base >= score_threshold: - cont_high_quality[ len( cont_high_quality ) - 1 ] += 1 - else: - cont_high_quality.append(1) - - # generate pdf figures - cont_high_quality = array ( cont_high_quality ) - outfile_R_pdf = outfile_R_name - r.pdf( outfile_R_pdf ) - title = "Histogram of continuous high quality scores" - xlim_range = [ 1, max( cont_high_quality ) ] - nclass = max( cont_high_quality ) - if nclass > 100: - nclass = 100 - r.hist( cont_high_quality, probability=True, xlab="Continuous High Quality Score length (bp)", ylab="Frequency (%)", xlim=xlim_range, main=title, nclass=nclass) - r.dev_off() - - if infile_is_zipped and os.path.exists( infile_name ): - # Need to delete temporary file created when we unzipped the infile archive - os.remove( infile_name ) - - if invalid_lines > 0: - print 'Skipped %d invalid lines. ' % invalid_lines - if invalid_scores > 0: - print 'Skipped %d invalid scores. 
' % invalid_scores - - r.quit( save="no" ) - -if __name__=="__main__":__main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_high_quality_length.xml --- a/tools/metag_tools/short_reads_figure_high_quality_length.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - of high quality score reads - -short_reads_figure_high_quality_length.py $input1 $output1 $input2 - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a histogram of lengths of high quality reads. - ------ - -**Examples of Quality Data** - -Roche (454) or ABI SOLiD data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -Illumina (Solexa) data:: - - -40 -40 40 -40 -40 -40 -40 40 - ------ - -**Note** - -- Quality score data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -- If the threshold is set to 20: - - - a low quality score of 4 in the middle separates two segments of lengths 11 and 5. - - - The histogram will be built based on the numbers (11, 5). - -- For Illumina (Solexa) data, only the maximum of the four values will be used. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_score.py --- a/tools/metag_tools/short_reads_figure_score.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,248 +0,0 @@ -#!/usr/bin/env python -""" -boxplot: -- box: first quartile and third quartile -- line inside the box: median -- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile - IQR = third quartile - first quartile -- The smallest/largest value that is not an outlier is connected to the box by a horizontal line. 
-""" - -import os, sys, math, tempfile, re -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def merge_to_20_datapoints( score ): - number_of_points = 20 - read_length = len( score ) - step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) - scores = [] - point = 1 - point_sum = 0 - step_average = 0 - score_points = 0 - - for i in xrange( 1, read_length ): - if i < ( point * step ): - point_sum += int( score[i] ) - step_average += 1 - else: - point_avg = point_sum * 1.0 / step_average - scores.append( point_avg ) - point += 1 - point_sum = 0 - step_average = 0 - if step_average > 0: - point_avg = point_sum * 1.0 / step_average - scores.append( point_avg ) - if len( scores ) > number_of_points: - last_avg = 0 - for j in xrange( number_of_points - 1, len( scores ) ): - last_avg += scores[j] - last_avg = last_avg / ( len(scores) - number_of_points + 1 ) - else: - last_avg = scores[-1] - score_points = [] - for k in range( number_of_points - 1 ): - score_points.append( scores[k] ) - score_points.append( last_avg ) - return score_points - -def __main__(): - - invalid_lines = 0 - - infile_score_name = sys.argv[1].strip() - outfile_R_name = sys.argv[2].strip() - - infile_name = infile_score_name - - # Determine tabular or fasta format within the first 100 lines - seq_method = None - data_type = None - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' 
) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - # Determine fixed length or variable length within the first 100 lines - read_length = 0 - variable_length = False - if seq_method == 'solexa': - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - scores = line.split('\t') - if read_length == 0: - read_length = len( scores ) - if read_length != len( scores ): - variable_length = True - break - if i == 100: - break - elif seq_method == '454': - score = '' - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( score ) > 0: - score = score.split() - if read_length == 0: - read_length = len( score ) - if read_length != len( score ): - variable_length = True - break - score = '' - else: - score = score + ' ' + line - if i == 100: - break - - if variable_length: - number_of_points = 20 - else: - number_of_points = read_length - read_length_threshold = 100 # minimal read length for 454 file - score_points = [] - score_matrix = [] - invalid_scores = 0 - - if seq_method == 'solexa': - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - tmp_array = [] - scores = line.split( '\t' ) - for bases in scores: - nuc_errors = bases.split() - try: - nuc_errors[0] = int( nuc_errors[0] ) - nuc_errors[1] = int( nuc_errors[1] ) - nuc_errors[2] = int( nuc_errors[2] ) - nuc_errors[3] = int( nuc_errors[3] ) - big = max( nuc_errors ) - except: - #print 'Invalid numbers in the file. Skipped.' - invalid_scores += 1 - big = 0 - tmp_array.append( big ) - score_points.append( tmp_array ) - elif seq_method == '454': - # skip the last fasta sequence - score = '' - for i, line in enumerate( open( infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if len( score ) > 0: - score = ['0'] + score.split() - read_length = len( score ) - tmp_array = [] - if not variable_length: - score.pop(0) - score_points.append( score ) - tmp_array = score - elif read_length > read_length_threshold: - score_points_tmp = merge_to_20_datapoints( score ) - score_points.append( score_points_tmp ) - tmp_array = score_points_tmp - score = '' - else: - score = "%s %s" % ( score, line ) - if len( score ) > 0: - score = ['0'] + score.split() - read_length = len( score ) - if not variable_length: - score.pop(0) - score_points.append( score ) - elif read_length > read_length_threshold: - score_points_tmp = merge_to_20_datapoints( score ) - score_points.append( score_points_tmp ) - tmp_array = score_points_tmp - - # reverse the matrix, for R - for i in range( number_of_points - 1 ): - tmp_array = [] - for j in range( len( score_points ) ): - try: - tmp_array.append( int( score_points[j][i] ) ) - except: - invalid_lines += 1 - score_matrix.append( tmp_array ) - - # generate pdf figures - #outfile_R_pdf = outfile_R_name - #r.pdf( outfile_R_pdf ) - outfile_R_png = outfile_R_name - r.bitmap( outfile_R_png ) - - title = "boxplot of quality scores" - empty_score_matrix_columns = 0 - for i, subset in enumerate( score_matrix ): - if not subset: - empty_score_matrix_columns += 1 - score_matrix[i] = [0] - - if not variable_length: - r.boxplot( score_matrix, xlab="location in read length", main=title ) - else: - r.boxplot( score_matrix, xlab="position within read (% of total length)", 
xaxt="n", main=title ) - x_old_range = [] - x_new_range = [] - step = read_length_threshold / number_of_points - for i in xrange( 0, read_length_threshold, step ): - x_old_range.append( ( i / step ) ) - x_new_range.append( i ) - r.axis( 1, x_old_range, x_new_range ) - r.dev_off() - - if invalid_scores > 0: - print 'Skipped %d invalid scores. ' % invalid_scores - if invalid_lines > 0: - print 'Skipped %d invalid lines. ' % invalid_lines - if empty_score_matrix_columns > 0: - print '%d missing scores in score_matrix. ' % empty_score_matrix_columns - - r.quit(save = "no") - -if __name__=="__main__":__main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_figure_score.xml --- a/tools/metag_tools/short_reads_figure_score.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ - - - -short_reads_figure_score.py $input1 $output1 - - - - - - - - - - - - rpy - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool takes Quality Files generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines and builds a graph showing score distribution like the one below. Such graph allows you to perform initial evaluation of data quality in a single pass. - ------ - -**Examples of Quality Data** - -Roche (454) or ABI SOLiD data:: - - >seq1 - 23 33 34 25 28 28 28 32 23 34 27 4 28 28 31 21 28 - -Illumina (Solexa) data:: - - -40 -40 40 -40 -40 -40 -40 40 - ------ - -**Output example** - -Quality scores are summarized as boxplot (Roche 454 FLX data): - -.. image:: ./static/images/short_reads_boxplot.png - -where the **X-axis** is coordinate along the read and the **Y-axis** is quality score adjusted to comply with the Phred score metric. Units on the X-axis depend on whether your data comes from Roche (454) or Illumina (Solexa) and ABI SOLiD machines: - - - For Roche (454) X-axis (shown above) indicates **relative** position (in %) within reads as this technology produces reads of different lengths; - - For Illumina (Solexa) and ABI SOLiD X-axis shows **absolute** position in nucleotides within reads. 
- -Every box on the plot shows the following values:: - - o <---- Outliers - o - -+- <---- Upper Extreme Value that is no more - | than box length away from the box - | - +--+--+ <---- Upper Quartile - | | - +-----+ <---- Median - | | - +--+--+ <---- Lower Quartile - | - | - -+- <---- Lower Extreme Value that is no more - than box length away from the box - o <---- Outlier - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_trim_seq.py --- a/tools/metag_tools/short_reads_trim_seq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,234 +0,0 @@ -#!/usr/bin/env python -""" -trim reads based on the quality scores -input: read file and quality score file -output: trimmed read file -""" - -import os, sys, math, tempfile, re - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def append_to_outfile( outfile_name, seq_title, segments ): - segments = segments.split( ',' ) - if len( segments ) > 1: - outfile = open( outfile_name, 'a' ) - for i in range( len( segments ) ): - outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) ) - outfile.close() - elif segments[0]: - outfile = open( outfile_name, 'a' ) - outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) ) - outfile.close() - -def trim_seq( seq, score, arg, trim_score, threshold ): - seq_method = '454' - trim_pos = 0 - # trim after a certain position - if arg.isdigit(): - keep_homopolymers = False - trim_pos = int( arg ) - if trim_pos > 0 and trim_pos < len( seq ): - seq = seq[0:trim_pos] - else: - keep_homopolymers = arg=='yes' - - new_trim_seq = '' - max_segment = 0 - - for i in range( len( seq ) ): - if i >= len( score ): - score.append(-1) - if int( score[i] ) >= trim_score: - pass_nuc = seq[ i:( i + 1 ) ] - else: - if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ): - pass_nuc = seq[ i:( i + 1 ) ] - else: - pass_nuc = ' ' - new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc ) - # find the max substrings - segments = new_trim_seq.split() - max_segment = '' - len_max_segment = 0 - if threshold == 0: - for seg in segments: - if len_max_segment < len( seg ): - max_segment = '%s,' % seg - len_max_segment = len( seg ) - elif len_max_segment == len( seg ): - max_segment = '%s%s,' % ( max_segment, seg ) - else: - for seg in segments: - if len( seg ) >= threshold: - max_segment = '%s%s,' % ( max_segment, seg ) - return max_segment[ 0:-1 ] - -def __main__(): - - try: - threshold_trim = int( sys.argv[1].strip() ) - except: - stop_err( "Minimal quality score must be numeric." ) - try: - threshold_report = int( sys.argv[2].strip() ) - except: - stop_err( "Minimal length of trimmed reads must be numeric." 
) - outfile_seq_name = sys.argv[3].strip() - infile_seq_name = sys.argv[4].strip() - infile_score_name = sys.argv[5].strip() - arg = sys.argv[6].strip() - - seq_infile_name = infile_seq_name - score_infile_name = infile_score_name - - - # Determine quailty score format: tabular or fasta format within the first 100 lines - seq_method = None - data_type = None - for i, line in enumerate( file( score_infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if data_type == None: - if line.startswith( '>' ): - data_type = 'fasta' - continue - elif len( line.split( '\t' ) ) > 0: - fields = line.split() - for score in fields: - try: - int( score ) - data_type = 'tabular' - seq_method = 'solexa' - break - except: - break - elif data_type == 'fasta': - fields = line.split() - for score in fields: - try: - int( score ) - seq_method = '454' - break - except: - break - if i == 100: - break - - if data_type is None: - stop_err( 'This tool can only use fasta data or tabular data.' ) - if seq_method is None: - stop_err( 'Invalid data for fasta format.') - - if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ): - seq = None - score = None - score_found = False - - score_file = open( score_infile_name, 'r' ) - - for i, line in enumerate( open( seq_infile_name ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): - continue - if line.startswith( '>' ): - if seq: - scores = [] - if data_type == 'fasta': - score = None - score_found = False - score_line = 'start' - while not score_found and score_line: - score_line = score_file.readline().rstrip( '\r\n' ) - if not score_line or score_line.startswith( '#' ): - continue - if score_line.startswith( '>' ): - if score: - scores = score.split() - score_found = True - score = None - else: - for val in score_line.split(): - try: - int( val ) - except: - score_file.close() - stop_err( "Non-numerical value '%s' in score file." % val ) - if not score: - score = score_line - else: - score = '%s %s' % ( score, score_line ) - elif data_type == 'tabular': - score = score_file.readline().rstrip('\r\n') - loc = score.split( '\t' ) - for base in loc: - nuc_error = base.split() - try: - nuc_error[0] = int( nuc_error[0] ) - nuc_error[1] = int( nuc_error[1] ) - nuc_error[2] = int( nuc_error[2] ) - nuc_error[3] = int( nuc_error[3] ) - big = max( nuc_error ) - except: - score_file.close() - stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) - scores.append( big ) - if scores: - new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) - append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) - - seq_title = line - seq = None - else: - if not seq: - seq = line - else: - seq = "%s%s" % ( seq, line ) - if seq: - scores = [] - if data_type == 'fasta': - score = None - while score_line: - score_line = score_file.readline().rstrip( '\r\n' ) - if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ): - continue - for val in score_line.split(): - try: - int( val ) - except: - score_file.close() - stop_err( "Non-numerical value '%s' in score file." 
% val ) - if not score: - score = score_line - else: - score = "%s %s" % ( score, score_line ) - if score: - scores = score.split() - elif data_type == 'tabular': - score = score_file.readline().rstrip('\r\n') - loc = score.split( '\t' ) - for base in loc: - nuc_error = base.split() - try: - nuc_error[0] = int( nuc_error[0] ) - nuc_error[1] = int( nuc_error[1] ) - nuc_error[2] = int( nuc_error[2] ) - nuc_error[3] = int( nuc_error[3] ) - big = max( nuc_error ) - except: - score_file.close() - stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) - scores.append( big ) - if scores: - new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) - append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) - score_file.close() - else: - stop_err( "Cannot locate sequence file '%s' or score file '%s'." % ( seq_infile_name, score_infile_name ) ) - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/short_reads_trim_seq.xml --- a/tools/metag_tools/short_reads_trim_seq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ - - - - - short_reads_trim_seq.py $trim $length $output1 $input1 $input2 $sequencing_method_choice.input3 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -To use this tool, your dataset needs to be in the *Quality Score* format. Click the pencil icon next to your dataset to set the datatype to *Quality Score* (see below for examples). - ------ - -**What it does** - -This tool finds high quality segments within sequencing reads generated by Roche (454), Illumina (Solexa), or ABI SOLiD machines. - ------ - -**Example** - - -Suppose this is your sequencing read:: - - 5'---------*-------------*------**----3' - -where **dashes** (-) are HIGH quality bases (above 20) and **asterisks** (*) are LOW quality bases (below 20). If the **Minimal length of contiguous segment** is set to **5** (of course, only for the purposes of this example), the tool will return:: - - 5'--------- - ------------- - ------- - -You can see that the tool simply splits the read on low quality bases and then returns all segments that are at least 5 bases long. **Note** that the output of this tool will likely contain a higher number of shorter sequences compared to the original input. 
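The splitting logic just described can be sketched in a few lines. This is an illustrative re-implementation under assumed names, not the tool's own trim_seq function (which additionally supports trimming at a fixed position and keeping low-quality homopolymer bases)::

    # Sketch: mask low-quality bases with spaces, split on them, filter by length.
    def split_on_low_quality(seq, scores, trim_score, min_len):
        masked = ''.join(b if q >= trim_score else ' '
                         for b, q in zip(seq, scores))
        segments = masked.split()
        if min_len == 0:                   # keep only the longest segment(s)
            longest = max([len(s) for s in segments] or [0])
            return [s for s in segments if len(s) == longest]
        return [s for s in segments if len(s) >= min_len]

    read = 'AAAAAAAAA' + 'C' + 'GGGGGGGGGGGGG' + 'T' + 'CCCCCC'
    quals = [30] * 9 + [4] + [30] * 13 + [4] + [30] * 6
    print(split_on_low_quality(read, quals, 20, 5))   # three segments survive
    print(split_on_low_quality(read, quals, 20, 0))   # only the longest segment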
If we set the **Minimal length of contiguous segment** to **0**, the tool will only return the single longest segment:: - - ------------- - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_color_wrapper.py --- a/tools/metag_tools/shrimp_color_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,116 +0,0 @@ -#!/usr/bin/env python - -""" -SHRiMP wrapper : Color space -""" - -import os, sys, tempfile, os.path, re - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - - -def __main__(): - - # SHRiMP path - shrimp = 'rmapper-cs' - - # I/O - input_target_file = sys.argv[1] # fasta - input_query_file = sys.argv[2] - shrimp_outfile = sys.argv[3] # shrimp output - - # SHRiMP parameters - spaced_seed = '1111001111' - seed_matches_per_window = '2' - seed_hit_taboo_length = '4' - seed_generation_taboo_length = '0' - seed_window_length = '115.0' - max_hits_per_read = '100' - max_read_length = '1000' - kmer = '-1' - sw_match_value = '100' - sw_mismatch_value = '-150' - sw_gap_open_ref = '-400' - sw_gap_open_query = '-400' - sw_gap_ext_ref = '-70' - sw_gap_ext_query = '-70' - sw_crossover_penalty = '-140' - sw_full_hit_threshold = '68.0' - sw_vector_hit_threshold = '60.0' - - # TODO: put the threshold on each of these parameters - if len(sys.argv) > 4: - - try: - if sys.argv[4].isdigit(): - spaced_seed = sys.argv[4] - else: - stop_err('Error in assigning parameter: Spaced seed.') - except: - stop_err('Spaced seed must be a combination of 1s and 0s.') - - seed_matches_per_window = sys.argv[5] - seed_hit_taboo_length = sys.argv[6] - seed_generation_taboo_length = sys.argv[7] - seed_window_length = sys.argv[8] - max_hits_per_read = sys.argv[9] - max_read_length = sys.argv[10] - kmer = sys.argv[11] - sw_match_value = sys.argv[12] - sw_mismatch_value = sys.argv[13] - sw_gap_open_ref = sys.argv[14] - sw_gap_open_query = sys.argv[15] - sw_gap_ext_ref = sys.argv[16] - sw_gap_ext_query = sys.argv[17] - sw_crossover_penalty = sys.argv[18] - sw_full_hit_threshold = sys.argv[19] - sw_vector_hit_threshold = sys.argv[20] - - # temp file for shrimp log file - shrimp_log = tempfile.NamedTemporaryFile().name - - # SHRiMP command - command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-x', sw_crossover_penalty, '-h', sw_full_hit_threshold, '-v', sw_vector_hit_threshold, input_query_file, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - - try: - os.system(command) - except Exception, e: - if os.path.exists(query_fasta): os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - stop_err(str(e)) - - # check SHRiMP output: count number of lines - num_hits = 0 - if shrimp_outfile: - for i, line in enumerate(file(shrimp_outfile)): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): continue - try: - fields = line.split() - num_hits += 1 - except Exception, e: - stop_err(str(e)) - - if num_hits == 0: # no hits generated - err_msg = '' - if shrimp_log: - for i, line in enumerate(file(shrimp_log)): - if line.startswith('error'): # deal with memory error: - err_msg += line # error: realloc failed: Cannot allocate memory - if re.search('Reads 
Matched', line): # deal with zero hits - if int(line[8:].split()[2]) == 0: - err_msg = 'Zero hits found.\n' - stop_err('SHRiMP Failed due to:\n' + err_msg) - - - # remove temp. files - if os.path.exists(shrimp_log): os.remove(shrimp_log) - - -if __name__ == '__main__': __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_color_wrapper.xml --- a/tools/metag_tools/shrimp_color_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,181 +0,0 @@ - - reads mapping against reference sequence - - #if $param.skip_or_full=="skip" #shrimp_color_wrapper.py $input_target $input_query $output1 - #else #shrimp_color_wrapper.py $input_target $input_query $output1 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_crossover_penalty $param.sw_full_hit_threshold $param.sw_vector_hit_threshold - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapper-cs - - - - - - - - - - - -.. class:: warningmark - -To use this tool your dataset needs to be in the *csfasta* (as ABI SOLiD color-space sequences) format. Click pencil icon next to your dataset to set the datatype to *csfasta*. - - ------ - -**What it does** - -SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. - - ------ - -**Input formats** - -A multiple color-space file, for example:: - - >2_263_779_F3 - T132032030200202202003211302222202230022110222 - - ------ - -**Outputs** - -The tool returns the default SHRiMP output:: - - - 1 2 3 4 5 6 7 8 9 10 - -------------------------------------------------------------------------------------------------------------------- - >2_263_779_F3 Streptococcus_suis + 814344 814388 1 45 45 3660 8x19x3x2x6x4x3 - -where:: - - 1. (>2_263_779_F3) - Read id - 2. (Streptococcus_suis) - Reference sequence id - 3. (+) - Strand of the read - 4. (814344) - Start position of the alignment in the reference - 5. (814388) - End position of the alignment in the reference - 6. (1) - Start position of the alignment in the read - 7. (45) - End position of the alignment in the read - 8. (45) - Length of the read - 9. (3660) - Score - 10. (8x19x3x2x6x4x3) - Edit string - - ------ - -**SHRiMP parameter list** - -The commonly used parameters with default value setting:: - - -s Spaced Seed (default: 111111011111) - The spaced seed is a single contiguous string of 0's and 1's. - 0's represent wildcards, or positions which will always be - considered as matching, whereas 1's dictate positions that - must match. A string of all 1's will result in a simple kmer scan. - -n Seed Matches per Window (default: 2) - The number of seed matches per window dictates how many seeds - must match within some window length of the genome before that - region is considered for Smith-Waterman alignment. A lower - value will increase sensitivity while drastically increasing - running time. Higher values will have the opposite effect. - -t Seed Hit Taboo Length (default: 4) - The seed taboo length specifies how many target genome bases - or colours must exist prior to a previous seed match in order - to count another seed match as a hit. 
- -9 Seed Generation Taboo Length (default: 0) - - -w Seed Window Length (default: 115.00%) - This parameter specifies the genomic span in bases (or colours) - in which *seed_matches_per_window* must exist before the read - is given consideration by the Smith-Waterman alignment machinery. - -o Maximum Hits per Read (default: 100) - This parameter specifies how many hits to remember for each read. - If more hits are encountered, ones with lower scores are dropped - to make room. - -r Maximum Read Length (default: 1000) - This parameter specifies the maximum length of reads that will - be encountered in the dataset. If larger reads than the default - are used, an appropriate value must be passed to *rmapper*. - -d Kmer Std. Deviation Limit (default: -1 [None]) - This option permits pruning read kmers, which occur with - frequencies greater than *kmer_std_dev_limit* standard - deviations above the average. This can shorten running - time at the cost of some sensitivity. - *Note*: A negative value disables this option. - -m S-W Match Value (default: 100) - The value applied to matches during the Smith-Waterman score calculation. - -i S-W Mismatch Value (default: -150) - The value applied to mismatches during the Smith-Waterman - score calculation. - -g S-W Gap Open Penalty (Reference) (default: -400) - The value applied to gap opens along the reference sequence - during the Smith-Waterman score calculation. - *Note*: For backward compatibility, if -g is set - and -q is not set, the gap open penalty for the query will - be set to the same value as specified for the reference. - -q S-W Gap Open Penalty (Query) (default: -400) - The value applied to gap opens along the query sequence during - the Smith-Waterman score calculation. - -e S-W Gap Extend Penalty (Reference) (default: -70) - The value applied to gap extends during the Smith-Waterman score calculation. - *Note*: For backward compatibility, if -e is set - and -f is not set, the gap extend penalty for the query will - be set to the same value as specified for the reference. - -f S-W Gap Extend Penalty (Query) (default: -70) - The value applied to gap extends during the Smith-Waterman score calculation. - -x S-W Crossover Penalty (default: -140) - -h S-W Full Hit Threshold (default: 68.00%) - In letter-space, this parameter determines the threshold - score for both vectored and full Smith-Waterman alignments. - Any values less than this quantity will be thrown away. - *Note* This option differs slightly in meaning between letter-space and color-space. - -v S-W Vector Hit Threshold (default: 60.00%) - - ------ - -**Reference** - - **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_wrapper.py --- a/tools/metag_tools/shrimp_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,624 +0,0 @@ -#!/usr/bin/env python - -""" -TODO -1. decrease memory usage -2. multi-fasta fastq file, ex. 454 -3. split reads into small chunks? - -SHRiMP wrapper - -Inputs: -1. reference seq -2. reads - -Outputs: -1. table of 8 columns: - chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage -2. 
SHRiMP output - -Parameters: - -s Spaced Seed (default: 111111011111) - -n Seed Matches per Window (default: 2) - -t Seed Hit Taboo Length (default: 4) - -9 Seed Generation Taboo Length (default: 0) - -w Seed Window Length (default: 115.00%) - -o Maximum Hits per Read (default: 100) - -r Maximum Read Length (default: 1000) - -d Kmer Std. Deviation Limit (default: -1 [None]) - - -m S-W Match Value (default: 100) - -i S-W Mismatch Value (default: -150) - -g S-W Gap Open Penalty (Reference) (default: -400) - -q S-W Gap Open Penalty (Query) (default: -400) - -e S-W Gap Extend Penalty (Reference) (default: -70) - -f S-W Gap Extend Penalty (Query) (default: -70) - -h S-W Hit Threshold (default: 68.00%) - -Command: -%rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold > 2> - -SHRiMP output: ->7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13 ->7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14 ->7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16 - -""" - -import os, sys, tempfile, os.path, re - -assert sys.version_info[:2] >= (2.4) - -def stop_err( msg ): - - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def reverse_complement(s): - - complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":".", "-":"-"} - reversed_s = [] - for i in s: - reversed_s.append(complement_dna[i]) - reversed_s.reverse() - return "".join(reversed_s) - -def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size): - - invalid_editstring_char = 0 - - all_score_file = score_files.split(',') - - if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. 
Please check your dataset.') - - temp_table_name = tempfile.NamedTemporaryFile().name - temp_table = open(temp_table_name, 'w') - - outfile = open(table_outfile,'w') - - # reference seq: not a single fasta seq - refseq = {} - chrom_cov = {} - seq = '' - - for i, line in enumerate(file(ref_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - if line.startswith('>'): - if seq: - if refseq.has_key(title): - pass - else: - refseq[title] = seq - chrom_cov[title] = {} - seq = '' - title = line[1:] - else: - seq += line - if seq: - if not refseq.has_key(title): - refseq[title] = seq - chrom_cov[title] = {} - - # find hits : one end and/or the other - hits = {} - for i, line in enumerate(file(result_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring - fields = line.split('\t') - readname = fields[0][1:] - chrom = fields[1] - strand = fields[2] - chrom_start = int(fields[3]) - 1 - chrom_end = int(fields[4]) - read_start = fields[5] - read_end = fields[6] - read_len = fields[7] - score = fields[8] - editstring = fields[9] - - if hit_per_read == 1: - endindex = '1' - else: - readname, endindex = readname.split('/') - - if hits.has_key(readname): - if hits[readname].has_key(endindex): - hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom]) - else: - hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]] - else: - hits[readname] = {} - hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]] - - # find score : one end and the other end - hits_score = {} - readname = '' - score = '' - for num_score_file in range(len(all_score_file)): - score_file = all_score_file[num_score_file] - for i, line in enumerate(file(score_file)): - line = line.rstrip() - if not line or line.startswith('#'): continue - - if line.startswith('>'): - if score: - if hits.has_key(readname): - if len(hits[readname]) == hit_per_read: - if hits_score.has_key(readname): - if hits_score[readname].has_key(endindex): - pass - else: - hits_score[readname][endindex] = score - else: - hits_score[readname] = {} - hits_score[readname][endindex] = score - score = '' - if hit_per_read == 1: - readname = line[1:] - endindex = '1' - else: - readname, endindex = line[1:].split('/') - else: - score = line - - if score: # the last one - if hits.has_key(readname): - if len(hits[readname]) == hit_per_read: - if hits_score.has_key(readname): - if hits_score[readname].has_key(endindex): - pass - else: - hits_score[readname][endindex] = score - else: - hits_score[readname] = {} - hits_score[readname][endindex] = score - - # call to all mappings - for readkey in hits.keys(): - if len(hits[readkey]) != hit_per_read: continue - - matches = [] - match_count = 0 - - if hit_per_read == 1: - if len(hits[readkey]['1']) == 1: - matches = [ hits[readkey]['1'] ] - match_count = 1 - else: - end1_data = hits[readkey]['1'] - end2_data = hits[readkey]['2'] - - for i, end1_hit in enumerate(end1_data): - crin_strand = {'+': False, '-': False} - crin_insertSize = {'+': False, '-': False} - - crin_strand[end1_hit[0]] = True - crin_insertSize[end1_hit[0]] = int(end1_hit[2]) - - for j, end2_hit in enumerate(end2_data): - crin_strand[end2_hit[0]] = True - crin_insertSize[end2_hit[0]] = int(end2_hit[2]) - - if end1_hit[-1] != end2_hit[-1] : continue - - if crin_strand['+'] and crin_strand['-']: - if 
(crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size: - matches.append([end1_hit, end2_hit]) - match_count += 1 - - if match_count == 1: - - for x, end_data in enumerate(matches[0]): - - end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data - end_read_start = int(end_read_start) - 1 - - if end_strand == '-': - refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end]) - else: - refsegment = refseq[end_chrom][end_chr_start:end_chr_end] - - match_len = 0 - editindex = 0 - gap_read = 0 - - while editindex < len(end_editstring): - - editchr = end_editstring[editindex] - chrA = '' - chrB = '' - locIndex = [] - - if editchr.isdigit(): - editcode = '' - - while editchr.isdigit() and editindex < len(end_editstring): - editcode += editchr - editindex += 1 - if editindex < len(end_editstring): editchr = end_editstring[editindex] - - for baseIndex in range(int(editcode)): - chrA += refsegment[match_len+baseIndex] - chrB = chrA - - match_len += int(editcode) - - elif editchr == 'x': - # crossover: inserted between the appropriate two bases - # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers) - # Treated as errors in the reads; Do nothing. - editindex += 1 - - elif editchr.isalpha(): - editcode = editchr - editindex += 1 - chrA = refsegment[match_len] - chrB = editcode - match_len += len(editcode) - - elif editchr == '-': - editcode = editchr - editindex += 1 - chrA = refsegment[match_len] - chrB = editcode - match_len += len(editcode) - gap_read += 1 - - elif editchr == '(': - editcode = '' - - while editchr != ')' and editindex < len(end_editstring): - if editindex < len(end_editstring): editchr = end_editstring[editindex] - editcode += editchr - editindex += 1 - - editcode = editcode[1:-1] - chrA = '-'*len(editcode) - chrB = editcode - - else: - invalid_editstring_char += 1 - - if end_strand == '-': - - chrA = reverse_complement(chrA) - chrB = reverse_complement(chrB) - - pos_line = '' - rev_line = '' - - for mappingIndex in range(len(chrA)): - # reference - chrAx = chrA[mappingIndex] - # read - chrBx = chrB[mappingIndex] - - if chrAx and chrBx and chrBx.upper() != 'N': - - if end_strand == '+': - - chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex - read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read - - if chrAx == '-': chrom_loc -= 1 - - if chrBx == '-': - scoreBx = '-1' - else: - scoreBx = hits_score[readkey][str(x+1)].split()[read_loc] - - # 1-based on chrom_loc and read_loc - pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n' - - else: - - chrom_loc = end_chr_end-match_len+mappingIndex - read_loc = end_read_start+match_len-1-mappingIndex-gap_read - - if chrAx == '-': chrom_loc -= 1 - - if chrBx == '-': - scoreBx = '-1' - else: - scoreBx = hits_score[readkey][str(x+1)].split()[read_loc] - - # 1-based on chrom_loc and read_loc - rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line - - if chrom_cov.has_key(end_chrom): - - if chrom_cov[end_chrom].has_key(chrom_loc): - chrom_cov[end_chrom][chrom_loc] += 1 - else: - chrom_cov[end_chrom][chrom_loc] = 1 - - else: - - chrom_cov[end_chrom] = {} - chrom_cov[end_chrom][chrom_loc] = 1 - - if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n'))) - if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n'))) - - temp_table.close() - - # chrom-wide coverage - for i, line in 
enumerate(open(temp_table_name)): - - line = line.rstrip() - if not line or line.startswith('#'): continue - - fields = line.split() - chrom = fields[0] - eachBp = int(fields[1]) - readname = fields[2] - - if hit_per_read == 1: - fields[2] = readname.split('/')[0] - - if chrom_cov[chrom].has_key(eachBp): - outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp])) - else: - outfile.write('%s\t%d\n' %('\t'.join(fields), 0)) - - outfile.close() - - if os.path.exists(temp_table_name): os.remove(temp_table_name) - - if invalid_editstring_char: - print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings' - - return True - -def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual): - - outfile_seq = open( query_fasta, 'w' ) - outfile_score = open( query_qual, 'w' ) - - seq_title_startswith = '' - qual_title_startswith = '' - - default_coding_value = 64 # Solexa ascii-code - fastq_block_lines = 0 - - for i, line in enumerate( file( infile_name ) ): - line = line.rstrip() - if not line or line.startswith( '#' ): continue - - fastq_block_lines = ( fastq_block_lines + 1 ) % 4 - line_startswith = line[0:1] - - if fastq_block_lines == 1: - # first line is @title_of_seq - if not seq_title_startswith: - seq_title_startswith = line_startswith - - if line_startswith != seq_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - - read_title = line[1:] - outfile_seq.write( '>%s\n' % line[1:] ) - - elif fastq_block_lines == 2: - # second line is nucleotides - read_length = len( line ) - outfile_seq.write( '%s\n' % line ) - - elif fastq_block_lines == 3: - # third line is +title_of_qualityscore ( might be skipped ) - if not qual_title_startswith: - qual_title_startswith = line_startswith - - if line_startswith != qual_title_startswith: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) ) - - quality_title = line[1:] - if quality_title and read_title != quality_title: - outfile_seq.close() - outfile_score.close() - stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) ) - - if not quality_title: - outfile_score.write( '>%s\n' % read_title ) - else: - outfile_score.write( '>%s\n' % line[1:] ) - - else: - # fourth line is quality scores - qual = '' - fastq_integer = True - # peek: ascii or digits? - val = line.split()[0] - try: - check = int( val ) - fastq_integer = True - except: - fastq_integer = False - - if fastq_integer: - # digits - qual = line - else: - # ascii - quality_score_length = len( line ) - if quality_score_length == read_length + 1: - # first char is qual_score_startswith - qual_score_startswith = ord( line[0:1] ) - line = line[1:] - elif quality_score_length == read_length: - qual_score_startswith = default_coding_value - else: - stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' 
% ( i + 1, quality_score_length, read_length ) ) - - for j, char in enumerate( line ): - score = ord( char ) - qual_score_startswith # 64 - qual = "%s%s " % ( qual, str( score ) ) - - outfile_score.write( '%s\n' % qual ) - - outfile_seq.close() - outfile_score.close() - - return True - -def __main__(): - - # SHRiMP path - shrimp = 'rmapper-ls' - - # I/O - input_target_file = sys.argv[1] # fasta - shrimp_outfile = sys.argv[2] # shrimp output - table_outfile = sys.argv[3] # table output - single_or_paired = sys.argv[4].split(',') - - insertion_size = 600 - - if len(single_or_paired) == 1: # single or paired - type_of_reads = 'single' - hit_per_read = 1 - input_query = single_or_paired[0] - query_fasta = tempfile.NamedTemporaryFile().name - query_qual = tempfile.NamedTemporaryFile().name - - else: # paired-end - type_of_reads = 'paired' - hit_per_read = 2 - input_query_end1 = single_or_paired[0] - input_query_end2 = single_or_paired[1] - insertion_size = int(single_or_paired[2]) - query_fasta_end1 = tempfile.NamedTemporaryFile().name - query_fasta_end2 = tempfile.NamedTemporaryFile().name - query_qual_end1 = tempfile.NamedTemporaryFile().name - query_qual_end2 = tempfile.NamedTemporaryFile().name - - # SHRiMP parameters: total = 15, default values - spaced_seed = '111111011111' - seed_matches_per_window = '2' - seed_hit_taboo_length = '4' - seed_generation_taboo_length = '0' - seed_window_length = '115.0' - max_hits_per_read = '100' - max_read_length = '1000' - kmer = '-1' - sw_match_value = '100' - sw_mismatch_value = '-150' - sw_gap_open_ref = '-400' - sw_gap_open_query = '-400' - sw_gap_ext_ref = '-70' - sw_gap_ext_query = '-70' - sw_hit_threshold = '68.0' - - # TODO: put the threshold on each of these parameters - if len(sys.argv) > 5: - - try: - if sys.argv[5].isdigit(): - spaced_seed = sys.argv[5] - else: - stop_err('Error in assigning parameter: Spaced seed.') - except: - stop_err('Spaced seed must be a combination of 1s and 0s.') - - seed_matches_per_window = sys.argv[6] - seed_hit_taboo_length = sys.argv[7] - seed_generation_taboo_length = sys.argv[8] - seed_window_length = sys.argv[9] - max_hits_per_read = sys.argv[10] - max_read_length = sys.argv[11] - kmer = sys.argv[12] - sw_match_value = sys.argv[13] - sw_mismatch_value = sys.argv[14] - sw_gap_open_ref = sys.argv[15] - sw_gap_open_query = sys.argv[16] - sw_gap_ext_ref = sys.argv[17] - sw_gap_ext_query = sys.argv[18] - sw_hit_threshold = sys.argv[19] - - # temp file for shrimp log file - shrimp_log = tempfile.NamedTemporaryFile().name - - # convert fastq to fasta and quality score files - if type_of_reads == 'single': - return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual) - else: - return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1) - return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2) - - # SHRiMP command - if type_of_reads == 'single': - command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - - try: - os.system(command) - except Exception, e: - if os.path.exists(query_fasta): 
os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - stop_err(str(e)) - - else: # paired - command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) - command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log]) - - try: - os.system(command_end1) - os.system(command_end2) - except Exception, e: - if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1) - if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2) - if os.path.exists(query_qual_end1): os.remove(query_qual_end1) - if os.path.exists(query_qual_end2): os.remove(query_qual_end2) - stop_err(str(e)) - - # check SHRiMP output: count number of lines - num_hits = 0 - if shrimp_outfile: - for i, line in enumerate(file(shrimp_outfile)): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): continue - try: - fields = line.split() - num_hits += 1 - except Exception, e: - stop_err(str(e)) - - if num_hits == 0: # no hits generated - err_msg = '' - if shrimp_log: - for i, line in enumerate(file(shrimp_log)): - if line.startswith('error'): # deal with memory error: - err_msg += line # error: realloc failed: Cannot allocate memory - if re.search('Reads Matched', line): # deal with zero hits - if int(line[8:].split()[2]) == 0: - err_msg = 'Zero hits found.\n' - stop_err('SHRiMP Failed due to:\n' + err_msg) - - # convert to table - if type_of_reads == 'single': - return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size) - else: - return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size) - - # remove temp. 
files - if type_of_reads == 'single': - if os.path.exists(query_fasta): os.remove(query_fasta) - if os.path.exists(query_qual): os.remove(query_qual) - else: - if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1) - if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2) - if os.path.exists(query_qual_end1): os.remove(query_qual_end1) - if os.path.exists(query_qual_end2): os.remove(query_qual_end2) - - if os.path.exists(shrimp_log): os.remove(shrimp_log) - - -if __name__ == '__main__': __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/shrimp_wrapper.xml --- a/tools/metag_tools/shrimp_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,279 +0,0 @@ - - reads mapping against reference sequence - - #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $input_query - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size - #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $input_query $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full") #shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rmapper-ls - - - - - - - - - - - - - -.. class:: warningmark - -IMPORTANT: This tool currently only supports data where the quality scores are integers or ASCII quality scores with base 64. Click pencil icon next to your dataset to set datatype to *fastqsolexa*. - - ------ - -**What it does** - -SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. - -This wrapper post-processes the default SHRiMP/rmapper-ls output and generates a table with all information from reads and reference for the mapping. The tool takes single- or paired-end reads. For single-end reads, only uniquely mapped alignment is considered. In paired-end reads, only pairs that meet the following criteria will be used to generate the table: 1). the ends fall within the insertion size; 2). the ends are mapped at the opposite directions. If there are still multiple mappings after applying the criteria, this paired-end read will be discarded. - - ------ - -**Input formats** - -A multiple-fastq file, for example:: - - @seq1 - TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT - +seq1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - - ------ - -**Outputs** - -The tool gives two outputs. 
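Both outputs are described below. To make the post-processing step concrete first: a minimal sketch (a hypothetical helper, not part of the deleted shrimp_wrapper.py above, which does this inline in generate_sub_table()) of how one rmapper-ls hit line is split into the named fields the wrapper consumes; the field layout follows the FORMAT comment in that function::

  def parse_rmapper_hit(line):
      # FORMAT: readname contigname strand contigstart contigend
      #         readstart readend readlength score editstring
      fields = line.rstrip().split('\t')
      return {
          'readname':    fields[0].lstrip('>'),   # reads are reported as '>name'
          'chrom':       fields[1],
          'strand':      fields[2],
          'chrom_start': int(fields[3]) - 1,      # 0-based, as in the wrapper
          'chrom_end':   int(fields[4]),
          'read_start':  fields[5],
          'read_end':    fields[6],
          'read_len':    fields[7],
          'score':       fields[8],
          'editstring':  fields[9],
      }

  # e.g. parse_rmapper_hit('>seq1\tchrM\t+\t3644\t3679\t1\t36\t36\t3600\t36')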
-
-**Table output**
-
-Table output contains 8 columns::
-
-     1      2     3  4  5  6   7  8
-  ----------------------------------------------------
-  chrM  14711  seq1  0  T  A  40  1
-  chrM  14712  seq1  1  T  T  40  1
-
-where::
-
-  1. (chrM)  - Reference sequence id
-  2. (14711) - Position of the mapping in the reference
-  3. (seq1)  - Read id
-  4. (0)     - Position of the mapping in the read
-  5. (T)     - Nucleotide in the reference
-  6. (A)     - Nucleotide in the read
-  7. (40)    - Quality score for the nucleotide at that position of the read
-  8. (1)     - The number of times this position is covered by reads
-
-
-**SHRiMP output**
-
-This is the default output from SHRiMP/rmapper-ls::
-
-     1     2  3     4     5  6   7   8     9  10
-  -------------------------------------------------------------------
-  seq1  chrM  +  3644  3679  1  36  36  3600  36
-
-where::
-
-  1.  (seq1) - Read id
-  2.  (chrM) - Reference sequence id
-  3.  (+)    - Strand of the read
-  4.  (3644) - Start position of the alignment in the reference
-  5.  (3679) - End position of the alignment in the reference
-  6.  (1)    - Start position of the alignment in the read
-  7.  (36)   - End position of the alignment in the read
-  8.  (36)   - Length of the read
-  9.  (3600) - Score
-  10. (36)   - Edit string
-
-
------
-
-**SHRiMP parameter list**
-
-The commonly used parameters and their default values::
-
-  -s    Spaced Seed                          (default: 111111011111)
-        The spaced seed is a single contiguous string of 0's and 1's.
-        0's represent wildcards, or positions which will always be
-        considered as matching, whereas 1's dictate positions that
-        must match. A string of all 1's will result in a simple kmer scan.
-  -n    Seed Matches per Window              (default: 2)
-        The number of seed matches per window dictates how many seeds
-        must match within some window length of the genome before that
-        region is considered for Smith-Waterman alignment. A lower
-        value will increase sensitivity while drastically increasing
-        running time. Higher values will have the opposite effect.
-  -t    Seed Hit Taboo Length                (default: 4)
-        The seed taboo length specifies how many target genome bases
-        or colors must exist prior to a previous seed match in order
-        to count another seed match as a hit.
-  -9    Seed Generation Taboo Length         (default: 0)
-
-  -w    Seed Window Length                   (default: 115.00%)
-        This parameter specifies the genomic span in bases (or colours)
-        in which *seed_matches_per_window* must exist before the read
-        is given consideration by the Smith-Waterman alignment machinery.
-  -o    Maximum Hits per Read                (default: 100)
-        This parameter specifies how many hits to remember for each read.
-        If more hits are encountered, ones with lower scores are dropped
-        to make room.
-  -r    Maximum Read Length                  (default: 1000)
-        This parameter specifies the maximum length of reads that will
-        be encountered in the dataset. If larger reads than the default
-        are used, an appropriate value must be passed to *rmapper*.
-  -d    Kmer Std. Deviation Limit            (default: -1 [None])
-        This option permits pruning read kmers which occur with
-        frequencies greater than *kmer_std_dev_limit* standard
-        deviations above the average. This can shorten running
-        time at the cost of some sensitivity.
-        *Note*: A negative value disables this option.
-  -m    S-W Match Value                      (default: 100)
-        The value applied to matches during the Smith-Waterman score calculation.
-  -i    S-W Mismatch Value                   (default: -150)
-        The value applied to mismatches during the Smith-Waterman
-        score calculation.
-  -g    S-W Gap Open Penalty (Reference)     (default: -400)
-        The value applied to gap opens along the reference sequence
-        during the Smith-Waterman score calculation.
-        *Note*: For backward compatibility, if -g is set
-        and -q is not set, the gap open penalty for the query will
-        be set to the same value as specified for the reference.
-  -q    S-W Gap Open Penalty (Query)         (default: -400)
-        The value applied to gap opens along the query sequence during
-        the Smith-Waterman score calculation.
-  -e    S-W Gap Extend Penalty (Reference)   (default: -70)
-        The value applied to gap extends during the Smith-Waterman score calculation.
-        *Note*: For backward compatibility, if -e is set
-        and -f is not set, the gap extend penalty for the query will
-        be set to the same value as specified for the reference.
-  -f    S-W Gap Extend Penalty (Query)       (default: -70)
-        The value applied to gap extends during the Smith-Waterman score calculation.
-  -h    S-W Hit Threshold                    (default: 68.00%)
-        In letter-space, this parameter determines the threshold
-        score for both vectored and full Smith-Waterman alignments.
-        Any values less than this quantity will be thrown away.
-        *Note*: This option differs slightly in meaning between letter-space and color-space.
-
-
------
-
-**Reference**
-
- **SHRiMP**: Stephen M. Rumble, Michael Brudno, Phil Lacroute, Vladimir Yanovsky, Marc Fiume, Adrian Dalca. shrimp at cs dot toronto dot edu.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/split_paired_reads.py
--- a/tools/metag_tools/split_paired_reads.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,54 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Split fixed length paired end reads
-"""
-
-import os, sys
-
-if __name__ == '__main__':
-
-    infile = sys.argv[1]
-    outfile_end1 = open(sys.argv[2], 'w')
-    outfile_end2 = open(sys.argv[3], 'w')
-
-    i = 0
-
-    for line in file( infile ):
-        line = line.rstrip()
-
-        if not line:
-            continue
-
-        end1 = ''
-        end2 = ''
-
-        line_index = i % 4
-
-        if line_index == 0:
-            end1 = line + '/1'
-            end2 = line + '/2'
-
-        elif line_index == 1:
-            seq_len = len(line)/2
-            end1 = line[0:seq_len]
-            end2 = line[seq_len:]
-
-        elif line_index == 2:
-            end1 = line + '/1'
-            end2 = line + '/2'
-
-        else:
-            qual_len = len(line)/2
-            end1 = line[0:qual_len]
-            end2 = line[qual_len:]
-
-        outfile_end1.write('%s\n' %(end1))
-        outfile_end2.write('%s\n' %(end2))
-
-        i += 1
-
-    if i % 4 != 0 :
-        sys.stderr.write("WARNING: Number of lines in the input file was not divisible by 4.\nCheck consistency of the input fastq file.\n")
-    outfile_end1.close()
-    outfile_end2.close()
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/metag_tools/split_paired_reads.xml
--- a/tools/metag_tools/split_paired_reads.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,57 +0,0 @@
-
-
-
-    split_paired_reads.py $input $output1 $output2
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**What it does**
-
-Splits a single fastq dataset representing a paired-end run into two datasets (one for each end). This tool works only for datasets where both ends have **the same** length.
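To make the splitting rule concrete, here is a condensed sketch of the per-record logic from split_paired_reads.py above (same behavior, written as a standalone function; the function name is ours, not the tool's)::

  def split_record(title, seq, qual_title, qual):
      # Title lines get '/1' and '/2' suffixes; the sequence and
      # quality lines are simply cut in half, which is why both
      # ends must have the same length.
      half = len(seq) // 2
      end1 = (title + '/1', seq[:half], qual_title + '/1', qual[:half])
      end2 = (title + '/2', seq[half:], qual_title + '/2', qual[half:])
      return end1, end2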
- ------ - -**Input formats** - -A multiple-fastq file, for example:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - ------ - -**Outputs** - -One end:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 - hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh - -The other end:: - - @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA - +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 - hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/cca.py --- a/tools/multivariate_stats/cca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,159 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -x_cols = sys.argv[2].split(',') -y_cols = sys.argv[3].split(',') - -x_scale = x_center = "FALSE" -if sys.argv[4] == 'both': - x_scale = x_center = "TRUE" -elif sys.argv[4] == 'center': - x_center = "TRUE" -elif sys.argv[4] == 'scale': - x_scale = "TRUE" - -y_scale = y_center = "FALSE" -if sys.argv[5] == 'both': - y_scale = y_center = "TRUE" -elif sys.argv[5] == 'center': - y_center = "TRUE" -elif sys.argv[5] == 'scale': - y_scale = "TRUE" - -std_scores = "FALSE" -if sys.argv[6] == "yes": - std_scores = "TRUE" - -outfile = sys.argv[7] -outfile2 = sys.argv[8] - -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -y_vals = [] - -for k,col in enumerate(y_cols): - y_cols[k] = int(col)-1 - y_vals.append([]) - -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for col in x_cols+y_cols: - try: - assert float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = NaN# - x_vals[k].append(xval) - for k,col in enumerate(y_cols): - try: - yval = float(fields[col]) - except: - yval = NaN# - y_vals[k].append(yval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -y_vals1 = numpy.asarray(y_vals).transpose() - -x_dat= r.list(array(x_vals1)) -y_dat= r.list(array(y_vals1)) - -try: - r.suppressWarnings(r.library("yacca")) -except: - stop_err("Missing R library yacca.") - -set_default_mode(NO_CONVERSION) -try: - xcolnames = ["c%d" %(el+1) for el in x_cols] - ycolnames = ["c%d" %(el+1) for el in y_cols] - cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), standardize_scores=r(std_scores)) - ftest = r.F_test_cca(cc) -except RException, rex: - stop_err("Encountered error while performing CCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -summary = r.summary(cc) - -ncomps = len(summary['corr']) -comps = summary['corr'].keys() -corr = summary['corr'].values() -xlab = summary['xlab'] -ylab = summary['ylab'] - -for i in range(ncomps): - corr[comps.index('CV %s' %(i+1))] = summary['corr'].values()[i] - -ftest=ftest.as_py() -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in corr])) -print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest['statistic']])) -print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest['p.value']])) - -print >>fout, "#X-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xcoef']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ycoef']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#X-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xstructcorr']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ystructcorr']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#X-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['xcrosscorr']): - print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Y-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for i,val in enumerate(summary['ycrosscorr']): - print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) - -r.pdf( outfile2, 8, 8 ) -#r.plot(cc) -for i in range(ncomps): - r.helio_plot(cc, cv = i+1, main = r.paste("Explained Variance for CV",i+1), type = 
"variance") -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/cca.xml --- a/tools/multivariate_stats/cca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,95 +0,0 @@ - - - - cca.py - $input1 - $x_cols - $y_cols - $x_scale - $y_scale - $std_scores - $out_file1 - $out_file2 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'yacca' library from R statistical package to perform Canonical Correlation Analysis (CCA) on the input data. It outputs two files, one containing the summary statistics of the performed CCA, and the other containing helioplots, which display structural loadings of X and Y variables on different canonical components. - -*Carter T. Butts (2009). yacca: Yet Another Canonical Correlation Analysis Package. R package version 1.1.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. - -- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - -- The summary statistics in the output are described below: - - - correlation: Canonical correlation between the canonical variates (i.e. transformed variables) - - F-statistic: F-value obtained from F Test for Canonical Correlations Using Rao's Approximation - - p-value: denotes significance of canonical correlations - - Coefficients: represent the coefficients of X and Y variables on each canonical variate - - Loadings: represent the correlations between the original variables in each set and their respective canonical variates - - CrossLoadings: represent the correlations between the original variables in each set and the opposite canonical variates - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kcca.py --- a/tools/multivariate_stats/kcca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,146 +0,0 @@ -#!/usr/bin/env python - -""" -Run kernel CCA using kcca() from R 'kernlab' package - -usage: %prog [options] - -i, --input=i: Input file - -o, --output1=o: Summary output - -x, --x_cols=x: X-Variable columns - -y, --y_cols=y: Y-Variable columns - -k, --kernel=k: Kernel function - -f, --features=f: Number of canonical components to return - -s, --sigma=s: sigma - -d, --degree=d: degree - -l, --scale=l: scale - -t, --offset=t: offset - -r, --order=r: order - -usage: %prog input output1 x_cols y_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None) -""" - -from galaxy import eggs -import sys, string -from rpy import * -import numpy -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -#Parse Command Line -options, args = doc_optparse.parse( __doc__ ) -#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': 
'/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'} - -infile = options.input -x_cols = options.x_cols.split(',') -y_cols = options.y_cols.split(',') -kernel = options.kernel -outfile = options.output1 -ncomps = int(options.features) -fout = open(outfile,'w') - -if ncomps < 1: - print "You chose to return '0' canonical components. Please try rerunning the tool with number of components = 1 or more." - sys.exit() -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -x_vals = [] -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) -y_vals = [] -for k,col in enumerate(y_cols): - y_cols[k] = int(col)-1 - y_vals.append([]) -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for col in x_cols+y_cols: - try: - assert float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = NaN# - x_vals[k].append(xval) - for k,col in enumerate(y_cols): - try: - yval = float(fields[col]) - except: - yval = NaN# - y_vals[k].append(yval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -y_vals1 = numpy.asarray(y_vals).transpose() - -x_dat= r.list(array(x_vals1)) -y_dat= r.list(array(y_vals1)) - -try: - r.suppressWarnings(r.library('kernlab')) -except: - stop_err('Missing R library kernlab') - -set_default_mode(NO_CONVERSION) -if kernel=="rbfdot" or kernel=="anovadot": - pars = r.list(sigma=float(options.sigma)) -elif kernel=="polydot": - pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset)) -elif kernel=="tanhdot": - pars = r.list(scale=float(options.scale),offset=float(options.offset)) -elif kernel=="besseldot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order)) -elif kernel=="anovadot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma)) -else: - pars = rlist() - -try: - kcc = r.kcca(x=x_dat, y=y_dat, kernel=kernel, kpar=pars, ncomps=ncomps) -except RException, rex: - stop_err("Encountered error while performing kCCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -kcor = r.kcor(kcc) -if ncomps == 1: - kcor = [kcor] -xcoef = r.xcoef(kcc) -ycoef = r.ycoef(kcc) - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) - -print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in kcor])) - -print >>fout, "#Estimated X-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(xcoef): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Estimated Y-coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(ycoef): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kcca.xml --- a/tools/multivariate_stats/kcca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 
+0,0 @@ - - - - kcca.py - --input=$input1 - --output1=$out_file1 - --x_cols=$x_cols - --y_cols=$y_cols - --kernel=$kernelChoice.kernel - --features=$features - #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree="None" - --scale="None" - --offset="None" - --order="None" - #elif $kernelChoice.kernel == "polydot": - --sigma="None" - --degree=$kernelChoice.degree - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "tanhdot": - --sigma="None" - --degree="None" - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "besseldot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order=$kernelChoice.order - #elif $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order="None" - #else: - --sigma="None" - --degree="None" - --scale="None" - --offset="None" - --order="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'kernlab' library from R statistical package to perform Kernel Canonical Correlation Analysis (kCCA) on the input data. - -*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/* - ------ - -.. class:: warningmark - -**Note** - -This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. 
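Two details of the deleted kcca.py above are worth noting when reading its kernel dispatch: the `elif kernel=="anovadot"` branch is unreachable (the first condition, `kernel=="rbfdot" or kernel=="anovadot"`, already captures it, so anovadot effectively runs with sigma only — the same dead branch exists in the Cheetah block of kcca.xml), and the final fallback calls `rlist()` where `r.list()` was presumably intended (kpca.py gets this right). The dispatch itself reduces to a small table; a condensed sketch (names are ours, mirroring the #if blocks above)::

  # Which kpar fields each kernel family uses; all other parameters
  # are passed through as the string "None" and ignored.
  KERNEL_PARS = {
      'rbfdot':    ('sigma',),
      'anovadot':  ('sigma', 'degree'),   # per the (unreachable) anovadot
                                          # branch; as wired, sigma only
      'polydot':   ('degree', 'scale', 'offset'),
      'tanhdot':   ('scale', 'offset'),
      'besseldot': ('degree', 'sigma', 'order'),
  }

  def kpar_for(kernel, options):
      # Build the kernel-parameter dict handed to kernlab's kcca()/kpca().
      return dict((name, float(getattr(options, name)))
                  for name in KERNEL_PARS.get(kernel, ()))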
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kpca.py --- a/tools/multivariate_stats/kpca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ -#!/usr/bin/env python - -""" -Run kernel PCA using kpca() from R 'kernlab' package - -usage: %prog [options] - -i, --input=i: Input file - -o, --output1=o: Summary output - -p, --output2=p: Figures output - -c, --var_cols=c: Variable columns - -k, --kernel=k: Kernel function - -f, --features=f: Number of principal components to return - -s, --sigma=s: sigma - -d, --degree=d: degree - -l, --scale=l: scale - -t, --offset=t: offset - -r, --order=r: order - -usage: %prog input output1 output2 var_cols kernel features sigma(or_None) degree(or_None) scale(or_None) offset(or_None) order(or_None) -""" - -from galaxy import eggs -import sys, string -from rpy import * -import numpy -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -#Parse Command Line -options, args = doc_optparse.parse( __doc__ ) -#{'options= kernel': 'rbfdot', 'var_cols': '1,2,3,4', 'degree': 'None', 'output2': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_260.dat', 'output1': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_259.dat', 'scale': 'None', 'offset': 'None', 'input': '/afs/bx.psu.edu/home/gua110/workspace/galaxy_bitbucket/database/files/000/dataset_256.dat', 'sigma': '1.0', 'order': 'None'} - -infile = options.input -x_cols = options.var_cols.split(',') -kernel = options.kernel -outfile = options.output1 -outfile2 = options.output2 -ncomps = int(options.features) -fout = open(outfile,'w') - -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - #xval = r('NA') - xval = NaN# - x_vals[k].append(xval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -dat= r.list(array(x_vals1)) - -try: - r.suppressWarnings(r.library('kernlab')) -except: - stop_err('Missing R library kernlab') - -set_default_mode(NO_CONVERSION) -if kernel=="rbfdot" or kernel=="anovadot": - pars = r.list(sigma=float(options.sigma)) -elif kernel=="polydot": - pars = r.list(degree=float(options.degree),scale=float(options.scale),offset=float(options.offset)) -elif kernel=="tanhdot": - pars = r.list(scale=float(options.scale),offset=float(options.offset)) -elif kernel=="besseldot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma),order=float(options.order)) -elif kernel=="anovadot": - pars = r.list(degree=float(options.degree),sigma=float(options.sigma)) -else: - pars = r.list() - -try: - kpc = r.kpca(x=r.na_exclude(dat), kernel=kernel, kpar=pars, features=ncomps) -except RException, rex: - stop_err("Encountered error while performing kPCA on the input data: %s" %(rex)) -set_default_mode(BASIC_CONVERSION) - -eig = r.eig(kpc) -pcv = r.pcv(kpc) -rotated = r.rotated(kpc) - -comps = eig.keys() -eigv = eig.values() -for i in range(ncomps): - eigv[comps.index('Comp.%s' %(i+1))] = eig.values()[i] - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) - -print >>fout, "#Eigenvalue\t%s" %("\t".join(["%.4g" % el for el in eig.values()])) - -print >>fout, "#Principal component vectors\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(pcv): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Rotated values\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -for obs,val in enumerate(rotated): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in val])) - -r.pdf( outfile2, 8, 8 ) -if ncomps != 1: - r.pairs(rotated,labels=r.list(range(1,ncomps+1)),main="Scatterplot of rotated values") -else: - r.plot(rotated, ylab='Comp.1', main="Scatterplot of rotated values") -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/kpca.xml --- a/tools/multivariate_stats/kpca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,140 +0,0 @@ - - - - kpca.py - --input=$input1 - --output1=$out_file1 - --output2=$out_file2 - --var_cols=$var_cols - --kernel=$kernelChoice.kernel - --features=$features - #if $kernelChoice.kernel == "rbfdot" or $kernelChoice.kernel == "anovadot": - --sigma=$kernelChoice.sigma - --degree="None" - --scale="None" - --offset="None" - --order="None" - #elif $kernelChoice.kernel == "polydot": - --sigma="None" - --degree=$kernelChoice.degree - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "tanhdot": - --sigma="None" - --degree="None" - --scale=$kernelChoice.scale - --offset=$kernelChoice.offset - --order="None" - #elif $kernelChoice.kernel == "besseldot": - --sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order=$kernelChoice.order - #elif $kernelChoice.kernel == "anovadot": - 
--sigma=$kernelChoice.sigma - --degree=$kernelChoice.degree - --scale="None" - --offset="None" - --order="None" - #else: - --sigma="None" - --degree="None" - --scale="None" - --offset="None" - --order="None" - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses functions from 'kernlab' library from R statistical package to perform Kernel Principal Component Analysis (kPCA) on the input data. It outputs two files, one containing the summary statistics of the performed kPCA, and the other containing a scatterplot matrix of rotated values reported by kPCA. - -*Alexandros Karatzoglou, Alex Smola, Kurt Hornik, Achim Zeileis (2004). kernlab - An S4 Package for Kernel Methods in R. Journal of Statistical Software 11(9), 1-20. URL http://www.jstatsoft.org/v11/i09/* - ------ - -.. class:: warningmark - -**Note** - -This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/pca.py --- a/tools/multivariate_stats/pca.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -x_cols = sys.argv[2].split(',') -method = sys.argv[3] -outfile = sys.argv[4] -outfile2 = sys.argv[5] - -if method == 'svd': - scale = center = "FALSE" - if sys.argv[6] == 'both': - scale = center = "TRUE" - elif sys.argv[6] == 'center': - center = "TRUE" - elif sys.argv[6] == 'scale': - scale = "TRUE" - -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." 
) - -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -skipped = 0 -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.strip().split("\t") - valid_line = True - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - skipped += 1 - valid_line = False - break - if valid_line: - for k,col in enumerate(x_cols): - xval = float(fields[col]) - x_vals[k].append(xval) - except: - skipped += 1 - -x_vals1 = numpy.asarray(x_vals).transpose() -dat= r.list(array(x_vals1)) - -set_default_mode(NO_CONVERSION) -try: - if method == "cor": - pc = r.princomp(r.na_exclude(dat), cor = r("TRUE")) - elif method == "cov": - pc = r.princomp(r.na_exclude(dat), cor = r("FALSE")) - elif method=="svd": - pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale)) -except RException, rex: - stop_err("Encountered error while performing PCA on the input data: %s" %(rex)) - -set_default_mode(BASIC_CONVERSION) -summary = r.summary(pc, loadings="TRUE") -ncomps = len(summary['sdev']) - -if type(summary['sdev']) == type({}): - comps_unsorted = summary['sdev'].keys() - comps=[] - sd = summary['sdev'].values() - for i in range(ncomps): - sd[i] = summary['sdev'].values()[comps_unsorted.index('Comp.%s' %(i+1))] - comps.append('Comp.%s' %(i+1)) -elif type(summary['sdev']) == type([]): - comps=[] - for i in range(ncomps): - comps.append('Comp.%s' %(i+1)) - sd = summary['sdev'] - -print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd])) -total_var = 0 -vars = [] -for s in sd: - var = s*s - total_var += var - vars.append(var) -for i,var in enumerate(vars): - vars[i] = vars[i]/total_var - -print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars])) - -print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -xcolnames = ["c%d" %(el+1) for el in x_cols] -if 'loadings' in summary: #in case of princomp - loadings = 'loadings' -elif 'rotation' in summary: #in case of prcomp - loadings = 'rotation' -for i,val in enumerate(summary[loadings]): - print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val])) - -print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) -if 'scores' in summary: #in case of princomp - scores = 'scores' -elif 'x' in summary: #in case of prcomp - scores = 'x' -for obs,sc in enumerate(summary[scores]): - print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc])) - -r.pdf( outfile2, 8, 8 ) -r.biplot(pc) -r.dev_off() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/multivariate_stats/pca.xml --- a/tools/multivariate_stats/pca.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ - - - - pca.py - $input1 - $var_cols - $methodChoice.method - $out_file1 - $out_file2 - #if $methodChoice.method == "svd": - $methodChoice.scale - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - rpy - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. 
class:: infomark - -**What it does** - -This tool performs Principal Component Analysis on the given numeric input data using functions from R statistical package - 'princomp' function (for Eigenvector based solution) and 'prcomp' function (for Singular value decomposition based solution). It outputs two files, one containing the summary statistics of PCA, and the other containing biplots of the observations and principal components. - -*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. - -- The summary statistics in the output are described below: - - - Std. deviation: Standard deviations of the principal components - - Loadings: a list of eigen-vectors/variable loadings - - Scores: Scores of the input data on the principal components - - - diff -r c2a356708570 -r 33c067c3ae34 tools/mutation/visualize.py --- a/tools/mutation/visualize.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,391 +0,0 @@ -#!/usr/bin/env python - -''' -Mutation Visualizer tool -''' - -from __future__ import division - -import sys, csv, os, math -import optparse - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "SVGFig" ) -import svgfig as svg - - -SVGPan = """ -/** - * SVGPan library 1.2 - * ==================== - * - * Given an unique existing element with id "viewport", including the - * the library into any SVG adds the following capabilities: - * - * - Mouse panning - * - Mouse zooming (using the wheel) - * - Object dargging - * - * Known issues: - * - * - Zooming (while panning) on Safari has still some issues - * - * Releases: - * - * 1.2, Sat Mar 20 08:42:50 GMT 2010, Zeng Xiaohui - * Fixed a bug with browser mouse handler interaction - * - * 1.1, Wed Feb 3 17:39:33 GMT 2010, Zeng Xiaohui - * Updated the zoom code to support the mouse wheel on Safari/Chrome - * - * 1.0, Andrea Leofreddi - * First release - * - * This code is licensed under the following BSD license: - * - * Copyright 2009-2010 Andrea Leofreddi (a.leofreddi@itcharm.com). All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are - * permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, this list - * of conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY Andrea Leofreddi ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL Andrea Leofreddi OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * The views and conclusions contained in the software and documentation are those of the - * authors and should not be interpreted as representing official policies, either expressed - * or implied, of Andrea Leofreddi. - */ - -var root = document.documentElement; - -var state = 'none', stateTarget, stateOrigin, stateTf; - -setupHandlers(root); - -/** - * Register handlers - */ -function setupHandlers(root){ - setAttributes(root, { - "onmouseup" : "add(evt)", - "onmousedown" : "handleMouseDown(evt)", - "onmousemove" : "handleMouseMove(evt)", - "onmouseup" : "handleMouseUp(evt)", - //"onmouseout" : "handleMouseUp(evt)", // Decomment this to stop the pan functionality when dragging out of the SVG element - }); - - if(navigator.userAgent.toLowerCase().indexOf('webkit') >= 0) - window.addEventListener('mousewheel', handleMouseWheel, false); // Chrome/Safari - else - window.addEventListener('DOMMouseScroll', handleMouseWheel, false); // Others -} - -/** - * Instance an SVGPoint object with given event coordinates. - */ -function getEventPoint(evt) { - var p = root.createSVGPoint(); - - p.x = evt.clientX; - p.y = evt.clientY; - - return p; -} - -/** - * Sets the current transform matrix of an element. - */ -function setCTM(element, matrix) { - var s = "matrix(" + matrix.a + "," + matrix.b + "," + matrix.c + "," + matrix.d + "," + matrix.e + "," + matrix.f + ")"; - - element.setAttribute("transform", s); -} - -/** - * Dumps a matrix to a string (useful for debug). - */ -function dumpMatrix(matrix) { - var s = "[ " + matrix.a + ", " + matrix.c + ", " + matrix.e + "\\n " + matrix.b + ", " + matrix.d + ", " + matrix.f + "\\n 0, 0, 1 ]"; - - return s; -} - -/** - * Sets attributes of an element. - */ -function setAttributes(element, attributes){ - for (i in attributes) - element.setAttributeNS(null, i, attributes[i]); -} - -/** - * Handle mouse move event. - */ -function handleMouseWheel(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var delta; - - if(evt.wheelDelta) - delta = evt.wheelDelta / 3600; // Chrome/Safari - else - delta = evt.detail / -90; // Mozilla - - var z = 1 + delta; // Zoom factor: 0.9/1.1 - - var g = svgDoc.getElementById("viewport"); - - var p = getEventPoint(evt); - - p = p.matrixTransform(g.getCTM().inverse()); - - // Compute new scale matrix in current mouse position - var k = root.createSVGMatrix().translate(p.x, p.y).scale(z).translate(-p.x, -p.y); - - setCTM(g, g.getCTM().multiply(k)); - - stateTf = stateTf.multiply(k.inverse()); -} - -/** - * Handle mouse move event. 
- */ -function handleMouseMove(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var g = svgDoc.getElementById("viewport"); - - if(state == 'pan') { - // Pan mode - var p = getEventPoint(evt).matrixTransform(stateTf); - - setCTM(g, stateTf.inverse().translate(p.x - stateOrigin.x, p.y - stateOrigin.y)); - } else if(state == 'move') { - // Move mode - var p = getEventPoint(evt).matrixTransform(g.getCTM().inverse()); - - setCTM(stateTarget, root.createSVGMatrix().translate(p.x - stateOrigin.x, p.y - stateOrigin.y).multiply(g.getCTM().inverse()).multiply(stateTarget.getCTM())); - - stateOrigin = p; - } -} - -/** - * Handle click event. - */ -function handleMouseDown(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - var g = svgDoc.getElementById("viewport"); - - if(evt.target.tagName == "svg") { - // Pan mode - state = 'pan'; - - stateTf = g.getCTM().inverse(); - - stateOrigin = getEventPoint(evt).matrixTransform(stateTf); - } - /*else { - // Move mode - state = 'move'; - - stateTarget = evt.target; - - stateTf = g.getCTM().inverse(); - - stateOrigin = getEventPoint(evt).matrixTransform(stateTf); - }*/ -} -/** - * Handle mouse button release event. - */ -function handleMouseUp(evt) { - if(evt.preventDefault) - evt.preventDefault(); - - evt.returnValue = false; - - var svgDoc = evt.target.ownerDocument; - - if(state == 'pan' || state == 'move') { - // Quit pan mode - state = ''; - } -} -""" - -COLS_PER_SAMPLE = 7 -HEADER_COLS = 4 - -HEIGHT = 6 -WIDTH = 12 -BAR_WIDTH = 1.5 -GAP = 2 - - -colors = {'A':'blue', 'C':'green', 'G':'orange', 'T':'red'} -bases = ['A', 'C', 'G', 'T' ] - -def stop_error(message): - print >> sys.stderr, message - sys.exit(1) - -def validate_bases(n_a, n_c, n_g, n_t, total): - if n_a > total: - return 'A' - elif n_c > total: - return 'C' - elif n_g > total: - return 'G' - elif n_t > total: - return 'T' - return None - -def main(opts, args): - s = svg.SVG('g', id='viewport') - - # display legend - for i, b in enumerate( bases ): - bt = svg.SVG("tspan", b, style="font-family:Verdana;font-size:20%") - s.append(svg.SVG("text", bt, x=12+(i*10), y=3, stroke="none", fill="black")) - s.append(svg.SVG("rect", x=14+(i*10), y=0, width=4, height=3, - stroke="none", fill=colors[b], fill_opacity=0.5)) - - reader = open(opts.input_file, 'U') - - samples = [] - for i in range(int(len(args)/3)): - index = i*3 - samples.append(dict(name=args[index], - a_col=args[index+1], - totals_col=args[index+2])) - - if opts.zoom == 'interactive': - y = 35 - else: - y = 25 - for i, sample in enumerate(samples): - x = 23+(i*(WIDTH+GAP)) - t = svg.SVG("text", svg.SVG("tspan", sample['name'], style="font-family:Verdana;font-size:25%"), - x=x, y=y, transform="rotate(-90 %i,%i)" % (x, y), stroke="none", fill="black") - s.append(t) - - count=1 - for line in reader: - row = line.split('\t') - highlighted_position = False - show_pos = True - position = row[int(opts.position_col)-1] - ref = row[int(opts.ref_col)-1].strip().upper() - # validate - if ref not in bases: - stop_error( "The reference column (col%s) contains invalid character '%s' at row %i of the dataset." 
% ( opts.ref_col, ref, count ) ) - # display positions - if opts.zoom == 'interactive': - textx = 0 - else: - textx = 7 - bt = svg.SVG("tspan", str(position), style="font-family:Verdana;font-size:25%") - s.append(svg.SVG("text", bt, x=textx, y=34+(count*(HEIGHT+GAP)), stroke="none", fill="black")) - s.append(svg.SVG("rect", x=0, y=30+(count*(HEIGHT+GAP)), width=14, height=HEIGHT, - stroke='none', fill=colors[ref.upper()], fill_opacity=0.2)) - - for sample_index, sample in enumerate(samples): - n_a = int(row[int(sample['a_col'])-1]) - n_c = int(row[int(sample['a_col'])+1-1]) - n_g = int(row[int(sample['a_col'])+2-1]) - n_t = int(row[int(sample['a_col'])+3-1]) - total = int(row[int(sample['totals_col'])-1]) - # validate - base_error = validate_bases(n_a, n_c, n_g, n_t, total) - if base_error: - stop_error("For sample %i (%s), the number of base %s reads is more than the coverage on row %i." % (sample_index+1, - sample['name'], - base_error, - count)) - - if total: - x = 16+(sample_index*(WIDTH+GAP)) - y = 30+(count*(HEIGHT+GAP)) - width = WIDTH - height = HEIGHT - if count%2: - s.append(svg.SVG("rect", x=x, y=y, width=width, height=height, - stroke='none', fill='grey', fill_opacity=0.25)) - else: - s.append(svg.SVG("rect", x=x, y=y, width=width, height=height, - stroke='none', fill='grey', fill_opacity=0.25)) - - for base, value in enumerate([n_a, n_c, n_g, n_t]): - width = int(math.ceil(value / total * WIDTH)) - s.append(svg.SVG("rect", x=x, y=y, width=width, height=BAR_WIDTH, - stroke='none', fill=colors[bases[base]], fill_opacity=0.6)) - y = y + BAR_WIDTH - - count=count+1 - - if opts.zoom == 'interactive': - canv = svg.canvas(s) - canv.save(opts.output_file) - import fileinput - flag = False - for line in fileinput.input(opts.output_file, inplace=1): - if line.startswith('' - flag = True - continue - else: - if flag: - print '' % SVGPan - flag = False - print line, - else: - zoom = int(opts.zoom) - w = "%ipx" % (x*(10+zoom)) - h = "%ipx" % (y*(2+zoom)) - canv = svg.canvas(s, width=w, height=h, viewBox="0 0 %i %i" %(x+100, y+100)) - canv.save(opts.output_file) - -if __name__ == '__main__': - parser = optparse.OptionParser() - parser.add_option('-i', '--input-file', dest='input_file', action='store') - parser.add_option('-o', '--output-file', dest='output_file', action='store') - parser.add_option('-z', '--zoom', dest='zoom', action='store', default='1') - parser.add_option('-p', '--position_col', dest='position_col', action='store', default='c0') - parser.add_option('-r', '--ref_col', dest='ref_col', action='store', default='c1') - (opts, args) = parser.parse_args() - main(opts, args) - sys.exit(1) - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/mutation/visualize.xml --- a/tools/mutation/visualize.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - - - visualize.py - --input-file=$input1 - --output-file=$out_file1 - --zoom=$zoom_value - --position_col=$position_col - --ref_col=$ref_col - #for $f in $sample_chooser: - "${f.name}" - ${f.a_col} - ${f.totals_col} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows you to visualize mutations described in a tabular input file. It generates an SVG image which can be viewed in any web browser. - -You will need to specify the position and reference columns in the input file. 
Then click on 'Add new Sample' to add each sample from the input file that you would like to visualize. For each sample you select, specify the column for base 'A' and the totals column, and enter a name.
-This tool assumes the columns specifying bases A, C, G, T are placed consecutively and in that order in the input file.
-
-Interactivity: If the interactive zoom option is selected, the resulting image can be zoomed in or out with the mouse scroll wheel and panned by dragging the image with the left mouse button.
-
------
-
-**General Example**
-
-Given the input file::
-
-  chrM 72 73 G 26394 4 49 0 26447 26398 1 23389 3 45 0 23437 23392 1
-  chrM 149 150 T 11 50422 2 96 50531 50435 1 4 45417 1 65 45487 45422 1
-
-To visualize the two samples in the input file, the following parameters are selected before running the tool::
-
-  Position column: 2
-  Reference Base column: 4
-  Sample 1 Label: gm blood
-  Sample 1 Base A column: 5
-  Sample 1 Totals column: 9
-  Sample 2 Label: gm cheek
-  Sample 2 Base A column: 12
-  Sample 2 Totals column: 16
-
-Visualization output:
-
-.. image:: ./static/images/mutation_visualization_example.png
-   :width: 150
-
-Here the left-most column represents the position, and its background color is the reference base color. Each column to its right describes one sample.
-In the output above, the blue bar is the longest, which means that base A is the most frequent base at position 72 in both samples.
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-"""Convert a BLAST XML file to 12 column tabular output
-
-Takes three command line options: input BLAST XML filename, output tabular
-BLAST filename, and output format (std for standard 12 columns, or ext for
-the extended 24 columns offered in the BLAST+ wrappers).
-
-The 12 output columns are 'qseqid sseqid pident length mismatch gapopen qstart
-qend sstart send evalue bitscore' (or 'std' at the BLAST+ command line), which
-mean:
-
-====== ========= ============================================
-Column NCBI name Description
------- --------- --------------------------------------------
-     1 qseqid    Query Seq-id (ID of your sequence)
-     2 sseqid    Subject Seq-id (ID of the database hit)
-     3 pident    Percentage of identical matches
-     4 length    Alignment length
-     5 mismatch  Number of mismatches
-     6 gapopen   Number of gap openings
-     7 qstart    Start of alignment in query
-     8 qend      End of alignment in query
-     9 sstart    Start of alignment in subject (database hit)
-    10 send      End of alignment in subject (database hit)
-    11 evalue    Expectation value (E-value)
-    12 bitscore  Bit score
-====== ========= ============================================
-
-The additional columns offered in the Galaxy BLAST+ wrappers are:
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-Most of these fields are given explicitly in the XML file; some, like
-the percentage identity and the number of gap openings, must be calculated.
-
-Be aware that the sequence in the extended tabular output or XML direct from
-BLAST+ may or may not use XXXX masking on regions of low complexity. This
-can throw off the calculation of percentage identity and gap openings.
-[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
-with these numbers changing depending on whether or not the low complexity
-filter is used.]
-
-This script attempts to produce identical output to what BLAST+ would have
-done. However, check this with "diff -b ..." since BLAST+ sometimes includes
-an extra space character (probably a bug).
-"""
-import sys
-import re
-
-if sys.version_info[:2] >= ( 2, 5 ):
-    import xml.etree.cElementTree as ElementTree
-else:
-    from galaxy import eggs
-    import pkg_resources; pkg_resources.require( "elementtree" )
-    from elementtree import ElementTree
-
-def stop_err( msg ):
-    sys.stderr.write("%s\n" % msg)
-    sys.exit(1)
-
-#Parse the command line
-try:
-    in_file, out_file, out_fmt = sys.argv[1:]
-except:
-    stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
-
-if out_fmt == "std":
-    extended = False
-elif out_fmt == "x22":
-    stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
-elif out_fmt == "ext":
-    extended = True
-else:
-    stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
-
-
-# get an iterable
-try:
-    context = ElementTree.iterparse(in_file, events=("start", "end"))
-except:
-    stop_err("Invalid data format.")
-# turn it into an iterator
-context = iter(context)
-# get the root element
-try:
-    event, root = context.next()
-except:
-    stop_err( "Invalid data format."
) - - -re_default_query_id = re.compile("^Query_\d+$") -assert re_default_query_id.match("Query_101") -assert not re_default_query_id.match("Query_101a") -assert not re_default_query_id.match("MyQuery_101") -re_default_subject_id = re.compile("^Subject_\d+$") -assert re_default_subject_id.match("Subject_1") -assert not re_default_subject_id.match("Subject_") -assert not re_default_subject_id.match("Subject_12a") -assert not re_default_subject_id.match("TheSubject_1") - - -outfile = open(out_file, 'w') -blast_program = None -for event, elem in context: - if event == "end" and elem.tag == "BlastOutput_program": - blast_program = elem.text - # for every tag - if event == "end" and elem.tag == "Iteration": - #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA - # sp|Q9BS26|ERP44_HUMAN - # Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 - # 406 - # - # - #Or, from BLAST 2.2.24+ run online - # Query_1 - # Sample - # 516 - # ... - qseqid = elem.findtext("Iteration_query-ID") - if re_default_query_id.match(qseqid): - #Place holder ID, take the first word of the query definition - qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] - qlen = int(elem.findtext("Iteration_query-len")) - - # for every within - for hit in elem.findall("Iteration_hits/Hit"): - #Expecting either this, - # gi|3024260|sp|P56514.1|OPSD_BUFBU - # RecName: Full=Rhodopsin - # P56514 - #or, - # Subject_1 - # gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus] - # Subject_1 - # - #apparently depending on the parse_deflines switch - sseqid = hit.findtext("Hit_id").split(None,1)[0] - hit_def = sseqid + " " + hit.findtext("Hit_def") - if re_default_subject_id.match(sseqid) \ - and sseqid == hit.findtext("Hit_accession"): - #Place holder ID, take the first word of the subject definition - hit_def = hit.findtext("Hit_def") - sseqid = hit_def.split(None,1)[0] - # for every within - for hsp in hit.findall("Hit_hsps/Hsp"): - nident = hsp.findtext("Hsp_identity") - length = hsp.findtext("Hsp_align-len") - pident = "%0.2f" % (100*float(nident)/float(length)) - - q_seq = hsp.findtext("Hsp_qseq") - h_seq = hsp.findtext("Hsp_hseq") - m_seq = hsp.findtext("Hsp_midline") - assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) - gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ - len(h_seq.replace('-', ' ').split())-1) - - mismatch = m_seq.count(' ') + m_seq.count('+') \ - - q_seq.count('-') - h_seq.count('-') - #TODO - Remove this alternative mismatch calculation and test - #once satisifed there are no problems - expected_mismatch = len(q_seq) \ - - sum(1 for q,h in zip(q_seq, h_seq) \ - if q == h or q == "-" or h == "-") - xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") - if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): - stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ - % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), - int(mismatch), expected_mismatch)) - - #TODO - Remove this alternative identity calculation and test - #once satisifed there are no problems - expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) - if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): - stop_err("%s vs %s identities, expected %i <= %i <= %i" \ - % (qseqid, sseqid, expected_identity, int(nident), - expected_identity + q_seq.count("X"))) - - - evalue = hsp.findtext("Hsp_evalue") - if evalue == "0": - evalue = "0.0" - else: - evalue = "%0.0e" % float(evalue) - - bitscore = 
float(hsp.findtext("Hsp_bit-score")) - if bitscore < 100: - #Seems to show one decimal place for lower scores - bitscore = "%0.1f" % bitscore - else: - #Note BLAST does not round to nearest int, it truncates - bitscore = "%i" % bitscore - - values = [qseqid, - sseqid, - pident, - length, #hsp.findtext("Hsp_align-len") - str(mismatch), - gapopen, - hsp.findtext("Hsp_query-from"), #qstart, - hsp.findtext("Hsp_query-to"), #qend, - hsp.findtext("Hsp_hit-from"), #sstart, - hsp.findtext("Hsp_hit-to"), #send, - evalue, #hsp.findtext("Hsp_evalue") in scientific notation - bitscore, #hsp.findtext("Hsp_bit-score") rounded - ] - - if extended: - sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) - #print hit_def, "-->", sallseqid - positive = hsp.findtext("Hsp_positive") - ppos = "%0.2f" % (100*float(positive)/float(length)) - qframe = hsp.findtext("Hsp_query-frame") - sframe = hsp.findtext("Hsp_hit-frame") - if blast_program == "blastp": - #Probably a bug in BLASTP that they use 0 or 1 depending on format - if qframe == "0": qframe = "1" - if sframe == "0": sframe = "1" - slen = int(hit.findtext("Hit_len")) - values.extend([sallseqid, - hsp.findtext("Hsp_score"), #score, - nident, - positive, - hsp.findtext("Hsp_gaps"), #gaps, - ppos, - qframe, - sframe, - #NOTE - for blastp, XML shows original seq, tabular uses XXX masking - q_seq, - h_seq, - str(qlen), - str(slen), - ]) - #print "\t".join(values) - outfile.write("\t".join(values) + "\n") - # prevents ElementTree from growing large datastructure - root.clear() - elem.clear() -outfile.close() diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/blastxml_to_tabular.xml --- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - Convert BLAST XML output to tabular - - blastxml_to_tabular.py $blastxml_file $tabular_file $out_format - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of -formats including tabular and a more detailed XML format. A complex workflow -may need both the XML and the tabular output - but running BLAST twice is -slow and wasteful. - -This tool takes the BLAST XML output and by default converts it into the -standard 12 column tabular equivalent: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 22 column tabular -BLAST output. 
-
-====== ============= ===========================================
-Column NCBI name     Description
------- ------------- -------------------------------------------
-    13 sallseqid     All subject Seq-id(s), separated by a ';'
-    14 score         Raw score
-    15 nident        Number of identical matches
-    16 positive      Number of positive-scoring matches
-    17 gaps          Total number of gaps
-    18 ppos          Percentage of positive-scoring matches
-    19 qframe        Query frame
-    20 sframe        Subject frame
-    21 qseq          Aligned part of query sequence
-    22 sseq          Aligned part of subject sequence
-    23 qlen          Query sequence length
-    24 slen          Subject sequence length
-====== ============= ===========================================
-
-Beware that the XML file (and thus the conversion) and the tabular output
-direct from BLAST+ may differ in the presence of XXXX masking on regions of
-low complexity (columns 21 and 22), and thus also in calculated figures like
-the percentage identity (column 3).
-
-
-
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/hide_stderr.py
--- a/tools/ncbi_blast_plus/hide_stderr.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-"""A simple script to redirect stderr to stdout when the return code is zero.
-
-See https://bitbucket.org/galaxy/galaxy-central/issue/325/
-
-Currently Galaxy ignores the return code from command line tools (even if it
-is non-zero, which by convention indicates an error) and treats any output on
-stderr as an error (even though by convention stderr is used for errors or
-warnings).
-
-This script runs the given command line, capturing all stdout and stderr in
-memory, and gets the return code. For a zero return code, any stderr (which
-should be warnings only) is added to the stdout. That way Galaxy believes
-everything is fine. For a non-zero return code, we output stdout as is, and
-any stderr, plus the return code to ensure there is some output on stderr.
-That way Galaxy treats this as an error.
-
-Once issue 325 is fixed, this script will not be needed.
-"""
-import sys
-import subprocess
-
-#Avoid using shell=True when we call subprocess to ensure that if the Python
-#script is killed, so too is the BLAST process.
-try:
-    words = []
-    for w in sys.argv[1:]:
-        if " " in w:
-            words.append('"%s"' % w)
-        else:
-            words.append(w)
-    cmd = " ".join(words)
-    child = subprocess.Popen(sys.argv[1:],
-                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-except Exception, err:
-    sys.stderr.write("Error invoking command:\n%s\n\n%s\n" % (cmd, err))
-    sys.exit(1)
-#Use .communicate as we can get deadlocks with .wait()
-stdout, stderr = child.communicate()
-return_code = child.returncode
-
-if return_code:
-    sys.stdout.write(stdout)
-    sys.stderr.write(stderr)
-    sys.stderr.write("Return error code %i from command:\n" % return_code)
-    sys.stderr.write("%s\n" % cmd)
-else:
-    sys.stdout.write(stdout)
-    sys.stdout.write(stderr)
diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,209 +0,0 @@
-
- Search nucleotide database with nucleotide query sequence(s)
- blastn -version
- hide_stderr.py
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments.
Galaxy will turn newlines into spaces -blastn --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --task $blast_type --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastn - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *nucleotide database* using a *nucleotide query*, -using the NCBI BLAST+ blastn command line tool. -Algorithms include blastn, megablast, and discontiguous megablast. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Zhang et al. A Greedy Algorithm for Aligning DNA Sequences. 2000. JCB: 203-214. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastp_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,276 +0,0 @@ - - Search protein database with protein query sequence(s) - blastp -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -blastp --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --task $blast_type --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -##Ungapped disabled for now - see comments below -##$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastp - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. 
-For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *protein database* using a *protein query*, -using the NCBI BLAST+ blastp command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. - -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - -Schaffer et al. Improving the accuracy of PSI-BLAST protein database searches with composition-based statistics and other refinements. 2001. Nucleic Acids Res. 29:2994-3005. 
- - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_blastx_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,240 +0,0 @@ - - Search protein database with translated nucleotide query sequence(s) - blastx -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -blastx --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - blastx - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *protein database* using a *translated nucleotide query*, -using the NCBI BLAST+ blastx command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastn_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,286 +0,0 @@ - - Search translated nucleotide database with protein query sequence(s) - tblastn -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. -## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -tblastn --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -##Ungapped disabled for now - see comments below -##$adv_opts.ungapped -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tblastn - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: warningmark - -**Note**. 
Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *translated nucleotide database* using a *protein query*, -using the NCBI BLAST+ tblastn command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. - -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml --- a/tools/ncbi_blast_plus/ncbi_tblastx_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,206 +0,0 @@ - - Search translated nucleotide database with translated nucleotide query sequence(s) - tblastx -version - hide_stderr.py -## The command is a Cheetah template which allows some Python based syntax. 
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces -tblastx --query "$query" -#if $db_opts.db_opts_selector == "db": - -db "${db_opts.database.fields.path}" -#else: - -subject "$db_opts.subject" -#end if --evalue $evalue_cutoff --out $output1 -##Set the extended list here so if/when we add things, saved workflows are not affected -#if str($out_format)=="ext": - -outfmt "6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen" -#else: - -outfmt $out_format -#end if --num_threads 8 -#if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.filter_query -$adv_opts.strand --matrix $adv_opts.matrix -## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string -## Note -max_target_seqs overrides -num_descriptions and -num_alignments -#if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): --max_target_seqs $adv_opts.max_hits -#end if -#if (str($adv_opts.word_size) and int(str($adv_opts.word_size)) > 0): --word_size $adv_opts.word_size -#end if -$adv_opts.parse_deflines -## End of advanced options: -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - tblastx - - - -.. class:: warningmark - -**Note**. Database searches may take a substantial amount of time. -For large input datasets it is advisable to allow overnight processing. - ------ - -**What it does** - -Search a *translated nucleotide database* using a *protein query*, -using the NCBI BLAST+ tblastx command line tool. - ------ - -**Output format** - -Because Galaxy focuses on processing tabular data, the default output of this -tool is tabular. The standard BLAST+ tabular output contains 12 columns: - -====== ========= ============================================ -Column NCBI name Description ------- --------- -------------------------------------------- - 1 qseqid Query Seq-id (ID of your sequence) - 2 sseqid Subject Seq-id (ID of the database hit) - 3 pident Percentage of identical matches - 4 length Alignment length - 5 mismatch Number of mismatches - 6 gapopen Number of gap openings - 7 qstart Start of alignment in query - 8 qend End of alignment in query - 9 sstart Start of alignment in subject (database hit) - 10 send End of alignment in subject (database hit) - 11 evalue Expectation value (E-value) - 12 bitscore Bit score -====== ========= ============================================ - -The BLAST+ tools can optionally output additional columns of information, -but this takes longer to calculate. Most (but not all) of these columns are -included by selecting the extended tabular output. The extra columns are -included *after* the standard 12 columns. This is so that you can write -workflow filtering steps that accept either the 12 or 24 column tabular -BLAST output. 
- -====== ============= =========================================== -Column NCBI name Description ------- ------------- ------------------------------------------- - 13 sallseqid All subject Seq-id(s), separated by a ';' - 14 score Raw score - 15 nident Number of identical matches - 16 positive Number of positive-scoring matches - 17 gaps Total number of gaps - 18 ppos Percentage of positive-scoring matches - 19 qframe Query frame - 20 sframe Subject frame - 21 qseq Aligned part of query sequence - 22 sseq Aligned part of subject sequence - 23 qlen Query sequence length - 24 slen Subject sequence length -====== ============= =========================================== - -The third option is BLAST XML output, which is designed to be parsed by -another program, and is understood by some Galaxy tools. - -You can also choose several plain text or HTML output formats which are designed to be read by a person (not by another program). -The HTML versions use basic webpage formatting and can include links to the hits on the NCBI website. -The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. -The two query anchored outputs show a multiple sequence alignment between the query and all the matches, -and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - -------- - -**References** - -Altschul et al. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. 1997. Nucleic Acids Res. 25:3389-3402. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/basecoverage.xml --- a/tools/new_operations/basecoverage.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ - - of all intervals - gops_basecoverage.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -This operation counts the total bases covered by a set of intervals. Bases that are covered by more than one interval are **not** counted more than once towards the total. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/cluster.xml --- a/tools/new_operations/cluster.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - the intervals of a dataset - gops_cluster.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -d $distance -m $minregions -o $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations
-
------
-
-**Syntax**
-
-- **Maximum distance** is the greatest distance in base pairs allowed between intervals that will be considered "clustered". **Negative** values for distance are allowed, and are useful for clustering intervals that overlap.
-- **Minimum intervals per cluster** allows a threshold to be set on the minimum number of intervals to be considered a cluster. Any area with fewer intervals than this minimum will not be included in the output.
-- **Merge clusters into single intervals** outputs intervals that span the entire cluster.
-- **Find cluster intervals; preserve comments and order** filters out non-cluster intervals while maintaining the original ordering and comments in the file.
-- **Find cluster intervals; output grouped by clusters** filters out non-cluster intervals, but outputs the cluster intervals so that they are grouped together. Comments and the original ordering in the file are lost.
-
------
-
-**Example**
-
-.. image:: ./static/operation_icons/gops_cluster.gif
-
-
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/column_join.py
--- a/tools/new_operations/column_join.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,290 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This tool joins two or more tab-delimited files on a set of shared leading 'hinge' columns. The tool will skip over invalid lines within the files, informing the user about the number of lines skipped.
-
-usage: %prog -o output -1 input1 -2 input2 -c column1[,column2[,column3[,...]]] -g hinge1[,hinge2[,hinge3[,...]]] -f [other_input1 [other_input2 [other_input3 ...]]]
-    -o, output=0: the output file
-    -1, input1=1: the first input file to join
-    -2, input2=2: the second input file to join
-    -g, hinge=h: the columns to be used for matching
-    -c, columns=c: the columns that should appear in the output
-    -f, fill_options_file=f: the file specifying the fill value to use
-    other_inputs: the other input files to join
-"""
-
-import optparse, os, re, struct, sys, tempfile
-
-try:
-    simplejson_exception = None
-    from galaxy import eggs
-    from galaxy.util.bunch import Bunch
-    from galaxy.util import stringify_dictionary_keys
-    import pkg_resources
-    pkg_resources.require("simplejson")
-    import simplejson
-except Exception, e:
-    # remember the import error so it can be re-raised if fill options are requested
-    simplejson_exception = e
-    simplejson = None
-
-def stop_err( msg ):
-    sys.stderr.write( msg )
-    sys.exit()
-
-def split_nums( text ):
-    """
-    Splits a string into pieces of numbers and non-numbers, like 'abc23B3' --> [ 'abc', 23, 'B', 3 ]
-    """
-    split_t = []
-    c = ''
-    n = ''
-    for ch in text:
-        try:
-            v = int( ch )
-            n += ch
-            if c:
-                split_t.append( ''.join( c ) )
-                c = ''
-        except ValueError:
-            c += ch
-            if n:
-                split_t.append( int( ''.join( n ) ) )
-                n = ''
-    if c:
-        split_t.append( ''.join( c ) )
-    if n:
-        split_t.append( int( ''.join( n ) ) )
-    return split_t
-
-def hinge_compare( hinge1, hinge2 ):
-    """
-    Compares items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so that
-    the first part is handled as text but the last part as a number
-    """
-    split_hinge1 = hinge1.split( '\t' )
-    split_hinge2 = hinge2.split( '\t' )
-    # quick check if either hinge is empty
-    if not ''.join( split_hinge2 ):
-        if ''.join( split_hinge1 ):
-            return 1
-        elif not ''.join( split_hinge1 ):
-            return 0
-    else:
-        if not ''.join( split_hinge1 ):
-            return -1
-    # go through all parts of the hinges and compare
-    for i, sh1 in enumerate( split_hinge1 ):
-        # if these hinge segments are the same, just move on to the next ones
-        if sh1 == split_hinge2[ i ]:
-            continue
-        # check all parts of each hinge
-        h1 = split_nums( sh1 )
-        h2 = split_nums( split_hinge2[ i ] )
-        for j, h in enumerate( h1 ):
-            # if the second hinge has no more parts, the first is considered larger
-            if j > 0 and len( h2 ) <= j:
-                return 1
-            # if these two parts are the same, move on to the next
-            if h == h2[ j ]:
-                continue
-            # do the actual comparison, depending on whether letter or number
-            if type( h ) == int:
-                if type( h2[ j ] ) == int:
-                    if h > h2[ j ]:
-                        return 1
-                    elif h < h2[ j ]:
-                        return -1
-                # numbers are less than letters
-                elif type( h2[ j ] ) == str:
-                    return -1
-            elif type( h ) == str:
-                if type( h2[ j ] ) == str:
-                    if h > h2[ j ]:
-                        return 1
-                    elif h < h2[ j ]:
-                        return -1
-                # numbers are less than letters
-                elif type( h2[ j ] ) == int:
-                    return 1
-    # if all else has failed, just do a basic string comparison
-    if hinge1 > hinge2:
-        return 1
-    elif hinge1 == hinge2:
-        return 0
-    elif hinge1 < hinge2:
-        return -1
-
-def hinge_sort( infile, outfile, hinge ):
-    """Given an input file name, sorts logically (text vs. numeric) into the provided output file name."""
-    hinge_locs = {}
-    bad_lines = []
-    fin = open( infile, 'rb' )
-    line = fin.readline()
-    while line.strip():
-        try:
-            hinge_parts = line.split( '\t' )[ :hinge ]
-            try:
-                hinge_locs[ '\t'.join( hinge_parts ) ].append( fin.tell() - len( line ) )
-            except KeyError:
-                hinge_locs[ '\t'.join( hinge_parts ) ] = [ fin.tell() - len( line ) ]
-        except ValueError:
-            bad_lines.append( line )
-        line = fin.readline()
-    fin.close()
-    fin = open( infile, 'rb' )
-    fout = open( outfile, 'wb' )
-    hinge_locs_sorted = hinge_locs.keys()
-    hinge_locs_sorted.sort( hinge_compare )
-    for hinge_loc in hinge_locs_sorted:
-        locs = hinge_locs[ hinge_loc ]
-        for loc in locs:
-            fin.seek( loc )
-            fout.write( fin.readline() )
-    fout.close()
-    fin.close()
-
-def __main__():
-    parser = optparse.OptionParser()
-    parser.add_option( '-o', '--output', dest='output', help='The name of the output file' )
-    parser.add_option( '-1', '--input1', dest='input1', help='The name of the first input file' )
-    parser.add_option( '-2', '--input2', dest='input2', help='The name of the second input file' )
-    parser.add_option( '-g', '--hinge', dest='hinge', help='The "hinge" to use (the value to compare)' )
-    parser.add_option( '-c', '--columns', dest='columns', help='The columns to include in the output file' )
-    parser.add_option( '-f', '--fill_options_file', dest='fill_options_file', default=None, help='The file specifying the fill value to use' )
-    (options, args) = parser.parse_args()
-    hinge = int( options.hinge )
-    cols = [ int( c ) for c in str( options.columns ).split( ',' ) if int( c ) > hinge ]
-    inputs = [ options.input1, options.input2 ]
-    if options.fill_options_file == 'None':
-        inputs.extend( args )
-    elif len( args ) > 0:
-        inputs.extend( args )
-    fill_options = None
-    if options.fill_options_file != 'None' and options.fill_options_file is not None:
-        try:
-            if simplejson is None:
-                raise simplejson_exception
-            fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) )
-        except Exception, e:
-            print 'Warning: Ignoring fill options due to simplejson error (%s).'
% e - if fill_options is None: - fill_options = Bunch() - if 'file1_columns' not in fill_options: - fill_options.file1_columns = None - if fill_options and fill_options.file1_columns: - fill_empty = {} - for col in cols: - fill_empty[ col ] = fill_options.file1_columns[ col - 1 ] - else: - fill_empty = None - assert len( cols ) > 0, 'You need to select at least one column in addition to the hinge' - delimiter = '\t' - # make sure all files are sorted in same way, ascending - tmp_input_files = [] - input_files = inputs[:] - for in_file in input_files: - tmp_file = tempfile.NamedTemporaryFile() - tmp_file_name = tmp_file.name - tmp_file.close() - hinge_sort( in_file, tmp_file_name, hinge ) - tmp_file = open( tmp_file_name, 'rb' ) - tmp_input_files.append( tmp_file ) - # cycle through files, getting smallest line of all files one at a time - # also have to keep track of vertical position of extra columns - fout = file( options.output, 'w' ) - old_current = '' - first_line = True - current_lines = [ f.readline().rstrip( '\r\n' ) for f in tmp_input_files ] - last_lines = ''.join( current_lines ) - last_loc = -1 - while last_lines: - # get the "minimum" hinge, which should come first, and the file location in list - hinges = [ delimiter.join( line.split( delimiter )[ :hinge ] ) for line in current_lines ] - hinge_dict = {} - for i in range( len( hinges ) ): - if not hinge_dict.has_key( hinges[ i ] ): - hinge_dict[ hinges[ i ] ] = i - hinges.sort( hinge_compare ) - hinges = [ h for h in hinges if h ] - current, loc = hinges[0], hinge_dict[ hinges[0] ] - # first output empty columns for vertical alignment (account for "missing" files) - # write output for leading and trailing empty columns - # columns missing from actual file handled further below - current_data = [] - if current != old_current: - # fill trailing empty columns with appropriate fill value - if not first_line: - if last_loc < len( inputs ) - 1: - if not fill_empty: - filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - else: - filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) ) - # insert line break before current line - fout.write( '\n' ) - # fill leading empty columns with appropriate fill value - if loc > 0: - if not fill_empty: - current_data = [ '' for col in range( loc * len( cols ) ) ] - else: - current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( loc * len( cols ) ) ] - else: - if loc - last_loc > 1: - if not fill_empty: - current_data = [ '' for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ] - else: - current_data = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( loc - last_loc - 1 ) * len( cols ) ) ] - # now output actual data - split_line = current_lines[ loc ].split( delimiter ) - # fill empties within actual line if appropriate - if fill_empty: - new_split_line = split_line[:] - split_line = [] - for i, item in enumerate( new_split_line ): - col = i + 1 - if not item: - try: - split_line.append( fill_empty[ i + 1 ] ) - except KeyError: - split_line.append( item ) - else: - split_line.append( item ) - # add actual data to be output below - if ''.join( split_line ): - for col in cols: - if col > hinge: - # if this column doesn't exist, add the appropriate filler or empty column - try: - new_item = split_line[ col - 1 ] - except IndexError: - if fill_empty: - new_item = fill_empty[ col ] - else: - new_item = '' - 
current_data.append( new_item ) - # grab next line for selected file - current_lines[ loc ] = tmp_input_files[ loc ].readline().rstrip( '\r\n' ) - # write relevant data to file - if current == old_current and current_data: - fout.write( '%s%s' % ( delimiter, delimiter.join( current_data ) ) ) - elif current_data: - fout.write( '%s%s%s' % ( current, delimiter, delimiter.join( current_data ) ) ) - last_lines = ''.join( current_lines ) - else: - last_lines = None - last_loc = loc - old_current = current - first_line = False - # fill trailing empty columns for final line - if last_loc < len( inputs ) - 1: - if not fill_empty: - filler = [ '' for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - else: - filler = [ fill_empty[ cols[ col % len( cols ) ] ] for col in range( ( len( inputs ) - last_loc - 1 ) * len( cols ) ) ] - fout.write( '%s%s' % ( delimiter, delimiter.join( filler ) ) ) - fout.write( '\n' ) - fout.close() - for f in tmp_input_files: - os.unlink( f.name ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/column_join.xml --- a/tools/new_operations/column_join.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,260 +0,0 @@ - - - - column_join.py - --output=$output - --input1=$input1 - --input2=$input2 - --hinge=$hinge - --columns=$columns - #if $fill_empty_columns.fill_empty_columns_switch == "fill_empty": - --fill_options_file=$fill_options_file - #end if - #for $f in $file_chooser: - ${f.input} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <% -import simplejson -%> -#set $__fill_options = {} -#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': - #set $__start_fill = $fill_empty_columns['do_fill_empty_columns']['fill_value'].value - #else: - #set $__start_fill = "" - #end if - #set $__fill_options['file1_columns'] = [ __start_fill for i in range( int( $input1.metadata.columns ) ) ] - #if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': - #for column_fill in $fill_empty_columns['do_fill_empty_columns']['column_fill']: - #set $__fill_options['file1_columns'][ int( column_fill['column_number'].value ) - 1 ] = column_fill['fill_value'].value - #end for - #end if -#end if -${simplejson.dumps( __fill_options )} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too. - -Note that the files are expected to have the same number of columns. If for some reason the join column is missing (this only applies to the last column(s)), the tool attempts to handle this situation by inserting an empty item (or the appropriate filler) for that column on that row. This could lead to the situation where a row has a hinge but entirely empty or filled columns, if the hinge exists in at least one file but every file that has it is missing the join column. 
Also, note that the tool does not distinguish between a file missing the hinge altogether and a file having the hinge but missing the column (in both cases the column would be empty or filled). There is an example of this below. - ------ - -**General Example** - -Given the following files:: - - FILE 1 - chr2 1 T 6 .C..., I$$III - chr2 2 G 6 ..N.., III@II - chr2 3 C 7 ..C..., I$IIIII - chr2 4 G 7 .G...., I#IIIII - chr2 5 G 7 ...N.., IIII#BI - chr2 6 A 7 ..T..., I$IDIII - chr1 1 C 1 ^:. I - chr1 2 G 2 .^:. $I - chr1 3 A 2 .. I% - chr1 4 C 2 .. I$ - chr1 5 T 3 ..^:. I#I - chr1 6 G 3 ..^:, I#I - - FILE 2 - chr1 3 T 1 ^:. I - chr1 4 G 2 .^:. $I - chr1 5 T 2 .. I% - chr1 6 C 3 ..^:. III - chr1 7 G 3 ..^:. I#I - chr1 8 T 4 ...^:, I#II - chr2 77 C 6 .G..., I$$III - chr2 78 G 6 ..N.., III@II - chr2 79 T 7 ..N..., I$IIIII - chr2 80 C 7 .G...., I#IIIII - chr2 81 G 7 ...A.., IIII#BI - chr2 82 A 8 ...G..., I$IDIIII - chr2 83 T 8 .A.....N IIIIIIII - chr2 84 A 9 ......T. I$IIIIIII - - FILE 3 - chr1 1 A 1 . I - chr1 2 T 2 G. I$ - chr1 3 C 2 ., I@ - chr1 4 C 3 ..N III - chr1 42 C 5 ...N^:. III@I - chr1 43 C 5 .N..^:. IIIII - chr1 44 T 5 .A.., IA@II - chr1 45 A 6 .N...^:. IIIII$ - chr1 46 G 6 .GN..^:. I@IIII - chr1 47 A 7 ....^:.., IIIII$I - chr2 73 T 5 .N.., II$II - chr2 74 A 5 ...., IIIII - chr2 75 T 5 ...., IIIII - chr2 76 T 5 ...., IIIII - chr2 77 C 5 ...., IIIBI - chr2 78 T 5 ...., IDIII - -To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output:: - - chr1 1 C 1 A 1 - chr1 2 G 2 T 2 - chr1 3 A 2 T 1 C 2 - chr1 4 C 2 G 2 C 3 - chr1 5 T 3 T 2 - chr1 6 G 3 C 3 - chr1 7 G 3 - chr1 8 T 4 - chr1 42 C 5 - chr1 43 C 5 - chr1 44 T 5 - chr1 45 A 6 - chr1 46 G 6 - chr1 47 A 7 - chr2 1 T 6 - chr2 2 G 6 - chr2 3 C 7 - chr2 4 G 7 - chr2 5 G 7 - chr2 6 A 7 - chr2 73 T 5 - chr2 74 A 5 - chr2 75 T 5 - chr2 76 T 5 - chr2 77 C 6 C 5 - chr2 78 G 6 T 5 - chr2 79 T 7 - chr2 80 C 7 - chr2 81 G 7 - chr2 82 A 8 - chr2 83 T 8 - chr2 84 A 9 - -**Example with missing columns** - -Given the following input files:: - - FILE 1 - 1 A - 2 B b - 4 C c - 5 D - 6 E e - - FILE 2 - 1 M m - 2 N - 3 O o - 4 P p - 5 Q - 7 R r - -if we join only column 3 using column 1 as the hinge and with a fill value of '0', this is what will be output:: - - 1 0 m - 2 b 0 - 3 0 o - 4 c p - 5 0 0 - 6 e 0 - 7 0 r - -Row 5 appears in both files with the missing column, so it's got nothing but fill values in the output file. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/complement.xml --- a/tools/new_operations/complement.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ - - intervals of a dataset - gops_complement.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -l ${chromInfo} $allchroms - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -This operation complements the regions of a set of intervals. Regions are returned that represent the empty space in the input interval. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Genome-wide complement** will complement all chromosomes of the genome. Leaving this option unchecked will only complement chromosomes present in the dataset. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_complement.gif - - - diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/concat.xml --- a/tools/new_operations/concat.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ - - two datasets into one dataset - gops_concat.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} $sameformat - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu -> it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Both datasets are exactly the same filetype** will preserve all extra fields in both files. Leaving this unchecked will force the second dataset to use the same column assignments for chrom, start, end and strand, but will fill extra fields with a period(.). In both cases, the output fields are truncated or padded with fields of periods to maintain a truly tabular output. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_concatenate.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/coverage.xml --- a/tools/new_operations/coverage.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ - - of a set of intervals on second set of intervals - gops_coverage.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu -> it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - -Find the coverage of intervals in the first dataset on intervals in the second dataset. The coverage is added as two columns, the first being bases covered, and the second being the fraction of bases covered by that interval. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. 
_Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Example** - - - if **First dataset** are genes :: - - chr11 5203271 5204877 NM_000518 0 - - chr11 5210634 5212434 NM_000519 0 - - chr11 5226077 5227663 NM_000559 0 - - chr11 5226079 5232587 BC020719 0 - - chr11 5230996 5232587 NM_000184 0 - - - and **Second dataset** are repeats:: - - chr11 5203895 5203991 L1MA6 500 + - chr11 5204163 5204239 A-rich 219 + - chr11 5211034 5211167 (CATATA)n 245 + - chr11 5211642 5211673 AT_rich 24 + - chr11 5226551 5226606 (CA)n 303 + - chr11 5228782 5228825 (TTTTTG)n 208 + - chr11 5229045 5229121 L1PA11 440 + - chr11 5229133 5229319 MER41A 1106 + - chr11 5229374 5229485 L2 244 - - chr11 5229751 5230083 MLT1A 913 - - chr11 5231469 5231526 (CA)n 330 + - - the Result is the coverage density of repeats in the genes:: - - chr11 5203271 5204877 NM_000518 0 - 172 0.107098 - chr11 5210634 5212434 NM_000519 0 - 164 0.091111 - chr11 5226077 5227663 NM_000559 0 - 55 0.034678 - chr11 5226079 5232587 BC020719 0 - 860 0.132145 - chr11 5230996 5232587 NM_000184 0 - 57 0.035827 - - For example, the following line of output:: - - chr11 5203271 5204877 NM_000518 0 - 172 0.107098 - - implies that 172 nucleotides accounting for 10.7% of the this interval (chr11:5203271-5204877) overlap with repetitive elements. - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/flanking_features.py --- a/tools/new_operations/flanking_features.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ -#!/usr/bin/env python -#By: Guruprasad Ananda -""" -Fetch closest up/downstream interval from features corresponding to every interval in primary - -usage: %prog primary_file features_file out_file direction - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * -from bx.intervals.operations import quicksect -from galaxy.datatypes.util.gff_util import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): - #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases - #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand - #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand - if direction == 1: - if node.maxend <= threshold_up: - if node.end == node.maxend: - report_func_up(node) - elif node.right and node.left: - if node.right.maxend == node.maxend: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.left.maxend == node.maxend: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.right and node.right.maxend == node.maxend: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, 
report_func_down) - elif node.left and node.left.maxend == node.maxend: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.minend <= threshold_up: - if node.end <= threshold_up: - report_func_up(node) - if node.left and node.right: - if node.right.minend <= threshold_up: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - if node.left.minend <= threshold_up: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.left: - if node.left.minend <= threshold_up: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif node.right: - if node.right.minend <= threshold_up: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - elif direction == 0: - if node.start > threshold_down: - report_func_down(node) - if node.left: - get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) - else: - if node.right: - get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) - -def proximal_region_finder(readers, region, comments=True): - """ - Returns an iterator that yields elements of the form [ , ]. - Intervals are GenomicInterval objects. - """ - primary = readers[0] - features = readers[1] - either = False - if region == 'Upstream': - up, down = True, False - elif region == 'Downstream': - up, down = False, True - else: - up, down = True, True - if region == 'Either': - either = True - - # Read features into memory: - rightTree = quicksect.IntervalTree() - for item in features: - if type( item ) is GenomicInterval: - rightTree.insert( item, features.linenum, item ) - - for interval in primary: - if type( interval ) is Header: - yield interval - if type( interval ) is Comment and comments: - yield interval - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = int(interval.end) - strand = interval.strand - if chrom not in rightTree.chroms: - continue - else: - root = rightTree.chroms[chrom] #root node for the chrom tree - result_up = [] - result_down = [] - if (strand == '+' and up) or (strand == '-' and down): - #upstream +ve strand and downstream -ve strand cases - get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) - - if (strand == '+' and down) or (strand == '-' and up): - #downstream +ve strand and upstream -ve strand case - get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) - - if result_up: - if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. - ends = [] - for n in result_up: - ends.append(n.end) - res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. 
the interval with the max end from the results_up list - else: - res_ind = 0 - if not(either): - yield [ interval, result_up[res_ind].other ] - - if result_down: - if not(either): - #The last element of result_down will be the closest element to the given interval - yield [ interval, result_down[-1].other ] - - if either and (result_up or result_down): - iter_val = [] - if result_up and result_down: - if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): - iter_val = [ interval, result_up[res_ind].other ] - else: - #The last element of result_down will be the closest element to the given interval - iter_val = [ interval, result_down[-1].other ] - elif result_up: - iter_val = [ interval, result_up[res_ind].other ] - elif result_down: - #The last element of result_down will be the closest element to the given interval - iter_val = [ interval, result_down[-1].other ] - yield iter_val - -def main(): - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname, direction = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFIntervalToBEDReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFIntervalToBEDReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - # Find flanking features. - out_file = open( out_fname, "w" ) - try: - for result in proximal_region_finder([g1,g2], direction): - if type( result ) is list: - line, closest_feature = result - # Need to join outputs differently depending on file types. - if in1_gff_format: - # Output is GFF with added attribute 'closest feature.' - - # Invervals are in BED coordinates; need to convert to GFF. - line = convert_bed_coords_to_gff( line ) - closest_feature = convert_bed_coords_to_gff( closest_feature ) - - # Replace double quotes with single quotes in closest feature's attributes. - out_file.write( "%s closest_feature \"%s\" \n" % - ( "\t".join( line.fields ), \ - "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) - ) ) - else: - # Output is BED + closest feature fields. 
- output_line_fields = [] - output_line_fields.extend( line.fields ) - output_line_fields.extend( closest_feature.fields ) - out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) ) - else: - out_file.write( "%s\n" % result ) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - - print "Direction: %s" %(direction) - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/flanking_features.xml --- a/tools/new_operations/flanking_features.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ - - for every interval - - flanking_features.py $input1 $input2 $out_file1 $direction - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -For every interval in the **interval** dataset, this tool fetches the **closest non-overlapping** upstream and / or downstream features from the **features** dataset. - ------ - -.. class:: warningmark - -**Note:** - -Every line should contain at least 3 columns: chromosome number, start and stop coordinates. If any of these columns is missing or if start and stop coordinates are not numerical, the lines will be treated as invalid and skipped. The number of skipped lines is documented in the resulting history item as a "data issue". - -If the strand column is missing from your input interval dataset, the intervals will be considered to be on positive strand. You can add a strand column to your input dataset by using the *Text Manipulation->Add column* tool. - -For GFF files, features are added as a GTF-style attribute at the end of the line. - ------ - -**Example** - -If the **intervals** are:: - - chr1 10 100 Query1.1 - chr1 500 1000 Query1.2 - chr1 1100 1250 Query1.3 - -and the **features** are:: - - chr1 120 180 Query2.1 - chr1 140 200 Query2.2 - chr1 580 1050 Query2.3 - chr1 2000 2204 Query2.4 - chr1 2500 3000 Query2.5 - -Running this tool for **Both Upstream and Downstream** will return:: - - chr1 10 100 Query1.1 chr1 120 180 Query2.1 - chr1 500 1000 Query1.2 chr1 140 200 Query2.2 - chr1 500 1000 Query1.2 chr1 2000 2204 Query2.4 - chr1 1100 1250 Query1.3 chr1 580 1050 Query2.3 - chr1 1100 1250 Query1.3 chr1 2000 2204 Query2.4 - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/get_flanks.py --- a/tools/new_operations/get_flanks.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,191 +0,0 @@ -#!/usr/bin/env python -#Done by: Guru - -""" -Get Flanking regions. 
- -usage: %prog input out_file size direction region - -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file - -o, --off=N: Offset -""" - -import sys, re, os -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - try: - if int( sys.argv[3] ) < 0: - raise Exception - except: - stop_err( "Length of flanking region(s) must be a non-negative integer." ) - - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, out_file, size, direction, region = args - if strand_col_1 <= 0: - strand = "+" #if strand is not defined, default it to + - except: - stop_err( "Metadata issue, correct the metadata attributes by clicking on the pencil icon in the history item." ) - try: - offset = int(options.off) - size = int(size) - except: - stop_err( "Invalid offset or length entered. Try again by entering valid integer values." ) - - fo = open(out_file,'w') - - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = None - elems = [] - j=0 - for i, line in enumerate( file( inp_file ) ): - line = line.strip() - if line and (not line.startswith( '#' )) and line != '': - j+=1 - try: - elems = line.split('\t') - #if the start and/or end columns are not numbers, skip that line. - assert int(elems[start_col_1]) - assert int(elems[end_col_1]) - if strand_col_1 != -1: - strand = elems[strand_col_1] - #if the stand value is not + or -, skip that line. - assert strand in ['+', '-'] - if direction == 'Upstream': - if strand == '+': - if region == 'end': - elems[end_col_1] = str(int(elems[end_col_1]) + offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - else: - elems[end_col_1] = str(int(elems[start_col_1]) + offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - elif strand == '-': - if region == 'end': - elems[start_col_1] = str(int(elems[start_col_1]) - offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - else: - elems[start_col_1] = str(int(elems[end_col_1]) - offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - - elif direction == 'Downstream': - if strand == '-': - if region == 'start': - elems[end_col_1] = str(int(elems[end_col_1]) - offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - else: - elems[end_col_1] = str(int(elems[start_col_1]) - offset) - elems[start_col_1] = str( int(elems[end_col_1]) - size ) - elif strand == '+': - if region == 'start': - elems[start_col_1] = str(int(elems[start_col_1]) + offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - else: - elems[start_col_1] = str(int(elems[end_col_1]) + offset) - elems[end_col_1] = str(int(elems[start_col_1]) + size) - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - - elif direction == 'Both': - if strand == '-': - if region == 'start': - start = str(int(elems[end_col_1]) - offset) - end1 = str(int(start) + size) - end2 = str(int(start) - size) - elems[start_col_1]=start - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start - assert 
int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif region == 'end': - start = str(int(elems[start_col_1]) - offset) - end1 = str(int(start) + size) - end2 = str(int(start) - size) - elems[start_col_1]=start - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - else: - start1 = str(int(elems[end_col_1]) - offset) - end1 = str(int(start1) + size) - start2 = str(int(elems[start_col_1]) - offset) - end2 = str(int(start2) - size) - elems[start_col_1]=start1 - elems[end_col_1]=end1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=end2 - elems[end_col_1]=start2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif strand == '+': - if region == 'start': - start = str(int(elems[start_col_1]) + offset) - end1 = str(int(start) - size) - end2 = str(int(start) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elif region == 'end': - start = str(int(elems[end_col_1]) + offset) - end1 = str(int(start) - size) - end2 = str(int(start) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - else: - start1 = str(int(elems[start_col_1]) + offset) - end1 = str(int(start1) - size) - start2 = str(int(elems[end_col_1]) + offset) - end2 = str(int(start2) + size) - elems[start_col_1]=end1 - elems[end_col_1]=start1 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - elems[start_col_1]=start2 - elems[end_col_1]=end2 - assert int(elems[start_col_1]) > 0 and int(elems[end_col_1]) > 0 - fo.write( "%s\n" % '\t'.join( elems ) ) - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - fo.close() - - if skipped_lines == j: - stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." 
) - if skipped_lines > 0: - print 'Skipped %d invalid lines starting with #%dL "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - print 'Location: %s, Region: %s, Flank-length: %d, Offset: %d ' %( direction, region, size, offset ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/get_flanks.xml --- a/tools/new_operations/get_flanks.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - returns flanking region/s for every gene - get_flanks.py $input $out_file1 $size $direction $region -o $offset -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -This tool finds the upstream and/or downstream flanking region(s) of all the selected regions in the input file. - -**Note:** Every line should contain at least 3 columns: Chromosome number, Start and Stop co-ordinates. If any of these columns is missing or if start and stop co-ordinates are not numerical, the tool may encounter exceptions and such lines are skipped as invalid. The number of invalid skipped lines is documented in the resulting history item as a "Data issue". - ------ - - -**Example 1** - -- For the following query:: - - chr22 1000 7000 NM_174568 0 + - -- running get flanks with Region: Around start, Offset: -200, Flank-length: 300 and Location: Upstream will return **(Red: Query positive strand; Blue: Flanks output)**:: - - chr22 500 800 NM_174568 0 + - -.. image:: ./static/operation_icons/flanks_ex1.gif - -**Example 2** - -- For the following query:: - - chr22 1000 7000 NM_028946 0 - - -- running get flanks with Region: Whole, Offset: 200, Flank-length: 300 and Location: Downstream will return **(Orange: Query negative strand; Magenta: Flanks output)**:: - - chr22 500 800 NM_028946 0 - - -.. image:: ./static/operation_icons/flanks_ex2.gif - - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_basecoverage.py --- a/tools/new_operations/gops_basecoverage.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -#!/usr/bin/env python -""" -Count total base coverage. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.base_coverage import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col = strand_col_1, - fix_strand=True ) - - try: - bases = base_coverage(g1) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - out_file = open( out_fname, "w" ) - out_file.write( "%s\n" % str( bases ) ) - out_file.close() - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_cluster.py --- a/tools/new_operations/gops_cluster.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,132 +0,0 @@ -#!/usr/bin/env python -""" -Cluster regions of intervals. - -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in file - -d, --distance=N: Maximum distance between clustered intervals - -v, --overlap=N: Minimum overlap require (negative distance) - -m, --minregions=N: Minimum regions per cluster - -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.find_clusters import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - distance = 0 - minregions = 2 - output = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - if options.distance: distance = int( options.distance ) - if options.overlap: distance = -1 * int( options.overlap ) - if options.output: output = int( options.output ) - if options.minregions: minregions = int( options.minregions ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - - # Get the cluster tree - try: - clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions) - except ParseError, exc: - fail( "Invalid file format: %s" % str( exc ) ) - - f1 = open( in_fname, "r" ) - out_file = open( out_fname, "w" ) - - # If "merge" - if output == 1: - fields = ["." 
for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)] - for chrom, tree in clusters.items(): - for start, end, lines in tree.getregions(): - fields[g1.chrom_col] = chrom - fields[g1.start_col] = str(start) - fields[g1.end_col] = str(end) - out_file.write( "%s\n" % "\t".join( fields ) ) - - # If "filtered" we preserve order of file and comments, etc. - if output == 2: - linenums = dict() - for chrom, tree in clusters.items(): - for linenum in tree.getlines(): - linenums[linenum] = 0 - linenum = -1 - f1.seek(0) - for line in f1.readlines(): - linenum += 1 - if linenum in linenums or linenum in extra: - out_file.write( "%s\n" % line.rstrip( "\n\r" ) ) - - # If "clustered" we output original intervals, but near each other (i.e. clustered) - if output == 3: - linenums = list() - f1.seek(0) - fileLines = f1.readlines() - for chrom, tree in clusters.items(): - for linenum in tree.getlines(): - out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) ) - - # If "minimum" we output the smallest interval in each cluster - if output == 4 or output == 5: - linenums = list() - f1.seek(0) - fileLines = f1.readlines() - for chrom, tree in clusters.items(): - regions = tree.getregions() - for start, end, lines in tree.getregions(): - outsize = -1 - outinterval = None - for line in lines: - # three nested for loops? - # should only execute this code once per line - fileline = fileLines[line].rstrip("\n\r") - try: - cluster_interval = GenomicInterval( g1, fileline.split("\t"), - g1.chrom_col, - g1.start_col, - g1.end_col, - g1.strand_col, - g1.default_strand, - g1.fix_strand ) - except Exception, exc: - print >> sys.stderr, str( exc ) - f1.close() - sys.exit() - interval_size = cluster_interval.end - cluster_interval.start - if outsize == -1 or \ - ( outsize > interval_size and output == 4 ) or \ - ( outsize < interval_size and output == 5 ) : - outinterval = cluster_interval - outsize = interval_size - out_file.write( "%s\n" % outinterval ) - - f1.close() - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_complement.py --- a/tools/new_operations/gops_complement.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python -""" -Complement regions. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in file - -l, --lengths=N: Filename of .len file for species (chromosome lengths) - -a, --all: Complement all chromosomes (Genome-wide complement) -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.complement import complement -from bx.intervals.operations.subtract import subtract -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - allchroms = False - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - lengths = options.lengths - if options.all: allchroms = True - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - - lens = dict() - chroms = list() - # dbfile is used to determine the length of each chromosome. The lengths - # are added to the lens dict and passed copmlement operation code in bx. - dbfile = fileinput.FileInput( lengths ) - - if dbfile: - if not allchroms: - try: - for line in dbfile: - fields = line.split("\t") - lens[fields[0]] = int(fields[1]) - except: - # assume LEN doesn't exist or is corrupt somehow - pass - elif allchroms: - try: - for line in dbfile: - fields = line.split("\t") - end = int(fields[1]) - chroms.append("\t".join([fields[0],"0",str(end)])) - except: - pass - - # Safety...if the dbfile didn't exist and we're on allchroms, then - # default to generic complement - if allchroms and len(chroms) == 0: - allchroms = False - - if allchroms: - chromReader = GenomicIntervalReader(chroms) - generator = subtract([chromReader, g1]) - else: - generator = complement(g1, lens) - - out_file = open( out_fname, "w" ) - - try: - for interval in generator: - if type( interval ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( interval ) ) - else: - out_file.write( "%s\n" % interval ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc="" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_concat.py --- a/tools/new_operations/gops_concat.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/env python -""" -Concatenate two bed files. The concatenated files are returned in the -same format as the first. If --sameformat is specified, then all -columns will be treated as the same, and all fields will be saved, -although the output will be trimmed to match the primary input. In -addition, if --sameformat is specified, missing fields will be padded -with a period(.). - -usage: %prog in_file_1 in_file_2 out_file - -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in second file - -s, --sameformat: All files are precisely the same format. 
-""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.concat import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - sameformat=False - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.sameformat: sameformat = True - in_file_1, in_file_2, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - fix_strand=True ) - - g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - if strand_col_1 >= 0: - g1.strand_col = strand_col_1 - - out_file = open( out_fname, "w" ) - - try: - for line in concat( [g1, g2], sameformat=sameformat ): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_coverage.py --- a/tools/new_operations/gops_coverage.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python -""" -Calculate coverage of one query on another, and append the coverage to -the last two columns as bases covered and percent coverage. 
- -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.coverage import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for line in coverage( [g1,g2] ): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_intersect.py --- a/tools/new_operations/gops_intersect.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#!/usr/bin/env python -""" -Find regions of first interval file that overlap regions in a second interval file. -Interval files can either be BED or GFF format. 
- -usage: %prog interval_file_1 interval_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -p, --pieces: just print pieces of second set (after padding) - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.intersect import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - pieces = bool( options.pieces ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - if in1_gff_format: - # Intersect requires coordinates in BED format. - g1.convert_to_bed_coord=True - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - if in2_gff_format: - # Intersect requires coordinates in BED format. - g2.convert_to_bed_coord=True - - out_file = open( out_fname, "w" ) - try: - for feature in intersect( [g1,g2], pieces=pieces, mincols=mincols ): - if isinstance( feature, GFFFeature ): - # Convert back to GFF coordinates since reader converted automatically. - convert_bed_coords_to_gff( feature ) - for interval in feature.intervals: - out_file.write( "%s\n" % "\t".join( interval.fields ) ) - elif isinstance( feature, GenomicInterval ): - out_file.write( "%s\n" % "\t".join( feature.fields ) ) - else: - out_file.write( "%s\n" % feature ) - except ParseError, e: - out_file.close() - fail( "Invalid file format: %s" % str( e ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_join.py --- a/tools/new_operations/gops_join.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,82 +0,0 @@ -#!/usr/bin/env python -""" -Join two sets of intervals using their overlap as the key. 
- -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -f, --fill=N: none, right, left, both -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.join import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - leftfill = False - rightfill = False - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - if options.fill: - if options.fill == "both": - rightfill = leftfill = True - else: - rightfill = options.fill == "right" - leftfill = options.fill == "left" - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill): - if type( outfields ) is list: - out_file.write( "%s\n" % "\t".join( outfields ) ) - else: - out_file.write( "%s\n" % outfields ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - except MemoryError: - out_file.close() - fail( "Input datasets were too large to complete the join operation." ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_merge.py --- a/tools/new_operations/gops_merge.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python -""" -Merge overlaping regions. 
- -usage: %prog in_file out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -m, --mincols=N: Require this much overlap (default 1bp) - -3, --threecol: Output 3 column bed -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.merge import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - if options.mincols: mincols = int( options.mincols ) - in_fname, out_fname = args - except: - doc_optparse.exception() - - g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col = strand_col_1, - fix_strand=True ) - - out_file = open( out_fname, "w" ) - - try: - for line in merge(g1,mincols=mincols): - if options.threecol: - if type( line ) is GenomicInterval: - out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) ) - elif type( line ) is list: - out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) ) - else: - out_file.write( "%s\n" % line ) - else: - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - elif type( line ) is list: - out_file.write( "%s\n" % "\t".join( line ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/gops_subtract.py --- a/tools/new_operations/gops_subtract.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -#!/usr/bin/env python -""" -Find regions of first interval file that do not overlap regions in a second -interval file. Interval files can either be BED or GFF format. 
- -usage: %prog interval_file_1 interval_file_2 out_file - -1, --cols1=N,N,N,N: Columns for start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for start, end, strand in second file - -m, --mincols=N: Require this much overlap (default 1bp) - -p, --pieces: just print pieces of second set (after padding) - -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval - -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals import * -from bx.intervals.io import * -from bx.intervals.operations.subtract import * -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * -from galaxy.datatypes.util.gff_util import GFFFeature, GFFReaderWrapper, convert_bed_coords_to_gff - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - mincols = 1 - upstream_pad = 0 - downstream_pad = 0 - - options, args = doc_optparse.parse( __doc__ ) - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - if options.mincols: mincols = int( options.mincols ) - pieces = bool( options.pieces ) - in1_gff_format = bool( options.gff1 ) - in2_gff_format = bool( options.gff2 ) - in_fname, in2_fname, out_fname = args - except: - doc_optparse.exception() - - # Set readers to handle either GFF or default format. - if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper - else: - in1_reader_wrapper = NiceReaderWrapper - if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper - else: - in2_reader_wrapper = NiceReaderWrapper - - g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - if in1_gff_format: - # Subtract requires coordinates in BED format. - g1.convert_to_bed_coord=True - - g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - if in2_gff_format: - # Subtract requires coordinates in BED format. - g2.convert_to_bed_coord=True - - out_file = open( out_fname, "w" ) - try: - for feature in subtract( [g1,g2], pieces=pieces, mincols=mincols ): - if isinstance( feature, GFFFeature ): - # Convert back to GFF coordinates since reader converted automatically. 
- convert_bed_coords_to_gff( feature ) - for interval in feature.intervals: - out_file.write( "%s\n" % "\t".join( interval.fields ) ) - elif isinstance( feature, GenomicInterval ): - out_file.write( "%s\n" % "\t".join( feature.fields ) ) - else: - out_file.write( "%s\n" % feature ) - except ParseError, exc: - out_file.close() - fail( "Invalid file format: %s" % str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 2nd dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 1st dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/intersect.xml --- a/tools/new_operations/intersect.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ - - the intervals of two datasets - gops_intersect.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets -- **Overlapping Intervals** returns entire intervals from the first dataset that overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that do not overlap with the second dataset. -- **Overlapping pieces of Intervals** returns intervals that indicate the exact base pair overlap between the first dataset and the second dataset. The intervals returned are from the first dataset, and all fields besides start and end are guaranteed to remain unchanged. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_intersect.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/join.xml --- a/tools/new_operations/join.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - the intervals of two datasets side-by-side - gops_join.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} -m $min -f $fill - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap** specifies the minimum overlap between intervals that allows them to be joined. -- **Return only records that are joined** returns only the records of the first dataset that join to a record in the second dataset. This is analogous to an INNER JOIN. -- **Return all records of first dataset (fill null with ".")** returns all intervals of the first dataset, and any intervals that do not join an interval from the second dataset are filled in with a period(.). This is analogous to a LEFT JOIN. -- **Return all records of second dataset (fill null with ".")** returns all intervals of the second dataset, and any intervals that do not join an interval from the first dataset are filled in with a period(.). **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.** -- **Return all records of both datasets (fill nulls with ".")** returns all records from both datasets, and fills on either the right or left with periods. **Note that this may produce an invalid interval file, since a period(.) is not a valid chrom, start, end or strand.** - ------ - -**Example** - -If **First dataset** is:: - - chr1 10 100 Query1.1 - chr1 500 1000 Query1.2 - chr1 1100 1250 Query1.3 - -and **Second dataset** is:: - - chr1 20 80 Query2.1 - chr1 2000 2204 Query2.2 - chr1 2500 3000 Query2.3 - - -The four return options will generate: - - -- **Return only records that are joined**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - -- **Return all records of first dataset**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - chr1 500 1000 Query1.2 . . . . - chr1 1100 1250 Query1.3 . . . . - -- **Return all records of second dataset**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - . . . . chr1 2000 2204 Query2.2 - . . . . chr1 2500 3000 Query2.3 - -- **Return all records of both datasets**:: - - chr1 10 100 Query1.1 chr1 20 80 Query2.1 - chr1 500 1000 Query1.2 . . . . - chr1 1100 1250 Query1.3 . . . . - . . . . chr1 2000 2204 Query2.2 - . . . . chr1 2500 3000 Query2.3 - - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/merge.xml --- a/tools/new_operations/merge.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ - - the overlapping intervals of a dataset - gops_merge.py $input1 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -This operation merges all overlapping intervals into single intervals. - -**Example** - -.. 
image:: ./static/operation_icons/gops_merge.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/operation_filter.py --- a/tools/new_operations/operation_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,96 +0,0 @@ -# runs after the job (and after the default post-filter) -import os -from galaxy import eggs -from galaxy import jobs -from galaxy.tools.parameters import DataToolParameter -# Older py compatibility -try: - set() -except: - from sets import Set as set - -#def exec_before_process(app, inp_data, out_data, param_dict, tool=None): -# """Sets the name of the data""" -# dbkeys = sets.Set( [data.dbkey for data in inp_data.values() ] ) -# if len(dbkeys) != 1: -# raise Exception, 'Both Queries must be from the same genome build
' - -def validate_input( trans, error_map, param_values, page_param_map ): - dbkeys = set() - data_param_names = set() - data_params = 0 - for name, param in page_param_map.iteritems(): - if isinstance( param, DataToolParameter ): - # for each dataset parameter - if param_values.get(name, None) != None: - dbkeys.add( param_values[name].dbkey ) - data_params += 1 - # check meta data - try: - param = param_values[name] - if isinstance( param.datatype, trans.app.datatypes_registry.get_datatype_by_extension( 'gff' ).__class__ ): - # TODO: currently cannot validate GFF inputs b/c they are not derived from interval. - pass - else: # Validate interval datatype. - startCol = int( param.metadata.startCol ) - endCol = int( param.metadata.endCol ) - chromCol = int( param.metadata.chromCol ) - if param.metadata.strandCol is not None: - strandCol = int ( param.metadata.strandCol ) - else: - strandCol = 0 - except: - error_msg = "The attributes of this dataset are not properly set. " + \ - "Click the pencil icon in the history item to set the chrom, start, end and strand columns." - error_map[name] = error_msg - data_param_names.add( name ) - if len( dbkeys ) > 1: - for name in data_param_names: - error_map[name] = "All datasets must belong to same genomic build, " \ - "this dataset is linked to build '%s'" % param_values[name].dbkey - if data_params != len(data_param_names): - for name in data_param_names: - error_map[name] = "A dataset of the appropriate type is required" - -# Commented out by INS, 5/30/2007. What is the PURPOSE of this? -def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - """Verify the output data after each run""" - items = out_data.items() - - for name, data in items: - try: - if stderr and len( stderr ) > 0: - raise Exception( stderr ) - - except Exception, exc: - data.blurb = jobs.JOB_ERROR - data.state = jobs.JOB_ERROR - -## def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): -## pass - - -def exec_after_merge(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - exec_after_process( - app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr) - - # strip strand column if clusters were merged - items = out_data.items() - for name, data in items: - if param_dict['returntype'] == True: - data.metadata.chromCol = 1 - data.metadata.startCol = 2 - data.metadata.endCol = 3 - # merge always clobbers strand - data.metadata.strandCol = None - - -def exec_after_cluster(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None): - exec_after_process( - app, inp_data, out_data, param_dict, tool=tool, stdout=stdout, stderr=stderr) - - # strip strand column if clusters were merged - if param_dict["returntype"] == '1': - items = out_data.items() - for name, data in items: - data.metadata.strandCol = None diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract.xml --- a/tools/new_operations/subtract.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,118 +0,0 @@ - - the intervals of two datasets - gops_subtract.py - $input1 $input2 $output - - #if isinstance( $input1.datatype, $__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -1 1,4,5,7 --gff1 - #else: - -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} - #end if - - #if isinstance( $input2.datatype, 
$__app__.datatypes_registry.get_datatype_by_extension('gff').__class__): - -2 1,4,5,7 --gff2 - #else: - -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol} - #end if - - -m $min $returntype - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** If your dataset does not appear in the pulldown menu, it means that it is not in interval format. Use "edit attributes" to set chromosome, start, end, and strand columns. - ------ - -**Screencasts!** - -See Galaxy Interval Operation Screencasts_ (right click to open this link in another window). - -.. _Screencasts: http://wiki.g2.bx.psu.edu/Learn/Interval%20Operations - ------ - -**Syntax** - -- **Where overlap is at least** sets the minimum length (in base pairs) of overlap between elements of the two datasets. -- **Intervals with no overlap** returns entire intervals from the first dataset that do not overlap the second dataset. The returned intervals are completely unchanged, and this option only filters out intervals that overlap with the second dataset. -- **Non-overlapping pieces of intervals** returns intervals from the first dataset that have the intervals from the second dataset removed. Any overlapping base pairs are removed from the range of the interval. All fields besides start and end are guaranteed to remain unchanged. - ------ - -**Example** - -.. image:: ./static/operation_icons/gops_subtract.gif - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract_query.py --- a/tools/new_operations/subtract_query.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -#!/usr/bin/env python -# Greg Von Kuster - -""" -Subtract an entire query from another query -usage: %prog in_file_1 in_file_2 begin_col end_col output -""" -import sys, re -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -# Older py compatibility -try: - set() -except: - from sets import Set as set - -assert sys.version_info[:2] >= ( 2, 4 ) - -def get_lines(fname, begin_col='', end_col=''): - lines = set([]) - i = 0 - for i, line in enumerate(file(fname)): - line = line.rstrip('\r\n') - if line and not line.startswith('#'): - if begin_col and end_col: - """Both begin_col and end_col must be integers at this point.""" - try: - line = line.split('\t') - line = '\t'.join([line[j] for j in range(begin_col-1, end_col)]) - lines.add( line ) - except: pass - else: - lines.add( line ) - if i: return (i+1, lines) - else: return (i, lines) - -def main(): - - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp1_file, inp2_file, begin_col, end_col, out_file = args - except: - doc_optparse.exception() - - begin_col = begin_col.strip() - end_col = end_col.strip() - - if begin_col != 'None' or end_col != 'None': - """ - The user selected columns for restriction. We'll allow default - values for both begin_col and end_col as long as the user selected - at least one of them for restriction. 
- """ - if begin_col == 'None': - begin_col = end_col - elif end_col == 'None': - end_col = begin_col - begin_col = int(begin_col) - end_col = int(end_col) - """Make sure that begin_col <= end_col (switch if not)""" - if begin_col > end_col: - tmp_col = end_col - end_col = begin_col - begin_col = tmp_col - else: - begin_col = end_col = '' - - try: - fo = open(out_file,'w') - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - - """ - len1 is the number of lines in inp1_file - lines1 is the set of unique lines in inp1_file - diff1 is the number of duplicate lines removed from inp1_file - """ - len1, lines1 = get_lines(inp1_file, begin_col, end_col) - diff1 = len1 - len(lines1) - len2, lines2 = get_lines(inp2_file, begin_col, end_col) - - lines1.difference_update(lines2) - """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file""" - - for line in lines1: - print >> fo, line - - fo.close() - - info_msg = 'Subtracted %d lines. ' %((len1 - diff1) - len(lines1)) - - if begin_col and end_col: - info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. ' - - if diff1 > 0: - info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' %diff1 - - print info_msg - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/subtract_query.xml --- a/tools/new_operations/subtract_query.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ - - from another dataset - subtract_query.py $input1 $input2 $begin_col $end_col $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**TIP:** This tool complements the tool in the **Operate on Genomic Intervals** tool set which subtracts the intervals of two datasets. - - ------ - -**Syntax** - -This tool subtracts an entire dataset from another dataset. - -- Any text format is valid. -- If both dataset formats are tabular, you may restrict the subtraction to specific columns **contained in both datasets** and the resulting dataset will include only the columns specified. -- The begin column must be less than or equal to the end column. If it is not, begin column is switched with end column. -- If begin column is specified but end column is not, end column will default to begin_column (and vice versa). -- All blank and comment lines are skipped and not included in the resulting dataset (comment lines are lines beginning with a # character). -- Duplicate lines are eliminated from both dataset prior to subtraction. If any duplicate lines were eliminated from the first dataset, the number is displayed in the resulting history item. - ------ - -**Example** - -If this is the **First dataset**:: - - chr1 4225 19670 - chr10 6 8 - chr1 24417 24420 - chr6_hla_hap2 0 150 - chr2 1 5 - chr10 2 10 - chr1 30 55 - chrY 1 20 - chr1 1225979 42287290 - chr10 7 8 - -and this is the **Second dataset**:: - - chr1 4225 19670 - chr10 6 8 - chr1 24417 24420 - chr6_hla_hap2 0 150 - chr2 1 5 - chr1 30 55 - chrY 1 20 - chr1 1225979 42287290 - -Subtracting the **Second dataset** from the **First dataset** (including all columns) will yield:: - - chr10 7 8 - chr10 2 10 - -Conversely, subtracting the **First dataset** from the **Second dataset** (including all columns) will result in an empty dataset. 
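Internally the subtraction is a set difference over unique (optionally column-restricted) lines, as implemented by subtract_query.py above. A minimal Python sketch of the same idea, assuming two tab-delimited files whose names here are placeholders::

    def line_set(path, begin_col=None, end_col=None):
        # Collect unique, non-blank, non-comment lines; optionally keep
        # only columns begin_col..end_col (1-based, inclusive).
        lines = set()
        for line in open(path):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            if begin_col and end_col:
                fields = line.split('\t')
                line = '\t'.join(fields[begin_col - 1:end_col])
            lines.add(line)
        return lines

    # First dataset minus second dataset, restricted to columns c1-c2:
    for line in line_set('first.tabular', 1, 2) - line_set('second.tabular', 1, 2):
        print line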
- -Subtracting the **Second dataset** from the **First dataset** (restricting to columns c1 and c2) will yield:: - - chr10 7 - chr10 2 - - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/tables_arithmetic_operations.pl --- a/tools/new_operations/tables_arithmetic_operations.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ -# A program to implement arithmetic operations on tabular files data. The program takes three inputs: -# The first input is a TABULAR format file containing numbers only. -# The second input is a TABULAR format file containing numbers only. -# The two files must have the same number of columns and the same number of rows -# The third input is an arithmetic operation: +, -, *, or / for addition, subtraction, multiplication, or division, respectively -# The output file is a TABULAR format file containing the result of implementing the arithmetic operation on both input files. -# The output file has the same number of columns and the same number of rows as each of the two input files. -# Note: in case of division, none of the values in the second input file could be 0. - -use strict; -use warnings; - -#variables to handle information of the first input tabular file -my $lineData1 = ""; -my @lineDataArray1 = (); -my $lineArraySize = 0; -my $lineCounter1 = 0; - -#variables to handle information of the second input tabular file -my $lineData2= ""; -my @lineDataArray2 = (); -my $lineCounter2 = 0; - -my $result = 0; - -# check to make sure having the correct number of arguments -my $usage = "usage: tables_arithmetic_operations.pl [TABULAR.in] [TABULAR.in] [ArithmeticOperation] [TABULAR.out] \n"; -die $usage unless @ARGV == 4; - -#variables to store the names of input and output files -my $inputTabularFile1 = $ARGV[0]; -my $inputTabularFile2 = $ARGV[1]; -my $arithmeticOperation = $ARGV[2]; -my $outputTabularFile = $ARGV[3]; - -#open the input and output files -open (INPUT1, "<", $inputTabularFile1) || die("Could not open file $inputTabularFile1 \n"); -open (INPUT2, "<", $inputTabularFile2) || die("Could not open file $inputTabularFile2 \n"); -open (OUTPUT, ">", $outputTabularFile) || die("Could not open file $outputTabularFile \n"); - -#store the first input file in the array @motifsFrequencyData1 -my @tabularData1 = ; - -#store the second input file in the array @motifsFrequencyData2 -my @tabularData2 = ; - -#reset the $lineCounter1 to 0 -$lineCounter1 = 0; - -#iterated through the lines of the first input file -INDEL1: -foreach $lineData1 (@tabularData1){ - chomp ($lineData1); - $lineCounter1++; - - #reset the $lineCounter2 to 0 - $lineCounter2 = 0; - - #iterated through the lines of the second input file - foreach $lineData2 (@tabularData2){ - chomp ($lineData2); - $lineCounter2++; - - #check if the two motifs are the same in the two input files - if ($lineCounter1 == $lineCounter2){ - - @lineDataArray1 = split(/\t/, $lineData1); - @lineDataArray2 = split(/\t/, $lineData2); - - $lineArraySize = @lineDataArray1; - - for (my $index = 0; $index < $lineArraySize; $index++){ - - if ($arithmeticOperation eq "Addition"){ - #compute the additin of both values - $result = $lineDataArray1[$index] + $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Subtraction"){ - #compute the subtraction of both values - $result = $lineDataArray1[$index] - $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Multiplication"){ - #compute the multiplication of both values - $result = 
$lineDataArray1[$index] * $lineDataArray2[$index]; - } - - if ($arithmeticOperation eq "Division"){ - - #check if the denominator is 0 - if ($lineDataArray2[$index] != 0){ - #compute the division of both values - $result = $lineDataArray1[$index] / $lineDataArray2[$index]; - } - else{ - die("A denominator could not be zero \n"); - } - } - - #store the result in the output file - if ($index < $lineArraySize - 1){ - print OUTPUT $result . "\t"; - } - else{ - print OUTPUT $result . "\n"; - } - } - next INDEL1; - } - } -} - -#close the input and output files -close(OUTPUT); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/new_operations/tables_arithmetic_operations.xml --- a/tools/new_operations/tables_arithmetic_operations.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ - - on tables - - - tables_arithmetic_operations.pl $inputFile1 $inputFile2 $inputArithmeticOperation3 $outputFile1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -This program implements arithmetic operations on tabular files data. The program takes three inputs: - -- The first input is a TABULAR format file containing numbers only. -- The second input is a TABULAR format file containing numbers only. -- The third input is an arithmetic operation: +, -, x, or / for addition, subtraction, multiplication, or division, respectively. -- The output file is a TABULAR format file containing the result of implementing the arithmetic operation on both input files. - - -Notes: - -- The two files must have the same number of columns and the same number of rows. -- The output file has the same number of columns and the same number of rows as each of the two input files. -- In case of division, none of the values in the second input file could be 0, otherwise the program will stop and report an error. - -**Example** - -Let us have the first input file as follows:: - - 5 4 0 - 10 11 12 - 1 3 1 - 1 2 1 - 2 0 4 - -And the second input file as follows:: - - 5 4 4 - 2 5 8 - 1 2 1 - 3 2 5 - 2 4 4 - -Running the program and choosing "Addition" as an arithmetic operation will give the following output:: - - 10 8 4 - 12 16 20 - 2 5 2 - 4 4 6 - 4 4 8 - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/bwa_solid2fastq_modified.pl --- a/tools/next_gen_conversion/bwa_solid2fastq_modified.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 -# Note: Ideally, this script should be written in C. It is a bit slow at present. - -use strict; -use warnings; -use Getopt::Std; - -my %opts; -my $version = '0.1.3'; -my $usage = qq{ -Usage: solid2fastq.pl - -Note: is the string showed in the `# Title:' line of a - ".csfasta" read file. Then F3.csfasta is read sequence - file and F3_QV.qual is the quality file. If - R3.csfasta is present, this script assumes reads are - paired; otherwise reads will be regarded as single-end. - - The read name will be :panel_x_y/[12] with `1' for R3 - tag and `2' for F3. Usually you may want to use short - to save diskspace. Long also causes troubles to maq. - -}; - -getopts('', \%opts); -die($usage) if (@ARGV != 7); -my ($is_paired,$outfile1,$outfile2,$f3reads,$f3qual,$r3reads,$r3qual) = @ARGV; -my (@fhr, @fhw); -my $fn = ''; -my @fn_suff = ($f3reads,$f3qual,$r3reads,$r3qual); -if ($is_paired eq "yes") { # paired end - for (0 .. 
3) { - $fn = $fn_suff[$_]; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[0], "|gzip >$outfile2") || die; - open($fhw[1], "|gzip >$outfile1") || die; - my (@df, @dr); - @df = &read1(1); @dr = &read1(2); - while (@df && @dr) { - if ($df[0] eq $dr[0]) { # mate pair - print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; - @df = &read1(1); @dr = &read1(2); - } - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[$_]) for (0 .. $#fhw); -} else { # single end - for (0 .. 1) { - my $fn = "$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[2], "|gzip >$outfile1") || die; - my @df; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[2]); -} - -sub read1 { - my $i = shift(@_); - my $j = ($i-1)<<1; - my ($key, $seq); - my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); - while (<$fhs>) { - my $t = <$fhq>; - if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { - $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines - #print $key; - die(qq/** unmatched read name: '$_' != '$t'\n/) unless ($_ eq $t); - my $name = "$1_$2_$3/$i"; - $_ = substr(<$fhs>, 2); - tr/0123./ACGTN/; - my $s = $_; - $_ = <$fhq>; - s/^(\d+)\s*//; - s/(\d+)\s*/chr($1+33)/eg; - $seq = qq/\@$name\n$s+\n$_\n/; - last; - } - } - return defined($seq)? ($key, $seq) : (); -} diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_conversions.py --- a/tools/next_gen_conversion/fastq_conversions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -""" -Performs various conversions around Sanger FASTQ data - -usage: %prog [options] - -c, --command=c: Command to run - -i, --input=i: Input file to be converted - -o, --outputFastqsanger=o: FASTQ Sanger converted output file for sol2std - -s, --outputFastqsolexa=s: FASTQ Solexa converted output file - -f, --outputFasta=f: FASTA converted output file - -usage: %prog command input_file output_file -""" - -import os, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - cmd = "fq_all2std.pl %s %s > %s" - if options.command == 'sol2std': - cmd = cmd % (options.command, options.input, options.outputFastqsanger) - elif options.command == 'std2sol': - cmd = cmd % (options.command, options.input, options.outputFastqsolexa) - elif options.command == 'fq2fa': - cmd = cmd % (options.command, options.input, options.outputFasta) - try: - os.system(cmd) - except Exception, eq: - stop_err("Error converting data format.\n" + str(eq)) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_conversions.xml --- a/tools/next_gen_conversion/fastq_conversions.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ - - converts between FASTQ data and other data formats - - fastq_conversions.py - --command=$conversionType.type - --input=$input - #if $conversionType.type == "sol2std": - --outputFastqsanger=$outputFastqsanger - #else: - --outputFastqsanger="None" - #end if - #if $conversionType.type == "std2sol": - 
--outputFastqsolexa=$outputFastqsolexa - #else: - --outputFastqsolexa="None" - #end if - #if $conversionType.type == "fq2fa": - --outputFasta=$outputFasta - #else: - --outputFasta="None" - #end if - - - - - - - - - - - - - - - - - - - - - - conversionType['type'] == 'sol2std' - - - conversionType['type'] == 'std2sol' - - - conversionType['type'] == 'fq2fa' - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool offers several conversions options relating to the FASTQ format. - ------ - -**Examples** - -- Converting the Solexa/Illumina FASTQ data:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B - -- will produce the following Sanger FASTQ data:: - - @081017-and-081020:1:1:1715:1759 - GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC - + - ++!+++++++!!!!!"+++++++!!!!)!%!!+!!%! - -- Converting standard Sanger FASTQ:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ><C&&9952+C>5<.?<79,=42<292:<(9/-7 - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, - -- will produce the following Solexa/Illumina FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ][bEEXXTQJb]T[M^[VXK\SQ[QXQY[GXNLV - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - Z__PV^_\]V^^_`W^\\_S`^`SHEJMFEJFPK - -- Converting the Sanger FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - ><C&&9952+C>5<.?<79,=42<292:<(9/-7 - @1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)&+.'&+'1, - -- will produce the following FASTA data:: - - >1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - >1831_573_1050/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_gen_conv.py --- a/tools/next_gen_conversion/fastq_gen_conv.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,172 +0,0 @@ -""" -Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary. 
- -usage: %prog [options] - -i, --input=i: Input FASTQ candidate file - -r, --origType=r: Original type - -a, --allOrNot=a: Whether or not to check all blocks - -b, --blocks=b: Number of blocks to check - -o, --output=o: Output file - -usage: %prog input_file oroutput_file -""" - -import math, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def all_bases_valid(seq): - """Confirm that the sequence contains only bases""" - valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N'] - for base in seq: - if base not in valid_bases: - return False - return True - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - orig_type = options.origType - if orig_type == 'sanger' and options.allOrNot == 'not': - max_blocks = int(options.blocks) - else: - max_blocks = -1 - fin = file(options.input, 'r') - fout = file(options.output, 'w') - range_min = 1000 - range_max = -5 - block_num = 0 - bad_blocks = 0 - base_len = -1 - line_count = 0 - lines = [] - line = fin.readline() - while line: - if line.strip() and max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and block_num >= max_blocks: - fout.write(line) - if line_count % 4 == 0: - block_num += 1 - line_count += 1 - elif line.strip(): - # the line that starts a block, with a name - if line_count % 4 == 0 and line.startswith('@'): - lines.append(line) - else: - # if we expect a sequence of bases - if line_count % 4 == 1 and all_bases_valid(line.strip()): - lines.append(line) - base_len = len(line.strip()) - # if we expect the second name line - elif line_count % 4 == 2 and line.startswith('+'): - lines.append(line) - # if we expect a sequence of qualities and it's the expected length - elif line_count % 4 == 3: - split_line = line.strip().split() - # decimal qualities - if len(split_line) == base_len: - # convert - phred_list = [] - for ch in split_line: - int_ch = int(ch) - if int_ch < range_min: - range_min = int_ch - if int_ch > range_max: - range_max = int_ch - if int_ch >= 0 and int_ch <= 93: - phred_list.append(chr(int_ch + 33)) - # make sure we haven't lost any quality values - if len(phred_list) == base_len: - # print first three lines - for l in lines: - fout.write(l) - # print converted quality line - fout.write(''.join(phred_list)) - # reset - lines = [] - base_len = -1 - # abort if so - else: - bad_blocks += 1 - lines = [] - base_len = -1 - # ascii qualities - elif len(split_line[0]) == base_len: - qualities = [] - # print converted quality line - if orig_type == 'illumina': - for c in line.strip(): - if ord(c) - 64 < range_min: - range_min = ord(c) - 64 - if ord(c) - 64 > range_max: - range_max = ord(c) - 64 - if ord(c) < 64 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - qualities.append( chr( ord(c) - 31 ) ) - quals = ''.join(qualities) - elif orig_type == 'solexa': - for c in line.strip(): - if ord(c) - 64 < range_min: - range_min = ord(c) - 64 - if ord(c) - 64 > range_max: - range_max = ord(c) - 64 - if ord(c) < 59 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) ) - qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) ) - quals = ''.join(qualities) - else: # 'sanger' - for c in line.strip(): - if ord(c) - 33 < range_min: - range_min = ord(c) - 33 - if ord(c) - 33 > range_max: - range_max = 
ord(c) - 33 - if ord(c) < 33 or ord(c) > 126: - bad_blocks += 1 - base_len = -1 - lines = [] - break - else: - qualities.append(c) - quals = ''.join(qualities) - # make sure we don't have bad qualities - if len(quals) == base_len: - # print first three lines - for l in lines: - fout.write(l) - # print out quality line - fout.write(quals+'\n') - # reset - lines = [] - base_len = -1 - else: - bad_blocks += 1 - base_len = -1 - lines = [] - # mark the successful end of a block - block_num += 1 - line_count += 1 - line = fin.readline() - fout.close() - fin.close() - if range_min != 1000 and range_min != -5: - outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max) - else: - outmsg = '' - if bad_blocks > 0: - outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks) - sys.stdout.write(outmsg) - -if __name__=="__main__": __main__() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/fastq_gen_conv.xml --- a/tools/next_gen_conversion/fastq_gen_conv.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - converts any FASTQ to Sanger - - fastq_gen_conv.py - --input=$input - --origType=$origTypeChoice.origType - #if $origTypeChoice.origType == "sanger": - --allOrNot=$origTypeChoice.howManyBlocks.allOrNot - #if $origTypeChoice.howManyBlocks.allOrNot == "not": - --blocks=$origTypeChoice.howManyBlocks.blocks - #else: - --blocks="None" - #end if - #else: - --allOrNot="None" - --blocks="None" - #end if - --output=$output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Galaxy pipeline for mapping of Illumina data requires data to be in fastq format with quality values conforming to so called "Sanger" format. Unfortunately there are many other types of fastq. Thus the main objective of this tool is to "groom" multiple types of fastq into Sanger-conforming fastq that can be used in downstream application such as mapping. - -.. class:: infomark - -**TIP**: If the input dataset is already in Sanger format the tool does not perform conversion. However validation (described below) is still performed. - ------ - -**Types of fastq datasets** - -A good description of fastq datasets can be found `here`__, while a description of Galaxy's fastq "logic" can be found `here`__. Because ranges of quality values within different types of fastq datasets overlap it very difficult to detect them automatically. This tool supports conversion of two commonly found types (Solexa/Illumina 1.0 and Illumina 1.3+) into fastq Sanger. - - .. __: http://en.wikipedia.org/wiki/FASTQ_format - .. __: http://wiki.g2.bx.psu.edu/Admin/NGS%20Local%20Setup - -.. class:: warningmark - -**NOTE** that there is also a type of fastq format where quality values are represented by a list of space-delimited integers (e.g., 40 40 20 15 -5 20 ...). This tool **does not** handle such fastq. If you have such a dataset, it needs to be converted into ASCII-type fastq (where quality values are encoded by characters) by "Numeric-to-ASCII" utility before it can accepted by this tool. - ------ - -**Validation** - -In addition to converting quality values to Sanger format the tool also checks the input dataset for consistency. Specifically, it performs these four checks: - -- skips empty lines -- checks that blocks are properly formed by making sure that: - - #. there are four lines per block - #. the first line starts with "@" - #. the third line starts with "+" - #. 
lengths of second line (sequences) and the fourth line (quality string) are identical - -- checks that quality values are within range for the chosen fastq format (e.g., the format provided by the user in **How do you think quality values are scaled?** drop down. - -To see exactly what the tool does you can take a look at its source code `here`__. - - .. __: http://bitbucket.org/galaxy/galaxy-central/src/tip/tools/next_gen_conversion/fastq_gen_conv.py - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid2fastq.py --- a/tools/next_gen_conversion/solid2fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ -#!/usr/bin/env python - -import sys -import string -import optparse -import tempfile -import sqlite3 - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def solid2sanger( quality_string, min_qual = 0 ): - sanger = "" - quality_string = quality_string.rstrip( " " ) - for qv in quality_string.split(" "): - try: - if int( qv ) < 0: - qv = '0' - if int( qv ) < min_qual: - return False - break - sanger += chr( int( qv ) + 33 ) - except: - pass - return sanger - -def Translator(frm='', to='', delete='', keep=None): - allchars = string.maketrans('','') - if len(to) == 1: - to = to * len(frm) - trans = string.maketrans(frm, to) - if keep is not None: - delete = allchars.translate(allchars, keep.translate(allchars, delete)) - def callable(s): - return s.translate(trans, delete) - return callable - -def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): - - # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, - # which can have three values: fastq, txt, and db - # fastq = fastq format - # txt = space delimited format with defline, reads, and qvs - # dp = dump data into sqlite3 db. - # IMPORTNAT! If out = db two optins must be provided: - # 1. f_out must be a db connection object initialized with sqlite3.connect() - # 2. table_name must be provided - - if out == 'db': - cursor = f_out.cursor() - sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name - cursor.execute(sql) - - lines = [] - line = " " - while line: - for f in [ f_reads, f_qual ]: - line = f.readline().rstrip( '\n\r' ) - while line.startswith( '#' ): - line = f.readline().rstrip( '\n\r' ) - lines.append( line ) - - - if lines[0].startswith( '>' ) and lines[1].startswith( '>' ): - - if lines[0] != lines[1]: - stop_err('Files reads and quality score files are out of sync and likely corrupted. 
Please, check your input data') - - defline = lines[0][1:] - if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): - defline = defline[ : len( defline )-3 ] - - elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ): - - if trim_first_base: - lines[0] = lines[0][1:] - if double_encode: - de = Translator(frm="0123.", to="ACGTN") - lines[0] = de(lines[0]) - qual = solid2sanger( lines[1], int( min_qual ) ) - if qual: - if out == 'fastq': - f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) - if out == 'txt': - f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) - if out == 'db': - cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) - lines = [] - -def main(): - - usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" - parser = optparse.OptionParser(usage=usage) - - - parser.add_option( - '--fr','--f_reads', - metavar="F3_CSFASTA_FILE", - dest='fr', - help='Name of F3 file with color space reads') - - parser.add_option( - '--fq','--f_qual', - metavar="F3_QUAL_FILE", - dest='fq', - help='Name of F3 file with color quality values') - - parser.add_option( - '--fout','--f3_fastq_output', - metavar="F3_OUTPUT", - dest='fout', - help='Name for F3 output file') - - parser.add_option( - '--rr','--r_reads', - metavar="R3_CSFASTA_FILE", - dest='rr', - default = False, - help='Name of R3 file with color space reads') - - parser.add_option( - '--rq','--r_qual', - metavar="R3_QUAL_FILE", - dest='rq', - default = False, - help='Name of R3 file with color quality values') - - parser.add_option( - '--rout', - metavar="R3_OUTPUT", - dest='rout', - help='Name for F3 output file') - - parser.add_option( - '-q','--min_qual', - dest='min_qual', - default = '-1000', - help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') - - parser.add_option( - '-t','--trim_name', - dest='trim_name', - action='store_true', - default = False, - help='Trim _R3 and _F3 off read names. Default is False') - - parser.add_option( - '-f','--trim_first_base', - dest='trim_first_base', - action='store_true', - default = False, - help='Remove the first base of reads in color-space. Default is False') - - parser.add_option( - '-d','--double_encode', - dest='de', - action='store_true', - default = False, - help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') - - options, args = parser.parse_args() - - if not ( options.fout and options.fr and options.fq ): - parser.error(""" - One or more of the three required paremetrs is missing: - (1) --fr F3.csfasta file - (2) --fq F3.qual file - (3) --fout name of output file - Use --help for more info - """) - - fr = open ( options.fr , 'r' ) - fq = open ( options.fq , 'r' ) - f_out = open ( options.fout , 'w' ) - - if options.rr and options.rq: - rr = open ( options.rr , 'r' ) - rq = open ( options.rq , 'r' ) - if not options.rout: - parser.error("Provide the name for f3 output using --rout option. 
Use --help for more info") - r_out = open ( options.rout, 'w' ) - - db = tempfile.NamedTemporaryFile() - - try: - con = sqlite3.connect(db.name) - cur = con.cursor() - except: - stop_err('Cannot connect to %s\n') % db.name - - - merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) - merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) - cur.execute('create index f3_name on f3( name )') - cur.execute('create index r3_name on r3( name )') - - cur.execute('select * from f3,r3 where f3.name = r3.name') - for item in cur: - f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) - r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) - - - else: - merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) - - - - f_out.close() - -if __name__ == "__main__": - main() - - \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid2fastq.xml --- a/tools/next_gen_conversion/solid2fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ - - SOLiD output to fastq - - #if $is_run.paired == "no" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 -q $qual $trim_name $trim_first_base $double_encode - #elif $is_run.paired == "yes" #solid2fastq.py --fr=$input1 --fq=$input2 --fout=$out_file1 --rr=$input3 --rq=$input4 --rout=$out_file2 -q $qual $trim_name $trim_first_base $double_encode - #end if# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - is_run['paired'] == 'yes' - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -Converts output of SOLiD instrument (versions 3.5 and earlier) to fastq format suitable for bowtie, bwa, and PerM mappers. - --------- - -**Input datasets** - -Below are examples of forward (F3) reads and quality scores: - -Reads:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - >1831_573_1567_F3 - T03330322230322112131010221102122113 - -Quality scores:: - - >1831_573_1004_F3 - 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 - >1831_573_1567_F3 - 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 - - -**Mate pairs** - -If your data is from a mate-paired run, you will have additional read and quality datasets that will look similar to the ones above with one exception: the names of reads will be ending with "_R3". -In this case choose **Yes** from the *Is this a mate-pair run?* drop down and you will be able to select R reads. When processing mate pairs this tool generates two output files: one for F3 reads and the other for R3 reads. -The reads are guaranteed to be paired -- mated reads will be in the same position in F3 and R3 fastq file. However, because pairing is verified it may take a while to process an entire SOLiD run (several hours). - ------- - -**Explanation of parameters** - -**Remove reads containing color qualities below this value** - any read that contains as least one color call with quality lower than the specified value **will not** be reported. - -**Trim trailing "_F3" and "_R3"?** - does just that. 
Not necessary for bowtie. Required for BWA. - -**Trim first base?** - SOLiD reads contain an adapter base such as the first T in this read:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - -this option removes this base leaving only color calls. Not necessary for bowtie. Required for BWA. - -**Double encode?** - converts color calls (0123.) to pseudo-nucleotides (ACGTN). Not necessary for bowtie. Required for BWA. - ------- - -**Examples of output** - -When all parameters are left "as-is" you will get this (using reads and qualities shown above):: - - @1831_573_1004 - T00030133312212111300011021310132222 - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - T03330322230322112131010221102122113 - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - -Setting *Trim first base from reads* to **Yes** will produce this:: - - @1831_573_1004 - 00030133312212111300011021310132222 - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - 03330322230322112131010221102122113 - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - -Finally, setting *Double encode* to **Yes** will yield:: - - @1831_573_1004 - TAAATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - %>CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1004 - TATTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - );@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid_to_fastq.py --- a/tools/next_gen_conversion/solid_to_fastq.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -#!/usr/bin/env python - -""" -Converts SOLiD data to Sanger FASTQ format. - -usage: %prog [options] - -i, --input1=i: Forward reads file - -q, --input2=q: Forward qual file - -I, --input3=I: Reverse reads file - -Q, --input4=Q: Reverse qual file - -o, --output1=o: Forward output - -r, --output2=r: Reverse output - -usage: %prog forward_reads_file forwards_qual_file reverse_reads_file(or_None) reverse_qual_file(or_None) output_file ouptut_id output_dir -""" - -import os, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def replaceNeg1(fin, fout): - line = fin.readline() - while line.strip(): - fout.write(line.replace('-1', '1')) - line = fin.readline() - fout.seek(0) - return fout - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # common temp file setup - tmpf = tempfile.NamedTemporaryFile() #forward reads - tmpqf = tempfile.NamedTemporaryFile() - tmpqf = replaceNeg1(file(options.input2,'r'), tmpqf) - # if paired-end data (have reverse input files) - if options.input3 != "None" and options.input4 != "None": - tmpr = tempfile.NamedTemporaryFile() #reverse reads - # replace the -1 in the qualities file - tmpqr = tempfile.NamedTemporaryFile() - tmpqr = replaceNeg1(file(options.input4,'r'), tmpqr) - cmd1 = "%s/bwa_solid2fastq_modified.pl 'yes' %s %s %s %s %s %s 2>&1" %(os.path.split(sys.argv[0])[0], tmpf.name, tmpr.name, options.input1, tmpqf.name, options.input3, tmpqr.name) - try: - os.system(cmd1) - os.system('gunzip -c %s >> %s' %(tmpf.name,options.output1)) - os.system('gunzip -c %s >> %s' %(tmpr.name,options.output2)) - except Exception, eq: - stop_err("Error converting data to fastq format.\n" + str(eq)) - tmpr.close() - tmpqr.close() - # if single-end data - else: - cmd1 = "%s/bwa_solid2fastq_modified.pl 'no' %s %s %s %s %s %s 2>&1" % (os.path.split(sys.argv[0])[0], tmpf.name, None, 
options.input1, tmpqf.name, None, None) - try: - os.system(cmd1) - os.system('gunzip -c %s >> %s' % (tmpf.name, options.output1)) - except Exception, eq: - stop_err("Error converting data to fastq format.\n" + str(eq)) - tmpqf.close() - tmpf.close() - sys.stdout.write('converted SOLiD data') - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/next_gen_conversion/solid_to_fastq.xml --- a/tools/next_gen_conversion/solid_to_fastq.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ - - converts SOLiD data to FASTQ data - - solid_to_fastq.py - --input1=$input1 - --input2=$input2 - #if $paired.pairedSingle == "single": - --input3="None" - --input4="None" - #else: - --input3=$input3 - --input4=$input4 - #end if - --output1=$output1 - #if $paired.pairedSingle == "single": - --output2="None" - #else: - --output2=$output2 - #end if - - - - - - - - - - - - - - - - - - - - - - - - paired['pairedSingle'] == 'paired' - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool takes reads and quality files and converts them to FASTQ data ( Sanger variant ). Any -1 qualities are converted to 1 before being converted to FASTQ. Note that it also converts sequences to base pairs. - ------ - -**Example** - -- Converting the following sequences:: - - >1831_573_1004_F3 - T00030133312212111300011021310132222 - >1831_573_1567_F3 - T03330322230322112131010221102122113 - -- and quality scores:: - - >1831_573_1004_F3 - 4 29 34 34 32 32 24 24 20 17 10 34 29 20 34 13 30 34 22 24 11 28 19 17 34 17 24 17 25 34 7 24 14 12 22 - >1831_573_1567_F3 - 8 26 31 31 16 22 30 31 28 29 22 30 30 31 32 23 30 28 28 31 19 32 30 32 19 8 32 10 13 6 32 10 6 16 11 - -- will produce the following Sanger FASTQ data:: - - @1831_573_1004/1 - AATACTTTCGGCGCCCTAAACCAGCTCACTGGGG - + - >CCAA9952+C>5C.?C79,=42C292:C(9/-7 - @1831_573_1567/1 - TTTATGGGTATGGCCGCTCACAGGCCAGCGGCCT - + - ;@@17?@=>7??@A8?==@4A?A4)A+.'A+'1, - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffcompare_wrapper.py --- a/tools/ngs_rna/cuffcompare_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-r', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.' ) - parser.add_option( '-R', action="store_true", dest='ignore_nonoverlap', help='If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. 
Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts accuracy file' ) - parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.') - - # Wrapper / Galaxy options. - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - # Outputs. - parser.add_option( '', '--combined-transcripts', dest='combined_transcripts' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cuffcompare 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cuffcompare v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cuffcompare version\n' ) - - # Set/link to sequence file. - if options.use_seq_data: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = "ref.fa" - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base. - cmd = "cuffcompare -o cc_output " - - # Add options. - if options.ref_annotation: - cmd += " -r %s " % options.ref_annotation - if options.ignore_nonoverlap: - cmd += " -R " - if options.use_seq_data: - cmd += " -s %s " % seq_path - - # Add input files. - - # Need to symlink inputs so that output files are written to temp directory. - for i, arg in enumerate( args ): - input_file_name = "./input%i" % ( i+1 ) - os.symlink( arg, input_file_name ) - cmd += "%s " % input_file_name - - # Debugging. - print cmd - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. 
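# Note: cuffcompare reports failure only through its exit status, so a
# non-zero return code is turned into an exception carrying the captured
# stderr; stop_err() then surfaces that message to Galaxy.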
- if returncode != 0: - raise Exception, stderr - - # Copy outputs. - shutil.copyfile( "cc_output.combined.gtf" , options.combined_transcripts ) - - # check that there are results in the output file - cc_output_fname = "cc_output.stats" - if len( open( cc_output_fname, 'rb' ).read().strip() ) == 0: - raise Exception, 'The main output file is empty, there may be an error with your input file or settings.' - except Exception, e: - stop_err( 'Error running cuffcompare. ' + str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffcompare_wrapper.xml --- a/tools/ngs_rna/cuffcompare_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,217 +0,0 @@ - - - compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments - - cufflinks - - - cuffcompare_wrapper.py - - ## Use annotation reference? - #if $annotation.use_ref_annotation == "Yes": - -r $annotation.reference_annotation - #if $annotation.ignore_nonoverlapping_reference: - -R - #end if - #end if - - ## Use sequence data? - #if $seq_data.use_seq_data == "Yes": - -s - #if $seq_data.seq_source.index_source == "history": - --ref_file=$seq_data.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${first_input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - ## Outputs. - --combined-transcripts=${transcripts_combined} - - ## Inputs. - ${first_input} - #for $input_file in $input_files: - ${input_file.additional_input} - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - len( input_files ) > 0 - - - len( input_files ) > 0 - - - len( input_files ) > 0 - - - - - - - - - - - - - - - - - - - - - - - - - -**Cuffcompare Overview** - -Cuffcompare is part of Cufflinks_. Cuffcompare helps you: (a) compare your assembled transcripts to a reference annotation and (b) track Cufflinks transcripts across multiple experiments (e.g. across a time course). Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffcompare - ------- - -**Input format** - -Cuffcompare takes Cufflinks' GTF output as input, and optionally can take a "reference" annotation (such as from Ensembl_) - -.. _Ensembl: http://www.ensembl.org - ------- - -**Outputs** - -Cuffcompare produces the following output files: - -Transcripts Accuracy File: - -Cuffcompare reports various statistics related to the "accuracy" of the transcripts in each sample when compared to the reference annotation data. The typical gene finding measures of "sensitivity" and "specificity" (as defined in Burset, M., Guigó, R. : Evaluation of gene structure prediction programs (1996) Genomics, 34 (3), pp. 353-367. 
doi: 10.1006/geno.1996.0298) are calculated at various levels (nucleotide, exon, intron, transcript, gene) for each input file and reported in this file. The Sn and Sp columns show specificity and sensitivity values at each level, while the fSn and fSp columns are "fuzzy" variants of these same accuracy calculations, allowing for a very small variation in exon boundaries to still be counted as a "match". - -Transcripts Combined File: - -Cuffcompare reports a GTF file containing the "union" of all transfrags in each sample. If a transfrag is present in both samples, it is thus reported once in the combined gtf. - -Transcripts Tracking File: - -This file matches transcripts up between samples. Each row contains a transcript structure that is present in one or more input GTF files. Because the transcripts will generally have different IDs (unless you assembled your RNA-Seq reads against a reference transcriptome), cuffcompare examines the structure of each the transcripts, matching transcripts that agree on the coordinates and order of all of their introns, as well as strand. Matching transcripts are allowed to differ on the length of the first and last exons, since these lengths will naturally vary from sample to sample due to the random nature of sequencing. -If you ran cuffcompare with the -r option, the first and second columns contain the closest matching reference transcript to the one described by each row. - -Here's an example of a line from the tracking file:: - - TCONS_00000045 XLOC_000023 Tcea|uc007afj.1 j \ - q1:exp.115|exp.115.0|100|3.061355|0.350242|0.350207 \ - q2:60hr.292|60hr.292.0|100|4.094084|0.000000|0.000000 - -In this example, a transcript present in the two input files, called exp.115.0 in the first and 60hr.292.0 in the second, doesn't match any reference transcript exactly, but shares exons with uc007afj.1, an isoform of the gene Tcea, as indicated by the class code j. The first three columns are as follows:: - - Column number Column name Example Description - ----------------------------------------------------------------------- - 1 Cufflinks transfrag id TCONS_00000045 A unique internal id for the transfrag - 2 Cufflinks locus id XLOC_000023 A unique internal id for the locus - 3 Reference gene id Tcea The gene_name attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript - 4 Reference transcript id uc007afj.1 The transcript_id attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript - 5 Class code c The type of match between the Cufflinks transcripts in column 6 and the reference transcript. See class codes - -Each of the columns after the fifth have the following format: - qJ:gene_id|transcript_id|FMI|FPKM|conf_lo|conf_hi - -A transcript need be present in all samples to be reported in the tracking file. A sample not containing a transcript will have a "-" in its entry in the row for that transcript. - -Class Codes - -If you ran cuffcompare with the -r option, tracking rows will contain the following values. If you did not use -r, the rows will all contain "-" in their class code column:: - - Priority Code Description - --------------------------------- - 1 = Match - 2 c Contained - 3 j New isoform - 4 e A single exon transcript overlapping a reference exon and at least 10 bp of a reference intron, indicating a possible pre-mRNA fragment. - 5 i A single exon transcript falling entirely with a reference intron - 6 r Repeat. 
Currently determined by looking at the reference sequence and applied to transcripts where at least 50% of the bases are lower case - 7 p Possible polymerase run-on fragment - 8 u Unknown, intergenic transcript - 9 o Unknown, generic overlap with reference - 10 . (.tracking file only, indicates multiple classifications) - -------- - -**Settings** - -All of the options have a default value. You can change any of them. Most of the options in Cuffcompare have been implemented here. - ------- - -**Cuffcompare parameter list** - -This is a list of implemented Cuffcompare options:: - - -r An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below. - -R If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts_accuracy file - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffdiff_wrapper.py --- a/tools/ngs_rna/cuffdiff_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,233 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def group_callback( option, op_str, value, parser ): - groups = [] - flist = [] - for arg in parser.rargs: - arg = arg.strip() - if arg[0] is "-": - break - elif arg[0] is ",": - groups.append(flist) - flist = [] - else: - flist.append(arg) - groups.append(flist) - - setattr(parser.values, option.dest, groups) - -def label_callback( option, op_str, value, parser ): - labels = [] - for arg in parser.rargs: - arg = arg.strip() - if arg[0] is "-": - break - else: - labels.append(arg) - - setattr(parser.values, option.dest, labels) - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - - # Cuffdiff options. - parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' ) - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For, example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. The default is 45bp.') - parser.add_option( '-c', '--min-alignment-count', dest='min_alignment_count', help='The minimum number of alignments in a locus for needed to conduct significance testing on changes in that locus observed between samples. 
If no testing is performed, changes in the locus are deemed not signficant, and the locus\' observed changes don\'t contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).' ) - parser.add_option( '--FDR', dest='FDR', help='The allowed false discovery rate. The default is 0.05.' ) - - # Advanced Options: - parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' ) - parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' ) - - # Wrapper / Galaxy options. - parser.add_option( '-f', '--files', dest='groups', action="callback", callback=group_callback, help="Groups to be processed, groups are separated by spaces, replicates in a group comma separated. group1_rep1,group1_rep2 group2_rep1,group2_rep2, ..., groupN_rep1, groupN_rep2" ) - parser.add_option( '-A', '--inputA', dest='inputA', help='A transcript GTF file produced by cufflinks, cuffcompare, or other source.') - parser.add_option( '-1', '--input1', dest='input1', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) - parser.add_option( '-2', '--input2', dest='input2', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' ) - - # Label options - parser.add_option('-L', '--labels', dest='labels', action="callback", callback=label_callback, help="Labels for the groups the replicates are in.") - - # Normalization options. - parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" ) - - # Bias correction options. - parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - # Outputs. 
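# Each option below receives a Galaxy-managed destination path for one of
# the files cuffdiff writes into its temp output directory; after a
# successful run they are filled in by the shutil.copyfile block near the
# end of __main__().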
- parser.add_option( "--isoforms_fpkm_tracking_output", dest="isoforms_fpkm_tracking_output" ) - parser.add_option( "--genes_fpkm_tracking_output", dest="genes_fpkm_tracking_output" ) - parser.add_option( "--cds_fpkm_tracking_output", dest="cds_fpkm_tracking_output" ) - parser.add_option( "--tss_groups_fpkm_tracking_output", dest="tss_groups_fpkm_tracking_output" ) - parser.add_option( "--isoforms_exp_output", dest="isoforms_exp_output" ) - parser.add_option( "--genes_exp_output", dest="genes_exp_output" ) - parser.add_option( "--tss_groups_exp_output", dest="tss_groups_exp_output" ) - parser.add_option( "--cds_exp_fpkm_tracking_output", dest="cds_exp_fpkm_tracking_output" ) - parser.add_option( "--splicing_diff_output", dest="splicing_diff_output" ) - parser.add_option( "--cds_diff_output", dest="cds_diff_output" ) - parser.add_option( "--promoters_diff_output", dest="promoters_diff_output" ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cuffdiff --no-update-check 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cuffdiff v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cuffdiff version\n' ) - - # Make temp directory for output. - tmp_output_dir = tempfile.mkdtemp() - - # If doing bias correction, set/link to sequence file. - if options.do_bias_correction: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = os.path.join( tmp_output_dir, "ref.fa" ) - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base; always use quiet mode to avoid problems with storing log output. - cmd = "cuffdiff --no-update-check -q" - - # Add options. - if options.inner_dist_std_dev: - cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) ) - if options.num_threads: - cmd += ( " -p %i" % int ( options.num_threads ) ) - if options.inner_mean_dist: - cmd += ( " -m %i" % int ( options.inner_mean_dist ) ) - if options.min_alignment_count: - cmd += ( " -c %i" % int ( options.min_alignment_count ) ) - if options.FDR: - cmd += ( " --FDR %f" % float( options.FDR ) ) - if options.num_importance_samples: - cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) ) - if options.max_mle_iterations: - cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) ) - if options.do_normalization: - cmd += ( " -N" ) - if options.do_bias_correction: - cmd += ( " -b %s" % seq_path ) - - # Add inputs. 
- # For replicate analysis: group1_rep1,group1_rep2 groupN_rep1,groupN_rep2 - if options.groups: - cmd += " --labels " - for label in options.labels: - cmd += label + "," - cmd = cmd[:-1] - - cmd += " " + options.inputA + " " - - for group in options.groups: - for filename in group: - cmd += filename + "," - cmd = cmd[:-1] + " " - else: - cmd += " " + options.inputA + " " + options.input1 + " " + options.input2 - - # Debugging. - print cmd - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. - tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Error checking. - if returncode != 0: - raise Exception, stderr - - # check that there are results in the output file - if len( open( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), 'rb' ).read().strip() ) == 0: - raise Exception, 'The main output file is empty, there may be an error with your input file or settings.' - except Exception, e: - stop_err( 'Error running cuffdiff. ' + str( e ) ) - - - # Copy output files from tmp directory to specified files. - try: - try: - shutil.copyfile( os.path.join( tmp_output_dir, "isoforms.fpkm_tracking" ), options.isoforms_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "genes.fpkm_tracking" ), options.genes_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds.fpkm_tracking" ), options.cds_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "tss_groups.fpkm_tracking" ), options.tss_groups_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "isoform_exp.diff" ), options.isoforms_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "gene_exp.diff" ), options.genes_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "tss_group_exp.diff" ), options.tss_groups_exp_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "splicing.diff" ), options.splicing_diff_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds.diff" ), options.cds_diff_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "cds_exp.diff" ), options.cds_exp_fpkm_tracking_output ) - shutil.copyfile( os.path.join( tmp_output_dir, "promoters.diff" ), options.promoters_diff_output ) - except Exception, e: - stop_err( 'Error in cuffdiff:\n' + str( e ) ) - finally: - # Clean up temp dirs - if os.path.exists( tmp_output_dir ): - shutil.rmtree( tmp_output_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cuffdiff_wrapper.xml --- a/tools/ngs_rna/cuffdiff_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,238 +0,0 @@ - - - find significant changes in transcript expression, splicing, and promoter use - - cufflinks - - - cuffdiff_wrapper.py - --FDR=$fdr - --num-threads="4" - --min-alignment-count=$min_alignment_count - - --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking - --genes_fpkm_tracking_output=$genes_fpkm_tracking - --cds_fpkm_tracking_output=$cds_fpkm_tracking - --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking - --isoforms_exp_output=$isoforms_exp - 
--genes_exp_output=$genes_exp - --tss_groups_exp_output=$tss_groups_exp - --cds_exp_fpkm_tracking_output=$cds_exp_fpkm_tracking - --splicing_diff_output=$splicing_diff - --cds_diff_output=$cds_diff - --promoters_diff_output=$promoters_diff - - ## Set paired-end data parameters? - #if $singlePaired.sPaired == "Yes": - -m $singlePaired.mean_inner_distance - -s $singlePaired.inner_distance_std_dev - #end if - - ## Normalization? - #if str($do_normalization) == "Yes": - -N - #end if - - - ## Bias correction? - #if $bias_correction.do_bias_correction == "Yes": - -b - #if $bias_correction.seq_source.index_source == "history": - --ref_file=$bias_correction.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${gtf_input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - ## Inputs. - --inputA=$gtf_input - #if $group_analysis.do_groups == "No": - --input1=$aligned_reads1 - --input2=$aligned_reads2 - #else: - ## Replicates. - --labels - #for $group in $group_analysis.groups - ${group.group} - #end for - --files - #for $group in $group_analysis.groups - #for $file in $group.files: - ${file.file} - #end for - , - #end for - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Cuffdiff Overview** - -Cuffdiff is part of Cufflinks_. Cuffdiff finds significant changes in transcript expression, splicing, and promoter use. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi A, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffdiff - ------- - -**Input format** - -Cuffdiff takes Cufflinks or Cuffcompare GTF files as input along with two SAM files containing the fragment alignments for two or more samples. - ------- - -**Outputs** - -Cuffdiff produces many output files: - -1. Transcript FPKM expression tracking. -2. Gene FPKM expression tracking; tracks the summed FPKM of transcripts sharing each gene_id -3. Primary transcript FPKM tracking; tracks the summed FPKM of transcripts sharing each tss_id -4. Coding sequence FPKM tracking; tracks the summed FPKM of transcripts sharing each p_id, independent of tss_id -5. Transcript differential FPKM. -6. Gene differential FPKM. Tests differences in the summed FPKM of transcripts sharing each gene_id -7. Primary transcript differential FPKM. Tests differences in the summed FPKM of transcripts sharing each tss_id -8. Coding sequence differential FPKM. Tests differences in the summed FPKM of transcripts sharing each p_id independent of tss_id -9. Differential splicing tests: this tab delimited file lists, for each primary transcript, the amount of overloading detected among its isoforms, i.e. 
how much differential splicing exists between isoforms processed from a single primary transcript. Only primary transcripts from which two or more isoforms are spliced are listed in this file. -10. Differential promoter tests: this tab delimited file lists, for each gene, the amount of overloading detected among its primary transcripts, i.e. how much differential promoter use exists between samples. Only genes producing two or more distinct primary transcripts (i.e. multi-promoter genes) are listed here. -11. Differential CDS tests: this tab delimited file lists, for each gene, the amount of overloading detected among its coding sequences, i.e. how much differential CDS output exists between samples. Only genes producing two or more distinct CDS (i.e. multi-protein genes) are listed here. - -------- - -**Settings** - -All of the options have a default value. You can change any of them. Most of the options in Cuffdiff have been implemented here. - ------- - -**Cuffdiff parameter list** - -This is a list of implemented Cuffdiff options:: - - -m INT This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp. - -s INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -c INT The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus' observed changes don't contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads). - --FDR FLOAT The allowed false discovery rate. The default is 0.05. - --num-importance-samples INT Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000 - --max-mle-iterations INT Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000 - -N With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cufflinks_wrapper.py --- a/tools/ngs_rna/cufflinks_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,155 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -# Copied from sam_to_bam.py: -def check_seq_file( dbkey, cached_seqs_pointer_file ): - seq_path = '' - for line in open( cached_seqs_pointer_file ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ) and line.startswith( 'index' ): - fields = line.split( '\t' ) - if len( fields ) < 3: - continue - if fields[1] == dbkey: - seq_path = fields[2].strip() - break - return seq_path - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-1', '--input', dest='input', help='File of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment format that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. 
Please see Input formats for more details.' ) - parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' ) - parser.add_option( '-I', '--max-intron-length', dest='max_intron_len', help='The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.' ) - parser.add_option( '-F', '--min-isoform-fraction', dest='min_isoform_fraction', help='After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.' ) - parser.add_option( '-j', '--pre-mrna-fraction', dest='pre_mrna_fraction', help='Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.' ) - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. The default is 45bp.') - parser.add_option( '-G', '--GTF', dest='GTF', help='Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.' ) - parser.add_option( '-g', '--GTF-guide', dest='GTFguide', help='use reference transcript annotation to guide assembly' ) - - # Normalization options. - parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" ) - - # Wrapper / Galaxy options. - parser.add_option( '-A', '--assembled-isoforms-output', dest='assembled_isoforms_output_file', help='Assembled isoforms output file; format is GTF.' ) - - # Advanced Options: - parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' ) - parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' ) - - # Bias correction options. 
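 - # (Note, added for clarity: -b itself only records that bias correction was requested;
 - #  the --dbkey/--index_dir/--ref_file options below are used to locate or symlink the
 - #  reference FASTA that is ultimately passed to cufflinks as "-b <fasta>".)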
- parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.') - parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' ) - parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' ) - parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='cufflinks --no-update-check 2>&1', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = None - for line in open( tmp_stdout.name, 'rb' ): - if line.lower().find( 'cufflinks v' ) >= 0: - stdout = line.strip() - break - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Cufflinks version\n' ) - - # If doing bias correction, set/link to sequence file. - if options.do_bias_correction: - cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' ) - if not os.path.exists( cached_seqs_pointer_file ): - stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file ) - # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa, - # and the equCab2.fa file will contain fasta sequences. - seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file ) - if options.ref_file != 'None': - # Create symbolic link to ref_file so that index will be created in working directory. - seq_path = "ref.fa" - os.symlink( options.ref_file, seq_path ) - - # Build command. - - # Base; always use quiet mode to avoid problems with storing log output. - cmd = "cufflinks -q --no-update-check" - - # Add options. - if options.inner_dist_std_dev: - cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) ) - if options.max_intron_len: - cmd += ( " -I %i" % int ( options.max_intron_len ) ) - if options.min_isoform_fraction: - cmd += ( " -F %f" % float ( options.min_isoform_fraction ) ) - if options.pre_mrna_fraction: - cmd += ( " -j %f" % float ( options.pre_mrna_fraction ) ) - if options.num_threads: - cmd += ( " -p %i" % int ( options.num_threads ) ) - if options.inner_mean_dist: - cmd += ( " -m %i" % int ( options.inner_mean_dist ) ) - if options.GTF: - cmd += ( " -G %s" % options.GTF ) - if options.GTFguide: - cmd += ( " -g %s" % options.GTFguide ) - if options.num_importance_samples: - cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) ) - if options.max_mle_iterations: - cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) ) - if options.do_normalization: - cmd += ( " -N" ) - if options.do_bias_correction: - cmd += ( " -b %s" % seq_path ) - - # Debugging. - print cmd - - # Add input files. - cmd += " " + options.input - - # Run command. - try: - tmp_name = tempfile.NamedTemporaryFile( dir="." ).name - tmp_stderr = open( tmp_name, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - - # Get stderr, allowing for case where it's very large. 
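 - # (The loop below reads stderr in 1 MB chunks -- buffsize = 1048576 -- and stops once
 - #  a short or empty read indicates the stream is exhausted.)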
- tmp_stderr = open( tmp_name, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - - # Copy outputs. - shutil.copyfile( "transcripts.gtf" , options.assembled_isoforms_output_file ) - - # Error checking. - if returncode != 0: - raise Exception, stderr - except Exception, e: - stop_err( 'Error running cufflinks. ' + str( e ) ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/cufflinks_wrapper.xml --- a/tools/ngs_rna/cufflinks_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ - - - transcript assembly and FPKM (RPKM) estimates for RNA-Seq data - - cufflinks - - - cufflinks_wrapper.py - --input=$input - --assembled-isoforms-output=$assembled_isoforms - --num-threads="4" - -I $max_intron_len - -F $min_isoform_fraction - -j $pre_mrna_fraction - - ## Include reference annotation? - #if $reference_annotation.use_ref == "use reference annotation": - -G $reference_annotation.reference_annotation_file - #end if - #if $reference_annotation.use_ref == "use reference annotation guide": - -g $reference_annotation_guide.reference_annotation_guide_file - #end if - - ## Set paired-end parameters? - #if $singlePaired.sPaired == "Yes": - -m $singlePaired.mean_inner_distance - -s $singlePaired.inner_distance_std_dev - #end if - - ## Normalization? - #if str($do_normalization) == "Yes": - -N - #end if - - ## Bias correction? - #if $bias_correction.do_bias_correction == "Yes": - -b - #if $bias_correction.seq_source.index_source == "history": - --ref_file=$bias_correction.seq_source.ref_file - #else: - --ref_file="None" - #end if - --dbkey=${input.metadata.dbkey} - --index_dir=${GALAXY_DATA_INDEX_DIR} - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Cufflinks Overview** - -Cufflinks_ assembles transcripts, estimates their abundances, and tests for differential expression and regulation in RNA-Seq samples. It accepts aligned RNA-Seq reads and assembles the alignments into a parsimonious set of transcripts. Cufflinks then estimates the relative abundances of these transcripts based on how many reads support each one. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621 - -.. _Cufflinks: http://cufflinks.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://cufflinks.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Cufflinks takes a text file of SAM alignments as input. The RNA-Seq read mapper TopHat produces output in this format, and is recommended for use with Cufflinks. However Cufflinks will accept SAM alignments generated by any read mapper. 
Here's an example of an alignment Cufflinks will accept:: - - s6.25mer.txt-913508 16 chr1 4482736 255 14M431N11M * 0 0 \ - CAAGATGCTAGGCAAGTCTTGGAAG IIIIIIIIIIIIIIIIIIIIIIIII NM:i:0 XS:A:- - -Note the use of the custom tag XS. This attribute, which must have a value of "+" or "-", indicates which strand the RNA that produced this read came from. While this tag can be applied to any alignment, including unspliced ones, it must be present for all spliced alignment records (those with a 'N' operation in the CIGAR string). -The SAM file supplied to Cufflinks must be sorted by reference position. If you aligned your reads with TopHat, your alignments will be properly sorted already. If you used another tool, you may want to make sure they are properly sorted as follows:: - - sort -k 3,3 -k 4,4n hits.sam > hits.sam.sorted - -NOTE: Cufflinks currently only supports SAM alignments with the CIGAR match ('M') and reference skip ('N') operations. Support for the other operations, such as insertions, deletions, and clipping, will be added in the future. - ------- - -**Outputs** - -Cufflinks produces three output files: - -Transcripts and Genes: - -This GTF file contains Cufflinks' assembled isoforms. The first 8 columns are standard GTF, and the last column contains attributes, some of which are also standardized (e.g. gene_id, transcript_id). There is one GTF record per row, and each record represents either a transcript or an exon within a transcript. The columns are defined as follows:: - - Column number Column name Example Description - ----------------------------------------------------- - 1 seqname chrX Chromosome or contig name - 2 source Cufflinks The name of the program that generated this file (always 'Cufflinks') - 3 feature exon The type of record (always either "transcript" or "exon"). - 4 start 77696957 The leftmost coordinate of this record (where 0 is the leftmost possible coordinate) - 5 end 77712009 The rightmost coordinate of this record, inclusive. - 6 score 1000 The most abundant isoform for each gene is assigned a score of 1000. Minor isoforms are scored by the ratio (minor FPKM/major FPKM) - 7 strand + Cufflinks' guess for which strand the isoform came from. Always one of '+', '-', '.' - 8 frame . Cufflinks does not predict where the start and stop codons (if any) are located within each transcript, so this field is not used. - 9 attributes See below - -Each GTF record is decorated with the following attributes:: - - Attribute Example Description - ----------------------------------------- - gene_id CUFF.1 Cufflinks gene id - transcript_id CUFF.1.1 Cufflinks transcript id - FPKM 101.267 Isoform-level relative abundance in Reads Per Kilobase of exon model per Million mapped reads - frac 0.7647 Reserved. Please ignore, as this attribute may be deprecated in the future - conf_lo 0.07 Lower bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, lower bound = FPKM * (1.0 - conf_lo) - conf_hi 0.1102 Upper bound of the 95% confidence interval of the abundance of this isoform, as a fraction of the isoform abundance. That is, upper bound = FPKM * (1.0 + conf_hi) - cov 100.765 Estimate for the absolute depth of read coverage across the whole transcript - - -Transcripts only: - This file is simply a tab delimited file containing one row per transcript, with columns containing the attributes above. 
There are a few additional attributes not in the table above, but these are reserved for debugging, and may change or disappear in the future. - -Genes only: -This file contains gene-level coordinates and expression values. - -------- - -**Cufflinks settings** - -All of the options have a default value. You can change any of them. Most of the options in Cufflinks have been implemented here. - ------- - -**Cufflinks parameter list** - -This is a list of implemented Cufflinks options:: - - -m INT This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments selected at 300bp, where each end is 50bp, you should set -r to be 200. The default is 45bp. - -s INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -I INT The maximum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000. - -F After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene. - -j Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%. - -G Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript. - -N With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/filter_transcripts_via_tracking.py --- a/tools/ngs_rna/filter_transcripts_via_tracking.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ -#!/usr/bin/env python -import os, sys, tempfile - -assert sys.version_info[:2] >= ( 2, 4 ) - -def __main__(): - """ - Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts - produced by cufflinks). Filtering is done by extracting transcript IDs from the tracking file and then filtering the GTF so that the output GTF contains only - transcripts found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for - a particular sample. - """ - # Read params. 
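 - # (Positional arguments, as wired up by the tool XML: the cuffcompare .tracking file,
 - #  the GTF to filter, the output GTF path, and the 1-based sample number whose
 - #  q-column is consulted. A hypothetical invocation:
 - #    python filter_transcripts_via_tracking.py cc.tracking transcripts.gtf out.gtf 1)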
- tracking_file_name = sys.argv[1] - transcripts_file_name = sys.argv[2] - output_file_name = sys.argv[3] - sample_number = int ( sys.argv[4] ) - - # Open files. - transcripts_file = open( transcripts_file_name, 'r' ) - output_file = open( output_file_name, 'w' ) - - # Read transcript IDs from tracking file. - transcript_ids = {} - for i, line in enumerate( file( tracking_file_name ) ) : - # Split line into elements. Line format is - # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:|||||] - line = line.rstrip( '\r\n' ) - elems = line.split( '\t' ) - - # Get transcript info. - if sample_number == 1: - transcript_info = elems[4] - elif sample_number == 2: - transcript_info = elems[5] - if not transcript_info.startswith('q'): - # No transcript for this sample. - continue - - # Get and store transcript id. - transcript_id = transcript_info.split('|')[1] - transcript_id = transcript_id.strip('"') - transcript_ids[transcript_id] = "" - - # Filter transcripts file using transcript_ids - for i, line in enumerate( file( transcripts_file_name ) ): - # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes. - elems = line.split( '\t' ) - - # Get attributes. - attributes_list = elems[8].split(";") - attributes = {} - for name_value_pair in attributes_list: - pair = name_value_pair.strip().split(" ") - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[name] = value - - # Get element's transcript id. - transcript_id = attributes['transcript_id'] - if transcript_id in transcript_ids: - output_file.write(line) - - # Clean up. - output_file.close() - -if __name__ == "__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/filter_transcripts_via_tracking.xml --- a/tools/ngs_rna/filter_transcripts_via_tracking.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ - - using tracking file - - filter_transcripts_via_tracking.py - $tracking_file - $transcripts_file - $filtered_transcripts - $sample_num - - - - - - - - - - - - - - - - - - - Uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts produced by - cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the - GTF so that the output GTF contains only transcript found in the tracking file. Because a tracking file has multiple - samples, a sample number is used to filter transcripts for a particular sample. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_color_wrapper.xml --- a/tools/ngs_rna/tophat_color_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,683 +0,0 @@ - - Find splice junctions using RNA-seq data - - tophat - - - tophat_wrapper.py - ## Change this to accommodate the number of threads you have available. - --num-threads="4" - - ## base- or color-space - --color-space - - ## Provide outputs. - --junctions-output=$junctions - --hits-output=$accepted_hits - - ## Handle reference file. - #if $refGenomeSource.genomeSource == "history": - --own-file=$refGenomeSource.ownFile - #else: - --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes_color' ].get_fields() )[0][-1] }" - #end if - - ## Are reads single-end or paired? - --single-paired=$singlePaired.sPaired - - ## First input file always required. 
- --input1=$input1 - - ## Set params based on whether reads are single-end or paired. - #if $singlePaired.sPaired == "single": - --settings=$singlePaired.sParams.sSettingsType - #if $singlePaired.sParams.sSettingsType == "full": - -a $singlePaired.sParams.anchor_length - -m $singlePaired.sParams.splice_mismatches - -i $singlePaired.sParams.min_intron_length - -I $singlePaired.sParams.max_intron_length - -F $singlePaired.sParams.junction_filter - -g $singlePaired.sParams.max_multihits - --min-segment-intron $singlePaired.sParams.min_segment_intron - --max-segment-intron $singlePaired.sParams.max_segment_intron - --seg-mismatches=$singlePaired.sParams.seg_mismatches - --seg-length=$singlePaired.sParams.seg_length - --library-type=$singlePaired.sParams.library_type - - ## Indel search. - #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.sParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why a string cast is necessary, but it is: - #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.sParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.sParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.sParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.sParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. - #if str($singlePaired.sParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #else: - --input2=$singlePaired.input2 - -r $singlePaired.mate_inner_distance - --settings=$singlePaired.pParams.pSettingsType - #if $singlePaired.pParams.pSettingsType == "full": - --mate-std-dev=$singlePaired.pParams.mate_std_dev - -a $singlePaired.pParams.anchor_length - -m $singlePaired.pParams.splice_mismatches - -i $singlePaired.pParams.min_intron_length - -I $singlePaired.pParams.max_intron_length - -F $singlePaired.pParams.junction_filter - -g $singlePaired.pParams.max_multihits - --min-segment-intron $singlePaired.pParams.min_segment_intron - --max-segment-intron $singlePaired.pParams.max_segment_intron - --seg-mismatches=$singlePaired.pParams.seg_mismatches - --seg-length=$singlePaired.pParams.seg_length - --library-type=$singlePaired.pParams.library_type - - ## Indel search. 
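 - ## (Added note: this block mirrors the single-end indel block above; the three flags
 - ## are forwarded only when the paired-end parameter group enables indel search.)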
- #if $singlePaired.pParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.pParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.pParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.pParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.pParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.pParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.pParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.pParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why type cast is necessary, but it is: - #if str($singlePaired.pParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.pParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.pParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.pParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.pParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.pParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.pParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. - #if str ($singlePaired.pParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Tophat Overview** - -TopHat_ is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. Please cite: Trapnell, C., Pachter, L. and Salzberg, S.L. 
TopHat: discovering splice junctions with RNA-Seq. Bioinformatics 25, 1105-1111 (2009). - -.. _Tophat: http://tophat.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in splice junction identification. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://tophat.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Tophat accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. - ------- - -**Outputs** - -Tophat produces two main output files: - -- junctions -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction. -- accepted_hits -- A list of read alignments in BAM_ format. - -.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 -.. _BAM: http://samtools.sourceforge.net/ - -Two other possible outputs, depending on the options you choose, are insertions and deletions, both of which are in BED format. - -------- - -**Tophat settings** - -All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here. - ------- - -**Tophat parameter list** - -This is a list of implemented Tophat options:: - - -r This is the expected (mean) inner distance between mate pairs. For example, for paired end runs with fragments - selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter - is required for paired end runs. - --mate-std-dev INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -a/--min-anchor-length INT The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced - alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one - read with this many bases on each side. This must be at least 3 and the default is 8. - -m/--splice-mismatches INT The maximum number of mismatches that may appear in the "anchor" region of a spliced alignment. The default is 0. - -i/--min-intron-length INT The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart. The default is 70. - -I/--max-intron-length INT The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read. The default is 500000. - -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons is supported by S reads. Let the average depth of coverage of - exon A be D, and assume that it is higher than that of exon B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the - filter. The default is 0.15. 
-g/--max-multihits INT Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many - alignments. The default is 40. - -G/--GTF [GTF 2.2 file] Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping. - -j/--raw-juncs [juncs file] Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequence to be spliced to the first character of the right sequence, inclusive. - --no-novel-juncs Only look for junctions indicated in the supplied GFF file. (ignored without -G) - --no-closure-search Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default. - --closure-search Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp) - --no-coverage-search Disables the coverage based search for junctions. - --coverage-search Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity. - --microexon-search With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer. - --butterfly-search TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA that fall within the introns of your transcripts. - --segment-mismatches Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2. - --segment-length Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25. - --min-closure-exon During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50. - --min-closure-intron The minimum intron length that may be found during closure search. The default is 50. - --max-closure-intron The maximum intron length that may be found during closure search. The default is 5000. - --min-coverage-intron The minimum intron length that may be found during coverage search. The default is 50. - --max-coverage-intron The maximum intron length that may be found during coverage search. The default is 20000. - --min-segment-intron The minimum intron length that may be found during split-segment search. The default is 50. - --max-segment-intron The maximum intron length that may be found during split-segment search. The default is 500000. 
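For orientation, a rough sketch of a full-parameter, single-end command line this wrapper might assemble, using the defaults listed above (the index path and read file are placeholders, and the color-space variant of the tool additionally inserts TopHat's -C flag)::

  tophat -p 4 -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40 /path/to/bowtie_index reads.fastq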
- - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_wrapper.py --- a/tools/ngs_rna/tophat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,236 +0,0 @@ -#!/usr/bin/env python - -import optparse, os, shutil, subprocess, sys, tempfile, fileinput - -def stop_err( msg ): - sys.stderr.write( "%s\n" % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) - parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' ) - parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; format is BED.' ) - parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; format is BAM.' ) - parser.add_option( '', '--own-file', dest='own_file', help='' ) - parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) - parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ - For example, for paired end runs with fragments selected at 300bp, \ - where each end is 50bp, you should set -r to be 200. There is no default, \ - and this parameter is required for paired end runs.') - parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between mate pairs.' ) - parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', - help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) - parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) - parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', - help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) - parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', - help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) - parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' ) - parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) - parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) - parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) - parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' ) - parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.' 
) - parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' ) - parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' ) - - # Options for supplying own junctions - parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \ - TopHat will use the exon records in this file to build \ - a set of known splice junctions for each gene, and will \ - attempt to align reads to these junctions even if they \ - would not normally be covered by the initial mapping.') - parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \ - specified one per line, in a tab-delimited format. Records \ - look like: [chrom] [left] [right] [+/-]; left and right are \ - zero-based coordinates, and specify the last character of the \ - left sequence to be spliced to the first character of the right \ - sequence, inclusive.') - parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \ - supplied GFF file. (ignored without -G)") - # Types of search. - parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') - parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)') - parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' ) - parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') - parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) - parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) - parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) - parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' ) - parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' ) - parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' ) - parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) - parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) - - # Wrapper options. 
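 - # (Added note: the options below are Galaxy-side plumbing rather than TopHat flags;
 - #  --input1/--input2 carry the FASTQ datasets, while --single-paired and --settings
 - #  record which branch of the tool XML generated the command line.)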
- parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) - parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) - parser.add_option( '', '--single-paired', dest='single_paired', help='' ) - parser.add_option( '', '--settings', dest='settings', help='' ) - - (options, args) = parser.parse_args() - - # output version # of tool - try: - tmp = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp, 'wb' ) - proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout ) - tmp_stdout.close() - returncode = proc.wait() - stdout = open( tmp_stdout.name, 'rb' ).readline().strip() - if stdout: - sys.stdout.write( '%s\n' % stdout ) - else: - raise Exception - except: - sys.stdout.write( 'Could not determine Tophat version\n' ) - - # Color or base space - space = '' - if options.color_space: - space = '-C' - - # Creat bowtie index if necessary. - tmp_index_dir = tempfile.mkdtemp() - if options.own_file: - index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) ) - try: - os.link( options.own_file, index_path + '.fa' ) - except: - # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension - pass - cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path ) - try: - tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name - tmp_stderr = open( tmp, 'wb' ) - proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - except Exception, e: - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Error indexing reference sequence\n' + str( e ) ) - else: - index_path = options.index_path - - # Build tophat command. - cmd = 'tophat %s %s %s' - reads = options.input1 - if options.input2: - reads += ' ' + options.input2 - opts = '-p %s %s' % ( options.num_threads, space ) - if options.single_paired == 'paired': - opts += ' -r %s' % options.mate_inner_dist - if options.settings == 'preSet': - cmd = cmd % ( opts, index_path, reads ) - else: - try: - if int( options.min_anchor_length ) >= 3: - opts += ' -a %s' % options.min_anchor_length - else: - raise Exception, 'Minimum anchor length must be 3 or greater' - opts += ' -m %s' % options.splice_mismatches - opts += ' -i %s' % options.min_intron_length - opts += ' -I %s' % options.max_intron_length - if float( options.junction_filter ) != 0.0: - opts += ' -F %s' % options.junction_filter - opts += ' -g %s' % options.max_multihits - # Custom junctions options. - if options.gene_model_annotations: - opts += ' -G %s' % options.gene_model_annotations - if options.raw_juncs: - opts += ' -j %s' % options.raw_juncs - if options.no_novel_juncs: - opts += ' --no-novel-juncs' - if options.library_type: - opts += ' --library-type %s' % options.library_type - if options.allow_indels: - # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. 
- opts += ' --allow-indels' - #opts += ' --max-insertion-length %i --max-deletion-length %i' % ( int( options.max_insertion_length ), int( options.max_deletion_length ) ) - # need to warn user of this fact - sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) - - # Search type options. - if options.coverage_search: - opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron ) - else: - opts += ' --no-coverage-search' - if options.closure_search: - opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) - else: - opts += ' --no-closure-search' - if options.microexon_search: - opts += ' --microexon-search' - if options.single_paired == 'paired': - opts += ' --mate-std-dev %s' % options.mate_std_dev - if options.seg_mismatches: - opts += ' --segment-mismatches %d' % int( options.seg_mismatches ) - if options.seg_length: - opts += ' --segment-length %d' % int( options.seg_length ) - if options.min_segment_intron: - opts += ' --min-segment-intron %d' % int( options.min_segment_intron ) - if options.max_segment_intron: - opts += ' --max-segment-intron %d' % int( options.max_segment_intron ) - cmd = cmd % ( opts, index_path, reads ) - except Exception, e: - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) - #print cmd - - # Run - try: - tmp_out = tempfile.NamedTemporaryFile().name - tmp_stdout = open( tmp_out, 'wb' ) - tmp_err = tempfile.NamedTemporaryFile().name - tmp_stderr = open( tmp_err, 'wb' ) - proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) - returncode = proc.wait() - tmp_stderr.close() - # get stderr, allowing for case where it's very large - tmp_stderr = open( tmp_err, 'rb' ) - stderr = '' - buffsize = 1048576 - try: - while True: - stderr += tmp_stderr.read( buffsize ) - if not stderr or len( stderr ) % buffsize != 0: - break - except OverflowError: - pass - tmp_stdout.close() - tmp_stderr.close() - if returncode != 0: - raise Exception, stderr - - # Copy output files from tmp directory to specified files. - shutil.copyfile( os.path.join( "tophat_out", "junctions.bed" ), options.junctions_output_file ) - shutil.copyfile( os.path.join( "tophat_out", "accepted_hits.bam" ), options.accepted_hits_output_file ) - - # TODO: look for errors in program output. - except Exception, e: - stop_err( 'Error in tophat:\n' + str( e ) ) - - # Clean up temp dirs - if os.path.exists( tmp_index_dir ): - shutil.rmtree( tmp_index_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/tophat_wrapper.xml --- a/tools/ngs_rna/tophat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,680 +0,0 @@ - - Find splice junctions using RNA-seq data - tophat --version - - tophat - - - tophat_wrapper.py - ## Change this to accommodate the number of threads you have available. - --num-threads="4" - - ## Provide outputs. - --junctions-output=$junctions - --hits-output=$accepted_hits - - ## Handle reference file. 
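The wrapper above fills the 'tophat %s %s %s' template from an incrementally grown option string and runs it through the shell. An equivalent argument-list form, sketched here with illustrative names, sidesteps shell quoting entirely::

    def build_tophat_argv(opts, index_path, read_files):
        # opts is a flat list such as ['-p', '4', '-r', '200']
        return ['tophat'] + list(opts) + [index_path] + list(read_files)

    # subprocess.Popen(build_tophat_argv(['-p', '4'], 'hg18', ['reads_1.fastq']))
    # would run the same command without shell=True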
- #if $refGenomeSource.genomeSource == "history": - --own-file=$refGenomeSource.ownFile - #else: - --indexes-path="${ filter( lambda x: str( x[0] ) == str( $refGenomeSource.index ), $__app__.tool_data_tables[ 'tophat_indexes' ].get_fields() )[0][-1] }" - #end if - - ## Are reads single-end or paired? - --single-paired=$singlePaired.sPaired - - ## First input file always required. - --input1=$input1 - - ## Set params based on whether reads are single-end or paired. - #if $singlePaired.sPaired == "single": - --settings=$singlePaired.sParams.sSettingsType - #if $singlePaired.sParams.sSettingsType == "full": - -a $singlePaired.sParams.anchor_length - -m $singlePaired.sParams.splice_mismatches - -i $singlePaired.sParams.min_intron_length - -I $singlePaired.sParams.max_intron_length - -F $singlePaired.sParams.junction_filter - -g $singlePaired.sParams.max_multihits - --min-segment-intron $singlePaired.sParams.min_segment_intron - --max-segment-intron $singlePaired.sParams.max_segment_intron - --seg-mismatches=$singlePaired.sParams.seg_mismatches - --seg-length=$singlePaired.sParams.seg_length - --library-type=$singlePaired.sParams.library_type - - ## Indel search. - #if $singlePaired.sParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.sParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.sParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.sParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.sParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.sParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.sParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.sParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why a string cast is necessary, but it is: - #if str($singlePaired.sParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.sParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.sParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.sParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.sParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.sParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.sParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.sParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. 
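For orientation, when the single-end "full" branch above is taken with TopHat's documented defaults, the template expands to a flat call along these lines (paths and values are illustrative, drawn from the defaults in the parameter list further below)::

    tophat_wrapper.py --num-threads="4" --junctions-output=junctions.bed
        --hits-output=accepted_hits.bam --input1=reads.fastq
        --single-paired=single --settings=full
        -a 8 -m 0 -i 70 -I 500000 -F 0.15 -g 40
        --min-segment-intron 50 --max-segment-intron 500000
        --seg-mismatches=2 --seg-length=25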
- #if str($singlePaired.sParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #else: - --input2=$singlePaired.input2 - -r $singlePaired.mate_inner_distance - --settings=$singlePaired.pParams.pSettingsType - #if $singlePaired.pParams.pSettingsType == "full": - --mate-std-dev=$singlePaired.pParams.mate_std_dev - -a $singlePaired.pParams.anchor_length - -m $singlePaired.pParams.splice_mismatches - -i $singlePaired.pParams.min_intron_length - -I $singlePaired.pParams.max_intron_length - -F $singlePaired.pParams.junction_filter - -g $singlePaired.pParams.max_multihits - --min-segment-intron $singlePaired.pParams.min_segment_intron - --max-segment-intron $singlePaired.pParams.max_segment_intron - --seg-mismatches=$singlePaired.pParams.seg_mismatches - --seg-length=$singlePaired.pParams.seg_length - --library-type=$singlePaired.pParams.library_type - - ## Indel search. - #if $singlePaired.pParams.indel_search.allow_indel_search == "Yes": - --allow-indels - --max-insertion-length $singlePaired.pParams.indel_search.max_insertion_length - --max-deletion-length $singlePaired.pParams.indel_search.max_deletion_length - #end if - - ## Supplying junctions parameters. - #if $singlePaired.pParams.own_junctions.use_junctions == "Yes": - #if $singlePaired.pParams.own_junctions.gene_model_ann.use_annotations == "Yes": - -G $singlePaired.pParams.own_junctions.gene_model_ann.gene_annotation_model - #end if - #if $singlePaired.pParams.own_junctions.raw_juncs.use_juncs == "Yes": - -j $singlePaired.pParams.own_junctions.raw_juncs.raw_juncs - #end if - ## TODO: No idea why type cast is necessary, but it is: - #if str($singlePaired.pParams.own_junctions.no_novel_juncs) == "Yes": - --no-novel-juncs - #end if - #end if - - #if $singlePaired.pParams.closure_search.use_search == "Yes": - --closure-search - --min-closure-exon $singlePaired.pParams.closure_search.min_closure_exon - --min-closure-intron $singlePaired.pParams.closure_search.min_closure_intron - --max-closure-intron $singlePaired.pParams.closure_search.max_closure_intron - #else: - --no-closure-search - #end if - #if $singlePaired.pParams.coverage_search.use_search == "Yes": - --coverage-search - --min-coverage-intron $singlePaired.pParams.coverage_search.min_coverage_intron - --max-coverage-intron $singlePaired.pParams.coverage_search.max_coverage_intron - #else: - --no-coverage-search - #end if - ## TODO: No idea why the type conversion is necessary, but it seems to be. 
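The -r value forwarded in the paired-end branch above is the expected inner distance between mates, not the fragment length itself. A one-line sketch of the arithmetic, using the worked figures from the parameter list below::

    def inner_mate_distance(fragment_length, read_length):
        # e.g. 300bp fragments sequenced 50bp from each end: 300 - 2*50 = 200
        return fragment_length - 2 * read_length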
- #if str ($singlePaired.pParams.microexon_search) == "Yes": - --microexon-search - #end if - #end if - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - ( - ( ( 'sParams' in singlePaired ) and ( 'indel_search' in singlePaired['sParams'] ) and - ( singlePaired['sParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) or - ( ( 'pParams' in singlePaired ) and ( 'indel_search' in singlePaired['pParams'] ) and - ( singlePaired['pParams']['indel_search']['allow_indel_search'] == 'Yes' ) ) - ) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**Tophat Overview** - -TopHat_ is a fast splice junction mapper for RNA-Seq reads. It aligns RNA-Seq reads to mammalian-sized genomes using the ultra high-throughput short read aligner Bowtie, and then analyzes the mapping results to identify splice junctions between exons. Please cite: Trapnell, C., Pachter, L. and Salzberg, S.L. TopHat: discovering splice junctions with RNA-Seq. Bioinformatics 25, 1105-1111 (2009). - -.. _Tophat: http://tophat.cbcb.umd.edu/ - ------- - -**Know what you are doing** - -.. class:: warningmark - -There is no such thing (yet) as an automated gearshift in splice junction identification. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy. - -.. __: http://tophat.cbcb.umd.edu/manual.html - ------- - -**Input formats** - -Tophat accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files. - ------- - -**Outputs** - -Tophat produces two output files: - -- junctions -- A UCSC BED_ track of junctions reported by TopHat. Each junction consists of two connected BED blocks, where each block is as long as the maximal overhang of any read spanning the junction. The score is the number of alignments spanning the junction. -- accepted_hits -- A list of read alignments in BAM_ format. - -.. _BED: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 -.. _BAM: http://samtools.sourceforge.net/ - -Two other possible outputs, depending on the options you choose, are insertions and deletions, both of which are in BED format. - -------- - -**Tophat settings** - -All of the options have a default value. You can change any of them. Some of the options in Tophat have been implemented here. 
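To make the junctions output described above concrete, a single BED12 record might look like this (coordinates, name, and score invented for illustration)::

    chr1  99800  100300  JUNC00000001  12  +  99800  100300  255,0,0  2  60,55  0,445

The two block sizes (60 and 55) are the maximal overhangs on either side of the junction, the block starts (0 and 445) place the blocks at the junction's flanks, and the score (12) is the number of alignments spanning it.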
- ------- - -**Tophat parameter list** - -This is a list of implemented Tophat options:: - - -r This is the expected (mean) inner distance between mate pairs. For, example, for paired end runs with fragments - selected at 300bp, where each end is 50bp, you should set -r to be 200. There is no default, and this parameter - is required for paired end runs. - --mate-std-dev INT The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp. - -a/--min-anchor-length INT The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction. Note that individual spliced - alignments may span a junction with fewer than this many bases on one side. However, every junction involved in spliced alignments is supported by at least one - read with this many bases on each side. This must be at least 3 and the default is 8. - -m/--splice-mismatches INT The maximum number of mismatches that may appear in the "anchor" region of a spliced alignment. The default is 0. - -i/--min-intron-length INT The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart. The default is 70. - -I/--max-intron-length INT The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read. The default is 500000. - -F/--min-isoform-fraction 0.0-1.0 TopHat filters out junctions supported by too few alignments. Suppose a junction spanning two exons, is supported by S reads. Let the average depth of coverage of - exon A be D, and assume that it is higher than B. If S / D is less than the minimum isoform fraction, the junction is not reported. A value of zero disables the - filter. The default is 0.15. - -g/--max-multihits INT Instructs TopHat to allow up to this many alignments to the reference for a given read, and suppresses all alignments for reads with more than this many - alignments. The default is 40. - -G/--GTF [GTF 2.2 file] Supply TopHat with a list of gene model annotations. TopHat will use the exon records in this file to build a set of known splice junctions for each gene, and will attempt to align reads to these junctions even if they would not normally be covered by the initial mapping. - -j/--raw-juncs [juncs file] Supply TopHat with a list of raw junctions. Junctions are specified one per line, in a tab-delimited format. Records look like: [chrom] [left] [right] [+/-], left and right are zero-based coordinates, and specify the last character of the left sequenced to be spliced to the first character of the right sequence, inclusive. - -no-novel-juncs Only look for junctions indicated in the supplied GFF file. (ignored without -G) - --no-closure-search Disables the mate pair closure-based search for junctions. Currently, has no effect - closure search is off by default. - --closure-search Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (about or less than 50bp) - --no-coverage-search Disables the coverage based search for junctions. - --coverage-search Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity. - --microexon-search With this option, the pipeline will attempt to find alignments incident to microexons. 
Works only for reads 50bp or longer. - --butterfly-search TopHat will use a slower but potentially more sensitive algorithm to find junctions in addition to its standard search. Consider using this if you expect that your experiment produced a lot of reads from pre-mRNA, that fall within the introns of your transcripts. - --segment-mismatches Read segments are mapped independently, allowing up to this many mismatches in each segment alignment. The default is 2. - --segment-length Each read is cut up into segments, each at least this long. These segments are mapped independently. The default is 25. - --min-closure-exon During closure search for paired end reads, exonic hops in the potential splice graph must be at least this long. The default is 50. - --min-closure-intron The minimum intron length that may be found during closure search. The default is 50. - --max-closure-intron The maximum intron length that may be found during closure search. The default is 5000. - --min-coverage-intron The minimum intron length that may be found during coverage search. The default is 50. - --max-coverage-intron The maximum intron length that may be found during coverage search. The default is 20000. - --min-segment-intron The minimum intron length that may be found during split-segment search. The default is 50. - --max-segment-intron The maximum intron length that may be found during split-segment search. The default is 500000. - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_rna/trinity_all.xml --- a/tools/ngs_rna/trinity_all.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - - De novo assembly of RNA-Seq data - - trinity - - - Trinity.pl - - ## Additional parameters. - #if $additional_params.use_additional == "yes": - --min_contig_length $additional_params.min_contig_length - #end if - - ## Inputs. - #if $inputs.paired_or_single == "paired": - --left $inputs.left_input --right $inputs.right_input - #if $inputs.left_input.ext == 'fa': - --seqType fa - #else: - --seqType fq - #end if - #if $inputs.library_type != 'None': - --SS_lib_type $inputs.library_type - #end if - #else: - --single $inputs.input - #if $inputs.input.ext == 'fa': - --seqType fa - #else: - --seqType fq - #end if - #if $inputs.library_type != 'None': - --SS_lib_type $inputs.library_type - #end if - #end if - - ## CPU and butterfly options. - --CPU 4 --run_butterfly --bfly_opts "-V 10 --stderr" > $trinity_log 2>&1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Trinity is a de novo transcript assembler that uses RNA-seq data as input. This tool runs all Trinity_ commands--Inchworm, Chrysalis, and Butterfly--in a single pass. - - .. _Trinity: http://trinityrnaseq.sourceforge.net - - diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_simulation/ngs_simulation.py --- a/tools/ngs_simulation/ngs_simulation.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,280 +0,0 @@ -#!/usr/bin/env python - -""" -Runs Ben's simulation. 
- -usage: %prog [options] - -i, --input=i: Input genome (FASTA format) - -g, --genome=g: If built-in, the genome being used - -l, --read_len=l: Read length - -c, --avg_coverage=c: Average coverage - -e, --error_rate=e: Error rate (0-1) - -n, --num_sims=n: Number of simulations to run - -p, --polymorphism=p: Frequency/ies for minor allele (comma-separate list of 0-1) - -d, --detection_thresh=d: Detection thresholds (comma-separate list of 0-1) - -p, --output_png=p: Plot output - -s, --summary_out=s: Whether or not to output a file with summary of all simulations - -m, --output_summary=m: File name for output summary of all simulations - -f, --new_file_path=f: Directory for summary output files - -""" -# removed output of all simulation results on request (not working) -# -r, --sim_results=r: Output all tabular simulation results (number of polymorphisms times number of detection thresholds) -# -o, --output=o: Base name for summary output for each run - -from rpy import * -import os -import random, sys, tempfile -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # validate parameters - error = '' - try: - read_len = int( options.read_len ) - if read_len <= 0: - raise Exception, ' greater than 0' - except TypeError, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure your number of reads is an integer value%s' % error ) - error = '' - try: - avg_coverage = int( options.avg_coverage ) - if avg_coverage <= 0: - raise Exception, ' greater than 0' - except Exception, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure your average coverage is an integer value%s' % error ) - error = '' - try: - error_rate = float( options.error_rate ) - if error_rate >= 1.0: - error_rate = 10 ** ( -error_rate / 10.0 ) - elif error_rate < 0: - raise Exception, ' between 0 and 1' - except Exception, e: - error = ': %s' % str( e ) - if error: - stop_err( 'Make sure the error rate is a decimal value%s or the quality score is at least 1' % error ) - try: - num_sims = int( options.num_sims ) - except TypeError, e: - stop_err( 'Make sure the number of simulations is an integer value: %s' % str( e ) ) - if len( options.polymorphism ) > 0: - polymorphisms = [ float( p ) for p in options.polymorphism.split( ',' ) ] - else: - stop_err( 'Select at least one polymorphism value to use' ) - if len( options.detection_thresh ) > 0: - detection_threshes = [ float( dt ) for dt in options.detection_thresh.split( ',' ) ] - else: - stop_err( 'Select at least one detection threshold to use' ) - - # mutation dictionaries - hp_dict = { 'A':'G', 'G':'A', 'C':'T', 'T':'C', 'N':'N' } # heteroplasmy dictionary - mt_dict = { 'A':'C', 'C':'A', 'G':'T', 'T':'G', 'N':'N'} # misread dictionary - - # read fasta file to seq string - all_lines = open( options.input, 'rb' ).readlines() - seq = '' - for line in all_lines: - line = line.rstrip() - if line.startswith('>'): - pass - else: - seq += line.upper() - seq_len = len( seq ) - - # output file name template -# removed output of all simulation results on request (not working) -# if options.sim_results == "true": -# out_name_template = os.path.join( options.new_file_path, 'primary_output%s_' + options.output + '_visible_tabular' ) -# else: -# out_name_template = tempfile.NamedTemporaryFile().name + '_%s' - out_name_template = 
tempfile.NamedTemporaryFile().name + '_%s' - print 'out_name_template:', out_name_template - - # set up output files - outputs = {} - i = 1 - for p in polymorphisms: - outputs[ p ] = {} - for d in detection_threshes: - outputs[ p ][ d ] = out_name_template % i - i += 1 - - # run sims - for polymorphism in polymorphisms: - for detection_thresh in detection_threshes: - output = open( outputs[ polymorphism ][ detection_thresh ], 'wb' ) - output.write( 'FP\tFN\tGENOMESIZE=%s\n' % seq_len ) - sim_count = 0 - while sim_count < num_sims: - # randomly pick heteroplasmic base index - hbase = random.choice( range( 0, seq_len ) ) - #hbase = seq_len/2#random.randrange( 0, seq_len ) - # create 2D quasispecies list - qspec = map( lambda x: [], [0] * seq_len ) - # simulate read indices and assign to quasispecies - i = 0 - while i < ( avg_coverage * ( seq_len / read_len ) ): # number of reads (approximates coverage) - start = random.choice( range( 0, seq_len ) ) - #start = seq_len/2#random.randrange( 0, seq_len ) # assign read start - if random.random() < 0.5: # positive sense read - end = start + read_len # assign read end - if end > seq_len: # overshooting origin - read = range( start, seq_len ) + range( 0, ( end - seq_len ) ) - else: # regular read - read = range( start, end ) - else: # negative sense read - end = start - read_len # assign read end - if end < -1: # overshooting origin - read = range( start, -1, -1) + range( ( seq_len - 1 ), ( seq_len + end ), -1 ) - else: # regular read - read = range( start, end, -1 ) - # assign read to quasispecies list by index - for j in read: - if j == hbase and random.random() < polymorphism: # heteroplasmic base is variant with p = het - ref = hp_dict[ seq[ j ] ] - else: # ref is the verbatim reference nucleotide (all positions) - ref = seq[ j ] - if random.random() < error_rate: # base in read is misread with p = err - qspec[ j ].append( mt_dict[ ref ] ) - else: # otherwise we carry ref through to the end - qspec[ j ].append(ref) - # last but not least - i += 1 - bases, fpos, fneg = {}, 0, 0 # last two will be outputted to summary file later - for i, nuc in enumerate( seq ): - cov = len( qspec[ i ] ) - bases[ 'A' ] = qspec[ i ].count( 'A' ) - bases[ 'C' ] = qspec[ i ].count( 'C' ) - bases[ 'G' ] = qspec[ i ].count( 'G' ) - bases[ 'T' ] = qspec[ i ].count( 'T' ) - # calculate max NON-REF deviation - del bases[ nuc ] - maxdev = float( max( bases.values() ) ) / cov - # deal with non-het sites - if i != hbase: - if maxdev >= detection_thresh: # greater than detection threshold = false positive - fpos += 1 - # deal with het sites - if i == hbase: - hnuc = hp_dict[ nuc ] # let's recover het variant - if ( float( bases[ hnuc ] ) / cov ) < detection_thresh: # less than detection threshold = false negative - fneg += 1 - del bases[ hnuc ] # ignore het variant - maxdev = float( max( bases.values() ) ) / cov # check other non-ref bases at het site - if maxdev >= detection_thresh: # greater than detection threshold = false positive (possible) - fpos += 1 - # output error sums and genome size to summary file - output.write( '%d\t%d\n' % ( fpos, fneg ) ) - sim_count += 1 - # close output up - output.close() - - # Parameters (heteroplasmy, error threshold, colours) - r( ''' - het=c(%s) - err=c(%s) - grade = (0:32)/32 - hues = rev(gray(grade)) - ''' % ( ','.join( [ str( p ) for p in polymorphisms ] ), ','.join( [ str( d ) for d in detection_threshes ] ) ) ) - - # Suppress warnings - r( 'options(warn=-1)' ) - - # Create allsum (for FP) and allneg (for FN) objects - r( 'allsum <- 
data.frame()' ) - for polymorphism in polymorphisms: - for detection_thresh in detection_threshes: - output = outputs[ polymorphism ][ detection_thresh ] - cmd = ''' - ngsum = read.delim('%s', header=T) - ngsum$fprate <- ngsum$FP/%s - ngsum$hetcol <- %s - ngsum$errcol <- %s - allsum <- rbind(allsum, ngsum) - ''' % ( output, seq_len, polymorphism, detection_thresh ) - r( cmd ) - - if os.path.getsize( output ) == 0: - for p in outputs.keys(): - for d in outputs[ p ].keys(): - sys.stderr.write(outputs[ p ][ d ] + ' '+str( os.path.getsize( outputs[ p ][ d ] ) )+'\n') - - if options.summary_out == "true": - r( 'write.table(summary(ngsum), file="%s", quote=FALSE, sep="\t", row.names=FALSE)' % options.output_summary ) - - # Summary objects (these could be printed) - r( ''' - tr_pos <- tapply(allsum$fprate,list(allsum$hetcol,allsum$errcol), mean) - tr_neg <- tapply(allsum$FN,list(allsum$hetcol,allsum$errcol), mean) - cat('\nFalse Positive Rate Summary\n\t', file='%s', append=T, sep='\t') - write.table(format(tr_pos, digits=4), file='%s', append=T, quote=F, sep='\t') - cat('\nFalse Negative Rate Summary\n\t', file='%s', append=T, sep='\t') - write.table(format(tr_neg, digits=4), file='%s', append=T, quote=F, sep='\t') - ''' % tuple( [ options.output_summary ] * 4 ) ) - - # Setup graphs - #pdf(paste(prefix,'_jointgraph.pdf',sep=''), 15, 10) - r( ''' - png('%s', width=800, height=500, units='px', res=250) - layout(matrix(data=c(1,2,1,3,1,4), nrow=2, ncol=3), widths=c(4,6,2), heights=c(1,10,10)) - ''' % options.output_png ) - - # Main title - genome = '' - if options.genome: - genome = '%s: ' % options.genome - r( ''' - par(mar=c(0,0,0,0)) - plot(1, type='n', axes=F, xlab='', ylab='') - text(1,1,paste('%sVariation in False Positives and Negatives (', %s, ' simulations, coverage ', %s,')', sep=''), font=2, family='sans', cex=0.7) - ''' % ( genome, options.num_sims, options.avg_coverage ) ) - - # False positive boxplot - r( ''' - par(mar=c(5,4,2,2), las=1, cex=0.35) - boxplot(allsum$fprate ~ allsum$errcol, horizontal=T, ylim=rev(range(allsum$fprate)), cex.axis=0.85) - title(main='False Positives', xlab='false positive rate', ylab='') - ''' ) - - # False negative heatmap (note zlim command!) 
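The tapply calls above collapse the per-simulation results to one mean per (minor allele frequency, detection threshold) cell before plotting. The same reduction in plain Python, sketched over rows of (het, err, value) tuples::

    from collections import defaultdict

    def mean_by_cell(rows):
        # rows: iterable of (het, err, value)
        sums = defaultdict(lambda: [0.0, 0])
        for het, err, value in rows:
            sums[(het, err)][0] += value
            sums[(het, err)][1] += 1
        return dict((cell, total / count) for cell, (total, count) in sums.items())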
- num_polys = len( polymorphisms ) - num_dets = len( detection_threshes ) - r( ''' - par(mar=c(5,4,2,1), las=1, cex=0.35) - image(1:%s, 1:%s, tr_neg, zlim=c(0,1), col=hues, xlab='', ylab='', axes=F, border=1) - axis(1, at=1:%s, labels=rownames(tr_neg), lwd=1, cex.axis=0.85, axs='i') - axis(2, at=1:%s, labels=colnames(tr_neg), lwd=1, cex.axis=0.85) - title(main='False Negatives', xlab='minor allele frequency', ylab='detection threshold') - ''' % ( num_polys, num_dets, num_polys, num_dets ) ) - - # Scale alongside - r( ''' - par(mar=c(2,2,2,3), las=1) - image(1, grade, matrix(grade, ncol=length(grade), nrow=1), col=hues, xlab='', ylab='', xaxt='n', las=1, cex.axis=0.85) - title(main='Key', cex=0.35) - mtext('false negative rate', side=1, cex=0.35) - ''' ) - - # Close graphics - r( ''' - layout(1) - dev.off() - ''' ) - - # Tidy up -# r( 'rm(folder,prefix,sim,cov,het,err,grade,hues,i,j,ngsum)' ) - -if __name__ == "__main__" : __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/ngs_simulation/ngs_simulation.xml --- a/tools/ngs_simulation/ngs_simulation.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,217 +0,0 @@ - - - Illumina runs - - ngs_simulation.py - #if $in_type.input_type == "built-in" - --input="${ filter( lambda x: str( x[0] ) == str( $in_type.genome ), $__app__.tool_data_tables[ 'ngs_sim_fasta' ].get_fields() )[0][-1] }" - --genome=$in_type.genome - #else - --input=$in_type.input1 - #end if - --read_len=$read_len - --avg_coverage=$avg_coverage - --error_rate=$error_rate - --num_sims=$num_sims - --polymorphism=$polymorphism - --detection_thresh=$detection_thresh - --output_png=$output_png - --summary_out=$summary_out - --output_summary=$output_summary - --new_file_path=$__new_file_path__ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - summary_out == True - - - - - - - - - -**What it does** - -This tool simulates an Illumina run and provides plots of false positives and false negatives. It allows for a range of simulation parameters to be set. Note that this simulation sets only one (randomly chosen) position in the genome as polymorphic, according to the value specified. Superimposed on this are "sequencing errors", which are uniformly (and randomly) distributed. Polymorphisms are assigned using the detection threshold, so if the detection threshold is set to the same as the minor allele frequency, the expected false negative rate is 50%. - -**Parameter list** - -These are the parameters that should be set for the simulation:: - - Read length (which is the same for all reads) - Average Coverage - Frequency for Minor Allele - Sequencing Error Rate - Detection Threshold - Number of Simulations - -You also should choose to use either a built-in genome or supply your own FASTA file. - -**Output** - -There are one or two. The first is a png that contains two different plots and is always generated. The second is optional and is a text file with some summary information about the simulations that were run. Below are some example outputs for a 10-simulation run on phiX with the default settings:: - - Read length 76 - Average coverage 200 - Error rate/quality score 0.001 - Number of simulations 100 - Frequencies for minor allele 0.002 - 0.004 - Detection thresholds 0.003 - 0.005 - 0.007 - Include summary file Yes - -Plot output (png): - -.. 
image:: ./static/images/ngs_simulation.png - -Summary output (txt):: - - FP FN GENOMESIZE.5386 fprate hetcol errcol - Min. : 71.0 Min. :0.0 Mode:logical Min. :0.01318 Min. :0.004 Min. :0.007 - 1st Qu.:86.0 1st Qu.:1.0 NA's:10 1st Qu.:0.01597 1st Qu.:0.004 1st Qu.:0.007 - Median :92.5 Median :1.0 NA Median :0.01717 Median :0.004 Median :0.007 - Mean :93.6 Mean :0.9 NA Mean :0.01738 Mean :0.004 Mean :0.007 - 3rd Qu.:100.8 3rd Qu.:1.0 NA 3rd Qu.:0.01871 3rd Qu.:0.004 3rd Qu.:0.007 - Max. :123.0 Max. :1.0 NA Max. :0.02284 Max. :0.004 Max. :0.007 - - False Positive Rate Summary - 0.003 0.005 0.007 - 0.001 0.17711 0.10854 0.01673 - 0.009 0.18049 0.10791 0.01738 - - False Negative Rate Summary - 0.003 0.005 0.007 - 0.001 1.0 0.8 1.0 - 0.009 0.4 0.7 0.9 - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_2_wrapper.xml --- a/tools/peak_calling/ccat_2_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ - - Control-based ChIP-seq Analysis Tool - ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo' - #if str( $options_type[ 'options_type_selector' ] ) == 'advanced': - '$input_advanced_config_file' - #else: - '${ options_type.input_config_file.fields.path }' - #end if - 'CCAT in Galaxy' - '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file' - - CCAT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if str( $options_type['options_type_selector' ] ) == 'advanced': -fragmentSize ${options_type[ 'fragment_size' ]} -slidingWinSize ${options_type[ 'sliding_window_size' ]} -movingStep ${options_type[ 'moving_step' ]} -isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]} -minCount ${options_type[ 'min_count' ]} -outputNum ${options_type[ 'output_num' ]} -randomSeed ${options_type[ 'random_seed' ]} -minScore ${options_type[ 'min_score' ]} -bootstrapPass ${options_type[ 'bootstrap_pass' ]} -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak/region calling using CCAT. - -View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm. - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_wrapper.py --- a/tools/peak_calling/ccat_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -import sys, subprocess, tempfile, shutil, os.path - -CCAT_BINARY = "CCAT" - -def get_top_count( filename ): - for line in open( filename ): - if line.startswith( 'outputNum' ): - return int( line.split()[-1].strip() ) - -def stop_err( tmp_dir, exception ): - print >> sys.stderr, "Error running CCAT." 
- shutil.rmtree( tmp_dir ) #some error has occurred, provide info and remove possibly non-empty temp directory - raise exception - -def main(): - input_tag_file = sys.argv[1] - input_control_file = sys.argv[2] - chrom_info_file = sys.argv[3] - input_config_file = sys.argv[4] - project_name = sys.argv[5] - output_peak_file = sys.argv[6] - output_region_file = sys.argv[7] - output_top_file = sys.argv[8] - output_log_file = sys.argv[9] - - tmp_dir = tempfile.mkdtemp() - try: - proc = subprocess.Popen( args="%s %s > %s" % ( CCAT_BINARY, " ".join( map( lambda x: "'%s'" % x, [ input_tag_file, input_control_file, chrom_info_file, input_config_file, project_name ] ) ), output_log_file ), shell=True, cwd=tmp_dir ) - proc.wait() - if proc.returncode: - raise Exception( "Error code: %i" % proc.returncode ) - output_num = get_top_count( input_config_file ) - shutil.move( os.path.join( tmp_dir, "%s.significant.peak" % project_name ), output_peak_file ) - shutil.move( os.path.join( tmp_dir, "%s.significant.region" % project_name ), output_region_file ) - shutil.move( os.path.join( tmp_dir, "%s.top%i.peak" % ( project_name, output_num ) ), output_top_file ) - except Exception, e: - return stop_err( tmp_dir, e ) - os.rmdir( tmp_dir ) #clean up empty temp working directory - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/ccat_wrapper.xml --- a/tools/peak_calling/ccat_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ - - Control-based ChIP-seq Analysis Tool - ccat_wrapper.py '$input_tag_file' '$input_control_file' '$chromInfo' - #if str( $options_type[ 'options_type_selector' ] ) == 'advanced': - '$input_advanced_config_file' - #else: - '${ options_type.input_config_file.fields.path }' - #end if - 'CCAT in Galaxy' - '$output_peak_file' '$output_region_file' '$output_top_file' '$output_log_file' - - CCAT - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #if str( $options_type['options_type_selector' ] ) == 'advanced': -fragmentSize ${options_type[ 'fragment_size' ]} -slidingWinSize ${options_type[ 'sliding_window_size' ]} -movingStep ${options_type[ 'moving_step' ]} -isStrandSensitiveMode ${options_type[ 'is_strand_sensitive_mode' ]} -minCount ${options_type[ 'min_count' ]} -outputNum ${options_type[ 'output_num' ]} -randomSeed ${options_type[ 'random_seed' ]} -minScore ${options_type[ 'min_score' ]} -bootstrapPass ${options_type[ 'bootstrap_pass' ]} -#end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak/region calling using CCAT. - -View the original CCAT documentation: http://cmb.gis.a-star.edu.sg/ChIPSeq/paperCCAT.htm. 
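ccat_wrapper.py above quotes each CCAT argument by hand with a map/lambda before handing the whole string to the shell. The standard library offers sturdier quoting; a sketch (pipes.quote on the Python 2 this codebase targets, shlex.quote on Python 3)::

    try:
        from shlex import quote   # Python 3.3+
    except ImportError:
        from pipes import quote   # Python 2

    def quoted_command(binary, args):
        # quoted_command('CCAT', ['tag file.bed', 'control.bed'])
        return ' '.join([binary] + [quote(str(a)) for a in args])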
- - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/macs_wrapper.py --- a/tools/peak_calling/macs_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,135 +0,0 @@ -import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip -from galaxy import eggs -import pkg_resources -pkg_resources.require( "simplejson" ) -import simplejson - -CHUNK_SIZE = 1024 - -def gunzip_cat_glob_path( glob_path, target_filename, delete = False ): - out = open( target_filename, 'wb' ) - for filename in glob.glob( glob_path ): - fh = gzip.open( filename, 'rb' ) - while True: - data = fh.read( CHUNK_SIZE ) - if data: - out.write( data ) - else: - break - fh.close() - if delete: - os.unlink( filename ) - out.close() - -def xls_to_interval( xls_file, interval_file, header = None ): - out = open( interval_file, 'wb' ) - if header: - out.write( '#%s\n' % header ) - wrote_header = False - #From macs readme: Coordinates in XLS is 1-based which is different with BED format. - for line in open( xls_file ): - #keep all existing comment lines - if line.startswith( '#' ): - out.write( line ) - elif not wrote_header: - out.write( '#%s' % line ) - wrote_header = True - else: - fields = line.split( '\t' ) - if len( fields ) > 1: - fields[1] = str( int( fields[1] ) - 1 ) - out.write( '\t'.join( fields ) ) - out.close() - -def main(): - options = simplejson.load( open( sys.argv[1] ) ) - output_bed = sys.argv[2] - output_extra_html = sys.argv[3] - output_extra_path = sys.argv[4] - - experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for filenames (gzip of wig files will fail with spaces - macs doesn't properly escape them)..need to replace all whitespace, split makes this easier - cmdline = "macs -t %s" % ",".join( options['input_chipseq'] ) - if options['input_control']: - cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) ) - cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s --lambdaset='%s' %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['lambdaset'], options['futurefdr'] ) - if 'wig' in options: - wigextend = int( options['wig']['wigextend'] ) - if wigextend >= 0: - wigextend = "--wigextend='%s'" % wigextend - else: - wigextend = '' - cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] ) - if 'nomodel' in options: - cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] ) - if 'diag' in options: - cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] ) - - tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user - stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) ) - proc.wait() - #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log - #Do not terminate if error code, allow dataset (e.g. 
log) creation and cleanup - if proc.returncode: - stderr_f = open( stderr_name ) - while True: - chunk = stderr_f.read( CHUNK_SIZE ) - if not chunk: - stderr_f.close() - break - sys.stderr.write( chunk ) - - #run R to create pdf from model script - if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ): - cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name ) - proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir ) - proc.wait() - - - #move bed out to proper output file - created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name ) - if os.path.exists( created_bed_name ): - shutil.move( created_bed_name, output_bed ) - - #parse xls files to interval files as needed - if options['xls_to_interval']: - create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name ) - if os.path.exists( create_peak_xls_file ): - xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' ) - create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name ) - if os.path.exists( create_peak_xls_file ): - xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' ) - - #merge and move wig files as needed, delete gz'd files and remove emptied dirs - if 'wig' in options: - wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name ) - if os.path.exists( wig_base_dir ): - #treatment - treatment_dir = os.path.join( wig_base_dir, "treat" ) - if os.path.exists( treatment_dir ): - gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True ) - os.rmdir( treatment_dir ) - #control - if options['input_control']: - control_dir = os.path.join( wig_base_dir, "control" ) - if os.path.exists( control_dir ): - gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True ) - os.rmdir( control_dir ) - os.rmdir( wig_base_dir ) - - #move all remaining files to extra files path of html file output to allow user download - out_html = open( output_extra_html, 'wb' ) - out_html.write( 'Additional output created by MACS (%s)
</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name ) - os.mkdir( output_extra_path ) - for filename in sorted( os.listdir( tmp_dir ) ): - shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) ) - out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) ) - out_html.write( '</ul></p>\n' ) - out_html.write( '<h3>Messages from MACS:</h3>\n<p>%s</p>
\n' % open( stderr_name, 'rb' ).read() ) - out_html.write( '\n' ) - out_html.close() - - os.unlink( stderr_name ) - os.rmdir( tmp_dir ) - -if __name__ == "__main__": main() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/macs_wrapper.xml --- a/tools/peak_calling/macs_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ - - Model-based Analysis of ChIP-Seq - macs_wrapper.py $options_file $output_bed_file $output_extra_files $output_extra_files.files_path - - macs - macs - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - xls_to_interval is True - - - xls_to_interval is True - input_type['input_control_file1'] is not None - - - wig_type['wig_type_selector']=='wig' - - - wig_type['wig_type_selector'] == 'wig' - input_type['input_control_file1'] is not None - - - - - <% -import simplejson -%> -#set $__options = { 'experiment_name':str( $experiment_name ), 'gsize':int( float( str( $gsize ) ) ), 'tsize':str( $tsize ), 'bw':str( $bw ), 'pvalue':str( $pvalue ), 'mfold':str( $mfold ), 'nolambda':str( $nolambda ), 'lambdaset': str( $lambdaset ), 'futurefdr':str( $futurefdr ) } -#if str( $xls_to_interval ) == 'create': -#set $__options['xls_to_interval'] = { 'peaks_file': str( $output_xls_to_interval_peaks_file ), 'negative_peaks_file': str( $output_xls_to_interval_negative_peaks_file ) } -#else: -#set $__options['xls_to_interval'] = False -#end if -##treatment/tag input files and format -#set $__options['input_chipseq'] = [ str( $input_type['input_chipseq_file1'] ) ] -#if $input_type['input_type_selector'] == 'paired_end': -#set $_hole = __options['input_chipseq'].append( str( $input_type['input_chipseq_file2'] ) ) -#set $__options['format'] = 'ELANDMULTIPET' -#else: -#set $__options['format'] = $input_type['input_chipseq_file1'].extension.upper() -#end if -##control/input files -#set $__options['input_control'] = [] -#if str( $input_type['input_control_file1'] ) != 'None': -#set $_hole = __options['input_control'].append( str( $input_type['input_control_file1'] ) ) -#end if -#if $input_type['input_type_selector'] == 'paired_end' and str( $input_type['input_control_file2'] ) != 'None': -#set $_hole = __options['input_control'].append( str( $input_type['input_control_file2'] ) ) -#end if -##wig options -#if $wig_type['wig_type_selector'] == 'wig': -#set $__options['wig'] = {} -#set $__options['wig']['wigextend'] = str( $wig_type['wigextend'] ) -#set $__options['wig']['space'] = str( $wig_type['space'] ) -#set $__options['wig']['output_treatment_file'] = str( $output_treatment_wig_file ) -#if $input_type['input_control_file1'] is not None: -#set $__options['wig']['output_control_file'] = str( $output_control_wig_file ) -#end if -#end if -##model options -#if $nomodel_type['nomodel_type_selector'] == 'nomodel': -#set $__options['nomodel'] = str( $nomodel_type['shiftsize'] ) -#end if -##diag options -#if $diag_type['diag_type_selector'] == 'diag': -#set $__options['diag'] = { 'fe-min':str( $diag_type['fe-min'] ), 'fe-max':str( $diag_type['fe-max'] ), 'fe-step':str( $diag_type['fe-step'] ) } -#end if -${ simplejson.dumps( __options ) } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -This tool allows ChIP-seq peak calling using MACS. 
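The wrapper receives its settings as a JSON document written by the configfile template above; the consuming side mirrors macs_wrapper.py (simplejson as used in this codebase; the stdlib json module behaves the same)::

    import simplejson

    options = simplejson.load(open('options_file.json'))  # path is illustrative
    cmdline = 'macs -t %s' % ','.join(options['input_chipseq'])
    if options['input_control']:
        cmdline = '%s -c %s' % (cmdline, ','.join(options['input_control']))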
- -Depending upon selected options, 2 to 6 history items will be created; the first output will be a standard BED file and the last will be an HTML report containing links to download additional files generated by MACS. Up to two each of wig and interval files can be optionally created; the interval files are parsed from the xls output. - -View the original MACS documentation: http://liulab.dfci.harvard.edu/MACS/00README.html. - - diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/sicer_wrapper.py --- a/tools/peak_calling/sicer_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,156 +0,0 @@ -#!/usr/bin/env python -#Dan Blankenberg - -""" -A wrapper script for running SICER (spatial clustering approach for the identification of ChIP-enriched regions) region caller. -""" - -import sys, optparse, os, tempfile, subprocess, shutil - -CHUNK_SIZE = 2**20 #1mb - -VALID_BUILDS = [ 'mm8', 'mm9', 'hg18', 'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ] #HACK! FIXME: allow using all specified builds, would currently require hacking SICER's "GenomeData.py" on the fly. - -def cleanup_before_exit( tmp_dir ): - if tmp_dir and os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) - - -def open_file_from_option( filename, mode = 'rb' ): - if filename: - return open( filename, mode = mode ) - return None - -def add_one_to_file_column( filename, column, split_char = "\t", startswith_skip = None ): - tmp_out = tempfile.TemporaryFile( mode='w+b' ) - tmp_in = open( filename ) - for line in tmp_in: - if startswith_skip and line.startswith( startswith_skip ): - tmp_out.write( line ) - else: - fields = line.rstrip( '\n\r' ).split( split_char ) - if len( fields ) <= column: - tmp_out.write( line ) - else: - fields[ column ] = str( int( fields[ column ] ) + 1 ) - tmp_out.write( "%s\n" % ( split_char.join( fields ) ) ) - tmp_in.close() - tmp_out.seek( 0 ) - tmp_in = open( filename, 'wb' ) - while True: - chunk = tmp_out.read( CHUNK_SIZE ) - if chunk: - tmp_in.write( chunk ) - else: - break - tmp_in.close() - tmp_out.close() - -def __main__(): - #Parse Command Line - parser = optparse.OptionParser() - #stdout/err - parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' ) - parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' ) - parser.add_option( '', '--fix_off_by_one_errors', dest='fix_off_by_one_errors', action='store_true', default=False, help='If specified, fix off-by-one errors in output files' ) - #inputs - parser.add_option( '-b', '--bed_file', dest='bed_file', action='store', type="string", default=None, help='Input ChIP BED file.' ) - parser.add_option( '-c', '--control_file', dest='control_file', action='store', type="string", default=None, help='Input control BED file.' ) - parser.add_option( '-d', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='Input dbkey.' ) - parser.add_option( '-r', '--redundancy_threshold', dest='redundancy_threshold', action='store', type="int", default=1, help='Redundancy Threshold: The number of copies of identical reads allowed in a library.' ) - parser.add_option( '-w', '--window_size', dest='window_size', action='store', type="int", default=200, help='Window size: resolution of SICER algorithm. 
For histone modifications, one can use 200 bp' ) - parser.add_option( '-f', '--fragment_size', dest='fragment_size', action='store', type="int", default=150, help='Fragment size: is for determination of the amount of shift from the beginning of a read to the center of the DNA fragment represented by the read. FRAGMENT_SIZE=150 means the shift is 75.' ) - parser.add_option( '-e', '--effective_genome_fraction', dest='effective_genome_fraction', action='store', type="float", default=0.74, help='Effective genome fraction: Effective Genome as fraction of the genome size. It depends on read length.' ) - parser.add_option( '-g', '--gap_size', dest='gap_size', action='store', type="int", default=600, help='Gap size: needs to be multiples of window size. Namely if the window size is 200, the gap size should be 0, 200, 400, 600, ... .' ) - parser.add_option( '-o', '--error_cut_off', dest='error_cut_off', action='store', type="string", default="0.1", help='Error Cut off: FDR or E-value' ) #read as string to construct names properly - #outputs - parser.add_option( '', '--redundancy_removed_test_bed_output_file', dest='redundancy_removed_test_bed_output_file', action='store', type="string", default=None, help='test-1-removed.bed: redundancy_removed test bed file' ) - parser.add_option( '', '--redundancy_removed_control_bed_output_file', dest='redundancy_removed_control_bed_output_file', action='store', type="string", default=None, help='control-1-removed.bed: redundancy_removed control bed file' ) - parser.add_option( '', '--summary_graph_output_file', dest='summary_graph_output_file', action='store', type="string", default=None, help='test-W200.graph: summary graph file for test-1-removed.bed with window size 200, in bedGraph format.' ) - parser.add_option( '', '--test_normalized_wig_output_file', dest='test_normalized_wig_output_file', action='store', type="string", default=None, help='test-W200-normalized.wig: the above file normalized by library size per million and converted into wig format. This file can be uploaded to the UCSC genome browser' ) - parser.add_option( '', '--score_island_output_file', dest='score_island_output_file', action='store', type="string", default=None, help='test-W200-G600.scoreisland: an intermediate file for debugging usage.' ) - parser.add_option( '', '--islands_summary_output_file', dest='islands_summary_output_file', action='store', type="string", default=None, help='test-W200-G600-islands-summary: summary of all candidate islands with their statistical significance.' ) - parser.add_option( '', '--significant_islands_summary_output_file', dest='significant_islands_summary_output_file', action='store', type="string", default=None, help='test-W200-G600-islands-summary-FDR.01: summary file of significant islands with requirement of FDR=0.01.' ) - parser.add_option( '', '--significant_islands_output_file', dest='significant_islands_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-island.bed: delineation of significant islands in "chrom start end read-count-from-redundancy_removed-test.bed" format' ) - parser.add_option( '', '--island_filtered_output_file', dest='island_filtered_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-islandfiltered.bed: library of raw redundancy_removed reads on significant islands.' 
) - parser.add_option( '', '--island_filtered_normalized_wig_output_file', dest='island_filtered_normalized_wig_output_file', action='store', type="string", default=None, help='test-W200-G600-FDR.01-islandfiltered-normalized.wig: wig file for the island-filtered redundancy_removed reads.' ) - (options, args) = parser.parse_args() - - #check if valid build - assert options.dbkey in VALID_BUILDS, ValueError( "The specified build ('%s') is not available for this tool." % options.dbkey ) - - #everything will occur in this temp directory - tmp_dir = tempfile.mkdtemp() - - #link input files into tmp_dir and build command line - bed_base_filename = 'input_bed_file' - bed_filename = '%s.bed' % bed_base_filename - os.symlink( options.bed_file, os.path.join( tmp_dir, bed_filename ) ) - if options.control_file is not None: - cmd = "SICER.sh" - else: - cmd = "SICER-rb.sh" - cmd = '%s "%s" "%s"' % ( cmd, tmp_dir, bed_filename ) - if options.control_file is not None: - control_base_filename = 'input_control_file' - control_filename = '%s.bed' % control_base_filename - os.symlink( options.control_file, os.path.join( tmp_dir, control_filename ) ) - cmd = '%s "%s"' % ( cmd, control_filename ) - cmd = '%s "%s" "%s" "%i" "%i" "%i" "%f" "%i" "%s"' % ( cmd, tmp_dir, options.dbkey, options.redundancy_threshold, options.window_size, options.fragment_size, options.effective_genome_fraction, options.gap_size, options.error_cut_off ) - - #set up stdout and stderr output options - stdout = open_file_from_option( options.stdout, mode = 'wb' ) - stderr = open_file_from_option( options.stderr, mode = 'wb' ) - #if no stderr file is specified, we'll use our own - if stderr is None: - stderr = tempfile.NamedTemporaryFile( dir=tmp_dir ) - stderr.close() - stderr = open( stderr.name, 'w+b' ) - - proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir ) - return_code = proc.wait() - - if return_code: - stderr_target = sys.stderr - else: - stderr_target = stdout #sys.stdout - stderr_target.write( "\nAdditionally, these warnings were reported:\n" ) - stderr.flush() - stderr.seek(0) - while True: - chunk = stderr.read( CHUNK_SIZE ) - if chunk: - stderr_target.write( chunk ) - else: - break - stderr.close() - - try: - #move files to where they belong - shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( bed_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_test_bed_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i.graph' % ( bed_base_filename, options.window_size ) ), options.summary_graph_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.summary_graph_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-normalized.wig' % ( bed_base_filename, options.window_size ) ), options.test_normalized_wig_output_file ) - if options.control_file is not None: - shutil.move( os.path.join( tmp_dir,'%s-%i-removed.bed' % ( control_base_filename, options.redundancy_threshold ) ), options.redundancy_removed_control_bed_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.score_island_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary' % ( bed_base_filename, options.window_size, options.gap_size ) ), options.islands_summary_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( 
options.islands_summary_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-islands-summary-FDR%s' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_summary_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_summary_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-island.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.significant_islands_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.significant_islands_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-FDR%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file ) - else: - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s.scoreisland' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.score_island_output_file ) - if options.fix_off_by_one_errors: add_one_to_file_column( options.score_island_output_file, 2 ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered.bed' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_output_file ) - shutil.move( os.path.join( tmp_dir,'%s-W%i-G%i-E%s-islandfiltered-normalized.wig' % ( bed_base_filename, options.window_size, options.gap_size, options.error_cut_off ) ), options.island_filtered_normalized_wig_output_file ) - except Exception, e: - raise e - finally: - cleanup_before_exit( tmp_dir ) - -if __name__=="__main__": __main__() diff -r c2a356708570 -r 33c067c3ae34 tools/peak_calling/sicer_wrapper.xml --- a/tools/peak_calling/sicer_wrapper.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ - - Statistical approach for the Identification of ChIP-Enriched Regions - sicer_wrapper.py - --bed_file '${input_bed_file}' - #if str( $input_control_file ) != 'None': - --control_file '${input_control_file}' - --significant_islands_output_file "${significant_islands_output_file}" - --islands_summary_output_file "${islands_summary_output_file}" - --significant_islands_summary_output_file "${significant_islands_summary_output_file}" - #end if - ${fix_off_by_one_errors} - --dbkey '${input_bed_file.dbkey}' - --redundancy_threshold '${redundancy_threshold}' - --window_size '${window_size}' - --fragment_size '${fragment_size}' - --effective_genome_fraction '${effective_genome_fraction}' - --gap_size '${gap_size}' - --error_cut_off '${error_cut_off}' - ##output files - --stdout "${output_log_file}" - --redundancy_removed_test_bed_output_file "${redundancy_removed_test_bed_output_file}" - --redundancy_removed_control_bed_output_file "${redundancy_removed_control_bed_output_file}" - --score_island_output_file "${score_island_output_file}" - --summary_graph_output_file "${summary_graph_output_file}" - --test_normalized_wig_output_file "${test_normalized_wig_output_file}" - --island_filtered_output_file "${island_filtered_output_file}" - --island_filtered_normalized_wig_output_file "${island_filtered_normalized_wig_output_file}" - - - SICER - - - - value.dbkey in [ 'mm8', 'mm9', 'hg18', 
'hg19', 'dm2', 'dm3', 'sacCer1', 'pombe', 'rn4', 'tair8' ] - - - - - - - - - - - - - - - - - - input_control_file is not None - - - - - input_control_file is not None - - - - - - input_control_file is not None - - - input_control_file is not None - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -**What it does** - -SICER first and foremost is a filtering tool. Its main functions are:: - - 1. Delineation of the significantly ChIP-enriched regions, which can be used to associate with other genomic landmarks. - 2. Identification of reads on the ChIP-enriched regions, which can be used for profiling and other quantitative analysis. - -View the original SICER documentation: http://home.gwu.edu/~wpeng/Software.htm. - ------- - -.. class:: warningmark - - By default, SICER creates files that do not conform to standards (e.g. BED files are closed, not half-open). This could have implications for downstream analysis. - To force the output of SICER to be formatted properly to standard file formats, check the **"Fix off-by-one errors in output files"** option. - ------- - -**Citation** - -For the underlying tool, please cite `Zang C, Schones DE, Zeng C, Cui K, Zhao K, Peng W. A clustering approach for identification of enriched domains from histone modification ChIP-Seq data. Bioinformatics. 2009 Aug 1;25(15):1952-8. <http://www.ncbi.nlm.nih.gov/pubmed/19505939>`_ - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_AddOrReplaceReadGroups.xml --- a/tools/picard/picard_AddOrReplaceReadGroups.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,204 +0,0 @@ - - picard - - picard_wrapper.py - --input="$inputFile" - --rg-lb="$rglb" - --rg-pl="$rgpl" - --rg-pu="$rgpu" - --rg-sm="$rgsm" - --rg-id="$rgid" - --rg-opts=${readGroupOpts.rgOpts} - #if $readGroupOpts.rgOpts == "full" - --rg-cn="$readGroupOpts.rgcn" - --rg-ds="$readGroupOpts.rgds" - #end if - --output-format=$outputFormat - --output=$outFile - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/AddOrReplaceReadGroups.jar" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Add or Replace Read Groups in an input BAM or SAM file. - -**Read Groups are Important!** - -Many downstream analysis tools (such as GATK, for example) require BAM datasets to contain read groups. Even if you are not going to use GATK, setting read groups correctly from the start will simplify your life greatly. Below we provide an explanation of read groups fields taken from GATK FAQ webpage: - -.. csv-table:: - :header-rows: 1 - - Tag,Importance,Definition,Meaning - "ID","Required","Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.","Ideally, this should be a globally unique identify across all sequencing data in the world, such as the Illumina flowcell + lane name and number. Will be referenced by each read with the RG:Z field, allowing tools to determine the read group information associated with each read, including the sample from which the read came. 
Also, a read group is effectively treated as a separate run of the NGS instrument in tools like base quality score recalibration (a GATK component) -- all reads within a read group are assumed to come from the same instrument run and to therefore share the same error model." - "SM","Required. As important as ID.","Sample. Use pool name where a pool is being sequenced.","The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample. Therefore it's critical that the SM field be correctly specified, especially when using multi-sample tools like the Unified Genotyper (a GATK component)." - "PL","Important. Not currently used in the GATK, but was in the past, and may return. The only way to know the sequencing technology used to generate the sequencing data.","Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.","It's a good idea to use this field." - "LB","Essential for MarkDuplicates","DNA preparation library identifier","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes." - -**Example of Read Group usage** - -Suppose we have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header:: - - Dad's data: - @RG ID:FLOWCELL1.LANE1 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 - @RG ID:FLOWCELL1.LANE2 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200 - @RG ID:FLOWCELL1.LANE3 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 - @RG ID:FLOWCELL1.LANE4 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400 - - Mom's data: - @RG ID:FLOWCELL1.LANE5 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200 - @RG ID:FLOWCELL1.LANE6 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200 - @RG ID:FLOWCELL1.LANE7 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400 - @RG ID:FLOWCELL1.LANE8 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400 - - Kid's data: - @RG ID:FLOWCELL2.LANE1 PL:illumina LB:LIB-KID-1 SM:KID PI:200 - @RG ID:FLOWCELL2.LANE2 PL:illumina LB:LIB-KID-1 SM:KID PI:200 - @RG ID:FLOWCELL2.LANE3 PL:illumina LB:LIB-KID-2 SM:KID PI:400 - @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400 - -Note the hierarchical relationship from read groups (unique for each lane) to libraries (each sequenced on two lanes) to samples (each spread across four lanes, two lanes per library). - -**Picard documentation** - -This is a Galaxy wrapper for AddOrReplaceReadGroups, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Either a sam file or a bam file must be supplied. If a bam file is used, it must -be coordinate-sorted. Galaxy currently coordinate-sorts all bam files. - -The output file is either bam (the default) or sam, according to user selection, -and contains the same information as the input file except for the appropriate -additional (or modified) read group tags. Bam is recommended since it is smaller. - -From the Picard documentation: - -AddOrReplaceReadGroups REQUIRED parameters:: - - Option (Type) Description - - RGLB=String Read Group Library - RGPL=String Read Group platform (e.g.
illumina, solid) - RGPU=String Read Group platform unit (eg. run barcode) - RGSM=String Read Group sample name - RGID=String Read Group ID; Default value: null (empty) - -AddOrReplaceReadGroups OPTIONAL parameters:: - - Option (Type) Description - - RGCN=String Read Group sequencing center name; Default value: null (empty) - RGDS=String Read Group description Default value: null (empty) - -One parameter that Picard's AddOrReplaceReadGroups offers that is automatically -set by Galaxy is the SORT_ORDER, which is set to coordinate. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_BamIndexStats.xml --- a/tools/picard/picard_BamIndexStats.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - picard - - picard_wrapper.py - --input "$input_file" - --bai-file "$input_file.metadata.bam_index" - -t "$htmlfile" - -d "$htmlfile.files_path" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/BamIndexStats.jar" - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Generate Bam Index Stats for a provided BAM file. - -**Picard documentation** - -This is a Galaxy wrapper for BamIndexStats, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs and outputs** - -The only input is the BAM file you wish to obtain statistics for, which is required. -Note that it must be coordinate-sorted. Galaxy currently coordinate-sorts all BAM files. - -This tool outputs an HTML file that contains links to the actual metrics results, as well -as a log file with info on the exact command run. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - ------- - -**Example** - -Given a BAM file created from the following:: - - @HD VN:1.0 SO:coordinate - @SQ SN:chr1 LN:101 - @SQ SN:chr7 LN:404 - @SQ SN:chr8 LN:202 - @SQ SN:chr10 LN:303 - @SQ SN:chr14 LN:505 - @RG ID:0 SM:Hi,Mom! - @RG ID:1 SM:samplesample DS:ClearDescription - @PG ID:1 PN:Hey! 
VN:2.0 - @CO Just a generic comment to make the header longer - read1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read2 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read3 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read4 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read5 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0 - read6 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - read7 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - read8 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0 - -The following metrics file will be produced:: - - chr1 length= 101 Aligned= 0 Unaligned= 0 - chr7 length= 404 Aligned= 7 Unaligned= 0 - chr8 length= 202 Aligned= 0 Unaligned= 0 - chr10 length= 303 Aligned= 0 Unaligned= 0 - chr14 length= 505 Aligned= 0 Unaligned= 0 - NoCoordinateCount= 1 - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_MarkDuplicates.xml --- a/tools/picard/picard_MarkDuplicates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,150 +0,0 @@ - - - picard_wrapper.py - --input="$input_file" - --remove-dups="$remDups" - --read-regex="$readRegex" - --opt-dup-dist="$optDupeDist" - --output-format=$outputFormat - --output-txt=$outMetrics - #if str( $outputFormat ) == "sam" - #if str( $remDups ) == "true" - --output-sam=$outFileSamRemoved - #else - --output-sam=$outFileSamMarked - #end if - #else if str( $outputFormat ) == "bam" - #if str( $remDups ) == "true" - --output-sam=$outFileBamRemoved - #else - --output-sam=$outFileBamMarked - #end if - #end if - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/" - --picard-cmd="MarkDuplicates" - - - - - - - - - - - - - - - - - - - - - outputFormat is False - remDups is False - - - outputFormat is False - remDups is True - - - outputFormat is True - remDups is False - - - outputFormat is True - remDups is True - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -**Purpose** - -MarkDuplicates examines aligned records in the supplied sam or bam file to identify duplicate molecules. - -**Picard documentation** - -This is a Galaxy interface for MarkDuplicates, a part of Picard-tools_, which is closely related to SAMTools_. - - .. _Picard-tools: http://picard.sourceforge.net/index.shtml - .. _SAMTools: http://samtools.sourceforge.net/ - ------- - -**Input** - -Either a sam file or a bam file is required. If a bam file is used, it must be coordinate-sorted. - -**Outputs** - -This tool provides two outputs. The first contains the marked (or kept) records and is either bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. The second output is the metrics file, which is a text file containing information about the duplicates. - -**MarkDuplicates parameters** - -The two main parameters to be concerned with are the flag for removing duplicates and the regular expression needed to identify reads. If it is set to remove duplicates, they will not be written to the output file; otherwise they will appear in the output but will be flagged appropriately. The read name regular expression is used to parse read names from the input sam file. Read names are parsed to extract three variables: tile/region, x coordinate, and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order (capture groups are enclosed in parentheses). Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. - -One other parameter that can be set is the maximum offset between two duplicate clusters in order for them to be considered optical duplicates. Later versions of the Illumina pipeline that multiply pixel values by 10 should generally use 50-100 pixels; otherwise 5-10 is normal. The default is set to 100. - -One parameter that Picard's MarkDuplicates offers that is automatically set by Galaxy is the ASSUME_SORTED, which is set to true because Galaxy bam should always be coordinate-sorted. - -**Note on the use of regular expressions for read name parsing** - -The regular expression (regex) is used to parse the read names, so it's important to get it exactly right (so you probably don't want to edit this unless you know exactly what you're doing). The three parts of the read names identified are tile/region, x coordinate, and y coordinate, which are used in conjunction with the optical duplication rate to more accurately estimate library size. 
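-To make the parsing concrete, here is a minimal sketch (not part of the wrapper; the read name shown is made up for illustration) of how the default regular expression above pulls out the three values::
-
-    import re
-
-    # default READ_NAME_REGEX quoted above; the capture groups are tile, x, y
-    READ_NAME_REGEX = r'[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*'
-
-    match = re.match(READ_NAME_REGEX, 'MACHINE1:3:27:118:9722')
-    if match is not None:
-        tile, x, y = map(int, match.groups())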
- - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_ReorderSam.xml --- a/tools/picard/picard_ReorderSam.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,165 +0,0 @@ - - picard - - picard_wrapper.py - --input=$inputFile - #if $source.indexSource == "built-in" - --ref="${ filter( lambda x: str( x[0] ) == str( $source.ref ), $__app__.tool_data_tables[ 'picard_indexes' ].get_fields() )[0][-1] }" - #else - --ref-file=$refFile - --species-name=$source.speciesName - --build-name=$source.buildName - --trunc-names=$source.truncateSeqNames - #end if - --allow-inc-dict-concord=$allowIncDictConcord - --allow-contig-len-discord=$allowContigLenDiscord - --output-format=$outputFormat - --output=$outFile - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReorderSam.jar" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Reorder SAM/BAM to match contig ordering in a particular reference file. Note that this is -not the same as sorting as done by the SortSam tool, which sorts by either coordinate -values or query name. The ordering in ReorderSam is based on exact name matching of -contigs/chromosomes. Reads that are mapped to a contig that is not in the new reference file are -not included in the output. - -**Picard documentation** - -This is a Galaxy wrapper for ReorderSam, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs, outputs, and parameters** - -For the file that needs to be reordered, either a sam file or a bam file must be supplied. -If a bam file is used, it must be coordinate-sorted. A reference file is also required, -so either a fasta file should be supplied or a built-in reference can be selected. - -The output contains the same reads as the input file but the reads have been rearranged so -they appear in the same order as the provided reference file. The tool will output either -bam (the default) or sam, according to user selection. Bam is recommended since it is smaller. - -The only extra parameters that can be set are flags for allowing incomplete dict concordance -and allowing contig length discordance. If incomplete dict concordance is allowed, only a -partial overlap of the bam contigs with the new reference sequence contigs is required. By -default it is off, requiring a corresponding contig in the new reference for each read contig. -If contig length discordance is allowed, contig names that are the same between a read and the -new reference contig are allowed even if they have different lengths. This is usually not a -good idea, unless you know exactly what you're doing. It's off by default. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. 
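-To make the parameter mapping above concrete, picard_wrapper.py (deleted later in this same changeset) assembles the ReorderSam call essentially as follows; the file names here are illustrative::
-
-    cl = ['VALIDATION_STRINGENCY=LENIENT']
-    cl.append('INPUT=%s' % 'input.bam')            # the sam/bam to reorder
-    cl.append('OUTPUT=%s' % 'reordered.bam')
-    cl.append('REFERENCE=%s' % 'new_ref.fasta')    # contig order comes from here
-    if opts.allow_inc_dict_concord == 'true':
-        cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
-    if opts.allow_contig_len_discord == 'true':
-        cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
-    # runPic() then prepends 'java -Xmx4g -jar ReorderSam.jar' and executes the command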
- - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_ReplaceSamHeader.xml --- a/tools/picard/picard_ReplaceSamHeader.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ - - picard - - picard_wrapper.py - --input "$inputFile" - -o $outFile - --header-file $headerFile - --output-format $outputFormat - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/ReplaceSamHeader.jar" - --tmpdir "${__new_file_path__}" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Replace Sam Header with the header from another sam file. The tool does not do any -significant validation, so it's up to the user to make sure that the elements in -the header are relevant and that the new header has all the required things. - -Replace the SAMFileHeader in a SAM file with the given header. Validation is -minimal. It is up to the user to ensure that all the elements referred to in the -SAMRecords are present in the new header. Sort order of the two input files must -be the same. - -**Picard documentation** - -This is a Galaxy wrapper for ReplaceSamHeader, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------- - -.. class:: infomark - -**Inputs and outputs** - -Either a sam file or a bam file is required as the file whose header will be replaced. -The header file is also required and can also be either sam or bam (it does not have -to be the same type as the other file). In both cases, if a bam file is used, it must -be coordinate-sorted. Galaxy currently coordinate-sorts all bam files. - -The tool will output either bam (the default) or sam. Bam is recommended since it is smaller. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/picard_wrapper.py --- a/tools/picard/picard_wrapper.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,734 +0,0 @@ -#!/usr/bin/env python -""" -Originally written by Kelly Vincent -pretty output and additional picard wrappers by Ross Lazarus for rgenetics -Runs all available wrapped Picard tools. -usage: picard_wrapper.py [options] -code Ross wrote licensed under the LGPL -see http://www.gnu.org/copyleft/lesser.html -""" - -import optparse, os, sys, subprocess, tempfile, shutil, time, logging - -galhtmlprefix = """ - - - - - - - - - -
-""" -galhtmlattr = """Galaxy tool %s run at %s
""" -galhtmlpostfix = """
\n""" - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -class PicardBase(): - """ - simple base class with some utilities for Picard - adapted and merged with Kelly Vincent's code april 2011 Ross - lots of changes... - """ - - def __init__(self, opts=None,arg0=None): - """ common stuff needed at init for a picard tool - """ - assert opts <> None, 'PicardBase needs opts at init' - self.opts = opts - if self.opts.outdir == None: - self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir - assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in' - self.picname = self.baseName(opts.jar) - if self.picname.startswith('picard'): - self.picname = opts.picard_cmd # special case for some tools like replaceheader? - self.progname = self.baseName(arg0) - self.version = '0.002' - self.delme = [] # list of files to destroy - self.title = opts.title - self.inputfile = opts.input - try: - os.makedirs(opts.outdir) - except: - pass - try: - os.makedirs(opts.tmpdir) - except: - pass - self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname) - self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname) - self.setLogging(logfname=self.log_filename) - - def baseName(self,name=None): - return os.path.splitext(os.path.basename(name))[0] - - def setLogging(self,logfname="picard_wrapper.log"): - """setup a logger - """ - logging.basicConfig(level=logging.INFO, - filename=logfname, - filemode='a') - - - def readLarge(self,fname=None): - """ read a potentially huge file. - """ - try: - # get stderr, allowing for case where it's very large - tmp = open( fname, 'rb' ) - s = '' - buffsize = 1048576 - try: - while True: - more = tmp.read( buffsize ) - if len(more) > 0: - s += more - else: - break - except OverflowError: - pass - tmp.close() - except Exception, e: - stop_err( 'Error : %s' % str( e ) ) - return s - - def runCL(self,cl=None,output_dir=None): - """ construct and run a command line - we have galaxy's temp path as opt.temp_dir so don't really need isolation - sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts - """ - assert cl <> None, 'PicardBase runCL needs a command line as cl' - if output_dir == None: - output_dir = self.opts.outdir - if type(cl) == type([]): - cl = ' '.join(cl) - fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt') - tlf = open(templog,'wb') - fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt') - tef = open(temperr,'wb') - process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir) - rval = process.wait() - tlf.close() - tef.close() - stderrs = self.readLarge(temperr) - stdouts = self.readLarge(templog) - if len(stderrs) > 0: - s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs) - else: - s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval) - logging.info(s) - os.unlink(templog) # always - os.unlink(temperr) # always - return s, stdouts # sometimes this is an output - - def runPic(self, jar, cl): - """ - cl should be everything after the jar file name in the command - """ - runme = ['java -Xmx%s' % self.opts.maxjheap] - runme.append('-jar %s' % jar) - runme += cl - s,stdout = self.runCL(cl=runme, output_dir=self.opts.outdir) - return stdout - - def 
samToBam(self,infile=None,outdir=None): - """ - use samtools view to convert sam to bam - """ - fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam') - cl = ['samtools view -h -b -S -o ',tempbam,infile] - tlog,stdouts = self.runCL(cl,outdir) - return tlog,tempbam - - #def bamToSam(self,infile=None,outdir=None): - # """ - # use samtools view to convert bam to sam - # """ - # fd,tempsam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.sam') - # cl = ['samtools view -h -o ',tempsam,infile] - # tlog,stdouts = self.runCL(cl,outdir) - # return tlog,tempsam - - def sortSam(self, infile=None,outfile=None,outdir=None): - """ - """ - print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir) - cl = ['samtools sort',infile,outfile] - tlog,stdouts = self.runCL(cl,outdir) - return tlog - - def cleanup(self): - for fname in self.delme: - try: - os.unlink(fname) - except: - pass - - def prettyPicout(self,transpose,maxrows): - """organize picard outpouts into a report html page - """ - res = [] - try: - r = open(self.metricsOut,'r').readlines() - except: - r = [] - if len(r) > 0: - res.append('Picard on line resources
\n') - if transpose: - res.append('Picard output (transposed to make it easier to see)
\n') - else: - res.append('Picard output
\n') - res.append('\n') - dat = [] - heads = [] - lastr = len(r) - 1 - # special case for estimate library complexity hist - thist = False - for i,row in enumerate(r): - if row.strip() > '': - srow = row.split('\t') - if row.startswith('#'): - heads.append(row.strip()) # want strings - else: - dat.append(srow) # want lists - if row.startswith('## HISTOGRAM'): - thist = True - if len(heads) > 0: - hres = ['' % (i % 2,x) for i,x in enumerate(heads)] - res += hres - heads = [] - if len(dat) > 0: - if transpose and not thist: - tdat = map(None,*dat) # transpose an arbitrary list of lists - tdat = ['\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)] - else: - tdat = ['\t'.join(x).strip() for x in dat] # back to strings :( - tdat = ['\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)] - res += tdat - dat = [] - res.append('
%s
%s%s 
%s
\n') - return res - - def fixPicardOutputs(self,transpose,maxloglines): - """ - picard produces long hard to read tab header files - make them available but present them transposed for readability - """ - logging.shutdown() - self.cleanup() # remove temp files stored in delme - rstyle="""""" - res = [rstyle,] - res.append(galhtmlprefix % self.progname) - res.append(galhtmlattr % (self.picname,timenow())) - flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')] - pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] - if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs - for p in pdflist: - imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf - res.append('
\n') - res.append('\n' % (p,imghref)) - res.append('
\n') - if len(flist) > 0: - res.append('The following output files were created (click the filename to view/download a copy):
') - res.append('\n') - for i,f in enumerate(flist): - fn = os.path.split(f)[-1] - res.append('\n' % (fn,fn)) - res.append('
%s

\n') - pres = self.prettyPicout(transpose,maxloglines) - if len(pres) > 0: - res += pres - l = open(self.log_filename,'r').readlines() - llen = len(l) - if llen > 0: - res.append('Picard Tool Run Log


\n') - rlog = ['
',]
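-            # keep the embedded log readable: when it exceeds maxloglines, show only
-            # the first and last min(50, maxloglines/2) lines with an elision marker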
-            if llen > maxloglines:
-                n = min(50,int(maxloglines/2))
-                rlog += l[:n]
-                rlog.append('------------ ## %d rows deleted ## --------------\n' % (llen-maxloglines))
-                rlog += l[-n:]
-            else:
-                rlog += l
-            rlog.append('
') - if llen > maxloglines: - rlog.append('\n## WARNING - %d log lines truncated - %s contains entire output' % (llen - maxloglines, self.log_filename)) - res += rlog - else: - res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename) - res.append('
The freely available Picard software \n') - res.append( 'generated all outputs reported here running as a Galaxy tool') - res.append(galhtmlpostfix) - outf = open(self.opts.htmlout,'w') - outf.write(''.join(res)) - outf.write('\n') - outf.close() - - def makePicInterval(self,inbed=None,outf=None): - """ - picard wants bait and target files to have the same header length as the incoming bam/sam - a meaningful (ie accurate) representation will fail because of this - so this hack - it would be far better to be able to supply the original bed untouched - """ - assert inbed <> None - bed = open(inbed,'r').readlines() - thead = os.path.join(self.opts.outdir,'tempSamHead.txt') - if self.opts.datatype == 'sam': - cl = ['samtools view -H -S',self.opts.input,'>',thead] - else: - cl = ['samtools view -H',self.opts.input,'>',thead] - self.runCL(cl=cl,output_dir=self.opts.outdir) - head = open(thead,'r').readlines() - s = '## got %d rows of header\n' % (len(head)) - logging.info(s) - o = open(outf,'w') - o.write(''.join(head)) - o.write(''.join(bed)) - o.close() - return outf - - def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None): - """ - interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs! - Do the work of removing all the error sequences - pysam is cool - infile = pysam.Samfile( "-", "r" ) - outfile = pysam.Samfile( "-", "w", template = infile ) - for s in infile: outfile.write(s) - - errors from ValidateSameFile.jar look like - WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing - ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary. - ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041 - - """ - assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam - assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path' - removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2] - remDict = dict(zip(removeNames,range(len(removeNames)))) - infile = pysam.Samfile(insam,'rb') - info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict)) - if len(removeNames) > 0: - outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file - i = 0 - j = 0 - for row in infile: - dropme = remDict.get(row.qname,None) # keep if None - if not dropme: - outfile.write(row) - j += 1 - else: # discard - i += 1 - info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam)) - outfile.close() - infile.close() - else: # we really want a nullop or a simple pointer copy - infile.close() - if newsam: - shutil.copy(insam,newsam) - logging.info(info) - - - -def __main__(): - doFix = False # tools returning htmlfile don't need this - doTranspose = True # default - maxloglines = 100 # default - #Parse Command Line - op = optparse.OptionParser() - # All tools - op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' ) - op.add_option('-e', '--inputext', default=None) - op.add_option('-o', '--output', default=None) - op.add_option('-n', '--title', default="Pick a Picard Tool") - op.add_option('-t', '--htmlout', default=None) - op.add_option('-d', '--outdir', default=None) - op.add_option('-x', '--maxjheap', default='4g') - op.add_option('-b', '--bisulphite', default='false') - op.add_option('-s', 
'--sortorder', default='query') - op.add_option('','--tmpdir', default='/tmp') - op.add_option('-j','--jar',default='') - op.add_option('','--picard-cmd',default=None) - # Many tools - op.add_option( '', '--output-format', dest='output_format', help='Output format' ) - op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' ) - op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None ) - # CreateSequenceDictionary - op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None ) - op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' ) - op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' ) - op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' ) - # MarkDuplicates - op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' ) - op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' ) - # CollectInsertSizeMetrics - op.add_option('', '--taillimit', default="0") - op.add_option('', '--histwidth', default="0") - op.add_option('', '--minpct', default="0.01") - # CollectAlignmentSummaryMetrics - op.add_option('', '--maxinsert', default="20") - op.add_option('', '--adaptors', action='append', type="string") - # FixMateInformation and validate - # CollectGcBiasMetrics - op.add_option('', '--windowsize', default='100') - op.add_option('', '--mingenomefrac', default='0.00001') - # AddOrReplaceReadGroups - op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' ) - op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' ) - op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' ) - op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. 
run barcode) ' ) - op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' ) - op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' ) - op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' ) - op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' ) - # ReorderSam - op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' ) - op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' ) - # ReplaceSamHeader - op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' ) - - op.add_option('','--assumesorted', default='true') - op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*") - #estimatelibrarycomplexity - op.add_option('','--minid', default="5") - op.add_option('','--maxdiff', default="0.03") - op.add_option('','--minmeanq', default="20") - #hsmetrics - op.add_option('','--baitbed', default=None) - op.add_option('','--targetbed', default=None) - #validate - op.add_option('','--ignoreflags', action='append', type="string") - op.add_option('','--maxerrors', default=None) - op.add_option('','--datatype', default=None) - op.add_option('','--bamout', default=None) - op.add_option('','--samout', default=None) - - opts, args = op.parse_args() - opts.sortme = opts.assumesorted == 'false' - assert opts.input <> None - # need to add - # instance that does all the work - pic = PicardBase(opts,sys.argv[0]) - - tmp_dir = opts.outdir - haveTempout = False # we use this where sam output is an option - - # set ref and dict files to use (create if necessary) - ref_file_name = opts.ref - if opts.ref_file <> None: - csd = 'CreateSequenceDictionary' - realjarpath = os.path.split(opts.jar)[0] - jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq - tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname) - ref_file_name = '%s.fasta' % tmp_ref_name - # build dict - dict_file_name = '%s.dict' % tmp_ref_name - os.symlink( opts.ref_file, ref_file_name ) - cl = ['REFERENCE=%s' % ref_file_name] - cl.append('OUTPUT=%s' % dict_file_name) - cl.append('URI=%s' % os.path.basename( opts.ref_file )) - cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names) - if opts.species_name: - cl.append('SPECIES=%s' % opts.species_name) - if opts.build_name: - cl.append('GENOME_ASSEMBLY=%s' % opts.build_name) - pic.delme.append(dict_file_name) - pic.delme.append(ref_file_name) - pic.delme.append(tmp_ref_name) - s = pic.runPic(jarpath, cl) - # run relevant command(s) - - # define temporary output - # if output is sam, it must have that extension, otherwise bam will be produced - # specify sam or bam file with extension - if opts.output_format == 'sam': - suff = '.sam' - else: - suff = '' - tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff ) - - cl = ['VALIDATION_STRINGENCY=LENIENT',] - - if pic.picname == 'AddOrReplaceReadGroups': - # sort order to match Galaxy's default - cl.append('SORT_ORDER=coordinate') - # input - cl.append('INPUT=%s' % opts.input) - # outputs - cl.append('OUTPUT=%s' % tempout) - # required read groups - cl.append('RGLB="%s"' % opts.rg_library) - cl.append('RGPL="%s"' % opts.rg_platform) - cl.append('RGPU="%s"' % opts.rg_plat_unit) - cl.append('RGSM="%s"' % opts.rg_sample) - if opts.rg_id: - cl.append('RGID="%s"' % opts.rg_id) - # optional read 
groups - if opts.rg_seq_center: - cl.append('RGCN="%s"' % opts.rg_seq_center) - if opts.rg_desc: - cl.append('RGDS="%s"' % opts.rg_desc) - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'BamIndexStats': - tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir ) - tmp_bam_name = '%s.bam' % tmp_name - tmp_bai_name = '%s.bai' % tmp_bam_name - os.symlink( opts.input, tmp_bam_name ) - os.symlink( opts.bai_file, tmp_bai_name ) - cl.append('INPUT=%s' % ( tmp_bam_name )) - pic.delme.append(tmp_bam_name) - pic.delme.append(tmp_bai_name) - pic.delme.append(tmp_name) - s = pic.runPic( opts.jar, cl ) - f = open(pic.metricsOut,'a') - f.write(s) # got this on stdout from runCl - f.write('\n') - f.close() - doTranspose = False # but not transposed - - elif pic.picname == 'EstimateLibraryComplexity': - cl.append('I=%s' % opts.input) - cl.append('O=%s' % pic.metricsOut) - if float(opts.minid) > 0: - cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid) - if float(opts.maxdiff) > 0.0: - cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff) - if float(opts.minmeanq) > 0: - cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq) - if opts.readregex > '': - cl.append('READ_NAME_REGEX="%s"' % opts.readregex) - if float(opts.optdupdist) > 0: - cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) - pic.runPic(opts.jar,cl) - - elif pic.picname == 'CollectAlignmentSummaryMetrics': - # Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. - # why? Dunno Seems to work without complaining if the .bai file is AWOL.... - fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) - try: - os.symlink(ref_file_name,fakefasta) - except: - s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy' - info = s - shutil.copy(ref_file_name,fakefasta) - pic.delme.append(fakefasta) - cl.append('ASSUME_SORTED=%s' % opts.assumesorted) - adaptorseqs = ''.join([' ADAPTER_SEQUENCE=%s' % x for x in opts.adaptors]) - cl.append(adaptorseqs) - cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite) - cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert) - cl.append('OUTPUT=%s' % pic.metricsOut) - cl.append('R=%s' % fakefasta) - cl.append('TMP_DIR=%s' % opts.tmpdir) - if not opts.assumesorted.lower() == 'true': # we need to sort input - fakeinput = '%s.sorted' % opts.input - s = pic.sortSam(opts.input, fakeinput, opts.outdir) - pic.delme.append(fakeinput) - cl.append('INPUT=%s' % fakeinput) - else: - cl.append('INPUT=%s' % os.path.abspath(opts.input)) - pic.runPic(opts.jar,cl) - - - elif pic.picname == 'CollectGcBiasMetrics': - assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name - # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data. - # why? Dunno - fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name)) - try: - os.symlink(ref_file_name,fakefasta) - except: - s = '## unable to symlink %s to %s - different devices? 
May need to replace with shutil.copy' - info = s - shutil.copy(ref_file_name,fakefasta) - pic.delme.append(fakefasta) - x = 'rgPicardGCBiasMetrics' - pdfname = '%s.pdf' % x - jpgname = '%s.jpg' % x - tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out') - temppdf = os.path.join(opts.outdir,pdfname) - cl.append('R=%s' % fakefasta) - cl.append('WINDOW_SIZE=%s' % opts.windowsize) - cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac) - cl.append('INPUT=%s' % opts.input) - cl.append('OUTPUT=%s' % tempout) - cl.append('TMP_DIR=%s' % opts.tmpdir) - cl.append('CHART_OUTPUT=%s' % temppdf) - cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut) - pic.runPic(opts.jar,cl) - if os.path.isfile(temppdf): - cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find - s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) - else: - s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf - lf = open(pic.log_filename,'a') - lf.write(s) - lf.write('\n') - lf.close() - - elif pic.picname == 'CollectInsertSizeMetrics': - isPDF = 'InsertSizeHist.pdf' - pdfpath = os.path.join(opts.outdir,isPDF) - histpdf = 'InsertSizeHist.pdf' - cl.append('I=%s' % opts.input) - cl.append('O=%s' % pic.metricsOut) - cl.append('HISTOGRAM_FILE=%s' % histpdf) - if opts.taillimit <> '0': - cl.append('TAIL_LIMIT=%s' % opts.taillimit) - if opts.histwidth <> '0': - cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth) - if float( opts.minpct) > 0.0: - cl.append('MINIMUM_PCT=%s' % opts.minpct) - pic.runPic(opts.jar,cl) - if os.path.exists(pdfpath): # automake thumbnail - will be added to html - cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath] - s,stdouts = pic.runCL(cl=cl2,output_dir=opts.outdir) - else: - s = 'Unable to find expected pdf file %s
\n' % pdfpath - s += 'This always happens if single ended data was provided to this tool,\n' - s += 'so please double check that your input data really is paired-end NGS data.
\n' - s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n
' - stdouts = '' - logging.info(s) - if len(stdouts) > 0: - logging.info(stdouts) - - elif pic.picname == 'MarkDuplicates': - # assume sorted even if header says otherwise - cl.append('ASSUME_SORTED=%s' % (opts.assumesorted)) - # input - cl.append('INPUT=%s' % opts.input) - # outputs - cl.append('OUTPUT=%s' % opts.output) - cl.append('METRICS_FILE=%s' % pic.metricsOut ) - # remove or mark duplicates - cl.append('REMOVE_DUPLICATES=%s' % opts.remdups) - # the regular expression to be used to parse reads in incoming SAM file - cl.append('READ_NAME_REGEX="%s"' % opts.readregex) - # maximum offset between two duplicate clusters - cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist) - pic.runPic(opts.jar, cl) - - elif pic.picname == 'FixMateInformation': - cl.append('I=%s' % opts.input) - cl.append('O=%s' % tempout) - cl.append('SORT_ORDER=%s' % opts.sortorder) - pic.runPic(opts.jar,cl) - haveTempout = True - - elif pic.picname == 'ReorderSam': - # input - cl.append('INPUT=%s' % opts.input) - # output - cl.append('OUTPUT=%s' % tempout) - # reference - cl.append('REFERENCE=%s' % ref_file_name) - # incomplete dict concordance - if opts.allow_inc_dict_concord == 'true': - cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true') - # contig length discordance - if opts.allow_contig_len_discord == 'true': - cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true') - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'ReplaceSamHeader': - cl.append('INPUT=%s' % opts.input) - cl.append('OUTPUT=%s' % tempout) - cl.append('HEADER=%s' % opts.header_file) - pic.runPic(opts.jar, cl) - haveTempout = True - - elif pic.picname == 'CalculateHsMetrics': - maxloglines = 100 - baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait') - targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target') - baitf = pic.makePicInterval(opts.baitbed,baitfname) - if opts.targetbed == opts.baitbed: # same file sometimes - targetf = baitf - else: - targetf = pic.makePicInterval(opts.targetbed,targetfname) - cl.append('BAIT_INTERVALS=%s' % baitf) - cl.append('TARGET_INTERVALS=%s' % targetf) - cl.append('INPUT=%s' % os.path.abspath(opts.input)) - cl.append('OUTPUT=%s' % pic.metricsOut) - cl.append('TMP_DIR=%s' % opts.tmpdir) - pic.runPic(opts.jar,cl) - - elif pic.picname == 'ValidateSamFile': - import pysam - doTranspose = False - sortedfile = os.path.join(opts.outdir,'rgValidate.sorted') - stf = open(pic.log_filename,'w') - tlog = None - if opts.datatype == 'sam': # need to work with a bam - tlog,tempbam = pic.samToBam(opts.input,opts.outdir) - try: - tlog = pic.sortSam(tempbam,sortedfile,opts.outdir) - except: - print '## exception on sorting sam file %s' % opts.input - else: # is already bam - try: - tlog = pic.sortSam(opts.input,sortedfile,opts.outdir) - except: # bug - [bam_sort_core] not being ignored - TODO fixme - print '## exception on sorting bam file %s' % opts.input - if tlog: - print '##tlog=',tlog - stf.write(tlog) - stf.write('\n') - sortedfile = '%s.bam' % sortedfile # samtools does that - cl.append('O=%s' % pic.metricsOut) - cl.append('TMP_DIR=%s' % opts.tmpdir) - cl.append('I=%s' % sortedfile) - opts.maxerrors = '99999999' - cl.append('MAX_OUTPUT=%s' % opts.maxerrors) - if opts.ignoreflags[0] <> 'None': # picard error values to ignore - igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None'] - cl.append(' '.join(igs)) - if opts.bisulphite.lower() <> 'false': - cl.append('IS_BISULFITE_SEQUENCED=true') - if opts.ref <> None or opts.ref_file <> None: - cl.append('R=%s' % 
ref_file_name) - pic.runPic(opts.jar,cl) - if opts.datatype == 'sam': - pic.delme.append(tempbam) - newsam = opts.output - outformat = 'bam' - pe = open(pic.metricsOut,'r').readlines() - pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat) - pic.delme.append(sortedfile) # not wanted - stf.close() - pic.cleanup() - else: - print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname - sys.exit(1) - if haveTempout: - # Some Picard tools produced a potentially intermediate bam file. - # Either just move to final location or create sam - shutil.move(tempout, os.path.abspath(opts.output)) - - if opts.htmlout <> None or doFix: # return a pretty html page - pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines) - -if __name__=="__main__": __main__() - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardASMetrics.xml --- a/tools/picard/rgPicardASMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" - --assumesorted "$sorted" -b "$bisulphite" --adaptors "$adaptors" --maxinsert "$maxinsert" -n "$out_prefix" - -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectAlignmentSummaryMetrics.jar -#if $genomeSource.refGenomeSource == "history": - --ref-file "$genomeSource.ownFile" -#else - --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }" -#end if - - picard - - -.. class:: infomark - -**Summary** - -This Galaxy tool uses Picard to report high-level measures of alignment based on a provided sam or bam file. - -**Picard documentation** - -This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ----- - -.. class:: infomark - -**Syntax** - -- **Input** - SAM/BAM format aligned short read data in your current history -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices: - - - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options. - - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy. - - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references. - -- **Assume Sorted** - saves sorting time - but only if true!
-- **Bisulphite data** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics -- **Maximum acceptable insertion length** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -The Picard documentation (reformatted for Galaxy) says: - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","SAM or BAM file Required." - "OUTPUT=File","File to write insert size metrics to Required." - "REFERENCE_SEQUENCE=File","Reference sequence file Required." - "ASSUME_SORTED=Boolean","If true (default), unsorted SAM/BAM files will be considerd coordinate sorted " - "MAX_INSERT_SIZE=Integer","Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs. Default value: 100000." - "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times. " - "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false. " - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created." - -The output produced by the tool has the following columns:: - - 1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregeted for both first and second reads in a pair. - 2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters. - 3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter. - 4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS) - 5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirey of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis. - 6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous). - 7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS - 8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong. - 9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps. - 10. PF_HQ_ALIGNED_Q20_BASES: The subest of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher. - 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED READS). - 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads. - 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. 
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardFixMate.xml --- a/tools/picard/rgPicardFixMate.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ - - for paired data - - picard_wrapper.py -i "$input_file" -o "$out_file" --tmpdir "${__new_file_path__}" -n "$out_prefix" - --output-format "$outputFormat" -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/FixMateInformation.jar" --sortorder "$sortOrder" - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Ensure that all mate-pair information is in sync between each read and its mate pair. - -**Picard documentation** - -This is a Galaxy wrapper for FixMateInformation, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - -.. class:: warningmark - -**Useful for paired data only** - -This tool is unlikely to do anything helpful for single-end sequence data. -Currently, Galaxy doesn't distinguish paired from single-ended SAM/BAM, so make sure -the data you choose are valid (paired-end) SAM or BAM data - unless you trust this -tool not to harm your data. - ------ - -.. class:: infomark - -**Syntax** - -- **Input** - paired-read SAM/BAM format aligned short read data in your current history -- **Sort order** - can be used to adjust the ordering of reads -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Output Format** - either SAM or compressed as BAM - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","The input file to fix. This option may be specified 0 or more times." - "OUTPUT=File","The output file to write to" - "SORT_ORDER=SortOrder","Optional sort order if the OUTPUT file should be sorted differently than the INPUT file. Default value: null. Possible values: {unsorted, queryname, coordinate}" - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
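As intuition for what "in sync" means here, a toy Python sketch over the two tab-split SAM records of one pair (illustrative only; real data should go through Picard or a SAM-aware library)::

    def sync_mates(rec_a, rec_b):
        # Copy each read's RNAME (field 3) and POS (field 4) into its
        # mate's RNEXT (field 7) and PNEXT (field 8), per the SAM layout.
        for rec, mate in ((rec_a, rec_b), (rec_b, rec_a)):
            rec[6] = '=' if mate[2] == rec[2] else mate[2]
            rec[7] = mate[3]
        return rec_a, rec_b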
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardGCBiasMetrics.xml --- a/tools/picard/rgPicardGCBiasMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" - --windowsize "$windowsize" --mingenomefrac "$mingenomefrac" -n "$out_prefix" --tmpdir "${__new_file_path__}" - -j ${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectGcBiasMetrics.jar -#if $genomeSource.refGenomeSource == "history": - --ref-file "$genomeSource.ownFile" -#else: - --ref "${ filter( lambda x: str( x[0] ) == str( $genomeSource.index ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }" -#end if - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Summary** - -This Galaxy tool uses Picard to report detailed metrics about reads that fall within windows of a certain GC bin on the reference genome. - -**Picard documentation** - -This is a Galaxy wrapper for CollectGcBiasMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Syntax** - -- **Input** - SAM/BAM format aligned short read data in your current history -- **Title** - the title to use for all output files from this job - use it for high level metadata -- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices: - - - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not, use one of the following two options. - - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy. - - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is, however, not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references. - -- **Window Size** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics -- **Minimum Genome Fraction** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -The Picard documentation (reformatted for Galaxy) says: - -.. csv-table:: - :header-rows: 1 - - Option,Description - "REFERENCE_SEQUENCE=File","The reference sequence fasta file. Required." - "INPUT=File","The BAM or SAM file containing aligned reads. Required." - "OUTPUT=File","The text file to write the metrics table to. Required."
- "CHART_OUTPUT=File","The PDF file to render the chart to. Required." - "SUMMARY_OUTPUT=File","The text file to write summary metrics to. Default value: null." - "WINDOW_SIZE=Integer","The size of windows on the genome that are used to bin reads. Default value: 100." - "MINIMUM_GENOME_FRACTION=Double","For summary metrics, exclude GC windows that include less than this fraction of the genome. Default value: 1.0E-5." - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false." - -The output produced by the tool has the following columns:: - - 1. GC: The G+C content of the reference sequence represented by this bin. Values are from 0% to 100% - 2. WINDOWS: The number of windows on the reference genome that have this G+C content. - 3. READ_STARTS: The number of reads who's start position is at the start of a window of this GC. - 4. MEAN_BASE_QUALITY: The mean quality (determined via the error rate) of all bases of all reads that are assigned to windows of this GC. - 5. NORMALIZED_COVERAGE: The ration of "coverage" in this GC bin vs. the mean coverage of all GC bins. A number of 1 represents mean coverage, a number less than one represents lower than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than one represents higher than mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average). - 6. ERROR_BAR_WIDTH: The radius of error bars in this bin based on the number of observations made. For example if the normalized coverage is 0.75 and the error bar width is 0.1 then the error bars would be drawn from 0.65 to 0.85. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardHsMetrics.xml --- a/tools/picard/rgPicardHsMetrics.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ - - for targeted resequencing data - - - picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file" --datatype "$input_file.ext" - --baitbed "$bait_bed" --targetbed "$target_bed" -n "$out_prefix" --tmpdir "${__new_file_path__}" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CalculateHsMetrics.jar" - - - picard - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Summary** - -Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file. - -**Picard documentation** - -This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file. - -.. csv-table:: - :header-rows: 1 - - "Option", "Description" - "BAIT_INTERVALS=File","An interval list file that contains the locations of the baits used. Required." - "TARGET_INTERVALS=File","An interval list file that contains the locations of the targets. Required." - "INPUT=File","An aligned SAM or BAM file. Required." 
- "OUTPUT=File","The output file to write the metrics to. Required. Cannot be used in conjuction with option(s) METRICS_FILE (M)" - "METRICS_FILE=File","Legacy synonym for OUTPUT, should not be used. Required. Cannot be used in conjuction with option(s) OUTPUT (O)" - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false" - -HsMetrics - - The set of metrics captured that are specific to a hybrid selection analysis. - -Output Column Definitions:: - - 1. BAIT_SET: The name of the bait set used in the hybrid selection. - 2. GENOME_SIZE: The number of bases in the reference genome used for alignment. - 3. BAIT_TERRITORY: The number of bases which have one or more baits on top of them. - 4. TARGET_TERRITORY: The unique number of target bases in the experiment where target is usually exons etc. - 5. BAIT_DESIGN_EFFICIENCY: Target terrirtoy / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. - 6. TOTAL_READS: The total number of reads in the SAM or BAM file examine. - 7. PF_READS: The number of reads that pass the vendor's filter. - 8. PF_UNIQUE_READS: The number of PF reads that are not marked as duplicates. - 9. PCT_PF_READS: PF reads / total reads. The percent of reads passing filter. - 10. PCT_PF_UQ_READS: PF Unique Reads / Total Reads. - 11. PF_UQ_READS_ALIGNED: The number of PF unique reads that are aligned with mapping score > 0 to the reference genome. - 12. PCT_PF_UQ_READS_ALIGNED: PF Reads Aligned / PF Reads. - 13. PF_UQ_BASES_ALIGNED: The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. - 14. ON_BAIT_BASES: The number of PF aligned bases that mapped to a baited region of the genome. - 15. NEAR_BAIT_BASES: The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. - 16. OFF_BAIT_BASES: The number of PF aligned bases that mapped to neither on or near a bait. - 17. ON_TARGET_BASES: The number of PF aligned bases that mapped to a targetted region of the genome. - 18. PCT_SELECTED_BASES: On+Near Bait Bases / PF Bases Aligned. - 19. PCT_OFF_BAIT: The percentage of aligned PF bases that mapped neither on or near a bait. - 20. ON_BAIT_VS_SELECTED: The percentage of on+near bait bases that are on as opposed to near. - 21. MEAN_BAIT_COVERAGE: The mean coverage of all baits in the experiment. - 22. MEAN_TARGET_COVERAGE: The mean coverage of targets that recieved at least coverage depth = 2 at one base. - 23. PCT_USABLE_BASES_ON_BAIT: The number of aligned, de-duped, on-bait bases out of the PF bases available. - 24. PCT_USABLE_BASES_ON_TARGET: The number of aligned, de-duped, on-target bases out of the PF bases available. - 25. FOLD_ENRICHMENT: The fold by which the baited region has been amplified above genomic background. - 26. ZERO_CVG_TARGETS_PCT: The number of targets that did not reach coverage=2 over any base. - 27. FOLD_80_BASE_PENALTY: The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to the mean coverage level in those targets. - 28. PCT_TARGET_BASES_2X: The percentage of ALL target bases acheiving 2X or greater coverage. - 29. PCT_TARGET_BASES_10X: The percentage of ALL target bases acheiving 10X or greater coverage. - 30. PCT_TARGET_BASES_20X: The percentage of ALL target bases acheiving 20X or greater coverage. - 31. PCT_TARGET_BASES_30X: The percentage of ALL target bases acheiving 30X or greater coverage. - 32. 
- 32. HS_LIBRARY_SIZE: The estimated number of unique molecules in the selected part of the library. - 33. HS_PENALTY_10X: The "hybrid selection penalty" incurred to get 80% of target bases to 10X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 10X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 10 * HS_PENALTY_10X. - 34. HS_PENALTY_20X: The "hybrid selection penalty" incurred to get 80% of target bases to 20X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 20X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 20 * HS_PENALTY_20X. - 35. HS_PENALTY_30X: The "hybrid selection penalty" incurred to get 80% of target bases to 30X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 30X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 30 * HS_PENALTY_30X. - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardInsertSize.xml --- a/tools/picard/rgPicardInsertSize.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ - - for PAIRED data - picard - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --taillimit "$tailLimit" - --histwidth "$histWidth" --minpct "$minPct" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file" - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Reads a SAM or BAM file and describes the distribution -of insert size (excluding duplicates) with metrics and a histogram plot. - -**Picard documentation** - -This is a Galaxy wrapper for CollectInsertSizeMetrics, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - -.. class:: warningmark - -**Useful for paired data only** - -This tool works for paired data only and can be expected to fail for single end data. - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: - :header-rows: 1 - - Option,Description - "INPUT=File","SAM or BAM file Required." - "OUTPUT=File","File to write insert size metrics to Required." - "HISTOGRAM_FILE=File","File to write insert size histogram chart to Required." - "TAIL_LIMIT=Integer","When calculating mean and stdev stop when the bins in the tail of the distribution contain fewer than mode/TAIL_LIMIT items. This also limits how much data goes into each data category of the histogram." - "HISTOGRAM_WIDTH=Integer","Explicitly sets the histogram width, overriding the TAIL_LIMIT option. Also, when calculating mean and stdev, only bins LE HISTOGRAM_WIDTH will be included. " - "MINIMUM_PCT=Float","When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads. (Range: 0 to 1) Default value: 0.01." - "STOP_AFTER=Integer","Stop after processing N reads, mainly for debugging. Default value: 0." - "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
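As a rough illustration of the TAIL_LIMIT behaviour described in the table above (a sketch of the idea, not Picard's exact algorithm)::

    from collections import Counter

    def core_insert_sizes(sizes, tail_limit=10):
        # Drop histogram bins holding fewer than mode/TAIL_LIMIT items,
        # then keep only observations inside the surviving range; mean and
        # stdev would be computed over what remains.
        hist = Counter(sizes)
        cutoff = max(hist.values()) / float(tail_limit)
        kept = [size for size, count in hist.items() if count >= cutoff]
        low, high = min(kept), max(kept)
        return [s for s in sizes if low <= s <= high]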
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false." - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardLibComplexity.xml --- a/tools/picard/rgPicardLibComplexity.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --minid "$minIDbases" - --maxdiff "$maxDiff" --minmeanq "$minMeanQ" --readregex "$readRegex" --optdupdist "$optDupeDist" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/EstimateLibraryComplexity.jar" -d "$html_file.files_path" -t "$html_file" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Attempts to estimate library complexity from sequence alone. -Does so by sorting all reads by the first N bases (5 by default) of each read and then -comparing reads with the first N bases identical to each other for duplicates. Reads are considered to be -duplicates if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE (0.03 by default). - -Reads of poor quality are filtered out so as to provide a more accurate estimate. -The filtering removes reads with any no-calls in the first N bases or with a mean base quality lower than -MIN_MEAN_QUALITY across either the first or second read. - -The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the -calculation of library size. Also, since there is no alignment to screen out technical reads one -further filter is applied on the data. After examining all reads a histogram is built of -[#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are -then removed from the histogram as outliers before library size is estimated. - -**Picard documentation** - -This is a Galaxy wrapper for EstimateLibraryComplexity, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: - :header-rows: 1 - - Option Description - "INPUT=File","One or more files to combine and estimate library complexity from. Reads can be mapped or unmapped. This option may be specified 0 or more times." - "OUTPUT=File","Output file to writes per-library metrics to. Required." - "MIN_IDENTICAL_BASES=Integer","The minimum number of bases at the starts of reads that must be identical for reads to be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will be compared at a time, so lower numbers will produce more accurate results but consume exponentially more memory and CPU. Default value: 5." - "MAX_DIFF_RATE=Double","The maximum rate of differences between two reads to call them identical. Default value: 0.03. " - "MIN_MEAN_QUALITY=Integer","The minimum mean quality of the bases in a read pair for the read to be analyzed. 
-.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. - -.. class:: infomark - -**Note on the Regular Expression** - -(from the Picard docs) -This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. -These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. -The regular expression should contain three capture groups for the three variables, in order. -Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/picard/rgPicardMarkDups.xml --- a/tools/picard/rgPicardMarkDups.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ - - - picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" -o "$out_file" - --remdups "$remDups" --assumesorted "$assumeSorted" --readregex "$readRegex" --optdupdist "$optDupeDist" - -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/MarkDuplicates.jar" -d "$html_file.files_path" -t "$html_file" -e "$input_file.ext" - - picard - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**Purpose** - -Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them. - -**Picard documentation** - -This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_. - - .. _Picard-tools: http://www.google.com/search?q=picard+samtools - ------ - -.. class:: infomark - -**Inputs, outputs, and parameters** - -Picard documentation says (reformatted for Galaxy): - -.. csv-table:: Mark Duplicates docs - :header-rows: 1 - - Option,Description - "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required." - "OUTPUT=File","The output file to write marked records to Required." - "METRICS_FILE=File","File to write duplication metrics to Required."
- "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false." - "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false." - "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000." - "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk." - "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. " - "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100" - -.. class:: warningmark - -**Warning on SAM/BAM quality** - -Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT** -flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears -to be the only way to deal with SAM/BAM that cannot be parsed. -.. class:: infomark - -**Note on the Regular Expression** - -(from the Picard docs) -This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+). - -Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged unless the remove duplicates option is selected. In some cases you may want to do this, but please only do this if you really understand what you are doing. - - - - - - - - - - - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/bar_chart.py --- a/tools/plotting/bar_chart.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#!/usr/bin/env python - - -""" -histogram_gnuplot.py <ylabel> <yrange_min> <yrange_max> <grath_file> -a generic histogram builder based on gnuplot backend - - data_file - tab delimited file with data - xtic_column - column containing labels for x ticks [integer, 0 means no ticks] - column_list - comma separated list of columns to plot - title - title for the entire histrogram - ylabel - y axis label - yrange_max - minimal value at the y axis (integer) - yrange_max - maximal value at the y_axis (integer) - to set yrange to autoscaling assign 0 to yrange_min and yrange_max - graph_file - file to write histogram image to - img_size - as X,Y pair in pixels (e.g., 800,600 or 600,800 etc.) 
- -import Gnuplot, Gnuplot.funcutils -import sys, string, tempfile, os - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(tmpFileName): - skipped_lines_count = 0 - skipped_lines_index = [] - gf = open(tmpFileName, 'w') - - - try: - in_file = open( sys.argv[1], 'r' ) - xtic = int( sys.argv[2] ) - col_list = string.split( sys.argv[3],"," ) - title = 'set title "' + sys.argv[4] + '"' - ylabel = 'set ylabel "' + sys.argv[5] + '"' - ymin = sys.argv[6] - ymax = sys.argv[7] - img_file = sys.argv[8] - img_size = sys.argv[9] - except: - stop_err("Check arguments\n") - - try: - int( col_list[0] ) - except: - stop_err('You forgot to set columns for plotting\n') - - - for i, line in enumerate( in_file ): - valid = True - line = line.rstrip('\r\n') - if line and not line.startswith( '#' ): - row = [] - try: - fields = line.split( '\t' ) - for col in col_list: - row.append( str( float( fields[int( col )-1] ) ) ) - - except: - valid = False - skipped_lines_count += 1 - skipped_lines_index.append(i) - - else: - valid = False - skipped_lines_count += 1 - skipped_lines_index.append(i) - - if valid and xtic > 0: - row.append( fields[xtic-1] ) - elif valid and xtic == 0: - row.append( str( i ) ) - - if valid: - gf.write( '\t'.join( row ) ) - gf.write( '\n' ) - - if skipped_lines_count < i: - - #prepare 'using' clause of plot statement - - g_plot_command = ' ' - - #set the first column - if xtic > 0: - g_plot_command = "'%s' using 1:xticlabels(%s) ti 'Column %s', " % ( tmpFileName, str( len( row ) ), col_list[0] ) - else: - g_plot_command = "'%s' using 1 ti 'Column %s', " % ( tmpFileName, col_list[0] ) - - #set subsequent columns - - for i in range(1,len(col_list)): - g_plot_command += "'%s' using %s t 'Column %s', " % ( tmpFileName, str( i+1 ), col_list[i] ) - - g_plot_command = g_plot_command.rstrip( ', ' ) - - yrange = 'set yrange [' + ymin + ":" + ymax + ']' - - try: - g = Gnuplot.Gnuplot() - g('reset') - g('set boxwidth 0.9 absolute') - g('set style fill solid 1.00 border -1') - g('set style histogram clustered gap 5 title offset character 0, 0, 0') - g('set xtics border in scale 1,0.5 nomirror rotate by 90 offset character 0, 0, 0') - g('set key invert reverse Left outside') - if xtic == 0: g('unset xtics') - g(title) - g(ylabel) - g_term = 'set terminal png tiny size ' + img_size - g(g_term) - g_out = 'set output "' + img_file + '"' - if ymin != ymax: - g(yrange) - g(g_out) - g('set style data histograms') - g.plot(g_plot_command) - except: - stop_err("Gnuplot error: Data cannot be plotted") - else: - sys.stderr.write('Column(s) %s of your dataset do not contain valid numeric data' %sys.argv[3] ) - - if skipped_lines_count > 0: - sys.stdout.write('\nWARNING. Your dataset contains %d invalid line(s) starting with line #%d. 
These lines were skipped while building the graph.\n' % ( skipped_lines_count, skipped_lines_index[0]+1 ) ) - - -if __name__ == "__main__": - # The tempfile initialization is here because while inside the main() it seems to create a condition - # when the file is removed before gnuplot has a chance of accessing it - gp_data_file = tempfile.NamedTemporaryFile('w') - Gnuplot.gp.GnuplotOpts.default_term = 'png' - main(gp_data_file.name) - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/bar_chart.xml --- a/tools/plotting/bar_chart.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -<tool id="barchart_gnuplot" name="Bar chart"> - <description>for multiple columns</description> - <command interpreter="python"> - #if $xtic.userSpecified == "Yes" #bar_chart.py $input $xtic.xticColumn $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" - #else #bar_chart.py $input 0 $colList "$title" "$ylabel" $ymin $ymax $out_file1 "$pdf_size" - #end if - </command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <conditional name="xtic"> - <param name="userSpecified" type="select" label="Use X Tick labels?" help="see example below"> - <option value="Yes">Yes</option> - <option value="No">No</option> - </param> - <when value="Yes"> - <param name="xticColumn" type="data_column" data_ref="input" numerical="False" label="Use this column for X Tick labels" /> - </when> - <when value="No"> - </when> - </conditional> - <param name="colList" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /> - <param name="title" type="text" size="30" value="Bar Chart" label="Plot title"/> - <param name="ylabel" type="text" size="30" value="V1" label="Label for Y axis"/> - <param name="ymin" type="integer" size="4" value="0" label="Minimal value on Y axis" help="set to 0 for autoscaling"/> - <param name="ymax" type="integer" size="4" value="0" label="Maximal value on Y axis" help="set to 0 for autoscaling"/> - <param name="pdf_size" type="select" label="Choose chart size (pixels)"> - <option value="800,600">Normal: 800 by 600</option> - <option value="640,480">Small: 640 by 480</option> - <option value="1480,800">Large: 1480 by 800</option> - <option value="600,800">Normal Flipped: 600 by 800</option> - <option value="480,640">Small Flipped: 480 by 640</option> - <option value="800,1480">Large Flipped: 800 by 1480</option> - </param> - </inputs> - <outputs> - <data format="png" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">Gnuplot</requirement> - <requirement type="python-module">Numeric</requirement> - </requirements> - <help> - -**What it does** - -This tool builds a bar chart on one or more columns. Suppose you have dataset like this one:: - - Gene1 10 15 - Gene2 20 14 - Gene3 67 45 - Gene4 55 12 - -Graphing columns 2 and 3 while using column 1 for X Tick Labels will produce the following plot: - -.. 
image:: ./static/images/bar_chart.png - :height: 324 - :width: 540 - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/boxplot.xml --- a/tools/plotting/boxplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ -<tool id="qual_stats_boxplot" name="Boxplot" version="1.0.0"> - <description>of quality statistics</description> - <command>gnuplot < '$gnuplot_commands' 2>&1 || echo "Error running gnuplot." >&2</command> - <requirements> - <requirement type="binary" version="gnuplot 4.2 patchlevel 2">gnuplot</requirement> - </requirements> - <inputs> - <param name="input_file" type="data" format="tabular" label="Quality Statistics File"/> - <param name="title" type="text" value="Box plot in Galaxy" label="Title for plot" size="50"/> - <param name="graph_size" type="text" value="2048,768" label="Dimensions of Graph"/> - <param name="xlabel" type="text" value="X Axis Label" label="X axis label" size="50"/> - <param name="ylabel" type="text" value="Score Value" label="Y axis label" size="50"/> - <param name="xcol" type="data_column" data_ref="input_file" label="Column for X axis position" default_value="1" help="A unique number; c1 if plotting output of FASTQ summary"/> - <param name="q1col" type="data_column" data_ref="input_file" label="Column for Q1" default_value="7" help="c7 if plotting output of FASTQ summary"/> - <param name="medcol" type="data_column" data_ref="input_file" label="Column for Median" default_value="8" help="c8 if plotting output of FASTQ summary"/> - <param name="q3col" type="data_column" data_ref="input_file" label="Column for Q3" default_value="9" help="c9 if plotting output of FASTQ summary"/> - <param name="lwcol" type="data_column" data_ref="input_file" label="Column for left whisker" default_value="11" help="c11 if plotting output of FASTQ summary"/> - <param name="rwcol" type="data_column" data_ref="input_file" label="Column for right whisker" default_value="12" help="c12 if plotting output of FASTQ summary"/> - <conditional name="use_outliers"> - <param name="use_outliers_type" type="select" label="Plot Outliers"> - <option value="use_outliers" selected="true">Plot Outliers</option> - <option value="dont_use_outliers">Don't Plot Outliers</option> - </param> - <when value="use_outliers"> - <param name="outliercol" type="data_column" data_ref="input_file" label="Column for Outliers" default_value="13" help="c13 if plotting output of FASTQ summary"/> - </when> - <when value="dont_use_outliers"> - </when> - </conditional> - </inputs> - <configfiles> - <configfile name="gnuplot_commands"> -set output '$output_file' -set term png size ${graph_size} -set boxwidth 0.8 -set key right tmargin -set xlabel "${xlabel}" -set ylabel "${ylabel}" -set title "${title}" -set xtics 1 -set ytics 1 -set grid ytics -set offsets 1, 1, 1, 1 -plot '${input_file}' using ${xcol}:${q1col}:${lwcol}:${rwcol}:${q3col} with candlesticks lt 1 lw 1 title 'Quartiles' whiskerbars, \ - '' using ${xcol}:${medcol}:${medcol}:${medcol}:${medcol} with candlesticks lt -1 lw 2 title 'Medians'\ -#if str( $use_outliers['use_outliers_type'] ) == 'use_outliers': -, "< python -c \"for xval, yvals in [ ( fields[${xcol} - 1], fields[${use_outliers['outliercol']} - 1].split( ',' ) ) for fields in [ line.rstrip( '\\n\\r' ).split( '\\t' ) for line in open( '${input_file}' ) if not line.startswith( '#' ) ] if len( fields ) > max( ${xcol} - 1, ${use_outliers['outliercol']} - 1 ) ]: print '\\n'.join( [ '%s\\t%s' % ( xval, yval ) for yval in yvals if yval ] 
)\"" using 1:2 with points pt 29 title 'Outliers' -#end if - </configfile> - </configfiles> - <outputs> - <data name="output_file" format="png" /> - </outputs> - <tests> - <test> - <param name="input_file" value="fastq_stats_1_out.tabular" ftype="tabular" /> - <param name="title" value="Boxplot of Summary Statistics for Sanger Reads" /> - <param name="graph_size" value="2048,768" /> - <param name="xlabel" value="Read Column" /> - <param name="ylabel" value="Quality Score Value" /> - <param name="xcol" value="1" /> - <param name="q1col" value="7" /> - <param name="medcol" value="8" /> - <param name="q3col" value="9" /> - <param name="lwcol" value="11" /> - <param name="rwcol" value="12" /> - <param name="use_outliers_type" value="use_outliers" /> - <param name="outliercol" value="13" /> - <output name="output_file" file="boxplot_summary_statistics_out.png" /> - </test> - </tests> - <help> - -**What it does** - -Creates a boxplot graph. Its main purpose is to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* tool. - -.. class:: warningmark - -**TIP:** If you want to display a distribution of quality scores produced by *NGS: QC and maniupulation -> FASTQ Summary Statistics* and the column assignments within the tool's interface are not automatically set (they will all read "c1" in that case) set columns manually to the following values:: - - Column for X axis c1 - Column for Q1 c7 - Column for Median c8 - Column for Q3 c9 - Column for left whisker c11 - Column for right whisker c12 - Column for Outliers c13 - ------ - -**Output Example** - -* Black horizontal lines are medians -* Rectangular red boxes show the Inter-quartile Range (IQR) (top value is Q3, bottom value is Q1) -* Whiskers show outliers at max. 1.5*IQR - -.. image:: ./static/images/solid_qual.png - - - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/histogram.py --- a/tools/plotting/histogram.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - # Handle input params - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - column = int( sys.argv[3] ) - 1 - except: - stop_err( "Column not specified, your query does not contain a column of numerical data." 
) - title = sys.argv[4] - xlab = sys.argv[5] - breaks = int( sys.argv[6] ) - if breaks == 0: - breaks = "Sturges" - if sys.argv[7] == "true": - density = True - else: density = False - if len( sys.argv ) >= 9 and sys.argv[8] == "true": - frequency = True - else: frequency = False - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip('\r\n') - # Skip comments - if line and not line.startswith( '#' ): - # Extract values and convert to floats - row = [] - try: - fields = line.split( "\t" ) - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - else: - try: - row.append( float( val ) ) - except ValueError: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - invalid_value = fields[column] - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix += row - - if skipped_lines < i: - try: - a = r.array( matrix ) - r.pdf( out_fname, 8, 8 ) - histogram = r.hist( a, probability=not frequency, main=title, xlab=xlab, breaks=breaks ) - if density: - density = r.density( a ) - if frequency: - scale_factor = len( matrix ) * ( histogram['mids'][1] - histogram['mids'][0] ) #uniform bandwidth taken from first 2 midpoints - density[ 'y' ] = map( lambda x: x * scale_factor, density[ 'y' ] ) - r.lines( density ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - if i == 0: - stop_err("Input dataset is empty.") - else: - stop_err( "All values in column %s are non-numeric." %sys.argv[3] ) - - print "Histogram of column %s. " %sys.argv[3] - if skipped_lines > 0: - print "Skipped %d invalid lines starting with line #%d, '%s'." % ( skipped_lines, first_invalid_line, invalid_value ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/histogram2.xml --- a/tools/plotting/histogram2.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,76 +0,0 @@ -<tool id="histogram_rpy" name="Histogram" version="1.0.3"> - <description>of a numeric column</description> - <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? 
See TIP below"/> - <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/> - <param name="title" type="text" size="30" value="Histogram" label="Plot title"/> - <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/> - <param name="density" type="boolean" checked="yes" label="Include smoothed density"/> - <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input" value="histogram_in1.tabular" ftype="tabular"/> - <param name="numerical_column" value="2"/> - <param name="breaks" value="0"/> - <param name="title" value="Histogram"/> - <param name="xlab" value="V1"/> - <param name="density" value="true"/> - <param name="frequency" value="false"/> - <output name="out_file1" file="histogram_out1.pdf"/> - </test> - </tests> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <help> - -.. class:: infomark - -**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation->Remove beginning* - - .. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool computes a histogram of the numerical values in a column of a dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. -- **Column for x axis** - only numerical columns are possible. -- **Number of breaks(bars)** - breakpoints between histogram cells. Value of '0' will determine breaks automatically. -- **Plot title** - the histogram title. -- **Label for x axis** - the label of the x axis for the histogram. -- **Include smoothed density** - if checked, the resulting graph will join the given corresponding points with line segments. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a histogram on column 2 of the above dataset. - -.. 
image:: ./static/images/histogram2.png - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/plot_filter.py --- a/tools/plotting/plot_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ - -def validate(incoming): - """Validator for the plotting program""" - - bins = incoming.get("bins","") - col = incoming.get("col","") - - if not bins or not col: - raise Exception, "You need to specify a number for bins and columns" - - try: - bins = int(bins) - col = int(col) - except: - raise Exception, "Parameters are not valid numbers, columns:%s, bins:%s" % (col, bins) - - if not 1<bins<100: - raise Exception, "The number of bins %s must be a number between 1 and 100" % bins - diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/plotter.py --- a/tools/plotting/plotter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -# python histogram input_file output_file column bins -import sys, os -import matplotlib; matplotlib.use('Agg') - -from pylab import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -if __name__ == '__main__': - # parse the arguments - - if len(sys.argv) != 6: - stop_err('Usage: python histogram.py input_file column bins output_file style') - sys.exit() - - mode = sys.argv[5] - HIST = mode == 'hist' - try: - col = int(float(sys.argv[2])) - if HIST: - bin = int(float(sys.argv[3])) - else: - # hack, this parameter is the plotting style for scatter plots - if sys.argv[3] == 'P': - style = 'o' - elif sys.argv[3] == 'LP': - style = 'o-' - else: - style = '-' - - except: - msg = 'Parameter were not numbers %s, %s' % (sys.argv[3], sys.argv[4]) - stop_err(msg) - - # validate arguments - inp_file = sys.argv[1] - out_file = sys.argv[4] - - if HIST: - print "Histogram on column %s (%s bins)" % (col, bin) - else: - print "Scatterplot on column %s" % (col) - - xcol= col -1 - # read the file - values = [] - try: - count = 0 - for line in file(inp_file): - count += 1 - line = line.strip() - if line and line[0] != '#': - values.append(float(line.split()[xcol])) - except Exception, e: - stop_err('%s' % e) - stop_err("Non numerical data at line %d, column %d" % (count, col) ) - - # plot the data - - if HIST: - n, bins, patches = hist(values, bins=bin, normed=0) - else: - plot(values, style) - - xlabel('values') - ylabel('counts') - - if HIST: - title('Histogram of values over column %s (%s bins)' % (col, len(bins)) ) - else: - title('Scatterplot over column %s' % col ) - grid(True) - - # the plotter detects types by file extension - png_out = out_file + '.png' # force it to png - savefig(png_out) - - # shuffle it back and clean up - data = file(png_out, 'rb').read() - fp = open(out_file, 'wb') - fp.write(data) - fp.close() - os.remove(png_out) diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/r_wrapper.sh --- a/tools/plotting/r_wrapper.sh Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -#!/bin/sh - -### Run R providing the R script in $1 as standard input and passing -### the remaining arguments on the command line - -# Function that writes a message to stderr and exits -function fail -{ - echo "$@" >&2 - exit 1 -} - -# Ensure R executable is found -which R > /dev/null || fail "'R' is required by this tool but was not found on path" - -# Extract first argument -infile=$1; shift - -# Ensure the file exists -test -f $infile || fail "R input file '$infile' does not 
exist" - -# Invoke R passing file named by first argument to stdin -R --vanilla --slave $* < $infile diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/scatterplot.py --- a/tools/plotting/scatterplot.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,79 +0,0 @@ -#!/usr/bin/env python -#Greg Von Kuster - -import sys -from rpy import * - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def main(): - - in_fname = sys.argv[1] - out_fname = sys.argv[2] - try: - columns = int( sys.argv[3] ) - 1, int( sys.argv[4] ) - 1 - except: - stop_err( "Columns not specified, your query does not contain a column of numerical data." ) - title = sys.argv[5] - xlab = sys.argv[6] - ylab = sys.argv[7] - - matrix = [] - skipped_lines = 0 - first_invalid_line = 0 - invalid_value = '' - invalid_column = 0 - i = 0 - for i, line in enumerate( file( in_fname ) ): - valid = True - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - row = [] - fields = line.split( "\t" ) - for column in columns: - try: - val = fields[column] - if val.lower() == "na": - row.append( float( "nan" ) ) - else: - row.append( float( fields[column] ) ) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i + 1 - try: - invalid_value = fields[column] - except: - invalid_value = '' - invalid_column = column + 1 - break - else: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = i+1 - - if valid: - matrix.append( row ) - - if skipped_lines < i: - try: - r.pdf( out_fname, 8, 8 ) - r.plot( array( matrix ), type="p", main=title, xlab=xlab, ylab=ylab, col="blue", pch=19 ) - r.dev_off() - except Exception, exc: - stop_err( "%s" %str( exc ) ) - else: - stop_err( "All values in both columns %s and %s are non-numeric or empty." % ( sys.argv[3], sys.argv[4] ) ) - - print "Scatter plot on columns %s, %s. " % ( sys.argv[3], sys.argv[4] ) - if skipped_lines > 0: - print "Skipped %d lines starting with line #%d, value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column ) - - r.quit( save="no" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/scatterplot.xml --- a/tools/plotting/scatterplot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -<tool id="scatterplot_rpy" name="Scatterplot"> - <description>of two numeric columns</description> - <command interpreter="python">scatterplot.py $input $out_file1 $col1 $col2 "$title" "$xlab" "$ylab"</command> - <inputs> - <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/> - <param name="col1" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" /> - <param name="col2" type="data_column" data_ref="input" numerical="True" label="Numerical column for y axis" /> - <param name="title" size="30" type="text" value="Scatterplot" label="Plot title"/> - <param name="xlab" size="30" type="text" value="V1" label="Label for x axis"/> - <param name="ylab" size="30" type="text" value="V2" label="Label for y axis"/> - </inputs> - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <!-- TODO: uncomment the following test when we have tools.update_state() working for - multiple dependents with the same dependency. 
- <tests> - <test> - <param name="input" value="scatterplot_in1.tabular" ftype="tabular"/> - <param name="col1" value="2"/> - <param name="col2" value="3"/> - <param name="title" value="Scatterplot"/> - <param name="xlab" value="V1"/> - <param name="ylab" value="V2"/> - <output name="out_file1" file="scatterplot_out1.pdf" /> - </test> - </tests> - --> - <help> - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool creates a simple scatter plot between two variables containing numeric values of a selected dataset. - -- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item. - -- **Plot title** The scatterplot title -- **Label for x axis** and **Label for y axis** The labels for x and y axis of the scatterplot. - ------ - -**Example** - -- Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -- Create a simple scatterplot between the variables in column 2 and column 3 of the above dataset. - -.. image:: ./static/images/scatterplot.png - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/plotting/xy_plot.xml --- a/tools/plotting/xy_plot.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -<tool id="XY_Plot_1" name="Plotting tool" version="1.0.1"> - <description>for multiple series and graph types</description> - <command interpreter="bash">r_wrapper.sh $script_file</command> - - <inputs> - <param name="main" type="text" value="" size="30" label="Plot Title"/> - <param name="xlab" type="text" value="" size="30" label="Label for x axis"/> - <param name="ylab" type="text" value="" size="30" label="Label for y axis"/> - <repeat name="series" title="Series"> - <param name="input" type="data" format="tabular" label="Dataset"/> - <param name="xcol" type="data_column" data_ref="input" label="Column for x axis"/> - <param name="ycol" type="data_column" data_ref="input" label="Column for y axis"/> - <conditional name="series_type"> - <param name="type" type="select" label="Series Type"> - <option value="line" selected="true">Line</option> - <option value="points">Points</option> - </param> - <when value="line"> - <param name="lty" type="select" label="Line Type"> - <option value="1">Solid</option> - <option value="2">Dashed</option> - <option value="3">Dotted</option> - </param> - <param name="col" type="select" label="Line Color"> - <option value="1">Black</option> - <option value="2">Red</option> - <option value="3">Green</option> - <option value="4">Blue</option> - <option value="5">Cyan</option> - <option value="6">Magenta</option> - <option value="7">Yellow</option> - <option value="8">Gray</option> - </param> - <param name="lwd" type="float" label="Line Width" value="1.0"/> - </when> - <when value="points"> - <param name="pch" type="select" label="Point Type"> - <option value="1">Circle (hollow)</option> - <option value="2">Triangle (hollow)</option> - <option value="3">Cross</option> - <option value="4">Diamond (hollow)</option> - <option value="15">Square (filled)</option> - <option value="16">Circle (filled)</option> - <option value="17">Triangle (filled)</option> - </param> - <param name="col" type="select" label="Point Color"> - <option value="1">Black</option> - <option value="2">Red</option> - <option value="3">Green</option> - <option value="4">Blue</option> - <option value="5">Cyan</option> - 
<option value="6">Magenta</option> - <option value="7">Yellow</option> - <option value="8">Gray</option> - </param> - <param name="cex" type="float" label="Point Scale" value="1.0"/> - </when> - </conditional> - </repeat> - </inputs> - - <configfiles> - <configfile name="script_file"> - ## Setup R error handling to go to stderr - options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) - ## Determine range of all series in the plot - xrange = c( NULL, NULL ) - yrange = c( NULL, NULL ) - #for $i, $s in enumerate( $series ) - s${i} = read.table( "${s.input.file_name}" ) - x${i} = s${i}[,${s.xcol}] - y${i} = s${i}[,${s.ycol}] - xrange = range( x${i}, xrange ) - yrange = range( y${i}, yrange ) - #end for - ## Open output PDF file - pdf( "${out_file1}" ) - ## Dummy plot for axis / labels - plot( NULL, type="n", xlim=xrange, ylim=yrange, main="${main}", xlab="${xlab}", ylab="${ylab}" ) - ## Plot each series - #for $i, $s in enumerate( $series ) - #if $s.series_type['type'] == "line" - lines( x${i}, y${i}, lty=${s.series_type.lty}, lwd=${s.series_type.lwd}, col=${s.series_type.col} ) - #elif $s.series_type.type == "points" - points( x${i}, y${i}, pch=${s.series_type.pch}, cex=${s.series_type.cex}, col=${s.series_type.col} ) - #end if - #end for - ## Close the PDF file - devname = dev.off() - </configfile> - </configfiles> - - <outputs> - <data format="pdf" name="out_file1" /> - </outputs> - - <tests> - <test> - <param name="main" value="Example XY Plot"/> - <param name="xlab" value="Column 1"/> - <param name="ylab" value="Column 2"/> - <param name="input" value="2.tabular" ftype="tabular"/> - <param name="xcol" value="1"/> - <param name="ycol" value="2"/> - <param name="type" value="line"/> - <param name="lty" value="2"/> - <param name="col" value="2"/> - <param name="lwd" value="1.0"/> - <output name="out_file1" file="XY_Plot_1_out.pdf"/> - </test> - </tests> -<help> -.. class:: infomark - -This tool allows you to plot values contained in columns of a dataset against each other and also allows you to have different series corresponding to the same or different datasets in one plot. - ------ - -.. class:: warningmark - -This tool throws an error if the columns selected for plotting are absent or are not numeric and also if the lengths of these columns differ. - ------ - -**Example** - -Input file:: - - 1 68 4.1 - 2 71 4.6 - 3 62 3.8 - 4 75 4.4 - 5 58 3.2 - 6 60 3.1 - 7 67 3.8 - 8 68 4.1 - 9 71 4.3 - 10 69 3.7 - -Create a two series XY plot on the above data: - -- Series 1: Red Dashed-Line plot between columns 1 and 2 -- Series 2: Blue Circular-Point plot between columns 3 and 2 - -.. 
image:: ./static/images/xy_example.jpg -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/best_regression_subsets.py --- a/tools/regVariation/best_regression_subsets.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs - -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] -outfile2 = sys.argv[5] -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') - -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.split("\t") - try: - yval = float(fields[y_col]) - except Exception, ey: - yval = r('NA') - y_vals.append(yval) - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except Exception, ex: - xval = r('NA') - x_vals[k].append(xval) - except: - pass - -response_term = "" - -x_vals1 = numpy.asarray(x_vals).transpose() - -dat= r.list(x=array(x_vals1), y=y_vals) - -r.library("leaps") - -set_default_mode(NO_CONVERSION) -try: - leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat)) -except RException, rex: - stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") -set_default_mode(BASIC_CONVERSION) - -summary = r.summary(leaps) -tot = len(x_vals) -pattern = "[" -for i in range(tot): - pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' ' -pattern = pattern.strip() + ']' -print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern) -for ind,item in enumerate(summary['outmat']): - print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind]) - - -r.pdf( outfile2, 8, 8 ) -r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion") -r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion") -r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion") -r.plot(leaps, scale="bic", main="Best subsets using bic Criterion") - -r.dev_off() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/best_regression_subsets.xml --- a/tools/regVariation/best_regression_subsets.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ -<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression"> - <description> </description> - <command interpreter="python"> - best_regression_subsets.py - $input1 - $response_col - $predictor_cols - $out_file1 - $out_file2 - 1>/dev/null - 2>/dev/null - </command> - <inputs> - <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? 
See TIP below."/>
- <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
- <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" >
- <validator type="no_options" message="Please select at least one column."/>
- </param>
- </inputs>
- <outputs>
- <data format="input" name="out_file1" metadata_source="input1" />
- <data format="pdf" name="out_file2" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <tests>
- <!-- Testing this tool will not be possible because this tool produces a pdf output file.
- -->
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters*
-
------
-
-.. class:: infomark
-
-**What it does**
-
-This tool uses the 'regsubsets' function from the R package 'leaps' for regression subset selection. It outputs two files: one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-- This tool currently treats all predictor and response variables as continuous variables.
-
-- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
-
-- The 6 columns in the output are described below:
-
- - Column 1 (Vars): denotes the number of variables in the model
- - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
- - Column 3 (R-sq): the fraction of variance explained by the model
- - Column 4 (Adj. R-sq): the above R-squared statistic, adjusted to penalize a higher number of predictors (p)
- - Column 5 (Cp): Mallows' Cp statistic
- - Column 6 (bic): Bayesian Information Criterion.
-
-
- </help>
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/categorize_elements_satisfying_criteria.pl
--- a/tools/regVariation/categorize_elements_satisfying_criteria.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,172 +0,0 @@
-#!/usr/bin/perl -w
-
-# The program takes as input a set of categories, such that each category contains many elements.
-# It also takes a table relating elements with criteria, such that each element is assigned a number
-# representing the number of times the element satisfies a certain criterion.
-# The first input is a TABULAR format file, such that the left column represents the names of categories and
-# all other columns represent the names of elements.
-# The second input is a TABULAR format file relating elements with criteria, such that the first line
-# represents the names of criteria and the left column represents the names of elements.
-# The output is a TABULAR format file relating categories with criteria, such that each category is
-# assigned a number representing the total number of times its elements satisfy a certain criterion.
-# Each category is assigned as many numbers as criteria.
- -use strict; -use warnings; - -#variables to handle information of the categories input file -my @categoryElementsArray = (); -my @categoriesArray = (); -my $categoryMemberNames; -my $categoryName; -my %categoryMembersHash = (); -my $memberNumber = 0; -my $totalMembersNumber = 0; -my $totalCategoriesNumber = 0; -my @categoryCountersTwoDimArray = (); -my $lineCounter1 = 0; - -#variables to handle information of the criteria and elements data input file -my $elementLine; -my @elementDataArray = (); -my $elementName; -my @criteriaArray = (); -my $criteriaNumber = 0; -my $totalCriteriaNumber = 0; -my $lineCounter2 = 0; - -#variable representing the row and column indices used to store results into a two-dimensional array -my $row = 0; -my $column = 0; - -# check to make sure having correct files -my $usage = "usage: categorize_motifs_significance.pl [TABULAR.in] [TABULAR.in] [TABULAR.out] \n"; -die $usage unless @ARGV == 3; - -#get the categories input file -my $categories_inputFile = $ARGV[0]; - -#get the criteria and data input file -my $elements_data_inputFile = $ARGV[1]; - -#get the output file -my $categorized_data_outputFile = $ARGV[2]; - -#open the input and output files -open (INPUT1, "<", $categories_inputFile) || die("Could not open file $categories_inputFile \n"); -open (INPUT2, "<", $elements_data_inputFile ) || die("Could not open file $elements_data_inputFile \n"); -open (OUTPUT, ">", $categorized_data_outputFile) || die("Could not open file $categorized_data_outputFile \n"); - -#store the first input file into an array -my @categoriesData = <INPUT1>; - -#reset the value of $lineCounter1 to 0 -$lineCounter1 = 0; - -#iterate through the first input file to get the names of categories and their corresponding elements -foreach $categoryMemberNames (@categoriesData){ - chomp ($categoryMemberNames); - - @categoryElementsArray = split(/\t/, $categoryMemberNames); - - #store the name of the current category into an array - $categoriesArray [$lineCounter1] = $categoryElementsArray[0]; - - #store the name of the current category into a two-dimensional array - $categoryCountersTwoDimArray [$lineCounter1] [0] = $categoryElementsArray[0]; - - #get the total number of elements in the current category - $totalMembersNumber = @categoryElementsArray; - - #store the names of categories and their corresponding elements into a hash - for ($memberNumber = 1; $memberNumber < $totalMembersNumber; $memberNumber++) { - - $categoryMembersHash{$categoryElementsArray[$memberNumber]} = $categoriesArray[$lineCounter1]; - } - - $lineCounter1++; -} - -#store the second input file into an array -my @elementsData = <INPUT2>; - -#reset the value of $lineCounter2 to 0 -$lineCounter2 = 0; - -#iterate through the second input file in order to count the number of elements -#in each category that satisfy each criterion -foreach $elementLine (@elementsData){ - chomp ($elementLine); - - $lineCounter2++; - - @elementDataArray = split(/\t/, $elementLine); - - #if at the first line, get the total number of criteria and the total - #number of catergories and initialize the two-dimensional array - if ($lineCounter2 == 1){ - @criteriaArray = @elementDataArray; - $totalCriteriaNumber = @elementDataArray; - - $totalCategoriesNumber = @categoriesArray; - - #initialize the two-dimensional array - for ($row = 0; $row < $totalCategoriesNumber; $row++) { - - for ($column = 1; $column <= $totalCriteriaNumber; $column++) { - - $categoryCountersTwoDimArray [$row][$column] = 0; - } - } - } - else{ - #get the element data - 
$elementName = $elementDataArray[0];
-
- #do the counting and store the result in the two-dimensional array
- for ($criteriaNumber = 0; $criteriaNumber < $totalCriteriaNumber; $criteriaNumber++) {
-
- if ($elementDataArray[$criteriaNumber + 1] > 0){
-
- $categoryName = $categoryMembersHash{$elementName};
-
- my ($categoryIndex) = grep $categoriesArray[$_] eq $categoryName, 0 .. $#categoriesArray;
-
- $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] = $categoryCountersTwoDimArray [$categoryIndex] [$criteriaNumber + 1] + $elementDataArray[$criteriaNumber + 1];
- }
- }
- }
-}
-
-print OUTPUT "\t";
-
-#store the criteria names into the output file
-for ($column = 1; $column <= $totalCriteriaNumber; $column++) {
-
- if ($column < $totalCriteriaNumber){
- print OUTPUT $criteriaArray[$column - 1] . "\t";
- }
- else{
- print OUTPUT $criteriaArray[$column - 1] . "\n";
- }
-}
-
-#store the category names and their corresponding number of elements satisfying criteria into the output file
-for ($row = 0; $row < $totalCategoriesNumber; $row++) {
-
- for ($column = 0; $column <= $totalCriteriaNumber; $column++) {
-
- if ($column < $totalCriteriaNumber){
- print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\t";
- }
- else{
- print OUTPUT $categoryCountersTwoDimArray [$row][$column] . "\n";
- }
- }
-}
-
-#close the input and output files
-close(OUTPUT);
-close(INPUT2);
-close(INPUT1);
-
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/categorize_elements_satisfying_criteria.xml
--- a/tools/regVariation/categorize_elements_satisfying_criteria.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-<tool id="categorize_elements_satisfying_criteria" name="Categorize Elements" version="1.0.0">
- <description>satisfying criteria</description>
-
- <command interpreter="perl">
- categorize_elements_satisfying_criteria.pl $inputFile1 $inputFile2 $outputFile1
- </command>
-
- <inputs>
- <param format="tabular" name="inputFile1" type="data" label="Select file containing categories and their elements"/>
- <param format="tabular" name="inputFile2" type="data" label="Select file containing criteria and elements data"/>
- </inputs>
-
- <outputs>
- <data format="tabular" name="outputFile1"/>
- </outputs>
-
- <tests>
- <test>
- <param name="inputFile1" value="categories.tabular" ftype="tabular" />
- <param name="inputFile2" value="criteria_elements_data.tabular" ftype="tabular" />
- <output name="outputFile1" file="categorized_elements.tabular" />
- </test>
- </tests>
-
-
- <help>
-
-.. class:: infomark
-
-**What it does**
-
-The program takes as input a set of categories, such that each category contains many elements. It also takes a table relating elements with criteria, such that each element is assigned a number representing the number of times the element satisfies a certain criterion.
-
-- The first input is a TABULAR format file, such that the left column represents the names of categories and all other columns represent the names of elements in each category.
-- The second input is a TABULAR format file relating elements with criteria, such that the first line represents the names of criteria and the left column represents the names of elements.
-- The output is a TABULAR format file relating categories with criteria, such that each category is assigned a number representing the total number of times its elements satisfy a certain criterion. Each category is assigned as many numbers as criteria (a minimal sketch of this counting logic appears below).
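-
-A minimal sketch of the counting logic, assuming the element-to-category mapping and the per-element counts for one criterion have already been read from the two inputs (hypothetical values taken from the example below; this illustrates the idea, not the tool's exact code)::
-
-    my %category_of = (
-        deletionHoptspot1 => 'Deletion_Hotspots',
-        deletionHoptspot2 => 'Deletion_Hotspots',
-        insertionHotspot1 => 'Insertion_Hotspots',
-    );
-    # counts for one criterion (one column of the second input)
-    my %count_for = (deletionHoptspot1 => 1, deletionHoptspot2 => 1, insertionHotspot1 => 0);
-    my %category_total;
-    for my $element (keys %count_for) {
-        $category_total{ $category_of{$element} } += $count_for{$element};
-    }
-    print "$_\t$category_total{$_}\n" for sort keys %category_total;    # Deletion_Hotspots 2, Insertion_Hotspots 0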
- - -**Example** - -Let the first input file be a group of motif categories as follows:: - - Deletion_Hotspots deletionHoptspot1 deletionHoptspot2 deletionHoptspot3 - Dna_Pol_Pause_Frameshift dnaPolPauseFrameshift1 dnaPolPauseFrameshift2 dnaPolPauseFrameshift3 dnaPolPauseFrameshift4 - Indel_Hotspots indelHotspot1 - Insertion_Hotspots insertionHotspot1 insertionHotspot2 - Topoisomerase_Cleavage_Sites topoisomeraseCleavageSite1 topoisomeraseCleavageSite2 topoisomeraseCleavageSite3 - - -And let the second input file represent the number of times each motif occurs in a certain window size of indel flanking regions, as follows:: - - 10bp 20bp 40bp - deletionHoptspot1 1 1 2 - deletionHoptspot2 1 1 1 - deletionHoptspot3 0 0 0 - dnaPolPauseFrameshift1 1 1 1 - dnaPolPauseFrameshift2 0 2 1 - dnaPolPauseFrameshift3 0 0 0 - dnaPolPauseFrameshift4 0 1 2 - indelHotspot1 0 0 0 - insertionHotspot1 0 0 1 - insertionHotspot2 1 1 1 - topoisomeraseCleavageSite1 1 1 1 - topoisomeraseCleavageSite2 1 2 1 - topoisomeraseCleavageSite3 0 0 2 - -Running the program will give the total number of times the motifs of each category occur in every window size of indel flanking regions:: - - 10bp 20bp 40bp - Deletion_Hotspots 2 2 3 - Dna_Pol_Pause_Frameshift 1 4 4 - Indel_Hotspots 0 0 0 - Insertion_Hotspots 1 1 2 - Topoisomerase_Cleavage_Sites 2 3 4 - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motif_frequencies_for_all_motifs.pl --- a/tools/regVariation/compute_motif_frequencies_for_all_motifs.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,153 +0,0 @@ -#!/usr/bin/perl -w - -# a program to compute the frequencies of each motif at a window size, determined by the user, in both -# upstream and downstream sequences flanking indels in all chromosomes. -# the first input is a TABULAR format file containing the motif names and sequences, such that the file -# consists of two columns: the left column represents the motif names and the right column represents -# the motif sequence, one line per motif. -# the second input is a TABULAR format file containing the windows into which both upstream and downstream -# sequences flanking indels have been divided. -# the fourth input is an integer number representing the number of windows to be considered in both -# upstream and downstream flanking sequences. -# the output is a TABULAR format file consisting of three columns: the left column represents the motif -# name, the middle column represents the motif frequency in the window of the upstream sequence flanking -# an indel, and the the right column represents the motif frequency in the window of the downstream -# sequence flanking an indel, one line per indel. -# The total number of lines in the output file = number of motifs x number of indels. 
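-
-# A minimal illustration (with hypothetical values) of the counting rule used
-# below: a motif is counted once per considered window in which its sequence
-# occurs, matched case-insensitively.
-{
-    my @considered_windows = ('GAGTT', 'acgga', 'ttgag');
-    my $motif_seq = 'GAG';
-    my $hits = grep { m/$motif_seq/i } @considered_windows;    # 2 windows match
-}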
- -use strict; -use warnings; - -#variable to handle the window information -my $window = ""; -my $windowNumber = 0; -my $totalWindowsNumber = 0; -my $upstreamAndDownstreamFlankingSequencesWindows = ""; - -#variable to handle the motif information -my $motif = ""; -my $motifName = ""; -my $motifSequence = ""; -my $motifNumber = 0; -my $totalMotifsNumber = 0; -my $upstreamMotifFrequencyCounter = 0; -my $downstreamMotifFrequencyCounter = 0; - -#arrays to sotre window and motif data -my @windowsArray = (); -my @motifNamesArray = (); -my @motifSequencesArray = (); - -#variable to handle the indel information -my $indelIndex = 0; - -#variable to store line counter value -my $lineCounter = 0; - -# check to make sure having correct files -my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n"; -die $usage unless @ARGV == 4; - -#get the input arguments -my $motifsInputFile = $ARGV[0]; -my $indelFlankingSequencesWindowsInputFile = $ARGV[1]; -my $numberOfConsideredWindows = $ARGV[2]; -my $motifFrequenciesOutputFile = $ARGV[3]; - -#open the input files -open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); -open (INPUT2, "<", $indelFlankingSequencesWindowsInputFile) || die("Could not open file indelFlankingSequencesWindowsInputFile \n"); -open (OUTPUT, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); - -#store the motifs input file in the array @motifsData -my @motifsData = <INPUT1>; - -#iterated through the motifs (lines) of the motifs input file -foreach $motif (@motifsData){ - chomp ($motif); - #print ($motif . "\n"); - - #split the motif data into its name and its sequence - my @motifNameAndSequenceArray = split(/\t/, $motif); - - #store the name of the motif into the array @motifNamesArray - push @motifNamesArray, $motifNameAndSequenceArray[0]; - - #store the sequence of the motif into the array @motifSequencesArray - push @motifSequencesArray, $motifNameAndSequenceArray[1]; -} - -#compute the size of the motif names array -$totalMotifsNumber = @motifNamesArray; - - -#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData -my @windowsData = <INPUT2>; - -#check if the number of considered window entered by the user is 0 or negative, if so make it equal to 1 -if ($numberOfConsideredWindows <= 0){ - $numberOfConsideredWindows = 1; -} - -#iterated through the motif sequences to check their occurrences in the considered windows -#and store the count of their occurrences in the corresponding ouput file -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - #get the motif name - $motifName = $motifNamesArray[$motifNumber]; - - #get the motif sequence - $motifSequence = $motifSequencesArray[$motifNumber]; - - #iterated through the lines of the second input file. 
Each line represents - #the windows of the upstream and downstream flanking sequences of an indel - foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ - - chomp ($upstreamAndDownstreamFlankingSequencesWindows); - $lineCounter++; - - #split both upstream and downstream flanking sequences into their windows - my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); - - if ($lineCounter == 1){ - $totalWindowsNumber = @windowsArray; - $indelIndex = ($totalWindowsNumber - 1)/2; - } - - #reset the motif frequency counters - $upstreamMotifFrequencyCounter = 0; - $downstreamMotifFrequencyCounter = 0; - - #iterate through the considered windows of the upstream flanking sequence and increment the motif frequency counter - for ($windowNumber = $indelIndex - 1; $windowNumber > $indelIndex - $numberOfConsideredWindows - 1; $windowNumber--){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $upstreamMotifFrequencyCounter++; - } - } - - #iterate through the considered windows of the upstream flanking sequence and increment the motif frequency counter - for ($windowNumber = $indelIndex + 1; $windowNumber < $indelIndex + $numberOfConsideredWindows + 1; $windowNumber++){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $downstreamMotifFrequencyCounter++; - } - } - - #store the result into the output file of the motif - print OUTPUT $motifName . "\t" . $upstreamMotifFrequencyCounter . "\t" . $downstreamMotifFrequencyCounter . "\n"; - } -} - -#close the input and output files -close(OUTPUT); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motif_frequencies_for_all_motifs.xml --- a/tools/regVariation/compute_motif_frequencies_for_all_motifs.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,72 +0,0 @@ -<tool id="compute_motif_frequencies_for_all_motifs" name="Compute Motif Frequencies For All Motifs" version="1.0.0"> - <description>motif by motif</description> - - <command interpreter="perl"> - compute_motif_frequencies_for_all_motifs.pl $inputFile1 $inputFile2 $inputWindowSize3 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select the motifs file"/> - <param format="tabular" name="inputFile2" type="data" label="Select the indel flanking sequences windows file"/> - <param type="integer" name="inputWindowSize3" size="6" value="0" label="What is the number of 10bp windows in which the motif frequencies will be computed?" help="'0' = one window only"/> - </inputs> - - <outputs> - <data format="tabular" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="motifs2.tabular" /> - <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" /> - <param name="inputWindowSize3" value="0" /> - <output name="outputFile1" file="motifFrequencies_every_indels0.tabular" /> - </test> - - <test> - <param name="inputFile1" value="motifs2.tabular" /> - <param name="inputFile2" value="flankingSequencesWindows10_2.tabular" /> - <param name="inputWindowSize3" value="4" /> - <output name="outputFile1" file="motifFrequencies_every_indels4.tabular" /> - </test> - </tests> - - <help> - -.. 
class:: infomark
-
-**What it does**
-
-This program computes the frequencies of each motif at a window size, determined by the user, in both upstream and downstream sequences flanking indels in all chromosomes.
-
-- The first input is a TABULAR format file containing the motif names and sequences, one line per motif, such that the file consists of two columns:
-
- - The left column represents the motif names
- - The right column represents the motif sequence, as follows::
-
- dnaPolPauseFrameshift1 GAG
- dnaPolPauseFrameshift2 ACG
- xSites1 CCG
-
-- The second input is a TABULAR format file representing the windows of both upstream and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequences, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequences, as follows::
-
- cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
- GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
- CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
- AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
-
-- The third input is an integer number representing the number of windows to be considered, starting from the indel and moving leftward for the upstream flanking sequence, and starting from the indel and moving rightward for the downstream flanking sequence.
-
-- The output is a TABULAR format file consisting of three columns:
-
- - The left column represents the motif name
- - The middle column represents the motif frequency in the specified windows of the upstream sequence flanking an indel
- - The right column represents the motif frequency in the specified windows of the downstream sequence flanking an indel
-
- There is one line per indel in the output file, such that the total number of lines in the output file = number of motifs x number of indels.
-
-Note: The number of windows entered by the user must be a positive integer >= 1. If a negative integer or 0 is entered by the user, the program will consider it as 1.
-
- </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motifs_frequency.pl
--- a/tools/regVariation/compute_motifs_frequency.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,252 +0,0 @@
-#!/usr/bin/perl -w
-
-# a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels
-# in a chromosome/genome.
-# the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two
-# columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif.
-# the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel.
-# the third input is an integer number representing the window size according to which the upstream and downstream sequences
-# flanking each indel will be divided.
-# the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking
-# indels are divided.
-# the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both -# upstream and downstream sequences flanking indels, one line per motif. - -use strict; -use warnings; - -#variable to handle the falnking sequences information -my $sequence = ""; -my $upstreamFlankingSequence = ""; -my $downstreamFlankingSequence = ""; -my $discardedSequenceLength = 0; -my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0; - -#variable to handle the window information -my $window = ""; -my $windowStartIndex = 0; -my $windowNumber = 0; -my $totalWindowsNumber = 0; -my $totalNumberOfWindowsInUpstreamSequence = 0; -my $totalNumberOfWindowsInDownstreamSequence = 0; -my $totalWindowsNumberInBothFlankingSequences = 0; -my $totalWindowsNumberInMotifCountersTwoDimArray = 0; -my $upstreamAndDownstreamFlankingSequencesWindows = ""; - -#variable to handle the motif information -my $motif = ""; -my $motifSequence = ""; -my $motifNumber = 0; -my $totalMotifsNumber = 0; - -#arrays to sotre window and motif data -my @windowsArray = (); -my @motifNamesArray = (); -my @motifSequencesArray = (); -my @motifCountersTwoDimArray = (); - -#variables to store line counter values -my $lineCounter1 = 0; -my $lineCounter2 = 0; - -# check to make sure having correct files -my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n"; -die $usage unless @ARGV == 5; - -#get the input and output arguments -my $motifsInputFile = $ARGV[0]; -my $indelFlankingSequencesInputFile = $ARGV[1]; -my $windowSize = $ARGV[2]; -my $indelFlankingSequencesWindowsOutputFile = $ARGV[3]; -my $motifFrequenciesOutputFile = $ARGV[4]; - -#open the input and output files -open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); -open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n"); -open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); -open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); - -#store the motifs input file in the array @motifsData -my @motifsData = <INPUT1>; - -#iterated through the motifs (lines) of the motifs input file -foreach $motif (@motifsData){ - chomp ($motif); - #print ($motif . 
"\n"); - - #split the motif data into its name and its sequence - my @motifNameAndSequenceArray = split(/\t/, $motif); - - #store the name of the motif into the array @motifNamesArray - push @motifNamesArray, $motifNameAndSequenceArray[0]; - - #store the sequence of the motif into the array @motifSequencesArray - push @motifSequencesArray, $motifNameAndSequenceArray[1]; -} - -#compute the size of the motif names array -$totalMotifsNumber = @motifNamesArray; - -#store the input file in the array @sequencesData -my @sequencesData = <INPUT2>; - -#iterated through the sequences of the second input file in order to create windwos file -foreach $sequence (@sequencesData){ - chomp ($sequence); - $lineCounter1++; - - my @indelAndSequenceArray = split(/\t/, $sequence); - - #get the upstream falnking sequence - $upstreamFlankingSequence = $indelAndSequenceArray[3]; - - #if the window size is 0, then the whole upstream will be one window only - if ($windowSize == 0){ - $totalNumberOfWindowsInUpstreamSequence = 1; - $windowSize = length ($upstreamFlankingSequence); - } - else{ - #compute the total number of windows into which the upstream flanking sequence will be divided - $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize; - - #compute the length of the subsequence to be discared from the upstream flanking sequence if any - $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize; - - #check if the sequence could be split into windows of equal sizes - if ($discardedSequenceLength != 0) { - #trim the upstream flanking sequence - $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength); - } - } - - #split the upstream flanking sequence into windows - for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){ - $windowStartIndex = $windowNumber * $windowSize; - print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); - } - - #add a column representing the indel - print OUTPUT1 ("indel" . "\t"); - - #get the downstream falnking sequence - $downstreamFlankingSequence = $indelAndSequenceArray[4]; - - #if the window size is 0, then the whole upstream will be one window only - if ($windowSize == 0){ - $totalNumberOfWindowsInDownstreamSequence = 1; - $windowSize = length ($downstreamFlankingSequence); - } - else{ - #compute the total number of windows into which the downstream flanking sequence will be divided - $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize; - - #compute the length of the subsequence to be discared from the upstream flanking sequence if any - $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize; - - #check if the sequence could be split into windows of equal sizes - if ($discardedSequenceLength != 0) { - #compute the length of the sequence to be discarded - $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength; - - #trim the downstream flanking sequence - $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming); - } - } - - #split the downstream flanking sequence into windows - for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){ - $windowStartIndex = $windowNumber * $windowSize; - print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . 
"\t"); - } - - print OUTPUT1 ("\n"); -} - -#compute the total number of windows on both upstream and downstream sequences flanking the indel -$totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence; - -#add an additional cell to store the name of the motif and another one for the indel itself -$totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1; - -#initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){ - - if ($windowNumber == 0){ - $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber]; - } - elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel"; - } - else{ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0; - } - } -} - -close(OUTPUT1); - -#open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file -open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); - -#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData -my @windowsData = <INPUT3>; - -#iterated through the lines of the first output file. Each line represents -#the windows of the upstream and downstream flanking sequences of an indel -foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ - - chomp ($upstreamAndDownstreamFlankingSequencesWindows); - $lineCounter2++; - - #split both upstream and downstream flanking sequences into their windows - my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); - - $totalWindowsNumber = @windowsArray; - - #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly - WINDOWS: - for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){ - - #get the window - $window = $windowsArray[$windowNumber]; - - #if the window is the one that contains the indel, then skip the indel window - if ($window eq "indel") { - next WINDOWS; - } - else{ #iterated through the motif sequences to check their occurrences in the winodw - #and increment their corresponding counters accordingly - - for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - #get the motif sequence - $motifSequence = $motifSequencesArray[$motifNumber]; - - #if the motif is found in the window, then increment its corresponding counter - if ($window =~ m/$motifSequence/i){ - $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++; - } - } - } - } -} - -#store the motif counters values in the second output file -for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ - - for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){ - - print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t"; - #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . 
" "); - } - print OUTPUT2 "\n"; - #print ("\n"); -} - -#close the input and output files -close(OUTPUT2); -close(OUTPUT1); -close(INPUT3); -close(INPUT2); -close(INPUT1); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_motifs_frequency.xml --- a/tools/regVariation/compute_motifs_frequency.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,109 +0,0 @@ -<tool id="compute_motifs_frequency" name="Compute Motif Frequencies" version="1.0.0"> - <description>in indel flanking regions</description> - - - <command interpreter="perl"> - compute_motifs_frequency.pl $inputFile1 $inputFile2 $inputNumber3 $outputFile1 $outputFile2 - </command> - - - <inputs> - - <param format="tabular" name="inputFile1" type="data" label="Select motifs file"/> - - <param format="tabular" name="inputFile2" type="data" label="Select indel flanking regions file from your history"/> - - <param type="integer" name="inputNumber3" size="5" value="0" label="What is the size of each window?" help="'0' = all the upstream flanking sequence will be one window only, and the same for the downstream flanking sequence."/> - - </inputs> - - - <outputs> - <data format="tabular" name="outputFile1"/> - <data format="tabular" name="outputFile2"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="motifs1.tabular" /> - <param name="inputFile2" value="indelsFlankingSequences1.tabular" /> - <param name="inputNumber3" value="0" /> - <output name="outputFile1" file="flankingSequencesWindows0.tabular" /> - <output name="outputFile2" file="motifFrequencies0.tabular" /> - </test> - - <test> - <param name="inputFile1" value="motifs1.tabular" /> - <param name="inputFile2" value="indelsFlankingSequences1.tabular" /> - <param name="inputNumber3" value="10" /> - <output name="outputFile1" file="flankingSequencesWindows10.tabular" /> - <output name="outputFile2" file="motifFrequencies10.tabular" /> - </test> - </tests> - - - <help> - -.. class:: infomark - -**What it does** - -This program computes the frequency of motifs in the flanking regions of indels found in a chromosome or a genome. -Each indel has an upstream flanking sequence and a downstream flanking one. Each of the upstream and downstream flanking -sequences will be divided into a certain number of windows according to the window size input by the user. -The frequency of a motif in a certain window in one of the two flanking sequences is the total sum of occurrences of -that motif in that window of that flanking sequence over all indels. The indel flanking regions file will be taken -from your history or it will be uploaded, whereas the motifs file should be uploaded. 
-
-- The first input file is the motifs file, a tabular file consisting of two columns:
-
- - the first column represents the motif name
- - the second column represents the motif sequence, as follows::
-
- dnaPolPauseFrameshift1 GAG
- dnaPolPauseFrameshift2 ACG
- xSites1 CCG
-
-- The second input file is the indel flanking regions file, a tabular file consisting of five columns:
-
- - the first column represents the indel start coordinate
- - the second column represents the indel end coordinate
- - the third column represents the indel length
- - the fourth column represents the upstream flanking sequence
- - the fifth column represents the downstream flanking sequence, as follows::
-
- 16694766 16694768 3 GTGGGTCCTGCCCAGCCTCTGCCTCAGAGGGAAGAGTAGAGAACTGGG AGAGCAGGTCCTTAGGGAGCCCGAGGAAGTCCCTGACGCCAGCTGTTCTCGCGGACGAA
- 25169542 25169545 4 caagcccacaagccttcagaccatagcaCGGGCTCCAGAGGTGTGAGG CAGGTCAGGTGCTTTAGAAGTCAAAAACTCTCAGTAAGGCAAATCACCCCCTATCTCCT
- 41929580 41929585 6 ggctgtcgtatggaatctggggctcaggactctgtcccatttctctaa accattctgcTTCAACCCAGACACTGACTGTTTTCCAAATTTACTTGTTTGTTTGTTTT
-
-
------
-
-.. class:: warningmark
-
-**Notes**
-
-- The lengths of the upstream flanking sequences must be equal for all indels.
-- The lengths of the downstream flanking sequences must be equal for all indels.
-- If the length L of the upstream flanking sequence is not an integer multiple of the window size S, that is, if L = m*S + r, where m is the quotient and r is the remainder, then the upstream flanking sequence will be divided into m windows only, starting from the indel, and the remaining r bases will not be considered. The same rule applies to the downstream flanking sequence.
-
------
-
-The **output** of this program is two files:
-
-- The first output file is a tabular file and represents the windows of both upstream and downstream flanking sequences. It consists of multiple left columns representing the windows of the upstream flanking sequence, followed by one column representing the indels, then followed by multiple right columns representing the windows of the downstream flanking sequence, as follows::
-
- cgaggtcagg agatcgagac catcctggct aacatggtga aatcccgtct ctactaaaaa indel aaatttatat ttataaacaa ttttaataca cctatgttta ttatacattt
- GCCAGTTTAT GGTCTAACAA GGAGAGAAAC AGGGGGCTGA AGGGGTTTCT TAACCTCCAG indel TTCCGGGCTC TGTCCCTAAC CCCCAGCTAG GTAAGTGGCA AAGCACTTCT
- CAGTGGGACC AAGCACTGAA CCACTTTGGG GAGAATCTCA CACTGGGGCC CTCTGACACC indel tatatatttt tttttttttt tttttttttt tttttttttg agatggtgtc
- AGAGCAGCAG CACCCACTTT TGCAGTGTGT GACGTTGGTG GAGCCATCGA AGTCTGTGCT indel GAGCCCTCCC CAGTGCTCCG AGGAGCTGCT GTTCCCCCTG GAGCTCAGAA
-
-- The second output file is a tabular file and represents the motif frequencies in every window of every flanking sequence. The first column on the left represents the names of motifs. The other columns represent the frequencies of motifs in the windows that correspond to the ones in the first output file, as follows::
-
- dnaPolPauseFrameshift1 2 3 1 0 1 2 indel 0 2 2 1 3
- dnaPolPauseFrameshift2 2 3 1 0 1 2 indel 0 2 2 1 3
- xSites1 3 2 0 1 1 2 indel 1 1 3 2 3
-
- </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_q_values.pl
--- a/tools/regVariation/compute_q_values.pl Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,95 +0,0 @@
-# A program to compute the q-values based on the p-values of multiple simultaneous tests.
-# The q-valules are computed using a specific R package created by John Storey called "qvalue". -# The input is a TABULAR format file consisting of one column only that represents the p-values -# of multiple simultaneous tests, one line for every p-value. -# The first output is a TABULAR format file consisting of one column only that represents the q-values -# corresponding to p-values, one line for every q-value. -# the second output is a TABULAR format file consisting of three pages: the first page represents -# the p-values histogram, the second page represents the q-values histogram, and the third page represents -# the four Q-plots as introduced in the "qvalue" package manual. - -use strict; -use warnings; -use IO::Handle; -use File::Temp qw/ tempfile tempdir /; -my $tdir = tempdir( CLEANUP => 0 ); - -# check to make sure having correct input and output files -my $usage = "usage: compute_q_values.pl [TABULAR.in] [lambda] [pi0_method] [fdr_level] [robust] [TABULAR.out] [PDF.out] \n"; -die $usage unless @ARGV == 7; - -#get the input arguments -my $p_valuesInputFile = $ARGV[0]; -my $lambdaValue = $ARGV[1]; -my $pi0_method = $ARGV[2]; -my $fdr_level = $ARGV[3]; -my $robustValue = $ARGV[4]; -my $q_valuesOutputFile = $ARGV[5]; -my $p_q_values_histograms_QPlotsFile = $ARGV[6]; - -if($lambdaValue =~ /sequence/){ - $lambdaValue = "seq(0, 0.95, 0.05)"; -} - -#open the input files -open (INPUT, "<", $p_valuesInputFile) || die("Could not open file $p_valuesInputFile \n"); -open (OUTPUT1, ">", $q_valuesOutputFile) || die("Could not open file $q_valuesOutputFile \n"); -open (OUTPUT2, ">", $p_q_values_histograms_QPlotsFile) || die("Could not open file $p_q_values_histograms_QPlotsFile \n"); -#open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); - -#save all error messages into the error file $errorFile using the error file handle ERROR -#STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); - -#warn "Hello Error File \n"; - -#variable to store the name of the R script file -my $r_script; - -# R script to implement the calcualtion of q-values based on multiple simultaneous tests p-values -# construct an R script file and save it in a temp directory -chdir $tdir; -$r_script = "q_values_computation.r"; - -open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n"; -print Rcmd " - #options(show.error.messages = FALSE); - - #load necessary packages - suppressPackageStartupMessages(library(tcltk)); - library(qvalue); - - #read the p-values of the multiple simultaneous tests from the input file $p_valuesInputFile - p <- scan(\"$p_valuesInputFile\", quiet = TRUE); - - #compute the q-values that correspond to the p-values of the multiple simultaneous tests - qobj <- qvalue(p, pi0.meth = \"$pi0_method\", lambda = $lambdaValue, fdr.level = $fdr_level, robust = $robustValue); - #qobj <- qvalue(p, pi0.meth = \"smoother\", lambda = seq(0, 0.95, 0.05), fdr.level = 0.05); - #qobj <- qvalue(p, pi0.meth = \"bootstrap\", fdr.level = 0.05); - - #draw the p-values histogram, the q-values histogram, and the four Q-plots - # and save them on multiple pages of the output file $p_q_values_histograms_QPlotsFile - pdf(file = \"$p_q_values_histograms_QPlotsFile\", width = 6.25, height = 6, family = \"Times\", pointsize = 12, onefile = TRUE) - hist(qobj\$pvalues); - #dev.off(); - - hist(qobj\$qvalues); - #dev.off(); - - qplot(qobj); - dev.off(); - - #save the q-values in the output file $q_valuesOutputFile - qobj\$pi0 <- signif(qobj\$pi0,digits=6) - 
qwrite(qobj, filename=\"$q_valuesOutputFile\");
-
- #options(show.error.messages = TRUE);
- #eof\n";
-close Rcmd;
-
-system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
-
-#close the input, output, and error files
-#close(ERROR);
-close(OUTPUT2);
-close(OUTPUT1);
-close(INPUT);
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/compute_q_values.xml
--- a/tools/regVariation/compute_q_values.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,155 +0,0 @@
-<tool id="compute_q_values" name="Compute q-values" version="1.0.1">
- <description>based on the p-values of multiple simultaneous tests</description>
-
- <command interpreter="perl">
- compute_q_values.pl $inputFile1 $inputLambda2 $inputPI0_method3 $inputFDR_level4 $inputRobust5 $outputFile1 $outputFile2
- </command>
-
- <inputs>
- <param format="tabular" name="inputFile1" type="data" label="Select the p-values file"/>
-
- <param type="text" name="inputLambda2" size="100" value="sequence_from_0_to_0.95_increment_0.05" label="What is the lambda value?" help="Either choose the default sequence or one decimal value between 0 and 1"/>
-
- <param name="inputPI0_method3" type="select" label="Choose the pi0 estimation method:">
- <option value="smoother">smoother</option>
- <option value="bootstrap">bootstrap</option>
- </param>
-
- <param type="float" name="inputFDR_level4" size="5" value="" label="What is the FDR level?" help="The FDR level must be between 0 and 1"/>
-
- <param name="inputRobust5" type="select" label="Do you want to make the estimate more robust:" help="Choose TRUE for small p-values">
- <option value="FALSE">FALSE</option>
- <option value="TRUE">TRUE</option>
- </param>
- </inputs>
-
- <outputs>
- <data format="tabular" name="outputFile1"/>
- <data format="pdf" name="outputFile2"/>
- </outputs>
-
- <tests>
- <test>
- <param name="inputFile1" value="p_values.tabular" ftype="tabular" />
- <param name="inputLambda2" value="sequence_from_0_to_0.95_increment_0.05" />
- <param name="inputPI0_method3" value="smoother" />
- <param name="inputFDR_level4" value="0.05" />
- <param name="inputRobust5" value="FALSE" />
- <output name="outputFile1" file="q_values.tabular" />
- <output name="outputFile2" file="p_q_hists_Q_plots.pdf" />
- </test>
- </tests>
-
- <help>
-
-.. class:: infomark
-
-**What it does**
-
-This program computes the q-values based on the p-values of multiple simultaneous tests. The q-values are computed using a specific R package, created by John Storey and Alan Dabney, called "qvalue". The program takes five inputs:
-
-- The first input is a TABULAR format file consisting of one column only that represents the p-values of multiple simultaneous tests, one line for every p-value.
-- The second input is the lambda parameter. The user can choose either the default: seq(0, 0.95, 0.05) or a decimal number between 0.0 and 1.0.
-- The third input is the pi0 estimation method, which is either "smoother" or "bootstrap".
-- The fourth input is the FDR (false discovery rate) level, which is a decimal number between 0.0 and 1.0.
-- The fifth input is either TRUE or FALSE for the estimate robustness (a simplified sketch of the quantity being computed follows this list).
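-
-The quantity being computed can be sketched in a few lines of Perl (a simplification that takes pi0 = 1, which reduces Storey's q-value to the Benjamini-Hochberg step-up value; the actual tool estimates pi0 with the "qvalue" R package)::
-
-    my @p = sort { $a <=> $b } (0.0001, 0.013, 0.14, 0.43);    # sorted p-values
-    my $m = scalar @p;
-    my @q;
-    my $running_min = 1;
-    for (my $i = $m - 1; $i >= 0; $i--) {    # walk from the largest p-value down
-        my $bh = $p[$i] * $m / ($i + 1);     # p * m / rank
-        $running_min = $bh if $bh < $running_min;
-        $q[$i] = $running_min;               # q-values are monotone in p
-    }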
- -The program gives two outputs: - -- The first output is a TABULAR format file consisting of three columns: - - - the left column represents the p-values of multiple simultaneous tests, one line for every p-value - - the middle column represents the q-values corresponding to the p-values - - the third column represent the significance values, either 1 for significant or 0 for non-significant - -- The second output is a PDF format file consisting of three pages: - - - the first page represents the p-values histogram - - the second page represents the q-values histogram - - the third page represents the four Q-plots as introduced in the "qvalue" package manual. - - -**Example** - -Let us have the first input file of p-values as follows:: - - 0.140627492 - 0.432249886 - 0.122120877 - 0.142010182 - 0.012909858 - 0.000142807 - 0.039841941 - 0.035173303 - 0.011340057 - 1.01E-05 - 0.212738282 - 0.091256284 - 0.547375415 - 0.189589833 - 6.18E-12 - 0.001235875 - 1.10E-05 - 9.75E-07 - 2.13E-18 - 2.54E-16 - 1.20E-19 - 9.76E-14 - 0.359181534 - 0.03661672 - 0.400459987 - 0.387436466 - 0.342075061 - 0.904129283 - 0.031152635 - -Running the program will give the following output:: - - pi0: 0.140311054 - - FDR level: 0.05 - - p-value q-value significant - 0.1406275 0.02889212 1 - 0.4322499 0.06514199 0 - 0.1221209 0.02760624 1 - 0.1420102 0.02889212 1 - 0.01290986 0.00437754 1 - 0.000142807 6.46E-05 1 - 0.03984194 0.01013235 1 - 0.0351733 0.009932946 1 - 0.01134006 0.004194811 1 - 1.01E-05 5.59E-06 1 - 0.2127383 0.03934711 1 - 0.09125628 0.02184257 1 - 0.5473754 0.07954578 0 - 0.1895898 0.03673547 1 - 6.18E-12 5.03E-12 1 - 0.001235875 0.00050288 1 - 1.10E-05 5.59E-06 1 - 9.75E-07 6.61E-07 1 - 2.13E-18 4.33E-18 1 - 2.54E-16 3.45E-16 1 - 1.20E-19 4.88E-19 1 - 9.76E-14 9.93E-14 1 - 0.3591815 0.06089654 0 - 0.03661672 0.009932946 1 - 0.40046 0.0626723 0 - 0.3874365 0.0626723 0 - 0.3420751 0.06051785 0 - 0.9041293 0.1268593 0 - 0.03115264 0.009750824 1 - - -.. image:: ./static/operation_icons/p_hist.png - - -.. image:: ./static/operation_icons/q_hist.png - - -.. image:: ./static/operation_icons/Q_plots.png - - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/delete_overlapping_indels.pl --- a/tools/regVariation/delete_overlapping_indels.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -#!/usr/bin/perl -w - -# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, -# the first encountered one is kept and all others are removed. It requires three inputs: -# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. -# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. -# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. -# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. -# Note: The number of the first column is 1. 
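-
-# A minimal illustration of the pairwise test applied below, using two of the
-# overlapping insertions from the help example (inclusive coordinates; the
-# loop below checks whether either endpoint of the second indel falls inside
-# the first):
-{
-    my ($s1, $e1) = (14508610, 14508612);        # kept (first encountered)
-    my ($s2, $e2) = (14508610, 14508616);        # overlaps, so it is removed
-    my $overlap = ($s1 <= $e2 && $s2 <= $e1);    # true for this pair
-}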
- -use strict; -use warnings; - -#varaibles to handle information related to indels -my $indel1 = ""; -my $indel2 = ""; -my @indelArray1 = (); -my @indelArray2 = (); -my $lineCounter1 = 0; -my $lineCounter2 = 0; -my $totalNumberofNonOverlappingIndels = 0; - -# check to make sure having correct files -my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; -die $usage unless @ARGV == 4; - -my $inputFile = $ARGV[0]; -my $indelStartColumn = $ARGV[1] - 1; -my $indelEndColumn = $ARGV[2] - 1; -my $outputFile = $ARGV[3]; - -#verifie column numbers -if ($indelStartColumn < 0 ){ - die ("The indel start column number is invalid \n"); -} -if ($indelEndColumn < 0 ){ - die ("The indel end column number is invalid \n"); -} - -#open the input and output files -open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); -open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); - -#store the input file in the array @rawData -my @indelsRawData = <INPUT>; - -#iterated through the indels of the input file -INDEL1: -foreach $indel1 (@indelsRawData){ - chomp ($indel1); - $lineCounter1++; - - #get the first indel - @indelArray1 = split(/\t/, $indel1); - - #our purpose is to detect overlapping indels and to store one copy of them only in the output file - #all other non-overlapping indels will stored in the output file also - - $lineCounter2 = 0; - - #iterated through the indels of the input file - INDEL2: - foreach $indel2 (@indelsRawData){ - chomp ($indel2); - $lineCounter2++; - - if ($lineCounter2 > $lineCounter1){ - #get the second indel - @indelArray2 = split(/\t/, $indel2); - - #check if the two indels are overlapping - if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ - #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); - #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); - - #break out of the loop and go back to the outerloop - next INDEL1; - } - else{ - #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); - } - } - } - - print OUTPUT $indel1 . "\n"; - $totalNumberofNonOverlappingIndels++; -} - -#print("The total number of indels is: " . $lineCounter1 . "\n"); -#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . 
"\n"); - -#close the input and output files -close(OUTPUT); -close(INPUT); \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/delete_overlapping_indels.xml --- a/tools/regVariation/delete_overlapping_indels.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ -<tool id="delete_overlapping_indels" name="Delete Overlapping Indels" version="1.0.0"> - <description>from a chromosome indels file</description> - - <command interpreter="perl"> - delete_overlapping_indels.pl $inputFile1 $inputIndelStartColumnNumber2 $inputIndelEndColumnNumber3 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select indels file"/> - <param type="data_column" name="inputIndelStartColumnNumber2" data_ref="inputFile1" accept_default="true" label="Choose the indel start coordinate column number" /> - <param type="data_column" name="inputIndelEndColumnNumber3" data_ref="inputFile1" accept_default="true" label="Choose the the indel end coordinate column number" /> - </inputs> - - <outputs> - <data format="tabular" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="indels1.tabular" /> - <param name="inputIndelStartColumnNumber2" value="5" /> - <param name="inputIndelEndColumnNumber3" value="6" /> - <output name="outputFile1" file="non_overlapping_indels1.tabular" /> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, the first encountered one is kept and all others are removed. -It requires three inputs: - -- The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. -- The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. -- The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. -- The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. - -Note: The number of the first column is 1. - - -**Example** - -Let us have the following insertions in the human genome. 
The start and end coordinates of insertions are on columns 5 and 6 respectively:: - - 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 + - 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 + - 7 hg18.chr22_insert 6 hg18.chr22 14513688 14513699 348 - panTro2.chr2b 132517879 132517880 321 + rheMac2.chr17 14274465 14274466 337 + - 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 - - 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 - - 40 hg18.chr22_insert 7 hg18.chr22 14508610 14508616 2337 - panTro2.chr2b 132487750 132487751 2313 + rheMac2.chr10 59128305 59128306 2332 + - 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 + - -By removing the overlapping indels which, we get:: - - 3 hg18.chr22_insert 3 hg18.chr22 14508610 14508612 3924 - panTro2.chr2b 132518950 132518951 3910 + rheMac2.chr17 14311798 14311799 3896 + - 7 hg18.chr22_insert 13 hg18.chr22 14513678 14513690 348 - panTro2.chr2b 132517876 132517877 321 + rheMac2.chr17 14274462 14274463 337 + - 25 hg18.chr22_insert 9 hg18.chr22 14529501 14529509 385 - panTro2.chr22 14528775 14528776 376 - rheMac2.chr9 42869449 42869450 375 - - 36 hg18.chr22_insert 4 hg18.chr22 14566316 14566319 540 - panTro2.chr2b 132492077 132492078 533 + rheMac2.chr10 59230438 59230439 533 - - 41 hg18.chr22_insert 4 hg18.chr22 14571556 14571559 2483 - panTro2.chr2b 132485878 132485879 2481 + rheMac2.chr10 59126094 59126095 2508 + - - </help> - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/draw_stacked_barplots.pl --- a/tools/regVariation/draw_stacked_barplots.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -#!/usr/bin/perl -w - -# This program draws, in a pdf file, a stacked bars plot for different categories of data and for -# different criteria. For each criterion a stacked bar is drawn, such that the height of each stacked -# sub-bar represents the number of elements in each category satisfying that criterion. -# The input consists of a TABULAR format file, where the left column represents the names of categories -# and the other columns are headed by the names of criteria, such that each data value in the file -# represents the number of elements in a certain category satisfying a certain criterion. -# The output is a PDF file containing a stacked bars plot representing the number of elements in each -# category satisfying each criterion. The drawing is done using R code. 
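-
-# The script below follows a generate-and-run pattern: it writes an R script
-# to a file, then invokes R on it. A minimal sketch of that pattern, with a
-# hypothetical file name and a toy matrix (the real barplot() call is built
-# from the input table):
-{
-    open(DEMO, ">", "demo_plot.r") || die("Could not open file demo_plot.r \n");
-    print DEMO "pdf(file = \"demo.pdf\"); barplot(matrix(1:4, 2)); dev.off();\n";
-    close(DEMO);
-    system("R --no-restore --no-save --no-readline < demo_plot.r > demo_plot.r.out");
-}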
- - -use strict; -use warnings; - -my $criterion; -my @criteriaArray = (); -my $criteriaNumber = 0; -my $lineCounter = 0; - -#variable to store the names of R script file -my $r_script; - -# check to make sure having correct files -my $usage = "usage: draw_stacked_bar_plot.pl [TABULAR.in] [PDF.out] \n"; -die $usage unless @ARGV == 2; - -my $categoriesInputFile = $ARGV[0]; - -my $categories_criteria_bars_plot_outputFile = $ARGV[1]; - -#open the input file -open (INPUT, "<", $categoriesInputFile) || die("Could not open file $categoriesInputFile \n"); -open (OUTPUT, ">", $categories_criteria_bars_plot_outputFile) || die("Could not open file $categories_criteria_bars_plot_outputFile \n"); - -# R script to implement the drawing of a stacked bar plot representing thes significant motifs in each category of motifs -#construct an R script file -$r_script = "motif_significance_bar_plot.r"; -open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n"; -print Rcmd " - #store the table content of the first file into a matrix - categoriesTable <- read.table(\"$categoriesInputFile\", header = TRUE); - categoriesMatrix <- as.matrix(categoriesTable); - - - #compute the sum of elements in the column with the maximum sum in each matrix - columnSumsVector <- colSums(categoriesMatrix); - maxColumn <- max (columnSumsVector); - - if (maxColumn %% 10 != 0){ - maxColumn <- maxColumn + 10; - } - - plotHeight = maxColumn/8; - criteriaVector <- names(categoriesTable); - - pdf(file = \"$categories_criteria_bars_plot_outputFile\", width = length(criteriaVector), height = plotHeight, family = \"Times\", pointsize = 12, onefile = TRUE); - - - - #draw the first barplot - barplot(categoriesMatrix, ylab = \"No. of elements in each category\", xlab = \"Criteria\", ylim = range(0, maxColumn), col = \"black\", density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135), names.arg = criteriaVector); - - #draw the legend - legendX = 0.2; - legendY = maxColumn; - - legend (legendX, legendY, legend = rownames(categoriesMatrix), density = c(10, 20, 30, 40, 50, 60, 70, 80), angle = c(45, 90, 135)); - - dev.off(); - - #eof\n"; -close Rcmd; -system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); - -#close the input files -close(OUTPUT); -close(INPUT); diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/draw_stacked_barplots.xml --- a/tools/regVariation/draw_stacked_barplots.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="draw_stacked_barplots" name="Draw Stacked Bar Plots" version="1.0.0"> - <description>for different categories and different criteria</description> - - <command interpreter="perl"> - draw_stacked_barplots.pl $inputFile1 $outputFile1 - </command> - - <inputs> - <param format="tabular" name="inputFile1" type="data" label="Select the input file"/> - </inputs> - - <outputs> - <data format="pdf" name="outputFile1"/> - </outputs> - - <tests> - <test> - <param name="inputFile1" value="categories_criteria.tabular" /> - <output name="outputFile1" file="stacked_barplot.pdf" /> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This program draws, in a pdf file, a stacked bars plot for different categories of data and for different criteria. For each criterion a stacked bar is -drawn, such that the height of each stacked sub-bar represents the number of elements in each category satisfying that criterion. 
- -- The input consists of a TABULAR format file, where the left column represents the names of categories and the other columns are headed by the names of criteria, such that each data value in the file represents the number of elements in a certain category satisfying a certain criterion. - -- The output is a PDF file containing a stacked bar plot representing the number of elements in each category satisfying each criterion. The drawing is done using R code. - -**Example** - -Suppose the input file represents the number of significant motifs in each motif category for each window size:: - - 10bp 20bp 40bp 80bp 160bp 320bp 640bp 1280bp - Deletion_Hotspots 2 3 4 4 5 6 7 7 - Dna_Pol_Pause/Frameshift_Hotspots 8 10 14 17 18 15 19 20 - Indel_Hotspots 1 1 1 2 1 0 0 0 - Insertion_Hotspots 0 0 1 2 2 2 2 5 - Topoisomerase_Cleavage_Sites 2 3 5 4 3 3 4 4 - Translin_Targets 0 0 2 2 3 3 3 2 - VDJ_Recombination_Signals 0 0 1 1 1 2 2 2 - X-like_Sites 4 4 4 5 6 7 7 10 - - -Running the program will give the following output:: - - The stacked bar plot representing the data in the input file. - -.. image:: ./static/operation_icons/stacked_bars_plot.png - - </help> - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/featureCounter.py --- a/tools/regVariation/featureCounter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,148 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Calculate count and coverage of one query on another, and append the Coverage and counts to -the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features. - -usage: %prog bed_file_1 bed_file_2 out_file - -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file - -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file -""" -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -import sys, traceback, fileinput -from warnings import warn -from bx.intervals.io import * -from bx.cookbook import doc_optparse -from bx.intervals.operations import quicksect -from galaxy.tools.util.galaxyops import * - -assert sys.version_info[:2] >= ( 2, 4 ) - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def counter(node, start, end): - global full, partial - if node.start <= start and node.maxend > start: - if node.end >= end or (node.start == start and end > node.end > start): - full += 1 - elif end > node.end > start: - partial += 1 - if node.left and node.left.maxend > start: - counter(node.left, start, end) - if node.right: - counter(node.right, start, end) - elif start < node.start < end: - if node.end <= end: - full += 1 - else: - partial += 1 - if node.left and node.left.maxend > start: - counter(node.left, start, end) - if node.right: - counter(node.right, start, end) - else: - if node.left: - counter(node.left, start, end) - -def count_coverage( readers, comments=True ): - primary = readers[0] - secondary = readers[1] - secondary_copy = readers[2] - - rightTree = quicksect.IntervalTree() - for item in secondary: - if type( item ) is GenomicInterval: - rightTree.insert( item, secondary.linenum, item.fields ) - - bitsets = secondary_copy.binned_bitsets() - - global full, partial - - for interval in primary: - if type( interval ) is Header: - yield interval - if type( interval ) is Comment and comments: - yield interval - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = 
int(interval.end) - full = 0 - partial = 0 - if chrom not in bitsets: - bases_covered = 0 - percent = 0.0 - full = 0 - partial = 0 - else: - bases_covered = bitsets[ chrom ].count_range( start, end-start ) - if (end - start) == 0: - percent = 0 - else: - percent = float(bases_covered) / float(end - start) - if bases_covered: - root = rightTree.chroms[chrom] #root node for the chrom tree - counter(root, start, end) - interval.fields.append(str(bases_covered)) - interval.fields.append(str(percent)) - interval.fields.append(str(full)) - interval.fields.append(str(partial)) - yield interval - -def main(): - options, args = doc_optparse.parse( __doc__ ) - - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) - chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) - in1_fname, in2_fname, out_fname = args - except: - stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." ) - - g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ), - chrom_col=chr_col_1, - start_col=start_col_1, - end_col=end_col_1, - strand_col=strand_col_1, - fix_strand=True ) - g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ), - chrom_col=chr_col_2, - start_col=start_col_2, - end_col=end_col_2, - strand_col=strand_col_2, - fix_strand=True ) - - - out_file = open( out_fname, "w" ) - - try: - for line in count_coverage([g1,g2,g2_copy]): - if type( line ) is GenomicInterval: - out_file.write( "%s\n" % "\t".join( line.fields ) ) - else: - out_file.write( "%s\n" % line ) - except ParseError, exc: - out_file.close() - fail( str( exc ) ) - - out_file.close() - - if g1.skipped > 0: - print skipped( g1, filedesc=" of 1st dataset" ) - if g2.skipped > 0: - print skipped( g2, filedesc=" of 2nd dataset" ) - elif g2_copy.skipped > 0: - print skipped( g2_copy, filedesc=" of 2nd dataset" ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/featureCounter.xml --- a/tools/regVariation/featureCounter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ -<tool id="featureCoverage1" name="Feature coverage" version="2.0.0"> - <description></description> - <command interpreter="python">featureCounter.py $input1 $input2 $output -1 ${input1.metadata.chromCol},${input1.metadata.startCol},${input1.metadata.endCol},${input1.metadata.strandCol} -2 ${input2.metadata.chromCol},${input2.metadata.startCol},${input2.metadata.endCol},${input2.metadata.strandCol}</command> - <inputs> - <param format="interval" name="input1" type="data" help="First dataset"> - <label>What portion of</label> - </param> - <param format="interval" name="input2" type="data" help="Second dataset"> - <label>is covered by</label> - </param> - </inputs> - <outputs> - <data format="interval" name="output" metadata_source="input1" /> - </outputs> - - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <output name="output" file="6_feature_coverage.bed" /> - </test> - <test> - <param name="input1" value="chrY1.bed" /> - <param name="input2" value="chrY2.bed" /> - <output name="output" file="chrY_Coverage.bed" /> - </test> - </tests> - <help> - -.. 
class:: infomark - -**What it does** - -This tool finds the coverage of intervals in the first dataset on intervals in the second dataset. The coverage and count are appended as 4 new columns in the resulting dataset. - ------ - -**Example** - -- If **First dataset** consists of the following windows:: - - chrX 1 10001 seg 0 - - chrX 10001 20001 seg 0 - - chrX 20001 30001 seg 0 - - chrX 30001 40001 seg 0 - - -- and **Second dataset** consists of the following exons:: - - chrX 5000 6000 seg2 0 - - chrX 5500 7000 seg2 0 - - chrX 9000 22000 seg2 0 - - chrX 24000 34000 seg2 0 - - chrX 36000 38000 seg2 0 - - -- the **Result** is the coverage of exons of the second dataset in each of the windows contained in the first dataset:: - - chrX 1 10001 seg 0 - 3001 0.3001 2 1 - chrX 10001 20001 seg 0 - 10000 1.0 1 0 - chrX 20001 30001 seg 0 - 8000 0.8 0 2 - chrX 30001 40001 seg 0 - 5999 0.5999 1 1 - -- To clarify, the following line of output ( added columns are indexed by a, b, c and d ):: - - a b c d - chrX 1 10001 seg 0 - 3001 0.3001 2 1 - - implies that 2 exons (c) fall fully in this window (chrX:1-10001), 1 exon (d) partially overlaps this window, and these 3 exons cover 30.01% (b) of the window size, spanning 3001 nucleotides (a). - - * a: number of nucleotides in this window covered by the features in (c) and (d) - features overlapping with each other will be merged to calculate (a) - * b: fraction of window size covered by features in (c) and (d) - features overlapping with each other will be merged to calculate (b) - * c: number of features in the 2nd dataset that fall **completely** within this window - * d: number of features in the 2nd dataset that **partially** overlap this window - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndelRates_3way.py --- a/tools/regVariation/getIndelRates_3way.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,249 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) - -import sys, os, tempfile -import traceback -import fileinput -from warnings import warn - -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * - -from bx.intervals.operations import quicksect - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def counter(node, start, end, sort_col): - global full, blk_len, blk_list - if node.start < start: - if node.right: - counter(node.right, start, end, sort_col) - elif start <= node.start <= end and start <= node.end <= end: - full += 1 - if node.other[0] not in blk_list: - blk_list.append(node.other[0]) - blk_len += int(node.other[sort_col+2]) - if node.left and node.left.maxend > start: - counter(node.left, start, end, sort_col) - if node.right: - counter(node.right, start, end, sort_col) - elif node.start > end: - if node.left: - counter(node.left, start, end, sort_col) - - -infile = sys.argv[1] -fout = open(sys.argv[2],'w') -int_file = sys.argv[3] -if int_file != "None": #User has specified an interval file - try: - fint = open(int_file, 'r') - dbkey_i = sys.argv[4] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) - except: - stop_err("Unable to open input Interval file") - -def main(): - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... 
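-    # The loop above sniffs out the first non-empty, non-comment line (giving
-    # up after 30 lines) so that the column count can be validated up front:
-    # 'Fetch Indels from 3-way alignments' writes 18 tab-separated fields per
-    # record, which is exactly what the check below enforces.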
- - if len( elems ) != 18: - stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." ) - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - if int_file != "None": - if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]: - stop_err("The species build corresponding to your interval file is not present in the Indel file.") - if dbkey_i in elems[3]: - sort_col = 4 - elif dbkey_i in elems[8]: - sort_col = 9 - elif dbkey_i in elems[13]: - sort_col = 14 - else: - species = [] - species.append( elems[3].split('.')[0] ) - species.append( elems[8].split('.')[0] ) - species.append( elems[13].split('.')[0] ) - sort_col = 0 #Based on block numbers - break - except: - continue - - - fin = open(infile, 'r') - skipped = 0 - - if int_file == "None": - sorted_infile = tempfile.NamedTemporaryFile() - cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile - try: - os.system(cmdline) - except: - stop_err("Encountered error while sorting the input file.") - print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2]) - prev_bnum = -1 - sorted_infile.seek(0) - for line in sorted_infile.readlines(): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - new_bnum = int(elems[0]) - if new_bnum != prev_bnum: - if prev_bnum != -1: - irate = [] - drate = [] - for i,elem in enumerate(inserts): - try: - irate.append(str("%.2e" %(inserts[i]/blen[i]))) - except: - irate.append('0') - try: - drate.append(str("%.2e" %(deletes[i]/blen[i]))) - except: - drate.append('0') - print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) - inserts = [0.0, 0.0, 0.0] - deletes = [0.0, 0.0, 0.0] - blen = [] - blen.append( int(elems[6]) ) - blen.append( int(elems[11]) ) - blen.append( int(elems[16]) ) - line_sp = elems[1].split('.')[0] - sp_ind = species.index(line_sp) - if elems[1].endswith('insert'): - inserts[sp_ind] += 1 - elif elems[1].endswith('delete'): - deletes[sp_ind] += 1 - prev_bnum = new_bnum - except Exception, ei: - #print >>sys.stderr, ei - continue - irate = [] - drate = [] - for i,elem in enumerate(inserts): - try: - irate.append(str("%.2e" %(inserts[i]/blen[i]))) - except: - irate.append('0') - try: - drate.append(str("%.2e" %(deletes[i]/blen[i]))) - except: - drate.append('0') - print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate)) - sys.exit() - - - inf = open(infile, 'r') - start_met = False - end_met = False - sp_file = tempfile.NamedTemporaryFile() - for n, line in enumerate(inf): - line = line.rstrip('\r\n') - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 18 - if dbkey_i not in elems[1]: - if not(start_met): - continue - else: - sp_end = n - break - else: - print >>sp_file, line - if not(start_met): - start_met = True - sp_start = n - except: - continue - - try: - assert sp_end - except: - sp_end = n+1 - - sp_file.seek(0) - win = NiceReaderWrapper( fileinput.FileInput( int_file ), - chrom_col=chr_col_i, - start_col=start_col_i, - end_col=end_col_i, - strand_col=strand_col_i, - fix_strand=True) - - indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ), - chrom_col=1, - start_col=sort_col, - 
end_col=sort_col+1, - strand_col=-1, - fix_strand=True) - - indelTree = quicksect.IntervalTree() - for item in indel: - if type( item ) is GenomicInterval: - indelTree.insert( item, indel.linenum, item.fields ) - result=[] - - global full, blk_len, blk_list - for interval in win: - if type( interval ) is Header: - pass - if type( interval ) is Comment: - pass - elif type( interval ) == GenomicInterval: - chrom = interval.chrom - start = int(interval.start) - end = int(interval.end) - if start > end: - warn( "Interval start after end!" ) - ins_chr = "%s.%s_insert" %(dbkey_i,chrom) - del_chr = "%s.%s_delete" %(dbkey_i,chrom) - irate = 0 - drate = 0 - if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms: - pass - else: - if ins_chr in indelTree.chroms: - full = 0.0 - blk_len = 0 - blk_list = [] - root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree - counter(root, start, end, sort_col) - if blk_len: - irate = full/blk_len - - if del_chr in indelTree.chroms: - full = 0.0 - blk_len = 0 - blk_list = [] - root = indelTree.chroms[del_chr] #root node for the chrom deletion tree - counter(root, start, end, sort_col) - if blk_len: - drate = full/blk_len - - interval.fields.append(str("%.2e" %irate)) - interval.fields.append(str("%.2e" %drate)) - print >>fout, "\t".join(interval.fields) - fout.flush() - -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndelRates_3way.xml --- a/tools/regVariation/getIndelRates_3way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="indelRates_3way" name="Estimate Indel Rates" version="1.0.0"> - <description> for 3-way alignments</description> - <command interpreter="python"> - getIndelRates_3way.py $input1 $out_file1 - #if $region.type == "align" - "None" - #else - $region.input2 $input2.dbkey $input2.metadata.chromCol,$input2.metadata.startCol,$input2.metadata.endCol,$input2.metadata.strandCol - #end if - </command> - <inputs> - <page> - <param format="tabular" name="input1" type="data" label="Select dataset containing Indels"/> - - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="win" selected="True">Intervals in your history</option> - <option value="align">Alignment block</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - - <tests> - <test> - <param name="input1" value="indels_3way.tabular"/> - <param name="type" value="align"/> - <output name="out_file1" file="indelrates_3way.tabular"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool estimates the insertion and deletion rates for alignments in a window of specified size. Rates are computed over the total adjusted lengths (adjusted by disregarding masked bases) of all the alignment blocks from the indel file that fall within that window. - ------ - -.. class:: warningmark - -**Note** - -This tool only works on the output of the 'Fetch Indels from 3-way alignments' tool.
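-
-Each reported rate is an event count divided by the total adjusted length,
-with empty blocks falling back to a rate of zero. A minimal Python sketch of
-that per-window calculation (a hypothetical helper, not part of this tool)::
-
-    def indel_rate(event_count, adjusted_length):
-        # Events per aligned base; mirrors the script's try/except
-        # fallback to '0' when the adjusted length is zero.
-        try:
-            return "%.2e" % (event_count / float(adjusted_length))
-        except ZeroDivisionError:
-            return "0"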
- -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels.py --- a/tools/regVariation/getIndels.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,123 +0,0 @@ -#!/usr/bin/env python - -""" -Estimate INDELs for pair-wise alignments. - -usage: %prog maf_input out_file1 out_file2 -""" - -from __future__ import division -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -try: - pkg_resources.require("numpy") -except: - pass -import psyco_full -import sys -from bx.cookbook import doc_optparse -from galaxy.tools.exception_handling import * -import bx.align.maf - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - inp_file, out_file1 = args - except: - print >> sys.stderr, "Tool initialization error." - sys.exit() - - try: - fin = open(inp_file,'r') - except: - print >> sys.stderr, "Unable to open input file" - sys.exit() - try: - fout1 = open(out_file1,'w') - #fout2 = open(out_file2,'w') - except: - print >> sys.stderr, "Unable to open output file" - sys.exit() - - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - except: - print >>sys.stderr, "Your MAF file appears to be malformed." - sys.exit() - maf_count = 0 - - print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length" - for block_ind, block in enumerate(maf_reader): - if len(block.components) < 2: - continue - seq1 = block.components[0].text - src1 = block.components[0].src - start1 = block.components[0].start - if len(block.components) == 2: - seq2 = block.components[1].text - src2 = block.components[1].src - start2 = block.components[1].start - #for pos in range(len(seq1)): - nt_pos1 = start1-1 #position of the nucleotide (without counting gaps) - nt_pos2 = start2-1 - pos = 0 #character column position - gaplen1 = 0 - gaplen2 = 0 - prev_pos_gap1 = 0 - prev_pos_gap2 = 0 - while pos < len(seq1): - if prev_pos_gap1 == 0: - gaplen1 = 0 - if prev_pos_gap2 == 0: - gaplen2 = 0 - - if seq1[pos] == '-': - if seq2[pos] != '-': - nt_pos2 += 1 - gaplen1 += 1 - prev_pos_gap1 = 1 - #write 2 - if prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2) - if pos == len(seq1)-1: - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1) - else: - prev_pos_gap1 = 0 - prev_pos_gap2 = 0 - """ - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1) - elif prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2) - """ - else: - nt_pos1 += 1 - if seq2[pos] != '-': - nt_pos2 += 1 - #write both - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1) - elif prev_pos_gap2 == 1: - prev_pos_gap2 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2) - else: - gaplen2 += 1 - prev_pos_gap2 = 1 - #write 1 - if prev_pos_gap1 == 1: - prev_pos_gap1 = 0 - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1) - if pos == len(seq1)-1: - print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" 
%(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2) - pos += 1 -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels_2way.xml --- a/tools/regVariation/getIndels_2way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -<tool id="getIndels_2way" name="Fetch Indels"> - <description> from pairwise alignments</description> - <command interpreter="python"> - getIndels.py $input1 $out_file1 - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select data"/> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="6.maf"/> - <output name="out_file1" file="6_indels.tabular"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool estimates the number of indels for every alignment block of the MAF file. - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly 2 species will be omitted. - ------ - -**Example** - -- For the following alignment block:: - - a score=7233.0 - s hg18.chr1 100 35 + 247249719 AT--GACTGAGGACTTAGTTTAAGATGTTCCTACT - s rheMac2.chr11 200 31 + 134511895 ATAAG-CGGACGACTTAGTTTAAGATGTTCC---- - -- running this tool will return:: - - #Block Source Seq1_Start Seq1_End Seq2_Start Seq2_End Indel_length - 1 hg18.chr1 101 102 202 204 2 - 1 rheMac2.chr11 103 104 204 205 1 - 1 rheMac2.chr11 129 133 229 230 4 - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/getIndels_3way.xml --- a/tools/regVariation/getIndels_3way.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -<tool id="indels_3way" name="Fetch Indels" version="1.0.3"> - <description> from 3-way alignments</description> - <command interpreter="perl"> - parseMAF_smallIndels.pl $input1 $out_file1 $outgroup - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select data"/> - <param name="outgroup" type="select" label="Select outgroup species"> - <options> - <filter type="data_meta" ref="input1" key="species" /> - </options> - </param> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - <!--<data format="tabular" name="out_file2" metadata_source="input1"/>--> - </outputs> - <tests> - <test> - <param name="input1" value="3way.maf"/> - <param name="outgroup" value="canFam2"/> - <output name="out_file1" file="indels_3way.tabular"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool consists of the first module from the computational pipeline to identify indels as described in Kvikstad et al., 2007. Note that the generated output does not include subsequent filtering steps. - -Deletions in a particular species are identified as one or more consecutive gap columns within an alignment block, given that the orthologous positions in the other two species contain nucleotides of -equal length. -Similarly, insertions in a particular species are identified as one or more consecutive nucleotide columns within an alignment block, given that the orthologous positions in the other two -species contain gaps. - -*Kvikstad E. M. et al. (2007). A Macaques-Eye View of Human Insertions and Deletions: Differences in Mechanisms. 
PLoS Computational Biology 3(9):e176* - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly 3 sequences will be omitted. - - </help> - - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/linear_regression.py --- a/tools/regVariation/linear_regression.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] -outfile2 = sys.argv[5] - -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') -elems = [] -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - -NA = 'NA' -for ind,line in enumerate( file( infile )): - if line and not line.startswith( '#' ): - try: - fields = line.split("\t") - try: - yval = float(fields[y_col]) - except: - yval = r('NA') - y_vals.append(yval) - for k,col in enumerate(x_cols): - try: - xval = float(fields[col]) - except: - xval = r('NA') - x_vals[k].append(xval) - except: - pass - -x_vals1 = numpy.asarray(x_vals).transpose() - -dat= r.list(x=array(x_vals1), y=y_vals) - -set_default_mode(NO_CONVERSION) -try: - linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat)) -except RException, rex: - stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.") -set_default_mode(BASIC_CONVERSION) - -coeffs=linear_model.as_py()['coefficients'] -yintercept= coeffs['(Intercept)'] -summary = r.summary(linear_model) - -co = summary.get('coefficients', 'NA') -""" -if len(co) != len(x_vals)+1: - stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.") -""" - -try: - yintercept = r.round(float(yintercept), digits=10) - pvaly = r.round(float(co[0][3]), digits=10) -except: - pass - -print >>fout, "Y-intercept\t%s" %(yintercept) -print >>fout, "p-value (Y-intercept)\t%s" %(pvaly) - -if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable - try: - slope = r.round(float(coeffs['x']), digits=10) - except: - slope = 'NA' - try: - pval = r.round(float(co[1][3]), digits=10) - except: - pval = 'NA' - print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope) - print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval) -else: #Multiple regression case with >1 predictors - ind=1 - while ind < len(coeffs.keys()): - try: - slope = r.round(float(coeffs['x'+str(ind)]), digits=10) - except: - slope = 'NA' - print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,slope) - try: - pval = r.round(float(co[ind][3]), digits=10) - except: - pval = 'NA' - print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval) - ind+=1 - -rsq = summary.get('r.squared','NA') -adjrsq = summary.get('adj.r.squared','NA') -fstat = summary.get('fstatistic','NA') -sigma = summary.get('sigma','NA') - -try: - rsq = 
r.round(float(rsq), digits=5) - adjrsq = r.round(float(adjrsq), digits=5) - fval = r.round(fstat['value'], digits=5) - fstat['value'] = str(fval) - sigma = r.round(float(sigma), digits=10) -except: - pass - -print >>fout, "R-squared\t%s" %(rsq) -print >>fout, "Adjusted R-squared\t%s" %(adjrsq) -print >>fout, "F-statistic\t%s" %(fstat) -print >>fout, "Sigma\t%s" %(sigma) - -r.pdf( outfile2, 8, 8 ) -if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable - sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept) - try: - r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") - r.abline(a=yintercept, b=slope, col="red") - except: - pass -else: - r.pairs(dat, main="Scatterplot Matrix", col="blue") -try: - r.plot(linear_model) -except: - pass -r.dev_off() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/linear_regression.xml --- a/tools/regVariation/linear_regression.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1"> - <description> </description> - <command interpreter="python"> - linear_regression.py - $input1 - $response_col - $predictor_cols - $out_file1 - $out_file2 - 1>/dev/null - </command> - <inputs> - <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? See TIP below."/> - <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" numerical="True"/> - <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" numerical="True" multiple="true" > - <validator type="no_options" message="Please select at least one column."/> - </param> - </inputs> - <outputs> - <data format="input" name="out_file1" metadata_source="input1" /> - <data format="pdf" name="out_file2" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="regr_inp.tabular"/> - <param name="response_col" value="3"/> - <param name="predictor_cols" value="1,2"/> - <output name="out_file1" file="regr_out.tabular"/> - <output name="out_file2" file="regr_out.pdf"/> - </test> - </tests> - <help> - - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters* - ------ - -.. class:: infomark - -**What it does** - -This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied. - -*R Development Core Team (2009). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. ISBN 3-900051-07-0, URL http://www.R-project.org.* - ------ - -.. class:: warningmark - -**Note** - -- This tool currently treats all predictor and response variables as continuous numeric variables. Running the tool on categorical variables might result in incorrect results. - -- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. 
- -- The summary statistics in the output are described below: - - - sigma: the square root of the estimated variance of the random error (standard error of the residuals) - R-squared: the fraction of variance explained by the model - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of predictors (p) - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative. - - - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/maf_cpg_filter.py --- a/tools/regVariation/maf_cpg_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -#Adapted from bx/scripts/maf_mask_cpg.py -""" -Mask out potential CpG sites from a maf. Restricted or inclusive definition -of CpG sites can be used. The total fraction masked is printed to stderr. - -usage: %prog < input > output restricted - -m, --mask=N: Index of the mask character to use (0, i.e. '#', is default) -""" - -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -try: - pkg_resources.require( "numpy" ) -except: - pass -import bx.align -import bx.align.maf -from bx.cookbook import doc_optparse -import sys -import bx.align.sitemask.cpg - -assert sys.version_info[:2] >= ( 2, 4 ) - -def main(): - options, args = doc_optparse.parse( __doc__ ) - try: - inp_file, out_file, sitetype, definition = args - if options.mask: - mask = int(options.mask) - else: - mask = 0 - except: - print >> sys.stderr, "Tool initialization error." - sys.exit() - - reader = bx.align.maf.Reader( open(inp_file, 'r') ) - writer = bx.align.maf.Writer( open(out_file,'w') ) - - mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'} - mask = mask_chr_dict[mask] - - if sitetype == "CpG": - if int(definition) == 1: - cpgfilter = bx.align.sitemask.cpg.Restricted( mask=mask ) - defn = "CpG-Restricted" - else: - cpgfilter = bx.align.sitemask.cpg.Inclusive( mask=mask ) - defn = "CpG-Inclusive" - else: - cpgfilter = bx.align.sitemask.cpg.nonCpG( mask=mask ) - defn = "non-CpG" - cpgfilter.run( reader, writer.write ) - - print "%2.2f percent bases masked; Mask character = %s, Definition = %s" %(float(cpgfilter.masked)/float(cpgfilter.total) * 100, mask, defn) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/maf_cpg_filter.xml --- a/tools/regVariation/maf_cpg_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ -<tool id="cpgFilter" name="Mask CpG/non-CpG sites" version="1.0.0"> - <description> from MAF file</description> - <command interpreter="python"> - maf_cpg_filter.py - $input - $out_file1 - $masksite.type - #if $masksite.type == "CpG": - $masksite.definition - #else: - "NA" - #end if - -m $mask_char - </command> - <inputs> - <page> - <param format="maf" name="input" type="data" label="Select data"/> - <param name="mask_char" size="5" type="select" label="Mask character"> - <option value="0" selected="true">#</option> - <option value="1">$</option> - <option value="2">^</option> - <option value="3">*</option> - <option value="4">?</option> - <option value="5">N</option> - </param> - <conditional name="masksite"> - <param name="type" size="5" type="select" label="Sites to be masked"> - <option value="CpG" selected="true">CpG sites</option> - <option value="nonCpG">non-CpG sites</option> - </param> - <when value="CpG"> - <param name="definition" size="5" 
type="select" label="Definition"> - <option value="0" selected="true">Inclusive</option> - <option value="1">Restricted</option> - </param> - </when> - <when value="nonCpG" /> - </conditional> - </page> - </inputs> - <outputs> - <data format="maf" name="out_file1" metadata_source="input"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input" value="6.maf"/> - <param name="mask_char" value="0"/> - <param name="type" value="CpG" /> - <param name="definition" value="0" /> - <output name="out_file1" file="6_mask_cpg.maf"/> - </test> - <test> - <param name="input" value="6.maf"/> - <param name="mask_char" value="0"/> - <param name="type" value="nonCpG" /> - <output name="out_file1" file="6_mask_noncpg.maf"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool takes a MAF file as input and masks CpG sites in every alignment block of the MAF file. - ------ - -.. class:: warningmark - -**Note** - -*Inclusive definition* defines CpG sites as those sites that are CG in at least one of the species. - -*Restricted definition* considers sites to be CpG if they are CG in at least one of the species, however, sites that are part of overlapping CpGs are excluded. - -For more information on CpG site definitions, please refer this article_. - -.. _article: http://mbe.oxfordjournals.org/cgi/content/full/23/3/565 - - </help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsatellite_birthdeath.pl --- a/tools/regVariation/microsatellite_birthdeath.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3984 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use warnings; -use Term::ANSIColor; -use Pod::Checker; -use File::Basename; -use IO::Handle; -use Cwd; -use File::Path qw(make_path remove_tree); -use File::Temp qw/ tempfile tempdir /; -my $tdir = tempdir( CLEANUP => 0 ); -chdir $tdir; -my $dir = getcwd; -#print "current dit=$dir\n"; - -use vars qw (%treesToReject %template $printer $interr_poscord $interrcord $no_of_interruptionscord $stringfile @tags -$infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species -$gapcord %thresholdhash $tree_decipherer @sp_ident %revHash %sameHash %treesToIgnore %alternate @exactspecies @exacttags); -use FileHandle; -use IO::Handle; # 5.004 or higher - -#my @ar = ("/Users/ydk/work/rhesus_microsat/results/galay/chr22_5sp.maf.txt", "/Users/ydk/work/rhesus_microsat/results/galay/dataset_11.dat", -#"/Users/ydk/work/rhesus_microsat/results/galay/chr22_5spec.maf.summ","hg18,panTro2,ponAbe2,rheMac2,calJac1","((((hg18, panTro2), ponAbe2), rheMac2), calJac1)","9,10,12,12", -#"10","0.8"); -my @ar = @ARGV; -my ($maf, $orth, $summout, $species_set, $tree_definition, $thresholds, $FLANK_SUPPORT, $SIMILARITY_THRESH) = @ar; -$SIMILARITY_THRESH=$SIMILARITY_THRESH/100; -######################### -$SIMILARITY_THRESH = $SIMILARITY_THRESH/100; -my $EDGE_DISTANCE = 10; -my $COMPLEXITY_SUPPORT = 20; -load_thresholds("9_10_12_12"); -######################### - -my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); - -#print "complexity=$complexity\n"; -#<STDIN>; - -#$printer = 1; - -my $rando = int(rand(1000)); -my $localdate = `date`; -$localdate =~ /([0-9]+):([0-9]+):([0-9]+)/; -my $info = $rando.$1.$2.$3; - -#--------------------------------------------------------------------------- -# GETTING INPUT INFORMATION AND OPENING INPUT AND OUTPUT FILES - - -my @thresharr = (0, 
split(/,/,$thresholds)); -my $randno=int(rand(100000)); -my $megamatch = $randno.".megamatch.net.axt"; #"/gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr1.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt"; -my $megamatchlck = $megamatch.".lck"; -unlink $megamatchlck; - -#my $selected= $orth; -#my $eventfile = $orth; -#$selected = $selected."_SELECTED"; -#$selected = $selected."_".$SIMILARITY_THRESH; -#my $runtime = $selected.".runtime"; - -my $inputtags = "H:C:O:R:M"; -$inputtags = $ARGV[3] if exists $ARGV[3] && $ARGV[3] =~ /[A-Z]:[A-Z]/; - -my @all_tags = split(/:/, $inputtags); -my $inputsp = "hg18:panTro2:ponAbe2:rheMac2:calJac1"; -$inputsp = $ARGV[4] if exists $ARGV[4] && $ARGV[3] =~ /[0-9]+:/; -@sp_ident = split(/:/,$inputsp); -my $junkfile = $orth."_junk"; - -my $sh = load_sameHash(1); -my $rh = load_revHash(1); - -#print "inputs are : \n"; foreach(@ARGV){print $_,"\n";} -#open (SELECT, ">$selected") or die "Cannot open selected file: $selected: $!"; -open (SUMMARY, ">$summout") or die "Cannot open summout file: $summout: $!"; -#open (RUN, ">$runtime") or die "Cannot open orth file: $runtime: $!"; -#my $ctlfile = "baseml\.ctl"; #$ARGV[4]; -#my $treefile = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/"; #1 THIS IS THE THE TREE UNDER CONSIDERATION, IN NEWICK -my %registeredTrees = (); -my @removalReasons = -("microsatellite is compound", -"complex structure", -"if no. if micros is more than no. of species", -"if more than one micro per species ", -"if microsat contains N", -"different motif than required ", -"more than zero interruptions", -"microsat could not form key ", -"orthologous microsats of different motif size ", -"orthologous microsats of different motifs ", -"microsats belong to different alignment blocks altogether", -"microsat near edge", -"microsat in low complexity region", -"microsat flanks dont align well", -"phylogeny not informative"); -my %allowedhash=(); -#--------------------------------------------------------------------------- -# WORKING ON MAKING THE MEGAMATCH FILE -my $chromt=int(rand(10000)); -my $p_chr=$chromt; - -$tree_definition=~s/,/, /g; -$tree_definition =~ s/, +/, /g; -my @exactspeciesset_unarranged = split(/,/,$species_set); -my $largesttree = "$tree_definition;"; -$tree_definition=~s/[\)\(, ]/\t/g; - -my @treespecies=split(/\t+/,$tree_definition); - -foreach my $spec (@treespecies){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } -} -#print "exactspecies=@exactspecies\n"; -my $focalspec = $exactspecies[0]; -my $arranged_species_set=join(".",@exactspecies); -@exacttags=@exactspecies; -foreach my $extag (@exacttags){ - $extag =~ s/hg18/H/g; - $extag =~ s/panTro2/C/g; - $extag =~ s/ponAbe2/O/g; - $extag =~ s/rheMac2/R/g; - $extag =~ s/calJac1/M/g; -} -my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); -#print "sending to maftoAxt_multispecies: $maf, $tree_definition, $chr_name, $species_set .. 
focalspec=$focalspec \n"; -maftoAxt_multispecies($maf, $tree_definition, $chr_name, $species_set); -my @filterseqfiles= ($chr_name); - $largesttree =~ s/hg18/H/g; - $largesttree =~ s/panTro2/C/g; - $largesttree =~ s/ponAbe2/O/g; - $largesttree =~ s/rheMac2/R/g; - $largesttree =~ s/calJac1/M/g; -#--------------------------------------------------------------------------- - -my ($lagestnodes, $largestbranches) = get_nodes($largesttree); -shift (@$lagestnodes); -my @extendedtitle=(); - -my $title = (); -my $parttitle = (); -my @titlearr = (); -my @firsttitle=($focalspec."chrom", $focalspec."start", $focalspec."end", $focalspec."motif", $focalspec."motifsize", $focalspec."threshold"); - -my @finames= qw(chr start end motif motifsize microsat mutation mutation.position mutation.from mutation.to insertion.details deletion.details); - -my @fititle=(); - -foreach my $spec (split(",",$species_set)){ - push @fititle, $spec; - foreach my $name (@finames){ - push @fititle, $spec.".".$name; - } -} - - -my @othertitle=qw(somechr somestart somened event source); - -my @fnames = (); -push @fnames, qw(insertions_num deletions_num motinsertions_num motinsertionsf_num motdeletions_num motdeletionsf_num noninsertions_num nondeletions_num) ; -push @fnames, qw(binsertions_num bdeletions_num bmotinsertions_num bmotinsertionsf_num bmotdeletions_num bmotdeletionsf_num bnoninsertions_num bnondeletions_num) ; -push @fnames, qw(dinsertions_num ddeletions_num dmotinsertions_num dmotinsertionsf_num dmotdeletions_num dmotdeletionsf_num dnoninsertions_num dnondeletions_num) ; -push @fnames, qw(ninsertions_num ndeletions_num nmotinsertions_num nmotinsertionsf_num nmotdeletions_num nmotdeletionsf_num nnoninsertions_num nnondeletions_num) ; -push @fnames, qw(substitutions_num bsubstitutions_num dsubstitutions_num nsubstitutions_num indels_num subs_num); - -my @fullnames = (); - -foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - # print "p = $p\n"; - $p =~ s/[\(\), ]+//g; - $p =~ s/H/hg18/g; - $p =~ s/C/panTro2/g; - $p =~ s/O/ponAbe2/g; - $p =~ s/R/rheMac2/g; - $p =~ s/M/calJac1/g; - foreach my $n (@fnames) { push @fullnames, $p.".".$n;} - } -} -print SUMMARY "#",join("\t", @firsttitle, @fititle, @othertitle); - -print SUMMARY "\t",join("\t", @fullnames); -#$title = $title."\t".join("\t", @fullnames); - -print SUMMARY "\t",join("\t", @fnames); -#$title= $title."\t".join("\t", @fnames); - -print SUMMARY "\t","tree","\t", "cleancase", "\n"; -#$title= $title."\t"."tree"."\t"."cleancase". "\n"; - -#print $title; #<STDIN>; - -#print "all_tags = @all_tags\n"; - -for my $no (3 ... $#all_tags+1){ -# print "no=$no\n"; #<STDIN>; - @tags = @all_tags[0 ... 
$no-1]; - #print "tags = = @tags\n" if $printer == 1; - %template=(); - my @nextcounter = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - #next if scalar(@tags) < 4; - - #print "now doing tags = @tags, no = $no\n"; - open (ORTH, "<$orth") or die "Cannot open orth file: $orth: $!"; - -# print SUMMARY join "\t", qw (species chr start end branch motif microsat mutation position from to insertion deletion); - - - ##################### T E M P O R A R Y ##################### - my @finaltitle=(); - my @singletitle = qw (species chr start end motif motifsize microsat strand microsatsize col10 col11 col12 col13); - my $endtitle = (); - foreach my $tag (@tags){ - my @tempsingle = (); - - foreach my $single (@singletitle){ - push @tempsingle, $tag.$single; - } - @finaltitle = (@finaltitle, @tempsingle); - } - -# print SUMMARY join("\t",@finaltitle),"\n"; - - ############################################################# - - #--------------------------------------------------------------------------- - # GET THE TREE FROM TREE FILE - my $tree = (); - $tree = "((H, C), O)" if $no == 3; - $tree = "(((H, C), O), R)" if $no == 4; - $tree = "((((H, C), O), R), M)" if $no == 5; -# $tree=~s/;$//g; -# print "our tree = $tree\n"; - #--------------------------------------------------------------------------- - # LOADING HASH CONTAINING ALL POSSIBLE TREES: - $tree_decipherer = "/gpfs/home/ydk104/work/rhesus_microsat/codes/lib/tree_analysis_".join("",@tags).".txt"; - load_allPossibleTrees($tree_decipherer, \%template, \%alternate); - - #--------------------------------------------------------------------------- - # LOADING THE TREES TO REJECT FOR BIRTH ANALYSIS - %treesToReject=(); - %treesToIgnore=(); - load_treesToReject(@tags); - load_treesToIgnore(@tags); - #--------------------------------------------------------------------------- - # LOADING INPUT DATA INTO HASHES AND ARRAYS - - - #1 THIS IS THE POINT WHERE WE CAN FILTER OUT LARGE MICROSAT CLUSTERS - #2 AS WELL AS MULTIPLE-ALIGNMENT-BLOCKS-SPANNING MICROSATS (KIND OF - #3 IMPLICIT IN THE FIRST PART OF THE SENTENCE ITSELF IN MOST CASES). 
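-	# The read loop below applies, in order, the filters tallied in
-	# @nextcounter/@removalReasons: compound microsats, cryptic (unresolvably
-	# complex) entries, clusters with more members than species, more than one
-	# entry for a single species, microsats containing N, interrupted
-	# microsats, clusters mixing motif sizes or motifs, and clusters whose
-	# members come from different alignment blocks.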
- - my %orths=(); - my $counterm = 0; - my $loaded = 0; - my %seen = (); - my @allowedchrs = (); -# print "no = $no\n"; #<STDIN>; - - while (my $line = <ORTH>){ - #print "line=$line\n"; - $line =~ s/>hg18/>H/g; - $line =~ s/>panTro2/>C/g; - $line =~ s/>ponAbe2/>O/g; - $line =~ s/>rheMac2/>R/g; - $line =~ s/>calJac1/>M/g; - my @micros = split(/>/,$line); # LOADING ALL THE MICROSAT ENTRIES FROM THE CLUSTER INTO @micros - #print "micros=",printarr(@micros),"\n"; #<STDIN>; - shift @micros; # EMPTYING THE FIRST, EMTPY ELEMENT OF THE ARRAY - $no_of_species = adjustCoordinates($micros[0]); - next if $no_of_species != $no; - $counterm++; - #------------------------------------------------ - $nextcounter[0]++ if $line =~ /compound/; - next if $line =~ /compound/; # GETTING RID OF COMPOUND MICROSATS - #------------------------------------------------ - #next if $line =~ /[A-Za-z]>[a-zA-Z]/; - #------------------------------------------------ - chomp $line; - my $match_count = ($line =~ s/>/>/g); # COUNTING THE NUMBER OF MICROSAT ENTRIES IN THE CLUSTER - #print "number of species = $match_count\n"; - my $stopper = 0; - foreach my $mic (@micros){ - my @local = split(/\t/,$mic); - if ($local[$typecord] =~ /\./ || exists($local[$no_of_interruptionscord+2])) {$stopper = 1; $nextcounter[1]++; - last; } - # REMOVING CLUSTERS WITH THE CYRPTIC, (UNRESOLVABLY COMPLEX) MICROSAT ENTRIES IN THEM - } - next if $stopper ==1; - #------------------------------------------------ - $nextcounter[2]++ if (scalar(@micros) >$no_of_species); - - next if (scalar(@micros) >$no_of_species); #1 REMOVING MICROSAT CLUSTERS WITH MORE NUMBER OF MICROSAT ENTRIES THAN THE NUMBER OF SPECIES IN THE DATASET. - #2 THIS IS SO BECAUSE SUCH CLUSTERS IMPLY THAT IN AT LEAST ONE SPECIES, THERE IS MORE THAN ONE MICROSAT ENTRY - #3 IN THE CLUSTER. THUS, HERE WE ARE GETTING RID OF MICROSATS CLUSTERS THAT INCLUDE MULTUPLE, NEIGHBORING - #4 MICROSATS, AND STICK TO CLEAN MICROSATS THAT DO NOT HAVE ANY MICROSATS IN NEIGHBORHOOD. - #5 THIS 'NEIGHBORHOOD-RANGE' HAD BEEN DECIDED PREVIOUSLY IN OUR CODE multiSpecies_orthFinder4.pl - my $nexter = 0; - foreach my $tag (@tags){ - my $tagcount = ($line =~ s/>$tag\t/>$tag\t/g); - if ($tagcount > 1) { $nexter =1; #print colored ['red'],"multiple entires per species : $tagcount of $tag\n" if $printer == 1; - next; - } - } - - if ($nexter == 1){ - $nextcounter[3]++; - next; - } - #------------------------------------------------ - foreach my $mic (@micros){ #1 REMOVING MICROSATELLITES WITH ANY 'N's IN THEM - my @local = split(/\t/,$mic); - if ($local[$microsatcord] =~ /N/) {$stopper =1; $nextcounter[4]++; - last;} - } - next if $stopper ==1; - #print "till here 1\n"; #<STDIN>; - #------------------------------------------------ - my @micros_copy = @micros; - - my $tempmicro = shift(@micros_copy); #1 CURRENTLY OBTAINING INFORMATION FOR THE FIRST - #2 MICROSAT IN THE CLUSTER. 
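-	# The first cluster member is parsed below: clusters whose first microsat
-	# carries one or more interruptions are discarded, and a bracketed
-	# compound motif (e.g. "[ATG]...") is reduced to its first motif for the
-	# motif comparisons that follow.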
- my @tempfields = split(/\t/,$tempmicro); - my $prevtype = $tempfields[$typecord]; - my $tempmotif = $tempfields[$motifcord]; - - my $tempfirstmotif = (); - if (scalar(@tempfields) > $microsatcord + 2){ - if ($tempfields[$no_of_interruptionscord] >= 1) { #1 DISCARDING MICROSATS WITH MORE THAN ZERO INTERRUPTIONS - #2 IN THE FIRST MICROSAT OF THE CLUSTER - $nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; - } - } - if ($nexter == 1){ - $nextcounter[6]++; - next; - } #1 DONE OBTAINING INFORMATION REGARDING - #2 THE FIRST MICROSAT FROM THE CLUSTER - - if ($tempmotif =~ /^\[/){ - $tempmotif =~ s/^\[//g; - $tempmotif =~ /([a-zA-Z]+)\].*/; - $tempfirstmotif = $1; #1 OBTAINING THE FIRTS MOTIF OF MICROSAT - } - else {$tempfirstmotif = $tempmotif;} - my $prevmotif = $tempfirstmotif; - - my $key = (); - if ($tempmicro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key = join("\t",$1, $2, $4, $5); - } - else{ - # print "counld not form a key \n" if $printer == 1; - $nextcounter[7]++; - next; - } - #----------------- #1 NOW, AFTER OBTAINING INFORMATION ABOUT - #2 THE FIRST MICROSAT IN THE CLUSTER, THE - #3 FOLLOWING LOOP GOES THROUGH THE OTHER MICROSATS - #4 TO SEE IF THEY SHARE THE REQUIRED FEATURES (BELOW) - - foreach my $micro (@micros_copy){ - my @fields = split(/\t/,$micro); - #----------------- - if (scalar(@fields) > $microsatcord + 2){ #1 DISCARDING MICROSATS WITH MORE THAN ONE INTERRUPTIONS - if ($fields[$no_of_interruptionscord] >= 1) {$nexter =1; #print colored ['blue'],"more than one interruptions \n" if $printer == 1; - $nextcounter[6]++; - last; } - } - #----------------- - if (($prevtype ne "0") && ($prevtype ne $fields[$typecord])) { - $nexter =1; #print colored ['yellow'],"microsat of different type \n" if $printer == 1; - $nextcounter[8]++; - last; } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG - #----------------- #2 TO DIFFERENT TYPES (MONOS, DIS, TRIS ETC.) - $prevtype = $fields[$typecord]; - - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - - my $motifpattern = $firstmotif.$firstmotif; - my $prevmotifpattern = $prevmotif.$prevmotif; - - if (($prevmotif ne "0")&&(($motifpattern !~ /$prevmotif/i)||($prevmotifpattern !~ /$firstmotif/i)) ) { - $nexter =1; #print colored ['green'],"different motifs used \n$line\n" if $printer == 1; - $nextcounter[9]++; - last; - } #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSATS BELONG - #2 TO DIFFERENT MOTIFS - my $prevmotif = $firstmotif; - #----------------- - - for my $t (0 ... 
$#tags){ #1 DISCARDING MICROSAT CLUSTERS WHERE MICROSAT ENTRIES BELONG - #2 DIFFERENT ALIGNMENT BLOCKS - if ($micro =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - my $key2 = join("\t",$1, $2, $4, $5); - if ($key2 ne $key){ -# print "microsats belong to diffferent alignment blocks altogether\n" if $printer == 1; - $nextcounter[10]++; - $nexter = 1; last; - } - } - else{ - # print "counld not form a key \n" if $printer == 1; - $nexter = 1; last; - } - } - - } - ##################### - if ($nexter == 1){ - # print "nexting\n" if $printer == 1; - next; - } - else{ -# print "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n$key:\n$line\nvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv\n" if $printer == 1; - push (@{$orths{$key}},$line); - $loaded++; - if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { -# print "$line\n" if $printer == 1; #if $line =~ /Contig/; -# print "################ ################\n" if $printer == 1; - push @allowedchrs, $2 if !exists $allowedhash{$2}; - $allowedhash{$2} = 1; - my $key = join("\t",$1, $2, $3, $4); - #print "print the shit: $key\n" if $printer == 1; - $seen{$key} = 1; - } - else { #print "Key could not be formed in SPUT for ($org) ($title) ([0-9]+) ([0-9]+)\n"; - } - } - } - close ORTH; - -# print "now studying where we lost microsatellites: @nextcounter\n"; - for my $reason (0 ... $#nextcounter){ -# print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; - } -# print "\ntotal number of keys formed = ", scalar(keys %orths), " = \n"; -# print "done filtering .. counterm = $counterm and loaded = $loaded\n"; - - #---------------------------------------------------------------------------------------------------------------- - # NOW GENERATING THE ALIGNMENT FILE WITH RELELEVENT ALIGNMENTS STORED ONLY. - - while (1){ - if (-e $megamatchlck){ -# print "waiting to write into $megamatchlck\n"; - sleep 10; - } - else{ - open (MEGAMLCK, ">$megamatchlck") or die "Cannot open megamatchlck file $megamatchlck: $!"; - open (MEGAM, ">$megamatch") or die "Cannot open megamatch file $megamatch: $!"; - last; - } - } - - foreach my $seqfile (@filterseqfiles){ - my $fullpath = $seqfile; - -# print "opening file: $fullpath\n"; - open (MATCH, "<$fullpath") or die "Cannot open MATCH file $fullpath: $!"; - my $matchlines = 0; - - while (my $line = <MATCH>) { - if ($line =~ /($focalspec)\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/ ) { - my $key = join("\t",$1, $2, $3, $4); - if (exists $seen{$key}){ - while (1){ - $matchlines++; - print MEGAM $line; - $line = <MATCH>; - print MEGAM "\n" if $line !~/[0-9a-zA-Z]/; - last if $line !~/[0-9a-zA-Z]/; - } - } - } - } -# print "matchlines = $matchlines\n"; - close MATCH; - } - close MEGAMLCK; - - unlink $megamatchlck; - close MEGAM; - undef %seen; - #---------------------------------------------------------------------------------------------------------------- - - #--------------------------------------------------------------------------- - # NOW, AFTER FILTERING MANY MICROSATS, AND LOADING THE FILTERED ONES INTO - # THE HASH %orths , WE GO THROUGH THE ALIGNMENT FILE, AND STUDY THE - # FLANKING SEQUENCES OF ALL THESE MICROSATS, TO FILTER THEM FURTHER - #$printer = 1; - - my $microreadcounter=0; - my $contigsentered=0; - my $contignotrightcounter=0; - my $keynotformedcounter=0; - my $keynotfoundcounter= 0; - my $dotcounter = 0; - - open (BO, "<$megamatch") or die "Cannot open alignment file: $megamatch: $!"; - - while (my $line = <BO>){ -# print "." 
if $dotcounter % 100 ==0; -# print "\n" if $dotcounter % 5000 ==0; -# print "dotcounter = $dotcounter\n " if $printer == 1; - next if $line !~ /^[0-9]+/; - $dotcounter++; -# print colored ['green'], "~" x 60, "\n" if $printer == 1; -# print colored ['green'], $line;# if $printer == 1; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - my $alignment_no = (); #1 TEMPORARY - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - $alignment_no=$1; - } - else {print "seq line $line incompatible\n"; $keynotformedcounter++; next;} - - $no_of_species = adjustCoordinates($line); - $contignotrightcounter++ if $no_of_species != $no; - # print "contignotrightcounter=$contignotrightcounter\n"; - # print "no_of_species=$no_of_species\n"; - # print "no=$no\n"; - - next if $no_of_species != $no; - - # print "key = $key2\n" if $printer == 1; - my @clusters = (); #1 EXTRACTING MICROSATS CORRESPONDING TO THIS - #2 ALIGNMENT BLOCK - if (exists($orths{$key2})){ - @clusters = @{$orths{$key2}}; - $contigsentered++; - delete $orths{$key2}; - } - else{ - #print "orth does not exist\n"; - $keynotfoundcounter++; - next; - } - - my %sequences=(); #1 WILL STORE SEQUENCES IN THE CURRENT ALIGNMENT BLOCK - my $humseq = (); - foreach my $tag (@tags){ #1 READING THE ALIGNMENT FILE AND CAPTURING SEQUENCES - my $seq = <BO>; #2 OF ALL SPECIES. - chomp $seq; - $sequences{$tag} = " ".$seq; - #print "sequences = $sequences{$tag}\n" if $printer == 1; - $humseq = $seq if $tag =~ /H/; - } - - - foreach my $cluster (@clusters){ #1 NOW, GOING THROUGH THE CLUSTER OF MICROSATS - #print "x" x 60, "\n" if $printer == 1; - #print colored ['red'],"cluster = $cluster\n"; - $largesttree =~ s/hg18/H/g; - $largesttree =~ s/panTro2/C/g; - $largesttree =~ s/ponAbe2/O/g; - $largesttree =~ s/rheMac2/R/g; - $largesttree =~ s/calJac1/M/g; - - $microreadcounter++; - my @micros = split(/>/,$cluster); - shift @micros; - - my $edge_microsat=0; #1 THIS WILL HAVE VALUE "1" IF MICROSAT IS FOUND - #2 TO BE TOO CLOSE TO THE EDGES OF ALIGNMENT BLOCK - - my @starts= (); my %start_hash=(); #1 STORES THE START AND END COORDINATES OF MICROSATELLITES - my @ends = (); my %end_hash=(); #2 SO THAT LATER, WE WILL BE ABLE TO FIND THE EXTREME - #3 COORDINATE VALUES OF THE ORTHOLOGOUS MIROSATELLITES. - - my %microhash=(); - my %microsathash=(); - my %nonmicrosathash=(); - my $motif=(); #1 BASIC MOTIF OF THE MICROSATELLITE.. THERE'S ONLY 1 - #print "tags=@tags\n"; - for my $i (0 ... $#tags){ #1 FINDING THE MICROSAT, AND THE ALIGNMENT SEQUENCE - #2 CORRESPONDING TO THE PARTICULAR SPECIES (AS PER - #3 THE VARIABLE $TAG; - my $tag = $tags[$i]; - # print $seq; - my $locus="NULL"; #1 THIS WILL STORE THE MICROSAT OF THIS SPECIES. - #2 IF THERE IS NO MICROSAT, IT WILL REMAIN "NULL" - - foreach my $micro (@micros){ - # print "micro=$micro, tag=$tag\n"; - if ($micro =~ /^$tag/){ #1 MICROSAT OF THIS SPECIES FOUND.. 
- $locus = $micro;
- my @fields = split(/\t/,$micro);
- $motif = $fields[$motifcord];
- $microsathash{$tag}=$fields[$microsatcord];
- # print "fields=@fields, and startcord=$startcord = $fields[$startcord]\n";
- push(@starts, $fields[$startcord]);
- push(@ends, $fields[$endcord]);
- $start_hash{$tag}=$fields[$startcord];
- $end_hash{$tag}=$fields[$endcord];
- last;
- }
- else{$microsathash{$tag}="NULL"}
- }
- $microhash{$tag}=$locus;
-
- }
-
-
-
- my $extreme_start = smallest_number(@starts); #1 THESE TWO ARE THE EXTREME COORDINATES OF THE
- my $extreme_end = largest_number(@ends); #2 MICROSAT CLUSTER ACROSS ALL THE SPECIES IN
- #3 WHICH IT IS FOUND TO BE ORTHOLOGOUS.
-
- #print "starts=@starts... ends=@ends\n";
-
- my %up_flanks = (); #1 CONTAINS UPSTREAM FLANKING REGIONS FOR EACH SPECIES
- my %down_flanks = (); #1 CONTAINS DOWNSTREAM FLANKING REGIONS FOR EACH SPECIES
-
- my %up_largeflanks = ();
- my %down_largeflanks = ();
-
- my %locusandflanks = ();
- my %locusandlargeflanks = ();
-
- my %up_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_start AND THE
- #2 ACTUAL START OF MICROSATELLITE IN THE SPECIES
- my %down_internal_flanks=(); #1 CONTAINS SEQUENCE BETWEEN THE $extreme_end AND THE
- #2 ACTUAL END OF MICROSATELLITE IN THE SPECIES
-
- my %alignment=(); #1 CONTAINS ACTUAL ALIGNMENT SEQUENCE BETWEEN THE TWO
- #2 EXTREME VALUES.
-
- my %microsatstarts=(); #1 WITHIN EACH ALIGNMENT, IF THERE EXISTS A MICROSATELLITE
- #2 THIS HASH CONTAINS THE START SITE OF THE MICROSATELLITE
- #3 WITHIN THE ALIGNMENT
- next if !defined $extreme_start;
- next if !defined $extreme_end;
- next if $extreme_start > length($sequences{$tags[0]});
- next if $extreme_start < 0;
- next if $extreme_end > length($sequences{$tags[0]});
-
- for my $i (0 ... $#tags){ #1 NOW THAT WE HAVE GATHERED INFORMATION REGARDING
- #2 SEQUENCE ALIGNMENT AND MICROSATELLITE COORDINATES
- #3 AS WELL AS THE EXTREME COORDINATES OF THE
- #4 MICROSAT CLUSTER, WE WILL PROCEED TO EXTRACT THE
- #5 FLANKING SEQUENCE OF ALL ORGS, AND STUDY IT IN
- #6 MORE DETAIL.
- my $tag = $tags[$i];
- # print "tag=$tag.. seqlength = ",length($sequences{$tag})," extreme_start=$extreme_start and extreme_end=$extreme_end\n";
- my $upstream_gaps = (substr($sequences{$tag}, 0, $extreme_start) =~ s/\-/-/g); #1 NOW MEASURING THE NUMBER OF GAPS IN THE UPSTREAM
- #2 AND DOWNSTREAM SEQUENCES OF THE MICROSATs IN THIS
- #3 CLUSTER. 
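- # Note on the counting idiom used above and below: a global substitution
- # that replaces "-" with "-" returns the number of replacements made, so
- # it counts gap characters in a window without changing the sequence.
- # Illustrative example (not part of the original script):
- # my $window = substr("AC--G-T", 0, 6);   # "AC--G-"
- # my $gaps   = ($window =~ s/\-/-/g);     # $gaps == 3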
- - my $downstream_gaps = (substr($sequences{$tag}, $extreme_end) =~ s/\-/-/g); - if (($extreme_start - $upstream_gaps )< $EDGE_DISTANCE || (length($sequences{$tag}) - $extreme_end - $downstream_gaps) < $EDGE_DISTANCE){ - $edge_microsat=1; - - last; - } - else{ - $up_flanks{$tag} = substr($sequences{$tag}, $extreme_start - $FLANK_SUPPORT, $FLANK_SUPPORT); - $down_flanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $FLANK_SUPPORT); - - $up_largeflanks{$tag} = substr($sequences{$tag}, $extreme_start - $COMPLEXITY_SUPPORT, $COMPLEXITY_SUPPORT); - $down_largeflanks{$tag} = substr($sequences{$tag}, $extreme_end+1, $COMPLEXITY_SUPPORT); - - - $alignment{$tag} = substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); - $locusandflanks{$tag} = $up_flanks{$tag}."[".$alignment{$tag}."]".$down_flanks{$tag}; - $locusandlargeflanks{$tag} = $up_largeflanks{$tag}."[".$alignment{$tag}."]".$down_largeflanks{$tag}; - - if ($microhash{$tag} ne "NULL"){ - $up_internal_flanks{$tag} = substr($sequences{$tag}, $extreme_start , $start_hash{$tag}-$extreme_start); - $down_internal_flanks{$tag} = substr($sequences{$tag}, $end_hash{$tag} , $extreme_end-$end_hash{$tag}); - $microsatstarts{$tag}=$start_hash{$tag}-$extreme_start; -# print "tag = $tag, internal flanks = $up_internal_flanks{$tag} and $down_internal_flanks{$tag} and start = $microsatstarts{$tag}\n" if $printer == 1; - } - else{ - $nonmicrosathash{$tag}=substr($sequences{$tag}, $extreme_start, $extreme_end-$extreme_start+1); - - } - # print "up flank for species $tag = $up_flanks{$tag} \ndown flank for species $tag = $down_flanks{$tag} \n" if $printer == 1; - - } - - } - $nextcounter[11]++ if $edge_microsat==1; - next if $edge_microsat==1; - - - my $low_complexity = 0; #1 VALUE WILL BE 1 IF ANY OF THE FLANKING REGIONS - #2 IS FOUND TO BE OF LOW COMPLEXITY, BY USING THE - #3 FUNCTION sub test_complexity - - - for my $i (0 ... $#tags){ -# print "i = $tags[$i]\n" if $printer == 1; - if (test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW" || test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT) eq "LOW"){ -# print "i = $i, low complexity regions: $up_largeflanks{$tags[$i]}: ",test_complexity($up_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT), " and $down_largeflanks{$tags[$i]} = ",test_complexity($down_largeflanks{$tags[$i]}, $COMPLEXITY_SUPPORT),"\n" if $printer == 1; - $low_complexity =1; last; - } - } - - $nextcounter[12]++ if $low_complexity==1; - next if $low_complexity == 1; - - - my $sequence_dissimilarity = 0; #1 THIS VALYE WILL BE 1 IF THE SEQUENCE SIMILARITY - #2 BETWEEN ANY OF THE SPECIES AGAINST THE HUMAN - #3 FLANKING SEQUENCES IS BELOW A CERTAIN THRESHOLD - #4 AS DESCRIBED IN FUNCTION sub sequence_similarity - my %donepair = (); - for my $i (0 ... $#tags){ - # print "i = $tags[$i]\n" if $printer == 1; -# next if $i == 0; - # print colored ['magenta'],"THIS IS UP\n" if $printer == 1; - - for my $b (0 ... 
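- # The pairwise comparison below visits each unordered species pair once:
- # the key "i_b" always puts the smaller index first, and %donepair
- # memoizes pairs already scored, so for three tags only 0_1, 0_2 and 1_2
- # are computed. Sketch of the key scheme in isolation (illustrative
- # only, not part of the original script):
- # my %donepair;
- # for my $i (0 .. 2) {
- #     for my $b (0 .. 2) {
- #         next if $b == $i;
- #         my $pair = $i < $b ? $i."_".$b : $b."_".$i;
- #         next if exists $donepair{$pair};
- #         $donepair{$pair} = 1;    # compare the flanks of pair (i, b) here
- #     }
- # }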
$#tags){ - next if $b == $i; - my $pair = (); - $pair = $i."_".$b if $i < $b; - $pair = $b."_".$i if $b < $i; - next if exists $donepair{$pair}; - my ($up_similarity,$upnucdiffs, $upindeldiffs) = sequence_similarity($up_flanks{$tags[$i]}, $up_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); - my ($down_similarity,$downnucdiffs, $downindeldiffs) = sequence_similarity($down_flanks{$tags[$i]}, $down_flanks{$tags[$b]}, $SIMILARITY_THRESH, $info); - $donepair{$pair} = $up_similarity."_".$down_similarity; - -# print RUN "$up_similarity $upnucdiffs $upindeldiffs $down_similarity $downnucdiffs $downindeldiffs\n"; - - if ( $up_similarity < $SIMILARITY_THRESH || $down_similarity < $SIMILARITY_THRESH){ - $sequence_dissimilarity =1; - last; - } - } - } - $nextcounter[13]++ if $sequence_dissimilarity==1; - - next if $sequence_dissimilarity == 1; - my ($simplified_microsat, $Hchrom, $Hstart, $Hend, $locusmotif, $locusmotifsize) = summarize_microsat($cluster, $humseq); - # print "simplified_microsat=$simplified_microsat\n"; <STDIN>; - my ($tree_analysis, $alternative_trees, $conformation) = treeStudy($simplified_microsat); - - if (exists $treesToReject{$tree_analysis}){ - $nextcounter[14]++; - next; - } - -# my $adjuster=(); -# if ($no_of_species == 4){ -# my @sields = split(/\t/,$simplified_microsat); -# my $somend = pop(@sields); -# my $somestart = pop(@sields); -# my $somechr = pop(@sields); -# $adjuster = "NA\t" x 13 ; -# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; -# } -# if ($no_of_species == 3){ -# my @sields = split(/\t/,$simplified_microsat); -# my $somend = pop(@sields); -# my $somestart = pop(@sields); -# my $somechr = pop(@sields); -# $adjuster = "NA\t" x 26 ; -# $simplified_microsat = join ("\t", @sields, $adjuster).$somechr."\t".$somestart."\t".$somend; -# } -# - $registeredTrees{$tree_analysis} = 1 if !exists $registeredTrees{$tree_analysis}; - $registeredTrees{$tree_analysis}++ if exists $registeredTrees{$tree_analysis}; - - if (exists $treesToIgnore{$tree_analysis}){ - my @appendarr = (); - - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - #print "SUMMARY ",$Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize], "\t", $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","NOEVENT", "\t\t", $cluster,"\n"; - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - my @mutinfoarray1 = (); - for (1 ... 38){ - push (@mutinfoarray1, "NA") - } - print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - } - - } - for (1 ... 
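- # For ignored trees the SUMMARY row is padded with 38 "NA" fields per
- # node (plus one final block of 38 and two "NULL" fields below), which
- # appears to mirror the width of the per-node mutation summary that
- # summarizeMutations emits for fully analyzed loci, keeping every output
- # row at the same column count.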
38){ - push (@appendarr, "NA") - } - print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; - # print "SUMMARY ",join ("\t", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; - next; - } - - my ($mutations_array, $nodes, $branches_hash, $alivehash, $primaryalignment) = peel_onion($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); - - if ($mutations_array eq "NULL"){ - my @appendarr = (); - - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", "\t\t", $cluster,"\n"; - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my @nodemutarr = (); - for my $p (@pair){ - my @mutinfoarray1 = (); - for (1 ... 38){ - push (@mutinfoarray1, "NA") - } - print SUMMARY join ("\t", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - # print join ("\t", "SUMMARY", @mutinfoarray1[0...($#mutinfoarray1)] ),"\t"; - } - - } - for (1 ... 38){ - push (@appendarr, "NA") - } - print SUMMARY join ("\t", @appendarr,"NULL", "NULL"),"\n"; - # print join ("\t","SUMMARY", @appendarr,"NULL", "NULL"),"\n"; #<STDIN>; - next; - } - - -# print "sent: \n" if $printer == 1; -# print "nodes = @$nodes, branches array:\n" if $mutations_array ne "NULL" && $printer == 1; - - my ($newstatus, $newmutations_array, $newnodes, $newbranches_hash, $newalivehash, $finalalignment) = fillAlignmentGaps($tree, \%sequences, \%alignment, \@tags, \%microsathash, \%nonmicrosathash, $motif, $tree_analysis, $thresholdhash{length($motif)}, \%microsatstarts); -# print "newmutations_array returned = \n",join("\n",@$newmutations_array),"\n" if $newmutations_array ne "NULL" && $printer == 1; - my @finalmutations_array= (); - @finalmutations_array = selectMutationArray($mutations_array, $newmutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array ne "NULL"; - @finalmutations_array = selectMutationArray($mutations_array, $mutations_array, \@tags, $alivehash, \%alignment, $motif) if $newmutations_array eq "NULL"; - - my ($besttree, $treescore) = selectBetterTree($tree_analysis, $alternate{$tree_analysis}, \@finalmutations_array); - my $cleancase = "UNCLEAN"; - - $cleancase = checkCleanCase($besttree, $finalalignment) if $treescore > 0 && $finalalignment ne "NULL" && $finalalignment =~ /\!/; - $cleancase = checkCleanCase($besttree, $primaryalignment) if $treescore > 0 && $finalalignment eq "NULL" && $primaryalignment =~ /\!/ && $primaryalignment ne "NULL"; - $cleancase = "CLEAN" if $finalalignment eq "NULL" && $primaryalignment !~ /\!/ && $primaryalignment ne "NULL"; - $cleancase = "CLEAN" if $finalalignment ne "NULL" && $finalalignment !~ /\!/ ; - $besttree = "NULL" if $treescore <= 0; - print SUMMARY $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - # print "SUMMARY ", $Hchrom,"\t",$Hstart,"\t",$Hend,"\t",$locusmotif,"\t",$locusmotifsize,"\t", $thresharr[$locusmotifsize],"\t",$simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t"; - -# print SELECT $Hchrom,"\t",$Hstart,"\t",$Hend,"\t","EVENT", 
"\t\t", $cluster,"\n"; - - my @mutinfoarray =(); - - foreach my $lnode (@$lagestnodes){ - my @pair = @$lnode; - my $joint = "(".join(", ",@pair).")"; - my @nodemutarr = (); - - for my $p (@pair){ - foreach my $mut (@finalmutations_array){ - $mut =~ /node=([A-Z, \(\)]+)/; - push @nodemutarr, $mut if $p eq $1; - } -# print "from pair @pair, p=$p\n"; - @mutinfoarray = summarizeMutations(\@nodemutarr, $besttree); - print SUMMARY join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; - # print "SUMMARY ",join ("\t", @mutinfoarray[0...($#mutinfoarray-1)] ),"\t"; - } - - } - - @mutinfoarray = summarizeMutations(\@finalmutations_array, $besttree); - print SUMMARY join ("\t", @mutinfoarray ),"\t"; - print SUMMARY $cleancase, "\n"; - # print "SUMMARY ",join ("\t", @mutinfoarray,$cleancase ),"\n"; #<STDIN>; - # print "summarized\n"; <STDIN>; - - my %indelcatch = (); - my %substcatch = (); - my %typecatch = (); - my %nodescatch = (); - my $mutconcat = join("\t", @finalmutations_array)."\n"; - my %indelposcatch = (); - my %subsposcatch = (); - - foreach my $fmut ( @finalmutations_array){ -# next if $fmut !~ /indeltype=[a-zA-Z]+/; - #print RUN $fmut, "\n"; - $fmut =~ /node=([a-zA-Z, \(\)]+)/; - my $lnode = $1; - $nodescatch{$1}=1; - - if ($fmut =~ /type=substitution/){ - # print "fmut=$fmut\n"; - $fmut =~ /from=([a-zA-Z\-]+)\tto=([a-zA-Z\-]+)/; - my $from=$1; - # print "from=$from\n"; - my $to=$2; - # print "to=$to\n"; - push @{$substcatch{$lnode}} , ("from:".$from." to:".$to); - $fmut =~ /position=([0-9]+)/; - push @{$subsposcatch{$lnode}}, $1; - } - - if ($fmut =~ /insertion=[a-zA-Z\-]+/){ - $fmut =~ /insertion=([a-zA-Z\-]+)/; - push @{$indelcatch{$lnode}} , $1; - $fmut =~ /indeltype=([a-zA-Z]+)/; - push @{$typecatch{$lnode}}, $1; - $fmut =~ /position=([0-9]+)/; - push @{$indelposcatch{$lnode}}, $1; - } - if ($fmut =~ /deletion=[a-zA-Z\-]+/){ - $fmut =~ /deletion=([a-zA-Z\-]+)/; - push @{$indelcatch{$lnode}} , $1; - $fmut =~ /indeltype=([a-zA-Z]+)/; - push @{$typecatch{$lnode}}, $1; - $fmut =~ /position=([0-9]+)/; - push @{$indelposcatch{$lnode}}, $1; - } - } - - # print $simplified_microsat,"\t", $tree_analysis,"\t", join("",@tags), "\t" if $printer == 1; - # print join ("<\t>", @mutinfoarray),"\n" if $printer == 1; - # print "where mutinfoarray = @mutinfoarray\n" if $printer == 1; - # #print RUN "."; - - # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; - # print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; - - # print colored ['red'],"finalmutations_array=\n" if $printer == 1; - foreach (@finalmutations_array) { -# print colored ['red'], "$_\n" if $_ =~ /type=substitution/ && $printer == 1 ; -# print colored ['yellow'], "$_\n" if $_ !~ /type=substitution/ && $printer == 1 ; - - }# if $line =~ /cal/;# && $line =~ /chr4/; - -# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; -# print colored ['red'], "-------------------------------------------------------------\n" if $printer == 1; -# print "tree analysis = $tree_analysis\n" if $printer == 1; - - # my $mutations = "@$mutations_array"; - - - next; - for my $keys (@$nodes) {foreach my $key (@$keys){ - #print "key = $key, => $branches_hash->{$key}\n"; - } - # print "x" x 50, "\n"; - } - my ($birth_steps, $death_steps) = decipher_history($mutations_array,join("",@tags),$nodes,$branches_hash,$tree_analysis,$conformation, $alivehash, $simplified_microsat); - } - } - close BO; -# 
print "now studying where we lost microsatellites:"; -# print "x" x 60,"\n"; - for my $reason (0 ... $#nextcounter){ -# print $removalReasons[$reason]."\t".$nextcounter[$reason],"\n"; - } -# print "x" x 60,"\n"; -# print "In total we read $microreadcounter microsatellites after reading through $contigsentered contigs\n"; -# print " we lost $keynotformedcounter contigs as they did not form the key, \n"; -# print "$contignotrightcounter contigs as they were not of the right species configuration\n"; -# print "$keynotfoundcounter contigs as they did not contain the microsats\n"; -# print "... In total we went through a file that had $dotcounter contigs...\n"; -# print join ("\n","remaining orth keys = ", (keys %orths),""); - - -# print "now printing counted trees: \n"; - if (scalar(keys %registeredTrees) > 0){ - foreach my $keyb ( sort (keys %registeredTrees) ) - { -# print "$keyb : $registeredTrees{$keyb}\n"; - } - } - - -} - -my @summarizarr = ("+C=+C +R.+C -HCOR,+C", -"+H=+H +R.+H -HCOR,+H", -"-C=-C -R.-C +HCOR,-C", -"-H=-H -R.-H +HCOR,-H", -"+HC=+HC", -"-HC=-HC", -"+O=+O -HCOR,+O", -"-O=-O +HCOR,-O", -"+HCO=+HCO", -"-HCO=-HCO", -"+R=+R +R.+C +R.+H", -"-R=-R -R.-C -R.-H"); - -foreach my $line (@summarizarr){ - next if $line !~ /[A-Za-z0-9]/; -# print $line; - chomp $line; - my @fields = split(/=/,$line); -# print "title = $fields[0]\n"; - my @parts=split(/ +/, $fields[1]); - my %partshash = (); - foreach my $part (@parts){$partshash{$part}=1;} - my $count=0; - foreach my $key ( sort keys %registeredTrees ){ - next if !exists $partshash{$key}; -# print "now adding $registeredTrees{$key} from $key\n"; - $count+=$registeredTrees{$key}; - } -# print "$fields[0] : $count\n"; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub largest_number{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} - -sub smallest_number{ - my $counter = 0; - my($min) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp < $min){ - $min = $temp; - } - } - return($min); -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub baseml_parser{ - my $outputfile = $_[0]; - open(BOUT,"<$outputfile") or die "Cannot open output of upstream baseml $outputfile: $!"; - my @info = (); - my @branchields = (); - my @distanceields = (); - my @bout = <BOUT>; - #print colored ['red'], @bout ,"\n"; - for my $b (0 ... 
$#bout){ - my $bine=$bout[$b]; - #print colored ['yellow'], "sentence = ",$bine; - if ($bine =~ /TREE/){ - $bine=$bout[$b++]; - $bine=$bout[$b++]; - $bine=$bout[$b++]; - #print "FOUND",$bine; - chomp $bine; - $bine =~ s/^\s+//g; - @branchields = split(/\s+/,$bine); - $bine=$bout[$b++]; - chomp $bine; - $bine =~ s/^\s+//g; - @distanceields = split(/\s+/,$bine); - #print "LASTING..............\n"; - last; - } - else{ - } - } - - close BOUT; -# print "branchfields = @branchields and distanceields = @distanceields\n" if $printer == 1; - my %distance_hash=(); - for my $d (0 ... $#branchields){ - $distance_hash{$branchields[$d]} = $distanceields[$d]; - } - - $info[0] = $distance_hash{"9..1"} + $distance_hash{"9..2"}; - $info[1] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+ $distance_hash{"8..3"}; - $info[2] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"7..4"}; - $info[3] = $distance_hash{"9..1"} + $distance_hash{"8..9"}+$distance_hash{"7..8"}+$distance_hash{"6..7"}+$distance_hash{"6..5"}; - -# print "\nsending back: @info\n" if $printer == 1; - - return join("\t",@info); - -} - - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub test_complexity{ - my $printer = 0; - my $sequence = $_[0]; - my $COMPLEXITY_SUPPORT = $_[1]; - my $complexity=int($COMPLEXITY_SUPPORT * (1/40)); #1 THIS IS AN ARBITRARY THRESHOLD SET FOR LOW COMPLEXITY. - #2 THE INSPIRATION WAS WEB MILLER'S MAIL SENT ON - #3 19 Apr 2008 WHERE HE CLASSED AS HIGH COMPLEXITY - #4 REGION, IF 40 BP OF SEQUENCE HAS AT LEAST 3 OF - #5 EACH NUCLEOTIDE. HENCE, I NORMALIZE THIS PARAMETER - #6 FOR THE ACTUAL LENGTH OF $FLANK_SUPPORT SET BY - #7 THE USER. - #8 WEB MILLER SENT THE MAIL TO YDK104@PSU.EDU - - - - my $As = ($sequence=~ s/A/A/gi); - my $Ts = ($sequence=~ s/T/T/gi); - my $Gs = ($sequence=~ s/G/G/gi); - my $Cs = ($sequence=~ s/C/C/gi); - #print "seq = $sequence, As=$As, Ts=$Ts, Gs=$Gs, Cs=$Cs\n" if $printer == 1; - - my $ans = (); - - return "HIGH" if $As >= $complexity && $Ts >= $complexity && $Cs >= $complexity && $Gs >= $complexity; - - my @nts = ("A","T","G","C","-"); - - my $lowcomplex = 0; - - foreach my $nt (@nts){ - $lowcomplex =1 if $sequence =~ /(($nt\-*){10,})/i; -# print "caught with a mono of $nt : $1 in $sequence\n" if $sequence =~ /(($nt\-*){10,})/i; - $lowcomplex =1 if $sequence =~ /(($nt[A-Za-z]){10,})/i; - $lowcomplex =1 if $sequence =~ /(([A-Za-z]$nt){10,})/i; -# print "caught with a di with $nt : $2 in $sequence\n" if $sequence =~ /(($nt[A-Za-z]){10,})/i || $sequence =~ /(([A-Za-z]$nt){10,})/i; - my $nont = ($sequence=~ s/$nt/$nt/gi); - - } -# print "leaving for now.. 
$sequence\n" if $printer == 1 && $lowcomplex == 0; - #<STDIN>; - return "HIGH" if $lowcomplex == 0; - return "LOW" ; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub sequence_similarity{ - my $printer = 0; - my @seq1 = split(/\s*/, $_[0]); - my @seq2 = split(/\s*/, $_[1]); - my $similarity_thresh = $_[2]; - my $info = $_[3]; -# print "input = @_\n" if $printer == 1; - my $seq1str = $_[0]; - my $seq2str = $_[1]; - $seq1str=~s/\-//g; $seq2str=~s/\-//g; - my $similarity=0; - - my $nucdiffs=0; - my $nucsims=0; - my $indeldiffs=0; - - for my $i (0...$#seq1){ - $similarity++ if $seq1[$i] =~ /$seq2[$i]/i ; #|| $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i ; - $nucsims++ if $seq1[$i] =~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); - $nucdiffs++ if $seq1[$i] !~ /$seq2[$i]/i && ($seq1[$i] =~ /[a-zA-Z]/i && $seq2[$i] =~ /[a-zA-Z]/i); - $indeldiffs++ if $seq1[$i] !~ /$seq2[$i]/i && $seq1[$i] =~ /\-/i || $seq2[$i] =~ /\-/i; - } - my $sim = $similarity/length($_[0]); - return ( $sim, $nucdiffs, $indeldiffs ); #<= $similarity_thresh; -} -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- - -sub load_treesToReject{ - my @rejectlist = (); - my $alltags = join("",@_); - @rejectlist = qw (-HCOR +HCOR) if $alltags eq "HCORM"; - @rejectlist = qw ( -HCO|+R +HCO|-R) if $alltags eq "HCOR"; - @rejectlist = qw ( -HC|+O +HC|-O) if $alltags eq "HCO"; - - %treesToReject=(); - $treesToReject{$_} = $_ foreach (@rejectlist); - #print "loaded to reject for $alltags; ", $treesToReject{$_},"\n" foreach (@rejectlist); #<STDIN>; -} -#-------------------------------------------------------------------------------------------------------- -sub load_treesToIgnore{ - my @rejectlist = (); - my $alltags = join("",@_); - @rejectlist = qw (-HCOR +HCOR +HCORM -HCORM) if $alltags eq "HCORM"; - @rejectlist = qw ( -HCO|+R +HCO|-R +HCOR -HCOR) if $alltags eq "HCOR"; - @rejectlist = qw ( -HC|+O +HC|-O +HCO -HCO) if $alltags eq "HCO"; - - %treesToIgnore=(); - $treesToIgnore{$_} = $_ foreach (@rejectlist); - #print "loaded ", $treesToIgnore{$_},"\n" foreach (@rejectlist); -} -#-------------------------------------------------------------------------------------------------------- -sub load_thresholds{ - my @threshold_array=split(/[,_]/,$_[0]); - unshift @threshold_array, "0"; - for my $size (1 ... 4){ - $thresholdhash{$size}=$threshold_array[$size]; - } -} -#-------------------------------------------------------------------------------------------------------- -sub load_allPossibleTrees{ - #1 THIS FILE STORES ALL POSSIBLE SCENARIOS OF MICROSATELLITE - #2 BIRTH AND DEATH EVENTS ON A 5-PRIMATE TREE OF H,C,O,R,M - #3 IN FORM OF A TEXT FILE. THIS WILL BE USED AS A TEMPLET - #4 TO COMPARE EACH MICROSATELLITE CLUSTER TO UNDERSTAND THE - #5 EVOLUTION OF EACH LOCUS. 
WE WILL THEN DISCARD SOME - #6 MICROSATS ACCRODING TO THEIR EVOLUTIONARY BEHAVIOUR ON - #7 THE TREE. MOST PROBABLY WE WILL REMOVE THOSE MICROSATS - #8 THAT ARE NOT SUFFICIENTLY INFORMATIVE, LIKE IN CASE OF - #9 AN OUTGROUP MICROSATELLITE BEING DIFFERENT FRON ALL OTHER - #10 SPECIES IN THE TREE. - my $tree_list = $_[0]; -# print "file to be loaded: $tree_list\n"; - - my @trarr = (); - @trarr = ("#H C O CONCLUSION ALTERNATE", -"+ + + +HCO NA", -"+ _ _ +H NA", -"_ + _ +C NA", -"_ _ + -HC|+O NA", -"+ _ + -C +H", -"_ + + -H +C", -"+ + _ +HC|-O NA", -"_ _ _ -HCO NA") if $tree_list =~ /_HCO\.txt/; - @trarr = ("#H C O R CONCLUSION ALTERNATE", -"_ _ _ _ -HCOR NA", -"+ + + + +HCOR NA", -"+ + + _ +HCO|-R +H.+C.+O", -"+ + _ _ +HC +H.+C;-O", -"+ _ _ _ +H +HC,-C", -"_ + _ _ +C +HC,-H", -"_ _ + _ +O -HC|-H.-C", -"_ _ + + -HC -H.-C", -"+ _ _ + +H|-C.-O +HC,-C", -"_ + _ + +C -H.-O", -"_ + + _ -H +C.+O", -"_ _ _ + -HCO|+R NA", -"+ _ + _ +H.+O|-C NA", -"_ + + + -H -HC,+C", -"+ _ + + -C -HC,+H", -"+ + _ + -O +HC") if $tree_list =~ /_HCOR\.txt/; - - @trarr = ("#H C O R M CONCLUSION ALTERNATE", -"+ + + + _ +HCOR NA", -"+ + + _ + -R +HCO;+HC.+O;+H.+C.+O", -"+ + _ + + -O -HCO,+HC|-HCO,+HC;-HCO,(+H.+C)", -"+ _ + + + -C -HC,+H;+HCO,(+H.+O)", -"_ + + + + -H -HC,+C;-HCO,(+C.+O)", -"_ _ _ _ + -HCOR NA", -"_ _ _ + _ +R -HC.-O;-H.-C.-O", -"_ _ + _ _ +O +HCO,-HC;+HCO,(-H.-C)", -"_ + _ _ _ +C +HC,-H;+HCO,(-H.-O)", -"+ _ _ _ _ +H +HC,-C;+HCO,(-C.-O)", -"+ + + _ _ +HCO +H.+C.+O", -"+ + _ + _ -O +R.+HC|-HCO,+HC;+H.+C.+R|-HCO,(+H.+C)", -"+ _ + + _ -C -HC,+H;+H.+O.+R|-HCO,(+H.+O)", -"_ + + + _ -H -HC,+C;+C.+O.+R|-HCO,(+C.+O)", -"_ _ _ + + -HCO -HC.-O;-H.-C.-O", -"_ _ + _ + +O +HCO,-HC;+HCO,(-H.-C)", -"_ + _ _ + +C +HC,-H;+HCO,(-H.-O)", -"+ _ _ _ + +H -HC,+H;+HCO,(-C.-O)", -"+ + _ _ + +HC -R.-O|+HCO,-O|+H.+C;-HCO,+HC;-HCO,(+H.+C)", -"+ _ + _ + -R.-C|+HCO,-C|+H.+O NA", -"_ + + _ + -R.-H|+HCO,-H|+C.+O NA", -"_ _ + + _ -HC +R.+O|-HCO,+O|+HCO,-HC", -"_ + _ + _ +R.+C|-HCO,+C|-HC,+C +HCO,(-H.-O)", -"+ _ _ + _ +R.+H|-C.-O +HCO,(-C.-O)", -"+ _ _ + + -O.-C|-HCO,+H +R.+H;-HCO,(+R.+H)", -"_ + _ + + -O.-H|-HCO,+C +R.+C;-HCO,(+R.+C)", -"_ + + _ _ +HCO,-H|+O.+C NA", -"+ _ + _ _ +HCO,-C|+O.+H NA", -"_ _ + + + -HC -H.-C|-HCO,+O", -"+ + _ _ _ +HC +H.+C|+HCO,-O|-HCO,+HC;-HCO,(+H.+C)", -"+ + + + + +HCORM NA") if $tree_list =~ /_HCORM\.txt/; - - - my $template_p = $_[1]; - my $alternate_p = $_[2]; - #1 THIS IS THE HASH IN WHICH INFORMATION FROM THE ABOVE FILE - #2 GETS STORED, USING THE WHILE LOOP BELOW. HERE, THE KEY - #3 OF EACH ROW IS THE EVOLUTIONARY CONFIGURATION OF A LOCUS - #4 ON THE PRIMATE TREE, BASED ON PRESENCE/ABSENCE OF A MICROSAT - #5 AT THAT LOCUS, LIKE SAY "+ + + _ _" .. EACH COLUMN BELONGS - #6 TO ONE SPECIES; HERE THE COLUMN NAMES ARE "H C O R M". - #7 THE VALUE FOR EACH ENTRY IS THE MEANING OF THE ABOVE - #8 CONFIGURATION (I.E., CONFIGURAION OF THE KEY. HERE, THE - #9 VALUE WILL BE +HCO, SIGNIFYING A BIRTH IN HUMAN-CHIMP-ORANG - #10 COMMON ANCESTOR. THIS HASH HAS BEEN LOADED HERE TO BE USED - #11 LATER BY THE SUBROUTINE sub treeStudy{} THAT STUDIES - #12 EVOLUTIONARY CONFIGURAION OF EACH MICROSAT LOCUS, AS - #13 MENTIONED ABOVE. 
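- # Worked example of the loading below, using the HCO table above and
- # assuming the presence/absence columns are space-separated while a tab
- # separates them from the CONCLUSION and ALTERNATE columns: the row
- # "+ _ _<tab>+H<tab>NA" loads as
- #   $template_p->{"+ _ _"}[0] = "+H";   # conclusion: birth on the human branch
- #   $template_p->{"+ _ _"}[1] = 0;      # usage counter, incremented by treeStudy
- #   $alternate_p->{"+H"} = "NA";        # no alternate scenario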
- my @keys_array=(); - foreach my $line (@trarr){ - next if $line =~ /^#/; - chomp $line; - my @fields = split("\t", $line); - push @keys_array, $fields[0]; -# print "loading: $fields[0]\n"; - $template_p->{$fields[0]}[0] = $fields[1]; - $template_p->{$fields[0]}[1] = 0; - $alternate_p->{$fields[1]} = $fields[2]; - - } -# print "loaded the trees with keys: @keys_array\n"; - return $template_p, \@keys_array, $alternate_p; -} - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -sub checkCleanCase{ - my $printer = 0; - my $tree = $_[0]; - my $finalalignment = $_[1]; - - #print "IN checkCleanCase: @_\n"; - #<STDIN>; - my @indivspecies = $tree =~ /[A-Z]/g; - $finalalignment =~ s/\./_/g; - my @captured = $finalalignment =~ /[A-Za-z, \(\):]+\![:A-Za-z, \(\)]/g; - - my $unclean = 0; - - foreach my $sp (@indivspecies){ - foreach my $cap (@captured){ - $cap =~ s/:[A-Za-z\-]+//g; - my @sps = $cap =~ /[A-Z]+/g; - my $spsc = join("", @sps); -# print "checking whether imp species $sp is present in $cap i.e, in $spsc\n " if $printer == 1; - if ($spsc =~ /$sp/){ -# print "foind : $sp\n"; - $unclean = 1; last; - } - } - last if $unclean == 1; - } - #<STDIN>; - return "CLEAN" if $unclean == 0; - return "UNCLEAN"; -} - -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- -#-------------------------------------------------------------------------------------------------------- - - -sub adjustCoordinates{ - my $line = $_[0]; - my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $motifcord = 2 + (4*$no_of_species) + 2 - 1; - $gapcord = $motifcord+1; - $startcord = $gapcord+1; - $strandcord = $startcord+1; - $endcord = $strandcord + 1; - $microsatcord = $endcord + 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - $interr_poscord = $microsatcord + 3; - $no_of_interruptionscord = $microsatcord + 4; - $interrcord = $microsatcord + 2; - #print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $printer == 1; - return $no_of_species; -} - - -sub printhash{ - my $alivehash = $_[0]; - my @tags = @$_[1]; -# print "print hash\n"; - foreach my $tag (@tags){ -# print "$tag=",$alivehash->{$tag},"\n" if exists $alivehash->{$tag}; - } - - return "\n" -} -sub peel_onion{ - my $printer = 0; -# print "received: @_\n" ; #<STDIN>; - $printer = 0; - my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; -# print "in peel onion.. tree = $tree \n" if $printer == 1; - my %sequence_hash=(); - - -# for my $i (0 ... 
$#sequences){ $sequence_hash{$species[$i]}=$sequences->[$i]; } - - - my %node_sequences=(); - - my %node_alignments = (); #NEW, Nov 28 2008 - my @tags=(); - my @locus_sequences=(); - my %alivehash=(); - foreach my $tag (@$tagarray) { - #print "adding: $tag\n"; - push(@tags, $tag); - $node_sequences{$tag}=join ".",split(/\s*/,$microsathash->{$tag}) if $microsathash->{$tag} ne "NULL"; - $alivehash{$tag}= $tag if $microsathash->{$tag} ne "NULL"; - $node_sequences{$tag}=join ".",split(/\s*/,$nonmicrosathash->{$tag}) if $microsathash->{$tag} eq "NULL"; - $node_alignments{$tag}=join ".",split(/\s*/,$alignment->{$tag}) ; - push @locus_sequences, $node_sequences{$tag}; - #print "adding to node_seq: $tag = ",$node_alignments{$tag},"\n"; - } - - - - my ($nodes_arr, $branches_hash) = get_nodes($tree); - my @nodes=@$nodes_arr; -# print "recieved nodes = " if $printer == 1; -# foreach my $key (@nodes) {print "@$key " if $printer == 1;} - -# print "\n" if $printer == 1; - - #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS - foreach my $keys (@nodes){ - my @pair = @$keys; - my $joint = "(".join(", ",@pair).")"; - my $copykey = join "", @pair; - $copykey =~ s/[\W ]+//g; -# print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; - my $livestatus = 1; - foreach my $copy (split(/\s*/,$copykey)){ - $livestatus = 0 if !exists $alivehash{$copy}; - } - $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; -# print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; - } - - @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. - - my @mutations_array=(); - - my $joint = (); - foreach my $node (@nodes){ - my @pair = @$node; -# print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1; - $joint = "(".join(", ",@pair).")"; - my @pair_sequences=(); - - foreach my $tag (@pair){ -# print "$tag: $node_alignments{$tag}\n" if $printer == 1; - print $node_alignments{$tag},"\n" if $printer == 1; - push @pair_sequences, $node_alignments{$tag}; - } -# print "ppeel onion joint = $joint , pair_sequences=>@pair_sequences< , pair=>@pair<\n" if $printer == 1; - - my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint); - $node_alignments{$joint}=$compared; - push( @mutations_array,split(/:/,$substitutions_list)); -# print "newly added to node_sequences: $node_alignments{$joint} and list of mutations =\n", join("\n",@mutations_array),"\n" if $printer == 1; - } - -# print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=\%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1; - ## <STDIN> if $printer == 1; - - my $analayzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif); - -# print "returning: ", $analayzed_mutations, \@nodes, $branches_hash,"\n" if scalar @mutations_array > 0 && $printer == 1; -# print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1; -# print "final node alignment = $node_alignments{$joint}\n" if $printer == 1; - - # <STDIN> if $printer == 1; - - - return ($analayzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint}) if scalar @mutations_array > 0; - return 
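- # The liveness bookkeeping above propagates bottom-up: an internal node
- # such as "(H, C)" is marked alive only if every leaf tag inside it had
- # a non-NULL microsat. Illustrative trace for tags H, C, O when only the
- # chimp locus is absent:
- #   %alivehash = (H => "H", O => "O");   # C missing
- #   "(H, C)"      -> not alive, since C is dead
- #   "((H, C), O)" -> not alive, for the same reason
- # Either return below hands back five values: the mutation analysis (or
- # "NULL"), the node list, the branch hash, the liveness hash, and the
- # reconstructed root alignment (or "NULL").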
("NULL",\@nodes,$branches_hash, \%alivehash, "NULL") if scalar @mutations_array == 0; -} - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -sub get_nodes{ - my $printer = 0; - - my $tree=$_[0]; - #$tree =~ s/ +//g; - $tree =~ s/\t+//g; - $tree=~s/;//g; - print "tree=$tree\n" if $printer == 1; - my @nodes = (); - my @onions=($tree); - my %branches=(); - foreach my $bite (@onions){ - $bite=~ s/^\(|\)$//g; - chomp $bite; -# print "tree = $bite \n"; -# <STDIN>; - $bite=~ /([ ,\(\)A-Z]+)\,\s*([ ,\(\)A-Z]+)/; - #$tree =~ /(\(\(\(H, C\), O\), R\))\, (M)/; - my @raw_nodes = ($1, $2); - print "raw nodes = $1 and $2\n" if $printer == 1; - push(@nodes, [@raw_nodes]); - foreach my $node (@raw_nodes) {push (@onions, $node) if $node =~ /,/;} - foreach my $node (@raw_nodes) {$branches{$node}="(".$bite.")"; print "adding to branches: $node = ($bite)\n" if $printer == 1;} - print "onions = @onions\n" if $printer == 1;<STDIN> if $printer == 1; - } - $printer = 0; - return \@nodes, \%branches; -} - - -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -sub analyze_mutations{ - my ($mutations_array, $nodes, $branches_hash, $alignment, $tags, $alivehash, $node_sequences, $microsatstarts, $motif) = @_; - my $locuslength = length($alignment->{$tags->[0]}); - my $printer = 0; - - -# print " IN analyzed_mutations....\n" if $printer == 1; # \n mutations array = @$mutations_array, \nAND locuslength = $locuslength\n" if $printer == 1; - my %mutation_hash=(); - my %froms_megahash=(); - my %tos_megahash=(); - my %position_hash=(); - my @solutions_array=(); - foreach my $mutation (@$mutations_array){ -# print "loadin mutation: $mutation\n" if $printer == 1; - my %localhash= $mutation =~ /([\S ]+)=([\S ]+)/g; - $mutation_hash{$localhash{"position"}} = {%localhash}; - push @{$position_hash{$localhash{"position"}}},$localhash{"node"}; -# print "feeding position hash with $localhash{position}: $position_hash{$localhash{position}}[0]\n" if $printer == 1; - $froms_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"from"}; - $tos_megahash{$localhash{"position"}}{$localhash{"node"}}=$localhash{"to"}; -# print "just a trial: $mutation_hash{$localhash{position}}{position}\n" if $printer == 1; -# print "loadin in tos_megahash: $localhash{position} {$localhash{node} = $localhash{to}\n" if $printer == 1; -# print "loadin in from: $localhash{position} {$localhash{node} = $localhash{from}\n" if $printer == 1; - } - -# print "now going through each position in loculength:\n" if $printer == 1; - ## <STDIN> if $printer == 1; - - for my $pos (0 ... 
$locuslength-1){ -# print "at position: $pos\n" if $printer == 1; - - if (exists($mutation_hash{$pos})){ - my @local_nodes=@{$position_hash{$pos}}; -# print "found mutation: @{$position_hash{$pos}} : @local_nodes\n" if $printer == 1; - - foreach my $local_node (@local_nodes){ -# print "at local node: $local_node ... from state = $froms_megahash{$pos}{$local_node}\n" if $printer == 1; - my $open_insertion=(); - my $open_deletion=(); - my $open_to_substitution=(); - my $open_from_substitution=(); - if ($froms_megahash{$pos}{$local_node} eq "-"){ - # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}} && $printer == 1; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - $open_insertion=$tos_megahash{$pos}{$local_node}; - for my $posnext ($pos+1 ... $locuslength-1){ -# print "in first if .... studying posnext: $posnext\n" if $printer == 1; - last if !exists ($froms_megahash{$posnext}{$local_node}); -# print "for posnext: $posnext, there exists $froms_megahash{$posnext}{$local_node}.. already, open_insertion = $open_insertion.. checking is $froms_megahash{$posnext}{$local_node} matters\n" if $printer == 1; - $open_insertion = $open_insertion.$tos_megahash{$posnext}{$local_node} if $froms_megahash{$posnext}{$local_node} eq "-"; -# print "now open_insertion=$open_insertion\n" if $printer == 1; - delete $mutation_hash{$posnext} if $froms_megahash{$posnext}{$local_node} eq "-"; - } - print "1 Feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion="),"\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_insertion", "deletion=")); - } - elsif ($tos_megahash{$pos}{$local_node} eq "-"){ - # print "here exists a microsatellite to $local_node from $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - $open_deletion=$froms_megahash{$pos}{$local_node}; - for my $posnext ($pos+1 ... 
$locuslength-1){ - print "in 1st elsif studying posnext: $posnext\n" if $printer == 1; - print "nexting as nextpos does not exist\n" if !exists ($tos_megahash{$posnext}{$local_node}) && $printer == 1; - last if !exists ($tos_megahash{$posnext}{$local_node}); - print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; - $open_deletion = $open_deletion.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} eq "-"; - delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} eq "-"; - } - print "2 Feeding in:", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion"), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_deletion")); - } - elsif ($tos_megahash{$pos}{$local_node} ne "-"){ - # print "here exists a microsatellite from $local_node to $branches_hash->{$local_node}\n" if $printer == 1 && exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}};; - # print "for localnode $local_node, amd the realated branches_hash:$branches_hash->{$local_node}, nexting as exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}\n" if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - #next if exists $alivehash->{$local_node} && exists $alivehash->{$branches_hash->{$local_node}}; - # print "microsatstart = $microsatstarts->{$local_node} \n" if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node} && $printer == 1; - next if exists $microsatstarts->{$local_node} && $pos < $microsatstarts->{$local_node}; - $open_to_substitution=$tos_megahash{$pos}{$local_node}; - $open_from_substitution=$froms_megahash{$pos}{$local_node}; - print "open from substitution: $open_from_substitution \n" if $printer == 1; - for my $posnext ($pos+1 ... $locuslength-1){ - #print "in last elsif studying posnext: $posnext\n"; - last if !exists ($tos_megahash{$posnext}{$local_node}); - print "for posnext: $posnext, there exists $tos_megahash{$posnext}{$local_node}\n" if $printer == 1; - $open_to_substitution = $open_to_substitution.$tos_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; - $open_from_substitution = $open_from_substitution.$froms_megahash{$posnext}{$local_node} if $tos_megahash{$posnext}{$local_node} ne "-"; - delete $mutation_hash{$posnext} if $tos_megahash{$posnext}{$local_node} ne "-" && $froms_megahash{$posnext}{$local_node} ; - } - print "open from substitution: $open_from_substitution \n" if $printer == 1; - - #IS THE STRETCH OF SUBSTITUTION MICROSATELLITE-LIKE? - my @motif_parts=split(/\s*/,$motif); - #GENERATING THE FLEXIBLE LEFT END - my $left_query=(); - for my $k (1 ... $#motif_parts) { - $left_query= $motif_parts[$k]."|)"; - $left_query="(".$left_query; - } - $left_query=$left_query."?"; - - print "left_quewry = $left_query\n" if $printer == 1; - #GENERATING THE FLEXIBLE RIGHT END - my $right_query=(); - for my $k (0 ... 
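- # The "flexible end" queries are intended to tolerate a partial motif
- # copy at either edge of a substituted stretch: for motif AGC the
- # intended pattern is roughly ^(GC|C|)?(AGC)+(A|AG|)?$, so GCAGCAGCA
- # still counts as motif-like. As written, each loop iteration
- # overwrites $left_query/$right_query instead of appending, so only the
- # last alternative survives; the $motifcomb substring test further down
- # is what effectively decides motif-likeness.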
($#motif_parts-1)) { - $right_query= "(|".$motif_parts[$k]; - $right_query=$right_query.")"; - } - $right_query=$right_query."?"; - print "right_query = $right_query\n" if $printer == 1; - print "Hence, searching for: ^$left_query($motif)+$right_query\$\n" if $printer == 1; - - my $motifcomb=$motif x 50; - print "motifcomb = $motifcomb\n" if $printer == 1; - if ( ($motifcomb =~/$open_to_substitution/i) && (length ($open_to_substitution) >= length($motif)) ){ - print "sequence microsat-like\n" if $printer == 1; - my $all_microsat_like = 0; - print "3 feeding in: ", join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution"), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=deletion" ,"position=$pos", "from=", "to=", "insertion=", "deletion=$open_from_substitution")); - print "4 feeding in: ", join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion="), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=insertion" ,"position=$pos", "from=", "to=", "insertion=$open_to_substitution", "deletion=")); - - } - else{ - print "5 feeding in: ", join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion="), "\n" if $printer == 1; - push (@solutions_array, join("\t", "node=$local_node","type=substitution" ,"position=$pos", "from=$open_from_substitution", "to=$open_to_substitution", "insertion=", "deletion=")); - } - #IS THE FROM-SEQUENCE MICROSATELLITE-LIKE? - - } - #<STDIN> if $printer ==1; - } - #<STDIN> if $printer ==1; - } - } - - print "\n", "#" x 50, "\n" if $printer == 1; - foreach my $tag (@$tags){ - print "$tag: $alignment->{$tag}\n" if $printer == 1; - } - print "\n", "#" x 50, "\n" if $printer == 1; - - print "returning SOLUTIONS ARRAY : \n",join("\n", @solutions_array),"\n" if $printer == 1; - #print "end\n"; - #<STDIN> if - return \@solutions_array; -} -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# -#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++#+++++++++++# - -sub base_by_base_simple{ - my $printer = 0; - my ($motif, $locus, $no, $pair0, $pair1, $joint) = @_; - my @seq_array=(); - print "IN SUBROUTUNE base_by_base_simple.. 
information received = @_\n" if $printer == 1; - print "pair0 = $pair0 and pair1 = $pair1\n" if $printer == 1; - - my @example=split(/\./,$locus->[0]); - print "example, for length = @example\n" if $printer == 1; - for my $i (0...$no-1){push(@seq_array, [split(/\./,$locus->[$i])]); print "for $i, from $locus->[$i], seq_array = >@{$seq_array[$i]}<\n" if $printer == 1;} - - my @compared_sequence=(); - my @substitutions_list; - for my $i (0...scalar(@example)-1){ - - #print "i = $i\n" if $printer == 1; - #print "comparing $seq_array[0][$i] and $seq_array[1][$i] \n" ;#if $printer == 1; - if ($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] !~ /!/){ - - my $resolution= resolve_base($seq_array[0][$i],$seq_array[1][$i], $pair1 ,"keep" ); - # print "ancestral = $resolution\n" if $printer == 1; - - if ($resolution =~ /$seq_array[1][$i]/i && $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution ); - } - elsif ( $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); - } - push @compared_sequence,$resolution; - } - elsif ($seq_array[0][$i] !~ /!/ && $seq_array[1][$i] =~ /!/){ - - my $resolution= resolve_base($seq_array[1][$i],$seq_array[0][$i], $pair0, "invert" ); - # print "ancestral = $resolution\n" if $printer == 1; - - if ($resolution =~ /$seq_array[0][$i]/i && $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair1, $seq_array[1][$i], $resolution); - } - elsif ( $resolution !~ /!/){ - push @substitutions_list, add_mutation($i, $pair0, $seq_array[0][$i], $resolution); - } - push @compared_sequence,$resolution; - } - elsif($seq_array[0][$i] =~ /!/ && $seq_array[1][$i] =~ /!/){ - push @compared_sequence, add_bases($seq_array[0][$i],$seq_array[1][$i], $pair0, $pair1, $joint ); - } - else{ - if($seq_array[0][$i] !~ /^$seq_array[1][$i]$/i){ - push @compared_sequence, $pair0.":".$seq_array[0][$i]."!".$pair1.":".$seq_array[1][$i]; - } - else{ - # print "perfect match\n" if $printer == 1; - push @compared_sequence, $seq_array[0][$i]; - } - } - } - print "returning: comared = @compared_sequence \nand substitutions list =\n", join("\n",@substitutions_list),"\n" if $printer == 1; - return join(".",@compared_sequence), join(":", @substitutions_list) if scalar (@substitutions_list) > 0; - return join(".",@compared_sequence), "" if scalar (@substitutions_list) == 0; -} - - -sub resolve_base{ - my $printer = 0; - - print "IN SUBROUTUNE resolve_base.. information received = @_\n" if $printer == 1; - my ($optional, $single, $singlesp, $arg) = @_; - my @options=split(/!/,$optional); - foreach my $option(@options) { - $option=~s/[A-Z\(\) ,]+://g; - if ($option =~ /$single/i){ - print "option = $option , returning single: $single\n" if $printer == 1; - return $single; - } - } - print "returning ",$optional."!".$singlesp.":".$single. "\n" if $arg eq "keep" && $printer == 1; - print "returning ",$singlesp.":".$single."!".$optional. "\n" if $arg eq "invert" && $printer == 1; - return $optional."!".$singlesp.":".$single if $arg eq "keep"; - return $singlesp.":".$single."!".$optional if $arg eq "invert"; - -} - -sub same_length{ - my $printer = 0; - my @locus = @_; - my $temp = shift @locus; - $temp=~s/-|,//g; - foreach my $l (@locus){ - $l=~s/-|,//g; - return 0 if length($l) != length($temp); - $temp = $l; - } - return 1; -} -sub treeStudy{ - my $printer = 0; -# print "template DEFINED.. 
received: @_\n" if defined %template; -# print "only received = @_" if !defined %template; - my $stopper = 0; - if (!defined %template){ - $stopper = 1; - %template=(); - print "tree decipherer = $tree_decipherer\n" if $printer == 1; - my ( $template_ref, $keys_array)=load_allPossibleTrees($tree_decipherer, \%template); - print "return = $template_ref and @{$keys_array}\n" if $printer == 1; - foreach my $key (@$keys_array){ - print "addding : $template_ref->{$key} for $key\n" if $printer == 1; - $template{$key} = $template_ref->{$key}; - } - } - - for my $templet ( keys %template ) { - # print "$templet => @{$template{$templet}}\n"; - } - <STDIN> if !defined %template; - - my $strict = 0; - - my $H = 0; - my $Hchr = 1; - my $Hstart = 2; - my $Hend = 3; - my $Hmotif = 4; - my $Hmotiflen = 5; - my $Hmicro = 6; - my $Hstrand = 7; - my $Hmicrolen = 8; - my $Hinterpos = 9; - my $Hrelativepos = 10; - my $Hinter = 11; - my $Hinterlen = 12; - - my $C = 13; - my $Cchr = 14; - my $Cstart = 15; - my $Cend = 16; - my $Cmotif = 17; - my $Cmotiflen = 18; - my $Cmicro = 19; - my $Cstrand = 20; - my $Cmicrolen = 21; - my $Cinterpos = 22; - my $Crelativepos = 23; - my $Cinter = 24; - my $Cinterlen = 25; - - my $O = 26; - my $Ochr = 27; - my $Ostart = 28; - my $Oend = 29; - my $Omotif = 30; - my $Omotiflen = 31; - my $Omicro = 32; - my $Ostrand = 33; - my $Omicrolen = 34; - my $Ointerpos = 35; - my $Orelativepos = 36; - my $Ointer = 37; - my $Ointerlen = 38; - - my $R = 39; - my $Rchr = 40; - my $Rstart = 41; - my $Rend = 42; - my $Rmotif = 43; - my $Rmotiflen = 44; - my $Rmicro = 45; - my $Rstrand = 46; - my $Rmicrolen = 47; - my $Rinterpos = 48; - my $Rrelativepos = 49; - my $Rinter = 50; - my $Rinterlen = 51; - - my $Mchr = 52; - my $Mstart = 53; - my $Mend = 54; - my $M = 55; - my $Mmotif = 56; - my $Mmotiflen = 57; - my $Mmicro = 58; - my $Mstrand = 59; - my $Mmicrolen = 60; - my $Minterpos = 61; - my $Mrelativepos = 62; - my $Minter = 63; - my $Minterlen = 64; - - #-------------------------------------------------------------------------------# - my @analysis=(); - - - my %speciesOrder = (); - $speciesOrder{"H"} = 0; - $speciesOrder{"C"} = 1; - $speciesOrder{"O"} = 2; - $speciesOrder{"R"} = 3; - $speciesOrder{"M"} = 4; - #-------------------------------------------------------------------------------# - - my $line = $_[0]; - chomp $line; - - my @f = split(/\t/,$line); - print "received array : @f.. recieved tags = @tags\n" if $printer == 1; - - # collect all motifs - my @motifs=(); - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif], $f[$Mmotif]) if $tags[$#tags] =~ /M/; - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]) if $tags[$#tags] =~ /R/; - @motifs = ($f[$Hmotif], $f[$Cmotif], $f[$Omotif]) if $tags[$#tags] =~ /O/; -# print "motifs in the array = $f[$Hmotif], $f[$Cmotif], $f[$Omotif], $f[$Rmotif]\n" if $tags[$#tags] =~ /R/;; - print "motifs = @motifs\n" if $printer == 1; - my @translation = (); - foreach my $motif (@motifs){ - push(@translation, "_") if $motif eq "NA"; - push(@translation, "+") if $motif ne "NA"; - } - my $translate = join(" ", @translation); -# print "translate = >$translate< and analysis = $template{$translate}[0].. 
on the other hand, ",$template{"- - +"}[0],"\n"; - my @analyses = split(/\|/,$template{$translate}[0]); - - print "motifs = @motifs, analyses = @analyses\n" if $printer == 1; - - if (scalar(@analyses) == 1) { - #print "analysis = $analyses[0]\n"; - if ($analyses[0] !~ /,|\./ ){ - if ($analyses[0] =~ /\+/){ - my $analysis = $analyses[0]; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;} - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]++ if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]++ if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis = $analyses[0]; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - my @complementarySpecies = (); - my $allSpecies = join("",@tags); - foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } - foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); print "pushing into currentMotifs: $speciesOrder{$specie}: $motifs[$speciesOrder{$specie}]\n" if $printer == 1;;} - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - } - - elsif ($analyses[0] =~ /,/) { - my @events = split(/,/,$analyses[0]); - print "events = @events \n " if $printer == 1; - if ($events[0] =~ /\+/){ - my $analysis1 = $events[0]; - $analysis1 =~ s/\+|\-//g; - my $analysis2 = $events[1]; - $analysis2 =~ s/\+|\-//g; - my @nSpecies = split(/\s*/,$analysis2); - print "original anslysis = $analysis1 " if $printer == 1; - foreach my $specie (@nSpecies){ $analysis1=~ s/$specie//g;} - print "processed anslysis = $analysis1 \n" if $printer == 1; - my @currentMotifs = (); - foreach my $specie (split(/\s*/,$analysis1)){push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis1 = $events[0]; - $analysis1 =~ s/\+|\-//g; - my $analysis2 = $events[1]; - $analysis2 =~ s/\+|\-//g; - my @pSpecies = split(/\s*/,$analysis2); - my @currentMotifs = (); - foreach my $specie (@pSpecies){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - print "current motifs = @currentMotifs and consistency? 
", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - - } - - } - elsif ($analyses[0] =~ /\./) { - my @events = split(/\./,$analyses[0]); - foreach my $event (@events){ - print "event = $event \n" if $printer == 1; - if ($event =~ /\+/){ - my $analysis = $event; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - foreach my $specie (@species){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - #print consistency(@currentMotifs),"<- \n"; - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - else{ - my $analysis = $event; - $analysis =~ s/\+|\-//g; - my @species = split(/\s*/,$analysis); - my @currentMotifs = (); - my @complementarySpecies = (); - my $allSpecies = join("",@tags); - foreach my $specie (@species){ $allSpecies =~ s/$specie//g; } - foreach my $specie (split(/\s*/,$allSpecies)){ push(@currentMotifs, $motifs[$speciesOrder{$specie}]); } - #print consistency(@currentMotifs),"<- \n"; - print "current motifs = @currentMotifs and consistency? ", (consistency(@currentMotifs))," \n" if $printer == 1; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 1 && consistency(@currentMotifs) ne "NULL"; - $template{$translate}[1]=$template{$translate}[1]+1 if $strict == 0; - print "adding to template $translate: $template{$translate}[1]\n" if $printer == 1; - } - } - - } - } - else{ - my $finalanalysis = (); - $template{$translate}[1]++; - foreach my $analysis (@analyses){ ;} - } - # test if motifs where microsats are present, as indeed of same the motif composition - - - - for my $templet ( keys %template ) { - # print "now returning: $templet\n"; - if (@{ $template{$templet} }[1] > 0){ - print "returning in the end: $templet and $translate\n" if $printer == 1; - $template{$templet}[1] = 0; - return (@{$template{$templet}}[0], $translate); - } - } - undef %template; - print "sending NULL\n" if $printer == 1; - return ("NULL", $translate); - -} - - -sub consistency{ - my @motifs = @_; - print "in consistency \n" if $printer == 1; - print "motifs sent = >",join("|",@motifs),"< \n" if $printer == 1; - return $motifs[0] if scalar(@motifs) == 1; - my $prevmotif = shift(@motifs); - my $stopper = 0; - for my $i (0 ... $#motifs){ - next if $motifs[$i] eq "NA"; - my $templet = $motifs[$i].$motifs[$i]; - if ($templet !~ /$prevmotif/i){ - $stopper = 1; last; - } - } - return $prevmotif if $stopper == 0; - return "NULL" if $stopper == 1; -} -sub summarize_microsat{ - my $printer = 0; - my $line = $_[0]; - my $humseq = $_[1]; - - my @gaps = $line =~ /[0-9]+\t[0-9]+\t[\+\-]/g; - my @starts = $line =~ /[0-9]+\t[\+\-]/g; - my @ends = $line =~ /[\+\-]\t[0-9]+/g; - print "starts = @starts\tends = @ends\n" if $printer == 1; - for my $i (0 ... $#gaps) {$gaps[$i] =~ s/\t[0-9]+\t[\+\-]//g;} - for my $i (0 ... $#starts) {$starts[$i] =~ s/\t[\+\-]//g;} - for my $i (0 ... 
$#ends) {$ends[$i] =~ s/[\+\-]\t//g;} - - my $minstart = array_smallest_number(@starts); - my $maxend = array_largest_number(@ends); - - my $humupstream_st = substr($humseq, 0, $minstart); - my $humupstream_en = substr($humseq, 0, $maxend); - my $no_of_gaps_to_start = 0; - my $no_of_gaps_to_end = 0; - $no_of_gaps_to_start = ($humupstream_st =~ s/\-/x/g) if $humupstream_st=~/\-/; - $no_of_gaps_to_end = ($humupstream_en =~ s/\-/x/g) if $humupstream_en=~/\-/; - - my $locusmotif = (); - - - print "IN SUB SUMMARIZE_MICROSAT $line\n" if $printer == 1; - #return "NULL" if $line =~ /compound/; - my $Hstart = "NA"; - my $Hend = "NA"; - chomp $line; - my $match_count = ($line =~ s/>/>/g); - #print "number of species = $match_count\n"; - my @micros = split(/>/,$line); - shift @micros; - my $stopper = 0; - - - foreach my $mic (@micros){ - my @local = split(/\t/,$mic); - if ($local[$microsatcord] =~ /N/) {$stopper =1; last;} - } - return "NULL" if $stopper ==1; - - #------------------------------------------------------ - - my @arranged = (); - for my $arr (0 ... $#exacttags) {$arranged[$arr] = '0';} - - foreach my $micro (@micros){ - for my $i (0 ... $#exacttags){ - if ($micro =~ /^$exacttags[$i]/){ - $arranged[$i] = $micro; - last; - } - } - } -# print "arranged = @arranged \n" ; <STDIN>;; - - my @endstatement = (); - my $turn = 0; - my $species_counter = 0; - # print scalar(@arranged),"\n"; - - my $species_no=0; - - my $orthHchr = 0; - - foreach my $micro (@arranged) { - $micro =~ s/\t\t/\t \t/g; - $micro =~ s/\t,/\t ,/g; - $micro =~ s/,\t/, \t/g; - print "------------------------------------------------------------------------------------------\n" if $printer == 1; - chomp $micro; - if ($micro eq '0'){ - push(@endstatement, join("\t",$exacttags[$species_counter],"NA","NA","NA","NA",0 ,"NA", "NA", 0,"NA","NA","NA", "NA" )); - $species_counter++; - print join("|","ENDSTATEMENT:",@endstatement),"\n" if $printer == 1; - next; - } - # print $micro,"\n"; - print "micro = $micro \n" if $printer == 1; - my @fields = split(/\t/,$micro); - my $microcopy = $fields[$microsatcord]; - $microcopy =~ s/\[|\]|-//g; - my $microsatlength = length($microcopy); - print "microsat = $fields[$microsatcord] and microsatlength = $microsatlength\n" if $printer == 1; -# print "sp_ident = @sp_ident.. 
species_no=$species_no\n"; - $micro =~ /$sp_ident[$species_no]\s(\S+)\s([0-9]+)\s([0-9]+)/; - - - my $sp_chr=$1; - my $sp_start=$2 + $fields[$startcord] - $fields[$gapcord]; - my $sp_end= $sp_start + $microsatlength - 1; - - $species_no++; - - $micro =~ /$focalspec\s(\S+)\s([0-9]+)\s([0-9]+)/; - $orthHchr=$1; - $Hstart=$2+$minstart-$no_of_gaps_to_start; - $Hend=$2+$maxend-$no_of_gaps_to_end; - - print "Hstart = $Hstart = $fields[4] + $fields[$startcord] - $fields[$gapcord]\n" if $printer == 1; - - my $motif = $fields[$motifcord]; - my $firstmotif = (); - my $strand = $fields[$strandcord]; - # print "strand = $strand\n"; - - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - - else {$firstmotif = $motif;} - print "firstmotif =$firstmotif : \n" if $printer == 1; - $firstmotif = allCaps($firstmotif); - - if (exists $revHash{$firstmotif} && $turn == 0) { - $turn=1 if $species_counter==0; - $firstmotif = $revHash{$firstmotif}; - } - - elsif (exists $revHash{$firstmotif} && $turn == 1) {$firstmotif = $revHash{$firstmotif}; $turn = 1;} - print "changed firstmotif =$firstmotif\n" if $printer == 1; - # <STDIN>; - $locusmotif = $firstmotif; - - if (scalar(@fields) > $microsatcord + 2){ - print "fields = @fields ... interr_poscord=$interr_poscord=$fields[$interr_poscord] .. interrcord=$interrcord=$fields[$interrcord]\n" if $printer == 1; - - my @interposes = (); - @interposes = split(",",$fields[$interr_poscord]) if $fields[$interr_poscord] =~ /,/; - $interposes[0] = $fields[$interr_poscord] if $fields[$interr_poscord] !~ /,/ ; - print "interposes=@interposes\n" if $printer == 1; - my @relativeposes = (); - my @interruptions = (); - @interruptions = split(",",$fields[$interrcord]) if $fields[$interrcord] =~ /,/; - $interruptions[0] = $fields[$interrcord] if $fields[$interrcord] !~ /,/; - my @interlens = (); - - - for my $i (0 ... 
$#interposes){ - - my $interpos = $interposes[$i]; - my $nexter = 0; - my $interruption = $interruptions[$i]; - my $interlen = length($interruption); - push (@interlens, $interlen); - - - my $relativepos = (100 * $interpos) / $microsatlength; - print "relativepos = $relativepos ,interpos=$interpos, interruption=$interruption, interlen=$interlen \n" if $printer == 1; - $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; - print "--> = $relativepos\n" if $printer == 1; - $interruption = "IND" if length($interruption) < 1; - - if ($turn == 1){ - $fields[$microsatcord] = switch_micro($fields[$microsatcord]); - $interruption = switch_nucl($interruption) unless $interruption eq "IND"; - $interpos = ($microsatlength - $interpos) - $interlen + 2; - print "turn interpos = $interpos for $fields[$microsatcord]\n" if $printer == 1; - $relativepos = (100 * $interpos) / $microsatlength; - $relativepos = (100 * ($interpos-$interlen)) / $microsatlength if $relativepos > 50; - - - $strand = '+' if $strand eq '-'; - $strand = '-' if $strand eq '+'; - } - print "final relativepos = $relativepos\n" if $printer == 1; - push(@relativeposes, $relativepos); - } - push(@endstatement,join("\t",($exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,join(",",@interposes),join(",",@relativeposes),join(",",@interruptions), join(",",@interlens)))); - } - - else{ - push(@endstatement, join("\t",$exacttags[$species_counter],$sp_chr, $sp_start, $sp_end, $firstmotif,length($firstmotif),$fields[$microsatcord],$strand,$microsatlength,"NA","NA","NA", "NA")); - } - - $species_counter++; - } - - $locusmotif = $sameHash{$locusmotif} if exists $sameHash{$locusmotif}; - $locusmotif = $revHash{$locusmotif} if exists $revHash{$locusmotif}; - - my $endst = join("\t", @endstatement, $orthHchr, $Hstart, $Hend); - print join("\t", @endstatement, $orthHchr, $Hstart, $Hend), "\n" if $printer == 1; - - - return (join("\t", @endstatement, $orthHchr, $Hstart, $Hend), $orthHchr, $Hstart, $Hend, $locusmotif, length($locusmotif)); - -} - -sub switch_nucl{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} - if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} - if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} - if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} - } - return join("",@strand); -} - - -sub switch_micro{ - my $micro = reverse($_[0]); - my @strand = split(/\s*/,$micro); - for my $i (0 ... 
$#strand){ - if ($strand[$i] =~ /c/i) {$strand[$i] = "G";next;} - if ($strand[$i] =~ /a/i) {$strand[$i] = "T";next;} - if ($strand[$i] =~ /t/i) { $strand[$i] = "A";next;} - if ($strand[$i] =~ /g/i) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} - if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} - } - return join("",@strand); -} -sub decipher_history{ - my $printer = 0; - my ($mutations_array, $tags_string, $nodes, $branches_hash, $tree_analysis, $confirmation_string, $alivehash) = @_; - my %mutations_hash=(); - foreach my $mutation (@$mutations_array){ - print "mutation = $mutation\n" if $printer == 1; - my %local = $mutation =~ /([\S ]+)=([\S ]+)/g; - push @{$mutations_hash{$local{"node"}}},$mutation; - print "just for confirmation: $local{node} pushed as: $mutation\n" if $printer == 1; - } - my @nodes; - my @birth_steps=(); - my @death_steps=(); - - my @tags=split(/\s*/,$tags_string); - my @confirmation=split(/\s+/,$confirmation_string); - my %info=(); - - for my $i (0 ... $#tags){ - $info{$tags[$i]}=$confirmation[$i]; - print "feeding info: $tags[$i] = $info{$tags[$i]}\n" if $printer == 1; - } - - for my $keys (@$nodes) { - foreach my $key (@$keys){ -# print "current key = $key\n"; - my $copykey = $key; - $copykey =~ s/[\W ]+//g; - my @copykeys=split(/\s*/,$copykey); - my $states=(); - foreach my $copy (@copykeys){ - $states=$states.$info{$copy}; - } - - print "reduced key = $copykey and state = $states\n" if $printer == 1; - - if (exists $mutations_hash{$key}) { - - if ($states=~/\+/){ - push @birth_steps, @{$mutations_hash{$key}}; - $birth_steps[$#birth_steps] =~ s/\S+=//g; - delete $mutations_hash{$key}; - } - else{ - push @death_steps, @{$mutations_hash{$key}}; - $death_steps[$#death_steps] =~ s/\S+=//g; - delete $mutations_hash{$key}; - } - } - } - } - print "conformation = $confirmation_string\n" if $printer == 1; - push (@birth_steps, "NULL") if scalar(@birth_steps) == 0; - push (@death_steps, "NULL") if scalar(@death_steps) == 0; - print "birth steps = ",join("\n",@birth_steps)," and death steps = ",join("\n",@death_steps),"\n" if $printer == 1; - return \@birth_steps, \@death_steps; -} - -sub fillAlignmentGaps{ - my $printer = 0; - print "received: @_\n" if $printer == 1; - my ($tree, $sequences, $alignment, $tagarray, $microsathash, $nonmicrosathash, $motif, $tree_analysis, $threshold, $microsatstarts) = @_; - print "in fillAlignmentGaps.. tree = $tree \n" if $printer == 1; - my %sequence_hash=(); - - my @phases = (); - my $concat = $motif.$motif; - my $motifsize = length($motif); - - for my $i (1 ... 
$motifsize){ - push @phases, substr($concat, $i, $motifsize); - } - - my $concatalignment = (); - foreach my $tag (@tags){ - $concatalignment = $concatalignment.$alignment->{$tag}; - } -# print "returningg NULL","NULL","NULL", "NULL\n" if $concatalignment !~ /-/; - return 0, "NULL","NULL","NULL", "NULL","NULL" if $concatalignment !~ /-/; - - - - my %node_sequences_temp=(); - my %node_alignments_temp =(); #NEW, Nov 28 2008 - - my @tags=(); - my @locus_sequences=(); - my %alivehash=(); - -# print "IN fillAlignmentGaps\n";# <STDIN>; - my %fillrecord = (); - - my $change = 0; - foreach my $tag (@$tagarray) { - #print "adding: $tag\n"; - push(@tags, $tag); - if (exists $microsathash->{$tag}){ - my $micro = $microsathash->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_sequences_temp{$tag}=$micro if $microsathash->{$tag} ne "NULL"; - } - if (exists $nonmicrosathash->{$tag}){ - my $micro = $nonmicrosathash->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_sequences_temp{$tag}=$micro if $nonmicrosathash->{$tag} ne "NULL"; - } - - if (exists $alignment->{$tag}){ - my $micro = $alignment->{$tag}; - my $orig_micro = $micro; - ($micro, $fillrecord{$tag}) = fillgaps($micro, \@phases); - $change = 1 if uc($micro) ne uc($orig_micro); - $node_alignments_temp{$tag}=$micro if $alignment->{$tag} ne "NULL"; - } - - #print "adding to node_sequences: $tag = ",$node_sequences_temp{$tag},"\n" if $printer == 1; - #print "adding to node_alignments: $tag = ",$node_alignments_temp{$tag},"\n" if $printer == 1; - } - - - my %node_sequences=(); - my %node_alignments =(); #NEW, Nov 28 2008 - foreach my $tag (@$tagarray) { - $node_sequences{$tag} = join ".",split(/\s*/,$node_sequences_temp{$tag}); - $node_alignments{$tag} = join ".",split(/\s*/,$node_alignments_temp{$tag}); - } - - print "\n", "#" x 50, "\n" if $printer == 1; - foreach my $tag (@tags){ - print "$tag: $alignment->{$tag} = $node_alignments{$tag}\n" if $printer == 1; - } - print "\n", "#" x 50, "\n" if $printer == 1; -# print "change = $change\n"; - #<STDIN> if $concatalignment=~/\-/; - -# <STDIN> if $printer == 1 && $concatalignment =~ /\-/; - - return 0, "NULL","NULL","NULL", "NULL", "NULL" if $change == 0; - - my ($nodes_arr, $branches_hash) = get_nodes($tree); - my @nodes=@$nodes_arr; - print "recieved nodes = @nodes\n" if $printer == 1; - - - #POPULATE branches_hash WITH INFORMATION ABOUT LIVESTATUS - foreach my $keys (@nodes){ - my @pair = @$keys; - my $joint = "(".join(", ",@pair).")"; - my $copykey = join "", @pair; - $copykey =~ s/[\W ]+//g; - print "for node: $keys, copykey = $copykey and joint = $joint\n" if $printer == 1; - my $livestatus = 1; - foreach my $copy (split(/\s*/,$copykey)){ - $livestatus = 0 if !exists $alivehash{$copy}; - } - $alivehash{$joint} = $joint if !exists $alivehash{$joint} && $livestatus == 1; - print "alivehash = $alivehash{$joint}\n" if exists $alivehash{$joint} && $printer == 1; - } - - - - @nodes = reverse(@nodes); #1 THIS IS IN ORDER TO GO THROUGH THE TREE FROM LEAVES TO ROOT. 
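-
- # The loop below performs a post-order, leaves-to-root reduction over the
- # species tree: each node pair is collapsed into a joint label such as
- # "(H, C)", whose reconstructed alignment is stored back into
- # %node_alignments so that later iterations can treat the joint node like
- # a leaf. A minimal sketch of the same pattern (illustrative only;
- # combine() is a hypothetical stand-in for base_by_base_simple() below):
- #
- #   my %aln = (H => "A.C", C => "A.T");
- #   foreach my $node ([qw(H C)]) {
- #       my $joint = "(" . join(", ", @$node) . ")";
- #       $aln{$joint} = combine(map { $aln{$_} } @$node);
- #   }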
-
- my @mutations_array=();
-
- my $joint = ();
- foreach my $node (@nodes){
- my @pair = @$node;
- print "now in the nodes for loop, pair = @pair\n and sequences=\n" if $printer == 1;
- $joint = "(".join(", ",@pair).")";
- print "joint = $joint \n" if $printer == 1;
- my @pair_sequences=();
-
- foreach my $tag (@pair){
- print "tag = $tag: " if $printer == 1;
- print $node_alignments{$tag},"\n" if $printer == 1;
- push @pair_sequences, $node_alignments{$tag};
- }
-# print "fillgap\n";
- my ($compared, $substitutions_list) = base_by_base_simple($motif,\@pair_sequences, scalar(@pair_sequences), @pair, $joint);
- $node_alignments{$joint}=$compared;
- push( @mutations_array,split(/:/,$substitutions_list));
- print "newly added to node_sequences: $node_alignments{$joint} and list of mutations = @mutations_array\n" if $printer == 1;
- }
- print "now sending for analyze_mutations: mutation_array=@mutations_array, nodes=@nodes, branches_hash=$branches_hash, alignment=$alignment, tags=@tags, alivehash=%alivehash, node_sequences=\%node_sequences, microsatstarts=$microsatstarts, motif=$motif\n" if $printer == 1;
-# <STDIN> if $printer == 1;
-
- my $analyzed_mutations = analyze_mutations(\@mutations_array, \@nodes, $branches_hash, $alignment, \@tags, \%alivehash, \%node_sequences, $microsatstarts, $motif);
-
-# print "returning: ", $analyzed_mutations, \@nodes,"\n" if scalar @mutations_array > 0;
-# print "returning: NULL, NULL, NULL " if scalar @mutations_array == 0 && $printer == 1;
- print "final node alignment after filling for $joint= " if $printer == 1;
- print "$node_alignments{$joint}\n" if $printer == 1;
-
-
- return 1, $analyzed_mutations, \@nodes, $branches_hash, \%alivehash, $node_alignments{$joint} if scalar @mutations_array > 0 ;
- return 1, "NULL","NULL","NULL", "NULL", "NULL" if scalar @mutations_array == 0;
-}
-
-
-
-sub add_mutation{
- my $printer = 0;
- print "IN SUBROUTINE add_mutation.. information received = @_\n" if $printer == 1;
- my ($i , $bite, $to, $from) = @_;
- print "bite = $bite.. all received info = ",join("^", @_),"\n" if $printer == 1;
- print "to=$to\n" if $printer == 1;
- print "its split = ",join(" and ",split(/!/,$to)),"\n" if $printer == 1;
- my @toields = split "!",$to;
- print "toields = @toields\n" if $printer == 1;
- my @mutations=();
-
- foreach my $toield (@toields){
- my @toinfo=split(":",$toield);
- print " at toinfo=@toinfo \n" if $printer == 1;
- next if $toinfo[1] =~ /$from/i;
- my @mutation = @toinfo; # the next above already skipped entries matching $from
- print "adding to mutation list: ", join(",", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="),"\n" if $printer == 1;
- push (@mutations, join("\t", "node=$mutation[0]","type=substitution" ,"position=$i", "from=$from", "to=$mutation[1]", "insertion=", "deletion="));
- }
- return @mutations;
-}
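-
-# add_mutation() above emits tab-separated key=value records of the form
-# "node=(H, C)\ttype=substitution\tposition=7\tfrom=a\tto=g\tinsertion=\tdeletion="
-# (position/from/to values here are made up for illustration), and
-# summarizeMutations() later recovers the fields by splitting on tabs and "=".
-# A minimal standalone sketch of that round-trip; parse_mutation_record_sketch
-# is a hypothetical helper, not called anywhere in this tool:
-sub parse_mutation_record_sketch{
-	my %fields = ();
-	foreach my $pair (split(/\t/, $_[0])){
-		my ($key, $value) = split(/=/, $pair, 2);
-		$fields{$key} = defined $value ? $value : "";
-	}
-	return %fields; # e.g. $fields{"type"} eq "substitution"
-}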
-
-sub add_bases{
-
- my $printer = 0;
- print "IN SUBROUTINE add_bases.. information received = @_\n" if $printer == 1;
- my ($optional0, $optional1, $pair0, $pair1,$joint) = @_;
- my $total_list=();
-
- my @total_list0=split(/!/,$optional0);
- my @total_list1=split(/!/,$optional1);
- my @all_list=();
- my %total_hash0=();
- foreach my $entry (@total_list0) {
- $entry = uc $entry;
- $entry =~ /(\S+):(\S+)/;
- $total_hash0{$2}=$1;
- push @all_list, $2;
- }
-
- my %total_hash1=();
- foreach my $entry (@total_list1) {
- $entry = uc $entry;
- $entry =~ /(\S+):(\S+)/;
- $total_hash1{$2}=$1;
- push @all_list, $2;
- }
-
- my %alphabetical_hash=();
- my @return_options=();
-
- for my $i (0 ... $#all_list){
- my $alph = $all_list[$i];
- if (exists $total_hash0{$alph} && exists $total_hash1{$alph}){
- push(@return_options, $joint.":".$alph);
- delete $total_hash0{$alph}; delete $total_hash1{$alph};
- }
- if (exists $total_hash0{$alph} && !exists $total_hash1{$alph}){
- push(@return_options, $pair0.":".$alph);
- delete $total_hash0{$alph};
- }
- if (!exists $total_hash0{$alph} && exists $total_hash1{$alph}){
- push(@return_options, $pair1.":".$alph);
- delete $total_hash1{$alph};
- }
-
- }
-
- print "returning ",join "!",@return_options,"\n" if $printer == 1;
- return join "!",@return_options;
-
-}
-
-
-sub fillgaps{
-# print "IN fillgaps: @_\n";
- my ($micro, $phasesinput) = @_;
- #print "in microsathash ,,.. micro = $micro\n";
- return $micro if $micro !~ /\-/;
- my $orig_micro = $micro;
- my @phases = @$phasesinput;
-
- my %tested_patterns = ();
-
- foreach my $phase (@phases){
- # print "considering phase: $phase\n";
- my @phase_prefixes = ();
- my @prephase_left_contexts = ();
- my @prephase_right_contexts = ();
- my @pregapsize = ();
- my @prepostfilins = ();
-
- my @phase_suffixes;
- my @suffphase_left_contexts;
- my @suffphase_right_contexts;
- my @suffgapsize;
- my @suffpostfilins;
-
- my @postfilins = ();
- my $motifsize = length($phases[0]);
-
- my $change = 0;
-
- for my $u (0 ... $motifsize-1){
- my $concat = $phase.$phase.$phase.$phase;
- my @concatarr = split(/\s*/, $concat);
- my $l = 0;
- while ($l < $u){
- shift @concatarr;
- $l++;
- }
- $concat = join ("", @concatarr);
-
- for my $t (0 ... $motifsize-1){
- for my $k (1 ... 
$motifsize-1){ - push @phase_prefixes, substr($concat, $motifsize+$t, $k); - push @prephase_left_contexts, substr ($concat, $t, $motifsize); - push @prephase_right_contexts, substr ($concat, $motifsize+$t+$k+($motifsize-$k), 1); - push @pregapsize, $k; - push @prepostfilins, substr($concat, $motifsize+$t+$k, ($motifsize-$k)); - # print "reading: $concat, t=$t, k=$k prefix: $prephase_left_contexts[$#prephase_left_contexts] $phase_prefixes[$#phase_prefixes] -x$pregapsize[$#pregapsize] $prephase_right_contexts[$#prephase_right_contexts]\n"; - # print "phase_prefixes = $phase_prefixes[$#phase_prefixes]\n"; - # print "prephase_left_contexts = $prephase_left_contexts[$#prephase_left_contexts]\n"; - # print "prephase_right_contexts = $prephase_right_contexts[$#prephase_right_contexts]\n"; - # print "pregapsize = $pregapsize[$#pregapsize]\n"; - # print "prepostfilins = $prepostfilins[$#prepostfilins]\n"; - } - } - } - - # print "looking if $micro =~ /($phase\-{$motifsize})/i || $micro =~ /^(\-{$motifsize,}$phase)/i\n"; - if ($micro =~ /($phase\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,}$phase)/i){ - # print "micro: $micro needs further gap removal: $1\n"; - while ($micro =~ /$phase(\-{$motifsize,})$/i || $micro =~ /^(\-{$motifsize,})$phase/i){ - # print "micro: $micro needs further gap removal: $1\n"; - - # print "phase being considered = $phase\n"; - my $num = (); - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi if $micro =~ /$phase\-{$motifsize,}/i; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi if $micro =~ /\-{$motifsize,}$phase/i; - # print "num = $num\n"; - $change = 1 if $num == 1; - } - } - - elsif ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ - while ($micro =~ /(($phase)+)\-{$motifsize,}(($phase)+)/i){ - # print "checking lengths of $1 and $3 for $micro... \n"; - my $num = (); - if (length($1) >= length($3)){ - # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, >= , $3 \n"; - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; - } - if (length($1) < length($3)){ - # print "$micro matches (($phase)+)\-{$motifsize,}(($phase)+) = $1, < , $3 \n"; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; - } - # print "micro changed to $micro\n"; - } - } - elsif ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ - while ($micro =~ /([A-Z]+)\-{$motifsize,}(($phase)+)/i){ - # print "$micro matches ([A-Z]+)\-{$motifsize}(($phase)+) = 1=$1, - , 3=$3 \n"; - my $num = 0; - $num = $micro =~ s/\-{$motifsize}$phase/$phase$phase/gi ; - } - } - elsif ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ - while ($micro =~ /(($phase)+)\-{$motifsize,}([A-Z]+)/i){ - # print "$micro matches (($phase)+)\-{$motifsize,}([A-Z]+) = 1=$1, - , 3=$3 \n"; - my $num = 0; - $num = $micro =~ s/$phase\-{$motifsize}/$phase$phase/gi ; - } - } - - # print "$orig_micro to $micro\n"; - - #s <STDIN>; - - for my $h (0 ... 
$#phase_prefixes){ - # print "searching using prefix : $prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]\n"; - my $pattern = $prephase_left_contexts[$h].$phase_prefixes[$h].$pregapsize[$h].$prephase_right_contexts[$h]; - # print "returning orig_micro = $orig_micro, micro = $micro \n" if exists $tested_patterns{$pattern}; - if ($micro =~ /$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/i){ - return $orig_micro if exists $tested_patterns{$pattern}; - while ($micro =~ /($prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h])/i){ - $tested_patterns{$pattern} = $pattern; - # print "micro: $micro needs further gap removal: $1\n"; - - # print "prefix being considered = $phase_prefixes[$h]\n"; - my $num = (); - $num = ($micro =~ s/$prephase_left_contexts[$h]$phase_prefixes[$h]\-{$pregapsize[$h]}$prephase_right_contexts[$h]/$prephase_left_contexts[$h]$phase_prefixes[$h]$prepostfilins[$h]$prephase_right_contexts[$h]/gi) ; - # print "num = $num, micro = $micro\n"; - $change = 1 if $num == 1; - - return $orig_micro if $num > 1; - } - } - - } - } - return $orig_micro if length($micro) != length($orig_micro); - return $micro; -} - -sub selectMutationArray{ - my $printer =0; - - my $oldmutspt = $_[0]; - my $newmutspt = $_[1]; - my $tagstringpt = $_[2]; - my $alivehashpt = $_[3]; - my $alignmentpt = $_[4]; - my $motif = $_[5]; - - my @alivehasharr=(); - - my @tags = @$tagstringpt; - my $alignmentln = length($alignmentpt->{$tags[0]}); - - foreach my $key (keys %$alivehashpt) { push @alivehasharr, $key; print "we have alive: $key\n" if $printer == 1;} - - my %newside = (); - my %oldside = (); - my %newmuts = (); - - my %commons = (); - my %olds = (); - foreach my $old (@$oldmutspt){ - $olds{$old} = 1; - } - foreach my $new (@$newmutspt){ - $commons{$new} = 1 if exists $olds{$new};; - } - - - foreach my $pos ( 0 ... $alignmentln){ - #print "pos = $pos\n" if $printer == 1; - my $newyes = 0; - foreach my $mut (@$newmutspt){ - $newmuts{$mut} = 1; - chomp $mut; - $newyes++; - $mut =~ s/=\t/= \t/g; - $mut =~ s/=$/= /g; - - $mut =~ /node=([A-Z\(\), ]+)\stype=([a-zA-Z ]+)\sposition=([0-9 ]+)\sfrom=([a-zA-Z\- ]+)\sto=([a-zA-Z\- ]+)\sinsertion=([a-zA-Z\- ]+)\sdeletion=([a-zA-Z\- ]+)/; - my $node = $1; - next if $3 != $pos; - print "new mut = $mut\n" if $printer == 1; - print "node = $node, pos = $3 ... 
and alivehasharr = >@alivehasharr<\n" if $printer == 1; - my $alivenode = 0; - foreach my $key (@alivehasharr){ - $alivenode = 1 if $key =~ /$node/; - } - # next if $alivenode == 0; - my $indel_type = " "; - if ($2 eq "insertion" || $2 eq "deletion"){ - my $thisindel = (); - $thisindel = $6 if $2 eq "insertion"; - $thisindel = $7 if $2 eq "deletion"; - - $indel_type = "i".checkIndelType($node, $thisindel, $motif,$alignmentpt,$3, $2) if $2 eq "insertion"; - $indel_type = "d".checkIndelType($node, $thisindel, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; - $indel_type = $indel_type."f" if $indel_type =~ /mot/ && length($thisindel) >= length($motif); - } - print "indeltype = $indel_type\n" if $printer == 1; - my $added = 0; - - if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ - print "we have a preexisting one for $pos\n" if $printer == 1; - my @preexisting = @{$newside{$pos}}; - foreach my $pre (@preexisting){ - print "looking at $pre\n" if $printer == 1; - next if $pre !~ /node=$node/; - next if $pre !~ /indeltype=([a-z]+)/; - my $currtype = $1; - - if ($currtype =~ /inon/ && $indel_type =~ /dmot/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $pre; - $added = 1; - } - if ($currtype =~ /dnon/ && $indel_type =~ /imot/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $pre; - $added = 1; - } - if ($currtype =~ /dmot/ && $indel_type =~ /inon/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - $added = 1; - } - if ($currtype =~ /imot/ && $indel_type =~ /dnon/){ - delete $newside{$pos}; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - $added = 1; - } - } - } - - print "added = $added\n" if $printer == 1; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type" if $added == 0; - - print "for new pos,: $pos we have: @{$newside{$pos}}\n " if $printer == 1; - } - } - - foreach my $pos ( 0 ... 
$alignmentln){ - my $oldyes = 0; - foreach my $mut (@$oldmutspt){ - chomp $mut; - $oldyes++; - $mut =~ s/=\t/= \t/g; - $mut =~ s/=$/= /g; - $mut =~ /node=([A-Z\(\), ]+)\ttype=([a-zA-Z ]+)\tposition=([0-9 ]+)\tfrom=([a-zA-Z\- ]+)\tto=([a-zA-Z\- ]+)\tinsertion=([a-zA-Z\- ]+)\tdeletion=([a-zA-Z\- ]+)/; - my $node = $1; - next if $3 != $pos; - print "old mut = $mut\n" if $printer == 1; - my $alivenode = 0; - foreach my $key (@alivehasharr){ - $alivenode = 1 if $key =~ /$node/; - } - #next if $alivenode == 0; - my $indel_type = " "; - if ($2 eq "insertion" || $2 eq "deletion"){ - $indel_type = "i".checkIndelType($node, $6, $motif,$alignmentpt, $3, $2) if $2 eq "insertion"; - $indel_type = "d".checkIndelType($node, $7, $motif,$alignmentpt, $3, $2) if $2 eq "deletion"; - next if $indel_type =~/non/; - } - else{ next;} - - my $imp=0; - $imp = 1 if $indel_type =~ /dmot/ && $alivenode == 0; - $imp = 1 if $indel_type =~ /imot/ && $alivenode == 1; - - - if (exists $newside{$pos} && $indel_type =~ /[a-z]+/){ - my @preexisting = @{$newside{$pos}}; - print "we have a preexisting one for $pos: @preexisting\n" if $printer == 1; - next if $imp == 0; - - if (scalar(@preexisting) == 1){ - my $foundmut = $preexisting[0]; - $foundmut=~ /node=([A-Z, \(\)]+)/; - next if $1 eq $node; - - if (exists $oldside{$pos} || exists $commons{$foundmut}){ - print "not replacing, but just adding\n" if $printer == 1; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - next; - } - - delete $newside{$pos}; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - print "now new one is : @{$newside{$pos}}\n" if $printer == 1; - } - - print "for pos: $pos: @{$newside{$pos}}\n" if $printer == 1; - next; - } - - - my @news = @{$newside{$pos}} if exists $newside{$pos}; - - print "mut = $mut and news = @news\n" if $printer == 1; - push @{$oldside{$pos}}, $mut."\tindeltype=$indel_type"; - push @{$newside{$pos}}, $mut."\tindeltype=$indel_type"; - } - } - - print "in the end, our collected mutations = \n" if $printer == 1; - my @returnarr = (); - foreach my $key (keys %newside) {push @returnarr,@{$newside{$key}};} - print join("\n", @returnarr),"\n" if $printer == 1; - #<STDIN>; - return @returnarr; - -} - - -sub checkIndelType{ - my $printer = 0; - my $node = $_[0]; - my $indel = $_[1]; - my $motif = $_[2]; - my $alignmentpt = $_[3]; - my $posit = $_[4]; - my $type = $_[5]; - my @phases =(); - my %prephases = (); - my %postphases = (); - #print "motif = $motif\n"; - print "IN checkIndelType ... received: @_\n" if $printer == 1; - my $concat = $motif.$motif.$motif.$motif; - my $motiflength = length($motif); - - if ($motiflength > length ($indel)){ - return "non" if $motif !~ /$indel/i; - return checkIndelType_ComplexAnalysis($node, $indel, $motif, $alignmentpt, $posit, $type); - } - - my $firstpass = 0; - for my $y (0 ... $motiflength-1){ - my $phase = substr($concat, $motiflength+$y, $motiflength); - push @phases, $phase; - $firstpass = 1 if $indel =~ /$phase/i; - for my $k (0 ... length($motif)-1){ - print "at: motiflength=$motiflength , y=$y , k=$k.. 
for pre: $motiflength+$y-$k and post: $motiflength+$y-$k+$motiflength in $concat\n" if $printer == 1; - my $pre = substr($concat, $motiflength+$y-$k, $k ); - my $post = substr($concat, $motiflength+$y+$motiflength, $k); - print "adding to phases : $phase - $pre and $post\n" if $printer == 1; - push @{$prephases{$phase}} , $pre; - push @{$postphases{$phase}} , $post; - } - - } - print "firstpass 1= $firstpass\n" if $printer == 1; - return "non" if $firstpass ==0; - $firstpass =0; - - foreach my $phase (@phases){ - my @pres = @{$prephases{$phase}}; - my @posts = @{$postphases{$phase}}; - - foreach my $pre (@pres){ - foreach my $post (@posts){ - - $firstpass = 1 if $indel =~ /($pre)?($phase)+($post)?/i && length($indel) > (3 * length($motif)); - $firstpass = 1 if $indel =~ /^($pre)?($phase)+($post)?$/i && length($indel) < (3 * length($motif)); - print "matched here : ($pre)?($phase)+($post)?\n" if $printer == 1; - last if $firstpass == 1; - } - last if $firstpass == 1; - } - last if $firstpass == 1; - } - - print "firstpass 2= $firstpass\n" if $printer == 1; - return "non" if $firstpass ==0; - return "mot" if $firstpass ==1; -} - - -sub checkIndelType_ComplexAnalysis{ - my $printer = 0; - my $node = $_[0]; - my $indel = $_[1]; - my $motif = $_[2]; - my $alignmentpt = $_[3]; - my $pos = $_[4]; - my $type = $_[5]; - my @speciesinvolved = $node =~ /[A-Z]+/g; - - my @seqs = (); - my $residualseq = length($motif) - length($indel); - print "IN COMPLEX ANALYSIS ... received: @_ .... speciesinvolved = @speciesinvolved\n" if $printer == 1; - print "we have position = $pos, sseq = $alignmentpt->{$speciesinvolved[0]}\n" if $printer == 1; - print "residualseq = $residualseq\n" if $printer == 1; - print "pos=$pos... got: @_\n" if $printer == 1; - foreach my $sp (@speciesinvolved){ - my $spseq = $alignmentpt->{$sp}; - #print "orig spseq = $spseq\n"; - my $subseq = (); - - if ($type eq "deletion"){ - my @indelparts = split(/\s*/,$indel); - my @seqparts = split(/\s*/,$spseq); - - for my $p ($pos ... $pos+length($indel)-1){ - $seqparts[$p] = shift @indelparts; - } - $spseq = join("",@seqparts); - } - #print "mod spseq = $spseq\n"; - # $spseq=~ s/\-//g if $type !~ /deletion/; - - print "substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq)\n" if $pos > 0 && $pos < (length($spseq) - length($motif)) && $printer == 1; - print "substr($spseq, 0, length($indel)+$residualseq)\n" if $pos == 0 && $printer == 1; - print "substr($spseq, $pos - $residualseq, length($indel)+$residualseq)\n" if $pos >= (length($spseq) - length($motif)) && $printer == 1; - - $subseq = substr($spseq, $pos-($residualseq), length($indel)+$residualseq+$residualseq) if $pos > 0 && $pos < (length($spseq) - length($motif)) ; - $subseq = substr($spseq, 0, length($indel)+$residualseq) if $pos == 0; - $subseq = substr($spseq, $pos - $residualseq, length($indel)+$residualseq) if $pos >= (length($spseq) - length($motif)) ; - print "spseq = $spseq . subseq=$subseq . type = $type\n" if $printer == 1; - #<STDIN> if $subseq !~ /[a-z\-]/i; - $subseq =~ s/\-/$indel/g if $type =~ /insertion/; - push @seqs, $subseq; - print "seqs = @seqs\n" if $printer == 1; - } - return "non" if checkIfSeqsIdentical(@seqs) eq "NO"; - - print "checking for $seqs[0] \n" if $printer == 1; - - my @phases =(); - my %prephases = (); - my %postphases = (); - my $concat = $motif.$motif.$motif.$motif; - my $motiflength = length($motif); - - my $firstpass = 0; - - for my $y (0 ... 
$motiflength-1){ - my $phase = substr($concat, $motiflength+$y, $motiflength); - push @phases, $phase; - $firstpass = 1 if $seqs[0] =~ /$phase/i; - for my $k (0 ... length($motif)-1){ - my $pre = substr($concat, $motiflength+$y-$k, $k ); - my $post = substr($concat, $motiflength+$y+$motiflength, $k); - print "adding to phases : $phase - $pre and $post\n" if $printer == 1; - push @{$prephases{$phase}} , $pre; - push @{$postphases{$phase}} , $post; - } - - } - print "firstpass 1= $firstpass.. also, res-d = ",(length($seqs[0]))%(length($motif)),"\n" if $printer == 1; - return "non" if $firstpass ==0; - $firstpass =0; - foreach my $phase (@phases){ - - $firstpass = 1 if $seqs[0] =~ /^($phase)+$/i && ((length($seqs[0]))%(length($motif))) == 0; - - if (((length($seqs[0]))%(length($motif))) != 0){ - my @pres = @{$prephases{$phase}}; - my @posts = @{$postphases{$phase}}; - foreach my $pre (@pres){ - foreach my $post (@posts){ - next if $pre !~ /\S/ && $post !~ /\S/; - $firstpass = 1 if ($seqs[0] =~ /^($pre)($phase)+($post)$/i || $seqs[0] =~ /^($pre)($phase)+$/i || $seqs[0] =~ /^($phase)+($post)$/i); - print "caught with $pre $phase $post\n" if $printer == 1; - last if $firstpass == 1; - } - last if $firstpass == 1; - } - } - - last if $firstpass == 1; - } - - #print "indel = $indel.. motif = $motif.. firstpass 2= mot\n" if $firstpass ==1; - #print "indel = $indel.. motif = $motif.. firstpass 2= non\n" if $firstpass ==0; - #<STDIN>;# if $firstpass ==1; - return "non" if $firstpass ==0; - return "mot" if $firstpass ==1; - -} - -sub checkIfSeqsIdentical{ - my @seqs = @_; - my $identical = 1; - - for my $j (1 ... $#seqs){ - $identical = 0 if uc($seqs[0]) ne uc($seqs[$j]); - } - return "NO" if $identical == 0; - return "YES" if $identical == 1; - -} - -sub summarizeMutations{ - my $mutspt = $_[0]; - my @muts = @$mutspt; - my $tree = $_[1]; - - my @returnarr = (); - - for (1 ... 
38){ - push @returnarr, "NA"; - } - push @returnarr, "NULL"; - return @returnarr if $tree eq "NULL" || scalar(@muts) < 1; - - - my @bspecies = (); - my @dspecies = (); - my $treecopy = $tree; - $treecopy =~ s/[\(\)]//g; - my @treeparts = split(/[\.,]+/, $treecopy); - - for my $part (@treeparts){ - if ($part =~ /\+/){ - $part =~ s/\+//g; - #my @sp = split(/\s*/, $part); - #foreach my $p (@sp) {push @bspecies, $p;} - push @bspecies, $part; - } - if ($part =~ /\-/){ - $part =~ s/\-//g; - #my @sp = split(/\s*/, $part); - #foreach my $p (@sp) {push @dspecies, $p;} - push @dspecies, $part; - } - - } - #print "-------------------------------------------------------\n"; - - my ($insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) = (0,0,0,0,0,0,0,0); - my ($binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) = (0,0,0,0,0,0,0,0); - my ($dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, $dmotdeletionsf, $dnoninsertions, $dnondeletions) = (0,0,0,0,0,0,0,0); - my ($ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) = (0,0,0,0,0,0,0,0); - my ($substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) = (0,0,0,0,"NA","NA"); - - my @insertionsarr = (" "); - my @deletionsarr = (" "); - - my @substitutionsarr = (" "); - - - foreach my $mut (@muts){ - # print "mut = $mut\n"; - chomp $mut; - $mut =~ s/=\t/= /g; - $mut =~ s/=$/= /g; - my %mhash = (); - my @mields = split(/\t/,$mut); - - foreach my $m (@mields){ - my @fields = split(/=/,$m); - next if $fields[1] eq " "; - $mhash{$fields[0]} = $fields[1]; - } - - my $myutype = (); - my $decided = 0; - - my $localnode = $mhash{"node"}; - $localnode =~ s/[\(\)\. ,]//g; - - - foreach my $s (@bspecies){ - if ($localnode eq $s) { - $decided = 1; $myutype = "b"; - } - } - - foreach my $s (@dspecies){ - if ($localnode eq $s) { - $decided = 1; $myutype = "d"; - } - } - - $myutype = "n" if $decided != 1; - - - # print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. \n"; - # <STDIN> if $mhash{"type"} eq "insertion" && $myutype eq "b"; - - - if ($mhash{"type"} eq "substitution"){ - $substitutions++; - $bsubstitutions++ if $myutype eq "b"; - $dsubstitutions++ if $myutype eq "d"; - $nsubstitutions++ if $myutype eq "n"; - # print "substitution: from= $mhash{from}, to = $mhash{to}, and type = myutype\n"; - push @substitutionsarr, "b:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "b"; - push @substitutionsarr, "d:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "d"; - push @substitutionsarr, "n:$mhash{position}:".$mhash{"from"}.">".$mhash{"to"} if $myutype eq "n"; - # print "substitutionsarr = @substitutionsarr\n"; - # <STDIN>; - } - else{ - #print "tree=$tree, birth species=@bspecies, death species=@dspecies, node=$mhash{node} .. myutype=$myutype .. 
indeltype=$mhash{indeltype}\n"; - if ($mhash{"type"} eq "deletion"){ - $deletions++; - - $motdeletions++ if $mhash{"indeltype"} =~ /dmot/; - $motdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/; - - $nondeletions++ if $mhash{"indeltype"} =~ /dnon/; - - $bdeletions++ if $myutype eq "b"; - $ddeletions++ if $myutype eq "d"; - $ndeletions++ if $myutype eq "n"; - - $bmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "b"; - $bmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "b"; - $bnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "b"; - - $dmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "d"; - $dmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "d"; - $dnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "d"; - - $nmotdeletions++ if $mhash{"indeltype"} =~ /dmot/ && $myutype eq "n"; - $nmotdeletionsf++ if $mhash{"indeltype"} =~ /dmotf/ && $myutype eq "n"; - $nnondeletions++ if $mhash{"indeltype"} =~ /dnon/ && $myutype eq "n"; - - push @deletionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "b"; - push @deletionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "d"; - push @deletionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"deletion"} if $myutype eq "n"; - } - - if ($mhash{"type"} eq "insertion"){ - $insertions++; - - $motinsertions++ if $mhash{"indeltype"} =~ /imot/; - $motinsertionsf++ if $mhash{"indeltype"} =~ /imotf/; - $noninsertions++ if $mhash{"indeltype"} =~ /inon/; - - $binsertions++ if $myutype eq "b"; - $dinsertions++ if $myutype eq "d"; - $ninsertions++ if $myutype eq "n"; - - $bmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "b"; - $bmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "b"; - $bnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "b"; - - $dmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "d"; - $dmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "d"; - $dnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "d"; - - $nmotinsertions++ if $mhash{"indeltype"} =~ /imot/ && $myutype eq "n"; - $nmotinsertionsf++ if $mhash{"indeltype"} =~ /imotf/ && $myutype eq "n"; - $nnoninsertions++ if $mhash{"indeltype"} =~ /inon/ && $myutype eq "n"; - - push @insertionsarr, "b:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "b"; - push @insertionsarr, "d:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "d"; - push @insertionsarr, "n:$mhash{indeltype}:$mhash{position}:".$mhash{"insertion"} if $myutype eq "n"; - - } - } - } - - - - $indels = "ins=".join(",",@insertionsarr).";dels=".join(",",@deletionsarr) if scalar(@insertionsarr) > 1 || scalar(@deletionsarr) > 1 ; - $subs = join(",",@substitutionsarr) if scalar(@substitutionsarr) > 1; - $indels =~ s/ //g; - $subs =~ s/ //g ; - - #print "indels = $indels, subs=$subs\n"; - ##<STDIN> if $indels =~ /[a-zA-Z0-9]/ || $subs =~ /[a-zA-Z0-9]/ ; - #print "tree = $tree, indels = $indels, subs = $subs, bspecies = @bspecies, dspecies = @dspecies \n"; - my @returnarray = (); - - push (@returnarray, $insertions, $deletions, $motinsertions, $motinsertionsf, $motdeletions, $motdeletionsf, $noninsertions, $nondeletions) ; - push (@returnarray, $binsertions, $bdeletions, $bmotinsertions,$bmotinsertionsf, $bmotdeletions, $bmotdeletionsf, $bnoninsertions, $bnondeletions) ; - push (@returnarray, $dinsertions, $ddeletions, $dmotinsertions,$dmotinsertionsf, $dmotdeletions, 
$dmotdeletionsf, $dnoninsertions, $dnondeletions) ; - push (@returnarray, $ninsertions, $ndeletions, $nmotinsertions,$nmotinsertionsf, $nmotdeletions, $nmotdeletionsf, $nnoninsertions, $nnondeletions) ; - push (@returnarray, $substitutions, $bsubstitutions, $dsubstitutions, $nsubstitutions, $indels, $subs) ; - - push @returnarray, $tree; - - my @copy = @returnarray; - return (@returnarray); - -} - -sub selectBetterTree{ - my $printer = 0; - my $treestudy = $_[0]; - my $alt = $_[1]; - my $mutspt = $_[2]; - my @muts = @$mutspt; - my @trees = (); my @alternatetrees=(); - - @trees = split(/\|/,$treestudy) if $treestudy =~ /\|/; - @alternatetrees = split(/[\|;]/,$alt) if $alt =~ /[\|;\(\)]/; - - $trees[0] = $treestudy if $treestudy !~ /\|/; - $alternatetrees[0] = $alt if $alt !~ /[\|;\(\)]/; - - my @alltrees = (@trees, @alternatetrees); -# push(@alltrees,@alternatetrees); - - my %mutspecies = (); - - print "IN selectBetterTree..treestudy=$treestudy. alt=$alt. for: @_. trees=@trees<. alternatetrees=@alternatetrees\n" if $printer == 1; - #<STDIN>; - foreach my $mut (@muts){ - print colored ['green'],"mut = $mut\n" if $printer == 1; - $mut =~ /node=([A-Z,\(\) ]+)/; - my $node = $1; - $node =~s/[,\(\) ]+//g; - my @indivspecies = $node =~ /[A-Z]+/g; - #print "adding node: $node\n" if $printer == 1; - $mutspecies{$node} = $node; - - #foreach (@indivspecies) { - #$mutspecies{$mut} = $_; #print "for $_ adding $mutspecies{$_}\n"; - #} - - } - - my @treerecords = (); - my $treecount = -1; - foreach my $tree (@alltrees){ - print "checking with tree $tree\n" if $printer == 1; - $treecount++; - $treerecords[$treecount] = 0; - my @indivspecies = ($tree =~ /[A-Z]+/g); - print "indivspecies=@indivspecies\n" if $printer == 1; - foreach my $species (@indivspecies){ - print "checkin if exists species: $species\n" if $printer == 1; - $treerecords[$treecount]+=2 if exists $mutspecies{$species} && $mutspecies{$species} !~ /indeltype=[a-z]mot/; - $treerecords[$treecount]+=1.5 if exists $mutspecies{$species} && $mutspecies{$species} =~ /indeltype=[a-z]mot/; - $treerecords[$treecount]-- if !exists $mutspecies{$species}; - } - - print "for tree $tree, our treecount = $treerecords[$treecount]\n" if $printer == 1; - } - - my @best_tree = array_largest_number_arrayPosition(@treerecords); - print "treerecords = @treerecords. 
hence, best tree = @best_tree\n" if $printer == 1; - - return ($alltrees[$best_tree[0]], $treerecords[$best_tree[0]]) if scalar(@best_tree) == 1; - print "best_tree[0] = $best_tree[0], and treerecords = $treerecords[$best_tree[0]]\n" if $printer == 1; - return ("NULL", -1) if $treerecords[$best_tree[0]] < 1; - my $rando = int(rand($#trees)); - return ($alltrees[$rando], $treerecords[$rando]) if scalar(@best_tree) > 1; - -} - - - - -sub load_sameHash{ - #my $g = %$_[0]; - $sameHash{"CAGT"}="AGTC"; - $sameHash{"ATGA"}="AATG"; - $sameHash{"CAAC"}="AACC"; - $sameHash{"GGAA"}="AAGG"; - $sameHash{"TAAG"}="AAGT"; - $sameHash{"CGAG"}="AGCG"; - $sameHash{"TAGG"}="AGGT"; - $sameHash{"GCAG"}="AGGC"; - $sameHash{"TAGA"}="ATAG"; - $sameHash{"TGA"}="ATG"; - $sameHash{"CAAG"}="AAGC"; - $sameHash{"CTAA"}="AACT"; - $sameHash{"CAAT"}="AATC"; - $sameHash{"GTAG"}="AGGT"; - $sameHash{"GAAG"}="AAGG"; - $sameHash{"CGA"}="ACG"; - $sameHash{"GTAA"}="AAGT"; - $sameHash{"ACAA"}="AAAC"; - $sameHash{"GCGG"}="GGGC"; - $sameHash{"ATCA"}="AATC"; - $sameHash{"TAAC"}="AACT"; - $sameHash{"GGCA"}="AGGC"; - $sameHash{"TGAG"}="AGTG"; - $sameHash{"AACA"}="AAAC"; - $sameHash{"GAGC"}="AGCG"; - $sameHash{"ACCA"}="AACC"; - $sameHash{"TGAA"}="AATG"; - $sameHash{"ACA"}="AAC"; - $sameHash{"GAAC"}="AACG"; - $sameHash{"GCA"}="AGC"; - $sameHash{"CCAC"}="ACCC"; - $sameHash{"CATA"}="ATAC"; - $sameHash{"CAC"}="ACC"; - $sameHash{"TACA"}="ATAC"; - $sameHash{"GGAC"}="ACGG"; - $sameHash{"AGA"}="AAG"; - $sameHash{"ATAA"}="AAAT"; - $sameHash{"CA"}="AC"; - $sameHash{"CCCA"}="ACCC"; - $sameHash{"TCAA"}="AATC"; - $sameHash{"CAGA"}="AGAC"; - $sameHash{"AATA"}="AAAT"; - $sameHash{"CCA"}="ACC"; - $sameHash{"AGAA"}="AAAG"; - $sameHash{"AGTA"}="AAGT"; - $sameHash{"GACG"}="ACGG"; - $sameHash{"TCAG"}="AGTC"; - $sameHash{"ACGA"}="AACG"; - $sameHash{"CGCA"}="ACGC"; - $sameHash{"GAGT"}="AGTG"; - $sameHash{"GA"}="AG"; - $sameHash{"TA"}="AT"; - $sameHash{"TAA"}="AAT"; - $sameHash{"CAG"}="AGC"; - $sameHash{"GATA"}="ATAG"; - $sameHash{"GTA"}="AGT"; - $sameHash{"CCAA"}="AACC"; - $sameHash{"TAG"}="AGT"; - $sameHash{"CAAA"}="AAAC"; - $sameHash{"AAGA"}="AAAG"; - $sameHash{"CACG"}="ACGC"; - $sameHash{"GTCA"}="AGTC"; - $sameHash{"GGA"}="AGG"; - $sameHash{"GGAT"}="ATGG"; - $sameHash{"CGGG"}="GGGC"; - $sameHash{"CGGA"}="ACGG"; - $sameHash{"AGGA"}="AAGG"; - $sameHash{"TAAA"}="AAAT"; - $sameHash{"GAGA"}="AGAG"; - $sameHash{"ACTA"}="AACT"; - $sameHash{"GCGA"}="AGCG"; - $sameHash{"CACA"}="ACAC"; - $sameHash{"AGAT"}="ATAG"; - $sameHash{"GAGG"}="AGGG"; - $sameHash{"CGAC"}="ACCG"; - $sameHash{"GGAG"}="AGGG"; - $sameHash{"GCCA"}="AGCC"; - $sameHash{"CCAG"}="AGCC"; - $sameHash{"GAAA"}="AAAG"; - $sameHash{"CAGG"}="AGGC"; - $sameHash{"GAC"}="ACG"; - $sameHash{"CAA"}="AAC"; - $sameHash{"GACC"}="ACCG"; - $sameHash{"GGCG"}="GGGC"; - $sameHash{"GGTA"}="AGGT"; - $sameHash{"AGCA"}="AAGC"; - $sameHash{"GATG"}="ATGG"; - $sameHash{"GTGA"}="AGTG"; - $sameHash{"ACAG"}="AGAC"; - $sameHash{"CGG"}="GGC"; - $sameHash{"ATA"}="AAT"; - $sameHash{"GACA"}="AGAC"; - $sameHash{"GCAA"}="AAGC"; - $sameHash{"CAGC"}="AGCC"; - $sameHash{"GGGA"}="AGGG"; - $sameHash{"GAG"}="AGG"; - $sameHash{"ACAT"}="ATAC"; - $sameHash{"GAAT"}="AATG"; - $sameHash{"CACC"}="ACCC"; - $sameHash{"GAT"}="ATG"; - $sameHash{"GCG"}="GGC"; - $sameHash{"GCAC"}="ACGC"; - $sameHash{"GAA"}="AAG"; - $sameHash{"TGGA"}="ATGG"; - $sameHash{"CCGA"}="ACCG"; - $sameHash{"CGAA"}="AACG"; -} - - - -sub load_revHash{ - $revHash{"CTGA"}="AGTC"; - $revHash{"TCTT"}="AAAG"; - $revHash{"CTAG"}="AGCT"; - $revHash{"GGTG"}="ACCC"; - $revHash{"GCC"}="GGC"; - 
$revHash{"GCTT"}="AAGC"; - $revHash{"GCGT"}="ACGC"; - $revHash{"GTTG"}="AACC"; - $revHash{"CTCC"}="AGGG"; - $revHash{"ATC"}="ATG"; - $revHash{"CGAT"}="ATCG"; - $revHash{"TTAA"}="AATT"; - $revHash{"GTTC"}="AACG"; - $revHash{"CTGC"}="AGGC"; - $revHash{"TCGA"}="ATCG"; - $revHash{"ATCT"}="ATAG"; - $revHash{"GGTT"}="AACC"; - $revHash{"CTTA"}="AAGT"; - $revHash{"TGGC"}="AGCC"; - $revHash{"CCG"}="GGC"; - $revHash{"CGGC"}="GGCC"; - $revHash{"TTAG"}="AACT"; - $revHash{"GTG"}="ACC"; - $revHash{"CTTT"}="AAAG"; - $revHash{"TGCA"}="ATGC"; - $revHash{"CGCT"}="AGCG"; - $revHash{"TTCC"}="AAGG"; - $revHash{"CT"}="AG"; - $revHash{"C"}="G"; - $revHash{"CTCT"}="AGAG"; - $revHash{"ACTT"}="AAGT"; - $revHash{"GGTC"}="ACCG"; - $revHash{"ATTC"}="AATG"; - $revHash{"GGGT"}="ACCC"; - $revHash{"CCTA"}="AGGT"; - $revHash{"CGCG"}="GCGC"; - $revHash{"GTGT"}="ACAC"; - $revHash{"GCCC"}="GGGC"; - $revHash{"GTCG"}="ACCG"; - $revHash{"TCCC"}="AGGG"; - $revHash{"TTCA"}="AATG"; - $revHash{"AGTT"}="AACT"; - $revHash{"CCCT"}="AGGG"; - $revHash{"CCGC"}="GGGC"; - $revHash{"CTT"}="AAG"; - $revHash{"TTGG"}="AACC"; - $revHash{"ATT"}="AAT"; - $revHash{"TAGC"}="AGCT"; - $revHash{"ACTG"}="AGTC"; - $revHash{"TCAC"}="AGTG"; - $revHash{"CTGT"}="AGAC"; - $revHash{"TGTG"}="ACAC"; - $revHash{"ATCC"}="ATGG"; - $revHash{"GTGG"}="ACCC"; - $revHash{"TGGG"}="ACCC"; - $revHash{"TCGG"}="ACCG"; - $revHash{"CGGT"}="ACCG"; - $revHash{"GCTC"}="AGCG"; - $revHash{"TACG"}="ACGT"; - $revHash{"GTTT"}="AAAC"; - $revHash{"CAT"}="ATG"; - $revHash{"CATG"}="ATGC"; - $revHash{"GTTA"}="AACT"; - $revHash{"CACT"}="AGTG"; - $revHash{"TCAT"}="AATG"; - $revHash{"TTA"}="AAT"; - $revHash{"TGTA"}="ATAC"; - $revHash{"TTTC"}="AAAG"; - $revHash{"TACT"}="AAGT"; - $revHash{"TGTT"}="AAAC"; - $revHash{"CTA"}="AGT"; - $revHash{"GACT"}="AGTC"; - $revHash{"TTGC"}="AAGC"; - $revHash{"TTC"}="AAG"; - $revHash{"GCT"}="AGC"; - $revHash{"GCAT"}="ATGC"; - $revHash{"TGGT"}="AACC"; - $revHash{"CCT"}="AGG"; - $revHash{"CATC"}="ATGG"; - $revHash{"CCAT"}="ATGG"; - $revHash{"CCCG"}="GGGC"; - $revHash{"TGCC"}="AGGC"; - $revHash{"TG"}="AC"; - $revHash{"TGCT"}="AAGC"; - $revHash{"GCCG"}="GGCC"; - $revHash{"TCTG"}="AGAC"; - $revHash{"TGT"}="AAC"; - $revHash{"TTAT"}="AAAT"; - $revHash{"TAGT"}="AACT"; - $revHash{"TATG"}="ATAC"; - $revHash{"TTTA"}="AAAT"; - $revHash{"CGTA"}="ACGT"; - $revHash{"TA"}="AT"; - $revHash{"TGTC"}="AGAC"; - $revHash{"CTAT"}="ATAG"; - $revHash{"TATA"}="ATAT"; - $revHash{"TAC"}="AGT"; - $revHash{"TC"}="AG"; - $revHash{"CATT"}="AATG"; - $revHash{"TCG"}="ACG"; - $revHash{"ATTT"}="AAAT"; - $revHash{"CGTG"}="ACGC"; - $revHash{"CTG"}="AGC"; - $revHash{"TCGT"}="AACG"; - $revHash{"TCCG"}="ACGG"; - $revHash{"GTT"}="AAC"; - $revHash{"ATGT"}="ATAC"; - $revHash{"CTTG"}="AAGC"; - $revHash{"CCTT"}="AAGG"; - $revHash{"GATC"}="ATCG"; - $revHash{"CTGG"}="AGCC"; - $revHash{"TTCT"}="AAAG"; - $revHash{"CGTC"}="ACGG"; - $revHash{"CG"}="GC"; - $revHash{"TATT"}="AAAT"; - $revHash{"CTCG"}="AGCG"; - $revHash{"TCTC"}="AGAG"; - $revHash{"TCCT"}="AAGG"; - $revHash{"TGG"}="ACC"; - $revHash{"ACTC"}="AGTG"; - $revHash{"CTC"}="AGG"; - $revHash{"CGC"}="GGC"; - $revHash{"TTG"}="AAC"; - $revHash{"ACCT"}="AGGT"; - $revHash{"TCTA"}="ATAG"; - $revHash{"GTAC"}="ACGT"; - $revHash{"TTGA"}="AATC"; - $revHash{"GTCC"}="ACGG"; - $revHash{"GATT"}="AATC"; - $revHash{"T"}="A"; - $revHash{"CGTT"}="AACG"; - $revHash{"GTC"}="ACG"; - $revHash{"GCCT"}="AGGC"; - $revHash{"TGC"}="AGC"; - $revHash{"TTTG"}="AAAC"; - $revHash{"GGCT"}="AGCC"; - $revHash{"TCA"}="ATG"; - $revHash{"GTGC"}="ACGC"; - $revHash{"TGAT"}="AATC"; - 
$revHash{"TAT"}="AAT"; - $revHash{"CTAC"}="AGGT"; - $revHash{"TGCG"}="ACGC"; - $revHash{"CTCA"}="AGTG"; - $revHash{"CTTC"}="AAGG"; - $revHash{"GCTG"}="AGCC"; - $revHash{"TATC"}="ATAG"; - $revHash{"TAAT"}="AATT"; - $revHash{"ACT"}="AGT"; - $revHash{"TCGC"}="AGCG"; - $revHash{"GGT"}="ACC"; - $revHash{"TCC"}="AGG"; - $revHash{"TTGT"}="AAAC"; - $revHash{"TGAC"}="AGTC"; - $revHash{"TTAC"}="AAGT"; - $revHash{"CGT"}="ACG"; - $revHash{"ATTA"}="AATT"; - $revHash{"ATTG"}="AATC"; - $revHash{"CCTC"}="AGGG"; - $revHash{"CCGG"}="GGCC"; - $revHash{"CCGT"}="ACGG"; - $revHash{"TCCA"}="ATGG"; - $revHash{"CGCC"}="GGGC"; - $revHash{"GT"}="AC"; - $revHash{"TTCG"}="AACG"; - $revHash{"CCTG"}="AGGC"; - $revHash{"TCT"}="AAG"; - $revHash{"GTAT"}="ATAC"; - $revHash{"GTCT"}="AGAC"; - $revHash{"GCTA"}="AGCT"; - $revHash{"TACC"}="AGGT"; -} - - -sub allCaps{ - my $motif = $_[0]; - $motif =~ s/a/A/g; - $motif =~ s/c/C/g; - $motif =~ s/t/T/g; - $motif =~ s/g/G/g; - return $motif; -} - - -sub all_caps{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} - if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} - if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} - } - return join("",@strand); -} -sub array_mean{ - return "NA" if scalar(@_) == 0; - my $sum = 0; - foreach my $val (@_){ - $sum = $sum + $val; - } - return ($sum/scalar(@_)); -} -sub array_sum{ - return "NA" if scalar(@_) == 0; - my $sum = 0; - foreach my $val (@_){ - $sum = $sum + $val; - } - return ($sum); -} - -sub variance{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $mean = array_mean(@_); - my $num = 0; - return 0 if scalar(@_) == 1; -# print "mean = $mean .. array = >@_<\n"; - foreach my $ele (@_){ - # print "$num = $num + ($ele-$mean)*($ele-$mean)\n"; - $num = $num + ($ele-$mean)*($ele-$mean); - } - my $var = $num / scalar(@_); - return $var; -} - -sub array_95confIntervals{ - return "NA" if scalar(@_) <= 0; - my @sorted = sort { $a <=> $b } @_; -# print "@sorted=",scalar(@sorted), "\n"; - my $aDeechNo = int((scalar(@sorted) * 2.5) / 100); - my $saaDeNo = int((scalar(@sorted) * 97.5) / 100); - - return ($sorted[$aDeechNo], $sorted[$saaDeNo]); -} - -sub array_median{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - my $totalno = scalar(@sorted); - - #print "sorted = @sorted\n"; - - my $pick = (); - if ($totalno % 2 == 1){ - #print "odd set .. totalno = $totalno\n"; - my $mid = $totalno / 2; - my $onehalfno = $mid - $mid % 1; - my $secondhalfno = $onehalfno + 1; - my $onehalf = $sorted[$onehalfno-1]; - my $secondhalf = $sorted[$secondhalfno-1]; - #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; - - $pick = $secondhalf; - } - else{ - #print "even set .. 
totalno = $totalno\n"; - my $mid = $totalno / 2; - my $onehalfno = $mid; - my $secondhalfno = $onehalfno + 1; - my $onehalf = $sorted[$onehalfno-1]; - my $secondhalf = $sorted[$secondhalfno-1]; - #print "onehalfno = $onehalfno and secondhalfno = $secondhalfno \n onehalf = $onehalf and secondhalf = $secondhalf\n"; - $pick = ($onehalf + $secondhalf )/2; - - } - #print "pick = $pick..\n"; - return $pick; - -} - - -sub array_numerical_sort{ - return "NA" if scalar(@_) == 0; - my @sorted = sort { $a <=> $b } @_; - return (@sorted); -} - -sub array_smallest_number{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - return $sorted[0]; -} - - -sub array_largest_number{ - return "NA" if scalar(@_) == 0; - return $_[0] if scalar(@_) == 1; - my @sorted = sort { $a <=> $b } @_; - return $sorted[$#sorted]; -} - - -sub array_largest_number_arrayPosition{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $maxpos = 0; - my @maxposes = (); - my @maxvals = (); - my $maxval = array_smallest_number(@_); - for my $i (0 ... $#_){ - if ($_[$i] > $maxval){ - $maxval = $_[$i]; - $maxpos = $i; - } - if ($_[$i] == $maxval){ - $maxval = $_[$i]; - if (scalar(@maxposes) == 0){ - push @maxposes, $i; - push @maxvals, $_[$i]; - - } - elsif ($maxvals[0] == $maxval){ - push @maxposes, $i; - push @maxvals, $_[$i]; - } - else{ - @maxposes = (); @maxvals = (); - push @maxposes, $i; - push @maxvals, $_[$i]; - } - - } - - } - return $maxpos if scalar(@maxposes) < 2; - return (@maxposes); -} - -sub array_smallest_number_arrayPosition{ - return "NA" if scalar(@_) == 0; - return 0 if scalar(@_) == 1; - my $minpos = 0; - my @minposes = (); - my @minvals = (); - my $minval = array_largest_number(@_); - my $maxval = array_smallest_number(@_); - #print "starting with $maxval, ending with $minval\n"; - for my $i (0 ... $#_){ - if ($_[$i] < $minval){ - $minval = $_[$i]; - $minpos = $i; - } - if ($_[$i] == $minval){ - $minval = $_[$i]; - if (scalar(@minposes) == 0){ - push @minposes, $i; - push @minvals, $_[$i]; - - } - elsif ($minvals[0] == $minval){ - push @minposes, $i; - push @minvals, $_[$i]; - } - else{ - @minposes = (); @minvals = (); - push @minposes, $i; - push @minvals, $_[$i]; - } - - } - - } - #print "minposes=@minposes\n"; - - return $minpos if scalar(@minposes) < 2; - return (@minposes); -} - -sub basic_stats{ - my @arr = @_; -# print " array_smallest_number= ", array_smallest_number(@arr)," array_largest_number= ", array_largest_number(@arr), " array_mean= ",array_mean(@arr),"\n"; - return ":"; -} -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub maftoAxt_multispecies { - my $printer = 0; -# print "in maftoAxt_multispecies : got @_\n"; - my $fname=$_[0]; - open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; - my $treedefinition = $_[1]; - open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; - my $counter = 0; - my $exactspeciesset = $_[3]; - my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); - - $treedefinition=~s/[\)\(, ]/\t/g; - my @species=split(/\t+/,$treedefinition); - my @exactspecies=(); - - foreach my $spec (@species){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } - } -# print "exactspecies=@exactspecies\n"; - - ########### - my $select = 2; - #select = 1 if all species need sequences to be present for each block otherwise, it is 0 - #select = 2 only the allowed set make up the alignment. 
use the removeset - # information to detect alignments that have other important genomes aligned. - ########### - my @allowedset = (); - @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; - @allowedset = join("_",0,@species) if $select == 1; - #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; - @allowedset = join("_",0,@exactspecies) if $select == 2; - #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; - - my $start = 0; - my @sequences = (); - my @titles = (); - my $species_counter = "0"; - my $countermatch = 0; - my $outsideSpecies=0; - - while(my $line = <IN>){ - next if $line =~ /^#/; - next if $line =~ /^i/; - chomp $line; - #print "$line"; - my @fields = split(/\s+/,$line); - chomp $line; - if ($line =~ /^a /){ - $start = 1; - } - - if ($line =~ /^s /){ - # print "fields1 = $fields[1] , start = $start\n"; - - foreach my $sp (@species){ - if ($fields[1] =~ /$sp/){ - $species_counter = $species_counter."_".$sp; - push(@sequences, $fields[6]); - my @sp_info = split(/\./,$fields[1]); - my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); - push(@titles, $title); - - } - } - } - - if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start == 1)){ - - my $arranged = reorderSpecies($species_counter, @species); - my $stopper = 1; - my $arrno = 0; - foreach my $set (@allowedset){ - if ($arranged eq $set){ - # print "$arranged == $set\n"; - $stopper = 0; last; - } - $arrno++; - } - - if ($stopper == 0) { - # print " accepted\n"; - @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; - - @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; - my $filteredseq = filter_gaps(@sequences); - - if ($filteredseq ne "SHORT"){ - $counter++; - print OUT join (" ",$counter, @titles), "\n"; - print OUT $filteredseq, "\n"; - print OUT "\n"; - $countermatch++; - } - # my @filtered_seq = split(/\t/,filter_gaps(@sequences) ); - } - else{#print "\n"; - } - - @sequences = (); @titles = (); $start = 0;$species_counter = "0"; - next; - - } - } -# print "countermatch = $countermatch\n"; -} - -sub reorderSpecies{ - my @inarr=@_; - my $currSpecies = shift (@inarr); - my $ordered_species = 0; - my @species=@inarr; - foreach my $order (@species){ - $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; - } - return $ordered_species; - -} - -sub filter_gaps{ - my @sequences = @_; -# print "sequences sent are @sequences\n"; - my $seq_length = length($sequences[0]); - my $seq_no = scalar(@sequences); - my $allgaps = (); - for (1 ... $seq_no){ - $allgaps = $allgaps."-"; - } - - my @seq_array = (); - my $seq_counter = 0; - foreach my $seq (@sequences){ -# my @sequence = split(/\s*/,$seq); - $seq_array[$seq_counter] = [split(/\s*/,$seq)]; -# push @seq_array, [@sequence]; - $seq_counter++; - } - my $g = 0; - while ( $g < $seq_length){ - last if (!exists $seq_array[0][$g]); - my $bases = (); - for my $u (0 ... 
$#seq_array){ - $bases = $bases.$seq_array[$u][$g]; - } -# print $bases, "\n"; - if ($bases eq $allgaps){ -# print "bases are $bases, position is $g \n"; - for my $seq (@seq_array){ - splice(@$seq , $g, 1); - } - } - else { - $g++; - } - } - - my @outs = (); - - foreach my $seq (@seq_array){ - push(@outs, join("",@$seq)); - } - return "SHORT" if length($outs[0]) <=100; - return (join("\n", @outs)); -} - - -sub allowedSetOfSpecies{ - my @allowed_species = split(/_/,$_[0]); - unshift @allowed_species, 0; -# print "allowed set = @allowed_species \n"; - my @output = (); - for (0 ... scalar(@allowed_species) - 4){ - push(@output, join("_",@allowed_species)); - pop @allowed_species; - } - return join(";",reverse(@output)); - -} - - -sub orderInfo{ - my @info = split(/;/,$_[0]); -# print "info = @info"; - my @old = split(/_/,$_[1]); - my @new = split(/_/,$_[2]); - shift @old; shift @new; - my @outinfo = (); - foreach my $spe (@new){ - for my $no (0 ... $#old){ - if ($spe eq $old[$no]){ - push(@outinfo, $info[$no]); - } - } - } -# print "outinfo = @outinfo \n"; - return join(";", @outinfo); -} - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub printarr { - print ">::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n"; - foreach my $line (@_) {print "$line\n";} - print "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::<\n"; -} - diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsatellite_birthdeath.xml --- a/tools/regVariation/microsatellite_birthdeath.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -<tool id="microsatellite_birthdeath" name="Identify microsatellite births and deaths" version="1.0.0"> - <description> and causal mutational mechanisms from previously identified orthologous microsatellite sets</description> - <command interpreter="perl"> - microsatellite_birthdeath.pl - $alignment - $orthfile - $outfile - ${alignment.metadata.species} - "$tree_definition" - $thresholds - $separation - $simthresh - - </command> - <inputs> - <page> - <param format="maf" name="alignment" type="data" label="Select MAF alignments"/> - - <param format="txt" name="orthfile" type="data" label="Select raw microsatellite data"/> - - <param name="tree_definition" size="200" type="text" value= "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species in the alignment, whether or not they were selected for microsatellite extraction" - help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> - - <param name="separation" size="10" type="integer" value="40" label="Total length of flanking DNA used for sequence-similarity comparisons among species" - help="A value of 40 means: 20 bp upstream and 20 bp downstream DNA will be used for similarity comparisons."/> - - <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum threshold for the number of repeats for microsatellites" - help="A value of 9,10,12,12 means: all mononucleotides having fewer than 9 repeats, dinucleotides having fewer than 10 repeats, trinucleotides having fewer than 12 repeats, and tetranucleotides having fewer than 12 repeats will be excluded from the output."/> - - <param name="simthresh" size="10" type="integer" value="80" label="Percent sequence similarity of flanking regions (of the same length as the separation distance above)" - help="Enter a value from 0 to 100"/> - - - </page> - </inputs> - <outputs> - <data format="txt" name="outfile" metadata_source="orthfile"/> - 
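<!-- A worked example of the thresholds format above (the (CA) runs are hypothetical illustrations, not taken from the tool): thresholds="9,10,12,12" sets per-motif minimum repeat counts of mono=9, di=10, tri=12 and tetra=12, so a (CA)9 run would be excluded while a (CA)10 run is kept. -->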
</outputs> - <tests> - <test> - <param name="alignment" value="chr22_5sp.maf"/> - <param name="orthfile" value="chr22_5sp.microraw.tabular"/> - <param name="thresholds" value="9,10,12,12"/> - <param name="tree_definition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/> - <param name="separation" value="40"/> - <param name="simthresh" value="80"/> - <output name="outfile" file="chr22_5sp.microtab.tabular"/> - </test> - </tests> - - - <help> - -.. class:: infomark - -**What it does** - -This tool uses raw orthologous microsatellite clusters (identified by the tool "Extract orthologous microsatellites") to identify microsatellite births and deaths along individual lineages of a phylogenetic tree. - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_alignment_level.py --- a/tools/regVariation/microsats_alignment_level.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,323 +0,0 @@ - #!/usr/bin/env python -#Guruprasad Ananda -""" -Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output. -""" -from galaxy import eggs -import sys, os, tempfile, string, math, re - -def reverse_complement(text): - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - comp = [ch for ch in text.translate(DNA_COMP)] - comp.reverse() - return "".join(comp) - -def main(): - if len(sys.argv) != 8: - print >>sys.stderr, "Insufficient number of arguments." - sys.exit() - - infile = open(sys.argv[1],'r') - separation = int(sys.argv[2]) - outfile = sys.argv[3] - align_type = sys.argv[4] - if align_type == "2way": - align_type_len = 2 - elif align_type == "3way": - align_type_len = 3 - mono_threshold = int(sys.argv[5]) - non_mono_threshold = int(sys.argv[6]) - allow_different_units = int(sys.argv[7]) - - print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" %(separation, mono_threshold, non_mono_threshold, allow_different_units==1) - try: - fout = open(outfile, "w") - print >>fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit" - #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik") - sputnik_cmd = "sputnik" - input = infile.read() - skipped = 0 - block_num = 0 - input = input.replace('\r','\n') - for block in input.split('\n\n'): - block_num += 1 - tmpin = tempfile.NamedTemporaryFile() - tmpout = tempfile.NamedTemporaryFile() - tmpin.write(block.strip()) - tmpin.flush() #make sure the block is on disk before sputnik reads the file - cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name - try: - os.system(cmdline) - except Exception, es: - continue - sputnik_out = tmpout.read() - tmpin.close() - tmpout.close() - if sputnik_out != "": - if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')): - skipped += 1 - continue - align_block = block.strip().split('>') - - lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6} - blockdict={} - r=0 - namelist=[] - for k,sput_block in enumerate(sputnik_out.split('>')[1:]): - whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip() - p = re.compile('\n(\S*nucleotide)') - repeats = p.split(sput_block.strip()) - repeats_count = len(repeats) - j = 1 - name = repeats[0].strip() - try: - coords = re.search('\d+[-_:]\d+',name).group() - coords = 
coords.replace('_','-').replace(':','-') - except Exception, e: - coords = '0-0' - pass - r += 1 - blockdict[r]={} - try: - sp_name = name[:name.index('.')] - chr_name = name[name.index('.'):name.index('(')] - namelist.append(sp_name + chr_name) - except: - namelist.append(name[:20]) - while j < repeats_count: - try: - if repeats[j].strip() not in lendict: - j += 2 - continue - - if blockdict[r].has_key('types'): - blockdict[r]['types'].append(repeats[j].strip()) #type of microsat - else: - blockdict[r]['types'] = [repeats[j].strip()] #type of microsat - - sequence = ''.join(align_block[r].split('\n')[1:]).replace('\n','').strip() - start = int(repeats[j+1].split('--')[0].split(':')[0].strip()) - #check to see if there are gaps before the start of the repeat, and change the start accordingly - sgaps = 0 - ch_pos = start - 1 - while ch_pos >= 0: - if whole_seq[ch_pos] == '-': - sgaps += 1 - else: - break #break at the 1st non-gap character - ch_pos -= 1 - if blockdict[r].has_key('starts'): - blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS - else: - blockdict[r]['starts'] = [start+sgaps] - - end = int(repeats[j+1].split('--')[0].split(':')[1].strip()) - #check to see if there are gaps after the end of the repeat, and change the end accordingly - egaps = 0 - for ch in whole_seq[end:]: - if ch == '-': - egaps += 1 - else: - break #break at the 1st non-gap character - if blockdict[r].has_key('ends'): - blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS - else: - blockdict[r]['ends'] = [end+egaps] - - repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence - repeat_len = repeats[j+1].split('--')[1].split()[1].strip() - gap_count = repeat_seq.count('-') - #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count - repeat_len = str(int(repeat_len) - gap_count) - - rel_start = blockdict[r]['starts'][-1] - gaps_before_start = whole_seq[:rel_start].count('-') - - if blockdict[r].has_key('gaps_before_start'): - blockdict[r]['gaps_before_start'].append(gaps_before_start) #gaps before start - else: - blockdict[r]['gaps_before_start'] = [gaps_before_start] #gaps before start - - whole_seq_start= int(coords.split('-')[0]) - if blockdict[r].has_key('whole_seq_start'): - blockdict[r]['whole_seq_start'].append(whole_seq_start) #start co-ord of the whole aligned fragment - else: - blockdict[r]['whole_seq_start'] = [whole_seq_start] #start co-ord of the whole aligned fragment - - if blockdict[r].has_key('lengths'): - blockdict[r]['lengths'].append(repeat_len) #lengths - else: - blockdict[r]['lengths'] = [repeat_len] #lengths - - if blockdict[r].has_key('counts'): - blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #repeat count - else: - blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #repeat count - - if blockdict[r].has_key('units'): - blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit - else: - blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit - - except Exception, eh: - pass - j+=2 - #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'. 
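# [A minimal standalone sketch of the marking step below, added for illustration;
#  the helper name is hypothetical and is not used elsewhere in this script.]
def mark_close_repeats_sketch(starts, ends, separation):
    # Return the indices of repeats lying within `separation` bp of a
    # neighbouring repeat, given parallel lists of start and end co-ords.
    marked = set()
    for ind in range(len(ends) - 1):
        if starts[ind + 1] - ends[ind] < separation:
            marked.add(ind)
            marked.add(ind + 1)
    return marked
# e.g. mark_close_repeats_sketch([0, 12, 100], [8, 20, 110], 10) -> set([0, 1])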
- delete_index_list = [] - for ind, item in enumerate(blockdict[r]['ends']): - try: - if blockdict[r]['starts'][ind+1]-item < separation: - if ind not in delete_index_list: - delete_index_list.append(ind) - if ind+1 not in delete_index_list: - delete_index_list.append(ind+1) - except Exception, ek: - pass - for index in delete_index_list: #mark them for deletion - try: - blockdict[r]['starts'][index] = 'marked' - blockdict[r]['ends'][index] = 'marked' - blockdict[r]['types'][index] = 'marked' - blockdict[r]['gaps_before_start'][index] = 'marked' - blockdict[r]['whole_seq_start'][index] = 'marked' - blockdict[r]['lengths'][index] = 'marked' - blockdict[r]['counts'][index] = 'marked' - blockdict[r]['units'][index] = 'marked' - except Exception, ej: - pass - #remove 'marked' elements from all the lists - """ - for key in blockdict[r].keys(): - for elem in blockdict[r][key]: - if elem == 'marked': - blockdict[r][key].remove(elem) - """ - #print blockdict - - #make sure that the blockdict has keys for both the species - if (1 not in blockdict) or (2 not in blockdict): - continue - - visited_2 = [0 for x in range(len(blockdict[2]['starts']))] - for ind1,coord_s1 in enumerate(blockdict[1]['starts']): - if coord_s1 == 'marked': - continue - coord_e1 = blockdict[1]['ends'][ind1] - out = [] - for ind2,coord_s2 in enumerate(blockdict[2]['starts']): - if coord_s2 == 'marked': - visited_2[ind2] = 1 - continue - coord_e2 = blockdict[2]['ends'][ind2] - #skip if the 2 repeats are not of the same type or don't have the same repeating unit. - if allow_different_units == 0: - if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): - continue - else: - if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2): - continue - #print >>sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2) - #skip if the repeat number thresholds are not met - if blockdict[1]['types'][ind1] == 'mononucleotide': - if (int(blockdict[1]['counts'][ind1]) < mono_threshold): - continue - else: - if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): - continue - - if blockdict[2]['types'][ind2] == 'mononucleotide': - if (int(blockdict[2]['counts'][ind2]) < mono_threshold): - continue - else: - if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): - continue - #print "s1,e1=%s,%s; s2,e2=%s,%s" %(coord_s1,coord_e1,coord_s2,coord_e2) - if (coord_s1 in range(coord_s2,coord_e2)) or (coord_e1 in range(coord_s2,coord_e2)): - out.append(str(block_num)) - out.append(namelist[0]) - rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] - rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[1]['types'][ind1]) - out.append(blockdict[1]['lengths'][ind1]) - out.append(blockdict[1]['counts'][ind1]) - out.append(blockdict[1]['units'][ind1]) - out.append(namelist[1]) - rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] - rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[2]['types'][ind2]) - out.append(blockdict[2]['lengths'][ind2]) - out.append(blockdict[2]['counts'][ind2]) - out.append(blockdict[2]['units'][ind2]) - print >>fout, '\t'.join(out) - visited_2[ind2] = 1 - out=[] - - if 0 in visited_2: #there are still some 
elements in 2nd set which haven't found orthologs yet. - for ind2, coord_s2 in enumerate(blockdict[2]['starts']): - if coord_s2 == 'marked': - continue - if visited_2[ind2] != 0: - continue - coord_e2 = blockdict[2]['ends'][ind2] - out = [] - for ind1,coord_s1 in enumerate(blockdict[1]['starts']): - if coord_s1 == 'marked': - continue - coord_e1 = blockdict[1]['ends'][ind1] - #skip if the 2 repeats are not of the same type or don't have the same repeating unit. - if allow_different_units == 0: - if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): - continue - else: - if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2: - continue - #skip if the repeat number thresholds are not met - if blockdict[1]['types'][ind1] == 'mononucleotide': - if (int(blockdict[1]['counts'][ind1]) < mono_threshold): - continue - else: - if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): - continue - - if blockdict[2]['types'][ind2] == 'mononucleotide': - if (int(blockdict[2]['counts'][ind2]) < mono_threshold): - continue - else: - if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): - continue - - if (coord_s2 in range(coord_s1,coord_e1)) or (coord_e2 in range(coord_s1,coord_e1)): - out.append(str(block_num)) - out.append(namelist[0]) - rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] - rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[1]['types'][ind1]) - out.append(blockdict[1]['lengths'][ind1]) - out.append(blockdict[1]['counts'][ind1]) - out.append(blockdict[1]['units'][ind1]) - out.append(namelist[1]) - rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] - rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) - out.append(str(rel_start)) - out.append(str(rel_end)) - out.append(blockdict[2]['types'][ind2]) - out.append(blockdict[2]['lengths'][ind2]) - out.append(blockdict[2]['counts'][ind2]) - out.append(blockdict[2]['units'][ind2]) - print >>fout, '\t'.join(out) - visited_2[ind2] = 1 - out=[] - - #print >>fout, blockdict - except Exception, exc: - print >>sys.stderr, "type(exc),args,exc: %s, %s, %s" %(type(exc), exc.args, exc) - -if __name__ == "__main__": - main() - diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_alignment_level.xml --- a/tools/regVariation/microsats_alignment_level.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="microsats_align1" name="Extract Orthologous Microsatellites"> - <description> from pair-wise alignments</description> - <command interpreter="python"> - microsats_alignment_level.py $input1 $separation $out_file1 "2way" $mono_threshold $non_mono_threshold $allow_different_units - </command> - <inputs> - <page> - <param format="fasta" name="input1" type="data" label="Select data"/> - <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellites" - help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/> - <param name="mono_threshold" size="10" type="integer" value="9" label="Minimum Threshold for the number of repeats for mononucleotide microsatellites" - help="A value of 9 means: All mononucleotide microsatellites having fewer than 9 repeats will 
be excluded from the output."/> - <param name="non_mono_threshold" size="10" type="integer" value="4" label="Minimum Threshold for the number of repeats for non-mononucleotide microsatellites" - help="A value of 4 means: All non-mononucleotide microsatellites having fewer than 4 repeats will be excluded from the output."/> - <param name="allow_different_units" size="5" type="select" label="Allow orthologous positions to have different microsatellite repeat units/motifs?"> - <option value="0" selected="true">No</option> - <option value="1">Yes</option> - </param> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="package">sputnik</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="2way.maf"/> - <param name="separation" value="10"/> - <param name="mono_threshold" value="9"/> - <param name="non_mono_threshold" value="4"/> - <param name="allow_different_units" value="0"/> - <output name="out_file1" file="ortho_ms.tab"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool uses a modified version of SPUTNIK to fetch microsatellite repeats from the input fasta sequences and extracts orthologous repeats from the sputnik output. The modified version allows detection of mononucleotide microsatellites. More information on SPUTNIK can be found on this website_. The modified version is available here_. - ------ - -.. class:: warningmark - -**Note** - -- Any block/s not containing exactly 2 species will be omitted. - -- This tool will filter out microsatellites based on the user input values for minimum distance and repeat number thresholds. Further, this tool will also filter out microsatellites that have no orthologous microsatellites in one of the species. - -.. _website: http://espressosoftware.com/pages/sputnik.jsp -.. _here: http://www.bx.psu.edu/svn/universe/dependencies/sputnik/ -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_mutability.py --- a/tools/regVariation/microsats_mutability.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,489 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. 
-""" -from galaxy import eggs -import sys, string, re, commands, tempfile, os, fileinput -from galaxy.tools.util.galaxyops import * -from bx.intervals.io import * -from bx.intervals.operations import quicksect - -fout = open(sys.argv[2],'w') -p_group = int(sys.argv[3]) #primary "group-by" feature -p_bin_size = int(sys.argv[4]) -s_group = int(sys.argv[5]) #sub-group by feature -s_bin_size = int(sys.argv[6]) -mono_threshold = 9 -non_mono_threshold = 4 -p_group_cols = [p_group, p_group+7] -s_group_cols = [s_group, s_group+7] -num_generations = int(sys.argv[7]) -region = sys.argv[8] -int_file = sys.argv[9] -if int_file != "None": #User has specified an interval file - try: - fint = open(int_file, 'r') - dbkey_i = sys.argv[10] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] ) - except: - stop_err("Unable to open input Interval file") - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def reverse_complement(text): - DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) - comp = [ch for ch in text.translate(DNA_COMP)] - comp.reverse() - return "".join(comp) - -def get_unique_elems(elems): - seen=set() - return[x for x in elems if x not in seen and not seen.add(x)] - -def get_binned_lists(uniqlist, binsize): - binnedlist=[] - uniqlist.sort() - start = int(uniqlist[0]) - bin_ind=0 - l_ind=0 - binnedlist.append([]) - while l_ind < len(uniqlist): - elem = int(uniqlist[l_ind]) - if elem in range(start,start+binsize): - binnedlist[bin_ind].append(elem) - else: - start += binsize - bin_ind += 1 - binnedlist.append([]) - binnedlist[bin_ind].append(elem) - l_ind += 1 - return binnedlist - -def fetch_weight(H,C,t): - if (H-(C-H)) < t: - return 2.0 - else: - return 1.0 - -def mutabilityEstimator(repeats1,repeats2,thresholds): - mut_num = 0.0 #Mutability Numerator - mut_den = 0.0 #Mutability denominator - for ind,H in enumerate(repeats1): - C = repeats2[ind] - t = thresholds[ind] - w = fetch_weight(H,C,t) - mut_num += ((H-C)*(H-C)*w) - mut_den += w - return [mut_num, mut_den] - -def output_writer(blk, blk_lines): - global winspecies, speciesind - all_elems_1=[] - all_elems_2=[] - all_s_elems_1=[] - all_s_elems_2=[] - for bline in blk_lines: - if not(bline): - continue - items = bline.split('\t') - seq1 = items[1] - start1 = items[2] - end1 = items[3] - seq2 = items[8] - start2 = items[9] - end2 = items[10] - if p_group_cols[0] == 6: - items[p_group_cols[0]] = int(items[p_group_cols[0]]) - items[p_group_cols[1]] = int(items[p_group_cols[1]]) - if s_group_cols[0] == 6: - items[s_group_cols[0]] = int(items[s_group_cols[0]]) - items[s_group_cols[1]] = int(items[s_group_cols[1]]) - all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1 - all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2 - if s_group_cols[0] != -1: #sub-group is not None - all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1 - all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2 - uniq_elems_1 = get_unique_elems(all_elems_1) - uniq_elems_2 = get_unique_elems(all_elems_2) - if s_group_cols[0] != -1: - uniq_s_elems_1 = get_unique_elems(all_s_elems_1) - uniq_s_elems_2 = get_unique_elems(all_s_elems_2) - mut1={} - mut2={} - count1 = {} - count2 = {} - """ - if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc) - uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y))) - """ - if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number. 
- uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size) - uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size) - - if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number. - uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size) - uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size) - - for pitem1 in uniq_elems_1: - repeats1 = [] - repeats2 = [] - thresholds = [] - if s_group_cols[0] != -1: #Sub-group by feature is not None - for sitem1 in uniq_s_elems_1: - repeats1 = [] - repeats2 = [] - if type(sitem1) == type(''): - sitem1 = sitem1.strip() - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem1) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) - if belems[p_group_cols[0]] in pitem1: - if belems[s_group_cols[0]]==sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) - elif winspecies == 2: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) - else: - if type(sitem1) == list: - if s_group_cols[0] == 6: - belems[s_group_cols[0]] = int(belems[s_group_cols[0]]) - if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1) - elif winspecies == 2: - count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2) - else: - if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1) - elif winspecies == 2: - count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2) - else: #Sub-group by feature is None - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem1) == list: - #print >>sys.stderr, "item: " + str(item1) - if p_group_cols[0] == 6: - belems[p_group_cols[0]] = int(belems[p_group_cols[0]]) - if belems[p_group_cols[0]] in pitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - else: - if belems[p_group_cols[0]]==pitem1: - repeats1.append(int(belems[6])) - repeats2.append(int(belems[13])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds) - if region == 'align': - count1["%s" 
%(pitem1)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count1[str(pitem1)]=sum(repeats1) - elif winspecies == 2: - count1[str(pitem1)]=sum(repeats2) - - for pitem2 in uniq_elems_2: - repeats1 = [] - repeats2 = [] - thresholds = [] - if s_group_cols[0] != -1: #Sub-group by feature is not None - for sitem2 in uniq_s_elems_2: - repeats1 = [] - repeats2 = [] - if type(sitem2)==type(''): - sitem2 = sitem2.strip() - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem2) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) - if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - #count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: - if type(sitem2) == list: - if s_group_cols[0] == 6: - belems[s_group_cols[1]] = int(belems[s_group_cols[1]]) - if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: - if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2) - elif winspecies == 2: - count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1) - else: #Sub-group by feature is None - for bline in blk_lines: - belems = bline.split('\t') - if type(pitem2) == list: - if p_group_cols[0] == 6: - belems[p_group_cols[1]] = int(belems[p_group_cols[1]]) - if belems[p_group_cols[1]] in pitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - else: - if belems[p_group_cols[1]]==pitem2: - repeats2.append(int(belems[13])) - repeats1.append(int(belems[6])) - if belems[4] == 'mononucleotide': - thresholds.append(mono_threshold) - else: - thresholds.append(non_mono_threshold) - mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds) - if region == 'align': - count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2)) - else: - if winspecies == 1: - count2["%s" %(pitem2)]=sum(repeats2) - elif winspecies == 2: - count2["%s" %(pitem2)]=sum(repeats1) - for key in mut1.keys(): - if 
key in mut2.keys(): - mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1]) - count = count1[key] - del mut2[key] - else: - unit_found = False - if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too - if p_group_cols[0] == 7: - this,other = 0,1 - else: - this,other = 1,0 - groups1 = key.split('\t') - mutn = mut1[key][0] - mutd = mut1[key][1] - count = 0 - for key2 in mut2.keys(): - groups2 = key2.split('\t') - if groups1[other] == groups2[other]: - if groups1[this] in groups2[this]*2 or reverse_complement(groups1[this]) in groups2[this]*2: - #mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1]) - mutn += mut2[key2][0] - mutd += mut2[key2][1] - count += int(count2[key2]) - unit_found = True - del mut2[key2] - #break - if unit_found: - mut = mutn/mutd - else: - mut = mut1[key][0]/mut1[key][1] - count = count1[key] - mut = "%.2e" %(mut/num_generations) - if region == 'align': - print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count) - elif region == 'win': - fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count)) - fout.flush() - - #catch any remaining repeats, for instance if the orthologous position contained different repeat units - for remaining_key in mut2.keys(): - mut = mut2[remaining_key][0]/mut2[remaining_key][1] - mut = "%.2e" %(mut/num_generations) - count = count2[remaining_key] - if region == 'align': - print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) - elif region == 'win': - fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count)) - fout.flush() - #print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count) - -def counter(node, start, end, report_func): - if start <= node.start < end and start < node.end <= end: - report_func(node) - if node.right: - counter(node.right, start, end, report_func) - if node.left: - counter(node.left, start, end, report_func) - elif node.start < start and node.right: - counter(node.right, start, end, report_func) - elif node.start >= end and node.left and node.left.maxend > start: - counter(node.left, start, end, report_func) - - -def main(): - infile = sys.argv[1] - - for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - - if len( elems ) != 15: - stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." 
) - global winspecies, speciesind - if region == 'win': - if dbkey_i in elems[1]: - winspecies = 1 - speciesind = 1 - elif dbkey_i in elems[8]: - winspecies = 2 - speciesind = 8 - else: - stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.") - - fin = open(infile, 'r') - skipped = 0 - blk=0 - win=0 - linestr="" - - if region == 'win': - - msats = NiceReaderWrapper( fileinput.FileInput( infile ), - chrom_col = speciesind, - start_col = speciesind+1, - end_col = speciesind+2, - strand_col = -1, - fix_strand = True) - msatTree = quicksect.IntervalTree() - for item in msats: - if type( item ) is GenomicInterval: - msatTree.insert( item, msats.linenum, item.fields ) - - for iline in fint: - try: - iline = iline.rstrip('\r\n') - if not(iline) or iline == "": - continue - ielems = iline.strip("\r\n").split('\t') - ichr = ielems[chr_col_i] - istart = int(ielems[start_col_i]) - iend = int(ielems[end_col_i]) - isrc = "%s.%s" %(dbkey_i,ichr) - if isrc not in msatTree.chroms: - continue - result = [] - root = msatTree.chroms[isrc] #root node for the chrom - counter(root, istart, iend, lambda node: result.append( node )) - if not(result): - continue - tmpfile1 = tempfile.NamedTemporaryFile('wb+') - for node in result: - tmpfile1.write("%s\n" % "\t".join( node.other )) - - tmpfile1.seek(0) - output_writer(iline, tmpfile1.readlines()) - except: - skipped+=1 - if skipped: - print "Skipped %d intervals as invalid." %(skipped) - elif region == 'align': - if s_group_cols[0] != -1: - print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount" - else: - print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount" - prev_bnum = -1 - try: - for line in fin: - line = line.strip("\r\n") - if not(line) or line == "": - continue - elems = line.split('\t') - try: - assert int(elems[0]) - assert len(elems) == 15 - except: - continue - new_bnum = int(elems[0]) - if new_bnum != prev_bnum: - if prev_bnum != -1: - output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) - linestr = line + "\n" - else: - linestr += line - linestr += "\n" - prev_bnum = new_bnum - output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n')) - except Exception, ea: - print >>sys.stderr, ea - skipped += 1 - if skipped: - print "Skipped %d lines as invalid." 
%(skipped) -if __name__ == "__main__": - main() \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/microsats_mutability.xml --- a/tools/regVariation/microsats_mutability.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ -<tool id="microsats_mutability1" name="Estimate microsatellite mutability" version="1.1.0"> - <description>by specified attributes</description> - <command interpreter="python"> - microsats_mutability.py - $input1 - $out_file1 - ${pri_condition.primary_group} - #if $pri_condition.primary_group == "6": - ${pri_condition.binsize} ${pri_condition.subgroup} -1 - #else: - 0 ${pri_condition.sub_condition.subgroup} - #if $pri_condition.sub_condition.subgroup == "6": - ${pri_condition.sub_condition.s_binsize} - #else: - -1 - #end if - #end if - $gens - ${region.type} - #if $region.type == "win": - ${region.input2} $input2.dbkey $input2.metadata.chromCol,$input2.metadata.startCol,$input2.metadata.endCol,$input2.metadata.strandCol - #else: - "None" - #end if - </command> - <inputs> - <page> - <param name="input1" type="data" format="tabular" label="Select dataset containing Orthologous microsatellites"/> - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="align">Alignment block</option> - <option value="win">Intervals in your history</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - <param name="gens" size="10" type="integer" value="1" label="Number of generations between the two species in input file"/> - <conditional name="pri_condition"> - <param name="primary_group" type="select" label="Group by" multiple="false"> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="6"> - <param name="binsize" size="10" type="integer" value="1" label="Bin-size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - </param> - </when> - <when value="7"> - <conditional name="sub_condition"> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="4">Motif type (mono/di/tri etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="-1"></when> - <when value="4"></when> - <when value="6"> - <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. 
Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - </when> - </conditional> - </when> - <when value="4"> - <conditional name="sub_condition"> - <param name="subgroup" type="select" label="Sub-group by" multiple="false"> - <option value="-1">None</option> - <option value="7">Repeat Unit (AG, GCT etc.)</option> - <option value="6">Repeat Number </option> - </param> - <when value="-1"></when> - <when value="7"></when> - <when value="6"> - <param name="s_binsize" size="10" type="integer" value="1" label="Bin size" help="Bin-size denotes the number of repeat numbers to be considered as a group. Bin-size of 5 will group every 5 consecutive repeat numbers into a group."/> - </when> - </conditional> - </when> - </conditional> - </page> - </inputs> - <outputs> - <data format="tabular" name="out_file1" /> - </outputs> - <!-- - <tests> - <test> - <param name="input1" value="ortho_ms.tab"/> - <param name="type" value="align"/> - <param name="gens" value="1"/> - <param name="primary_group" value="4"/> - <param name="sub_condition|subgroup" value="7"/> - <output name="out_file1" file="ortho_ms_mut.tab"/> - </test> - </tests> - --> -<help> -.. class:: infomark - -**What it does** - -This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool. - -Mutability is computed according to the method described in the following paper: - -*Webster et al., Microsatellite evolution inferred from human-chimpanzee genomic sequence alignments, Proc Natl Acad Sci 2002 June 25; 99(13): 8748-8753* - ------ - -.. class:: warningmark - -**Note** - -The user-selected group and sub-group features, the computed mutability, and the count of repeats used to compute that mutability are added as columns to the output. -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl --- a/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5392 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use Term::ANSIColor; -use File::Basename; -use IO::Handle; -use Cwd; -use File::Path; -use File::Temp qw/ tempfile tempdir /; -use vars qw($distance @thresholds @tags $printer $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter); - -$ENV{'PATH'} .= ':' . dirname($0); -my $date = `date`; - -my ($mafile, $orthfile, $threshold_array, $species_set, $tree_definition, $separation) = @ARGV; -if (!$mafile or !$orthfile or !$threshold_array or !$separation or !$tree_definition or !$species_set) { die "missing arguments\n"; } - -#------------------------------------------------------------------------------- -# WHICH SPUTNIK USED? 
-my $sputnikpath = (); -$sputnikpath = "sputnik"; -#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath; -#exit if !-e $sputnikpath; -#$sputnikpath = "bx-sputnik" ; -#print "ARGV input = @ARGV\n"; -#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n"; -#------------------------------------------------------------------------------- -# RUNFILE -#------------------------------------------------------------------------------- -$distance = 1; #bp -$distance++; -#------------------------------------------------------------------------------- -# MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP) -$threshold_array=~ s/,/_/g; -my @thresharr = split("_",$threshold_array); -my @thresholds=@thresharr; -my $mono_threshold = $thresharr[0]; -my $di_threshold = $thresharr[1]; -my $tri_threshold = $thresharr[2]; -my $tetra_threshold = $thresharr[3]; -#my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold)); -my $tdir = tempdir( CLEANUP => 0 ); -chdir $tdir; -my $dir = getcwd; -#print "current dit=$dir\n"; -#------------------------------------------------------------------------------- -# CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY -my @chrfiles=(); - -#my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0]; -my $chromt=int(rand(10000)); -my $p_chr=$chromt; - - -my @exactspeciesset_unarranged = split(/,/,$species_set); -$tree_definition=~s/[\)\(, ]/\t/g; -my @treespecies=split(/\t+/,$tree_definition); -my @exactspecies=(); - -foreach my $spec (@treespecies){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } -} -#print "exactspecies=@exactspecies\n"; -my $focalspec = $exactspecies[0]; -my $arranged_species_set=join(".",@exactspecies); -my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); -#print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; -maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set); -#print "done maf to axt conversion\n"; -my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt"); -artificial_axdata_inverter ($chr_name, $reverse_chr_name); -#print "reverse_chr_name=$reverse_chr_name\n"; -#------------------------------------------------------------------------------- -# FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT -foreach my $direct ("reverse_direction","forward_direction"){ - $p_chr=$chromt; - #print "direction = $direct\n"; - $p_chr = $p_chr."r" if $direct eq "reverse_direction"; - $p_chr = $p_chr if $direct eq "forward_direction"; - my $config = $species_set; - $config=~s/,/./g; - my @orgs = split(/\./,$arranged_species_set); - #print "ORGS= @orgs\n"; - my @tag=@orgs; - - - my $tags = join(",", @tag); - my @tags=@tag; - chomp $p_chr; - $tags = join("_", split(/,/, $tags)); - my $pchr = "chr".$p_chr; - - my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array; - my @sp_tags = (); - - #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n"; - foreach my $sp (@tag){ - push(@sp_tags, ($sp.".".$ptag)); - } - - my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... 
scalar(@orgs)-1]); - my @presp_tags = (); - - foreach my $sp (@tag){ - push(@presp_tags, ($sp.".".$preptag)); - } - - my $resultdir = ""; - my $orthdir = ""; - my $filtereddir = ""; - my $pipedir = ""; - - my @title_queries = (); - push(@title_queries, "^[0-9]+"); - my $sep="\\s"; - for my $or (0 ... $#orgs){ - my $title = join($sep, ($orgs[$or], "[A-Za-z]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]")); - #$title =~ s/chr\\+\\s+\+/chr/g; - push(@title_queries, $title); - } - my $title_query = join($sep, @title_queries); - #print "title_queries=@title_queries\n"; - #print "query = >$title_query<\n"; - #print "orgs = @orgs\n"; - #------------------------------------------------------------------------------- - # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES - my $t1input = $pchr.".".$arranged_species_set.".net.axt"; - - my @t1outputs = (); - - foreach my $sp (@presp_tags){ - push(@t1outputs, $sp."_gap_op"); - } - - multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); - #print "t1outputs=@t1outputs\n"; - #print "done t1\n"; - #------------------------------------------------------------------------------- - #START T2.PL - - my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = (); - - for my $t (0 ... scalar(@tags)-1){ - multi_species_t2($t1outputs[$t], $tag[$t]); - } - #------------------------------------------------------------------------------- - #START T2.2.PL - - my @temp_tags = @tag; - - foreach my $sp (@presp_tags){ - my $t2input = $sp."_nogap_op_unrand"; - multi_species_t2_2($t2input, shift(@temp_tags)); - } - undef (@temp_tags); - - #------------------------------------------------------------------------------- - #START SPUTNIK - - my @jobIDs = (); - @temp_tags = @tag; - my @sput_filelist = (); - - foreach my $sp (@presp_tags){ - #print "sp = $sp\n"; - my $sputnikoutput = $pipedir.$sp."_sput_op0"; - my $sputnikinput = $pipedir.$sp."_nogap_op_unrand"; - push(@sput_filelist, $sputnikinput); - my $sputnikcommand = $sputnikpath." ".$sputnikinput." 
> ".$sputnikoutput; - #print "$sputnikcommand\n"; - my @sputnikcommand_system = $sputnikcommand; - system(@sputnikcommand_system); - } - - #------------------------------------------------------------------------------- - #START SPUTNIK OUTPUT CORRECTOR - - foreach my $sp (@presp_tags){ - my $corroutput = $pipedir.$sp."_sput_op1"; - my $corrinput = $pipedir.$sp."_sput_op0"; - sputnikoutput_corrector($corrinput,$corroutput); - - my $t4output = $pipedir.$sp."_sput_op2"; - multi_species_t4($corroutput,$t4output); - - my $t5output = $pipedir.$sp."_sput_op3"; - multi_species_t5($t4output,$t5output); - #print "done t5.pl for $sp\n"; - - my $t6output = $pipedir.$sp."_sput_op4"; - multi_species_t6($t5output,$t6output,scalar(@orgs)); - } - #------------------------------------------------------------------------------- - #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING - - foreach my $sp (@presp_tags){ - my $t9output = $pipedir.$sp."_gap_op_unrand_match"; - my $t9sequence = $pipedir.$sp."_gap_op_unrand2"; - my $t9micro = $pipedir.$sp."_sput_op4"; - t9($t9micro,$t9sequence,$t9output); - - my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match"; - my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2"; - t9($t9micro,$t9sequence2,$t9output2); - } - #print "done both t9.pl for all orgs\n"; - - #------------------------------------------------------------------------------- - # FIND COMPOUND MICROSATELLITES - - @jobIDs = (); - my $species_counter = 0; - - foreach my $sp (@presp_tags){ - my $simple_microsats=$pipedir.$sp."_sput_op4_simple"; - my $compound_microsats=$pipedir.$sp."_sput_op4_compound"; - my $input_micro = $pipedir.$sp."_sput_op4"; - my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match"; - multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array ); - $species_counter++; - } - - #------------------------------------------------------------------------------- - # READING AND FILTERING SIMPLE MICROSATELLITES - my $spcounter2=0; - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$spcounter2]; - $spcounter2++; - my $simple_microsats=$pipedir.$presp."_sput_op4_simple"; - my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered"; - my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue"; - multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); - } - - #------------------------------------------------------------------------------- - # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS - - $species_counter = 0; - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$species_counter]; - my $compound_microsats = $pipedir.$presp."_sput_op4_compound"; - my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted"; - my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure"; - my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; - multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags)); - $species_counter++; - } - #------------------------------------------------------------------------------- - # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. 
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
- my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure";
- my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound";
- my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match";
- multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8");
- $species_counter++;
- }
- #-------------------------------------------------------------------------------
- # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $presp = $presp_tags[$species_counter];
-
- my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted";
- my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted";
- my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue";
- multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags));
-
- my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound";
- my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered";
- my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue";
- multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags));
- $species_counter++;
- }
- #print "done filtering both simple and compound microsatellites \n";
-
- #-------------------------------------------------------------------------------
-
- my @combinedarray = ();
- my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide");
- my @combinedarray_tags = ("mono", "di", "tri", "tetra");
- $species_counter = 0;
-
- foreach my $sp (@sp_tags){
- my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple";
- push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra";
- $species_counter++;
- }
-
- #-------------------------------------------------------------------------------
- # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENSION
- my $sp_counter = 0;
- foreach my $sp (@sp_tags){
- my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered";
- my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered";
- my $simple = $pipedir.$sp."_sput_op4_simple_filtered";
- my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple";
- `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`;
- for my $i (0 ...
3){ - `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`; - } - $sp_counter++; - } - #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n"; - - #------------------------------------------------------------------------------- - # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES - # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES - @temp_tags = @sp_tags; - my $increment = 1000000; - my @splist = (); - my $targetdir = $pipedir; - $species_counter=0; - - foreach my $sp (@sp_tags){ - my $presp = $presp_tags[$species_counter]; - $species_counter++; - my $localtag = shift @temp_tags; - my $locallist = $targetdir.$localtag."_".$p_chr."_list"; - push(@splist, $locallist); - my $input = $pipedir.$presp."_nogap_op_unrand2_match"; - chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr); - } - - - my @unionarray = (); - #print "splist=@splist\n"; - #------------------------------------------------------------------------------- - # FIND INTERRUPTED MICROSATELLITES - - $species_counter = 0; - - for my $i (0 .. $#combinedarray){ - - @jobIDs = (); - open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!"; - - while (my $sp1 = <JLIST1>){ - #print "$splist[$i]: sp1=$sp1\n"; - chomp $sp1; - - for my $j (0 ... $#combinedarray_tags){ - my $interr = $sp1."_interr_".$combinedarray_tags[$j]; - my $simple = $sp1."_simple_".$combinedarray_tags[$j]; - push @{$unionarray[$i]}, $interr, $simple; - multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); - } - } - $species_counter++; - } - close JLIST1; - #------------------------------------------------------------------------------- - # REUNION AND ZIPPING BEFORE T10.PL - - my @allarray = (); - - for my $i (0 ... $#sp_tags){ - my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; - unlink $localfile if -e $localfile; - push(@allarray, $localfile); - - my $unfiltered_localfile= $localfile."_unfiltered"; - my $residue_localfile= $localfile."_residue"; - - unlink $unfiltered_localfile; - #unlink $unfiltered_localfile; - for my $j (0 ... $#{$unionarray[$i]}){ - #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n"; - `cat $unionarray[$i][$j] >> $unfiltered_localfile`; - unlink $unionarray[$i][$j]; - } - - multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) ); - my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered"; - my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue"; - my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue"; - - `cat $analyzed_compound >> $localfile`; - } - #------------------------------------------------------------------------------- - # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS - - my $toescape=0; - - - for my $i (0 ... 
$#sp_tags){ - my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; - $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./; - my $direction = $1; - #print "localfile = $localfile , direction = $direction\n"; -# `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1; - $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/; - last if $direction =~ /chr[0-9a-zA-Z]+r/; - my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match"; - my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match"; - my $reverselocal = $localfile; - $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g; - merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); - #------------------------------------------------------------------------------- - my $forward_separate = $localfile."_separate"; - my $reverse_separate = $reverselocal."_separate"; - my $diff = $forward_separate."_diff"; - my $miss = $forward_separate."_miss"; - my $common = $forward_separate."_common"; - forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); - #------------------------------------------------------------------------------- - my $symmetrical_file = $localfile."_symmetrical"; - my $merged_file = $localfile."_merged"; - #print "cating: $merged_file $common into -> $symmetrical_file \n"; - `cat $merged_file $common > $symmetrical_file`; - #------------------------------------------------------------------------------- - my $t10output = $symmetrical_file."_fin_hit_all_2"; - new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs)); - #------------------------------------------------------------------------------- - } - next if $toescape == 1; - #------------------------------------------------------------------------------------------------ - # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES - - - my @micros_array = (); - my $sampletag = (); - for my $i (0 ... $#sp_tags){ - my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2"; - push(@micros_array, $finhitFile); - $sampletag = $sp_tags[$i]; - } - #$sampletag =~ s/^([A-Z]+\.)/ORTH_/; - #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp"; - my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfile, join(":", @orgs), $separation); -} -$date = `date`; -#print "date = $date\n"; -#remove_tree($tdir); -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ -#------------------------------------------------------------------------------------------------ - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -sub maftoAxt_multispecies { - #print "in maftoAxt_multispecies : got @_\n"; - my $fname=$_[0]; - open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; - my $treedefinition = $_[1]; - open(OUT,">$_[2]") or die "Cannot open $_[2]: $! 
\n"; - my $counter = 0; - my $exactspeciesset = $_[3]; - my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); - - $treedefinition=~s/[\)\(, ]/\t/g; - my @species=split(/\t+/,$treedefinition); - my @exactspecies=(); - - foreach my $spec (@species){ - foreach my $espec (@exactspeciesset_unarranged){ - push @exactspecies, $spec if $spec eq $espec; - } - } - #print "exactspecies=@exactspecies\n"; - - ########### - my $select = 2; - #select = 1 if all species need sequences to be present for each block otherwise, it is 0 - #select = 2 only the allowed set make up the alignment. use the removeset - # information to detect alignmenets that have other important genomes aligned. - ########### - my @allowedset = (); - @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; - @allowedset = join("_",0,@species) if $select == 1; - #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; - @allowedset = join("_",0,@exactspecies) if $select == 2; - #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; - - my $start = 0; - my @sequences = (); - my @titles = (); - my $species_counter = "0"; - my $countermatch = 0; - my $outsideSpecies=0; - - while(my $line = <IN>){ - next if $line =~ /^#/; - next if $line =~ /^i/; - chomp $line; - #print "$line"; - my @fields = split(/\s+/,$line); - chomp $line; - if ($line =~ /^a /){ - $start = 1; - } - - if ($line =~ /^s /){ - # print "fields1 = $fields[1] , start = $start\n"; - - foreach my $sp (@species){ - if ($fields[1] =~ /$sp/){ - $species_counter = $species_counter."_".$sp; - push(@sequences, $fields[6]); - my @sp_info = split(/\./,$fields[1]); - my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); - push(@titles, $title); - - } - } - } - - if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ - my $arranged = reorderSpecies($species_counter, @species); - my $stopper = 1; - my $arrno = 0; - foreach my $set (@allowedset){ - if ($arranged eq $set){ - # print "$arranged == $set\n"; - $stopper = 0; last; - } - $arrno++; - } - - if ($stopper == 0) { - # print " accepted\n"; - @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; - @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; - my $filteredseq = filter_gaps(@sequences); - - if ($filteredseq ne "SHORT"){ - $counter++; - print OUT join (" ",$counter, @titles), "\n"; - print OUT $filteredseq, "\n"; - print OUT "\n"; - $countermatch++; - } - } - else{#print "\n"; - } - - @sequences = (); @titles = (); $start = 0;$species_counter = "0"; - next; - - } - } -# print "countermatch = $countermatch\n"; -} - -sub reorderSpecies{ - my @inarr=@_; - my $currSpecies = shift (@inarr); - my $ordered_species = 0; - my @species=@inarr; - foreach my $order (@species){ - $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; - } - return $ordered_species; - -} - -sub filter_gaps{ - my @sequences = @_; -# print "sequences sent are @sequences\n"; - my $seq_length = length($sequences[0]); - my $seq_no = scalar(@sequences); - my $allgaps = (); - for (1 ... 
$seq_no){ - $allgaps = $allgaps."-"; - } - - my @seq_array = (); - my $seq_counter = 0; - foreach my $seq (@sequences){ -# my @sequence = split(/\s*/,$seq); - $seq_array[$seq_counter] = [split(/\s*/,$seq)]; -# push @seq_array, [@sequence]; - $seq_counter++; - } - my $g = 0; - while ( $g < $seq_length){ - last if (!exists $seq_array[0][$g]); - my $bases = (); - for my $u (0 ... $#seq_array){ - $bases = $bases.$seq_array[$u][$g]; - } -# print $bases, "\n"; - if ($bases eq $allgaps){ -# print "bases are $bases, position is $g \n"; - for my $seq (@seq_array){ - splice(@$seq , $g, 1); - } - } - else { - $g++; - } - } - - my @outs = (); - - foreach my $seq (@seq_array){ - push(@outs, join("",@$seq)); - } - return "SHORT" if length($outs[0]) <=100; - return (join("\n", @outs)); -} - - -sub allowedSetOfSpecies{ - my @allowed_species = split(/_/,$_[0]); - unshift @allowed_species, 0; -# print "allowed set = @allowed_species \n"; - my @output = (); - for (0 ... scalar(@allowed_species) - 4){ - push(@output, join("_",@allowed_species)); - pop @allowed_species; - } - return join(";",reverse(@output)); - -} - - -sub orderInfo{ - my @info = split(/;/,$_[0]); -# print "info = @info"; - my @old = split(/_/,$_[1]); - my @new = split(/_/,$_[2]); - shift @old; shift @new; - my @outinfo = (); - foreach my $spe (@new){ - for my $no (0 ... $#old){ - if ($spe eq $old[$no]){ - push(@outinfo, $info[$no]); - } - } - } -# print "outinfo = @outinfo \n"; - return join(";", @outinfo); -} - -#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx - -#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx -sub artificial_axdata_inverter{ - open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!"; - open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!"; - my $linecounter=0; - while (my $line = <IN>){ - $linecounter++; - #print "$linecounter\n"; - chomp $line; - my $final_line = $line; - my $trycounter = 0; - if ($line =~ /^[a-zA-Z\-]/){ - # while ($final_line eq $line){ - my @fields = split(/\s*/,$line); - - $final_line = join("",reverse(@fields)); - # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - # $trycounter++; - # print "trying again....$trycounter : $final_line\n" if $final_line eq $line; - # } - } - - # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - if ($line =~ /^[0-9]/){ - $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g; - $final_line = $line; - } - print OUT $final_line,"\n"; - #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; - } - close OUT; -} -#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx - - -#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx - -sub multi_species_t1 { - - my $input1 = $_[0]; -# print "@_\n"; #<STDIN>; - my @tags = split(/_/, $_[1]); - my @outputs = split(/,/, $_[2]); - my $title_query = $_[3]; - my @handles = (); - - open(FILEB,"<$input1")or die "Cannot open file: $input1 $!"; - my $i = 0; - foreach my $path (@outputs){ - $handles[$i] = IO::Handle->new(); - open ($handles[$i], ">$path") or die "Can't open $path : $!"; - $i++; - } - - my $curdef; - my $start = 0; - - while (my $line = <FILEB> ) { - if ($line =~ /^\d/){ - $line =~ s/ +/\t/g; - my @fields = 
split(/\s+/, $line); - if (($line =~ /$title_query/)){ - my $title = $line; - my $counter = 0; - foreach my $tag (@tags){ - $line = <FILEB>; - print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line; - $counter++; - } - } - else{ - foreach my $tag (@tags){ - my $tine = <FILEB>; - } - } - - } - } - - foreach my $hand (@handles){ - $hand->close(); - } - - close FILEB; -} - -#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx - -#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx - -sub multi_species_t2{ - - my $input = $_[0]; - my $species = $_[1]; - my $output1 = $input."_unr"; - - #------------------------------------------------------------------------------------------ - open (FILEF1, "<$input") or die "Cannot open file $input :$!"; - open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!"; - - my $line1 = <FILEF1>; - - while($line1){ - { - # chomp($line); - if ($line1 =~ (m/^\>$species/)){ - chomp($line1); - print FILEF2 $line1; - $line1 = <FILEF1>; - chomp($line1); - print FILEF2 "\t", $line1,"\n"; - } - } - $line1 = <FILEF1>; - } - - close FILEF1; - close FILEF2; - #------------------------------------------------------------------------------------------ - - my $output2 = $output1."and"; - my $output3 = $output1."and2"; - open(IN,"<$output1"); - open (FILEF3, ">$output2"); - open (FILEF4, ">$output3"); - - - while (<IN>){ - my $line = $_; - chomp($line); - my @fields=split (/\t/, $line); - # print $line,"\n"; - if($fields[5] ne "chrUn_random"){ - print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n"; - print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n"; - } - } - - - close IN; - close FILEF3; - close FILEF4; - unlink $output1; - - #------------------------------------------------------------------------------------------ - # OLD T3.PL RUDIMENT - - my $t3output = $output2; - $t3output =~ s/gap_op_unrand/nogap_op_unrand/g; - - open(IN,"<$output2"); - open(OUTA,">$t3output"); - - - while (<IN>){ - s/-//g unless /^>/; - print OUTA; - } - - close IN; - close OUTA; - #------------------------------------------------------------------------------------------ -} -#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx - - -#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx -sub multi_species_t2_2{ - #print "IN multi_species_t2_2 : @_\n"; - my $input = $_[0]; - my $species = $_[1]; - my $output1 = $input."2"; - - - open (FILEF1, "<$input"); - open (FILEF2, ">$output1"); - - my $line1 = <FILEF1>; - - while($line1){ - { - # chomp($line); - if ($line1 =~ (m/^\>$species/)){ - chomp($line1); - print FILEF2 $line1; - $line1 = <FILEF1>; - chomp($line1); - print FILEF2 "\t", $line1,"\n"; - } - } - $line1 = <FILEF1>; - } - - close FILEF1; - close FILEF2; -} - -#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx - - -#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx -sub sputnikoutput_corrector{ - my $input = $_[0]; - my $output = $_[1]; - open(IN,"<$input") or die "Cannot open file $input :$!"; - open(OUT,">$output") or die "Cannot open file $output :$!"; - my $tine; - while (my $line=<IN>){ - if($line =~/length /){ - $tine = $line; - $tine =~ s/\s+/\t/g; - my @fields = 
split(/\t/,$tine); - if ($fields[6] > 60){ - print OUT $line; - $line = <IN>; - - while (($line !~ /nucleotide/) && ($line !~ /^>/)){ - chomp $line; - print OUT $line; - $line = <IN>; - } - print OUT "\n"; - print OUT $line; - } - else{ - print OUT $line; - } - } - else{ - print OUT $line; - } - } - close IN; - close OUT; -} -#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx - - -#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx -sub multi_species_t4{ -# print "multi_species_t4 : @_\n"; - my $input = $_[0]; - my $output = $_[1]; - open (FILEA, "<$input"); - open (FILEB, ">$output"); - - my $line = <FILEA>; - - while ($line) { - # chomp $line; - if ($line =~ />/) { - chomp $line; - print FILEB $line, "\n"; - } - - - if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ - chomp $line; - print FILEB $line, " " ; - $line = <FILEA>; - chomp $line; - print FILEB $line,"\n"; - } - - $line = <FILEA>; - } - - - close FILEA; - close FILEB; - -} - -#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx - - -#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx -sub multi_species_t5{ - - my $input = $_[0]; - my $output = $_[1]; - - open(FILEB,"<$input"); - open(FILEC,">$output"); - - my $curdef; - - while (my $line = <FILEB> ) { - - if ($line =~ /^>/){ - chomp $line; - $curdef = $line; - next; - } - - if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ - print FILEC $curdef," ",$line; - } - - } - - - close FILEB; - close FILEC; - -} -#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx - - -#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx -sub multi_species_t6{ - my $input = $_[0]; - my $output = $_[1]; - my $focalstrand=$_[3]; -# print "inpput = @_\n"; - open (FILE, "<$input"); - open (FILE_MICRO, ">$output"); - my $linecounter=0; - while (my $line = <FILE>){ - $linecounter++; - chomp $line; - #print "line = $line\n"; - #MONO# - $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/; - my $strand=$1; - my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g); - #print "line = $line\n"; - my $specfieldsend = 2 + ($no_of_species*4) - 1; - my @fields = split(/\s+/, $line); - my @speciesdata = @fields[0 ... $specfieldsend]; - $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/; - my ($tide, $start, $end) = ($1, $2, $3); - #print "no_of_species=$no_of_species.. 
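# NOTE (editorial sketch, not in the original file): the mono()/di()/tri()/
# tetra()/penta() helpers defined below share one recipe - cut the repeat
# tract into fixed-width chunks with unpack and keep the first chunk as the
# motif. A generic equivalent (motif_of is a hypothetical name):
#
#   sub motif_of {
#       my ($tract, $unit_len) = @_;           # e.g. ("AGAGAGAG", 2)
#       return substr($tract, 0, $unit_len);   # first repeat unit -> "AG"
#   }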
speciesdata = @speciesdata and ($tide, $start, $end)\n"; - if($line =~ /mononucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n"; - } - #DI# - elsif($line =~ /dinucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n"; - } - #TRI# - elsif($line =~ /trinucleotide/ ){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n"; - } - #TETRA# - elsif($line =~ /tetranucleotide/){ - print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n"; - } - #PENTA# - elsif($line =~ /pentanucleotide/){ - #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n"; - } - else{ - # print "not: @fields\n"; - } - } -# print "linecounter=$linecounter\n"; - close FILE; - close FILE_MICRO; -} - -sub mono { - my $st = $_[0]; - my $tp = unpack "A1"x(length($st)/1),$st; - my $var1 = substr($tp, 0, 1); - return join ("\t", $var1); -} -sub di { - my $st = $_[0]; - my $tp = unpack "A2"x(length($st)/2),$st; - my $var1 = substr($tp, 0, 2); - return join ("\t", $var1); -} -sub tri { - my $st = $_[0]; - my $tp = unpack "A3"x(length($st)/3),$st; - my $var1 = substr($tp, 0, 3); - return join ("\t", $var1); -} -sub tetra { - my $st = $_[0]; - my $tp = unpack "A4"x(length($st)/4),$st; - my $var1 = substr($tp, 0, 4); - return join ("\t", $var1); -} -sub penta { - my $st = $_[0]; - my $tp = unpack "A5"x(length($st)/5),$st; - my $var1 = substr($tp, 0, 5); - return join ("\t", $var1); -} - -#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx - - -#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx -sub t9{ - my $input1 = $_[0]; - my $input2 = $_[1]; - my $output = $_[2]; - - - open(IN1,"<$input1") if -e $input1; - open(IN2,"<$input2") or die "cannot open file $_[1] : $!"; - open(OUT,">$output") or die "cannot open file $_[2] : $!"; - - - my %seen = (); - my $prevkey = 0; - - if (-e $input1){ - while (my $line = <IN1>){ - chomp($line); - my @fields = split(/\t/,$line); - my $key1 = join ("_",@fields[0,1,3,4,5]); - # print "key in t9 = $key1\n"; - $seen{$key1}++ if ($prevkey ne $key1) ; - $prevkey = $key1; - } -# print "done first hash\n"; - close IN1; - } - - while (my $line = <IN2>){ - # print $line, "**\n"; - if (-e $input1){ - chomp($line); - my @fields = split(/\t/,$line); - my $key2 = join ("_",@fields[0,1,3,4,5]); - if (exists $seen{$key2}){ - print OUT "$line\n" ; - delete $seen{$key2}; - } - } - else { - print OUT "$line\n" ; - } - } - - close IN2; - close OUT; -} -#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx - - -sub multiSpecies_compound_microsat_hunter3{ - - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" - my $output1 = $_[2]; ###### plain microsatellite file - my $output2 = $_[3]; ###### compound microsatellite file - my $org = $_[4]; ###### 1 or 2 - $no_of_species = $_[5]; - #print "IN multiSpecies_compound_microsat_hunter3: @_\n"; - #my @tags = 
split(/\t/,$info); - sub compoundify; - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - open(OUT,">$output1") or die "Cannot open file $output1 $!"; - open(OUT2,">$output2") or die "Cannot open file $output2 $!"; - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - my @thresholds = ("0"); - push(@thresholds, split(/_/,$_[6])); - sub thresholdCheck; - my %micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; - next if $line =~ /\t\t/; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$micros{$key}},$line); - } - else{ - } - } - close IN; - my @deletedlines = (); - - my $linecount = 0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - - my @sields = split(/\t/,$sine); - - my $key = (); - - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - else{ - } - - if (exists $micros{$key}){ - $linecount++; - my @microstring = @{$micros{$key}}; - my @tempmicrostring = @{$micros{$key}}; - - foreach my $line (@tempmicrostring){ - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - while( my $line =shift(@microstring)){ - # print "-----------\nline = $line "; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $startmicro = $line; - my $endmicro = $line; - - # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n"; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - push (@{$compoundhash{$line}},$line); - my $startrank = 1; - my $endrank = 1; - - while( ($startflag eq "down") || ($endflag eq "down") ){ - if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) { -# print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n"; - last; - } - - $firstflag = "up"; - if ($startflag eq "down"){ - for my $i ($prestart ... 
$start){ - - if(exists $microend{$i}){ - chomp $microend{$i}[0]; - if(exists $compoundhash{$microend{$i}[0]}) {next;} - # print "sending from microend $startmicro, $microend{$i}[0] |||\n"; - if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){ - push(@compoundlines, $microend{$i}[0]); - # print "accepted\n"; - my @tields = split(/\t/,$microend{$i}[0]); - $startmicro = $microend{$i}[0]; - chomp $startmicro; - $start = $tields[$startcord]; - $flag = 'down'; - $startrank++; - # print "startcompund = $microend{$i}[0]\n"; - delete $microend{$i}; - delete $microstart{$start}; - $startflag = 'down'; - $prestart = $start - $distance; - last; - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - } - - $endrank = $startrank; - - if ($endflag eq "down"){ - for my $i ($end ... $postend){ - - if(exists $microstart{$i} ){ - chomp $microstart{$i}[0]; - if(exists $compoundhash{$microstart{$i}[0]}) {next;} - # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n"; - - if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){ - push(@compoundlines, $microstart{$i}[0]); - # print "accepted\n"; - my @tields = split(/\t/,$microstart{$i}[0]); - $end = $tields[$endcord]-0; - $endmicro = $microstart{$i}[0]; - $endrank++; - chomp $endmicro; - $flag = 'down'; - # print "endcompund = $microstart{$i}[0]\n"; - delete $microstart{$i}; - delete $microend{$end}; - shift @microstring; - $postend = $end + $distance; - $endflag = 'down'; - last; - } - else{ - $flag = 'up'; - $endflag = 'up'; - } - } - else{ - $flag = 'up'; - $endflag = 'up'; - } - } - } - # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; - } #end while( $flag eq "down") - # print "compoundlines = @compoundlines \n"; - if (scalar (@compoundlines) == 1){ - print OUT $line,"\n"; - } - if (scalar (@compoundlines) > 1){ - my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]); - # print $compoundline,"\n"; - print OUT2 $compoundline,"\n"; - } - } #end foreach my $line (@microstring){ - } #if (exists $micros{$key}){ - - - } - - close OUT; - close OUT2; -} - - -#------------------------------------------------------------------------------------------------ -sub compoundify{ - my ($compoundlines, $sequence) = @_; -# print "\nfound to compound : @$compoundlines and$sequence \n"; - my $noOfComps = @$compoundlines; -# print "Number of elements in hash is $noOfComps\n"; - my @starts; - my @ends; - foreach my $line (@$compoundlines){ -# print "compoundify.. line = $line \n"; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - # print "start = $start, end = $end \n"; - push(@starts, $start); - push(@ends,$end); - } - my @temp = @$compoundlines; - my $startline=$temp[0]; - my @mields = split(/\t/,$startline); - my $startcoord = $mields[$startcord]; - my $startgapsign=$mields[$endcord]; - my @startsorted = sort { $a <=> $b } @starts; - my @endsorted = sort { $a <=> $b } @ends; - my @intervals; - for my $end (0 ... 
(scalar(@endsorted)-2)){ - my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1)); - push(@intervals,$interval); - # print "interval = $interval =\n"; - # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n"; - } - push(@intervals,""); - my $compoundmicrosat=(); - my $multiunit=""; - foreach my $line (@$compoundlines){ - my @fields = split(/\t/,$line); - my $component="[".$fields[$microsatcord]."]".shift(@intervals); - $compoundmicrosat=$compoundmicrosat.$component; - $multiunit=$multiunit."[".$fields[$motifcord]."]"; -# print "multiunit = $multiunit\n"; - } - my $compoundcopy = $compoundmicrosat; - $compoundcopy =~ s/\[|\]//g; - my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1; - - - my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit)); - return $compoundline; -} - -#------------------------------------------------------------------------------------------------ - -sub identityMatch_thresholdCheck{ - my $line1 = $_[0]; - my $line2 = $_[1]; - my $rank = $_[2]; - my @lields1 = split(/\t/,$line1); - my @lields2 = split(/\t/,$line2); -# print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n"; - - if (length($lields1[$motifcord]) == length($lields2[$motifcord])){ - my $probe = $lields1[$motifcord].$lields1[$motifcord]; - #print "$probe :: $lields2[$motifcord]\n"; - return "proceed" if $probe =~ /$lields2[$motifcord]/; - #print "line recieved\n"; - if ($rank ==1){ - return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; - } - else { - return "proceed" if thresholdCheck($line2) eq "proceed"; - return "stop"; - } - } - else{ - if ($rank ==1){ - return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; - } - else { - return "proceed" if thresholdCheck($line2) eq "proceed"; - return "stop"; - } - } - return "stop"; -} -#------------------------------------------------------------------------------------------------ - -sub thresholdCheck{ - my @checkthresholds=(0,@thresholds); - #print "IN thresholdCheck: @_\n"; - my $line = $_[0]; - my @lields = split(/\t/,$line); - return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])]; - return "stop"; -} -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx - -sub multiSpecies_filtering_interrupted_microsats{ -# print "IN multiSpecies_filtering_interrupted_microsats: @_\n"; - my $unfiltered = $_[0]; - my $filtered = $_[1]; - my $residue = $_[2]; - my $no_of_species = $_[5]; - open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; - open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; - open(RES,">$residue") or die "Cannot open file $residue: $!"; - - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - - 
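# NOTE (editorial sketch): two conventions used throughout these subroutines,
# illustrated with hypothetical values rather than pipeline data.
#
# 1. Column offsets: every record is tab-separated and each of the N aligned
#    species contributes a four-column block, so the microsatellite columns
#    sit at fixed offsets past column 2 + 4*N (0-based), e.g. for N = 3:
#
#       my $no_of_species = 3;
#       my $microsatcord  = 2 + (4 * $no_of_species) + 5 - 1;  # column 18: repeat sequence
#       my $motifcord     = 2 + (4 * $no_of_species) + 6 - 1;  # column 19: motif
#
# 2. Thresholds: strings like "2_4_6_8" are split on "_" into an array that
#    is indexed by motif length, giving the minimum tract length per motif
#    size (after a leading 0 placeholder), in the spirit of thresholdCheck()
#    above:
#
#       my @thresholds = (0, split /_/, "2_4_6_8");            # index 1 (mono) => 2bp ... index 4 (tetra) => 8bp
#       my $ok = length($microsat) >= $thresholds[ length($motif) ];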
my @sub_thresholds = (0); - - push(@sub_thresholds, split(/_/,$_[3])); - my @thresholds = (0); - - push(@thresholds, split(/_/,$_[4])); - - while (my $line = <UNF>) { - next if $line !~ /[a-z]/; - #print $line; - chomp $line; - my @fields = split(/\t/,$line); - my $motif = $fields[$motifcord]; - my $realmotif = $motif; - #print "motif = $motif\n"; - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - my @motifs = split(/\]/,$motif); - $realmotif = $motifs[0]; - } -# print "realmotif = $realmotif"; - my $motif_size = length($realmotif); - - my $microsat = $fields[$microsatcord]; -# print "microsat = $microsat\n"; - $microsat =~ s/^\[|\]$//sg; - my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); - - $microsat = join("",@microsats); - if (length($microsat) < $thresholds[$motif_size]) { - # print length($microsat)," < ",$thresholds[$motif_size],"\n"; - print RES $line,"\n"; next; - } - my @lengths = (); - foreach my $mic (@microsats){ - push(@lengths, length($mic)); - } - if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) { - # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; - print RES $line,"\n"; next;} - else {print FIL $line,"\n"; next; - } - } - close FIL; - close RES; - -} - -sub largest_microsat{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} - -#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx -sub multiSpecies_compound_microsat_analyzer{ - ####### PARAMETER ######## - ########################## - - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### the pure compound microsatellites - my $org = $_[4]; - my $no_of_species = $_[5]; -# print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n"; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(OUT,">$output1") or die "Cannot open file $output1 $!"; - open(OUT2,">$output2") or die "Cannot open file $output2 $!"; - - -# print "opened files \n"; - my %micros = (); - my $keycounter=0; - my $linecounter=0; - while (my $line = <IN>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - push (@{$micros{$key}},$line); - $keycounter++; - } - else{ - # print "no key\n"; - } - } - close IN; - my @deletedlines = (); -# print "done hash . 
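# NOTE (editorial sketch): t9() earlier and this analyzer both use the same
# hash-join pattern - index one file by a composite coordinate key, then
# stream the second file and act only on records whose key was indexed.
# Schematically (field positions here are illustrative):
#
#   my %seen;
#   while (my $line = <$first>) {
#       my @f = split /\t/, $line;
#       $seen{ join "_", @f[0, 1, 3, 4, 5] }++;
#   }
#   while (my $line = <$second>) {
#       my @f = split /\t/, $line;
#       print $line if exists $seen{ join "_", @f[0, 1, 3, 4, 5] };
#   }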
linecounter=$linecounter, keycounter=$keycounter\n"; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - my $keyfound=0; - my $keyexists=0; - my $inter=0; - my $pure=0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - $keyfound++; - } - else{ - - } - if (exists $micros{$key}){ - $keyexists++; - my @microstring = @{$micros{$key}}; - - my @filteredmicrostring; - - foreach my $line (@microstring){ - chomp $line; - my $copy_line = $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - # FOR COMPOUND MICROSATELLITES - if ($fields[$typecord] eq "compound"){ - $line = compound_microsat_analyser($line); - if ($line eq "NULL") { - print OUT2 "$copy_line\n"; - $pure++; - next; - } - else{ - print OUT "$line\n"; - $inter++; - next; - } - } - } - - } #if (exists $micros{$key}){ - } - close OUT; - close OUT2; -# print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n"; -} - -sub compound_microsat_analyser{ - my $line = $_[0]; - my @fields = split(/\t/,$line); - my $motifline = $fields[$motifcord]; - my $microsat = $fields[$microsatcord]; - $motifline =~ s/^\[|\]$//g; - $microsat =~ s/^\[|\]$//g; - $microsat =~ s/-//g; - my @interruptions = (); - my @motields = split(/\]\[/,$motifline); - my @microields = split(/\][a-zA-Z|-]*\[/,$microsat); - my @inields = split(/[.*]/,$microsat); - shift @inields; - my @motifcount = scalar(@motields); - my $prevmotif = $motields[0]; - my $prevmicro = $microields[0]; - my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0])); - my $localflag = 'down'; - my @infoarray = (); - - for my $l (1 ... (scalar(@motields)-1)){ - my $probe = $prevmotif.$prevmotif; - if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;} - - if ($probe =~ /$motields[$l]/i){ - my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l])); - my $curr_startphase = substr($microields[$l],0,length($motields[$l])); - if ($curr_startphase =~ /$prevphase/i) { - $infoarray[$l-1] = "insertion"; - } - else { - $infoarray[$l-1] = "indel/substitution"; - } - - $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase; - next; - } - else {$localflag = "up"; last;} - } - if ($localflag eq 'up') {return "NULL";} - - if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";} - if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";} - if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";} - if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";} - if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";} - - @microields = split(/[\[|\]]/,$microsat); - my @microsats = (); - my @positions = (); - my $lengthtracker = 0; - - for my $i (0 ... 
(scalar(@microields ) - 1)){ - if ($i%2 == 0){ - push(@microsats,$microields[$i]); - $lengthtracker = $lengthtracker + length($microields[$i]); - - } - else{ - push(@interruptions,$microields[$i]); - push(@positions, $lengthtracker+1); - $lengthtracker = $lengthtracker + length($microields[$i]); - } - - } - my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions))); - return($returnline); -} - -#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx - -sub multiSpecies_compoundClarifyer{ -# print "IN multiSpecies_compoundClarifyer: @_\n"; - my $input1 = $_[0]; ###### the *_sput_compound - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### compound file - my $org = $_[4]; - my $no_of_species = $_[5]; - @thresholds = "0"; - push(@thresholds, split(/_/,$_[6])); - - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $interrcord = $motifcord + 2; - $interrtypecord = $motifcord + 1; - - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(INT,">$output1") or die "Cannot open file $output2 $!"; - open(COMP,">$output2") or die "Cannot open file $output2 $!"; - #open(CH,">changed") or die "Cannot open file changed $!"; - -# print "opened files \n"; - my $linecounter = 0; - my $microcounter = 0; - - my %micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([a-zA-Z0-9]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$micros{$key}},$line); - $microcounter++; - } - else {print $line;} - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close IN; - my @deletedlines = (); -# print "done hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide); - $printer = 0; - - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+(chr[0-9a-zA-Z]+)\s+([0-9]+)\s+([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, $11, $12); - } - else{ -# print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n"; - } - - if (exists $micros{$key}){ - my @microstring = @{$micros{$key}}; - delete $micros{$key}; - - foreach my $line (@microstring){ -# print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1; -# print "microsat = $line" if $printer == 1; - $linecounter++; - my $copy_line = $line; - my @mields = split(/\t/,$line); - my @fields = @mields; - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $microsat = $fields[$microsatcord]; - my $motifline = $fields[$motifcord]; - my $microsatcopy = $microsat; - my $positioner = $microsat; - $positioner =~ s/[a-zA-Z|-]/_/g; - $microsatcopy =~ s/^\[|\]$//gs; - chomp $microsatcopy; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start); -# print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1; - shift @inields; - #print "inields =@inields<\n"; - $motifline =~ s/^\[|\]$//gs; - chomp $motifline; - #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n"; - my @motields = split(/\]\[/,$motifline); - my $seq = $microsatcopy; - $seq =~ s/\[|\]//g; - my $seqlen = length($seq); - $seq = " ".$seq; - - my $longestmotif_no = longest_array_element(@motields); - my $shortestmotif_no = shortest_array_element(@motields); - #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n"; - - my $search = $motields[$longestmotif_no].$motields[$longestmotif_no]; - if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){ - print COMP $line; - next; - } - - my @shortestmotif_nos = (); - for my $m (0 ... $#motields){ - push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) ); - } - ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ - my $newleft =(); - my $leftstopper = 0; my $rightstopper = 0; - foreach my $shortmotif_no (@shortestmotif_nos){ - next if $shortmotif_no == 0; - my $last_left = $shortmotif_no; #$#motields; - my $last_hitter = 0; - for (my $i =($shortmotif_no-1); $i>=0; $i--){ - my $search = $motields[$shortmotif_no]; - if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} - if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ - $last_hitter = 1; - $last_left = $i+1; last; - } - my $probe = $motields[$i]; - if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} - - if ($probe !~ /$search/){ - $last_hitter = 1; - $last_left = $i+1; - # print "hit the last match: before $microields[$i]..last left = $last_left.. 
exiting.\n"; - last; - } - $last_left--;$last_hitter = 1; - # print "passed tests, last left = $last_left\n"; - } - # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n"; - if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;} - else {$leftstopper = 1; - # print "leftstopper = 1\n"; - } - } - - ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ - my $newright =(); - foreach my $shortmotif_no (@shortestmotif_nos){ - next if $shortmotif_no == $#motields; - my $last_right = $shortmotif_no;# -1; - for my $i ($shortmotif_no+1 ... $#motields){ - my $search = $motields[$shortmotif_no]; - if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} - if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ - $last_right = $i-1; last; - } - my $probe = $motields[$i]; - if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} - if ( $probe !~ /$search/){ - $last_right = $i-1; last; - } - $last_right++; - } - if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n"; - } - else {$rightstopper = 1; - } - } - - - if ($rightstopper == 1 && $leftstopper == 1){ - print COMP $line; -# print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1; - next; - } - -# print "pased initial testing phase \n" if $printer == 1; - my @outputs = (); - my @orig_starts = (); - my @orig_ends = (); - for my $mic (0 ... $#microields){ - my $miclen = length($microields[$mic]); - my $microleftlen = 0; - #print "\nmic = $mic\n"; - if($mic > 0){ - for my $submin (0 ... $mic-1){ - my $interval = (); - if (!exists $inields[$submin]) {$interval = "";} - else {$interval = $inields[$submin];} - #print "inield =$interval< and microield =$microields[$submin]<\n "; - $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); - } - } - push(@orig_starts,($microleftlen+1)); - push(@orig_ends, ($microleftlen+1 + $miclen -1)); - } - - ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ - - - for my $mic (0 ... $#microields){ - my $miclen = length($microields[$mic]); - my $microleftlen = 0; - if($mic > 0){ - for my $submin (0 ... $mic-1){ - # if(!exists $inields[$submin]) {$inields[$submin] = "";} - my $interval = (); - if (!exists $inields[$submin]) {$interval = "";} - else {$interval = $inields[$submin];} - #print "inield =$interval< and microield =$microields[$submin]<\n "; - $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); - } - } - $fields[$startcord] = $microleftlen+1; - $fields[$endcord] = $fields[$startcord] + $miclen -1; - $fields[$typecord] = $microsat_types[length($motields[$mic])]; - $fields[$microsatcord] = $microields[$mic]; - $fields[$motifcord] = $motields[$mic]; - my $templine = join("\t", (@fields[0 .. $motifcord]) ); - my $orig_templine = join("\t", (@fields[0 .. 
$motifcord]) ); - my $newline; - my $lefter = 1; my $righter = 1; - if ( $fields[$startcord] < 2){$lefter = 0;} - if ($fields[$endcord] == $seqlen){$righter = 0;} - - while($lefter == 1){ - $newline = left_extender($templine, $seq,$org); -# print "returned line from left extender= $newline \n" if $printer == 1; - if ($newline eq $templine){$templine = $newline; last;} - else {$templine = $newline;} - - if (left_extention_permission_giver($templine) eq "no") {last;} - } - while($righter == 1){ - $newline = right_extender($templine, $seq,$org); -# print "returned line from right extender= $newline \n" if $printer == 1; - if ($newline eq $templine){$templine = $newline; last;} - else {$templine = $newline;} - if (right_extention_permission_giver($templine) eq "no") {last;} - } - my @tempfields = split(/\t/,$templine); - $tempfields[$microsatcord] =~ s/\]|\[//g; - $tempfields[$motifcord] =~ s/^\[|\]$//gs; - my @tempmotields = split(/\]\[/,$tempfields[$motifcord]); - - if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) { -# print "scalar ( tempmotields) = 1\n" if $printer == 1; - next; - } - my $prevmotif = shift(@tempmotields); - my $stopper = 0; - - foreach my $tempmot (@tempmotields){ - if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;} - my $search = $prevmotif.$prevmotif; - if ($search !~ /$tempmot/) {$stopper = 1; last;} - $prevmotif = $tempmot; - } - if ( $stopper == 1) { -# print "length tempmot != length prevmotif\n" if $printer == 1; - next; - } - my $lastend = 0; - #---------------------------------------------------------- - my $left_captured = (); my $right_captured = (); - my $left_bp = (); my $right_bp = (); - # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n"; - for my $o (0 ... 
$#orig_starts){ -# print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1; -# print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1; - - if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord]) -# print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1; - $left_captured = $o; - $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1; - } - elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord]) -# print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1; - $right_captured = $o; - $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1; - } - $lastend = $orig_ends[$o] - } -# print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1; - my $leftmotif = (); my $left_trashed = (); - if ($tempfields[$startcord] != $fields[$startcord]) { - $leftmotif = $motields[$left_captured]; -# print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1; - if ( $left_captured !~ /[0-9]+/) {print $line,"\n", $templine,"\n"; } - $left_trashed = length($microields[$left_captured]) - $left_bp; - } - my $rightmotif = (); my $right_trashed = (); - if ($tempfields[$endcord] != $fields[$endcord]) { -# print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1; - $rightmotif = $motields[$right_captured]; - $right_trashed = length($microields[$right_captured]) - $right_bp; - } - - ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ - $stopper = 0; - my $deletioner = 0; - #if($tempfields[$startcord] != $fields[$startcord]){ -# print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1; - if ($left_captured != 0){ -# print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1; - for my $e (0 ... $left_captured-1){ - if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ -# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - } - } - #} -# print "after left search, deletioner = $deletioner\n" if $printer == 1; - if ($deletioner >= 1) { -# print "deletioner = $deletioner\n" if $printer == 1; - next; - } - - $deletioner = 0; - - #if($tempfields[$endcord] != $fields[$endcord]){ -# print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1; - if ($right_captured != $#microields){ -# print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1; - for my $e ($right_captured+1 ... 
$#microields){ - if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ -# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; - $deletioner++; last; - } - } - } - #} -# print "deletioner = $deletioner\n" if $printer == 1; - if ($deletioner >= 1) { - next; - } - my $leftMotifs_notCaptured = (); - my $rightMotifs_notCaptured = (); - - if ($tempfields[$startcord] != $fields[$startcord] ){ - #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n"; - if (length($leftmotif) == 1 && $left_trashed > 3){ -# print "invaded left motif is long mononucleotide" if $printer == 1; - next; - - } - elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){ -# print "invaded left motif too long" if $printer == 1; - next; - } - } - if ($tempfields[$endcord] != $fields[$endcord] ){ - #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n"; - if (length($rightmotif)==1 && $right_trashed){ -# print "invaded right motif is long mononucleotide" if $printer == 1; - next; - - } - elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){ -# print "invaded right motif too long" if $printer == 1; - next; - - } - } - push @outputs, $templine; - } - if (scalar(@outputs) == 0){ print COMP $line; next;} - # print "outputs are:", join("\n",@outputs),"\n"; - if (scalar(@outputs) == 1){ - my @oields = split(/\t/,$outputs[0]); - my $start = $oields[$startcord]+$mields[$startcord]-1; - my $end = $start+($oields[$endcord]-$oields[$startcord]); - $oields[$startcord] = $start; $oields[$endcord] = $end; - print INT join("\t",@oields), "\n"; - # print CH $line,; - } - if (scalar(@outputs) > 1){ - my $motif_min = 10; - my $chosen_one = $outputs[0]; - foreach my $micro (@outputs){ - my @oields = split(/\t/,$micro); - my $tempmotif = $oields[$motifcord]; - $tempmotif =~ s/^\[|\]$//gs; - my @omots = split(/\]\[/, $tempmotif); - # print "motif_min = $motif_min, current motif = $tempmotif\n"; - my $start = $oields[$startcord]+$mields[$startcord]-1; - my $end = $start+($oields[$endcord]-$oields[$startcord]); - $oields[$startcord] = $start; $oields[$endcord] = $end; - if(length($omots[0]) < $motif_min) { - $chosen_one = join("\t",@oields); - $motif_min = length($omots[0]); - } - } - print INT $chosen_one, "\n"; - # print "chosen one is ".$chosen_one, "\n"; - # print CH $line; - - - } - - } - - } #if (exists $micros{$key}){ - else{ - } - } - close INT; - close COMP; -} -sub left_extender{ - #print "left extender\n"; - my ($line, $seq, $org) = @_; -# print "in left extender... 
line passed = $line and sequence is $seq\n"; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - - #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n"; - my $leftphase = substr($microsat, 0,length($firstmotif)); - my $phaser = $leftphase.$leftphase; - my @phase = split(/\s*/,$leftphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... (length($leftphase)-1)){ - push(@phases, substr($phaser, $crawler, length($leftphase))); - $crawler++; - } - - my $start = $rstart; - my $end = $rend; - - my $leftseq = substr($seq, 0, $start); -# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n"; - my @extentions = (); - my @trappeds = (); - my @intervalposs = (); - my @trappedposs = (); - my @trappedphases = (); - my @intervals = (); - my $firstmotif_length = length($firstmotif); - foreach my $phase (@phases){ -# print "left phase\t",substr($leftseq, -10),"\t$phase\n"; -# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n"; - if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ -# print "in left pattern\n"; - my $trapped = $1; - my $trappedpos = length($leftseq)-length($trapped); - my $interval = $3; - my $intervalpos = index($trapped, $interval) + 1; -# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n"; - - my $extention = substr($trapped, 0, length($trapped)-length($interval)); - my $leftpeep = substr($seq, 0, ($start-length($trapped))); - my @passed_overhangs; - - for my $i (1 ... length($phase)-1){ - my $overhang = substr($phase, -length($phase)+$i); -# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n"; - #TEMPORARY... 
BETTER METHOD NEEDED - $leftpeep =~ s/-//g; - if ($leftpeep =~ /$overhang$/i){ - push(@passed_overhangs,$overhang); -# print "l overhang\n"; - } - } - - if(scalar(@passed_overhangs)>0){ - my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; - $extention = $overhang.$extention; - $trapped = $overhang.$trapped; - #print "trapped extended to $trapped \n"; - $trappedpos = length($leftseq)-length($trapped); - } - - push(@extentions,$extention); -# print "extentions = @extentions \n"; - - push(@trappeds,$trapped ); - push(@intervalposs,length($extention)+1); - push(@trappedposs, $trappedpos); -# print "trappeds = @trappeds\n"; - push(@trappedphases, substr($extention,0,length($phase))); - push(@intervals, $interval); - } - } - if (scalar(@trappeds == 0)) {return $line;} - - my $nikaal = shortest_array_element(@intervals); - - if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} - $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; - ##print "new fields 9 = $fields[9]\n"; - $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); - - if($fields[$microsatcord] !~ /^\[/i){ - $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; - } - - $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; - - if (exists ($fields[$motifcord+1])){ - $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; - } - else{$fields[$motifcord+1] = "indel/deletion";} - ##print "new fields 14 = $fields[14]\n"; - - if (exists ($fields[$motifcord+2])){ - $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; - } - else{$fields[$motifcord+2] = $intervals[$nikaal];} - my @seventeen=(); - if (exists ($fields[$motifcord+3])){ - @seventeen = split(/,/,$fields[$motifcord+3]); - # #print "scalarseventeen =@seventeen<-\n"; - for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} - $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); - $fields[$motifcord+4] = $fields[$motifcord+4]+1; - } - - else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} - - ##print "new fields 16 = $fields[16]\n"; - ##print "new fields 17 = $fields[17]\n"; - - - my $returnline = join("\t",@fields); - my $pastline = $returnline; - if ($fields[$microsatcord] =~ /\[/){ - $returnline = multiSpecies_compoundClarifyer_merge($returnline); - } - return $returnline; -} -sub right_extender{ - my ($line, $seq, $org) = @_; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $temp_lastmotif = (); - - if ($motif =~ /\]$/s){ - $motif =~ s/\]$//sg; - $motif =~ /.*\[([a-zA-Z]+)/; - $temp_lastmotif = $1; - } - else {$temp_lastmotif = $motif;} - my $lastmotif = substr($microsat,-length($temp_lastmotif)); - ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n"; - my $rightphase = substr($microsat, -length($lastmotif)); - my $phaser = $rightphase.$rightphase; - my @phase = split(/\s*/,$rightphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... 
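-# Added note (not original code): the loop below builds every cyclic rotation
-# ("phase") of the terminal motif by sliding a motif-length window over the
-# doubled string. E.g. rightphase "ACG" gives phaser "ACGACG" and phases
-# ("ACG","CGA","GAC"); each rotation is then anchored as a regex against the
-# flanking sequence to the right.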
(length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- 
- my $start = $rstart;
- my $end = $rend;
- 
- my $rightseq = substr($seq, $end+1);
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
- 
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = (); # was = "", which pre-seeded the list with an empty element
- 
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- 
- for my $i (1 ... length($phase)-1){
- my $overhang = substr($phase,0, $i);
-# #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n";
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
-# #print "r overhang\n";
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; # element lookup, not a one-item slice
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
-# #print "trapped extended to $trapped \n";
- }
- 
- push(@extentions,$extention);
- ##print "extentions = @extentions \n";
- 
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
-# #print "trappeds = @trappeds\n";
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds) == 0) {return $line;}
- 
-# my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
- 
-# #print "longest element found = $nikaal \n";
- 
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- ##print "new fields 9 = $fields[9]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- 
- ##print "new fields 11 = $fields[11]\n";
- 
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
- 
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
- ##print "new fields 12 = $fields[12]\n";
- 
- ##print "scalar of fields = ",scalar(@fields),"\n";
- if (exists ($fields[$motifcord+1])){
-# print " print fields = @fields.. scalar=", scalar(@fields),".. 
motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1];
-# <STDIN> if !exists $fields[$motifcord+1];
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- ##print "new fields 14 = $fields[14]\n";
- 
- if (exists ($fields[$motifcord+2])){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- ##print "new fields 15 = $fields[15]\n";
- 
- my @seventeen=();
- if (exists ($fields[$motifcord+3])){
- ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n";
-# print " print fields = @fields\n " if !exists $fields[$motifcord+3];
-# <STDIN> if !exists $fields[$motifcord+3]; # debugging pause commented out: a live <STDIN> would stall batch runs
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- 
- }
- 
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
- 
- ##print "new fields 16 = $fields[16]\n";
- 
- ##print "new fields 17 = $fields[17]\n";
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_compoundClarifyer_merge($returnline);
- }
- #print "finally right-extended line = ",$returnline,"\n";
- return $returnline;
-}
-sub longest_array_element{
- # returns the index (in the passed list) of the longest element
- my $counter = 0;
- my($max) = shift(@_);
- my $maxcounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding longest element: $maxcounter \n" if $prinkter == 1;
- if(length($temp) > length($max)){
- $max = $temp;
- $maxcounter = $counter;
- }
- }
- return($maxcounter);
-}
-sub shortest_array_element{
- # returns the index (in the passed list) of the shortest element
- my $counter = 0;
- my($min) = shift(@_);
- my $mincounter = 0;
- foreach my $temp (@_) {
- $counter++;
- #print "finding shortest element: $mincounter \n" if $prinkter == 1;
- if(length($temp) < length($min)){
- $min = $temp;
- $mincounter = $counter;
- }
- }
- return($mincounter);
-}
- 
- 
-sub left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//g;
- my $motif = $fields[$motifcord];
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//g;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- ##print "firststretch = $firststretch\n";
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
- 
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- 
-}
-sub right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- 
- 
- if ($motif =~ /\]/){
- $motif =~ s/\]$//gs;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- ##print "last stretch = $laststretch\n";
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- 
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- 
- 
-}
-sub multiSpecies_compoundClarifyer_merge{
- my $line = $_[0];
- #print "sent for merging: $line \n";
- my @mields = split(/\t/,$line);
- my @fields = @mields;
- my $microsat = $fields[$microsatcord];
- my $motifline = $fields[$motifcord];
- my 
$microsatcopy = $microsat; - $microsatcopy =~ s/^\[|\]$//sg; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - shift @inields; - #print "inields =@inields<\n"; - $motifline =~ s/^\[|\]$//sg; - my @motields = split(/\]\[/,$motifline); - my @firstmotifs = (); - my @lastmotifs = (); - for my $i (0 ... $#microields){ - $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); - $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); - } - #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n"; - my @mergelist = (); - my @inter_poses = split(/,/,$fields[$interr_poscord]); - my $no_of_interruptions = $fields[$no_of_interruptionscord]; - my @interruptions = split(/,/,$fields[$interrcord]); - my @interrtypes = split(/,/,$fields[$interrtypecord]); - my $stopper = 0; - for my $i (0 ... $#motields-1){ - #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n"; - if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){ - $stopper = 1; - push(@mergelist, ($i)."_".($i+1)); - } - } - - return $line if scalar(@mergelist) == 0; - - foreach my $merging (@mergelist){ - my @sets = split(/_/, $merging); - my @tempmicro = (); - my @tempmot = (); - for my $i (0 ... $sets[0]-1){ - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); - #print "adding pre-motifs number $i\n"; - } - my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; - push (@tempmicro, $pusher); - push(@tempmot, "[".$motields[$sets[0]]."]"); - my $outcoming = -2; - for my $i ($sets[1]+1 ... $#microields-1){ - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); - #print "adding post-motifs number $i\n"; - $outcoming = $i; - } - if ($outcoming != -2){ - #print "outcoming = $outcoming \n"; - push(@tempmicro, "[".$microields[$outcoming+1 ]."]"); - push(@tempmot,"[". $motields[$outcoming+1]."]"); - } - $fields[$microsatcord] = join("",@tempmicro); - $fields[$motifcord] = join("",@tempmot); - - splice(@interrtypes, $sets[0], 1); - $fields[$interrtypecord] = join(",",@interrtypes); - splice(@interruptions, $sets[0], 1); - $fields[$interrcord] = join(",",@interruptions); - splice(@inter_poses, $sets[0], 1); - $fields[$interr_poscord] = join(",",@inter_poses); - $no_of_interruptions = $no_of_interruptions - 1; - } - - if ($no_of_interruptions == 0){ - $fields[$microsatcord] =~ s/^\[|\]$//sg; - $fields[$motifcord] =~ s/^\[|\]$//sg; - $line = join("\t", @fields[0 ... 
$motifcord]); - } - else{ - $line = join("\t", @fields); - } - return $line; -} - -sub thrashallow{ - my $motif = $_[0]; - return 4 if length($motif) == 2; - return 6 if length($motif) == 3; - return 8 if length($motif) == 4; - -} - -#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx -sub multispecies_filtering_compound_microsats{ - my $unfiltered = $_[0]; - my $filtered = $_[1]; - my $residue = $_[2]; - my $no_of_species = $_[5]; - open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; - open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; - open(RES,">$residue") or die "Cannot open file $residue: $!"; - - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - - my @sub_thresholds = ("0"); - push(@sub_thresholds, split(/_/,$_[3])); - my @thresholds = ("0"); - push(@thresholds, split(/_/,$_[4])); - - while (my $line = <UNF>) { - if ($line !~ /compound/){ - print FIL $line,"\n"; next; - } - chomp $line; - my @fields = split(/\t/,$line); - my $motifline = $fields[$motifcord]; - $motifline =~ s/^\[|\]$//g; - my @motifs = split(/\]\[/,$motifline); - my $microsat = $fields[$microsatcord]; - $microsat =~ s/^\[|\]$|-//g; - my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); - - my $stopper = 0; - for my $i (0 ... $#motifs){ - my @common = (); - my $probe = $motifs[$i].$motifs[$i]; - my $motif_size = length($motifs[$i]); - - for my $j (0 ... 
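-# Added note (not original code): rotation-equivalence test. $probe is the
-# motif doubled, so every cyclic rotation of the motif occurs in $probe as a
-# substring; e.g. probe "CACA" matches "AC", so "CA" and "AC" stretches of the
-# same motif size are pooled before their longest run is compared against
-# $sub_thresholds for that motif size.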
$#motifs){ - next if length($motifs[$i]) != length($motifs[$j]); - push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i; - } - - if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;} - else {next;} - } - - if ($stopper == 1){ - print RES $line,"\n"; - } - else { print FIL $line,"\n"; } - } - close FIL; - close RES; -} - -#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx - -sub chromosome_unrand_breaker{ -# print "IN chromosome_unrand_breaker: @_\n "; - my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $dir = $_[1]; ###### directory where subsets are put - my $output2 = $_[2]; ###### list of subset files - my $increment = $_[3]; - my $info = $_[4]; - my $chr = $_[5]; - open(SEQ,"<$input1") or die "Cannot open file $input1 $!"; - - open(OUT,">$output2") or die "Cannot open file $output2 $!"; - - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - - my $seed = 0; - my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); - print OUT $subset,"\n"; - open(SUB,">$subset"); - - while(my $sine = <SEQ>){ - $seed++; - print SUB $sine; - - if ($seed%$increment == 0 ){ - close SUB; - $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); - open(SUB,">$subset"); - print SUB $sine; - print OUT $subset,"\n"; - # print $subset,"\n"; - } - } - close OUT; - close SUB; -} -#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx -sub multiSpecies_interruptedMicrosatHunter{ -# print "IN multiSpecies_interruptedMicrosatHunter: @_\n"; - my $input1 = $_[0]; ###### the *_sput_op4_ii file - my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" - my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format - my $output2 = $_[3]; ###### uninterrupted microsatellite file - my $org = $_[4]; - my $no_of_species = $_[5]; - - my @thresholds = "0"; - push(@thresholds, split(/_/,$_[6])); - -# print "thresholds = @thresholds \n"; - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $interrcord = $motifcord + 2; - $interrtypecord = $motifcord + 1; - - - $prinkter = 0; -# print "prionkytet = $prinkter\n"; - - open(IN,"<$input1") or die "Cannot open file $input1 $!"; - open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; - - open(INT,">$output1") or die "Cannot open file $output2 $!"; - open(UNINT,">$output2") or die "Cannot open file $output2 $!"; - -# print "opened files !!\n"; - my $linecounter = 0; - my $microcounter = 0; - - my 
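-# Added note (not original code): record layout implied by the cord
-# arithmetic above: each record carries four alignment columns per species,
-# then info/type/start/strand/end/microsat/motif columns. With
-# no_of_species = 2 the 0-based indices come out as infocord = 9,
-# startcord = 11, microsatcord = 14, motifcord = 15; sequencepos = 12 indexes
-# the sequence column of the separate sequence lines read from SEQ.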
%micros = (); - while (my $line = <IN>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+(chr[0-9a-zA-Z]+)\s([0-9]+)\s+([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1; - push (@{$micros{$key}},$line); - $microcounter++; - } - else {#print $line if $prinkter == 1; - } - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close IN; - my @deletedlines = (); -# print "done hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - while(my $sine = <SEQ>){ - #print $linecounter,"\n" if $linecounter % 1000 == 0; - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $3, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - - # $prinkter = 1 if $sine =~ /^>H\t499\t/; - - if (exists $micros{$key}){ - my @microstring = @{$micros{$key}}; - delete $micros{$key}; - my @filteredmicrostring; -# print "sequence = $sields[$sequencepos]" if $prinkter == 1; - foreach my $line (@microstring){ - $linecounter++; - my $copy_line = $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - -# print $line if $prinkter == 1; - #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE - my $newline; - while(1){ - # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1; - if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} - - $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org); - if ($newline eq $line){$line = $newline; last;} - else {$line = $newline;} - - if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} -# print "returned line from left extender= $line \n" if $prinkter == 1; - } - while(1){ - # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1; - if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} - - $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org); - if ($newline eq $line){$line = $newline; last;} - else {$line = $newline;} - - if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} -# print "returned line from right extender= $line \n" if $prinkter == 1; - } -# print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1; - - my @tempfields = split(/\t/,$line); - if ($tempfields[$microsatcord] =~ /\[/){ - print INT $line,"\n"; - } - else{ - print UNINT $line,"\n"; - } - - if ($line =~ /NULL/){ next; } - push(@filteredmicrostring, $line); - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - - my $firstflag = 'down'; - - } #if (exists $micros{$key}){ - } - close INT; - close UNINT; -# print "final number of lines = $linecounter\n"; -} - -sub multiSpecies_interruptedMicrosatHunter_left_extender{ - my ($line, $seq, $org) = @_; -# print "left extender, like passed = $line\n" if $prinkter == 1; -# print "in left extender... 
line passed = $line and sequence is $seq\n" if $prinkter == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $rstart = $fields[$startcord]; - my $microsat = $fields[$microsatcord]; - $microsat =~ s/\[|\]//g; - my $rend = $rstart + length($microsat)-1; - $microsat =~ s/-//g; - my $motif = $fields[$motifcord]; - my $firstmotif = (); - - if ($motif =~ /^\[/){ - $motif =~ s/^\[//g; - $motif =~ /([a-zA-Z]+)\].*/; - $firstmotif = $1; - } - else {$firstmotif = $motif;} - -# print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1; - my $leftphase = substr($microsat, 0,length($firstmotif)); - my $phaser = $leftphase.$leftphase; - my @phase = split(/\s*/,$leftphase); - my @phases; - my @copy_phases = @phases; - my $crawler=0; - for (0 ... (length($leftphase)-1)){ - push(@phases, substr($phaser, $crawler, length($leftphase))); - $crawler++; - } - - my $start = $rstart; - my $end = $rend; - - my $leftseq = substr($seq, 0, $start); -# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1; - my @extentions = (); - my @trappeds = (); - my @intervalposs = (); - my @trappedposs = (); - my @trappedphases = (); - my @intervals = (); - my $firstmotif_length = length($firstmotif); - foreach my $phase (@phases){ -# print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1; -# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1; - if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ -# print "in left pattern\n" if $prinkter == 1; - my $trapped = $1; - my $trappedpos = length($leftseq)-length($trapped); - my $interval = $3; - my $intervalpos = index($trapped, $interval) + 1; -# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1; - - my $extention = substr($trapped, 0, length($trapped)-length($interval)); - my $leftpeep = substr($seq, 0, ($start-length($trapped))); - my @passed_overhangs; - - for my $i (1 ... length($phase)-1){ - my $overhang = substr($phase, -length($phase)+$i); -# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1; - #TEMPORARY... 
BETTER METHOD NEEDED - $leftpeep =~ s/-//g; - if ($leftpeep =~ /$overhang$/i){ - push(@passed_overhangs,$overhang); -# print "l overhang\n" if $prinkter == 1; - } - } - - if(scalar(@passed_overhangs)>0){ - my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; - $extention = $overhang.$extention; - $trapped = $overhang.$trapped; -# print "trapped extended to $trapped \n" if $prinkter == 1; - $trappedpos = length($leftseq)-length($trapped); - } - - push(@extentions,$extention); -# print "extentions = @extentions \n" if $prinkter == 1; - - push(@trappeds,$trapped ); - push(@intervalposs,length($extention)+1); - push(@trappedposs, $trappedpos); -# print "trappeds = @trappeds\n" if $prinkter == 1; - push(@trappedphases, substr($extention,0,length($phase))); - push(@intervals, $interval); - } - } - if (scalar(@trappeds == 0)) {return $line;} - -############################ my $nikaal = longest_array_element(@trappeds); - my $nikaal = shortest_array_element(@intervals); - -# print "longest element found = $nikaal \n" if $prinkter == 1; - - if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} - $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; - #print "new fields 9 = $fields[9]\n" if $prinkter == 1; - $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); - - #print "new fields 9 = $fields[9]\n" if $prinkter == 1; - - if($fields[$microsatcord] !~ /^\[/i){ - $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; - } - - $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; - #print "new fields 14 = $fields[12]\n" if $prinkter == 1; - - #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1; - - - if (scalar(@fields) > $motifcord+1){ - $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; - } - else{$fields[$motifcord+1] = "indel/deletion";} - #print "new fields 14 = $fields[14]\n" if $prinkter == 1; - - if (scalar(@fields)>$motifcord+2){ - $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; - } - else{$fields[$motifcord+2] = $intervals[$nikaal];} - #print "new fields 15 = $fields[15]\n" if $prinkter == 1; - - my @seventeen=(); - - if (scalar(@fields)>$motifcord+3){ - @seventeen = split(/,/,$fields[$motifcord+3]); - # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1; - for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} - $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); - $fields[$motifcord+4] = $fields[$motifcord+4]+1; - } - - else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} - - #print "new fields 16 = $fields[16]\n" if $prinkter == 1; - #print "new fields 17 = $fields[17]\n" if $prinkter == 1; - -# return join("\t",@fields); - my $returnline = join("\t",@fields); - my $pastline = $returnline; - if ($fields[$microsatcord] =~ /\[/){ - $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); - } -# print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1; - return $returnline; -} - -sub multiSpecies_interruptedMicrosatHunter_right_extender{ -# print "right extender\n" if $prinkter == 1; - my ($line, $seq, $org) = @_; -# print "in right extender... 
line passed = $line\n" if $prinkter == 1;
-# print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1;
- chomp $line;
- my @fields = split(/\t/,$line);
- my $rstart = $fields[$startcord];
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/\[|\]//g;
- my $rend = $rstart + length($microsat)-1;
- $microsat =~ s/-//g;
- my $motif = $fields[$motifcord];
- my $temp_lastmotif = ();
- 
- if ($motif =~ /\]$/){
- $motif =~ s/\]$//g;
- $motif =~ /.*\[([a-zA-Z]+)/;
- $temp_lastmotif = $1;
- }
- else {$temp_lastmotif = $motif;}
- my $lastmotif = substr($microsat,-length($temp_lastmotif));
-# print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1;
- my $rightphase = substr($microsat, -length($lastmotif));
- my $phaser = $rightphase.$rightphase;
- my @phase = split(/\s*/,$rightphase);
- my @phases;
- my @copy_phases = @phases;
- my $crawler=0;
- for (0 ... (length($rightphase)-1)){
- push(@phases, substr($phaser, $crawler, length($rightphase)));
- $crawler++;
- }
- 
- my $start = $rstart;
- my $end = $rend;
- 
- my $rightseq = substr($seq, $end+1);
-# print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1;
-# print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1;
- my @extentions = ();
- my @trappeds = ();
- my @intervalposs = ();
- my @trappedposs = ();
- my @trappedphases = ();
- my @intervals = ();
- my $lastmotif_length = length($lastmotif);
- foreach my $phase (@phases){
-# print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1;
-# print "search pattern = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1;
- if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){
-# print "in right pattern\n" if $prinkter == 1;
- my $trapped = $1;
- my $trappedpos = $end+1;
- my $interval = $2;
- my $intervalpos = index($trapped, $interval) + 1;
-# print "trapped = $trapped, interval = $interval\n" if $prinkter == 1;
- 
- my $extention = substr($trapped, length($interval));
- my $rightpeep = substr($seq, ($end+length($trapped))+1);
- my @passed_overhangs = (); # was = "", which pre-seeded the list with an empty element
- 
- #TEMPORARY... BETTER METHOD NEEDED
- $rightpeep =~ s/-//g;
- 
- for my $i (1 ... 
length($phase)-1){
- my $overhang = substr($phase,0, $i);
-# print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1;
- if ($rightpeep =~ /^$overhang/i){
- push(@passed_overhangs, $overhang);
-# print "r overhang\n" if $prinkter == 1;
- }
- }
- if (scalar(@passed_overhangs) > 0){
- my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; # element lookup, not a one-item slice
- $extention = $extention.$overhang;
- $trapped = $trapped.$overhang;
-# print "trapped extended to $trapped \n" if $prinkter == 1;
- }
- 
- push(@extentions,$extention);
- #print "extentions = @extentions \n" if $prinkter == 1;
- 
- push(@trappeds,$trapped );
- push(@intervalposs,$intervalpos);
- push(@trappedposs, $trappedpos);
-# print "trappeds = @trappeds\n" if $prinkter == 1;
- push(@trappedphases, substr($extention,0,length($phase)));
- push(@intervals, $interval);
- }
- }
- if (scalar(@trappeds) == 0) {return $line;}
- 
-################################### my $nikaal = longest_array_element(@trappeds);
- my $nikaal = shortest_array_element(@intervals);
- 
-# print "longest element found = $nikaal \n" if $prinkter == 1;
- 
- if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";}
- $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]";
- $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]);
- 
- 
- if($fields[$microsatcord] !~ /^\[/i){
- $fields[$microsatcord] = "[".$fields[$microsatcord]."]";
- }
- 
- $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]";
- 
- 
- if (scalar(@fields) > $motifcord+1){
- $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion";
- }
- else{$fields[$motifcord+1] = "indel/deletion";}
- 
- if (scalar(@fields)>$motifcord+2){
- $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal];
- }
- else{$fields[$motifcord+2] = $intervals[$nikaal];}
- 
- my @seventeen=();
- if (scalar(@fields)>$motifcord+3){
- #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1;
- my $currpos = length($microsat)+$intervalposs[$nikaal];
- $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos;
- $fields[$motifcord+4] = $fields[$motifcord+4]+1;
- 
- }
- 
- else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1}
- 
-# print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1;
-# return join("\t",@fields);
- 
- my $returnline = join("\t",@fields);
- my $pastline = $returnline;
- if ($fields[$microsatcord] =~ /\[/){
- $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline);
- }
-# print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1;
- return $returnline;
- 
-}
- 
-sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/(^\[)|-//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
-# print $motif, "\n" if $motif !~ /^\[/;
- my $firstmotif = ();
- my $firststretch = ();
- my @stretches=();
- 
-# print "motif = $motif, microsat = $microsat\n" if $prinkter == 1;
- if ($motif =~ /^\[/){
- $motif =~ s/^\[//sg;
- $motif =~ /([a-zA-Z]+)\].*/;
- $firstmotif = $1;
- @stretches = split(/\]/,$microsat);
- $firststretch = $stretches[0];
- #print "firststretch = $firststretch\n" if $prinkter == 1;
- }
- else {$firstmotif = $motif;$firststretch = $microsat;}
-# print "if length:firststretch - length($firststretch) < 
threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1;
- if (length($firststretch) < $thresholds[length($firstmotif)]){
- return "no";
- }
- else {return "yes";}
- 
-}
-sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{
- my @fields = split(/\t/,$_[0]);
- my $microsat = $fields[$microsatcord];
- $microsat =~ s/-|(\]$)//sg;
- my $motif = $fields[$motifcord];
- chomp $motif;
- my $temp_lastmotif = ();
- my $laststretch = ();
- my @stretches=();
- 
- 
- if ($motif =~ /\]/){
- $motif =~ s/\]$//sg;
- $motif =~ /.*\[([a-zA-Z]+)$/;
- $temp_lastmotif = $1;
- @stretches = split(/\[/,$microsat);
- $laststretch = pop(@stretches);
- #print "last stretch = $laststretch\n" if $prinkter == 1;
- }
- else {$temp_lastmotif = $motif; $laststretch = $microsat;}
- 
- if (length($laststretch) < $thresholds[length($temp_lastmotif)]){
- return "no";
- }
- else { return "yes";}
- 
- 
-}
-sub checking_substitutions{
- 
- my ($line, $seq, $startprobes, $endprobes) = @_;
- #print "sequence = $seq \n" if $prinkter == 1;
- #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n";
- # <STDIN>;
- my @seqarray = split(/\s*/,$seq);
- my @startsubst_probes = split(/\|/,$startprobes);
- my @endsubst_probes = split(/\|/,$endprobes);
- chomp $line;
- my @fields = split(/\t/,$line);
- my $start = $fields[11] - $fields[10];
- my $end = $fields[13] - $fields[10];
- my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9]
- $motif =~ s/\[|\]//g;
- my $microsat = $fields[14];
- $microsat =~ s/\[|\]//g;
- #------------------------------------------------------------------------
- # GETTING START AND END PHASES
- my $startphase = substr($microsat,0, length($motif));
- my $endphase = substr($microsat,-length($motif), length($motif));
- #print "start and end phases are - $startphase and $endphase\n";
- my $startflag = 'down';
- my $endflag = 'down';
- my $substitution_distance = length($motif);
- my $prestart = $start - $substitution_distance;
- my $postend = $end + $substitution_distance;
- my @endadds = ();
- my @startadds = ();
- if (($prestart < 0) || ($postend > scalar(@seqarray))) {
- return; # was "last;", which is invalid outside a loop; bail out of the sub instead
- }
- #------------------------------------------------------------------------#------------------------------------------------------------------------
- # CHECKING FOR SUBSTITUTION PROBES NOW
- 
- if ($fields[8] ne "mononucleotide"){
- while ($startflag eq "down"){
- my $search = join("",@seqarray[$prestart...($start-1)]);
- #print "search is from $prestart...($start-1) = $search\n";
- foreach my $probe (@startsubst_probes){
- #print "\t\tprobe = $probe\n";
- if ($search =~ /^$probe/){
- #print "\tfound addition to the left - $search \n";
- my $copyprobe = $probe;
- my $type;
- my $subspos = 0;
- my $interruption = "";
- if ($search eq $startphase) { $type = "NONE";}
- else{
- $copyprobe =~ s/\[a-zA-Z\]/^/g;
- $subspos = index($copyprobe,"^") + 1;
- $type = "substitution";
- $interruption = substr($search, $subspos,1);
- }
- my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos);
- #print "adding information: $addinfo \n";
- push(@startadds, $addinfo);
- $prestart = $prestart - $substitution_distance;
- $start = $start-$substitution_distance;
- $startflag = 'down';
- 
- last;
- }
- else{
- $startflag = 'up';
- }
- }
- }
- #<STDIN>;
- while ($endflag eq "down"){
- my $search = join("",@seqarray[($end+1)...$postend]);
- #print "search is from ($end+1)...$postend] = $search\n";
- 
- foreach my $probe 
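-# Added note (not original code): as implied by the s/\[a-zA-Z\]/^/g below,
-# each probe appears to be the end phase with one position replaced by the
-# wildcard class [a-zA-Z]; a flank that matches a probe but differs from the
-# pure $endphase is recorded as a substitution at the wildcard offset.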
(@endsubst_probes){ - #print "\t\tprobe = $probe\n"; - if ($search =~ /$probe$/){ - my $copyprobe = $probe; - my $type; - my $subspos = 0; - my $interruption = ""; - if ($search eq $endphase) { $type = "NONE";} - else{ - $copyprobe =~ s/\[a-zA-Z\]/^/g; - $subspos = index($copyprobe,"^") + 1; - $type = "substitution"; - $interruption = substr($search, $subspos,1); - } - my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos); - #print "adding information: $addinfo \n"; - push(@endadds, $addinfo); - $postend = $postend + $substitution_distance; - $end = $end+$substitution_distance; - push(@endadds, $search); - $endflag = 'down'; - last; - } - else{ - $endflag = 'up'; - } - } - } - #print "startadds = @startadds, endadds = @endadds \n"; - - } -} -sub microsat_packer{ - my $microsat = $_[0]; - my $addition = $_[1]; - - - -} -sub multiSpecies_interruptedMicrosatHunter_merge{ - $prinkter = 0; -# print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n"; - my $line = $_[0]; -# print "sent for mering: $line \n" if $prinkter ==1; - my @mields = split(/\t/,$line); - my @fields = @mields; - my $microsat = allCaps($fields[$microsatcord]); - my $motifline = allCaps($fields[$motifcord]); - my $microsatcopy = $microsat; -# print "microsat = $microsat|\n" if $prinkter ==1; - $microsatcopy =~ s/^\[|\]$//sg; - chomp $microsatcopy; - my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); - my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); - shift @inields; -# print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1; - $motifline =~ s/^\[|\]$//sg; - my @motields = split(/\]\[/,$motifline); - my @firstmotifs = (); - my @lastmotifs = (); - for my $i (0 ... $#microields){ - $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); - $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); - } -# print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1; - my @mergelist = (); - my @inter_poses = split(/,/,$fields[$interr_poscord]); - my $no_of_interruptions = $fields[$no_of_interruptionscord]; - my @interruptions = split(/,/,$fields[$interrcord]); - my @interrtypes = split(/,/,$fields[$interrtypecord]); - my $stopper = 0; - for my $i (0 ... $#motields-1){ -# print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1; - if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){ - $stopper = 1; - push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1; - } - } - -# print "mergelist = @mergelist\n" if $prinkter ==1; - return $line if scalar(@mergelist) == 0; -# print "merging @mergelist\n" if $prinkter ==1; -# <STDIN> if $prinkter ==1; - - foreach my $merging (@mergelist){ - my @sets = split(/_/, $merging); -# print "sets = @sets\n" if $prinkter ==1; - my @tempmicro = (); - my @tempmot = (); -# print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1; - for my $i (0 ... $sets[0]-1){ -# print " adding pre- i = $i adding: microields= $microields[$i]. 
motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1; - push(@tempmicro, "[".$microields[$i]."]"); - push(@tempmicro, $inields[$i]); - push(@tempmot, "[".$motields[$i]."]"); -# print "adding pre-motifs number $i\n" if $prinkter ==1; -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; - } -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; -# print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1; - my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; -# print "middle is, from @motields - @sets, number 0 which is is\n"; -# print ": $motields[$sets[0]]\n"; - push (@tempmicro, $pusher); - push(@tempmot, "[".$motields[$sets[0]]."]"); - push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]]; - my $outcoming = -2; -# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; -# print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1; - for my $i ($sets[1]+1 ... $#microields){ -# print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1; - push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i]; - push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i]; - push(@tempmot, "[".$motields[$i]."]"); -# print "adding post-motifs number $i\n" if $prinkter ==1; - $outcoming = $i; - } -# print "____________________________________________________________________________\n"; - $prinkter = 0; - $fields[$microsatcord] = join("",@tempmicro); - $fields[$motifcord] = join("",@tempmot); -# print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1; - - splice(@interrtypes, $sets[0], 1); - $fields[$interrtypecord] = join(",",@interrtypes); - splice(@interruptions, $sets[0], 1); - $fields[$interrcord] = join(",",@interruptions); - splice(@inter_poses, $sets[0], 1); - $fields[$interr_poscord] = join(",",@inter_poses); - $no_of_interruptions = $no_of_interruptions - 1; - } - - if ($no_of_interruptions == 0 && $line !~ /compound/){ - $fields[$microsatcord] =~ s/^\[|\]$//sg; - $fields[$motifcord] =~ s/^\[|\]$//sg; - $line = join("\t", @fields[0 ... 
$motifcord]); - } - else{ - $line = join("\t", @fields); - } -# print "post merging, the line is $line\n" if $prinkter ==1; - #<STDIN> if $stopper ==1; - return $line; -} -sub interval_asseser{ - my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3]; -} -#--------------------------------------------------------------------------------------------------- -sub allCaps{ - my $motif = $_[0]; - $motif =~ s/a/A/g; - $motif =~ s/c/C/g; - $motif =~ s/t/T/g; - $motif =~ s/g/G/g; - return $motif; -} - - -#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx -sub merge_interruptedMicrosats{ -# print "IN merge_interruptedMicrosats: @_\n"; - my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" - my $input1 = $_[1]; ###### the *_sput_op4_ii file - my $input2 = $_[2]; ###### the *_sput_op4_ii file - $no_of_species = $_[3]; - - my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward - my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse - my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward - #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse - #my $info = $_[4]; - #my @tags = split(/\t/,$info); - - open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; - open(INF,"<$input1") or die "Cannot open file $input1 $!"; - open(INR,"<$input2") or die "Cannot open file $input2 $!"; - open(OUTF,">$output1") or die "Cannot open file $output1 $!"; - open(OUTR,">$output2") or die "Cannot open file $output2 $!"; - open(MER,">$output3") or die "Cannot open file $output3 $!"; - #open(MERR,">$output4") or die "Cannot open file $output4 $!"; - - - - $printer = 0; - -# print "files opened \n"; - $infocord = 2 + (4*$no_of_species) - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $typecord = $infocord + 1; - my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - - $interrtypecord = $motifcord + 1; - $interrcord = $motifcord + 2; - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $mergestarts = $no_of_interruptionscord+ 1; - $mergeends = $no_of_interruptionscord+ 2; - $mergemicros = $no_of_interruptionscord+ 3; - - # NOW ADDING FORWARD MICROSATELLITES TO HASH - my %fmicros = (); - my $microcounter=0; - my $linecounter = 0; - while (my $line = <INF>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$fmicros{$key}},$line); - $microcounter++; - } - else {print $line;} - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INF; - my @deletedlines = (); -# print "done forward hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING REVERSE MICROSATELLITES TO HASH - my %rmicros 
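-# Added note (not original code): reverse-strand records are keyed exactly
-# like the forward ones above: four of the five header captures (the chr
-# field $3 is skipped) joined by tabs, so microsat records and sequence
-# records for the same block can be matched by a single hash lookup.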
= (); - $microcounter=0; - while (my $line = <INR>){ - # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; - $linecounter++; - if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $4, $5); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$rmicros{$key}},$line); - $microcounter++; - } - else {print "cant make key\n";} - } -# print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INR; -# print "done reverse hash \n"; - $linecounter = 0; - - #------------------------------------------------------------------------------------------------ - - while(my $sine = <SEQ>){ - #<STDIN> if $sine =~ /16349128/; - next if $sine !~ /[a-zA-Z0-9]/; -# print "-" x 150, "\n" if $printer == 1; - my @sields = split(/\t/,$sine); - my @merged = (); - - my $key = (); - - if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $2, $4, $5); - # print $key, "<-<-<-<-<-<-<-<\n"; - } - # print "key = $key\n"; - - my @sets1; - my @sets2; - chomp $sields[$sequencepos]; - my $rev_sequence = reverse($sields[$sequencepos]); - $rev_sequence =~ s/ //g; - $rev_sequence = " ".$rev_sequence; - next if (!exists $fmicros{$key} && !exists $rmicros{$key}); - - if (exists $fmicros{$key}){ - # print "line no : $linecount\n"; - my @raw_microstring = @{$fmicros{$key}}; - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} - my @microstring=(); - for my $u (0 ... $#raw_microstring){ - my @tields = split(/\t/,$raw_microstring[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@microstring, $raw_microstring[$u]); - $starts{$tields[$startcord]} = $tields[$startcord]; - $ends{$tields[$endcord]} = $tields[$endcord]; - } - - # print "founf microstring in forward\n: @microstring\n"; - chomp @microstring; - my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos])); - @sets1 = split("\=", $clusterresult); - my @temp = split(/_/,$sets1[0]) ; $microscanned+= scalar(@temp); - # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>; - } #if (exists $micros{$key}){ - - if (exists $rmicros{$key}){ - # print "line no : $linecount\n"; - my @raw_microstring = @{$rmicros{$key}}; - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} - my @microstring=(); - for my $u (0 ... 
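-# Added note (not original code): this pass deduplicates the raw records; a
-# record is dropped only when BOTH its start and its end coordinates were
-# already seen, so two distinct microsats sharing just one boundary survive.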
$#raw_microstring){ - my @tields = split(/\t/,$raw_microstring[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@microstring, $raw_microstring[$u]); - $starts{$tields[$startcord]} = $tields[$startcord]; - $ends{$tields[$endcord]} = $tields[$endcord]; - } - # print "founf microstring in reverse\n: @microstring\n"; <STDIN>; - chomp @microstring; - # print "sending reversed sequence\n"; - my $clusterresult = (find_clusters(@microstring, $rev_sequence ) ); - @sets2 = split("\=", $clusterresult); - my @temp = split(/_/,$sets2[0]) ; $microscanned+= scalar(@temp); - } #if (exists $micros{$key}){ - - my @popout1 = (); - my @popout2 = (); - my @forwardset = (); - if (exists $sets2[1] ){ - if(exists $sets1[0]) { - push (@popout1, $sets1[0],$sets2[1]); - my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));# - print OUTF join("\n",split("_", $forwardset[0])), "\n"; - my @localmerged = split("_", $forwardset[1]); - my $sequence = $sields[$sequencepos]; - $sequence =~ s/ //g; - for my $j (0 ... $#localmerged){ - $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); - } - - push (@merged, @localmerged); - - } - else{ - my @localmerged = split("_", $sets2[1]); - my $sequence = $sields[$sequencepos]; - $sequence =~ s/ //g; - for my $j (0 ... $#localmerged){ - $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); - } - - push (@merged, @localmerged); - } - } - elsif (exists $sets1[0]){ - print OUTF join("\n",split("_", $sets1[0])), "\n"; - } - - my @reverseset= (); - if (exists $sets1[1]){ - if (exists $sets2[0]){ - push (@popout2, $sets2[0],$sets1[1]); - # print "popout2 = @popout2\n"; - my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos])); - #print "reverseset = $reverseset[1] < --- reverseset1\n"; - print OUTR join("\n",split("_", $reverseset[0])), "\n"; - push(@merged, (split("_", $reverseset[1]))); - } - else{ - push(@merged, (split("_", $sets1[1]))); - } - } - elsif (exists $sets2[0]){ - print OUTR join("\n",split("_", $sets2[0])), "\n"; - - } - - if (scalar @merged > 0){ - my @filtered_merged = split("__",(filterDuplicates_merged(@merged))); - print MER join("\n", @filtered_merged),"\n"; - } - # <STDIN> if $sine =~ /16349128/; - - } - close(SEQ); - close(INF); - close(INR); - close(OUTF); - close(OUTR); - close(MER); - -} -sub find_clusters{ - my @input = @_; - my $sequence = pop(@input); - $sequence =~ s/ //g; - my @microstring0 = @input; -# print "IN: find_clusters:\n"; - my %microstart=(); - my %microend=(); - my @nonmerged = (); - my @mergedSet = (); -# print "set of microsats = @microstring \n"; - my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0; -# print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." 
if $printer == 1; - #<STDIN> if $printer == 1; - my @tempmicrostring = @microstring; - foreach my $line (@tempmicrostring){ - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/; - # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - while( my $line =shift(@microstring)){ -# print "-----------\nline = $line \n" if $printer == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ; - my $startmicro = $line; - my $endmicro = $line; -# print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - push (@{$compoundhash{$line}},$line); - my $startrank = 1; - my $endrank = 1; - - while( ($startflag eq "down") || ($endflag eq "down") ){ -# print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1; - if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){ -# print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1; - last; - } - - $firstflag = "up"; - if ($startflag eq "down"){ - for my $i ($prestart ... $end){ - if(exists $microend{$i}){ - chomp $microend{$i}[0]; - if(exists $compoundhash{$microend{$i}[0]}) {next;} - chomp $microend{$i}[0]; - push(@compoundlines, $microend{$i}[0]); - my @tields = split(/\t/,$microend{$i}[0]); - $startmicro = $microend{$i}[0]; - chomp $startmicro; - $flag = 'down'; - $startrank++; -# print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1; - delete $microend{$i}; - delete $microstart{$tields[$startcord]}; - $end = $tields[$endcord]; - $startflag = 'down'; - $prestart = $tields[$startcord] - $distance; - last; - } - else{ - $flag = 'up'; - $startflag = 'up'; - } - } - } - - if ($endflag eq "down"){ - - for my $i ($start ... 
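-# Added note (not original code): rightward cluster growth. Positions from
-# $start out to $postend (= end + $distance) are probed against %microstart;
-# a hit absorbs that microsat into @compoundlines, pushes $end and $postend
-# further right, and the scan resumes, so chains of microsats closer than
-# $distance apart collapse into one compound candidate.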
$postend){ -# print "$start ----> $i -----> $postend\n" if $printer == 1; - if(exists $microstart{$i} ){ - chomp $microstart{$i}[0]; - if(exists $compoundhash{$microstart{$i}[0]}) {next;} - chomp $microstart{$i}[0]; - push(@compoundlines, $microstart{$i}[0]); - my @tields = split(/\t/,$microstart{$i}[0]); - $endmicro = $microstart{$i}[0]; - $endrank++; - chomp $endmicro; - $flag = 'down'; -# print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1; - - delete $microstart{$i} if exists $microstart{$i} ; - delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]}; -# print "done\n" if $printer == 1; - - shift @microstring; - $end = $tields[$endcord]; - $postend = $tields[$endcord] + $distance; - $endflag = 'down'; - last; - } - else{ - $flag = 'up'; - $endflag = 'up'; - } -# print "out of the if\n" if $printer == 1; - } -# print "out of the for\n" if $printer == 1; - - } -# print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; - } #end while( $flag eq "down") -# print "compoundlines = @compoundlines \n" if $printer == 1; - - if (scalar (@compoundlines) == 1){ - push(@nonmerged, $line); - - } - if (scalar (@compoundlines) > 1){ -# print "FROM CLUSTERER\n" if $printer == 1; - push(@mergedSet,merge_microsats(@compoundlines, $sequence) ); - } - } #end foreach my $line (@microstring){ -# print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1; -#<STDIN> if scalar(@mergedSet) > 0; -# print "EXIT: find_clusters\n"; -return (join("_",@nonmerged). "=".join("_",@mergedSet)); -} - -sub custom { - $a->[$startcord+1] <=> $b->[$startcord+1]; -} - -sub popOuter { -# print "\nIN: popOuter @_\n"; - my @all = split ("_",$_[0]); -# <STDIN> if !defined $_[0]; - my @merged = split ("_",$_[1]); - my $sequence = $_[2]; - my $seqlen = length($sequence); -# print "\nIN: popOuter @_\n" if scalar(@_) != 3; -# <STDIN> if scalar(@_) != 3; - my %microstart=(); - my %microend=(); - my @mergedSet = (); - my @nonmerged = (); -# print "\n\n\n all = @all\n<--all\n"; - - foreach my $line (@all){ - my @fields = split(/\t/,$line); - my $start = $seqlen - $fields[$startcord]+ 1; - my $end = $seqlen - $fields[$endcord] + 1; - push (@{$microstart{$start}},$line); - push (@{$microend{$end}},$line); - } - my $firstflag = 'down'; - - my %forPopouting = (); - - while( my $line =shift(@merged)){ -# print "\n MErgedline: $line \n" if $printer == 1; - chomp $line; - my @fields = split(/\t/,$line); - my $start = $fields[$startcord]; - my $end = $fields[$endcord]; - my $startmicro = $line; - my $endmicro = $line; - - delete ($microstart{$start}); - delete ($microend{$end}); - my $flag = 'down'; - my $startflag = 'down'; - my $endflag = 'down'; - my $prestart = $start - $distance; - my $postend = $end + $distance; - my @compoundlines = (); - my %compoundhash = (); - push (@compoundlines, $line); - my $startrank = 1; - my $endrank = 1; - - # print "\nstart = $start, end = $end\n"; - # <STDIN>; - for my $i ($start ... 
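# Clarifying aside (not part of the original script): find_clusters (ending
# above) and popOuter (continuing below) hand two lists back in one string --
# unmerged records joined by "_", then "=", then merged records joined by
# "_". Callers therefore split on "=" first and on "_" second, e.g. with
# hypothetical variable names:
#   my @sets      = split /=/, find_clusters(@lines, $sequence);
#   my @nonmerged = split /_/, $sets[0];
#   my @merged    = split /_/, $sets[1];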
$end){ - if(exists $microend{$i}){ - # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n"; - chomp $microend{$i}[0]; - my @fields = split(/\t/,$microend{$i}[0]); - delete $microstart{$seqlen - $fields[$startcord] + 1}; - my $invertseq = $sequence; - $invertseq =~ s/ //g; - push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq )); - delete $microend{$i}; - - } - - if(exists $microstart{$i} ){ - # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n"; - - chomp $microstart{$i}[0]; - my @fields = split(/\t/,$microstart{$i}[0]); - delete $microend{$seqlen - $fields[$endcord] + 1}; - my $invertseq = $sequence; - $invertseq =~ s/ //g; - push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) ); - delete $microstart{$i}; - } - } - - if (scalar (@compoundlines) == 1){ - push(@mergedSet,join("\t",@compoundlines) ); - } - else { -# print "FROM POPOUTER\n" if $printer == 1; - push(@mergedSet, merge_microsats(@compoundlines, $sequence) ); - } - } - - foreach my $key (sort keys %microstart) { - push(@nonmerged,$microstart{$key}[0]); - } - - return (join("_",@nonmerged). "=".join("_",@mergedSet) ); -} - - - -sub invert_justCoordinates{ - my $microsat = $_[0]; -# print "IN invert_justCoordinates\n" if $printer == 1; - chomp $microsat; - my $seqLength = $_[1]; - my @fields = split(/\t/,$microsat); - my $start = $seqLength - $fields[$endcord] + 1; - my $end = $seqLength - $fields[$startcord] + 1; - $fields[$startcord] = $start; - $fields[$endcord] = $end; - $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); -# print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1; - return join("\t",@fields); -} - -sub largest_number{ - my $counter = 0; - my($max) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp > $max){ - $max = $temp; - } - } - return($max); -} -sub smallest_number{ - my $counter = 0; - my($min) = shift(@_); - foreach my $temp (@_) { - #print "finding largest array: $maxcounter \n"; - if($temp < $min){ - $min = $temp; - } - } - return($min); -} - - -sub filterDuplicates_merged{ - my @merged = @_; - my %revmerged = (); - my @fmerged = (); - foreach my $micro (@merged) { - my @fields = split(/\t/,$micro); - if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){ - my $key = join("_",$fields[1], $fields[$startcord], $fields[$endcord]); - # print "adding ... \n$key\n$micro\n"; - push(@{$revmerged{$key}}, $micro); - } - else{ - # print "pushing.. 
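# Illustrative sketch (not part of the original script): invert_justCoordinates
# (above) maps 1-based coordinates found on the reverse-complemented sequence
# back onto the forward strand. On a sequence of length L, a repeat spanning
# [s, e] in reverse orientation spans [L - e + 1, L - s + 1] in forward
# orientation; with hypothetical numbers:
{
    my ($L, $s, $e) = (100, 10, 20);
    my ($fwd_s, $fwd_e) = ($L - $e + 1, $L - $s + 1);   # 81 and 91
}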
$micro\n"; - push(@fmerged, $micro); - } - } -# print "\n"; - foreach my $micro (@fmerged) { - my @fields = split(/\t/,$micro); - my $key = join("_",$fields[1], $fields[$startcord], $fields[$endcord]); - # print "searching for key $key\n"; - if (exists $revmerged{$key}){ - # print "deleting $revmerged{$key}[0]\n"; - delete $revmerged{$key}; - } - } - foreach my $key (sort keys %revmerged) { - push(@fmerged,$revmerged{$key}[0]); - } -# print "returning ", join("\n", @fmerged),"\n" ; - return join("__", @fmerged); -} - -sub invert_microsat{ - my $micro = $_[0]; - chomp $micro; - if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;} - else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; } - my $sequence = $_[1]; - $sequence =~ s/ //g; - my $seqlen = length($sequence); - my @fields = split(/\t/,$micro); - my $start = $seqlen - $fields[$endcord] +1; - my $end = $seqlen - $fields[$startcord] +1; - $fields[$startcord] = $start; - $fields[$endcord] = $end; - $fields[$motifcord] = reverse_micro($fields[$motifcord]); - $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); - if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){ - my @intertypes = split(/,/,$fields[$interrtypecord]); - my @inters = split(/,/,$fields[$interrcord]); - my @interposes = split(/,/,$fields[$interr_poscord]); - $fields[$interrtypecord] = join(",",reverse(@intertypes)); - $fields[$no_of_interruptionscord] = scalar(@interposes); - for my $i (0 ... $fields[$no_of_interruptionscord]-1){ - if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){ - $inters[$i] = reverse($inters[$i]); - $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1; - } - else{ - $inters[$i] = ""; - $interposes[$i] = $interposes[$i] - 1; - } - $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1; - } - - $fields[$interrcord] = join(",",reverse(@inters)); - $fields[$interr_poscord] = join(",",reverse(@interposes)); - } - - my $finalmicrosat = join("\t", @fields); - return $finalmicrosat; - -} -sub reverse_micro{ - my $micro = reverse($_[0]); - my @strand = split(/\s*/,$micro); - for my $i (0 ... 
$#strand){ - if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} - if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} - } - return join("",@strand); -} - -#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx - -sub forward_reverse_sputoutput_comparer { -# print "IN forward_reverse_sputoutput_comparer: @_\n"; - my $input0 = $_[0]; ###### the *nogap_unrand_match file - my $input1 = $_[1]; ###### the real file, *sput* data - my $input2 = $_[2]; ###### the reverse file, *sput* data - my $output1 = $_[3]; ###### microsats different in real file - my $output2 = $_[4]; ###### microsats missing in real file - my $output3 = $_[5]; ###### microsats common among real and reverse file - my $no_of_species = $_[6]; - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - $interrtypecord = $motifcord + 1; - $interrcord = $motifcord + 2; - $interr_poscord = $motifcord + 3; - $no_of_interruptionscord = $motifcord + 4; - $mergestarts = $no_of_interruptionscord+ 1; - $mergeends = $no_of_interruptionscord+ 2; - $mergemicros = $no_of_interruptionscord+ 3; - - - open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; - open(INF,"<$input1") or die "Cannot open file $input1 $!"; - open(INR,"<$input2") or die "Cannot open file $input2 $!"; - - open(DIFF,">$output1") or die "Cannot open file $output1 $!"; - #open(MISS,">$output2") or die "Cannot open file $output2 $!"; - open(SAME,">$output3") or die "Cannot open file $output3 $!"; - - -# print "opened files \n"; - my $linecounter = 0; - my $fcounter = 0; - my $rcounter = 0; - - $printer = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING FORWARD MICROSATELLITES TO HASH - my %fmicros = (); - my $microcounter=0; - while (my $line = <INF>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$fmicros{$key}},$line); - $microcounter++; - } - else { - #print $line; - } - } -# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INF; - my @deletedlines = (); -# print "done forward hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - # NOW ADDING REVERSE MICROSATELLITES TO HASH - my %rmicros = (); - $microcounter=0; - while (my $line = <INR>){ - $linecounter++; - if ($line =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - # print $key, "#-#-#-#-#-#-#-#\n"; - push (@{$rmicros{$key}},$line); - $microcounter++; - } - else {} - } -# print "number 
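# Illustrative sketch (not part of the original script): the *cord variables
# assigned above are 0-based column indices into the tab-delimited microsat
# records, and they shift with the species count because every species
# contributes four leading columns. A helper computing the same offsets (the
# hash keys are hypothetical; the arithmetic mirrors the assignments above):
sub _sketch_column_offsets {
    my $n = shift;                     # number of species in the alignment
    my %c;
    $c{info}     = 2 + 4 * $n - 1;     # last per-species column, 0-based
    $c{type}     = $c{info} + 1;       # repeat type, e.g. "dinucleotide"
    $c{start}    = $c{info} + 2;
    $c{strand}   = $c{info} + 3;
    $c{end}      = $c{info} + 4;
    $c{microsat} = $c{info} + 5;       # the repeat sequence itself
    $c{motif}    = $c{info} + 6;       # the bracketed motif string
    return %c;
}
# e.g. for 3 species this gives start == 15, matching
# $startcord = 2 + (4*3) + 2 - 1 above.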
of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; - close INR; -# print "done reverse hash \n"; - $linecounter = 0; - #--------------------------------------------------------------------------------------------------- - #--------------------------------------------------------------------------------------------------- - # NOW READING THE SEQUENCE FILE - while(my $sine = <SEQ>){ - my %microstart=(); - my %microend=(); - my @sields = split(/\t/,$sine); - my $key = (); - if ($sine =~ /([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); - } - else { - next; - } - $printer = 0; - my $sequence = $sields[$sequencepos]; - chomp $sequence; - $sequence =~ s/ //g; - my @localfs = (); - my @localrs = (); - - if (exists $fmicros{$key}){ - @localfs = @{$fmicros{$key}}; - delete $fmicros{$key}; - } - - my %forwardstarts = (); - my %forwardends = (); - - foreach my $f (@localfs){ - my @fields = split(/\t/,$f); - push (@{$forwardstarts{$fields[$startcord]}},$f); - push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]); - } - - if (exists $rmicros{$key}){ - @localrs = @{$rmicros{$key}}; - delete $rmicros{$key}; - } - else{ - } - - foreach my $r (@localrs){ - chomp $r; - my @rields = split(/\t/,$r); -# print "rields = @rields\n" if $printer == 1; - my $reciprocalstart = length($sequence) - $rields[$endcord] + 1; - my $reciprocalend = length($sequence) - $rields[$startcord] + 1; -# print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1; - my $microsat = reverse_micro(all_caps($rields[$microsatcord])); - my @localcollection=(); - for my $i ($reciprocalstart+1 ... $reciprocalend-1){ - if (exists $forwardstarts{$i}){ - push(@localcollection, $forwardstarts{$i}[0] ); - delete $forwardstarts{$i}; - } - if (exists $forwardends{$i}){ - next if !exists $forwardstarts{$forwardends{$i}[0]}; - push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] ); - } - } - if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);} - - if (scalar(@localcollection) == 0){ - print SAME invert_microsat($r,($sequence) ), "\n"; - } - - elsif (scalar(@localcollection) == 1){ -# print "f microsat = $localcollection[0]\n" if $printer == 1; - my @lields = split(/\t/,$localcollection[0]); - $lields[$microsatcord]=all_caps($lields[$microsatcord]); -# print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1; -# print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1; - if ($microsat eq $lields[$microsatcord]){ - chomp $localcollection[0]; - print SAME $localcollection[0], "\n"; - } - if ($microsat ne $lields[$microsatcord]){ - chomp $localcollection[0]; - my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence); -# print "newmicro = $newmicro\n" if $printer == 1; - if ($newmicro =~ /[a-zA-Z]/){ - print SAME $newmicro,"\n"; - } - else{ - print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... 
$#rields] ),"\n"; -# print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1; -# print "@rields\n@lields\n" if $printer == 1; - } - } - } - else{ -# print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1; - } - } - } - - close(SEQ); - close(INF); - close(INR); - close(DIFF); - close(SAME); - -} -sub all_caps{ - my @strand = split(/\s*/,$_[0]); - for my $i (0 ... $#strand){ - if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} - if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} - if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} - if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} - } - return join("",@strand); -} -sub microsatChooser{ - my $forward = $_[0]; - my $reverse = $_[1]; - my $sequence = $_[2]; - my $seqLength = length($sequence); - $sequence =~ s/ //g; - my @fields = split(/\t/,$forward); - my @rields = split(/\t/,$reverse); - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; - - - my $f_microsat = $fields[$microsatcord]; - my $r_microsat = $rields[$microsatcord]; - - if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) { - return $forward if length($f_microsat) >= length($r_microsat); - return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat); - } - return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord]; - - my $f_microsat_copy = $f_microsat; - my $r_microsat_copy = $r_microsat; - $f_microsat_copy =~ s/^\[|\]$//g; - $r_microsat_copy =~ s/^\[|\]$//g; - - my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); - my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); - my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); - my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); - - my $f_motif = $fields[$motifcord]; - my $r_motif = $rields[$motifcord]; - my $f_motif_copy = $f_motif; - my $r_motif_copy = $r_motif; - $f_motif_copy =~ s/^\[|\]$//g; - $r_motif_copy =~ s/^\[|\]$//g; - - my @f_motields = split(/\]\[/,$f_motif_copy); - my @r_motields = split(/\]\[/,$r_motif_copy); - - my $f_purestretch = join("",@f_microields); - my $r_purestretch = join("",@r_microields); - - if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){ -# print "now.. 
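# Clarifying aside (not part of the original script): all_caps (above)
# upper-cases a/c/t/g one character at a time; for plain sequence strings
# Perl's built-in uc() gives the same result, e.g.
#   uc("acGTtag") eq all_caps("acGTtag");   # both yield "ACGTTAG"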
studying $forward\n$reverse\n" if $printer == 1; - if ($fields[$typecord] eq $rields[$typecord]){ -# print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; - - if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ - my $subset_answer = isSubset($forward, $reverse, $seqLength); -# print "subset answer = $subset_answer\n" if $printer == 1; - return $forward if $subset_answer == 1; - return invert_microsat($reverse, $sequence) if $subset_answer == 2; - return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); - return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); - return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch); - return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch); - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ - return $forward; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ - return invert_microsat($reverse, $sequence); - } - } - else{ - my $fmotlen = (); - my $rmotlen = (); - $fmotlen =1 if $fields[$typecord] eq "mononucleotide"; - $fmotlen =2 if $fields[$typecord] eq "dinucleotide"; - $fmotlen =3 if $fields[$typecord] eq "trinucleotide"; - $fmotlen =4 if $fields[$typecord] eq "tetranucleotide"; - $rmotlen =1 if $rields[$typecord] eq "mononucleotide"; - $rmotlen =2 if $rields[$typecord] eq "dinucleotide"; - $rmotlen =3 if $rields[$typecord] eq "trinucleotide"; - $rmotlen =4 if $rields[$typecord] eq "tetranucleotide"; - - if ($fmotlen < $rmotlen){ - if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){ - return $forward; - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - if ($fmotlen > $rmotlen){ - if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){ - return invert_microsat($reverse, $sequence); - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - } - } - if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){ -# print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; - if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ - my $subset_answer = isSubset($forward, $reverse, $seqLength); -# print "subset answer = $subset_answer\n" if $printer == 1; - return $forward if $subset_answer == 1; - return invert_microsat($reverse, $sequence) if $subset_answer == 2; -# print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1; - return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); - return 
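# Clarifying aside (not part of the original script): the repeated
# "$fmotlen =N if ... eq '...nucleotide'" chains above encode a fixed
# type-to-motif-length mapping, equivalent to a lookup table such as
#   my %motif_length = ( mononucleotide => 1, dinucleotide    => 2,
#                        trinucleotide  => 3, tetranucleotide => 4 );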
invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); - if ($subset_answer == 3){ - if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){ - if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){ - return $forward; - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){ - if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){ - return invert_microsat($reverse, $sequence); - } - else{ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - } - } - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ - return $forward; - } - elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ - return invert_microsat($reverse, $sequence); - } - - } - - if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){ -# print "one compound, one nucleotide\n" if $printer == 1; - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } - if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){ -# print "one compound, one nucleotide\n" if $printer == 1; - return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); - } -} - -sub isSubset{ - my $forward = $_[0]; my @fields = split(/\t/,$forward); - my $reverse = $_[1]; my @rields = split(/\t/,$reverse); - my $seqLength = $_[2]; - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; -# print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1; - return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end; - return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end; - return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord]; - return "3"; -} - -sub motifBYmotif_match{ - my $forward = $_[0]; - my $reverse = $_[1]; - $forward =~ s/^\[|\]$//g; - $reverse =~ s/^\[|\]$//g; - my @f_motields=split(/\]\[/, $forward); - my @r_motields=split(/\]\[/, $reverse); - my $finalresult = 0; - - if (scalar(@f_motields) != scalar(@r_motields)){ - my $subresult = 0; - my @mega = (); my @sub = (); - @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields); - @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields); - @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields); - @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields); - - for my $i (0 ... $#sub){ - my $probe = $sub[$i].$sub[$i]; -# print "probing $probe and $mega[$i]\n" if $printer == 1; - if ($probe =~ /$mega[$i]/) {$subresult = 1; } - else {$subresult = 0; last; } - } - - return 0 if $subresult == 0; - return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f - return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse - - } - else{ - for my $i (0 ... 
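# Clarifying aside (not part of the original script): isSubset (above)
# compares the forward hit with the reverse hit after mapping the reverse
# coordinates onto the forward strand, and returns:
#   0 = identical span,
#   1 = the reverse span lies inside the forward span,
#   2 = the forward span lies inside the reverse span,
#   3 = neither (shifted or partially overlapping spans).
# microsatChooser keeps the enclosing hit for codes 1/2, breaks the tie at
# code 0 by the longer pure stretch, and at code 3 either merges the two
# hits or keeps the longer one, depending on slided_microsat.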
$#f_motields){ - my $probe = $f_motields[$i].$f_motields[$i]; - if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;} - else {$finalresult = 0 ;last;} - } - } -# print "finalresult = $finalresult\n" if $printer == 1; - return $finalresult; -} - -sub merge_microsats{ - my @input = @_; - my $sequence = pop(@input); - $sequence =~ s/ //g; - my @seq_string = @input; -# print "IN: merge_microsats\n"; -# print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n"; - my $start; - my $end; - my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string; -# print "\nrearranged into @micros \n"; - my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = (); - my @fields = (); - for my $i (0 ... $#micros){ - chomp $micros[$i]; - @fields = split(/\t/,$micros[$i]); - push(@types, $fields[$typecord]); - push(@motifs, $fields[$motifcord]); - - if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);} - else { push(@interruptiontypes, "NA"); } - if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);} - else { push(@interruptions, "NA"); } - if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);} - else { push(@interrposes, "NA"); } - if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);} - else { push(@no_of_interruptions, "NA"); } - if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));} - else { push(@mergestart, $fields[$startcord]); } - if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));} - else { push(@mergeend, $fields[$endcord]); } - if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);} - else { push(@mergemicro, $fields[$microsatcord]); } - - - } - $start = smallest_number(@mergestart); - $end = largest_number(@mergeend); - my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]"; - my $microsat = join("\t", @fields[0 ... 
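# Clarifying aside (not part of the original script): merge_microsats
# (beginning above) collapses overlapping hits into one record whose
# type/motif/interruption columns keep the per-hit values joined by ".",
# while the merged repeat itself is re-cut from the sequence between the
# smallest start and the largest end; with hypothetical coordinates,
#   starts (12, 30) and ends (25, 44) give the merged span 12..44, i.e.
#   my $entry = "[" . substr($sequence, 12 - 1, 44 - 12 + 1) . "]";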
$infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro)); - return $microsat; -} - -sub slided_microsat{ - my $forward = $_[0]; my @fields = split(/\t/,$forward); - my $reverse = $_[1]; my @rields = split(/\t/,$reverse); - my $seqLength = $_[2]; - my $r_start = $seqLength - $rields[$endcord] + 1; - my $r_end = $seqLength - $rields[$startcord] + 1; - my $motlen =(); - $motlen =1 if $fields[$typecord] eq "mononucleotide"; - $motlen =2 if $fields[$typecord] eq "dinucleotide"; - $motlen =3 if $fields[$typecord] eq "trinucleotide"; - $motlen =4 if $fields[$typecord] eq "tetranucleotide"; - - if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) { - return 0; - } - else{ - return 1; - } - -} - -#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx - - - -#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx -sub new_multispecies_t10{ - my $input1 = $_[0]; #gap_op_unrand_match - my $input2 = $_[1]; #sput - my $output = $_[2]; #output - my $bin = $output."_bin"; - my $orgs = join("|",split(/\./,$_[3])); - my @organisms = split(/\./,$_[3]); - my $no_of_species = scalar(@organisms); #3 - my $t10info = $output."_info"; - $prinkter = 0; - - open (MATCH, "<$input1"); - open (SPUT, "<$input2"); - open (OUT, ">$output"); - open (INFO, ">$t10info"); - - - sub microsat_bracketer; - sub custom; - my %seen = (); - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $startcord = 2 + (4*$no_of_species) + 2 - 1; - $strandcord = 2 + (4*$no_of_species) + 3 - 1; - $endcord = 2 + (4*$no_of_species) + 4 - 1; - $microsatcord = 2 + (4*$no_of_species) + 5 - 1; - $motifcord = 2 + (4*$no_of_species) + 6 - 1; - $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; - #---------------------------------------------------------------------------------------------------------------# - # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES # - #---------------------------------------------------------------------------------------------------------------# - my $linecounter = 0; - my $microcounter = 0; - while (my $line = <SPUT>){ - chomp $line; - # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; - next if $line !~ /[0-9a-z]+/; - $linecounter++; - # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); - # print $key, "#-#-#-#-#-#-#-#\n"; - if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - my $key = join("\t",$1, $2, $3, $4, $5); -# print "key = $key\n" if $prinkter == 1; - push (@{$seen{$key}},$line); - $microcounter++; - } - else { print "could not make key in SPUT : \n$line \n"; - } - } -# print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; -# print INFO "done hash.. 
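# Illustrative sketch (not part of the original script): both passes above
# and below pair records across files through a composite hash key (block
# id, species, chromosome, start, end) cut from the line header, so a hash
# lookup replaces a quadratic line-by-line search. In isolation, with a
# hypothetical record and hash name:
{
    my %seen_sketch;
    my $hdr = "17\thg18\tchr22\t14430\t15670";
    if ($hdr =~ /^([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/) {
        my $key = join "\t", $1, $2, $3, $4, $5;
        push @{ $seen_sketch{$key} }, $hdr;   # bucket lines from one block
    }
}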
linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; - close SPUT; - - #---------------------------------------------------------------------------------------------------------------- - - #-------------------------------------------------------------------------------------------------------# - # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY # - # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN # - # SECOND FILE FOLLOWED BY # - # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION # - #-------------------------------------------------------------------------------------------------------# - my $ref = 0; - my $ref2 = 0; - my $ref3 = 0; - my $ref4 = 0; - my $deletes= 0; - my $duplicates = 0; - my $neighbors = 0; - my $tooshort = 0; - my $prevmicrol=(); - my $startnotfound = 0; - my $matchkeysformed = 0; - my $keysused = 0; - - while (my $line = <MATCH>) { -# print colored ['magenta'], $line if $prinkter == 1; - next if $line !~ /[a-zA-Z0-9]/; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); - if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { - $matchkeysformed++; - $key2 = join("\t",$1, $2, $3, $4, $5); -# print "key = $key2 \n" if $prinkter == 1; - } - else{ -# print "could not make ker in SEQ : $line\n"; - next; - } - my $sequence = $fields2[$sequencepos]; - $sequence =~ s/\*/-/g; - my $count = 0; - if (exists $seen{$key2}){ - $keysused++; - my @unsorted_raw = @{$seen{$key2}}; - delete $seen{$key2}; - my @sequencearr = split(/\s*/, $sequence); - -# print "sequencearr = @sequencearr\n" if $prinkter == 1; - - my $counter; - - my %start_database = (); - my %end_database = (); - foreach my $uns (@unsorted_raw){ - my @uields = split(/\t/,$uns); - $start_database{$uields[$startcord]} = $uns; - $end_database{$uields[$endcord]} = $uns; - } - - my @unsorted = (); - my %starts = (); my %ends = (); -# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;} - for my $u (0 ... 
$#unsorted_raw){ - my @tields = split(/\t/,$unsorted_raw[$u]); - next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; - push(@unsorted, $unsorted_raw[$u]); - $starts{$tields[$startcord]} = $unsorted_raw[$u]; -# print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1; - } - - my $basecounter= 0; - my $gapcounter = 0; - my $poscounter = 0; - - for my $s (@sequencearr){ - - $poscounter++; - if ($s eq "-"){ - $gapcounter++; next; - } - else{ - $basecounter++; - } - - - #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; - #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; - #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1; - - if (exists $starts{$basecounter}){ - my $locus = $starts{$basecounter}; -# print "locus identified = $locus\n" if $prinkter == 1; - my @fields3 = split(/\t/,$locus); - my $start = $fields3[$startcord]; - my $end = $fields3[$endcord]; - my $motif = $fields3[$motifcord]; - my $microsat = $fields3[$microsatcord]; - my @leftbracketpos = (); - my @rightbracketpos = (); - my $bracket_picker = 'no'; - my $leftbrackets=(); - my $rightbrackets = (); - my $micro_cpy = $microsat; -# print "microsat = $microsat\n" if $prinkter == 1; - while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';} - while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);} - $microsat =~ s/[\[\]\-\*]//g; -# print "microsat = $microsat\n" if $prinkter == 1; - my $human_search = join '-*', split //, $microsat; - my $temp = substr($sequence, $poscounter-1); -# print "with poscounter = $poscounter\n" if $prinkter == 1; - my $search_result = (); - my $posnow = (); - while ($temp =~ /($human_search)/gi){ - $search_result = $1; - # $posnow = pos($temp); - last; - } - - my @gapspos = (); - while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); } - my $gaps = join("__",@gapspos); - - my $final_microsat = $search_result; - if ($bracket_picker eq "yes"){ - $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets); - } - - my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat); - - if ($bracket_picker eq "yes") { - $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]); - } - print OUT $outsentence,"\n"; - } - } - } - } - my $unusedkeys = scalar(keys %seen); - print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; - print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; - print INFO "microsatellites too short = $tooshort\n"; - print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... 
unusedkeys=$unusedkeys\n"; - - #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; - #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; - #print "microsatellites too short = $tooshort\n"; - #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; - #print "unused keys = \n",join("\n", (keys %seen)),"\n"; - close (MATCH); - close (SPUT); - close (OUT); - close (INFO); -} - -sub microsat_bracketer{ -# print "in bracketer: @_\n"; - my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_; - my @gaps = split(/__/,$gapspos); - my @lefts = split(/__/,$leftbracketpos); - my @rights = split(/__/,$rightbracketpos); - my @new=(); - my $pure = $microsat; - $pure =~ s/-//g; - my $off = 0; - my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights); - push(@gaps, 0); - push(@lefts,0); - push(@rights,0); - - for my $i (1 ... $finallength){ -# print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n"; - if($rights[0] == $i){ - # print "pushed a ]\n"; - push(@new, "]"); - shift(@rights); - push(@rights,0); - for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} - next; - } - if($gaps[0] == $i){ - # print "pushed a -\n"; - push(@new, "-"); - shift(@gaps); - push(@gaps, 0); - for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;} - for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;} - - next; - } - if($lefts[0] == $i){ -# print "pushed a [\n"; - push(@new, "["); - shift(@lefts); - push(@lefts,0); - for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} - next; - } - else{ - my $pushed = substr($pure,$off,1); - $off++; - push(@new,$pushed ); -# print "pushed an alphabet, now new = @new, pushed = $pushed\n"; - next; - } - } - my $returnmicrosat = join("",@new); -# print "final microsat = $returnmicrosat \n"; - return($returnmicrosat); -} - -#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx - - -#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx -sub multiSpecies_orthFinder4{ - #print "IN multiSpecies_orthFinder4: @_\n"; - my @handles = (); - #1 SEPT 30TH 2008 - #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER - #3 THAN 50 BP (THE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL - #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST - #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED - #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY - #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES. - #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PERILOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE - #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE - #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. 
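# Illustrative sketch (not part of the original script): the MATCH loop
# further above re-anchors microsats reported on ungapped coordinates inside
# the gapped alignment row by walking the row with two counters, one for
# alignment columns and one for real bases. The same mapping in isolation,
# with a hypothetical row:
{
    my $row = "AC--GTA-C";
    my ($col, $base, %ungapped2col) = (0, 0);
    for my $ch (split //, $row) {
        $col++;
        $ungapped2col{ ++$base } = $col if $ch ne "-";
    }
    # $ungapped2col{3} == 5: the third base, G, sits in alignment column 5
}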
THIS PART, INSTEAD OF BEING INCLUDED IN - #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl - - #1 trial run: - #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50 - - $prinkter=0; - - ############# - my $CLUSTER_DIST = $_[4]; - ############# - - - my $aligns = $_[0]; - my @micros = split(/:/, $_[1]); - my $orth = $_[2]; - #my $not_orth = "notorth"; - @tags = split(/:/, $_[3]); - - $no_of_species=scalar(@tags); - my $junkfile = $orth."_junk"; - #open(JUNK,">$junkfile"); - - #my $info = $output1."_info"; - #print "inputs are : \n"; foreach(@micros){print $_,"\n";} - #print "info = @_\n"; - - - open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; - open (ORTH, ">$orth"); - my $output=$orth."_out"; - open (OUTP, ">$output"); - - - #open (NORTH, ">$not_orth"); - #open (INF, ">$info"); - my $i = 0; - foreach my $path (@micros){ - $handles[$i] = IO::Handle->new(); - open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!"; - $i++; - } - - #print "Opened files\n"; - - - $infocord = 2 + (4*$no_of_species) - 1; - $typecord = 2 + (4*$no_of_species) + 1 - 1; - $motifcord = $typecord + 1; - $gapcord = $motifcord+1; - $startcord = $gapcord + 1; - $strandcord = $startcord + 1; - $endcord = $strandcord + 1; - $microsatcord = $endcord + 1; - $sequencepos = 2 + (4*$no_of_species) + 1 -1 ; - #$sequencepos = 17; - # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES - #---------------------------------------------------------------------------------------------------------------- - my @hasharr = (); - foreach my $path (@micros){ - open(READ, "<$path") or die "Cannot open file $path :$!"; - my %single_hash = (); - my $key = (); - my $counter = 0; - while (my $line = <READ>){ - $counter++; - # print $line; - chomp $line; - my @fields1 = split(/\t/,$line); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key = join("\t",$1, $2, $4, $5); - -# print "key = : $key\n" if $prinkter == 1; - -# print $line if $prinkter == 1; - push (@{$single_hash{$key}},$line); - } - else{ - # print "microsat line incompatible\n"; - } - } - push @hasharr, {%single_hash}; - # print "@{$single_hash{$key}} \n"; -# print "done $path: counter = $counter\n" if $prinkter == 1; - close READ; - } -# print "Done hashes\n"; - #---------------------------------------------------------------------------------------------------------------- - my $question=(); - #---------------------------------------------------------------------------------------------------------------- - my @contigstarts = (); - my @contigends = (); - - my %contigclusters = (); - my %contigclustersFirstStartOnly=(); - my %contigclustersLastEndOnly=(); - my %contigclustersLastEndLengthOnly=(); - my %contigclustersFirstStartLengthOnly=(); - my %contigpath=(); - my $dotcounter = 0; - while (my $line = <BO>){ -# print "x" x 
60, "\n" if $prinkter == 1; - $dotcounter++; - - - -# print "." if $dotcounter % 100 ==0; -# print "\n" if $dotcounter % 5000 ==0; - next if $line !~ /^[0-9]+/; -# print $line if $prinkter == 1; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - } - else { -# print "seq line $line incompatible\n" if $prinkter == 1; - next;} - - - - - - - my @sequences = (); - for (0 ... $#tags){ - my $seq = <BO>; - # print $seq; - chomp $seq; - push(@sequences , " ".$seq); - } - my @origsequences = @sequences; - my $seqcopy = $sequences[0]; - my @strings = (); - $seqcopy =~ s/[a-zA-Z]|-/x/g; - my @string = split(/\s*/,$seqcopy); - - for my $s (0 ... $#tags){ - $sequences[$s] =~ s/-//g; - $sequences[$s] =~ s/[a-zA-Z]/x/g; - # print "length of sequence = ",length($sequences[$s]),"\n"; - my @tempstring = split(/\s*/,$sequences[$s]); - push(@strings, [@tempstring]) - - } - - my @species_list = (); - my @micro_count = 0; - my @starthash = (); - my $stopper = 1; - my @endhash = (); - - my @currentcontigstarts=(); - my @currentcontigends=(); - my @currentcontigchrs=(); - - for my $i (0 ... $#tags){ -# print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1; - my @temparr = (); - - if (exists $hasharr[$i]{$key2}){ - @temparr = @{$hasharr[$i]{$key2}}; - -# print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; - $line =~ /$tags[$i]\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/; -# print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; - my $startkey = $1."_".$2; print "adding start key for this alignment block: $startkey to species $tags[$i]\n" if $prinkter == 1; - my $endkey = $1."_".$3; print "adding end key for this alignment block: $endkey to species $tags[$i]\n" if $prinkter == 1; - $contigstarts[$i]{$startkey}= $key2; - $contigends[$i]{$endkey}= $key2; -# print "confirming existence: \n" if $prinkter == 1; -# print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1; -# print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1; - $currentcontigchrs[$i]=$1; - $currentcontigstarts[$i]=$2; - $currentcontigends[$i]=$3; - - } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} - else { - push (@starthash, {0 => "0"}); - push (@endhash, {0 => "0"}); - $currentcontigchrs[$i] = 0; - next; - } - $stopper = 0; - # print "exists: @temparr\n" if $prinkter == 1; - push(@micro_count, scalar(@temparr)); - push(@species_list, [@temparr]); - my @tempstart = (); my @tempend = (); - my %localends = (); - my %localhash = (); - # print "---------------------------\n"; - - foreach my $templine (@temparr){ -# print "templine = $templine\n" if $prinkter == 1; - my @tields = split(/\t/,$templine); - my $start = $tields[$startcord]; # - $tields[$gapcord]; - my $end = $tields[$endcord]; #- $tields[$gapcord]; - my $realstart = $tields[$startcord]- $tields[$gapcord]; - my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g); - $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/; - # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n"; - my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat; - # print "real start = $realstart, realend = $realend \n"; - for my $pos ($realstart ... 
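# Clarifying aside (not part of the original script): the loop being entered
# here "paints" every alignment column covered by a microsat with a tag of
# the form "species:start-end"; painted runs separated by at most
# $CLUSTER_DIST unpainted columns are later fused by findClusters into one
# cluster string such as "0:3-5.1:7-8". In miniature:
#   my @pos = ("x") x 10;               # "x" marks an unpainted column
#   $pos[$_] .= ",0:3-5" for 3 .. 5;    # species 0's repeat
#   $pos[$_] .= ",1:7-8" for 7 .. 8;    # species 1's repeat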
$realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;} - push(@tempstart, $start); - push(@tempend, $end); - $localhash{$start."-".$end} = $templine; - } - push @starthash, {%localhash}; - my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST); - - my @clusters = split(/_/,$foundclusters); - - my $clustno = 0; - - foreach my $cluster (@clusters) { - my @constituenst = split(/,/,$cluster); -# print "clusters returned: @constituenst\n" if $prinkter == 1; - } - - @string = split("_",stringPainter(join("_",@string),$foundclusters)); - - - } - next if $stopper == 1; - -# print colored ['blue'],"FINAL:\n" if $prinkter == 1; - my $finalclusters =findClusters(join("!",@string), 1); -# print colored ['blue'],"----------------------\n" if $prinkter == 1; - my @clusters = split(/,/,$finalclusters); -# print "@string\n" if $prinkter == 1; -# print "@clusters\n" if $prinkter == 1; -# print "------------------------------------------------------------------\n" if $prinkter == 1; - - my $clustno = 0; - - # foreach my $cluster (@clusters) { - # my @constituenst = split(/,/,$cluster); - # print "clusters returned: @constituenst\n"; - # } - - next if (scalar @clusters == 0); - - my @contigcluster=(); - my $clusterno=0; - my @contigClusterstarts=(); - my @contigClusterends = (); - - foreach my $clust (@clusters){ - # print "cluster: $clust\n"; - $clusterno++; - my @localclust = split(/\./, $clust); - my @result = (); - my @starts = (); - my @ends = (); - - for my $i (0 ... $#localclust){ - # print "localclust[$i]: $localclust[$i]\n"; - my @pattern = split(/:/, $localclust[$i]); - my @cords = split(/-/, $pattern[1]); - push (@starts, $cords[0]); - push (@ends, $cords[1]); - } - - my $extremestart = smallest_number(@starts); - my $extremeend = largest_number(@ends); - push(@contigClusterstarts, $extremestart); - push(@contigClusterends, $extremeend); -# print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ; - - foreach my $clustparts (@localclust){ - my @pattern = split(/:/, $clustparts); - # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n"; - push (@result, $starthash[$pattern[0]]{$pattern[1]}); - } - push(@contigcluster, join("\t", @result)); -# print join("\t", @result),"<-result \n" if $prinkter == 1 ; - } - - - my $firstclusterstart = smallest_number(@contigClusterstarts); - my $lastclusterend = largest_number(@contigClusterends); - - - $contigclustersFirstStartOnly{$key2}=$firstclusterstart; - $contigclustersLastEndOnly{$key2} = $lastclusterend; - $contigclusters{$key2}=[ @contigcluster ]; -# print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1; - for my $i (0 ... $#tags){ - #1 check if there exists adjacent alignment block wrt coordinates of this species. - next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. - #2 no need to worry about proximity of anything in adjacent block! - - #1 BELOW, the following is really to calclate the distance between the end coordinate of the - #2 cluster and the end of the gap-free sequence of each species. this is so that if an - #3 adjacent alignment block is found lateron, the exact distance between the potentially - #4 adjacent microsat clusters can be found here. the exact start coordinate will be used - #5 immediately below. 
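# Clarifying aside (not part of the original script): the adjacency test
# below treats a block whose focal-species end coordinate equals the current
# block's start coordinate as its left neighbour, and fuses clusters
# straddling the junction when
#     (shortest gap-free tail after the last cluster of the left block)
#   + (shortest gap-free head before the first cluster of the right block)
# is less than $CLUSTER_DIST; e.g. a 12 bp tail and a 30 bp head fuse under
# the 50 bp radius, since 12 + 30 = 42 < 50.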
- # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1; - - my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart); - my $species_endsubstring = (); - - if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";} - else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);} - -# print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1; -# print "for species: $tags[$i]: \n" if $prinkter == 1; - - $species_startsubstring =~ s/-| //g; - $species_endsubstring =~ s/-| //g; - $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring); - $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring); - - - -# print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1; -# print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1; -# print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1; - -# print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1; - - } - - - } -# print "\ndone the job of filling... \n"; - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - $prinkter=0; - open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; - - my %clusteringpaths=(); - my %clustersholder=(); - my %foundkeys=(); - my %clusteringpathsRev=(); - - - my $totalcount=(); - my $founkeys_enteredcount=(); - my $transfered=0; - my $complete_transfered=0; - my $plain_transfered=0; - my $existing_removed=0; - - while (my $line = <BO>){ -# print "x" x 60, "\n" if $prinkter == 1; - next if $line !~ /^[0-9]+/; - #print $line; - chomp $line; - my @fields2 = split(/\t/,$line); - my $key2 = (); - if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { - $key2 = join("\t",$1, $2, $4, $5); - } - - else {print "seq line $line incompatible\n"; next;} -# print "KEY = : $key2\n" if $prinkter == 1; - - - my @currentcontigstarts=(); - my @currentcontigends=(); - my @currentcontigchrs=(); - my @clusters = (); - my @clusterscopy=(); - if (exists $contigclusters{$key2}){ - @clusters = @{$contigclusters{$key2}}; - @clusterscopy=@clusters; - for my $i (0 ... $#tags){ - # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; - if ($line =~ /$tags[$i]\s([a-zA-Z0-9]+)\s([0-9]+)\s([0-9]+)/){ - # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; - my $startkey = $1."_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; - my $endkey = $1."_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; - $currentcontigchrs[$i]=$1; - $currentcontigstarts[$i]=$2; - $currentcontigends[$i]=$3; - } - else { - $currentcontigchrs[$i] = 0; - # print "no microsat clusters for $key2\n" if $prinkter == 1; next; - } - } - } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} - - my @sequences = (); - for (0 ... 
$#tags){ - my $seq = <BO>; - # print $seq; - chomp $seq; - push(@sequences , " ".$seq); - } - - next if scalar @currentcontigchrs == 0; - - # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1; - my %visitedcontigs=(); - - for my $i (0 ... $#tags){ - #1 check if there exists adjacent alignment block wrt coordinates of this species. - next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. - #2 no need to worry about proximity of anything in adjacent block! - @clusters=@clusterscopy; - #1 BELOW, the following is really to calclate the distance between the end coordinate of the - #2 cluster and the end of the gap-free sequence of each species. this is so that if an - #3 adjacent alignment block is found lateron, the exact distance between the potentially - #4 adjacent microsat clusters can be found here. the exact start coordinate will be used - #5 immediately below. - my $firstclusterstart = $contigclustersFirstStartOnly{$key2}; - my $lastclusterend = $contigclustersLastEndOnly{$key2}; - - my $key3 = $currentcontigchrs[$i]."_".($currentcontigstarts[$i]); -# print "check if exists $key3 in contigends for $i\n" if $prinkter == 1; - - if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){ - $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies# - # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1; - - #1 extract coordinates of the last cluster of this found alignment block -# print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1; - # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1; - # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; - # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; - my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}}; - my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}}; - my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=(); - - for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){ - # print "val = $val\n" if $prinkter == 1; - if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){ - push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/; - } - } - # print "-----\n" if $prinkter == 1; - for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){ - # print "val = $val\n" if $prinkter == 1; - if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){ - push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/; - } - } - - - # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... 
and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1; - # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1; - - # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){ - if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){ - my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}}; - $regurgitate[$#regurgitate]=~s/\n//g; - $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters); - delete $contigclusters{$contigends[$i]{$key3}}; - $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ]; - delete $contigclusters{$key2}; - $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0; - $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0; - - if (scalar(@clusters) < 1){ - # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2}; - $clusteringpaths{$key2}=$contigends[$i]{$key3}; - $clusteringpathsRev{$contigends[$i]{$key3}}=$key2; - print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n"; - # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n"; - $founkeys_enteredcount-- if exists $foundkeys{$key2}; - $existing_removed++ if exists $foundkeys{$key2}; -# print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1; - delete $foundkeys{$key2} if exists $foundkeys{$key2}; - $complete_transfered++; - } - else{ - print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2}; - $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2}; - $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2}; - - $founkeys_enteredcount++ if !exists $foundkeys{$key2}; - $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2}; - # print "adding foundkeys entry $foundkeys{$key2}\n"; - $transfered++; - } - #$contigclusters{$key2}=[ @contigcluster ]; - } - } - else{ - # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1; - $plain_transfered++; - print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; - $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2}; - $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; - $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; - $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; - # print "adding foundkeys entry $foundkeys{$key2}\n"; - - } - $totalcount++; - - } - - - } - close BO; - #close (NORTH); - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - #/////////////////////////////////////////////////////////////////////////////////////// - - my $founkeys_count=(); - my $nopath_count=(); - my $pathed_count=0; - foreach my $key2 (keys %foundkeys){ - #print "x" x 60, "\n"; -# print "x" if $dotcounter % 100 ==0; -# print "\n" if 
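# Clarifying aside (not part of the original script): %clusteringpaths and
# %clusteringpathsRev hold forward and backward links between alignment-block
# keys whose clusters were fused, i.e. one doubly linked chain per run of
# adjacent blocks ("0" marks a chain end). The printing pass below walks each
# chain down to its tail via the reverse links and then climbs back up
# collecting clusters, so every fused cluster is emitted exactly once.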
$dotcounter % 5000 ==0; - $founkeys_count++; - my $key = $key2; -# print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1; - if ($clusteringpaths{$key} eq $key){ -# print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1; - $nopath_count++; - delete $foundkeys{$key2}; - print ORTH join ("\n",@{$contigclusters{$key2}}),"\n"; - } - else{ - my @pool=(); - my $key3=(); - $pathed_count++; -# print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1; -# print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1; - if ($clusteringpathsRev{$key} eq "0") { - next; - } - else{ - my $yek3 = $clusteringpathsRev{$key}; - my $yek = $key; -# print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1; - while ($yek3 ne "0"){ -# print "$yek->$yek3," if $prinkter == 1; - $yek = $yek3; - $yek3 = $clusteringpathsRev{$yek}; - } -# print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1; - $key3 = $clusteringpaths{$yek}; - $key = $yek; - } - -# print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1; - - while($key ne $key3){ -# print "KEEY $key->$key3\n" if $prinkter == 1; -# print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1; - - if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}}; print "now pool = @pool\n" if $prinkter == 1;} - delete $foundkeys{$key3}; - $key=$key3; - $key3=$clusteringpaths{$key}; - } -# print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1; - my @firstcontig= @{$contigclusters{$key}}; - delete $foundkeys{$key2} if exists $foundkeys{$key2} ; - delete $foundkeys{$key} if exists $foundkeys{$key}; - - unshift @pool, pop @firstcontig; -# print join("\t",@pool),"\n" if $prinkter == 1; - print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0; - print ORTH join ("\t",@pool),"\n"; - # join(); - } - - } - #close (NORTH); -# print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1; - close (BO); - close (ORTH); - close (OUTP); - return 1; - -} -sub stringPainter{ - my @string = split(/_/,$_[0]); -# print $_[0], " <- in stringPainter\n"; -# print $_[1], " <- in clusters\n"; - - my @clusters = split(/,/, $_[1]); - for my $i (0 ... $#clusters){ - my $cluster = $clusters[$i]; -# print "cluster = $cluster\n"; - my @parts = split(/\./,$cluster); - my @cord = split(/:|-/,shift(@parts)); - my $minstart = $cord[1]; - my $maxend = $cord[2]; -# print "minstart = $minstart , maxend = $maxend\n"; - - for my $j (0 ... $#parts){ -# print "oing thri $parts[$j]\n"; - my @cord = split(/:|-/,$parts[$j]); - $minstart = $cord[1] if $cord[1] < $minstart; - $maxend = $cord[2] if $cord[2] > $maxend; - } -# print "minstart = $minstart , maxend = $maxend\n"; - for my $pos ($minstart ... 
$maxend){ $string[$pos] = $string[$pos].",".$cluster;} - - - } -# print "@string <-done from function stringPainter\n"; - return join("_",@string); -} - -sub findClusters{ - my $continue = 0; - my @mapped_clusters = (); - my $clusterdist = $_[1]; - my $previous = 'x'; - my @localcluster = (); - my $cluster_starts = (); - my $cluster_ends = (); - my $localcluster_start = (); - my $localcluster_end = (); - my @record_cluster = (); - my @string = split(/\!/, $_[0]); - my $zerolength=0; - - for my $pos_pos (1 ... $#string){ - my $pos = $string[$pos_pos]; -# print $pos, "\n"; - if ($continue == 0 && $pos eq "x") {next;} - - if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ - if ($zerolength == 0) {$localcluster_end = $pos_pos-1}; - $zerolength++; - $continue = 1; - } - - if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { - $zerolength = 0; - $continue = 0; - my %seen; - my @uniqed = grep !$seen{$_}++, @localcluster; -# print "caught cluster : @uniqed \n"; - push(@mapped_clusters, [@uniqed]); -# print "clustered:\n@uniqed\n"; - @localcluster = (); - @record_cluster = (); - - } - - if ($pos ne "x"){ - $zerolength = 0; - $continue = 1; - $pos =~ s/x,//g; - my @entries = split(/,/,$pos); - $localcluster_end = 0; - $localcluster_start = 0; - push(@record_cluster,$pos); - - if ($continue == 0){ - @localcluster = (); - @localcluster = (@localcluster, @entries); - $localcluster_start = $pos_pos; - } - - if ($continue == 1 ) { - @localcluster = (@localcluster, @entries); - } - } - } - - if (scalar(@localcluster) > 0){ - my %seen; - my @uniqed = grep !$seen{$_}++, @localcluster; - # print "caught cluster : @uniqed \n"; - push(@mapped_clusters, [@uniqed]); - # print "clustered:\n@uniqed\n"; - @localcluster = (); - @record_cluster = (); - } - - my @returner = (); - - foreach my $clust (@mapped_clusters){ - my @localclust = @$clust; - my @result = (); - foreach my $clustparts (@localclust){ - push(@result,$clustparts); - } - push(@returner , join(".",@result)); - } -# print "returnig: ", join(",",@returner), "\n"; - return join(",",@returner); -} -#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml --- a/tools/regVariation/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -<tool id="multispecies_orthologous_microsats" name="Extract orthologous microsatellites" version="1.0.1"> - <description> for multiple (>2) species alignments</description> - <command interpreter="perl"> - multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl - $input1 - $out_file1 - $thresholds - $species - "$treedefinition" - $separation - - </command> - <inputs> - <page> - <param format="maf" name="input1" type="data" label="Select MAF alignments"/> - <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellite blocks" - help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/> - <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites" - help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer 
than 3 repeats will be excluded from the output."/> - <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="NOTE: Currently users are requested to select one of these three combinations: hg18-panTro2-ponAbe2, hg18-panTro2-ponAbe2-rheMac2 or hg18-panTro2-ponAbe2-rheMac2-calJac1"> - <options> - <filter type="data_meta" ref="input1" key="species" /> - </options> - </param> - <param name="treedefinition" size="200" type="text" value = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" - help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> - </page> - </inputs> - <outputs> - <data format="txt" name="out_file1" metadata_source="input1"/> - </outputs> - <requirements> - <requirement type="binary">sputnik</requirement> - </requirements> - <tests> - <test> - <param name="input1" value="chr22_5sp.maf"/> - <param name="thresholds" value="9,10,12,12"/> - <param name="species" value="hg18,panTro2,ponAbe2,rheMac2,calJac1"/> - <param name="treedefinition" value="((((hg18, panTro2), ponAbe2), rheMac2), calJac1)"/> - <param name="separation" value="10"/> - <output name="out_file1" file="chr22_5sp.microraw.tabular"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool finds ortholgous microsatellite blocks between aligned species - -</help> - - -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/parseMAF_smallIndels.pl --- a/tools/regVariation/parseMAF_smallIndels.pl Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,698 +0,0 @@ -#!/usr/bin/perl -w -# a program to get indels -# input is a MAF format 3-way alignment file -# from 3-way blocks only at this time -# translate seq2, seq3, etc coordinates to + if align orient is reverse complement - -use strict; -use warnings; - -# declare and initialize variables -my $fh; # variable to store filehandle -my $record; -my $offset; -my $library = $ARGV[0]; -my $count = 0; -my $count2 = 0; -my $count3 = 0; -my $count4 = 0; -my $start1 = my $start2 = my $start3 = my $start4 = my $start5 = my $start6 = 0; -my $orient = ""; -my $outgroup = $ARGV[2]; -my $ingroup1 = my $ingroup2 = ""; -my $count_seq1insert = my $count_seq1delete = 0; -my $count_seq2insert = my $count_seq2delete = 0; -my $count_seq3insert = my $count_seq3delete = 0; -my @seq1_insert_lengths = my @seq1_delete_lengths = (); -my @seq2_insert_lengths = my @seq2_delete_lengths = (); -my @seq3_insert_lengths = my @seq3_delete_lengths = (); -my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); -my @seq1_insert_startOnly = my @seq1_delete_startOnly = my @seq2_insert_startOnly = my @seq2_delete_startOnly = (); -my @seq3_insert_startOnly = my @seq3_delete_startOnly = (); -my @indels = (); - -# check to make sure correct files -my $usage = "usage: parseMAF_smallIndels.pl [MAF.in] [small_Indels_summary.out] [outgroup]\n"; -die $usage unless @ARGV == 3; - -# perform some standard subroutines -$fh = open_file($library); - -$offset = tell($fh); - -#my $ofile = $ARGV[2]; -#unless (open(OFILE, ">$ofile")){ -# print "Cannot open output file \"$ofile\"\n\n"; -# exit; -#} - -my $ofile2 = $ARGV[1]; -unless (open(OFILE2, ">$ofile2")){ - print "Cannot open output file \"$ofile2\"\n\n"; - exit; -} - - -# header line for output files -#print OFILE "# small indel events, parsed from MAF 3-way alignment file, coords are translated from (-) to 
(+) if necessary\n"; -#print OFILE "#align\tingroup1\tingroup1_coord\tingroup1_orient\tingroup2\tingroup2_coord\tingroup2_orient\toutgroup\toutgroup_coord\toutgroup_orient\tindel_type\n"; - -#print OFILE2 "# small indels summary, parsed from MAF 3-way alignment file, coords are translated from (-) to (+) if necessary\n"; -print OFILE2 "#block\tindel_type\tindel_length\tingroup1\tingroup1_start\tingroup1_end\tingroup1_alignSize\tingroup1_orient\tingroup2\tingroup2_start\tingroup2_end\tingroup2_alignSize\tingroup2_orient\toutgroup\toutgroup_start\toutgroup_end\toutgroup_alignSize\toutgroup_orient\n"; - -# main body of program -while ($record = get_next_record($fh) ){ - if ($record=~ m/\s*##maf(.*)\s*# maf/s){ - next; - } - - my @sequences = get_sequences_within_block($record); - my @seq_info = get_indels_within_block(@sequences); - get_indels_lengths(@seq_info); - - $offset = tell($fh); - $count++; - -} - -get_starts_only(@seq1_insert); -get_starts_only(@seq1_delete); -get_starts_only(@seq2_insert); -get_starts_only(@seq2_delete); -get_starts_only(@seq3_insert); -get_starts_only(@seq3_delete); - -# print some things to keep track of progress -#print "# $library\n"; -#print "# number of records = $count\n"; -#print "# number of sequence \"s\" lines = $count2\n"; -if ($count3 != 0){ - print "Skipped $count3 blocks with only 2 seqs;\n"; -} -#print "# number of records with only h-m = $count4\n\n"; - -print "Ingroup1 = $ingroup1; Ingroup2 = $ingroup2; Outgroup = $outgroup;\n"; -print "# of ingroup1 inserts = $count_seq1insert;\n"; -print "# of ingroup1 deletes = $count_seq1delete;\n"; -print "# of ingroup2 inserts = $count_seq2insert;\n"; -print "# of ingroup2 deletes = $count_seq2delete;\n"; -print "# of outgroup3 inserts = $count_seq3insert;\n"; -print "# of outgroup3 deletes = $count_seq3delete\n"; - - -#close OFILE; - -if ($count == $count3){ - print STDERR "Skipped all blocks since none of them contain 3-way alignments.\n"; - exit -1; -} - -###################SUBROUTINES##################################### - -# subroutine to open file -sub open_file { - my($filename) = @_; - my $fh; - - unless (open($fh, $filename)){ - print "Cannot open file $filename\n"; - exit; - } - return $fh; -} - -# get next record -sub get_next_record { - my ($fh) = @_; - my ($offset); - my ($record) = ""; - my ($save_input_separator) = $/; - - $/ = "a score"; - - $record = <$fh>; - - $/ = $save_input_separator; - return $record; -} - -# get header info -sub get_sequences_within_block{ - my (@alignment) = @_; - my @lines = (); - - my @sequences = (); - - @lines = split ("\n", $record); - foreach (@lines){ - chomp($_); - if (m/^\s*$/){ - next; - } - elsif (m/^\s*=(\d+\.*\d*)/){ - - }elsif (m/^\s*s(.*)$/){ - $count2++; - - push (@sequences,$_); - } - } - return @sequences; -} - -sub get_indels_within_block{ - my (@sequences) = @_; - my $line1 = my $line2 = my $line3 = ""; - my @line1 = my @line2 = my @line3 = (); - my $score = 0; - my $start1 = my $align_length1 = my $end1 = my $seq_length1 = 0; - my $start2 = my $align_length2 = my $end2 = my $seq_length2 = 0; - my $start3 = my $align_length3 = my $end3 = my $seq_length3 = 0; - my $seq1 = my $orient1 = ""; - my $seq2 = my $orient2 = ""; - my $seq3 = my $orient3 = ""; - my $start1_plus = my $end1_plus = 0; - my $start2_plus = my $end2_plus = 0; - my $start3_plus = my $end3_plus = 0; - my @test = (); - my $test = ""; - my $header = ""; - my @header = (); - my $sequence1 = my $sequence2 = my $sequence3 =""; - my @array_return = (); - my $test1 = 0; - my $line1_stat = 
my $line2_stat = my $line3_stat = ""; - - # process 3-way blocks only - if (scalar(@sequences) == 3){ - $line1 = $sequences[0]; - chomp ($line1); - $line2 = $sequences[1]; - chomp ($line2); - $line3 = $sequences[2]; - chomp ($line3); - # check order of sequences and assign uniformly seq1= human, seq2 = chimp, seq3 = macaque - if ($line1 =~ m/$outgroup/){ - $line1_stat = "out"; - $line2=~ s/^\s*//; - $line2 =~ s/\s+/\t/g; - @line2 = split(/\t/, $line2); - if (($ingroup1 eq "") || ($line2[1] =~ m/$ingroup1/)){ - $line2_stat = "in1"; - $line3_stat = "in2"; - } - else{ - $line3_stat = "in1"; - $line2_stat = "in2"; } - } - elsif ($line2 =~ m/$outgroup/){ - $line2_stat = "out"; - $line1=~ s/^\s*//; - $line1 =~ s/\s+/\t/g; - @line1 = split(/\t/, $line1); - if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ - $line1_stat = "in1"; - $line3_stat = "in2"; - } - else{ - $line3_stat = "in1"; - $line1_stat = "in2"; } - } - elsif ($line3 =~ m/$outgroup/){ - $line3_stat = "out"; - $line1=~ s/^\s*//; - $line1 =~ s/\s+/\t/g; - @line1 = split(/\t/, $line1); - if (($ingroup1 eq "") || ($line1[1] =~ m/$ingroup1/)){ - $line1_stat = "in1"; - $line2_stat = "in2"; - } - else{ - $line2_stat = "in1"; - $line1_stat = "in2"; } - } - - #print "# l1 = $line1_stat\n"; - #print "# l2 = $line2_stat\n"; - #print "# l3 = $line3_stat\n"; - my $linei1 = my $linei2 = my $lineo = ""; - my @linei1 = my @linei2 = my @lineo = (); - - if ($line1_stat eq "out"){ - $lineo = $line1; - } - elsif ($line1_stat eq "in1"){ - $linei1 = $line1; - } - else{ - $linei2 = $line1; - } - - if ($line2_stat eq "out"){ - $lineo = $line2; - } - elsif ($line2_stat eq "in1"){ - $linei1 = $line2; - } - else{ - $linei2 = $line2; - } - - if ($line3_stat eq "out"){ - $lineo = $line3; - } - elsif ($line3_stat eq "in1"){ - $linei1 = $line3; - } - else{ - $linei2 = $line3; - } - - $linei1=~ s/^\s*//; - $linei1 =~ s/\s+/\t/g; - @linei1 = split(/\t/, $linei1); - $end1 =($linei1[2]+$linei1[3]-1); - $seq1 = $linei1[1].":".$linei1[3]; - $ingroup1 = (split(/\./, $seq1))[0]; - $start1 = $linei1[2]; - $align_length1 = $linei1[3]; - $orient1 = $linei1[4]; - $seq_length1 = $linei1[5]; - $sequence1 = $linei1[6]; - $test1 = length($sequence1); - my $total_length1 = $test1+$start1; - my @array1 = ($start1,$end1,$orient1,$seq_length1); - ($start1_plus, $end1_plus) = convert_coords(@array1); - - $linei2=~ s/^\s*//; - $linei2 =~ s/\s+/\t/g; - @linei2 = split(/\t/, $linei2); - $end2 =($linei2[2]+$linei2[3]-1); - $seq2 = $linei2[1].":".$linei2[3]; - $ingroup2 = (split(/\./, $seq2))[0]; - $start2 = $linei2[2]; - $align_length2 = $linei2[3]; - $orient2 = $linei2[4]; - $seq_length2 = $linei2[5]; - $sequence2 = $linei2[6]; - my $test2 = length($sequence2); - my $total_length2 = $test2+$start2; - my @array2 = ($start2,$end2,$orient2,$seq_length2); - ($start2_plus, $end2_plus) = convert_coords(@array2); - - $lineo=~ s/^\s*//; - $lineo =~ s/\s+/\t/g; - @lineo = split(/\t/, $lineo); - $end3 =($lineo[2]+$lineo[3]-1); - $seq3 = $lineo[1].":".$lineo[3]; - $start3 = $lineo[2]; - $align_length3 = $lineo[3]; - $orient3 = $lineo[4]; - $seq_length3 = $lineo[5]; - $sequence3 = $lineo[6]; - my $test3 = length($sequence3); - my $total_length3 = $test3+$start3; - my @array3 = ($start3,$end3,$orient3,$seq_length3); - ($start3_plus, $end3_plus) = convert_coords(@array3); - - #print "# l1 = $ingroup1\n"; - #print "# l2 = $ingroup2\n"; - #print "# l3 = $outgroup\n"; - - my $ABC = ""; - my $coord1 = my $coord2 = my $coord3 = 0; - $coord1 = $start1_plus; - $coord2 = $start2_plus; - $coord3 = 
$start3_plus; - - for (my $position = 0; $position < $test1; $position++) { - my $indelType = ""; - my $indel_line = ""; - # seq1 deletes - if ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ - $ABC = join("",($ABC,"X")); - my @s = split(/:/, $seq1); - $indelType = $s[0]."_delete"; - - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq1_delete,$indel_line); - $coord2++; $coord3++; - } - # seq2 deletes - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) !~ m/[-*\$?^]/)){ - $ABC = join("",($ABC,"Y")); - my @s = split(/:/, $seq2); - $indelType = $s[0]."_delete"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq2_delete,$indel_line); - $coord1++; - $coord3++; - - } - # seq1 inserts - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"Z")); - my @s = split(/:/, $seq1); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq1_insert,$indel_line); - $coord1++; - } - # seq2 inserts - elsif ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"W")); - my @s = split(/:/, $seq2); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq2_insert,$indel_line); - $coord2++; - } - # seq3 deletes - elsif ((substr($sequence1,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence2,$position,1) !~ m/[-*\#$?^@]/) - && (substr($sequence3,$position,1) eq "-")){ - $ABC = join("",($ABC,"S")); - my @s = split(/:/, $seq3); - $indelType = $s[0]."_delete"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - push (@indels,$indel_line); - push (@seq3_delete,$indel_line); - $coord1++; $coord2++; - } - # seq3 inserts - elsif ((substr($sequence1,$position,1) eq "-") - && (substr($sequence2,$position,1) eq "-") - && (substr($sequence3,$position,1) !~ m/[-*\#$?^@]/)){ - $ABC = join("",($ABC,"T")); - my @s = split(/:/, $seq3); - $indelType = $s[0]."_insert"; - #print OFILE "$count\t$seq1\t$coord1\t$orient1\t$seq2\t$coord2\t$orient2\t$seq3\t$coord3\t$orient3\t$indelType\n"; - $indel_line = join("\t",($count,$seq1,$coord1,$orient1,$seq2,$coord2,$orient2,$seq3,$coord3,$orient3,$indelType)); - 
push (@indels,$indel_line); - push (@seq3_insert,$indel_line); - $coord3++; - }else{ - $ABC = join("",($ABC,"N")); - $coord1++; $coord2++; $coord3++; - } - - } - @array_return=($seq1,$seq2,$seq3,$ABC); - return (@array_return); - - } - # ignore pairwise cases for now, just count the number of blocks - elsif (scalar(@sequences) == 2){ - my $ABC = ""; - my $coord1 = my $coord2 = my $coord3 = 0; - $count3++; - - $line1 = $sequences[0]; - $line2 = $sequences[1]; - chomp ($line1); - chomp ($line2); - - if ($line2 !~ m/$ingroup2/){ - $count4++; - } - } -} - - -sub get_indels_lengths{ - my (@array) = @_; - if (scalar(@array) == 4){ - my $seq1 = $array[0]; my $seq2 = $array[1]; my $seq3 = $array[2]; my $ABC = $array[3]; - - while ($ABC =~ m/(X+)/g) { - push (@seq1_delete_lengths,length($1)); - $count_seq1delete++; - } - - while ($ABC =~ m/(Y+)/g) { - push (@seq2_delete_lengths,length($1)); - $count_seq2delete++; - } - while ($ABC =~ m/(S+)/g) { - push (@seq3_delete_lengths,length($1)); - $count_seq3delete++; - } - while ($ABC =~ m/(Z+)/g) { - push (@seq1_insert_lengths,length($1)); - $count_seq1insert++; - } - while ($ABC =~ m/(W+)/g) { - push(@seq2_insert_lengths,length($1)); - $count_seq2insert++; - } - while ($ABC =~ m/(T+)/g) { - push (@seq3_insert_lengths,length($1)); - $count_seq3insert++; - } - } - elsif (scalar(@array) == 0){ - next; - } - -} -# convert to coordinates to + strand if align orient = - -sub convert_coords{ - my (@array) = @_; - my $s = $array[0]; - my $e = $array[1]; - my $o = $array[2]; - my $l = $array[3]; - my $start_plus = my $end_plus = 0; - - if ($o eq "-"){ - $start_plus = ($l - $e); - $end_plus = ($l - $s); - }elsif ($o eq "+"){ - $start_plus = $s; - $end_plus = $e; - } - - return ($start_plus, $end_plus); -} - -# get first line only for each event -sub get_starts_only{ - my (@test) = @_; - my $seq1 = my $seq2 = my $seq3 = my $indelType = my $old_seq1 = my $old_seq2 = my $old_seq3 = my $old_indelType = my $old_line = ""; - my $coord1 = my $coord2 = my $coord3 = my $old_coord1 = my $old_coord2 = my $old_coord3 = 0; - - my @matches = (); - my @seq1_insert = my @seq1_delete = my @seq2_insert = my @seq2_delete = my @seq3_insert = my @seq3_delete = (); - - - foreach my $line (@test){ - chomp($line); - $line =~ s/^\s*//; - $line =~ s/\s+/\t/g; - my @line1 = split(/\t/, $line); - $seq1 = $line1[1]; - $coord1 = $line1[2]; - $seq2 = $line1[4]; - $coord2 = $line1[5]; - $seq3 = $line1[7]; - $coord3 = $line1[8]; - $indelType = $line1[10]; - if ($indelType =~ m/$ingroup1/ && $indelType =~ m/insert/){ - if ($coord1 != $old_coord1+1 || ($coord2 != $old_coord2 || $coord3 != $old_coord3)){ - $start1++; - push (@seq1_insert_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup1/ && $indelType =~ m/delete/){ - if ($coord1 != $old_coord1 || ($coord2 != $old_coord2+1 || $coord3 != $old_coord3+1)){ - $start2++; - push(@seq1_delete_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/insert/){ - if ($coord2 != $old_coord2+1 || ($coord1 != $old_coord1 || $coord3 != $old_coord3)){ - $start3++; - push(@seq2_insert_startOnly,$line); - } - } - elsif ($indelType =~ m/$ingroup2/ && $indelType =~ m/delete/){ - if ($coord2 != $old_coord2 || ($coord1 != $old_coord1+1 || $coord3 != $old_coord3+1)){ - $start4++; - push(@seq2_delete_startOnly,$line); - } - } - elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/insert/){ - if ($coord3 != $old_coord3+1 || ($coord1 != $old_coord1 || $coord2 != $old_coord2)){ - $start5++; - push(@seq3_insert_startOnly,$line); - } - } - 
elsif ($indelType =~ m/$outgroup/ && $indelType =~ m/delete/){ - if ($coord3 != $old_coord3 || ($coord1 != $old_coord1+1 ||$coord2 != $old_coord2+1)){ - $start6++; - push(@seq3_delete_startOnly,$line); - } - } - $old_indelType = $indelType; - $old_seq1 = $seq1; - $old_coord1 = $coord1; - $old_seq2 = $seq2; - $old_coord2 = $coord2; - $old_seq3 = $seq3; - $old_coord3 = $coord3; - $old_line = $line; - } -} -# append lengths to each event start line -my $counter1; my $counter2; my $counter3; my $counter4; my $counter5; my $counter6; -my @final1 = my @final2 = my @final3 = my @final4 = my @final5 = my @final6 = (); -my $final_line1 = my $final_line2 = my $final_line3 = my $final_line4 = my $final_line5 = my $final_line6 = ""; - - -for ($counter1 = 0; $counter1 < @seq3_insert_startOnly; $counter1++){ - $final_line1 = join("\t",($seq3_insert_startOnly[$counter1],$seq3_insert_lengths[$counter1])); - push (@final1,$final_line1); -} - -for ($counter2 = 0; $counter2 < @seq3_delete_startOnly; $counter2++){ - $final_line2 = join("\t",($seq3_delete_startOnly[$counter2],$seq3_delete_lengths[$counter2])); - push(@final2,$final_line2); -} - -for ($counter3 = 0; $counter3 < @seq2_insert_startOnly; $counter3++){ - $final_line3 = join("\t",($seq2_insert_startOnly[$counter3],$seq2_insert_lengths[$counter3])); - push(@final3,$final_line3); -} - -for ($counter4 = 0; $counter4 < @seq2_delete_startOnly; $counter4++){ - $final_line4 = join("\t",($seq2_delete_startOnly[$counter4],$seq2_delete_lengths[$counter4])); - push(@final4,$final_line4); -} - -for ($counter5 = 0; $counter5 < @seq1_insert_startOnly; $counter5++){ - $final_line5 = join("\t",($seq1_insert_startOnly[$counter5],$seq1_insert_lengths[$counter5])); - push(@final5,$final_line5); -} - -for ($counter6 = 0; $counter6 < @seq1_delete_startOnly; $counter6++){ - $final_line6 = join("\t",($seq1_delete_startOnly[$counter6],$seq1_delete_lengths[$counter6])); - push(@final6,$final_line6); -} - -# format final output -# # if inserts, increase coords for the sequence inserted, other sequences give coords for 5' and 3' bases flanking the gap -# # for deletes, increase coords for other 2 sequences and the one deleted give coords for 5' and 3' bases flanking the gap - -get_final_format(@final5); -get_final_format(@final6); -get_final_format(@final3); -get_final_format(@final4); -get_final_format(@final1); -get_final_format(@final2); - -sub get_final_format{ - my (@final) = @_; - foreach (@final){ - my $event_line = $_; - my @events = split(/\t/, $event_line); - my $event_type = $events[10]; - my @name_align1 = split(/:/, $events[1]); - my @name_align2 = split(/:/, $events[4]); - my @name_align3 = split(/:/, $events[7]); - my $seq1_event_start = my $seq1_event_end = my $seq2_event_start = my $seq2_event_end = my $seq3_event_start = my $seq3_event_end = 0; - my $final_event_line = ""; - # seq1_insert - if ($event_type =~ m/$ingroup1/ && $event_type =~ m/insert/){ - # only increase coord for human - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = 
join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq1_delete - elsif ($event_type =~ m/$ingroup1/ && $event_type =~ m/delete/){ - # only increase coords for seq2 and seq3 - # remember for seq1, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]); - $seq2_event_end = ($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq2_insert - elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/insert/){ - # only increase coords for seq2 - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]); - $seq2_event_end = ($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq2_delete - elsif ($event_type =~ m/$ingroup2/ && $event_type =~ m/delete/){ - # only increase coords for seq1 and seq3 - # remember for seq2, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # start testing w/seq3_insert - elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/insert/){ - # only increase coord for rheMac - # remember that other two sequnences, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]-1); - $seq1_event_end = ($events[2]); - $seq2_event_start = ($events[5]-1); - $seq2_event_end = ($events[5]); - $seq3_event_start = ($events[8]); - $seq3_event_end = ($events[8]+$events[11]-1); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - } - # seq3_delete - elsif ($event_type =~ m/$outgroup/ && $event_type =~ m/delete/){ - # only increase coords for seq1 and seq2 - # remember for seq3, the gap spans (coord - 1) --> coord - $seq1_event_start = ($events[2]); - $seq1_event_end = ($events[2]+$events[11]-1); - $seq2_event_start = ($events[5]); - $seq2_event_end = 
($events[5]+$events[11]-1); - $seq3_event_start = ($events[8]-1); - $seq3_event_end = ($events[8]); - $final_event_line = join("\t",($events[0],$event_type,$events[11],$name_align1[0],$seq1_event_start,$seq1_event_end,$name_align1[1],$events[3],$name_align2[0],$seq2_event_start,$seq2_event_end,$name_align2[1],$events[6],$name_align3[0],$seq3_event_start,$seq3_event_end,$name_align3[1],$events[9])); - - } - - print OFILE2 "$final_event_line\n"; - } -} -close OFILE2; diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/quality_filter.py --- a/tools/regVariation/quality_filter.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,245 +0,0 @@ -#!/usr/bin/env python -#Guruprasad Ananda -""" -Filter based on nucleotide quality (PHRED score). - -usage: %prog input out_file primary_species mask_species score mask_char mask_region mask_region_length -""" - - -from __future__ import division -from galaxy import eggs -import pkg_resources -pkg_resources.require( "bx-python" ) -pkg_resources.require( "lrucache" ) -try: - pkg_resources.require("numpy") -except: - pass - -import psyco_full -import sys -import os, os.path -from UserDict import DictMixin -from bx.binned_array import BinnedArray, FileBinnedArray -from bx.bitset import * -from bx.bitset_builders import * -from fpconst import isNaN -from bx.cookbook import doc_optparse -from galaxy.tools.exception_handling import * -import bx.align.maf - -class FileBinnedArrayDir( DictMixin ): - """ - Adapter that makes a directory of FileBinnedArray files look like - a regular dict of BinnedArray objects. - """ - def __init__( self, dir ): - self.dir = dir - self.cache = dict() - def __getitem__( self, key ): - value = None - if key in self.cache: - value = self.cache[key] - else: - fname = os.path.join( self.dir, "%s.qa.bqv" % key ) - if os.path.exists( fname ): - value = FileBinnedArray( open( fname ) ) - self.cache[key] = value - if value is None: - raise KeyError( "File does not exist: " + fname ) - return value - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def load_scores_ba_dir( dir ): - """ - Return a dict-like object (keyed by chromosome) that returns - FileBinnedArray objects created from "key.ba" files in `dir` - """ - return FileBinnedArrayDir( dir ) - -def bitwise_and ( string1, string2, maskch ): - result=[] - for i,ch in enumerate(string1): - try: - ch = int(ch) - except: - pass - if string2[i] == '-': - ch = 1 - if ch and string2[i]: - result.append(string2[i]) - else: - result.append(maskch) - return ''.join(result) - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - #chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, out_file, pri_species, mask_species, qual_cutoff, mask_chr, mask_region, mask_length, loc_file = args - qual_cutoff = int(qual_cutoff) - mask_chr = int(mask_chr) - mask_region = int(mask_region) - if mask_region != 3: - mask_length = int(mask_length) - else: - mask_length_r = int(mask_length.split(',')[0]) - mask_length_l = int(mask_length.split(',')[1]) - except: - stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." ) - - if pri_species == 'None': - stop_err( "No primary species selected, try again by selecting at least one primary species." ) - if mask_species == 'None': - stop_err( "No mask species selected, try again by selecting at least one species to mask." 
) - - mask_chr_count = 0 - mask_chr_dict = {0:'#', 1:'$', 2:'^', 3:'*', 4:'?', 5:'N'} - mask_reg_dict = {0:'Current pos', 1:'Current+Downstream', 2:'Current+Upstream', 3:'Current+Both sides'} - - #ensure dbkey is present in the twobit loc file - filepath = None - try: - pspecies_all = pri_species.split(',') - pspecies_all2 = pri_species.split(',') - pspecies = [] - filepaths = [] - for line in open(loc_file): - if pspecies_all2 == []: - break - if line[0:1] == "#": - continue - fields = line.split('\t') - try: - build = fields[0] - for i,dbkey in enumerate(pspecies_all2): - if dbkey == build: - pspecies.append(build) - filepaths.append(fields[1]) - del pspecies_all2[i] - else: - continue - except: - pass - except Exception, exc: - stop_err( 'Initialization errorL %s' % str( exc ) ) - - if len(pspecies) == 0: - stop_err( "Quality scores are not available for the following genome builds: %s" % ( pspecies_all2 ) ) - if len(pspecies) < len(pspecies_all): - print "Quality scores are not available for the following genome builds: %s" %(pspecies_all2) - - scores_by_chrom = [] - #Get scores for all the primary species - for file in filepaths: - scores_by_chrom.append(load_scores_ba_dir( file.strip() )) - - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - maf_writer = bx.align.maf.Writer( open(out_file,'w') ) - except Exception, e: - stop_err( "Your MAF file appears to be malformed: %s" % str( e ) ) - - maf_count = 0 - for block in maf_reader: - status_strings = [] - for seq in range (len(block.components)): - src = block.components[seq].src - dbkey = src.split('.')[0] - chr = src.split('.')[1] - if not (dbkey in pspecies): - continue - else: #enter if the species is a primary species - index = pspecies.index(dbkey) - sequence = block.components[seq].text - s_start = block.components[seq].start - size = len(sequence) #this includes the gaps too - status_str = '1'*size - status_list = list(status_str) - if status_strings == []: - status_strings.append(status_str) - ind = 0 - s_end = block.components[seq].end - #Get scores for the entire sequence - try: - scores = scores_by_chrom[index][chr][s_start:s_end] - except: - continue - pos = 0 - while pos < (s_end-s_start): - if sequence[ind] == '-': #No score for GAPS - ind += 1 - continue - score = scores[pos] - if score < qual_cutoff: - score = 0 - - if not(score): - if mask_region == 0: #Mask Corresponding position only - status_list[ind] = '0' - ind += 1 - pos += 1 - elif mask_region == 1: #Mask Corresponding position + downstream neighbors - for n in range(mask_length+1): - try: - status_list[ind+n] = '0' - except: - pass - ind = ind + mask_length + 1 - pos = pos + mask_length + 1 - elif mask_region == 2: #Mask Corresponding position + upstream neighbors - for n in range(mask_length+1): - try: - status_list[ind-n] = '0' - except: - pass - ind += 1 - pos += 1 - elif mask_region == 3: #Mask Corresponding position + neighbors on both sides - for n in range(-mask_length_l,mask_length_r+1): - try: - status_list[ind+n] = '0' - except: - pass - ind = ind + mask_length_r + 1 - pos = pos + mask_length_r + 1 - else: - pos += 1 - ind += 1 - - status_strings.append(''.join(status_list)) - - if status_strings == []: #this block has no primary species - continue - output_status_str = status_strings[0] - for stat in status_strings[1:]: - try: - output_status_str = bitwise_and (status_strings[0], stat, '0') - except Exception, e: - break - - for seq in range (len(block.components)): - src = block.components[seq].src - dbkey = src.split('.')[0] - if 
dbkey not in mask_species.split(','): - continue - sequence = block.components[seq].text - sequence = bitwise_and (output_status_str, sequence, mask_chr_dict[mask_chr]) - block.components[seq].text = sequence - mask_chr_count += output_status_str.count('0') - maf_writer.write(block) - maf_count += 1 - - maf_reader.close() - maf_writer.close() - print "No. of blocks = %d; No. of masked nucleotides = %s; Mask character = %s; Mask region = %s; Cutoff used = %d" %(maf_count, mask_chr_count, mask_chr_dict[mask_chr], mask_reg_dict[mask_region], qual_cutoff) - - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/quality_filter.xml --- a/tools/regVariation/quality_filter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<tool id="qualityFilter" name="Filter nucleotides" version="1.0.1"> - <description> based on quality scores</description> - <command interpreter="python"> - quality_filter.py - $input - $out_file1 - $primary_species - $mask_species - $score - $mask_char - ${mask_region.region} - #if $mask_region.region == "3" - ${mask_region.lengthr},${mask_region.lengthl} - #elif $mask_region.region == "0" - 1 - #else - ${mask_region.length} - #end if - ${GALAXY_DATA_INDEX_DIR}/quality_scores.loc - </command> - <inputs> - <param format="maf" name="input" type="data" label="Select data"/> - <param name="primary_species" type="select" label="Use quality scores of" display="checkboxes" multiple="true"> - <options> - <filter type="data_meta" ref="input" key="species" /> - </options> - </param> - <param name="mask_species" type="select" label="Mask Species" display="checkboxes" multiple="true"> - <options> - <filter type="data_meta" ref="input" key="species" /> - </options> - </param> - <param name="score" size="10" type="integer" value="20" label="Quality score cut-off" help="Cut-off value of 20 means mask all nucleotides having quality score less than or equal to 20"/> - <param name="mask_char" size="5" type="select" label="Mask character"> - <option value="0" selected="true">#</option> - <option value="1">$</option> - <option value="2">^</option> - <option value="3">*</option> - <option value="4">?</option> - <option value="5">N</option> - </param> - <conditional name="mask_region"> - <param name="region" type="select" label="Mask region"> - <option value="0" selected="true">Only the corresponding nucleotide </option> - <option value="1">Corresponding column + right-side neighbors</option> - <option value="2">Corresponding column + left-side neighbors</option> - <option value="3">Corresponding column + neighbors on both sides</option> - </param> - <when value="0"> - </when> - <when value="1"> - <param name="length" size="10" type="integer" value="2" label="Number of right-side neighbors"/> - </when> - <when value="2"> - <param name="length" size="10" type="integer" value="2" label="Number of left-side neighbors"/> - </when> - <when value="3"> - <param name="lengthr" size="10" type="integer" value="2" label="Number of neighbors on right-side" /> - <param name="lengthl" size="10" type="integer" value="2" label="Number of neighbors on left-side" /> - </when> - </conditional> - </inputs> - <outputs> - <data format="maf" name="out_file1" metadata_source="input"/> - </outputs> - <requirements> - <requirement type="python-module">numpy</requirement> - </requirements> - <tests> - <test> - <param name="input" value="6.maf"/> - <param name="primary_species" value="panTro2"/> - <param name="mask_species" value="hg18"/> - 
<param name="score" value="50"/> - <param name="mask_char" value="0"/> - <param name="region" value="0" /> - <output name="out_file1" file="6_quality_filter.maf"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool takes a MAF file as input and filters nucleotides in every alignment block of the MAF file based on their quality/PHRED scores. - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing the primary species (species whose quality scores is to be used), will be omitted. -Also, any primary species whose quality scores are not available in Galaxy will be considered as a non-primary species. This info will appear as a message in the job history panel. - ------ - -**Example** - -- For the following alignment block:: - - a score=4050.0 - s hg18.chrX 3719221 48 - 154913754 tattttacatttaaaataaatatgtaaatatatattttatatttaaaa - s panTro2.chrX 3560945 48 - 155361357 tattttatatttaaaataaagatgtaaatatatattttatatttaaaa - -- running this tool with **Primary species as panTro2**, **Mask species as hg18, panTro2**, **Quality cutoff as 20**, **Mask character as #** and **Mask region as only the corresponding position** will return:: - - a score=4050.0 - s hg18.chrX 3719221 48 - 154913754 ###tttac#####a###a#atatgtaaat###tattt#####ttaaaa - s panTro2.chrX 3560945 48 - 155361357 ###tttat#####a###a#agatgtaaat###tattt#####ttaaaa - - where, the positions containing # represent panTro2 nucleotides having quality scores less than 20. - </help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/qv_to_bqv.py --- a/tools/regVariation/qv_to_bqv.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -""" -Adapted from bx/scripts/qv_to_bqv.py - -Convert a qual (qv) file to several BinnedArray files for fast seek. -This script takes approximately 4 seconds per 1 million base pairs. - -The input format is fasta style quality -- fasta headers followed by -whitespace separated integers. - -usage: %prog qual_file output_file -""" - -import pkg_resources -pkg_resources.require( "bx-python" ) -pkg_resources.require( "numpy" ) -import string -import psyco_full -import sys, re, os, tempfile -from bx.binned_array import BinnedArrayWriter -from bx.cookbook import * -import fileinput - -def load_scores_ba_dir( dir ): - """ - Return a dict-like object (keyed by chromosome) that returns - FileBinnedArray objects created from "key.ba" files in `dir` - """ - return FileBinnedArrayDir( dir ) - -def main(): - args = sys.argv[1:] - try: - qual_file_dir = args[0] - #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa" - mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv" - qual_file_dir = mydir.replace(mydir.split("/")[-1], "") - output_file = args[ 1 ] - fo = open(output_file,"w") - except: - print "usage: qual_file output_file" - sys.exit() - - tmpfile = tempfile.NamedTemporaryFile() - cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name - os.system (cmdline) - for qual_file in tmpfile.readlines(): - qual = fileinput.FileInput( qual_file.strip() ) - outfile = None - outbin = None - base_count = 0 - mega_count = 0 - - for line in qual: - line = line.rstrip("\r\n") - if line.startswith(">"): - # close old - if outbin and outfile: - print "\nFinished region " + region + " at " + str(base_count) + " base pairs." - outbin.finish() - outfile.close() - # start new file - region = line.lstrip(">") - #outfname = output_file + "." 
+ region + ".bqv" #CHANGED - outfname = qual_file.strip() + ".bqv" - print >>fo, "Writing region " + region + " to file " + outfname - outfile = open( outfname , "wb") - outbin = BinnedArrayWriter(outfile, typecode='b', default=0) - base_count = 0 - mega_count = 0 - else: - if outfile and outbin: - nums = line.split() - for val in nums: - outval = int(val) - assert outval <= 255 and outval >= 0 - outbin.write(outval) - base_count += 1 - if (mega_count * 1000000) <= base_count: - sys.stdout.write(str(mega_count)+" ") - sys.stdout.flush() - mega_count = base_count // 1000000 + 1 - if outbin and outfile: - print "\nFinished region " + region + " at " + str(base_count) + " base pairs." - outbin.finish() - outfile.close() - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/qv_to_bqv.xml --- a/tools/regVariation/qv_to_bqv.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,17 +0,0 @@ -<tool id="qv2bqv" name="qv2bqv"> - <description></description> - <command interpreter="python">qv_to_bqv.py "$input1" $output</command> - <inputs> - <param name="input1" type="data" format="interval" help="Directory" /> - </inputs> - <outputs> - <data format="text" name="output" metadata_source="input1" /> - </outputs> - <tests> - <test> - <param name="input1" value="1.bed" /> - <param name="input2" value="2.bed" /> - <output name="output" file="gops-coverage.dat" /> - </test> - </tests> -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/rcve.py --- a/tools/regVariation/rcve.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,143 +0,0 @@ -#!/usr/bin/env python - -from galaxy import eggs - -import sys, string -from rpy import * -import numpy - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -def sscombs(s): - if len(s) == 1: - return [s] - else: - ssc = sscombs(s[1:]) - return [s[0]] + [s[0]+comb for comb in ssc] + ssc - - -infile = sys.argv[1] -y_col = int(sys.argv[2])-1 -x_cols = sys.argv[3].split(',') -outfile = sys.argv[4] - -print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) -fout = open(outfile,'w') - -for i, line in enumerate( file ( infile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - -if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - -y_vals = [] -x_vals = [] - -for k,col in enumerate(x_cols): - x_cols[k] = int(col)-1 - x_vals.append([]) - """ - try: - float( elems[x_cols[k]] ) - except: - try: - msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) - except: - msg = "This operation cannot be performed on non-numeric data." 
- stop_err( msg )
- """
-NA = 'NA'
-for ind,line in enumerate( file( infile )):
- if line and not line.startswith( '#' ):
- try:
- fields = line.split("\t")
- try:
- yval = float(fields[y_col])
- except Exception, ey:
- yval = r('NA')
- #print >>sys.stderr, "ey = %s" %ey
- y_vals.append(yval)
- for k,col in enumerate(x_cols):
- try:
- xval = float(fields[col])
- except Exception, ex:
- xval = r('NA')
- #print >>sys.stderr, "ex = %s" %ex
- x_vals[k].append(xval)
- except:
- pass
-
-x_vals1 = numpy.asarray(x_vals).transpose()
-dat= r.list(x=array(x_vals1), y=y_vals)
-
-set_default_mode(NO_CONVERSION)
-try:
- full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
-except RException, rex:
- stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contains no numeric values.")
-set_default_mode(BASIC_CONVERSION)
-
-summary = r.summary(full)
-fullr2 = summary.get('r.squared','NA')
-
-if fullr2 == 'NA':
- stop_err("Error in linear regression")
-
-if len(x_vals) < 10:
- s = ""
- for ch in range(len(x_vals)):
- s += str(ch)
-else:
- stop_err("This tool only works with fewer than 10 predictors.")
-
-print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
-all_combos = sorted(sscombs(s), key=len)
-all_combos.reverse()
-for j,cols in enumerate(all_combos):
- #if len(cols) == len(s): #Same as the full model above
- # continue
- if len(cols) == 1:
- x_vals1 = x_vals[int(cols)]
- else:
- x_v = []
- for col in cols:
- x_v.append(x_vals[int(col)])
- x_vals1 = numpy.asarray(x_v).transpose()
- dat= r.list(x=array(x_vals1), y=y_vals)
- set_default_mode(NO_CONVERSION)
- red = r.lm(r("y ~ x"), data= dat) #Reduced model
- set_default_mode(BASIC_CONVERSION)
- summary = r.summary(red)
- redr2 = summary.get('r.squared','NA')
- try:
- rcve = (float(fullr2)-float(redr2))/float(fullr2)
- except:
- rcve = 'NA'
- col_str = ""
- for col in cols:
- col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
- col_str = col_str.strip()
- rcve_col_str = ""
- for col in s:
- if col not in cols:
- rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
- rcve_col_str = rcve_col_str.strip()
- if len(cols) == len(s): #full model
- rcve_col_str = "-"
- rcve = "-"
- try:
- redr2 = "%.4f" %(float(redr2))
- except:
- pass
- try:
- rcve = "%.4f" %(float(rcve))
- except:
- pass
- print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve) diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/rcve.xml
--- a/tools/regVariation/rcve.xml Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,70 +0,0 @@
-<tool id="rcve1" name="Compute RCVE" version="1.0.0">
- <description> </description>
- <command interpreter="python">
- rcve.py
- $input1
- $response_col
- $predictor_cols
- $out_file1
- 1>/dev/null
- </command>
- <inputs>
- <param format="tabular" name="input1" type="data" label="Select data" help="Dataset missing? 
See TIP below."/>
- <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
- <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true">
- <validator type="no_options" message="Please select at least one column."/>
- </param>
- </inputs>
- <outputs>
- <data format="input" name="out_file1" metadata_source="input1" />
- </outputs>
- <requirements>
- <requirement type="python-module">rpy</requirement>
- </requirements>
- <tests>
- <!-- Test data with valid values -->
- <test>
- <param name="input1" value="reg_inp.tab"/>
- <param name="response_col" value="1"/>
- <param name="predictor_cols" value="2,3,4"/>
- <output name="out_file1" file="rcve_out.dat"/>
- </test>
-
- </tests>
- <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Edit Datasets->Convert characters*
-
------
-
-.. class:: infomark
-
-**What it does**
-
-This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
-
-**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
-which denotes the case where the 'i'th predictor is dropped.
-
-
-In general,
-**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
-where,
-
-- {X,X+} denotes the set of all predictors,
-- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
-- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
-
-
-The 4 columns in the output are described below:
-
-- Column 1 (Model): denotes the variables present in the model ({X})
-- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
-- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
-- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
-
-
- </help>
-</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitution_rates.py
--- a/tools/regVariation/substitution_rates.py Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-#!/usr/bin/env python
-#Guruprasad Ananda
-"""
-Estimates substitution rates from pairwise alignments using the JC69 model. 
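-
-For each alignment block (or each interval, when an interval file is
-supplied) the tool reports L, the number of nucleotides compared, N, the
-number of mismatches, and the observed proportion of differing sites
-p = N/L.  The script stops at p; for reference, this is a minimal sketch
-of the JC69 correction that such a p is conventionally plugged into
-(jc69_distance is a hypothetical helper, not a function defined in this
-file):
-
-    from math import log
-
-    def jc69_distance(n_mismatches, n_sites):
-        # Observed proportion of differing sites.
-        p = n_mismatches / float(n_sites)
-        # JC69-corrected substitution rate; the estimate is
-        # undefined once p reaches 0.75.
-        if p >= 0.75:
-            raise ValueError("p >= 0.75: JC69 distance is undefined")
-        return -0.75 * log(1.0 - (4.0 / 3.0) * p)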
-""" - -from galaxy import eggs -from galaxy.tools.util.galaxyops import * -from galaxy.tools.util import maf_utilities -import bx.align.maf -import sys, fileinput - -def stop_err(msg): - sys.stderr.write(msg) - sys.exit() - -if len(sys.argv) < 3: - stop_err("Incorrect number of arguments.") - -inp_file = sys.argv[1] -out_file = sys.argv[2] -fout = open(out_file, 'w') -int_file = sys.argv[3] -if int_file != "None": #The user has specified an interval file - dbkey_i = sys.argv[4] - chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] ) - - -def rateEstimator(block): - global alignlen, mismatches - - src1 = block.components[0].src - sequence1 = block.components[0].text - start1 = block.components[0].start - end1 = block.components[0].end - len1 = int(end1)-int(start1) - len1_withgap = len(sequence1) - mismatch = 0.0 - - for seq in range (1,len(block.components)): - src2 = block.components[seq].src - sequence2 = block.components[seq].text - start2 = block.components[seq].start - end2 = block.components[seq].end - len2 = int(end2)-int(start2) - for nt in range(len1_withgap): - if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character - if sequence1[nt].upper() != sequence2[nt].upper(): - mismatch += 1 - - if int_file == "None": - p = mismatch/min(len1,len2) - print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p) - else: - mismatches += mismatch - alignlen += min(len1,len2) - -def main(): - skipped = 0 - not_pairwise = 0 - - if int_file == "None": - try: - maf_reader = bx.align.maf.Reader( open(inp_file, 'r') ) - except: - stop_err("Your MAF file appears to be malformed.") - print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp" - for block in maf_reader: - if len(block.components) != 2: - not_pairwise += 1 - continue - try: - rateEstimator(block) - except: - skipped += 1 - else: - index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] ) - if index is None: - print >> sys.stderr, "Your MAF file appears to be malformed." 
- sys.exit() - win = NiceReaderWrapper( fileinput.FileInput( int_file ), - chrom_col=chr_col_i, - start_col=start_col_i, - end_col=end_col_i, - strand_col=strand_col_i, - fix_strand=True) - species=None - mincols = 0 - global alignlen, mismatches - - for interval in win: - alignlen = 0 - mismatches = 0.0 - src = "%s.%s" % ( dbkey_i, interval.chrom ) - for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ): - if len(block.components) != 2: - not_pairwise += 1 - continue - try: - rateEstimator(block) - except: - skipped += 1 - if alignlen: - p = mismatches/alignlen - else: - p = 'NA' - interval.fields.append(str(alignlen)) - interval.fields.append(str(mismatches)) - interval.fields.append(str(p)) - print >>fout, "\t".join(interval.fields) - #num_blocks += 1 - - if not_pairwise: - print "Skipped %d non-pairwise blocks" %(not_pairwise) - if skipped: - print "Skipped %d blocks as invalid" %(skipped) -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitution_rates.xml --- a/tools/regVariation/substitution_rates.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,61 +0,0 @@ -<tool id="subRate1" name="Estimate substitution rates " version="1.0.0"> - <description> for non-coding regions</description> - <command interpreter="python"> - substitution_rates.py - $input - $out_file1 - #if $region.type == "win": - ${region.input2} ${region.input2.dbkey} ${region.input2.metadata.chromCol},$region.input2.metadata.startCol,$region.input2.metadata.endCol,$region.input2.metadata.strandCol - #else: - "None" - #end if - </command> - <inputs> - <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/> - <conditional name="region"> - <param name="type" type="select" label="Estimate rates corresponding to" multiple="false"> - <option value="align">Alignment block</option> - <option value="win">Intervals in your history</option> - </param> - <when value="win"> - <param format="interval" name="input2" type="data" label="Choose intervals"> - <validator type="unspecified_build" /> - </param> - </when> - <when value="align" /> - </conditional> - </inputs> - <outputs> - <data format="tabular" name="out_file1" metadata_source="input"/> - </outputs> - - <tests> - <test> - <param name="input" value="Interval2Maf_pairwise_out.maf"/> - <param name="type" value="align"/> - <output name="out_file1" file="subRates1.out"/> - </test> - </tests> - - <help> - -.. class:: infomark - -**What it does** - -This tool takes a pairwise MAF file as input and estimates substitution rate according to Jukes-Cantor JC69 model. The 3 new columns appended to the output are explained below: - -- L: number of nucleotides compared -- N: number of different nucleotides -- p = N/L - ------ - -.. class:: warningmark - -**Note** - -Any block/s not containing exactly two sequences, will be omitted. - - </help> -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitutions.py --- a/tools/regVariation/substitutions.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,87 +0,0 @@ -#!/usr/bin/env python -#Guruprasad ANanda -""" -Fetches substitutions from pairwise alignments. 
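-
-Reported coordinates are in ungapped sequence space: the script converts
-an alignment-column index into a sequence coordinate by subtracting the
-gap characters that precede it, as in start + col -
-text[0:col].count('-').  A minimal sketch of that mapping, where
-column_to_seq_coord is a hypothetical name rather than a function
-defined in this file:
-
-    def column_to_seq_coord(start, text, col):
-        # start: component start from the MAF 's' line; text: its
-        # aligned sequence.  Gaps before column col do not advance
-        # the sequence coordinate.
-        return start + col - text[:col].count('-')
-
-For example, with start=100 and text='AC--GT', column 4 (the 'G') maps
-to 100 + 4 - 2 = 102.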
-"""
-
-from galaxy import eggs
-
-from galaxy.tools.util import maf_utilities
-
-import bx.align.maf
-import sys
-import os, fileinput
-
-def stop_err(msg):
-    sys.stderr.write(msg)
-    sys.exit()
-
-if len(sys.argv) < 3:
-    stop_err("Incorrect number of arguments.")
-
-inp_file = sys.argv[1]
-out_file = sys.argv[2]
-fout = open(out_file, 'w')
-
-def fetchSubs(block):
-    src1 = block.components[0].src
-    sequence1 = block.components[0].text
-    start1 = block.components[0].start
-    end1 = block.components[0].end
-    len1 = int(end1) - int(start1)
-    len1_withgap = len(sequence1)
-
-    for seq in range(1, len(block.components)):
-        src2 = block.components[seq].src
-        sequence2 = block.components[seq].text
-        start2 = block.components[seq].start
-        end2 = block.components[seq].end
-        len2 = int(end2) - int(start2)
-        sub_begin = None
-        sub_end = None
-        begin = False
-
-        for nt in range(len1_withgap):
-            if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
-                if sequence1[nt].upper() != sequence2[nt].upper():
-                    if not begin:
-                        sub_begin = nt
-                        begin = True
-                    sub_end = nt
-                else:
-                    if begin:
-                        print >>fout, "%s\t%s\t%s" % (src1, start1+sub_begin-sequence1[0:sub_begin].count('-'), start1+sub_end-sequence1[0:sub_end].count('-'))
-                        print >>fout, "%s\t%s\t%s" % (src2, start2+sub_begin-sequence2[0:sub_begin].count('-'), start2+sub_end-sequence2[0:sub_end].count('-'))
-                        begin = False
-            else:
-                if begin:
-                    print >>fout, "%s\t%s\t%s" % (src1, start1+sub_begin-sequence1[0:sub_begin].count('-'), start1+sub_end-sequence1[0:sub_end].count('-'))
-                    print >>fout, "%s\t%s\t%s" % (src2, start2+sub_begin-sequence2[0:sub_begin].count('-'), start2+sub_end-sequence2[0:sub_end].count('-'))
-                    begin = False
-
-def main():
-    skipped = 0
-    not_pairwise = 0
-    try:
-        maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
-    except:
-        stop_err("Your MAF file appears to be malformed.")
-    print >>fout, "#Chr\tStart\tEnd"
-    for block in maf_reader:
-        if len(block.components) != 2:
-            not_pairwise += 1
-            continue
-        try:
-            fetchSubs(block)
-        except:
-            skipped += 1
-
-    if not_pairwise:
-        print "Skipped %d non-pairwise blocks" % (not_pairwise)
-    if skipped:
-        print "Skipped %d blocks" % (skipped)
-
-if __name__ == "__main__":
-    main()
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/substitutions.xml
--- a/tools/regVariation/substitutions.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-<tool id="substitutions1" name="Fetch substitutions " version="1.0.0">
-  <description> from pairwise alignments</description>
-  <command interpreter="python">
-    substitutions.py
-    $input
-    $out_file1
-  </command>
-  <inputs>
-    <param format="maf" name="input" type="data" label="Select pair-wise alignment data"/>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="out_file1" metadata_source="input"/>
-  </outputs>
-
-  <tests>
-    <test>
-      <param name="input" value="Interval2Maf_pairwise_out.maf"/>
-      <output name="out_file1" file="subs.out"/>
-    </test>
-  </tests>
-  <help>
-
-.. class:: infomark
-
-**What it does**
-
-This tool takes a pairwise MAF file as input and fetches the substitutions from each alignment block.
-
------
-
-.. class:: warningmark
-
-**Note**
-
-Any block not containing exactly two sequences will be omitted.
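
-----

For illustration only, a simplified standalone sketch of the gap-aware coordinate mapping the script performs (the function name and the example sequences/coordinates are hypothetical, not part of the tool)::

  # Report runs of substituted columns in one aligned row, converting
  # alignment columns to sequence coordinates by discounting gaps.
  def sub_runs(src, start, seq1, seq2, masked='-#$^*?'):
      begin = end = None
      for col, (a, b) in enumerate(zip(seq1, seq2)):
          if a not in masked and b not in masked and a.upper() != b.upper():
              if begin is None:
                  begin = col
              end = col
          elif begin is not None:
              yield (src, start + begin - seq1[:begin].count('-'),
                          start + end - seq1[:end].count('-'))
              begin = None
      if begin is not None:
          yield (src, start + begin - seq1[:begin].count('-'),
                      start + end - seq1[:end].count('-'))

  # list(sub_runs('hg18.chr1', 100, 'ACGT-A', 'ACTTTA'))
  # -> [('hg18.chr1', 102, 102)]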
-
- </help>
-</tool>
\ No newline at end of file
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/t_test_two_samples.pl
--- a/tools/regVariation/t_test_two_samples.pl	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,109 +0,0 @@
-# A program to implement the non-pooled t-test for two samples where the alternative hypothesis is two-sided or one-sided.
-# The first input file is a TABULAR format file representing the first sample and consisting of one column only.
-# The second input file is a TABULAR format file representing the second sample and consisting of one column only.
-# The third input is the sidedness of the t-test: either two-sided, or one-sided with m1 less than m2,
-# or one-sided with m1 greater than m2.
-# The fourth input is the equality status of the standard deviations of both populations.
-# The output file is a TXT file representing the result of the two-sample t-test.
-
-use strict;
-use warnings;
-
-#variables to handle the motif information (not used by this tool; retained from the template it was derived from)
-my $motif;
-my $motifName = "";
-my $motifNumber = 0;
-my $totalMotifsNumber = 0;
-my @motifNamesArray = ();
-
-# check that the correct number of arguments was supplied
-my $usage = "usage: non_pooled_t_test_two_samples_galaxy.pl [TABULAR.in] [TABULAR.in] [testSidedness] [standardDeviationEquality] [TXT.out] \n";
-die $usage unless @ARGV == 5;
-
-#get the input arguments
-my $firstSampleInputFile = $ARGV[0];
-my $secondSampleInputFile = $ARGV[1];
-my $testSidedness = $ARGV[2];
-my $standardDeviationEquality = $ARGV[3];
-my $outputFile = $ARGV[4];
-
-#open the input files
-open (INPUT1, "<", $firstSampleInputFile) || die("Could not open file $firstSampleInputFile \n");
-open (INPUT2, "<", $secondSampleInputFile) || die("Could not open file $secondSampleInputFile \n");
-open (OUTPUT, ">", $outputFile) || die("Could not open file $outputFile \n");
-
-#variable to store the name of the R script file
-my $r_script;
-
-# R script to implement the two-sample t-test
-#construct an R script file and save it in the same directory where the perl file is located
-$r_script = "non_pooled_t_test_two_samples.r";
-open(Rcmd,">", $r_script) or die "Cannot open $r_script \n\n";
-print Rcmd "
-    sampleTable1 <- read.table(\"$firstSampleInputFile\", header=FALSE);
-    sample1 <- sampleTable1[, 1];
-
-    sampleTable2 <- read.table(\"$secondSampleInputFile\", header=FALSE);
-    sample2 <- sampleTable2[, 1];
-
-    testSideStatus <- \"$testSidedness\";
-    STEqualityStatus <- \"$standardDeviationEquality\";
-
-    #open the output text file
-    sink(file = \"$outputFile\");
-
-    #check if the t-test is two-sided
-    if (testSideStatus == \"two-sided\"){
-        #check if the standard deviations are equal in both populations
-        if (STEqualityStatus == \"equal\"){
-            #two-sample t-test where standard deviations are assumed to be equal, the test is two-sided
-            testResult <- t.test(sample1, sample2, var.equal = TRUE);
-        } else{
-            #two-sample t-test where standard deviations are assumed to be unequal, the test is two-sided
-            testResult <- t.test(sample1, sample2, var.equal = FALSE);
-        }
-    } else{ #the t-test is one-sided
-        #check if the t-test is one-sided with m1 < m2
-        if (testSideStatus == \"one-sided:_m1_less_than_m2\"){
-            #check if the standard deviations are equal in both populations
-            if (STEqualityStatus == \"equal\"){
-                #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 < m2
-                testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"less\");
-            } else{
-                #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 < m2
-                testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"less\");
-            }
-        } else{ #the t-test is one-sided with m1 > m2
-            #check if the standard deviations are equal in both populations
-            if (STEqualityStatus == \"equal\"){
-                #two-sample t-test where standard deviations are assumed to be equal, the test is one-sided: Ha: m1 > m2
-                testResult <- t.test(sample1, sample2, var.equal = TRUE, alternative = \"greater\");
-            } else{
-                #two-sample t-test where standard deviations are assumed to be unequal, the test is one-sided: Ha: m1 > m2
-                testResult <- t.test(sample1, sample2, var.equal = FALSE, alternative = \"greater\");
-            }
-        }
-    }
-
-    #save the output of the t-test into the output text file
-    testResult;
-
-    #close the output text file
-    sink();
-
-    #eof" . "\n";
-
-close Rcmd;
-
-system("R --no-restore --no-save --no-readline < $r_script > $r_script.out");
-
-#close the input and output files
-close(OUTPUT);
-close(INPUT2);
-close(INPUT1);
-
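For orientation, a minimal sketch of the same test in Python (illustrative, not part of the tool; assumes SciPy 1.6+ for the `alternative` keyword), using the two one-column samples from the example in the tool's help below:

  from scipy import stats

  sample1 = [5, 4, 8, 6, 7, 2, 1, 1, 0, 6, 4, 5, 7, 5, 3, 2, 5, 8, 7, 6, 4]
  sample2 = [2, 3, 5, 1, 2, 7, 5, 4, 3, 2, 7, 6, 0, 8, 4, 6, 9, 2, 4, 5, 6]

  # equal_var=True  -> pooled t-test   (the tool's "Equal" option)
  # equal_var=False -> Welch's t-test  (the tool's "Unequal" option)
  # alternative is 'two-sided', 'less' or 'greater', mirroring the sidedness select
  t, p = stats.ttest_ind(sample1, sample2, equal_var=True, alternative='two-sided')
  print('t = %.4f, p-value = %.4f' % (t, p))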
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/t_test_two_samples.xml
--- a/tools/regVariation/t_test_two_samples.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,160 +0,0 @@
-<tool id="t_test_two_samples" name="T Test for Two Samples" version="1.0.0">
-  <description></description>
-
-  <command interpreter="perl">
-    t_test_two_samples.pl $inputFile1 $inputFile2 $inputTestSidedness3 $inputStandardDeviationEquality4 $outputFile1
-  </command>
-
-  <inputs>
-    <param format="tabular" name="inputFile1" type="data" label="Select the first sample tabular file"/>
-    <param format="tabular" name="inputFile2" type="data" label="Select the second sample tabular file"/>
-
-    <param name="inputTestSidedness3" type="select" label="Choose the test sidedness:">
-      <option value="two-sided">Two-sided</option>
-      <option value="one-sided:_m1_less_than_m2">One-sided: m1 less than m2</option>
-      <option value="one-sided:_m1_greater_than_m2">One-sided: m1 greater than m2</option>
-    </param>
-
-    <param name="inputStandardDeviationEquality4" type="select" label="Choose the standard deviation equality status of the two populations:">
-      <option value="equal">Equal</option>
-      <option value="unequal">Unequal</option>
-    </param>
-  </inputs>
-
-  <outputs>
-    <data format="text" name="outputFile1"/>
-  </outputs>
-
-  <tests>
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="Two-sided" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result1.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="Two-sided" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result2.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result3.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 less than m2" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result4.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
-      <param name="inputStandardDeviationEquality4" value="Equal" />
-      <output name="outputFile1" file="t_test_result5.text" />
-    </test>
-
-    <test>
-      <param name="inputFile1" value="sample1.tabular" ftype="tabular" />
-      <param name="inputFile2" value="sample2.tabular" ftype="tabular" />
-      <param name="inputTestSidedness3" value="One-sided: m1 greater than m2" />
-      <param name="inputStandardDeviationEquality4" value="Unequal" />
-      <output name="outputFile1" file="t_test_result6.text" />
-    </test>
-  </tests>
-
-  <help>
-
-.. class:: infomark
-
-**What it does**
-
-This program implements the non-pooled t-test for two samples where the alternative hypothesis is two-sided or one-sided. The program takes four inputs and produces one output:
-
-- The first input file is a TABULAR format file representing the first sample and consisting of one column only.
-- The second input file is a TABULAR format file representing the second sample and consisting of one column only.
-- The third input is the sidedness of the t-test: either two-sided, or one-sided with m1 less than m2, or one-sided with m1 greater than m2.
-- The fourth input is the equality status of the standard deviations of both populations.
-- The output file is a TXT file representing the result of the two-sample t-test.
-
-
-**Example**
-
-Let us have the first input file representing the first sample as follows::
-
-  5
-  4
-  8
-  6
-  7
-  2
-  1
-  1
-  0
-  6
-  4
-  5
-  7
-  5
-  3
-  2
-  5
-  8
-  7
-  6
-  4
-
-And the second input file representing the second sample as follows::
-
-  2
-  3
-  5
-  1
-  2
-  7
-  5
-  4
-  3
-  2
-  7
-  6
-  0
-  8
-  4
-  6
-  9
-  2
-  4
-  5
-  6
-
-Running the program and choosing "Two-sided" and "Equal" as parameters will give the following output::
-
-  Two Sample t-test
-
-  data: sample1 and sample2
-  t = -0.3247, df = 40, p-value = 0.7471
-  alternative hypothesis: true difference in means is not equal to 0
-  95 percent confidence interval:
-  -1.720030 1.243839
-  sample estimates:
-  mean of x mean of y
-  4.333333 4.571429
-
-
-  </help>
-
-</tool>
diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/windowSplitter.py
--- a/tools/regVariation/windowSplitter.py	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Split into windows.
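The Galaxy wrapper invokes this script with two positional arguments beyond
those shown in the usage line below: windowSplitter.py input size out_file
makesliding offset -l chrom,start,end,strand (makesliding is 0 or 1; an
offset of 0 disables sliding).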
- -usage: %prog input size out_file - -l, --cols=N,N,N,N: Columns for chrom, start, end, strand in file -""" - -import sys, re, os - -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse -from galaxy.tools.util.galaxyops import * - -def stop_err( msg ): - sys.stderr.write( msg ) - sys.exit() - -def main(): - # Parsing Command Line here - options, args = doc_optparse.parse( __doc__ ) - - try: - chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols ) - inp_file, winsize, out_file, makesliding, offset = args - winsize = int(winsize) - offset = int(offset) - makesliding = int(makesliding) - if strand_col_1 <= 0: - strand = "+" #if strand is not defined, default it to + - except: - stop_err( "Data issue, click the pencil icon in the history item to correct the metadata attributes of the input dataset." ) - - fo = open(out_file,'w') - - skipped_lines = 0 - first_invalid_line = 0 - invalid_line = None - if offset == 0: - makesliding = 0 - - for i, line in enumerate( file( inp_file ) ): - line = line.strip() - if line and line[0:1] != "#": - try: - elems = line.split('\t') - if strand_col_1 != -1: - strand = elems[strand_col_1] - start = int(elems[start_col_1]) - end = int(elems[end_col_1]) - if makesliding == 0: - numwin = (end - start)/winsize - else: - numwin = (end - start)/offset - if numwin > 0: - for win in range(numwin): - elems_1 = elems - elems_1[start_col_1] = str(start) - elems_1[end_col_1] = str(start + winsize) - fo.write( "%s\n" % '\t'.join( elems_1 ) ) - if makesliding == 0: - start = start + winsize - else: - start = start + offset - if start+winsize > end: - break - except: - skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line - - fo.close() - - if makesliding == 1: - print 'Window size=%d, Sliding=Yes, Offset=%d' %(winsize, offset) - else: - print 'Window size=%d, Sliding=No' %(winsize) - if skipped_lines > 0: - print 'Skipped %d invalid lines starting with #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) - -if __name__ == "__main__": - main() diff -r c2a356708570 -r 33c067c3ae34 tools/regVariation/windowSplitter.xml --- a/tools/regVariation/windowSplitter.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -<tool id="winSplitter" name="Make windows"> - <description></description> - <command interpreter="python">windowSplitter.py $input $size $out_file1 ${wintype.choice} ${wintype.offset} -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}</command> - <inputs> - <!--<param label="Genome" name="dbkey" type="genomebuild"/>--> - <param format="interval" name="input" type="data" label="Select data"/> - <param name="size" size="10" type="integer" value="500" label="Window size"/> - <conditional name="wintype"> - <param name="choice" type="select" label="Make sliding windows?"> - <option value="0" selected="true">No</option> - <option value="1">Yes</option> - </param> - <when value="0"> - <param name="offset" type="hidden" value="0" /> - </when> - <when value="1"> - <param name="offset" size="10" type="integer" value="10" label="Offset size"/> - </when> - </conditional> - </inputs> - <outputs> - <data format="interval" name="out_file1" metadata_source="input"/> - </outputs> - <tests> - <test> - <param name="input" value="4.bed"/> - <param name="size" value="5000"/> - <param name="choice" value="1"/> - <param name="offset" 
value="4000"/> - <output name="out_file1" file="4_windows.bed"/> - </test> - </tests> - <help> - -.. class:: infomark - -**What it does** - -This tool splits the intervals in the input file into smaller intervals based on the specified window-size and window type. - ------ - -.. class:: warningmark - -**Note** - -The positions at the end of the input interval which do not fit into the last window or a new window of required size, will be omitted from the output. - ------ - -.. class:: infomark - -**About formats** - -**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones: - -The first three BED fields (required) are:: - - 1. chrom - The name of the chromosome (e.g. chr1, chrY_random). - 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.) - 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval). - -The additional BED fields (optional) are:: - - 4. name - The name of the BED line. - 5. score - A score between 0 and 1000. - 6. strand - Defines the strand - either '+' or '-'. - 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser. - 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser. - 9. reserved - This should always be set to zero. - 10. blockCount - The number of blocks (exons) in the BED line. - 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount. - 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount. - 13. expCount - The number of experiments. - 14. expIds - A comma-separated list of experiment ids. The number of items in this list should correspond to expCount. - 15. expScores - A comma-separated list of experiment scores. All of the expScores should be relative to expIds. The number of items in this list should correspond to expCount. 
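
-----

The fixed and sliding windowing rule described under **What it does** can be sketched as follows (illustrative standalone Python, not the tool's own code)::

  def windows(start, end, size, offset=None):
      """Yield (start, end) windows; slide by offset when given."""
      step = offset or size
      while start + size <= end:
          yield (start, start + size)
          start += step

  # list(windows(1000, 4700, 1000))      -> 3 fixed windows, as in the example below
  # list(windows(1000, 4700, 1000, 500)) -> 6 sliding windows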
- ------ - -**Example** - -- For the following dataset:: - - chr22 1000 4700 NM_174568 0 + - -- running this tool with **Window size as 1000**, will return:: - - chr22 1000 2000 NM_174568 0 + - chr22 2000 3000 NM_174568 0 + - chr22 3000 4000 NM_174568 0 + - -- running this tool to make **Sliding windows** of **size 1000** and **offset 500**, will return:: - - chr22 1000 2000 NM_174568 0 + - chr22 1500 2500 NM_174568 0 + - chr22 2000 3000 NM_174568 0 + - chr22 2500 3500 NM_174568 0 + - chr22 3000 4000 NM_174568 0 + - chr22 3500 4500 NM_174568 0 + - - </help> - - -</tool> \ No newline at end of file diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/listFiles.py --- a/tools/rgenetics/listFiles.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,227 +0,0 @@ -#Provides Upload tool with access to list of available files -import glob,sys -import galaxy.app as thisapp -import galaxy.util - -from elementtree.ElementTree import XML - -librepos = '/usr/local/galaxy/data/rg' -myrepos = '/home/rerla/galaxy' -marchinirepos = '/usr/local/galaxy/data/rg/snptest' - -from galaxy.tools.parameters import DataToolParameter - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - - - -def get_tabular_cols( input, outformat='gg' ): - """numeric only other than rs for strict genome graphs - otherwise tabular. Derived from galaxy tool source around August 2007 by Ross""" - columns = [] - seenCnames = {} - elems = [] - colnames = ['Col%d' % x for x in range(input.metadata.columns+1)] - strict = (outformat=='gg') - for i, line in enumerate( file ( input.file_name ) ): - if line and not line.startswith( '#' ): - line = line.rstrip('\r\n') - elems = line.split( '\t' ) - - """ - Strict gg note: - Since this tool requires users to select only those columns - that contain numerical values, we'll restrict the column select - list appropriately other than the first column which must be a marker - """ - if len(elems) > 0: - for col in range(1, input.metadata.columns+1): - isFloat = False # short circuit common result - try: - val = float(elems[col-1]) - isFloat = True - except: - val = elems[col-1] - if val: - if i == 0: # header row - colnames[col] = val - if isFloat or (not strict) or (col == 1): # all in if not GG - option = colnames[col] - if not seenCnames.get(option,None): # new - columns.append((option,str(col),False)) - seenCnames[option] = option - #print 'get_tab: %d=%s. Columns=%s' % (i,line,str(columns)) - if len(columns) > 0 and i > 10: - """ - We have our select list built, so we can break out of the outer most for loop - """ - break - if i == 30: - break # Hopefully we never get here... 
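# (Editorial note, not original code) The loop below pre-selects the first
# few (up to five) of the collected (name, column-index, selected) options
# so the Galaxy select list comes up with sensible defaults.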
- for option in range(min(5,len(columns))): - (x,y,z) = columns[option] - columns[option] = (x,y,True) - return columns # sorted select options - -def get_marchini_dir(): - """return the filesystem directory for snptest style files""" - return marchinirepos - - -def get_lib_SNPTESTCaCofiles(): - """return a list of file names - without extensions - available for caco studies - These have a common file name with both _1 and _2 suffixes""" - d = get_marchini_dir() - testsuffix = '.gen_1' # glob these - flist = glob.glob('%s/*%s' % (d,testsuffix)) - flist = [x.split(testsuffix)[0] for x in flist] # leaves with a list of file set names - if len(flist) > 0: - dat = [(flist[0],flist[0],True),] - dat += [(x,x,False) for x in flist[1:]] - else: - dat = [('No Marchini CaCo files found in %s - convert some using the Marchini converter tool' % d,'None',True),] - return dat - -def getChropt(): - """return dynamic chromosome select options - """ - c = ['X','Y'] - c += ['%d' % x for x in range(1,23)] - dat = [(x,x,False) for x in c] - x,y,z = dat[3] - dat[3] = (x,y,True) - return dat - - -def get_phecols(fname=''): - """ return a list of phenotype columns for a multi-select list - prototype: - foo = ('fake - not yet implemented','not implemented','False') - dat = [foo for x in range(5)] - return dat - """ - try: - header = file(fname,'r').next().split() - except: - return [('get_phecols unable to open file %s' % fname,'None',False),] - dat = [(x,x,False) for x in header] - return dat - -#Return various kinds of files - -def get_lib_pedfiles(): - dat = glob.glob('%s/ped/*.ped' % librepos) - dat += glob.glob('%s/ped/*.ped' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [x.split('.ped')[0] for x in dat] - dat = [(x,x,'True') for x in dat] - else: - dat = [('No ped files - add some to %s/ped or %s/ped' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_phefiles(): - ext = 'phe' - dat = glob.glob('%s/pheno/*.%s' % (librepos,ext)) - dat += glob.glob('%s/pheno/*.%s' % (myrepos,ext)) - dat.sort() - if len(dat) > 0: - dat = [(x,x,'False') for x in dat] - else: - dat = [('No %s files - add some to %s/pheno or %s/pheno' % (ext,librepos,myrepos),'None',True),] - return dat - -def get_lib_bedfiles(): - dat = glob.glob('%s/plinkbed/*.bed' % librepos) - dat += glob.glob('%s/plinkbed/*.bed' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [x.split('.bed')[0] for x in dat] - dat = [(x,x,False) for x in dat] - else: - dat = [('No bed files - Please import some to %s/plinkbed or %s/plinkbed' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_fbatfiles(): - dat = glob.glob('%s/plinkfbat/*.ped' % librepos) - dat += glob.glob('%s/plinkfbat/*.ped' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No fbat bed files - Please import some to %s/plinkfbat or %s/plinkfbat' % (librepos,myrepos),'None',True),] - return dat - -def get_lib_mapfiles(): - dat = glob.glob('%s/ped/*.map' % librepos) - dat += glob.glob('%s/ped/*.map' % myrepos) - dat.sort() - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No map files - add some to %s/ped' % librepos,'None',True),] - return dat - -def get_my_pedfiles(): - dat = glob.glob('%s/*.ped' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_mapfiles(): - dat = glob.glob('%s/*.map' % myrepos) - if len(dat) > 0: - dat = [(x,x,'True') for x in dat] - else: - dat = [('No ped files - add 
some to %s' % librepos,'None',True),] - return dat - -def get_lib_xlsfiles(): - dat = glob.glob('%s/*.xls' % librepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_lib_htmlfiles(): - dat = glob.glob('%s/*.html' % librepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_xlsfiles(): - dat = glob.glob('%s/*.xls' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - -def get_my_htmlfiles(): - dat = glob.glob('%s/*.html' % myrepos) - if len(dat) > 0: - dat = [(x,x,False) for x in dat] - else: - dat = [('No ped files - add some to %s' % librepos,'None',True),] - return dat - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/plinkbinJZ.py --- a/tools/rgenetics/plinkbinJZ.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,868 +0,0 @@ -#!/usr/bin/env python2.4 -""" -""" - -import optparse,os,subprocess,gzip,struct,time,commands -from array import array - -#from AIMS import util -#from pga import util as pgautil - -__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $' - -VERBOSE = True - -MISSING_ALLELES = set(['N', '0', '.', '-','']) - -AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)]) - -MAGIC_BYTE1 = '00110110' -MAGIC_BYTE2 = '11011000' -FORMAT_SNP_MAJOR_BYTE = '10000000' -FORMAT_IND_MAJOR_BYTE = '00000000' -MAGIC1 = (0, 3, 1, 2) -MAGIC2 = (3, 1, 2, 0) -FORMAT_SNP_MAJOR = (2, 0, 0, 0) -FORMAT_IND_MAJOR = (0, 0, 0, 0) -HEADER_LENGTH = 3 - -HOM0 = 3 -HOM1 = 0 -MISS = 2 -HET = 1 -HOM0_GENO = (0, 0) -HOM1_GENO = (1, 1) -HET_GENO = (0, 1) -MISS_GENO = (-9, -9) - -GENO_TO_GCODE = { - HOM0_GENO: HOM0, - HET_GENO: HET, - HOM1_GENO: HOM1, - MISS_GENO: MISS, - } - -CHROM_REPLACE = { - 'X': '23', - 'Y': '24', - 'XY': '25', - 'MT': '26', - 'M': '26', -} - -MAP_LINE_EXCEPTION_TEXT = """ -One or more lines in the *.map file has only three fields. -The line was: - -%s - -If you are running rgGRR through EPMP, this is usually a -sign that you are using an old version of the map file. -You can correct the problem by re-running Subject QC. If -you have already tried this, please contact the developers, -or file a bug. 
-""" - -INT_TO_GCODE = { - 0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)), - 4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)), - 8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)), - 12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)), - 16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)), - 20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)), - 24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)), - 28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)), - 32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)), - 36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)), - 40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)), - 44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)), - 48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)), - 52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)), - 56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)), - 60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)), - 64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)), - 68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)), - 72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)), - 76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)), - 80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)), - 84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)), - 88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)), - 92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)), - 96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)), - 100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)), - 104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)), - 108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)), - 112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', 
(3, 0, 3, 2)), - 116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)), - 120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)), - 124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)), - 128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)), - 132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)), - 136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)), - 140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)), - 144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)), - 148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)), - 152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)), - 156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)), - 160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)), - 164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)), - 168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)), - 172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)), - 176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)), - 180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)), - 184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)), - 188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)), - 192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)), - 196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)), - 200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)), - 204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)), - 208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)), - 212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)), - 216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)), - 220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)), - 224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)), - 228: 
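# (Editorial note, not original code) Each byte of a PLINK .bed file packs
# four 2-bit genotype codes. To decode a byte, take its bit pairs from the
# least-significant pair upward and swap the two bits within each pair:
# e.g. 27 = 0b00011011 -> pairs 11,10,01,00 -> swapped 11,01,10,00 ->
# (3, 1, 2, 0), i.e. HOM0, HET, MISS, HOM1 for four consecutive subjects.
# That is exactly the table being built here, and why one marker occupies
# nbytes(n) = 2*ceiling(n, 4)/8 bytes for n subjects.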
array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)), - 232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)), - 236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)), - 240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)), - 244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)), - 248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)), - 252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)), - } - -GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()]) - -### Exceptions -class DuplicateMarkerInMapFile(Exception): pass -class MapLineTooShort(Exception): pass -class ThirdAllele(Exception): pass -class PedError(Exception): pass -class BadMagic(Exception): - """ Raised when one of the MAGIC bytes in a bed file does not match - """ - pass -class BedError(Exception): - """ Raised when parsing a bed file runs into problems - """ - pass -class UnknownGenocode(Exception): - """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?) - """ - pass -class UnknownGeno(Exception): pass - -### Utility functions - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def ceiling(n, k): - ''' Return the least multiple of k which is greater than n - ''' - m = n % k - if m == 0: - return n - else: - return n + k - m - -def nbytes(n): - ''' Return the number of bytes required for n subjects - ''' - return 2*ceiling(n, 4)/8 - -### Primary module functionality -class LPed: - """ The uber-class for processing the Linkage-format *.ped/*.map files - """ - def __init__(self, base): - self.base = base - self._ped = Ped('%s.ped' % (self.base)) - self._map = Map('%s.map' % (self.base)) - - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - self._subjects = {} - self._ordered_subjects = [] - - self._genotypes = [] - - def parse(self): - """ - """ - if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow()) - self._map.parse() - self._markers = self._map._markers - self._ordered_markers = self._map._ordered_markers - self._autosomal_indices = self._map._autosomal_indices - - self._ped.parse(self._ordered_markers) - self._subjects = self._ped._subjects - self._ordered_subjects = self._ped._ordered_subjects - self._genotypes = self._ped._genotypes - self._marker_allele_lookup = self._ped._marker_allele_lookup - - ### Adjust self._markers based on the allele information - ### we got from parsing the ped file - for m, name in enumerate(self._ordered_markers): - a1, a2 = self._marker_allele_lookup[m][HET] - self._markers[name][-2] = a1 - self._markers[name][-1] = a2 - if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow()) - - def getSubjectInfo(self, fid, oiid): - """ - """ - return self._subject_info[(fid, oiid)] - - def getSubjectInfoByLine(self, line): - """ - """ - return self._subject_info[self._ordered_subjects[line]] - - def getGenotypesByIndices(self, s, mlist, format): - """ needed for grr if lped - deprecated but.. 
- """ - mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ? - raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)]) - if format == 'raw': - return raw_array - elif format == 'ref': - result = array('i', [0]*len(mlist)) - for m, gcode in enumerate(raw_array): - if gcode == HOM0: - nref = 3 - elif gcode == HET: - nref = 2 - elif gcode == HOM1: - nref = 1 - else: - nref = 0 - result[m] = nref - return result - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def writebed(self, base): - """ - """ - dst_name = '%s.fam' % (base) - print 'Writing pedigree information to [ %s ]' % (dst_name) - dst = open(dst_name, 'w') - for skey in self._ordered_subjects: - (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey] - dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe)) - dst.close() - - dst_name = '%s.bim' % (base) - print 'Writing map (extended format) information to [ %s ]' % (dst_name) - dst = open(dst_name, 'w') - for m, marker in enumerate(self._ordered_markers): - chrom, name, genpos, abspos, a1, a2 = self._markers[marker] - dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2)) - dst.close() - - bed_name = '%s.bed' % (base) - print 'Writing genotype bitfile to [ %s ]' % (bed_name) - print 'Using (default) SNP-major mode' - bed = open(bed_name, 'w') - - ### Write the 3 header bytes - bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2))) - bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2))) - bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2))) - - ### Calculate how many "pad bits" we should add after the last subject - nsubjects = len(self._ordered_subjects) - nmarkers = len(self._ordered_markers) - total_bytes = nbytes(nsubjects) - nbits = nsubjects * 2 - pad_nibbles = ((total_bytes * 8) - nbits)/2 - pad = array('i', [0]*pad_nibbles) - - ### And now write genotypes to the file - for m in xrange(nmarkers): - geno = self._genotypes[m] - geno.extend(pad) - bytes = len(geno)/4 - for b in range(bytes): - idx = b*4 - gcode = tuple(geno[idx:idx+4]) - try: - byte = struct.pack('B', GCODE_TO_INT[gcode]) - except KeyError: - print m, b, gcode - raise - bed.write(byte) - bed.close() - - def autosomal_indices(self): - """ Return the indices of markers in this ped/map that are autosomal. - This is used by rgGRR so that it can select a random set of markers - from the autosomes (sex chroms screw up the plot) - """ - return self._autosomal_indices - -class Ped: - def __init__(self, path): - self.path = path - self._subjects = {} - self._ordered_subjects = [] - self._genotypes = [] - self._marker_allele_lookup = {} - - def lineCount(self,infile): - """ count the number of lines in a file - efficiently using wget - """ - return int(commands.getoutput('wc -l %s' % (infile)).split()[0]) - - - def parse(self, markers): - """ Parse a given file -- this needs to be memory-efficient so that large - files can be parsed (~1 million markers on ~5000 subjects?). It - should also be fast, if possible. - """ - - ### Find out how many lines are in the file so we can ... - nsubjects = self.lineCount(self.path) - ### ... 
Pre-allocate the genotype arrays - nmarkers = len(markers) - _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)] - self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)] - - if self.path.endswith('.gz'): - pfile = gzip.open(self.path, 'r') - else: - pfile = open(self.path, 'r') - - for s, line in enumerate(pfile): - line = line.strip() - if not line: - continue - - fid, iid, did, mid, sex, phe, genos = line.split(None, 6) - sid = iid.split('.')[0] - d_sid = did.split('.')[0] - m_sid = mid.split('.')[0] - - skey = (fid, iid) - self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) - self._ordered_subjects.append(skey) - - genotypes = genos.split() - - for m, marker in enumerate(markers): - idx = m*2 - a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m - s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m - - ### FIXME: I think this can still be faster, and simpler to read - # Two pieces of logic intertwined here: first, we need to code - # this genotype as HOM0, HOM1, HET or MISS. Second, we need to - # keep an ongoing record of the genotypes seen for this marker - if a1 == a2: - if a1 in MISSING_ALLELES: - geno = MISS_GENO - else: - if s1 == '0': - seen[0] = a1 - elif s1 == a1 or s2 == a2: - pass - elif s2 == '0': - seen[1] = a1 - else: - raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen))) - - if a1 == seen[0]: - geno = HOM0_GENO - elif a1 == seen[1]: - geno = HOM1_GENO - else: - raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen))) - elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES: - geno = MISS_GENO - else: - geno = HET_GENO - if s1 == '0': - seen[0] = a1 - seen[1] = a2 - elif s2 == '0': - if s1 == a1: - seen[1] = a2 - elif s1 == a2: - seen[1] = a1 - else: - raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen))) - else: - if sorted(seen) != sorted((a1, a2)): - raise ThirdAllele('a1=%s, a2=%s, seen=%s?' 
% (a1, a2, str(seen))) - - gcode = GENO_TO_GCODE.get(geno, None) - if gcode is None: - raise UnknownGeno(str(geno)) - self._genotypes[m][s] = gcode - - # Build the _marker_allele_lookup table - for m, alleles in enumerate(_marker_alleles): - if len(alleles) == 2: - a1, a2 = alleles - elif len(alleles) == 1: - a1 = alleles[0] - a2 = '0' - else: - print 'All alleles blank for %s: %s' % (m, str(alleles)) - raise - - self._marker_allele_lookup[m] = { - HOM0: (a2, a2), - HOM1: (a1, a1), - HET : (a1, a2), - MISS: ('0','0'), - } - - if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path) - -class Map: - def __init__(self, path=None): - self.path = path - self._markers = {} - self._ordered_markers = [] - self._autosomal_indices = set() - - def __len__(self): - return len(self._markers) - - def parse(self): - """ Parse a Linkage-format map file - """ - if self.path.endswith('.gz'): - fh = gzip.open(self.path, 'r') - else: - fh = open(self.path, 'r') - - for i, line in enumerate(fh): - line = line.strip() - if not line: - continue - - fields = line.split() - if len(fields) < 4: - raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields))) - else: - chrom, name, genpos, abspos = fields - if name in self._markers: - raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path)) - abspos = int(abspos) - if abspos < 0: - continue - if chrom in AUTOSOMES: - self._autosomal_indices.add(i) - chrom = CHROM_REPLACE.get(chrom, chrom) - self._markers[name] = [chrom, name, genpos, abspos, None, None] - self._ordered_markers.append(name) - fh.close() - if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path) - -class BPed: - """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam - """ - def __init__(self, base): - self.base = base - self._bed = Bed('%s.bed' % (self.base)) - self._bim = Bim('%s.bim' % (self.base)) - self._fam = Fam('%s.fam' % (self.base)) - - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - self._subjects = {} - self._ordered_subjects = [] - - self._genotypes = [] - - def parse(self, quick=False): - """ - """ - self._quick = quick - - self._bim.parse() - self._markers = self._bim._markers - self._ordered_markers = self._bim._ordered_markers - self._marker_allele_lookup = self._bim._marker_allele_lookup - self._autosomal_indices = self._bim._autosomal_indices - - self._fam.parse() - self._subjects = self._fam._subjects - self._ordered_subjects = self._fam._ordered_subjects - - self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick) - self._bedf = self._bed._fh - self._genotypes = self._bed._genotypes - self.nsubjects = len(self._ordered_subjects) - self.nmarkers = len(self._ordered_markers) - self._bytes_per_marker = nbytes(self.nsubjects) - - def writeped(self, path=None): - """ - """ - path = self.path = path or self.path - - map_name = self.path.replace('.bed', '.map') - print 'Writing map file [ %s ]' % (map_name) - dst = open(map_name, 'w') - for m in self._ordered_markers: - chrom, snp, genpos, abspos, a1, a2 = self._markers[m] - dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos)) - dst.close() - - ped_name = self.path.replace('.bed', '.ped') - print 'Writing ped file [ %s ]' % (ped_name) - ped = open(ped_name, 'w') - firstyikes = False - for s, skey in enumerate(self._ordered_subjects): - idx = s*2 - (fid, iid, did, mid, 
sex, phe, oiid, odid, omid) = self._subjects[skey] - ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe)) - genotypes_for_subject = self.getGenotypesForSubject(s) - for m, snp in enumerate(self._ordered_markers): - #a1, a2 = self.getGenotypeByIndices(s, m) - a1,a2 = genotypes_for_subject[m] - ped.write(' %s %s' % (a1, a2)) - ped.write('\n') - ped.close() - - def getGenotype(self, subject, marker): - """ Retrieve a genotype for a particular subject/marker pair - """ - m = self._ordered_markers.index(marker) - s = self._ordered_subjects.index(subject) - return self.getGenotypeByIndices(s, m) - - def getGenotypesForSubject(self, s, raw=False): - """ Returns list of genotypes for all m markers - for subject s. If raw==True, then an array - of raw integer gcodes is returned instead - """ - if self._quick: - nmarkers = len(self._markers) - raw_array = array('i', [0]*nmarkers) - seek_nibble = s % 4 - for m in xrange(nmarkers): - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - raw_array[m] = gcode - else: - raw_array = array('i', [row[s] for row in self._genotypes]) - - if raw: - return raw_array - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def getGenotypeByIndices(self, s, m): - """ - """ - if self._quick: - # Determine which byte we need to seek to, and - # which nibble within the byte we need - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - seek_nibble = s % 4 - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - else: - # Otherwise, just grab the genotypes from the - # list of arrays - genos_for_marker = self._genotypes[m] - gcode = genos_for_marker[s] - - return self._marker_allele_lookup[m][gcode] - - def getGenotypesByIndices(self, s, mlist, format): - """ - """ - if self._quick: - raw_array = array('i', [0]*len(mlist)) - seek_nibble = s % 4 - for i,m in enumerate(mlist): - seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH - self._bedf.seek(seek_byte) - geno = struct.unpack('B', self._bedf.read(1))[0] - quartet = INT_TO_GCODE[geno] - gcode = quartet[seek_nibble] - raw_array[i] = gcode - mlist = set(mlist) - else: - mlist = set(mlist) - raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist]) - - if format == 'raw': - return raw_array - elif format == 'ref': - result = array('i', [0]*len(mlist)) - for m, gcode in enumerate(raw_array): - if gcode == HOM0: - nref = 3 - elif gcode == HET: - nref = 2 - elif gcode == HOM1: - nref = 1 - else: - nref = 0 - result[m] = nref - return result - else: - result = [] - for m, gcode in enumerate(raw_array): - result.append(self._marker_allele_lookup[m][gcode]) - return result - - def getSubject(self, s): - """ - """ - skey = self._ordered_subjects[s] - return self._subjects[skey] - - def autosomal_indices(self): - """ Return the indices of markers in this ped/map that are autosomal. 
- This is used by rgGRR so that it can select a random set of markers - from the autosomes (sex chroms screw up the plot) - """ - return self._autosomal_indices - -class Bed: - - def __init__(self, path): - self.path = path - self._genotypes = [] - self._fh = None - - def parse(self, subjects, markers, quick=False): - """ Parse the bed file, indicated either by the path parameter, - or as the self.path indicated in __init__. If quick is - True, then just parse the bim and fam, then genotypes will - be looked up dynamically by indices - """ - self._quick = quick - - ordered_markers = markers - ordered_subjects = subjects - nsubjects = len(ordered_subjects) - nmarkers = len(ordered_markers) - - bed = open(self.path, 'rb') - self._fh = bed - - byte1 = bed.read(1) - byte2 = bed.read(1) - byte3 = bed.read(1) - format_flag = struct.unpack('B', byte3)[0] - - h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]]) - h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]]) - h3 = tuple(INT_TO_GCODE[format_flag]) - - if h1 != MAGIC1 or h2 != MAGIC2: - raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2)) - if format_flag: - print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3) - else: - raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3) - - print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects) - - ### If quick mode was specified, we're done ... - self._quick = quick - if quick: - return - - ### ... Otherwise, parse genotypes into an array, and append that - ### array to self._genotypes - ngcodes = ceiling(nsubjects, 4) - bytes_per_marker = nbytes(nsubjects) - for m in xrange(nmarkers): - genotype_array = array('i', [-1]*(ngcodes)) - for byte in xrange(bytes_per_marker): - intval = struct.unpack('B', bed.read(1))[0] - idx = byte*4 - genotype_array[idx:idx+4] = INT_TO_GCODE[intval] - self._genotypes.append(genotype_array) - -class Bim: - def __init__(self, path): - """ - """ - self.path = path - self._markers = {} - self._ordered_markers = [] - self._marker_allele_lookup = {} - self._autosomal_indices = set() - - def parse(self): - """ - """ - print 'Reading map (extended format) from [ %s ]' % (self.path) - bim = open(self.path, 'r') - for m, line in enumerate(bim): - chrom, snp, gpos, apos, a1, a2 = line.strip().split() - self._markers[snp] = (chrom, snp, gpos, apos, a1, a2) - self._marker_allele_lookup[m] = { - HOM0: (a2, a2), - HOM1: (a1, a1), - HET : (a1, a2), - MISS: ('0','0'), - } - self._ordered_markers.append(snp) - if chrom in AUTOSOMES: - self._autosomal_indices.add(m) - bim.close() - print '%s markers to be included from [ %s ]' % (m+1, self.path) - -class Fam: - def __init__(self, path): - """ - """ - self.path = path - self._subjects = {} - self._ordered_subjects = [] - - def parse(self): - """ - """ - print 'Reading pedigree information from [ %s ]' % (self.path) - fam = open(self.path, 'r') - for s, line in enumerate(fam): - fid, iid, did, mid, sex, phe = line.strip().split() - sid = iid.split('.')[0] - d_sid = did.split('.')[0] - m_sid = mid.split('.')[0] - skey = (fid, iid) - self._ordered_subjects.append(skey) - self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) - fam.close() - print '%s individuals read from [ %s ]' % (s+1, self.path) - -### Command-line functionality and testing -def test(arg): - ''' - ''' - - import time - - if arg == 'CAMP_AFFY.ped': - print 'Testing bed.parse(quick=True)' - s = time.time() - bed = Bed(arg.replace('.ped', 
'.bed')) - bed.parse(quick=True) - print bed.getGenotype(('400118', '10300283'), 'rs2000467') - print bed.getGenotype(('400118', '10101384'), 'rs2294019') - print bed.getGenotype(('400121', '10101149'), 'rs2294019') - print bed.getGenotype(('400123', '10200290'), 'rs2294019') - assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4') - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing bed.parse' - s = time.time() - bed = BPed(arg) - bed.parse(quick=False) - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing bed.writeped' - s = time.time() - outname = '%s_BEDTEST' % (arg) - bed.writeped(outname) - e = time.time() - print 'e-s = %s\n' % (e-s) - del(bed) - - print 'Testing ped.parse' - s = time.time() - ped = LPed(arg) - ped.parse() - e = time.time() - print 'e-s = %s\n' % (e-s) - - print 'Testing ped.writebed' - s = time.time() - outname = '%s_PEDTEST' % (arg) - ped.writebed(outname) - e = time.time() - print 'e-s = %s\n' % (e-s) - del(ped) - -def profile_bed(arg): - """ - """ - bed = BPed(arg) - bed.parse(quick=False) - outname = '%s_BEDPROFILE' % (arg) - bed.writeped(outname) - -def profile_ped(arg): - """ - """ - ped = LPed(arg) - ped.parse() - outname = '%s_PEDPROFILE' % (arg) - ped.writebed(outname) - -if __name__ == '__main__': - """ Run as a command-line, this script should get one or more arguments, - each one a ped file to be parsed with the PedParser (unit tests?) - """ - op = optparse.OptionParser() - op.add_option('--profile-bed', action='store_true', default=False) - op.add_option('--profile-ped', action='store_true', default=False) - opts, args = op.parse_args() - - if opts.profile_bed: - import profile - import pstats - profile.run('profile_bed(args[0])', 'fooprof') - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - elif opts.profile_ped: - import profile - import pstats - profile.run('profile_ped(args[0])', 'fooprof') - p = pstats.Stats('fooprof') - p.sort_stats('cumulative').print_stats(10) - else: - for arg in args: - test(arg) - - ### Code used to generate the INT_TO_GCODE dictionary - #print '{\n ', - #for i in range(256): - # b = INT2BIN[i] - # ints = [] - # s = str(i).rjust(3) - # #print b - # for j in range(4): - # idx = j*2 - # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2) - # ints.append(int(b[idx:idx+2], 2)) - # print '%s: array(\'i\', %s),' % (s,tuple(ints)), - # if i > 0 and (i+1) % 4 == 0: - # print '\n ', - #print '}' - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/plinkbinJZ.pyc Binary file tools/rgenetics/plinkbinJZ.pyc has changed diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgCaCo.py --- a/tools/rgenetics/rgCaCo.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,271 +0,0 @@ -#!/usr/local/bin/python -# hack to run and process a plink case control association -# expects args as -# bfilepath outname jobname outformat (wig,xls) -# ross lazarus -# for wig files, we need annotation so look for map file or complain -""" -Parameters for wiggle track definition lines -All options are placed in a single line separated by spaces: - - track type=wiggle_0 name=track_label description=center_label \ - visibility=display_mode color=r,g,b altColor=r,g,b \ - priority=priority autoScale=on|off \ - gridDefault=on|off maxHeightPixels=max:default:min \ - graphType=bar|points viewLimits=lower:upper \ - yLineMark=real-value yLineOnOff=on|off \ - windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 -""" - -import 
sys,math,shutil,subprocess,os,time,tempfile,string
-from os.path import abspath
-from rgutils import timenow, plinke
-imagedir = '/static/rg' # if needed for images
-myversion = 'V000.1 April 2007'
-verbose = False
-
-def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000):
-    """
-    score must be scaled to 0-1000
-
-    Want to make some wig tracks from each analysis
-    Best n -log10(p). Make top hit the window.
-    we use our tab output which has
-    rs chrom offset ADD_stat ADD_p ADD_log10p
-    rs3094315 1 792429 1.151 0.2528 0.597223
-    """
-
-    def is_number(s):
-        try:
-            float(s)
-            return True
-        except ValueError:
-            return False
-
-    header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description)
-    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
-    halfwidth=100
-    resfpath = os.path.join(twd,resf)
-    resf = open(resfpath,'r')
-    resfl = resf.readlines() # dumb but convenient for millions of rows
-    resfl = [x.split() for x in resfl]
-    headl = resfl[0]
-    resfl = resfl[1:]
-    headl = [x.strip().upper() for x in headl]
-    headIndex = dict(zip(headl,range(0,len(headl))))
-    whatwewant = ['CHR','RS','OFFSET','LOG10ARMITAGEP']
-    wewant = [headIndex.get(x,None) for x in whatwewant]
-    if None in wewant: # missing something
-        logf.write('### Error - missing a required header (one of %s) in makeGFF - headIndex=%s\n' % (whatwewant,headIndex))
-        return
-    ppos = wewant[3] # last in list
-    resfl = [x for x in resfl if x[ppos] > '' and x[ppos] <> 'NA']
-    resfl = [(float(x[ppos]),x) for x in resfl] # decorate
-    resfl.sort()
-    resfl.reverse() # using -log10 so larger is better
-    pvals = [x[0] for x in resfl] # need to scale
-    resfl = [x[1] for x in resfl] # drop decoration
-    resfl = resfl[:topn] # truncate
-    maxp = max(pvals) # need to scale
-    minp = min(pvals)
-    prange = abs(maxp-minp) + 0.5 # fudge
-    scalefact = 1000.0/prange
-    logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact))
-    for i,row in enumerate(resfl):
-        row[ppos] = '%d' % (int(scalefact*pvals[i]))
-        resfl[i] = row # replace
-    outf = file(outfname,'w')
-    outf.write(header)
-    outres = [] # need to resort into chrom offset order
-    for i,lrow in enumerate(resfl):
-        chrom,snp,offset,p, = [lrow[x] for x in wewant]
-        gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth),
-               '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i]))
-        outres.append(gff)
-    outres = [(x[0],int(x[3]),x) for x in outres] # decorate
-    outres.sort() # into chrom offset
-    outres=[x[2] for x in outres] # undecorate
-    outres = ['\t'.join(x) for x in outres]
-    outf.write('\n'.join(outres))
-    outf.write('\n')
-    outf.close()
-
-
-def plink_assocToGG(plinkout="hm",tag='test'):
-    """ plink --assoc output looks like this
-    # CHR SNP A1 F_A F_U A2 CHISQ P OR
-    # 1 rs3094315 G 0.6685 0.1364 A 104.1 1.929e-24 12.77
-    # write as a genegraph input file
-    """
-    inf = file('%s.assoc' % plinkout,'r')
-    outf = file('%sassoc.xls' % plinkout,'w')
-    res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs
-    head = inf.next()
-    for l in inf:
-        ll = l.split()
-        if len(ll) >= 8:
-            p = ll[7] # test for 'NA' before converting - float('NA') would blow up
-            if p <> 'NA': # eesh
-                logp = '%9.9f' % -math.log10(float(p))
-            else:
-                logp = 'NA'
-            try:
-                orat = ll[8]
-            except:
-                orat = 'NA'
-            orat2 = orat
-            # invert odds ratios below 1 - guard against 'NA' here too
-            if orat <> 'NA' and float(orat) < 1 and float(orat) > 0.0:
-                orat2 = '%9.9f' % (1.0/float(orat))
-            outl = [ll[1],logp, orat2, orat]
-            res.append('\t'.join(outl))
-    outf.write('\n'.join(res))
-    outf.write('\n')
-    outf.close()
-    inf.close()
-
-def xformModel(infname='',resf='',outfname='',
-               name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None):
-    """munge a plink .model file into either a ucsc track or an xls file
-    rerla@meme ~/plink]$ head hmYRI_CEU.model
-    CHR SNP TEST AFF UNAFF CHISQ DF P
-    1 rs3094315 GENO 41/37/11 0/24/64 NA NA NA
-    1 rs3094315 TREND 119/59 24/152 81.05 1 2.201e-19
-    1 rs3094315 ALLELIC 119/59 24/152 104.1 1 1.929e-24
-    1 rs3094315 DOM 78/11 24/64 NA NA NA
-
-    bim file has
-    [rerla@beast pbed]$ head plink_wgas1_example.bim
-    1 rs3094315 0.792429 792429 G A
-    1 rs6672353 0.817376 817376 A G
-    """
-    if verbose:
-        print 'Rgenetics rgCaCo.xformModel got resf=%s, outfname=%s' % (resf,outfname)
-    res = []
-    rsdict = {}
-    map = file(mapf,'r')
-    for l in map: # plink map
-        ll = l.strip().split()
-        if len(ll) >= 3:
-            rs=ll[1].strip()
-            chrom = ll[0]
-            # keep chromosome codes as strings - they are tab-joined below
-            if chrom.lower() == 'x':
-                chrom='23'
-            elif chrom.lower() == 'y':
-                chrom = '24'
-            elif chrom.lower() == 'mito':
-                chrom = '25'
-            offset = ll[3]
-            rsdict[rs] = (chrom,offset)
-    res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp')
-    f = open(resf,'r')
-    headl = f.readline()
-    if headl.find('\t') <> -1:
-        headl = headl.split('\t')
-        delim = '\t'
-    else:
-        headl = headl.split()
-        delim = None
-    whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P']
-    wewant = [headl.index(x) for x in whatwewant]
-    llen = len(headl)
-    lnum = anum = 0
-    lastsnp = None # so we know when to write out a gg line
-    outl = {}
-    f.seek(0)
-    for lnum,l in enumerate(f):
-        if lnum == 0:
-            continue
-        ll = l.split()
-        if delim:
-            ll = l.split(delim)
-        if len(ll) >= llen: # valid line
-            chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant]
-            snp = snp.strip()
-            chrom,offset = rsdict.get(snp,(None,None))
-            anum += 1
-            fp = 1.0 # if NA
-            lp = 0.0
-            try:
-                fp = float(p)
-                if fp > 0:
-                    lp = -math.log10(fp)
-                else:
-                    fp = 9e-100
-                    flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test))
-                    flog.write('### offending line #%d in %s = %s' % (lnum,resf,l))
-            except:
-                pass
-            if snp <> lastsnp:
-                if len(outl.keys()) > 3:
-                    sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
-                    res.append('\t'.join(sl)) # last snp line
-                outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line
-                lastsnp = snp # reset for next marker
-            #if p == 'NA':
-            #    p = 1.0
-            # let's pass downstream for handling R is fine?
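-            # (an unparseable p such as NA is harmless here: fp and lp keep the
-            # defaults of 1.0 and 0.0 set above, and p is written out unchanged)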
-            outl[test] = '%s\t%f' % (p,lp)
-    if len(outl.keys()) > 3:
-        l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')]
-        res.append('\t'.join(l)) # last snp line
-    f = file(outfname,'w')
-    res.append('')
-    f.write('\n'.join(res))
-    f.close()
-
-
-if __name__ == "__main__":
-    """
-    # called as
-    <command interpreter="python">
-        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name"
-        '$out_file1' '$logf' '$logf.files_path' '$gffout'
-    </command>
-    """
-    if len(sys.argv) < 7:
-        s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv)
-        print >> sys.stdout, s
-        sys.exit(1) # a parameter error is a failure, not a clean exit
-    bfname = sys.argv[1]
-    name = sys.argv[2]
-    killme = string.punctuation + string.whitespace
-    trantab = string.maketrans(killme,'_'*len(killme))
-    name = name.translate(trantab)
-    outfname = sys.argv[3]
-    logf = sys.argv[4]
-    logoutdir = sys.argv[5]
-    gffout = sys.argv[6]
-    topn = 1000
-    try:
-        os.makedirs(logoutdir)
-    except:
-        pass
-    map_file = None
-    me = sys.argv[0]
-    amapf = '%s.bim' % bfname # to decode map in xformModel
-    flog = file(logf,'w')
-    logme = []
-    cdir = os.getcwd()
-    s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow())
-    print >> sys.stdout, s # so will appear as blurb for file
-    logme.append(s)
-    if verbose:
-        s = 'rgCaCo.py: bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv)
-        print >> sys.stdout, s # so will appear as blurb for file
-        logme.append(s)
-    twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root!
-    tname = os.path.join(twd,name)
-    vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model']
-    p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd)
-    retval = p.wait()
-    resf = '%s.model' % tname # plink output is here we hope
-    xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file
-    makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn)
-    flog.write('\n'.join(logme))
-    flog.close() # close the log used
-    #shutil.copytree(twd,logoutdir)
-    shutil.rmtree(twd) # clean up
diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgCaCo.xml
--- a/tools/rgenetics/rgCaCo.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-<tool id="rgCaCo1" name="Case Control:">
-    <description>for unrelated subjects</description>
-    <command interpreter="python">
-        rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$title" '$out_file1' '$logf' '$logf.files_path' '$gffout'
-    </command>
-    <inputs>
-       <param name="i" type="data" label="RGenetics genotype data from your current history" format="pbed" />
-       <param name='title' type='text' size="132" value='CaseControl' label="Title for this job"/>
-    </inputs>
-
-    <outputs>
-       <data format="tabular" name="out_file1" label="${title}_rgCaCo.xls" />
-       <data format="txt" name="logf" label="${title}_rgCaCo.log"/>
-       <data format="gff" name="gffout" label="${title}_rgCaCoTop.gff" />
-    </outputs>
-<tests>
- <test>
-   <param name='i' value='tinywga' ftype='pbed' >
-     <metadata name='base_name' value='tinywga' />
-     <composite_data value='tinywga.bim' />
-     <composite_data value='tinywga.bed' />
-     <composite_data value='tinywga.fam' />
-     <edit_attributes type='name' value='tinywga' />
-   </param>
-   <param name='title' value='rgCaCotest1' />
-   <output name='out_file1' file='rgCaCotest1_CaCo.xls' ftype='tabular' compare='diff' />
-   <output name='logf' file='rgCaCotest1_CaCo_log.txt' ftype='txt' compare='diff' lines_diff='20' />
-   <output name='gffout' file='rgCaCotest1_CaCo_topTable.gff' ftype='gff' compare='diff' />
- </test>
-</tests>
-<help>
-
-.. class:: infomark
-
-**Syntax**
-
-- **Genotype file** is the input case control data chosen from available library Plink binary files
-- **Map file** is the linkage format .map file corresponding to the genotypes in the Genotype file
-- **Type of test** is the kind of test statistic to report such as Armitage trend test or genotype test
-- **Format** determines how your data will be returned to your Galaxy workspace
-
------
-
-**Summary**
-
-This tool will perform some standard statistical tests comparing subjects designated as
-affected (cases) and unaffected subjects (controls). To avoid bias, it is important that
-controls who had been affected would have been eligible for sampling as cases. This may seem
-odd, but it requires that the cases and controls are drawn from the same sampling frame.
-
-The Armitage trend test is robust to departure from HWE and so very attractive - after all, a real disease
-mutation may well result in distorted HWE, at least in cases. All the others are susceptible to
-bias in the presence of HWE departures.
-
-All of these tests are exquisitely sensitive to differential population stratification in cases
-compared to controls, and this must be tested before believing any results here. Use the PCA method for
-100k markers or more.
-
-If you don't see the genotype data set you want here, it can be imported using one of the methods available from
-the Galaxy Get Data tool page.
-
-Output format can be UCSC .bed if you want to see your
-results as a fully fledged UCSC track. A map file containing the chromosome and offset for each marker is required for
-writing this kind of output.
-Alternatively you can use .gg for the UCSC Genome Graphs tool, which has all of the advantages
-of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
-Either of these is a very useful way of quickly getting a look
-at your data in full genomic context.
-
-Finally, if you can't live without
-spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary Excel file. Just a plain old tab delimited
-one with a header. Fortunately Excel is dumb enough to open these without much protest.
-
------
-
-.. class:: infomark
-
-**Attribution**
-
-This Galaxy tool relies on Plink (see Plinksrc_) to test Case Control association models.
-
-So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see
-DOC_
-
-Tool and Galaxy datatypes originally designed and written for the Rgenetics
-series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com)
-
-Copyright Ross Lazarus March 2007
-This Galaxy wrapper is released under the LGPL_ but is about as useful as a chocolate teapot without Plink, which is GPL.
-
-I'm no lawyer, but it looks like you got GPL if you use this software. Good luck.
-
-.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/
-
-.. _LGPL: http://www.gnu.org/copyleft/lesser.html
-
-.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#cc - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClean.py --- a/tools/rgenetics/rgClean.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,160 +0,0 @@ -""" -# galaxy tool xml files can define a galaxy supplied output filename -# that must be passed to the tool and used to return output -# here, the plink log file is copied to that file and removed -# took a while to figure this out! -# use exec_before_job to give files sensible names -# -# ross april 14 2007 -# plink cleanup script -# ross lazarus March 2007 for camp illumina whole genome data -# note problems with multiple commands being ignored - eg --freq --missing --mendel -# only the first seems to get done... -# -##Summary statistics versus inclusion criteria -## -##Feature As summary statistic As inclusion criteria -##Missingness per individual --missing --mind N -##Missingness per marker --missing --geno N -##Allele frequency --freq --maf N -##Hardy-Weinberg equilibrium --hardy --hwe N -##Mendel error rates --mendel --me N M -# -# call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile -# note plinkClean_code.py does some renaming before the job starts - - - <command interpreter="python2.4"> - rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf' - '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId' - - -""" -import sys,shutil,os,subprocess, glob, string, tempfile, time -from rgutils import galhtmlprefix, timenow, plinke -prog = os.path.split(sys.argv[0])[-1] -myversion = 'January 4 2010' -verbose=False - - -def fixoutaff(outpath='',newaff='1'): - """ quick way to create test data sets - set all aff to 1 or 2 for - some hapmap data and then merge - [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam - 1341 14 0 0 2 1 - 1341 2 13 14 2 1 - 1341 13 0 0 1 1 - 1340 9 0 0 1 1 - 1340 10 0 0 2 1 - """ - nchanged = 0 - fam = '%s.fam' % outpath - famf = open(fam,'r') - fl = famf.readlines() - famf.close() - for i,row in enumerate(fl): - lrow = row.split() - if lrow[-1] <> newaff: - lrow[-1] = newaff - fl[i] = ' '.join(lrow) - fl[i] += '\n' - nchanged += 1 - fo = open(fam,'w') - fo.write(''.join(fl)) - fo.close() - return nchanged - - - -def clean(): - """ - """ - if len(sys.argv) < 16: - print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv) - print >> sys.stdout, """this script will filter a linkage format ped - and map file containing genotypes. 
It takes 15 parameters - the plink --bfile input file root and
-        a new filename root for the output clean data, followed by the mind, geno, hwe, maf, mef and mei thresholds
-        documented in the plink docs, the file to be returned to Galaxy, and the filter settings,
-        called as:
-        <command interpreter="python">
-        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
-        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
-        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
-        </command>
-
-        """
-        sys.exit(1)
-    plog = []
-    inpath = sys.argv[1]
-    inbase = sys.argv[2]
-    killme = string.punctuation + string.whitespace
-    trantab = string.maketrans(killme,'_'*len(killme))
-    title = sys.argv[3].translate(trantab)
-    mind = sys.argv[4]
-    geno = sys.argv[5]
-    hwe = sys.argv[6]
-    maf = sys.argv[7]
-    me1 = sys.argv[8]
-    me2 = sys.argv[9]
-    outfname = sys.argv[10]
-    outfpath = sys.argv[11]
-    relf = sys.argv[12]
-    afff = sys.argv[13]
-    sexf = sys.argv[14]
-    fixaff = sys.argv[15]
-    output = os.path.join(outfpath,outfname)
-    outpath = os.path.join(outfpath,title)
-    outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
-    try:
-        os.makedirs(outfpath)
-    except:
-        pass
-    bfile = os.path.join(inpath,inbase)
-    outf = file(outfname,'w')
-    vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
-           outpath,'--set-hh-missing','--mind',mind,
-           '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
-    # yes - the --me parameter takes 2 values - mendels per snp and per family
-    if relf == 'oo': # plink filters are what they leave...
-        vcl.append('--filter-nonfounders') # leave only offspring
-    elif relf == 'fo':
-        vcl.append('--filter-founders')
-    if afff == 'affonly':
-        vcl.append('--filter-controls')
-    elif afff == 'unaffonly':
-        vcl.append('--filter-cases')
-    if sexf == 'fsex':
-        vcl.append('--filter-females')
-    elif sexf == 'msex':
-        vcl.append('--filter-males')
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
-    retval = p.wait()
-    plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
-    outf.write(galhtmlprefix % prog)
-    outf.write('<ul>\n')
-    plogf = '%s.log' % os.path.join(outfpath,title)
-    try:
-        plogl = file(plogf,'r').readlines()
-        plog += [x.strip() for x in plogl]
-    except:
-        plog += ['###Cannot open plink log file %s' % plogf,]
-    # if fixaff, want to 'fix' the fam file
-    if fixaff <> '0':
-        nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
-        plog += ['## fixaff was requested - %d subjects affection status changed to %s' % (nchanged,fixaff)]
-    pf = file(plogf,'w')
-    pf.write('\n'.join(plog))
-    pf.close()
-    globme = os.path.join(outfpath,'*')
-    flist = glob.glob(globme)
-    flist.sort()
-    for i, data in enumerate( flist ):
-        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
-    outf.write('</ul>\n')
-    outf.write("</ul></br></div></body></html>")
-    outf.close()
-
-
-if __name__ == "__main__":
-    clean()
-
diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClean.xml
--- a/tools/rgenetics/rgClean.xml	Fri Mar 09 19:45:42 2012 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,154 +0,0 @@
-<tool id="rgClean1" name="Clean genotypes:">
-    <description>filter markers, subjects</description>
-
-    <command interpreter="python">
-        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
-        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
-        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
-    </command>
-
-    <inputs>
-       <param name="input_file" type="data" label="RGenetics 
genotype library file in compressed Plink format" - size="120" format="pbed" /> - <param name="title" type="text" size="80" label="Descriptive title for cleaned genotype file" value="Cleaned_data"/> - <param name="geno" type="text" label="Maximum Missing Fraction: Markers" value="0.05" /> - <param name="mind" type="text" value="0.1" label="Maximum Missing Fraction: Subjects"/> - <param name="mef" type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/> - <param name="mei" type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/> - <param name="hwe" type="text" value="0" label="Smallest HWE p value (set to 0 for all)" /> - <param name="maf" type="text" value="0.01" - label="Smallest Minor Allele Frequency (set to 0 for all)"/> - <param name='relfilter' label = "Filter on pedigree relatedness" type="select" - optional="false" size="132" - help="Optionally remove related subjects if pedigree identifies founders and their offspring"> - <option value="all" selected='true'>No filter on relatedness</option> - <option value="fo" >Keep Founders only (pedigree m/f ID = "0")</option> - <option value="oo" >Keep Offspring only (one randomly chosen if >1 sibs in family)</option> - </param> - <param name='afffilter' label = "Filter on affection status" type="select" - optional="false" size="132" - help="Optionally remove affected or non affected subjects"> - <option value="allaff" selected='true'>No filter on affection status</option> - <option value="affonly" >Keep Controls only (affection='1')</option> - <option value="unaffonly" >Keep Cases only (affection='2')</option> - </param> - <param name='sexfilter' label = "Filter on gender" type="select" - optional="false" size="132" - help="Optionally remove all male or all female subjects"> - <option value="allsex" selected='true'>No filter on gender status</option> - <option value="msex" >Keep Males only (pedigree gender='1')</option> - <option value="fsex" >Keep Females only (pedigree gender='2')</option> - </param> - <param name="fixaff" type="text" value="0" - label = "Change ALL subjects affection status to (0=no change,1=unaff,2=aff)" - help="Use this option to switch the affection status to a new value for all output subjects" /> - </inputs> - - <outputs> - <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" /> - </outputs> - -<tests> - <test> - <param name='input_file' value='tinywga' ftype='pbed' > - <metadata name='base_name' value='tinywga' /> - <composite_data value='tinywga.bim' /> - <composite_data value='tinywga.bed' /> - <composite_data value='tinywga.fam' /> - <edit_attributes type='name' value='tinywga' /> - </param> - <param name='title' value='rgCleantest1' /> - <param name="geno" value="1" /> - <param name="mind" value="1" /> - <param name="mef" value="0" /> - <param name="mei" value="0" /> - <param name="hwe" value="0" /> - <param name="maf" value="0" /> - <param name="relfilter" value="all" /> - <param name="afffilter" value="allaff" /> - <param name="sexfilter" value="allsex" /> - <param name="fixaff" value="0" /> - <output name='out_file1' file='rgtestouts/rgClean/rgCleantest1.pbed' compare="diff" lines_diff="25" > - <extra_files type="file" name='rgCleantest1.bim' value="rgtestouts/rgClean/rgCleantest1.bim" compare="diff" /> - <extra_files type="file" name='rgCleantest1.fam' value="rgtestouts/rgClean/rgCleantest1.fam" compare="diff" /> - <extra_files type="file" name='rgCleantest1.bed' value="rgtestouts/rgClean/rgCleantest1.bed" compare="diff" /> - </output> - 
</test>
-</tests>
-<help>
-
-.. class:: infomark
-
-**Syntax**
-
-- **Genotype data** is the input genotype file chosen from your current history
-- **Descriptive title** is the name to use for the filtered output file
-- **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import
-- **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import
-- **MaxMendel Individuals** is the fraction of Mendelian errors in transmission above which subjects will be excluded (for family data only)
-- **MaxMendel Families** is the fraction of Mendelian errors in transmission above which families will be excluded (for family data only)
-- **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to 0 and all markers will be imported regardless of HWE p value
-- **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded
-- **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed
-- **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets
-
------
-
-**Attribution**
-
-This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/,
-and the R http://cran.r-project.org/ and
-Bioconductor http://www.bioconductor.org/ projects.
-
-In particular, http://pngu.mgh.harvard.edu/~purcell/plink/
-has excellent documentation describing the parameters you can set here.
-
-This implementation is a Galaxy tool wrapper around these third party applications.
-It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
-ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
-
-Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.
-
------
-
-**Summary**
-
-Reliable statistical inference depends on reliable data. Poor quality samples and markers
-may add more noise than signal, decreasing statistical power. Removing the worst of them
-can be done by setting thresholds for some of the commonly used technical quality measures
-for genotype data. Of course discordant replicate calls are also very informative but are not
-in scope here.
-
-Marker cleaning: Filters are available to remove markers below a minor allele
-frequency threshold, beyond a Hardy-Weinberg threshold,
-or above a threshold for missingness. If family data are available, thresholds for Mendelian
-error can be set.
-
-Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions
-of Mendelian errors in observed transmission. Use the QC reporting tool to
-generate a comprehensive series of reports for quality control.
-
-Note that ancestry and cryptic relatedness should also be checked using the relevant tools.
-
------
-
-.. class:: infomark
-
-**Tip**
-
-You can check that you got what you asked for by running the QC tool to ensure that the distributions
-are truncated the way you expect. 
Note that you do not expect that the thresholds will be exactly -what you set - some bad assays and subjects are out in multiple QC measures, so you sometimes have -more samples or markers than you exactly set for each threshold. Finally, the ordering of -operations matters and Plink is somewhat restrictive about what it will do on each pass -of the data. At least it's fixed. - ------ - -This Galaxy tool was written by Ross Lazarus for the Rgenetics project -It uses Plink for most calculations - for full Plink attribution, source code and documentation, -please see http://pngu.mgh.harvard.edu/~purcell/plink/ plus some custom python code - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClustalw.py --- a/tools/rgenetics/rgClustalw.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -""" -rgclustalw.py -wrapper for clustalw necessitated by bad choice of output path for .dnd file based on input file. Naughty. -Copyright ross lazarus march 2011 -All rights reserved -Licensed under the LGPL -""" - -import sys,optparse,os,subprocess,tempfile,shutil - -class Clustrunner: - """ - """ - def __init__(self,opts=None): - self.opts = opts - self.iname = 'infile_copy' - shutil.copy(self.opts.input,self.iname) - - def run(self): - tlf = open(self.opts.outlog,'w') - cl = ['clustalw2 -INFILE=%s -OUTFILE=%s -OUTORDER=%s -TYPE=%s -OUTPUT=%s' % (self.iname,self.opts.output,self.opts.out_order,self.opts.dnarna,self.opts.outform)] - if self.opts.seq_range_end <> None and self.opts.seq_range_start <> None: - cl.append('-RANGE=%s,%s' % (self.opts.seq_range_start,self.opts.seq_range_end)) - if self.opts.outform=='CLUSTAL' and self.opts.outseqnos <> None: - cl.append('-SEQNOS=ON') - process = subprocess.Popen(' '.join(cl), shell=True, stderr=tlf, stdout=tlf) - rval = process.wait() - dndf = '%s.dnd' % self.iname - if os.path.exists(dndf): - tlf.write('\nClustal created the following dnd file for your information:\n') - dnds = open('%s.dnd' % self.iname,'r').readlines() - for row in dnds: - tlf.write(row) - tlf.write('\n') - tlf.close() - os.unlink(self.iname) - - - -if __name__ == "__main__": - op = optparse.OptionParser() - op.add_option('-i', '--input', default=None) - op.add_option('-o', '--output', default=None) - op.add_option('-t', '--outname', default="rgClustal") - op.add_option('-s', '--out_order', default='ALIGNMENT') - op.add_option('-f', '--outform', default='CLUSTAL') - op.add_option('-e', '--seq_range_end',default=None) - op.add_option('-b', '--seq_range_start',default=None) - op.add_option('-l','--outlog',default='rgClustalw.log') - op.add_option('-q', '--outseqnos',default=None) - op.add_option('-d', '--dnarna',default='DNA') - - opts, args = op.parse_args() - assert opts.input <> None - assert os.path.isfile(opts.input) - c = Clustrunner(opts) - c.run() - - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgClustalw.xml --- a/tools/rgenetics/rgClustalw.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -<tool id="clustalw" name="ClustalW" version="0.1"> - <description>multiple sequence alignment program for DNA or proteins</description> - <command interpreter="python"> - rgClustalw.py -i "$input" -o "$output" -s "$out_order" -l "$outlog" -t "$outname" -d "$dnarna" - #if ($range.mode=="part") --b "$range.seq_range_start" -e "$range.seq_range_end" - #end if - #if ($outcontrol.outform=="clustal") --f "CLUSTAL" - #if ($outcontrol.out_seqnos=="ON") --q "ON" - #end if - #end if - #if 
($outcontrol.outform=="phylip") --f "PHYLIP" - #end if - #if ($outcontrol.outform=="fasta") --f "FASTA" - #end if - </command> - <inputs> - <page> - <param format="fasta" name="input" type="data" label="Fasta File" /> - <param name="outname" label="Name for output files to make it easy to remember what you did" type="text" size="50" value="Clustal_run" /> - <param name="dnarna" type="select" label="Data Type"> - <option value="DNA" selected="True">DNA nucleotide sequences</option> - <option value="PROTEIN">Protein sequences</option> - </param> - <conditional name="outcontrol"> - <param name="outform" type="select" label="Output alignment format"> - <option value="clustal" selected="True">Native Clustal output format</option> - <option value="phylip">Phylip format</option> - <option value="fasta">Fasta format</option> - </param> - <when value="fasta" /> - <when value="phylip" /> - <when value="clustal"> - <param name="out_seqnos" type="select" label="Show residue numbers in clustal format output"> - <option value="ON">yes</option> - <option value="OFF" selected="true">no</option> - </param> - </when> - </conditional> - <param name="out_order" type="select" label="Output Order"> - <option value="ALIGNED">aligned</option> - <option value="INPUT">same order as input file</option> - </param> - - <conditional name="range"> - <param name="mode" type="select" label="Output complete alignment (or specify part to output)"> - <option value="complete">complete alignment</option> - <option value="part">only part of the alignment</option> - </param> - <when value="complete"> - </when> - <when value="part"> - <param name="seq_range_start" size="5" type="integer" value="1" label="start point" help="sequence range to write"> - </param> - <param name="seq_range_end" size="5" type="integer" value="99999" label="end point" > - </param> - </when> - </conditional> - </page> - </inputs> - <outputs> - <data format="clustal" name="output" label="${outname}_output.${outcontrol.outform}"> - <change_format> - <when input="outcontrol.outform" value="phylip" format="phylip" /> - <when input="outcontrol.outform" value="fasta" format="fasta" /> - </change_format> - </data> - <data format="txt" name="outlog" label="${outname}_clustal_log.txt"/> - </outputs> - <tests> - <test> - <param name="input" value="rgClustal_testin.fasta" /> - <param name = "outname" value="" /> - <param name = "outform" value="fasta" /> - <param name = "dnarna" value="DNA" /> - <param name = "mode" value="complete" /> - <param name = "out_order" value="ALIGNED" /> - <output name="output" file="rgClustal_testout.fasta" ftype="fasta" /> - <output name="outlog" file="rgClustal_testout.log" ftype="txt" lines_diff="5" /> - </test> - </tests> - <help> - -**Note** - -This tool allows you to run a multiple sequence alignment with ClustalW2 (see Clustsrc_) using the default options. - -For a tutorial introduction, see ClustalW2_ - -You can align DNA or protein sequences in the input file which should be multiple sequences to be aligned in a fasta file - -A log will be output to your history showing the output Clustal would normally write to standard output. - -The alignments will appear as a clustal format file or optionally, as phylip or fasta format files in your history. If you choose fasta as -the output format, you can create a 'Logo' image using the Sequence Logo tool. 
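-
-For orientation, the underlying wrapper script can also be run by hand with the same options this tool form sets
-(a sketch only - the input and output file names here are invented)::
-
-  python rgClustalw.py -i myseqs.fasta -o myseqs_aligned.clustal -t Clustal_run -d DNA -s ALIGNED -f CLUSTAL -l myseqs_clustal_log.txt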
- -If Clustal format is chosen, you have the option of adding basepair counts to the output - -A subsequence of the alignment can be output by setting the Output complete parameter to "Partial" and defining the offset and end of the subsequence to be output - ----- - -**Attribution** - -Clustal attribution and associated documentation are available at Clustsrc_ - -The first iteration of this Galaxy wrapper was written by Hans-Rudolf Hotz - see Clustfirst_ - -It was modified by Ross Lazarus for the rgenetics project - tests and some additional parameters were added - -This wrapper is released licensed under the LGPL_ - -.. _ClustalW2: http://www.ebi.ac.uk/2can/tutorials/protein/clustalw.html - -.. _Clustsrc: http://www.clustal.org - -.. _Clustfirst: http://lists.bx.psu.edu/pipermail/galaxy-dev/2010-November/003732.html - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html - - </help> - -</tool> - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgEigPCA.py --- a/tools/rgenetics/rgEigPCA.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,342 +0,0 @@ -""" -run smartpca - -This uses galaxy code developed by Dan to deal with -arbitrary output files using an html dataset with it's own -subdirectory containing the arbitrary files -We create that html file and add all the links we need - -Note that we execute the smartpca.perl program in the output subdirectory -to avoid having to clear out the job directory after running - -Code to convert linkage format ped files into eigenstratgeno format is left here -in case we decide to autoconvert - -Added a plot in R with better labels than the default eigensoft plot december 26 2007 - -DOCUMENTATION OF smartpca program: - -smartpca runs Principal Components Analysis on input genotype data and - outputs principal components (eigenvectors) and eigenvalues. - The method assumes that samples are unrelated. (However, a small number - of cryptically related individuals is usually not a problem in practice - as they will typically be discarded as outliers.) - -5 different input formats are supported. See ../CONVERTF/README -for documentation on using the convertf program to convert between formats. - -The syntax of smartpca is "../bin/smartpca -p parfile". We illustrate -how parfile works via a toy example (see example.perl in this directory). -This example takes input in EIGENSTRAT format. The syntax of how to take input -in other formats is analogous to the convertf program, see ../CONVERTF/README. - -The smartpca program prints various statistics to standard output. -To redirect this information to a file, change the above syntax to -"../bin/smartpca -p parfile >logfile". For a description of these -statistics, see the documentation file smartpca.info in this directory. - -Estimated running time of the smartpca program is - 2.5e-12 * nSNP * NSAMPLES^2 hours if not removing outliers. - 2.5e-12 * nSNP * NSAMPLES^2 hours * (1+m) if m outlier removal iterations. -Thus, under the default of up to 5 outlier removal iterations, running time is - up to 1.5e-11 * nSNP * NSAMPLES^2 hours. - ------------------------------------------------------------------------- - -DESCRIPTION OF EACH PARAMETER in parfile for smartpca: - -genotypename: input genotype file (in any format: see ../CONVERTF/README) -snpname: input snp file (in any format: see ../CONVERTF/README) -indivname: input indiv file (in any format: see ../CONVERTF/README) -evecoutname: output file of eigenvectors. See numoutevec parameter below. 
-evaloutname: output file of all eigenvalues - -OPTIONAL PARAMETERS: - -numoutevec: number of eigenvectors to output. Default is 10. -numoutlieriter: maximum number of outlier removal iterations. - Default is 5. To turn off outlier removal, set this parameter to 0. -numoutlierevec: number of principal components along which to - remove outliers during each outlier removal iteration. Default is 10. -outliersigmathresh: number of standard deviations which an individual must - exceed, along one of the top (numoutlierevec) principal components, in - order for that individual to be removed as an outlier. Default is 6.0. -outlieroutname: output logfile of outlier individuals removed. If not specified, - smartpca will print this information to stdout, which is the default. -usenorm: Whether to normalize each SNP by a quantity related to allele freq. - Default is YES. (When analyzing microsatellite data, should be set to NO. - See Patterson et al. 2006.) -altnormstyle: Affects very subtle details in normalization formula. - Default is YES (normalization formulas of Patterson et al. 2006) - To match EIGENSTRAT (normalization formulas of Price et al. 2006), set to NO. -missingmode: If set to YES, then instead of doing PCA on # reference alleles, - do PCA on whether each data point is missing or nonmissing. Default is NO. - May be useful for detecting informative missingness (Clayton et al. 2005). -nsnpldregress: If set to a positive integer, then LD correction is turned on, - and input to PCA will be the residual of a regression involving that many - previous SNPs, according to physical location. See Patterson et al. 2006. - Default is 0 (no LD correction). If desiring LD correction, we recommend 2. -maxdistldregress: If doing LD correction, this is the maximum genetic distance - (in Morgans) for previous SNPs used in LD correction. Default is no maximum. -poplistname: If wishing to infer eigenvectors using only individuals from a - subset of populations, and then project individuals from all populations - onto those eigenvectors, this input file contains a list of population names, - one population name per line, which will be used to infer eigenvectors. - It is assumed that the population of each individual is specified in the - indiv file. Default is to use individuals from all populations. -phylipoutname: output file containing an fst matrix which can be used as input - to programs in the PHYLIP package, such as the "fitch" program for - constructing phylogenetic trees. -noxdata: if set to YES, all SNPs on X chr are excluded from the data set. - The smartpca default for this parameter is YES, since different variances - for males vs. females on X chr may confound PCA analysis. -nomalexhet: if set to YES, any het genotypes on X chr for males are changed - to missing data. The smartpca default for this parameter is YES. -badsnpname: specifies a list of SNPs which should be excluded from the data set. - Same format as example.snp. Cannot be used if input is in - PACKEDPED or PACKEDANCESTRYMAP format. -popsizelimit: If set to a positive integer, the result is that only the first - popsizelimit individuals from each population will be included in the - analysis. It is assumed that the population of each individual is specified - in the indiv file. Default is to use all individuals in the analysis. 
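-
-As a concrete illustration, a minimal parfile using only the parameters documented above might read as follows
-(the example.* names echo the toy example mentioned earlier and are placeholders, not files shipped with eigensoft):
-
-    genotypename: example.geno
-    snpname: example.snp
-    indivname: example.ind
-    evecoutname: example.evec
-    evaloutname: example.eval
-    numoutevec: 10
-    numoutlieriter: 5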
- -The next 5 optional parameters allow the user to output genotype, snp and - indiv files which will be identical to the input files except that: - Any individuals set to Ignore in the input indiv file will be - removed from the data set (see ../CONVERTF/README) - Any data excluded or set to missing based on noxdata, nomalexhet and - badsnpname parameters (see above) will be removed from the data set. - The user may decide to output these files in any format. -outputformat: ANCESTRYMAP, EIGENSTRAT, PED, PACKEDPED or PACKEDANCESTRYMAP -genotypeoutname: output genotype file -snpoutname: output snp file -indivoutname: output indiv file -outputgroup: see documentation in ../CONVERTF/README -""" -import sys,os,time,subprocess,string,glob -from rgutils import RRun, galhtmlprefix, galhtmlpostfix, timenow, smartpca, rexe, plinke -verbose = False - -def makePlot(eigpca='test.pca',title='test',pdfname='test.pdf',h=8,w=10,nfp=None,rexe=''): - """ - the eigenvec file has a # row with the eigenvectors, then subject ids, eigenvecs and lastly - the subject class - Rpy not being used here. Write a real R script and run it. Sadly, this means putting numbers - somewhere - like in the code as monster R vector constructor c(99.3,2.14) strings - At least you have the data and the analysis in one single place. Highly reproducible little - piece of research. - """ - debug=False - f = file(eigpca,'r') - R = [] - if debug: - R.append('sessionInfo()') - R.append("print('dir()=:')") - R.append('dir()') - R.append("print('pdfname=%s')" % pdfname) - gvec = [] - pca1 = [] - pca2 = [] - groups = {} - glist = [] # list for legend - ngroup = 1 # increment for each new group encountered for pch vector - for n,row in enumerate(f): - if n > 1: - rowlist = row.strip().split() - group = rowlist[-1] - v1 = rowlist[1] - v2 = rowlist[2] - try: - v1 = float(v1) - except: - v1 = 0.0 - try: - v2 = float(v2) - except: - v2 = 0.0 - if not groups.get(group,None): - groups[group] = ngroup - glist.append(group) - ngroup += 1 # for next group - gvec.append(groups[group]) # lookup group number - pca1.append('%f' % v1) - pca2.append('%f' % v2) - # now have vectors of group,pca1 and pca2 - llist = [x.encode('ascii') for x in glist] # remove label unicode - eesh - llist = ['"%s"' % x for x in llist] # need to quote for R - R.append('llist=c(%s)' % ','.join(llist)) - - plist = range(2,len(llist)+2) # pch - avoid black circles - R.append('glist=c(%s)' % ','.join(['%d' % x for x in plist])) - pgvec = ['%d' % (plist[i-1]) for i in gvec] # plot symbol/colour for each point - R.append("par(lab=c(10,10,10))") # so our grid is denser than the default 5 - R.append("par(mai=c(1,1,1,0.5))") - maint = title - R.append('pdf("%s",h=%d,w=%d)' % (pdfname,h,w)) - R.append("par(lab=c(10,10,10))") - R.append('pca1 = c(%s)' % ','.join(pca1)) - R.append('pca2 = c(%s)' % ','.join(pca2)) - R.append('pgvec = c(%s)' % ','.join(pgvec)) - s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint - s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)" - R.append(s) - R.append('legend("top",legend=llist,pch=glist,col=glist,title="Sample")') - R.append('grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")') - R.append('dev.off()') - R.append('png("%s.png",h=%d,w=%d,units="in",res=72)' % (pdfname,h,w)) - s = "plot(pca1,pca2,type='p',main='%s', ylab='Second ancestry eigenvector'," % maint - s += "xlab='First ancestry eigenvector',col=pgvec,cex=0.8,pch=pgvec)" - R.append(s) - 
R.append('legend("top",legend=llist,pch=glist,col=glist,title="Sample")') - R.append('grid(nx = 10, ny = 10, col = "lightgray", lty = "dotted")') - R.append('dev.off()') - rlog,flist = RRun(rcmd=R,title=title,outdir=nfp) - print >> sys.stdout, '\n'.join(R) - print >> sys.stdout, rlog - - -def getfSize(fpath,outpath): - """ - format a nice file size string - """ - size = '' - fp = os.path.join(outpath,fpath) - if os.path.isfile(fp): - n = float(os.path.getsize(fp)) - if n > 2**20: - size = ' (%1.1f MB)' % (n/2**20) - elif n > 2**10: - size = ' (%1.1f KB)' % (n/2**10) - elif n > 0: - size = ' (%d B)' % (int(n)) - return size - - -def runEigen(): - """ run the smartpca prog - documentation follows - - smartpca.perl -i fakeped_100.eigenstratgeno -a fakeped_100.map -b fakeped_100.ind -p fakeped_100 -e fakeped_100.eigenvals -l - fakeped_100.eigenlog -o fakeped_100.eigenout - -DOCUMENTATION OF smartpca.perl program: - -This program calls the smartpca program (see ../POPGEN/README). -For this to work, the bin directory containing smartpca MUST be in your path. -See ./example.perl for a toy example. - -../bin/smartpca.perl --i example.geno : genotype file in EIGENSTRAT format (see ../CONVERTF/README) --a example.snp : snp file (see ../CONVERTF/README) --b example.ind : indiv file (see ../CONVERTF/README) --k k : (Default is 10) number of principal components to output --o example.pca : output file of principal components. Individuals removed - as outliers will have all values set to 0.0 in this file. --p example.plot : prefix of output plot files of top 2 principal components. - (labeling individuals according to labels in indiv file) --e example.eval : output file of all eigenvalues --l example.log : output logfile --m maxiter : (Default is 5) maximum number of outlier removal iterations. - To turn off outlier removal, set -m 0. --t topk : (Default is 10) number of principal components along which - to remove outliers during each outlier removal iteration. --s sigma : (Default is 6.0) number of standard deviations which an - individual must exceed, along one of topk top principal - components, in order to be removed as an outlier. - - now uses https://www.bx.psu.edu/cgi-bin/trac.cgi/galaxy/changeset/1832 - -All files can be viewed however, by making links in the primary (HTML) history item like: -<img src="display_child?parent_id=2&designation=SomeImage?" alt="Some Image"/> -<a href="display_child?parent_id=2&designation=SomeText?">Some Text</a> - - <command interpreter="python"> - rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1" - "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" - </command> - - """ - if len(sys.argv) < 9: - print 'Need an input genotype file root, a title, a temp id and the temp file path for outputs,' - print ' and the 4 integer tuning parameters k,m,t and s in order. 
Given that, will run smartpca for eigensoft'
-        sys.exit(1)
-    else:
-        print >> sys.stdout, 'rgEigPCA.py got %s' % (' '.join(sys.argv))
-    skillme = ' %s' % string.punctuation
-    trantab = string.maketrans(skillme,'_'*len(skillme))
-    progname = os.path.basename(sys.argv[0])
-    infile = sys.argv[1]
-    infpath,base_name = os.path.split(infile) # now takes precomputed or autoconverted ldreduced dataset
-    title = sys.argv[2].translate(trantab) # must replace all of these for urls containing title
-    outfile1 = sys.argv[3]
-    newfilepath = sys.argv[4]
-    try:
-        os.makedirs(newfilepath)
-    except:
-        pass
-    op = os.path.split(outfile1)[0]
-    try: # for test - needs this done
-        os.makedirs(op)
-    except:
-        pass
-    eigen_k = sys.argv[5]
-    eigen_m = sys.argv[6]
-    eigen_t = sys.argv[7]
-    eigen_s = sys.argv[8]
-    eigpca = sys.argv[9] # path to new dataset for pca results - for later adjustment
-    eigentitle = os.path.join(newfilepath,title)
-    explanations=['Samples plotted in first 2 eigenvector space','Principal components','Eigenvalues',
-                  'Smartpca log (contents shown below)']
-    rplotname = 'PCAPlot.pdf'
-    eigenexts = [rplotname, "pca.xls", "eval.xls"]
-    newfiles = ['%s_%s' % (title,x) for x in eigenexts] # produced by eigenstrat
-    rplotout = os.path.join(newfilepath,newfiles[0]) # for R plots
-    eigenouts = [x for x in newfiles]
-    eigenlogf = '%s_log.txt' % title
-    newfiles.append(eigenlogf) # so it will also appear in the links
-    lfname = outfile1
-    lf = file(lfname,'w')
-    lf.write(galhtmlprefix % progname)
-    smartCL = '%s -i %s.bed -a %s.bim -b %s.fam -o %s -p %s -e %s -l %s -k %s -m %s -t %s -s %s' % \
-        (smartpca,infile, infile, infile, eigenouts[1],'%s_eigensoftplot.pdf' % title,eigenouts[2],eigenlogf, \
-         eigen_k, eigen_m, eigen_t, eigen_s)
-    env = os.environ
-    p=subprocess.Popen(smartCL,shell=True,cwd=newfilepath)
-    retval = p.wait()
-    # copy the eigenvector output file needed for adjustment to the user's eigenstrat library directory
-    elog = file(os.path.join(newfilepath,eigenlogf),'r').read()
-    eeigen = os.path.join(newfilepath,'%s.evec' % eigenouts[1]) # need these for adjusting
-    try:
-        eigpcaRes = file(eeigen,'r').read()
-    except:
-        eigpcaRes = ''
-    file(eigpca,'w').write(eigpcaRes)
-    makePlot(eigpca=eigpca,pdfname=newfiles[0],title=title,nfp=newfilepath,rexe=rexe)
-    s = 'Output from %s run at %s<br/>\n' % (progname,timenow())
-    lf.write('<h4>%s</h4>\n' % s)
-    lf.write('newfilepath=%s, rexe=%s' % (newfilepath,rexe))
-    lf.write('(click on the image below to see a much higher quality PDF version)')
-    thumbnail = '%s.png' % newfiles[0] # foo.pdf.png - who cares? 
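-    # (the R script in makePlot writes the PDF plus a companion '<pdfname>.png',
-    # so the doubled .pdf.png extension above is expected, not an error)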
- if os.path.exists(os.path.join(newfilepath,thumbnail)): - lf.write('<table border="0" cellpadding="10" cellspacing="10"><tr><td>\n') - lf.write('<a href="%s"><img src="%s" alt="%s" hspace="10" align="left" /></a></td></tr></table><br/>\n' \ - % (newfiles[0],thumbnail,explanations[0])) - allfiles = os.listdir(newfilepath) - allfiles.sort() - sizes = [getfSize(x,newfilepath) for x in allfiles] - lallfiles = ['<li><a href="%s">%s %s</a></li>\n' % (x,x,sizes[i]) for i,x in enumerate(allfiles)] # html list - lf.write('<div class="document">All Files:<ol>%s</ol></div>' % ''.join(lallfiles)) - lf.write('<div class="document">Log %s contents follow below<p/>' % eigenlogf) - lf.write('<pre>%s</pre></div>' % elog) # the eigenlog - s = 'If you need to rerun this analysis, the command line used was\n%s\n<p/>' % (smartCL) - lf.write(s) - lf.write(galhtmlpostfix) # end galhtmlprefix div - lf.close() - - -if __name__ == "__main__": - runEigen() diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgEigPCA.xml --- a/tools/rgenetics/rgEigPCA.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,167 +0,0 @@ -<tool id="rgEigPCA1" name="Eigensoft:"> - <description>PCA Ancestry using SNP</description> - - <command interpreter="python"> - rgEigPCA.py "$i.extra_files_path/$i.metadata.base_name" "$title" "$out_file1" - "$out_file1.files_path" "$k" "$m" "$t" "$s" "$pca" - </command> - - <inputs> - - <param name="i" type="data" label="Input genotype data file" - size="120" format="ldindep" /> - <param name="title" type="text" value="Ancestry PCA" label="Title for outputs from this run" - size="80" /> - <param name="k" type="integer" value="4" label="Number of principal components to output" - size="3" /> - <param name="m" type="integer" value="0" label="Max. outlier removal iterations" - help="To turn on outlier removal, set m=5 or so. Do this if you plan on adjusting any analyses" - size="3" /> - <param name="t" type="integer" value="5" label="# principal components used for outlier removal" - size="3" /> - <param name="s" type="integer" value="6" label="#SDs for outlier removal" - help = "Any individual with SD along one of k top principal components > s will be removed as an outlier." - size="3" /> - - </inputs> - - <outputs> - <data name="out_file1" format="html" label="${title}_rgEig.html"/> - <data name="pca" format="txt" label="${title}_rgEig.txt"/> - </outputs> - -<tests> - <test> - <param name='i' value='tinywga' ftype='ldindep' > - <metadata name='base_name' value='tinywga' /> - <composite_data value='tinywga.bim' /> - <composite_data value='tinywga.bed' /> - <composite_data value='tinywga.fam' /> - <edit_attributes type='name' value='tinywga' /> - </param> - <param name='title' value='rgEigPCAtest1' /> - <param name="k" value="4" /> - <param name="m" value="2" /> - <param name="t" value="2" /> - <param name="s" value="2" /> - <output name='out_file1' file='rgtestouts/rgEigPCA/rgEigPCAtest1.html' ftype='html' compare='diff' lines_diff='195'> - <extra_files type="file" name='rgEigPCAtest1_PCAPlot.pdf' value="rgtestouts/rgEigPCA/rgEigPCAtest1_PCAPlot.pdf" compare="sim_size" delta="3000"/> - </output> - <output name='pca' file='rgtestouts/rgEigPCA/rgEigPCAtest1.txt' compare='diff'/> - </test> -</tests> - -<help> - - -**Syntax** - -- **Genotype data** is an input genotype dataset in Plink lped (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml) format. 
See below for notes -- **Title** is used to name the output files so you can remember what the outputs are for -- **Tuning parameters** are documented in the Eigensoft (http://genepath.med.harvard.edu/~reich/Software.htm) documentation - see below - - ------ - -**Summary** - -Eigensoft requires ld-reduced genotype data. -Galaxy has an automatic converter for genotype data in Plink linkage pedigree (lped) format. -For details of this generic genotype format, please see the Plink documentation at -http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml - -Reading that documentation, you'll see that the linkage pedigree format is really two related files with the same -file base name - a map and ped file - eg 'mygeno.ped' and 'mygeno.map'. -The map file has the chromosome, offset, genetic offset and snp name corresponding to each -genotype stored as separate alleles in the ped file. The ped file has family id, individual id, father id (or 0), mother id -(or 0), gender (1=male, 2=female, 0=unknown) and affection (1=unaffected, 2=affected, 0=unknown), -then two separate allele columns for each genotype. - -Once you have your data in the right format, you can upload those into your Galaxy history using the "upload" tool. - -To upload your lped data in the upload tool, choose 'lped' as the 'file format'. The tool form will change to -allow you to navigate to and select each member of the pair of ped and map files stored on your local computer -(or available at a public URL for Galaxy to grab). -Give the dataset a meaningful name (replace rgeneticsData with something more useful!) and click execute. - -When the upload is done, your new lped format dataset will appear in your history and then, -when you choose the ancestry tool, that history dataset will be available as input. - -**Warning for the Impatient** - -When you execute the tool, it will look like it has not started running for a while as the automatic converter -reduces the amount of LD - otherwise eigenstrat gives biased results. - - -**Attribution** - -This tool runs and relies on the work of many others, including the -maintainers of the Eigensoft program, and the R and -Bioconductor projects. For full attribution, source code and documentation, please see -http://genepath.med.harvard.edu/~reich/Software.htm, http://cran.r-project.org/ -and http://www.bioconductor.org/ respectively - -This implementation is a Galaxy tool wrapper around these third party applications. -It was originally designed and written for family based data from the CAMP Illumina run of 2007 by -ross lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit. - -copyright Ross Lazarus 2007 -Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html -but is about as useful as a sponge boat without EIGENSOFT pca code. - -**README from eigensoft2 distribution at http://genepath.med.harvard.edu/~reich/Software.htm** - -[rerla@beast eigensoft2]$ cat README -EIGENSOFT version 2.0, January 2008 (for Linux only) - -This is the same as our EIGENSOFT 2.0 BETA release with a few recent changes -as described at http://genepath.med.harvard.edu/~reich/New_In_EIGENSOFT.htm. - -Features of EIGENSOFT version 2.0 include: --- Keeping track of ref/var alleles in all file formats: see CONVERTF/README --- Handling data sets up to 8 billion genotypes: see CONVERTF/README --- Output SNP weightings of each principal component: see POPGEN/README - -The EIGENSOFT package implements methods from the following 2 papers: -Patterson N. et al. 
2006 PLoS Genetics in press (population structure) -Price A.L. et al. 2006 NG 38:904-9 (EIGENSTRAT stratification correction) - -See POPGEN/README for documentation of population structure programs. - -See EIGENSTRAT/README for documentation of EIGENSTRAT programs. - -See CONVERTF/README for documentation of programs for converting file formats. - - -Executables and source code: ----------------------------- -All C executables are in the bin/ directory. - -We have placed source code for all C executables in the src/ directory, -for users who wish to modify and recompile our programs. For example, to -recompile the eigenstrat program, type -"cd src" -"make eigenstrat" -"mv eigenstrat ../bin" - -Note that some of our software will only compile if your system has the -lapack package installed. (This package is used to compute eigenvectors.) -Some users may need to change "blas-3" to "blas" in the Makefile, -depending on how blas and lapack are installed. - -If cc is not available on your system, try "cp Makefile.alt Makefile" -and then recompile. - -If you have trouble compiling and running our code, try compiling and -running the pcatoy program in the src directory: -"cd src" -"make pcatoy" -"./pcatoy" -If you are unable to run the pcatoy program successfully, please contact -your system administrator for help, as this is a systems issue which is -beyond our scope. Your system administrator will be able to troubleshoot -your systems issue using this trivial program. [You can also try running -the pcatoy program in the bin directory, which we have already compiled.] -</help> -</tool> - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgFastQC.py --- a/tools/rgenetics/rgFastQC.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,149 +0,0 @@ -""" -wrapper for fastqc - -called as - <command interpreter="python"> - rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" - </command> - - - -Current release seems overly intolerant of sam/bam header strangeness -Author notified... - - -""" - -import os,sys,subprocess,optparse,shutil,tempfile -from rgutils import getFileString - -class FastQC(): - """wrapper - """ - - - def __init__(self,opts=None): - assert opts <> None - self.opts = opts - - - def run_fastqc(self): - """ - In batch mode fastqc behaves not very nicely - will write to a new folder in - the same place as the infile called [infilebasename]_fastqc - rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc - duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt - error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png - fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png - - """ - dummy,tlog = tempfile.mkstemp(prefix='rgFastQClog') - sout = open(tlog, 'w') - fastq = os.path.basename(self.opts.input) - cl = [self.opts.executable,'-o %s' % self.opts.outputdir] - if self.opts.informat in ['sam','bam']: - cl.append('-f %s' % self.opts.informat) - if self.opts.contaminants <> None : - cl.append('-c %s' % self.opts.contaminants) - cl.append(self.opts.input) - p = subprocess.Popen(' '.join(cl), shell=True, stderr=sout, stdout=sout, cwd=self.opts.outputdir) - return_value = p.wait() - sout.close() - runlog = open(tlog,'r').readlines() - os.unlink(tlog) - flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. 
eesh
        odpath = None
        for f in flist:
            d = os.path.join(self.opts.outputdir,f)
            if os.path.isdir(d):
                if d.endswith('_fastqc'):
                    odpath = d
        hpath = None
        rep = [] # bug fix: rep was unbound further down if the report could not be read
        if odpath <> None:
            try:
                hpath = os.path.join(odpath,'fastqc_report.html')
                rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
            except:
                hpath = None # bug fix: open failed, so fall through to the error report below
        if hpath == None:
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
            res += runlog
            res += ['</pre>\n',
                   'Please read the above for clues<br/>\n',
                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
            return res
        self.fix_fastqcimages(odpath)
        flist = os.listdir(self.opts.outputdir) # these have now been fixed
        excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
        flist = [x for x in flist if not x in excludefiles]
        for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in latest fastqc code - ugh
            rep[i] = rep[i].replace('Icons/','')
            rep[i] = rep[i].replace('Images/','')
        html = self.fix_fastqc(rep,flist,runlog)
        return html


    def fix_fastqc(self,rep=[],flist=[],runlog=[]):
        """ add some of our stuff to the html
        """
        bs = '</body></html>\n' # hope they don't change this
        try:
            bodyindex = rep.index(bs) # hope they don't change this
        except:
            bodyindex = len(rep) - 1
        res = []
        res.append('<table>\n')
        flist.sort()
        for i,f in enumerate(flist):
            if not(os.path.isdir(f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,getFileString(fn, self.opts.outputdir)))
        res.append('</table><p/>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n')
        fixed = rep[:bodyindex] + res + rep[bodyindex:]
        return fixed # with our additions


    def fix_fastqcimages(self,odpath):
        """ Galaxy wants everything in the same files_dir
        """
        icpath = os.path.join(odpath,'Icons')
        impath = os.path.join(odpath,'Images')
        for adir in [icpath,impath,odpath]:
            if os.path.exists(adir):
                flist = os.listdir(adir) # get all files created
                for f in flist:
                    if not os.path.isdir(os.path.join(adir,f)):
                        sauce = os.path.join(adir,f)
                        dest = os.path.join(self.opts.outputdir,f)
                        shutil.move(sauce,dest)
                os.rmdir(adir)



if __name__ == '__main__':
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    assert opts.input <> None
    assert os.path.isfile(opts.executable),'##rgFastQC.py error - cannot find executable %s' % opts.executable
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    f = FastQC(opts)
    html = f.run_fastqc()
    f = 
open(opts.htmloutput, 'w') - f.write(''.join(html)) - f.close() - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgFastQC.xml --- a/tools/rgenetics/rgFastQC.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,88 +0,0 @@ -<tool name="Fastqc: Fastqc QC" id="fastqc" version="0.1"> - <description>using FastQC from Babraham</description> - <command interpreter="python"> - rgFastQC.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" -f $input_file.ext -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -#if $contaminants.dataset and str($contaminants) > '' --c "$contaminants" -#end if - </command> - <requirements> - <requirement type="package">FastQC</requirement> - </requirements> - <inputs> - <param format="fastqsanger,fastq,bam,sam" name="input_file" type="data" label="Short read data from your current history" /> - <param name="out_prefix" value="FastQC" type="text" label="Title for the output file - to remind you what the job was for" size="80" /> - <param name="contaminants" type="data" format="tabular" optional="true" label="Contaminant list" - help="tab delimited file with 2 columns: name and sequence. For example: Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA"/> - </inputs> - <outputs> - <data format="html" name="html_file" label="${out_prefix}.html" /> - </outputs> - <tests> - <test> - <param name="input_file" value="1000gsample.fastq" /> - <param name="out_prefix" value="fastqc_out" /> - <param name="contaminants" value="fastqc_contaminants.txt" ftype="tabular" /> - <output name="html_file" file="fastqc_report.html" ftype="html" lines_diff="100"/> - </test> - </tests> - <help> - -.. class:: infomark - -**Purpose** - -FastQC aims to provide a simple way to do some quality control checks on raw -sequence data coming from high throughput sequencing pipelines. -It provides a modular set of analyses which you can use to give a quick -impression of whether your data has any problems of -which you should be aware before doing any further analysis. - -The main functions of FastQC are: - -- Import of data from BAM, SAM or FastQ files (any variant) -- Providing a quick overview to tell you in which areas there may be problems -- Summary graphs and tables to quickly assess your data -- Export of results to an HTML based permanent report -- Offline operation to allow automated generation of reports without running the interactive application - -**FastQC documentation** - -This is a Galaxy interface to the external package FastQC_. -Specific documentation on FastQC can be found on that site. -FastQC incorporates the Picard-tools_ libraries for sam/bam processing. - - .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ - .. _Picard-tools: http://picard.sourceforge.net/index.shtml - -The contaminants file parameter was borrowed from the independently developed -fastqcwrapper contributed to the Galaxy Community Tool Shed by J. Johnson. - ------ - -.. class:: infomark - -**Inputs and outputs** - -This wrapper will accept any fastq file as well as sam or bam as the primary file to check. -It will also take an optional file containing a list of contaminants information, in the form of -a tab-delimited file with 2 columns, name and sequence. 
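For example, a contaminants file might look like this (two tab separated
columns; the name and sequence pair below is the one quoted in the
parameter help above)::

    Illumina Small RNA RT Primer    CAAGCAGAAGACGGCATACGA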
- -The tool produces a single HTML output file that contains all of the results, including the following: - -- Basic Statistics -- Per base sequence quality -- Per sequence quality scores -- Per base sequence content -- Per base GC content -- Per sequence GC content -- Per base N content -- Sequence Length Distribution -- Sequence Duplication Levels -- Overrepresented sequences -- Kmer Content - -All except Basic Statistics and Overrepresented sequences are plots. - -</help> -</tool> diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM.py --- a/tools/rgenetics/rgGLM.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -#!/usr/local/bin/python -""" -# added most of the available options for linear models -# june 2009 rml -# hack to run and process a plink quantitative trait -# - -This is a wrapper for Shaun Purcell's Plink linear/logistic models for -traits, covariates and genotypes. - -It requires some judgement to interpret the findings -We need some better visualizations - manhattan plots are good. -svg with rs numbers for top 1%? - -toptable tools - truncate a gg file down to some low percentile - -intersect with other tables - eg gene expression regressions on snps - - - -""" - -import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands -from rgutils import plinke - -def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): - """ - score must be scaled to 0-1000 - - Want to make some wig tracks from each analysis - Best n -log10(p). Make top hit the window. - we use our tab output which has - rs chrom offset ADD_stat ADD_p ADD_log10p - rs3094315 1 792429 1.151 0.2528 0.597223 - - """ - - def is_number(s): - try: - float(s) - return True - except ValueError: - return False - header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) - column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] - halfwidth=100 - resfpath = os.path.join(twd,resf) - resf = open(resfpath,'r') - resfl = resf.readlines() # dumb but convenient for millions of rows - resfl = [x.split() for x in resfl] - headl = resfl[0] - resfl = resfl[1:] - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHROM',None) - rspos = headIndex.get('RS',None) - offspos = headIndex.get('OFFSET',None) - ppos = headIndex.get('ADD_LOG10P',None) - wewant = [chrpos,rspos,offspos,ppos] - if None in wewant: # missing something - logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) - return - resfl = [x for x in resfl if x[ppos] > ''] - resfl = [(float(x[ppos]),x) for x in resfl] # decorate - resfl.sort() - resfl.reverse() # using -log10 so larger is better - resfl = resfl[:topn] # truncate - pvals = [x[0] for x in resfl] # need to scale - resfl = [x[1] for x in resfl] # drop decoration - if len(pvals) == 0: - logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) - sys.exit(1) - maxp = max(pvals) # need to scale - minp = min(pvals) - prange = abs(maxp-minp) + 0.5 # fudge - scalefact = 1000.0/prange - logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) - for i,row in enumerate(resfl): - row[ppos] = '%d' % (int(scalefact*pvals[i])) - resfl[i] = row # replace - outf = file(outfname,'w') - outf.write(header) - outres = [] # need to resort into chrom offset order - for i,lrow in enumerate(resfl): - 
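        # (illustrative numbers, not from the source) the rescaling above maps
        # -log10(p) onto the 0-1000 UCSC track score range: with minp=0 and
        # maxp=7.5 the fudged prange is 8.0 and scalefact=125, so the top hit
        # is written with score int(125*7.5) = 937.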
chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), - '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) - outres.append(gff) - outres = [(x[0],int(x[3]),x) for x in outres] # decorate - outres.sort() # into chrom offset - outres=[x[2] for x in outres] # undecorate - outres = ['\t'.join(x) for x in outres] - outf.write('\n'.join(outres)) - outf.write('\n') - outf.close() - - - -def xformQassoc(resf='',outfname='',logf=None,twd='.'): - """ plink.assoc.linear to gg file -from the docs -The output per each SNP might look something like: - - CHR SNP BP A1 TEST NMISS OR STAT P - 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216 - 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217 - 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299 - 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078 - 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797 - need to transform into gg columns for each distinct test - or bed for tracks? - - """ - logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname)) - resdict = {} - rsdict = {} - markerlist = [] - # plink is "clever" - will run logistic if only 2 categories such as gender - resfs = resf.split('.') - if resfs[-1] == 'logistic': - resfs[-1] = 'linear' - else: - resfs[-1] = 'logistic' - altresf = '.'.join(resfs) - - altresfpath = os.path.join(twd,altresf) - resfpath = os.path.join(twd,resf) - try: - resf = open(resfpath,'r') - except: - try: - resf = open(altresfpath,'r') - except: - print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath) - sys.exit(1) - for lnum,row in enumerate(resf): - if lnum == 0: - headl = row.split() - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHR',None) - rspos = headIndex.get('SNP',None) - offspos = headIndex.get('BP',None) - nmisspos = headIndex.get('NMISS',None) - testpos = headIndex.get('TEST',None) - ppos = headIndex.get('P',None) - coeffpos = headIndex.get('OR',None) - if not coeffpos: - coeffpos = headIndex.get('BETA',None) - apos = headIndex.get('A1',None) - statpos = headIndex.get('STAT',None) - wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos] - if None in wewant: # missing something - logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex) - return - llen = len(headl) - else: # no Nones! 
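            # (illustration, using the sample plink output in the docstring
            # above) every distinct TEST value contributes a _stat/_p/_log10p
            # column triple, so the gg header becomes
            # rs chrom offset ADD_stat ADD_p ADD_log10p DOMDEV_stat DOMDEV_p ...
            # with one output row per marker, kept in input order.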
                ll = row.split()
                if len(ll) >= llen: # valid line
                    chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant]
                    snp = snp.strip()
                    if p <> 'NA' :
                        lp = 0.0 # bug fix: lp was unbound below when float(p) == 0
                        try:
                            ffp = float(p)
                            if ffp <> 0:
                                lp = -math.log10(ffp)
                        except:
                            lp = 0.0
                        resdict.setdefault(test,{})
                        resdict[test][snp] = (stat,p,'%f' % lp)
                        if rsdict.get(snp,None) == None:
                            rsdict[snp] = (chrom,offset)
                            markerlist.append(snp)
    # now have various tests indexed by rs
    tk = resdict.keys()
    tk.sort() # tests
    ohead = ['rs','chrom','offset']
    for t in tk: # add headers
        ohead.append('%s_stat' % t)
        ohead.append('%s_p' % t)
        ohead.append('%s_log10p' % t)
    oheads = '\t'.join(ohead)
    res = [oheads,]
    for snp in markerlist: # retain original order
        chrom,offset = rsdict[snp]
        outl = [snp,chrom,offset]
        for t in tk:
            outl += resdict[t][snp] # add stat,p for this test
        outs = '\t'.join(outl)
        res.append(outs)
    f = file(outfname,'w')
    res.append('')
    f.write('\n'.join(res))
    f.close()


if __name__ == "__main__":
    """
    <command interpreter="python">
        rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name'
        "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name'
        '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout'
    </command>
    """
    topn = 1000
    killme = string.punctuation+string.whitespace
    trantab = string.maketrans(killme,'_'*len(killme))
    if len(sys.argv) < 17:
        s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv)
        sys.stderr.write(s)
        sys.exit(1) # bug fix: was exit(0); a parameter error should signal failure
    blurb = 'rgGLM.py called with %s' % sys.argv
    print >> sys.stdout,blurb
    bfname = sys.argv[1]
    phename = sys.argv[2]
    title = sys.argv[3]
    title = title.translate(trantab) # bug fix: translate() returns a new string, the result was discarded
    predvar = sys.argv[4]
    covar = sys.argv[5].strip()
    outfname = sys.argv[6]
    logfname = sys.argv[7]
    op = os.path.split(logfname)[0]
    try: # for test - needs this done
        os.makedirs(op)
    except:
        pass
    basename = sys.argv[8].translate(trantab)
    inter = sys.argv[9] == '1'
    cond = sys.argv[10].strip()
    if cond == 'None':
        cond = ''
    gender = sys.argv[11] == '1'
    mind = sys.argv[12]
    geno = sys.argv[13]
    maf = sys.argv[14]
    logistic = sys.argv[15].strip()=='1'
    gffout = sys.argv[16]
    me = sys.argv[0]
    phepath = '%s.pphe' % phename
    twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root!
    tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log
    vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno',
           phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno,
           '--maf %s' % maf]
    if logistic:
        vcl.append('--logistic')
        resf = '%s.assoc.logistic' % basename # plink output is here we hope
    else:
        vcl.append('--linear')
        resf = '%s.assoc.linear' % basename # plink output is here we hope
    resf = os.path.join(twd,resf)
    if gender:
        vcl.append('--sex')
    if inter:
        vcl.append('--interaction')
    if covar > 'None':
        vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates
    tcfile = None
    if len(cond) > 0: # plink wants these in a file..
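    # (illustration) e.g. cond = 'rs1001 rs1002' (hypothetical rs ids) is
    # written one id per line to a temporary file which is handed to plink as
    # --condition-list, because plink reads conditioning SNPs from a file
    # rather than from the command line.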
- dummy,tcfile = tempfile.mkstemp(suffix='condlist') # - f = open(tcfile,'w') - cl = cond.split() - f.write('\n'.join(cl)) - f.write('\n') - f.close() - vcl.append('--condition-list %s' % tcfile) - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) - retval = p.wait() - if tcfile: - os.unlink(tcfile) - plinklog = file(tplog,'r').read() - logf = file(logfname,'w') - logf.write(blurb) - logf.write('\n') - logf.write('vcl=%s\n' % vcl) - xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file - makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn) - logf.write('\n') - logf.write(plinklog) - logf.close() - #shutil.rmtree(twd) # clean up - - - - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM.xml --- a/tools/rgenetics/rgGLM.xml Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,146 +0,0 @@ -<tool id="rgGLM1" name="Linear Models:" version="0.2"> - <description>for genotype data</description> - <code file="rgGLM_code.py"/> - <command interpreter="python"> - rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' - "$title" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' - '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$gffout' - </command> - - <inputs> - <page> - <param name='title' label='Title for outputs' type='text' value='GLM' size="80" /> - <param name="i" type="data" format="pbed" label="Genotype file" size="80" /> - <param name="phef" type="data" format="pphe" label="Phenotype file" size="80" - help="Dependent variable and covariates will be chosen from this file on the next page"/> - <param name="logistic" type="text" value = "0" label="1=Use a logistic model (trait must be 1/2 coded like affection)" - help="Please read the Plink documentation about this option" /> - <param name="gender" type="text" value = "0" label="1=Add a gender term to model" /> - <param name='inter' label='1=Build an interaction model - please read the docs carefully before using this' - type='text' value='0' size="1" /> - <param name="cond" type="text" area='true' size='15x20' value = "" - label="condition on this whitespace delimited rs (snp id) list" /> - <param name="mind" type="float" value = "0.1" label="Remove subjects with missing genotypes gt (eg 0.1)" - help = "Set to 1 to include all subjects in the input file" /> - <param name="geno" type="float" value = "0.1" label="Remove markers with missing genotypes gt (eg 0.1)" - help = "Set to 1 to include all markers in the input file" /> - <param name="maf" type="float" value = "0.01" label="Remove markers with MAF lt (eg 0.01) " - help = "Set to 0 to include all markers in the input file"/> - </page> - <page> - <param name="predvar" size="80" type="select" label="Dependent Trait" - dynamic_options="get_phecols(phef=phef,selectOne=1)" display="radio" multiple="false" - help="Model this characteristic in terms of subject snp genotypes - eg rare allele dosage for additive model" /> - <param name="covar" size="80" type="select" label="Covariates" - dynamic_options="get_phecols(phef=phef,selectOne=0)" multiple="true" display="checkboxes" - help="Use these phenotypes as covariates in models of snp dosage effects on the dependent trait"/> - </page> - </inputs> - - <outputs> - <data format="tabular" name="out_file1" label="${title}_rgGLM.xls"/> - <data format="txt" name="logf" label="${title}_rgGLMlog.txt" /> - <data format="gff" name="gffout" 
label="${title}_rgGLM.gff"/>
  </outputs>
<tests>
 <test>
 <param name='i' value='tinywga' ftype='pbed' >
   <metadata name='base_name' value='tinywga' />
   <composite_data value='tinywga.bim' />
   <composite_data value='tinywga.bed' />
   <composite_data value='tinywga.fam' />
   <edit_attributes type='name' value='tinywga' />
 </param>
 <param name='phef' value='tinywga' ftype='pphe' >
   <metadata name='base_name' value='tinywga' />
   <composite_data value='tinywga.pphe' />
   <edit_attributes type='name' value='tinywga' />
 </param>
 <param name='title' value='rgGLMtest1' />
 <param name='predvar' value='c1' />
 <param name='covar' value='None' />
 <param name='inter' value='0' />
 <param name='cond' value='' />
 <param name='gender' value='0' />
 <param name='mind' value='1.0' />
 <param name='geno' value='1.0' />
 <param name='maf' value='0.0' />
 <param name='logistic' value='0' />
 <output name='out_file1' file='rgGLMtest1_GLM.xls' ftype='tabular' compare="diff" />
 <output name='logf' file='rgGLMtest1_GLM_log.txt' ftype='txt' compare="diff" lines_diff='36'/>
 <output name='gffout' file='rgGLMtest1_GLM_topTable.gff' compare="diff" ftype='gff' />
 </test>
</tests>
<help>

.. class:: infomark

**Syntax**

Note this is a two page tool - you will choose the dependent trait and covariates
on the second page based on the phenotype file you choose on the first page

- **Genotype file** is the input Plink format compressed genotype (pbed) file
- **Phenotype file** is the input Plink phenotype (pphe) file with FAMID IID followed by phenotypes
- **Dependent variable** is the term on the left of the model and is chosen from the pphe columns on the second page
- **Logistic** if you are (eg) using disease status as the outcome variable (case/control) - otherwise the model is linear.
- **Covariates** are covariate terms on the right of the model, also chosen on the second page
- **Interactions** will add interactions - please be careful how you interpret these - see the Plink documentation.
- **Gender** will add gender as a model term - described in the Plink documentation
- **Condition** will condition the model on one or more specific SNP rs ids as a whitespace delimited sequence
- **Format** determines how your data will be returned to your Galaxy workspace

-----

.. class:: infomark

**Summary**

This tool will test GLM models for SNPs predicting a dependent phenotype
variable with adjustment for specified covariates.

If you don't see the genotype or phenotype data set you want here, it can be imported using
one of the methods available from the rg get data tool group.

Output format can be UCSC .bed if you want to see one column of your
results as a fully fledged UCSC genome browser track. A map file containing the chromosome and offset for each marker is
required for writing this kind of output.
Alternatively you can use .gg for the UCSC Genome Graphs tool which has all of the advantages
of the .bed track, plus a neat, visual front end that displays a lot of useful clues.
Either of these is a very useful way of quickly getting a look
at your data in full genomic context.

Finally, if you can't live without
spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary excel file. Just a plain old tab
delimited
one with a header. Fortunately excel is dumb enough to open these without much protest.

-----

..
class:: infomark - -**Attribution** - -This Galaxy tool relies on Plink (see Plinksrc_) to test GLM models. - -So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see -DOC_ - -Tool and Galaxy datatypes originally designed and written for the Rgenetics -series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) - -Copyright Ross Lazarus March 2007 -This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. - -I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. - -.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html - -.. _DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm - -</help> -</tool> - - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGLM_code.py --- a/tools/rgenetics/rgGLM_code.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -# before running the qc, need to rename various output files -import os,string,time -from galaxy import datatypes - - -def get_phecols(phef='',selectOne=0): - """return column names """ - phepath = phef.extra_files_path - phename = phef.metadata.base_name - phe = os.path.join(phepath,'%s.pphe' % phename) - head = open(phe,'r').next() - c = head.strip().split()[2:] # first are fid,iid - res = [(cname,cname,False) for cname in c] - if len(res) >= 1: - if selectOne: - x,y,z = res[0] # 0,1 = fid,iid - res[0] = (x,y,True) # set second selected - else: - res.insert(0,('None','None',True)) - else: - res = [('None','no phenotype columns found',False),] - return res - diff -r c2a356708570 -r 33c067c3ae34 tools/rgenetics/rgGRR.py --- a/tools/rgenetics/rgGRR.py Fri Mar 09 19:45:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1089 +0,0 @@ -""" -# july 2009: Need to see outliers so need to draw them last? -# could use clustering on the zscores to guess real relationships for unrelateds -# but definitely need to draw last -# added MAX_SHOW_ROWS to limit the length of the main report page -# Changes for Galaxy integration -# added more robust knuth method for one pass mean and sd -# no difference really - let's use scipy.mean() and scipy.std() instead... -# fixed labels and changed to .xls for outlier reports so can open in excel -# interesting - with a few hundred subjects, 5k gives good resolution -# and 100k gives better but not by much -# TODO remove non autosomal markers -# TODO it would be best if label had the zmean and zsd as these are what matter for -# outliers rather than the group mean/sd -# mods to rgGRR.py from channing CVS which John Ziniti has rewritten to produce SVG plots -# to make a Galaxy tool - we need the table of mean and SD for interesting pairs, the SVG and the log -# so the result should be an HTML file - -# rgIBS.py -# use a random subset of markers for a quick ibs -# to identify sample dups and closely related subjects -# try snpMatrix and plink and see which one works best for us? -# abecasis grr plots mean*sd for every subject to show clusters -# mods june 23 rml to avoid non-autosomal markers -# we seem to be distinguishing parent-child by gender - 2 clouds! 
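# (added illustrative note: these clusters follow from the 0-2 shared-allele
# scale - a duplicate pair matches at almost every comparable marker, so its
# (mean, sd) lands near (2.0, 0.0); a parent-child pair shares at least one
# allele at every marker, giving a high mean and a small sd; unrelated pairs
# scatter around mean 1.5 with the largest sd, roughly 0.7, as in the example
# tooltip values and POLYGONS regions further down this file.)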
- - -snpMatrix from David Clayton has: -ibs.stats function to calculate the identity-by-state stats of a group of samples -Description -Given a snp.matrix-class or a X.snp.matrix-class object with N samples, calculates some statistics -about the relatedness of every pair of samples within. - -Usage -ibs.stats(x) -8 ibs.stats -Arguments -x a snp.matrix-class or a X.snp.matrix-class object containing N samples -Details -No-calls are excluded from consideration here. -Value -A data.frame containing N(N - 1)/2 rows, where the row names are the sample name pairs separated -by a comma, and the columns are: -Count count of identical calls, exclusing no-calls -Fraction fraction of identical calls comparied to actual calls being made in both samples -Warning -In some applications, it may be preferable to subset a (random) selection of SNPs first - the -calculation -time increases as N(N - 1)M/2 . Typically for N = 800 samples and M = 3000 SNPs, the -calculation time is about 1 minute. A full GWA scan could take hours, and quite unnecessary for -simple applications such as checking for duplicate or related samples. -Note -This is mostly written to find mislabelled and/or duplicate samples. -Illumina indexes their SNPs in alphabetical order so the mitochondria SNPs comes first - for most -purpose it is undesirable to use these SNPs for IBS purposes. -TODO: Worst-case S4 subsetting seems to make 2 copies of a large object, so one might want to -subset before rbind(), etc; a future version of this routine may contain a built-in subsetting facility -""" -import sys,os,time,random,string,copy,optparse - -try: - set -except NameError: - from Sets import Set as set - -from rgutils import timenow,plinke - -import plinkbinJZ - - -opts = None -verbose = False - -showPolygons = False - -class NullDevice: - def write(self, s): - pass - -tempstderr = sys.stderr # save -#sys.stderr = NullDevice() -# need to avoid blather about deprecation and other strange stuff from scipy -# the current galaxy job runner assumes that -# the job is in error if anything appears on sys.stderr -# grrrrr. James wants to keep it that way instead of using the -# status flag for some strange reason. Presumably he doesn't use R or (in this case, scipy) -import numpy -import scipy -from scipy import weave - - -sys.stderr=tempstderr - - -PROGNAME = os.path.split(sys.argv[0])[-1] -X_AXIS_LABEL = 'Mean Alleles Shared' -Y_AXIS_LABEL = 'SD Alleles Shared' -LEGEND_ALIGN = 'topleft' -LEGEND_TITLE = 'Relationship' -DEFAULT_SYMBOL_SIZE = 1.0 # default symbol size -DEFAULT_SYMBOL_SIZE = 0.5 # default symbol size - -### Some colors for R/rpy -R_BLACK = 1 -R_RED = 2 -R_GREEN = 3 -R_BLUE = 4 -R_CYAN = 5 -R_PURPLE = 6 -R_YELLOW = 7 -R_GRAY = 8 - -### ... 
and some point-styles - -### -PLOT_HEIGHT = 600 -PLOT_WIDTH = 1150 - - -#SVG_COLORS = ('black', 'darkblue', 'blue', 'deepskyblue', 'firebrick','maroon','crimson') -#SVG_COLORS = ('cyan','dodgerblue','mediumpurple', 'fuchsia', 'red','gold','gray') -SVG_COLORS = ('cyan','dodgerblue','mediumpurple','forestgreen', 'lightgreen','gold','gray') -# dupe,parentchild,sibpair,halfsib,parents,unrel,unkn -#('orange', 'red', 'green', 'chartreuse', 'blue', 'purple', 'gray') - -OUTLIERS_HEADER_list = ['Mean','Sdev','ZMean','ZSdev','FID1','IID1','FID2','IID2','RelMean_M','RelMean_SD','RelSD_M','RelSD_SD','PID1','MID1','PID2','MID2','Ped'] -OUTLIERS_HEADER = '\t'.join(OUTLIERS_HEADER_list) -TABLE_HEADER='fid1_iid1\tfid2_iid2\tmean\tsdev\tzmean\tzsdev\tgeno\trelcode\tpid1\tmid1\tpid2\tmid2\n' - - -### Relationship codes, text, and lookups/mappings -N_RELATIONSHIP_TYPES = 7 -REL_DUPE, REL_PARENTCHILD, REL_SIBS, REL_HALFSIBS, REL_RELATED, REL_UNRELATED, REL_UNKNOWN = range(N_RELATIONSHIP_TYPES) -REL_LOOKUP = { - REL_DUPE: ('dupe', R_BLUE, 1), - REL_PARENTCHILD: ('parentchild', R_YELLOW, 1), - REL_SIBS: ('sibpairs', R_RED, 1), - REL_HALFSIBS: ('halfsibs', R_GREEN, 1), - REL_RELATED: ('parents', R_PURPLE, 1), - REL_UNRELATED: ('unrelated', R_CYAN, 1), - REL_UNKNOWN: ('unknown', R_GRAY, 1), - } -OUTLIER_STDEVS = { - REL_DUPE: 2, - REL_PARENTCHILD: 2, - REL_SIBS: 2, - REL_HALFSIBS: 2, - REL_RELATED: 2, - REL_UNRELATED: 3, - REL_UNKNOWN: 2, - } -# note now Z can be passed in - -REL_STATES = [REL_LOOKUP[r][0] for r in range(N_RELATIONSHIP_TYPES)] -REL_COLORS = SVG_COLORS -REL_POINTS = [REL_LOOKUP[r][2] for r in range(N_RELATIONSHIP_TYPES)] - -DEFAULT_MAX_SAMPLE_SIZE = 10000 - -REF_COUNT_HOM1 = 3 -REF_COUNT_HET = 2 -REF_COUNT_HOM2 = 1 -MISSING = 0 -MAX_SHOW_ROWS = 100 # framingham has millions - delays showing output page - so truncate and explain -MARKER_PAIRS_PER_SECOND_SLOW = 15000000.0 -MARKER_PAIRS_PER_SECOND_FAST = 70000000.0 - - -galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> -<head> -<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> -<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" /> -<title> - - - -
"""
# note: the closing title/head/body markup of galhtmlprefix above was lost in extraction

# The original SVG_HEADER was a large inline SVG + ECMAScript template
# ("Developer Works Dynamic Scatter Graph Scaling Example"). Its markup was
# lost in extraction; the recoverable content is: a legend titled
# "Given Pair Relationship", outlier strata labelled "Zscore gt 15",
# "Zscore 4 to 15" and "Zscore lt 4", an x axis "Mean Alleles Shared" with
# ticks 1.0 1.25 1.5 1.75 2.0, a y axis "SD Alleles Shared" with ticks
# 1.0 0.75 0.5 0.25 0.0, and a %s slot for the main title. It is filled in
# below via SVG_HEADER % (21 colour values, mainTitle).
SVG_HEADER = '''...''' # markup lost in extraction

# The original SVG_FOOTER held the example hover-tooltip groups
# (e.g. "unrelated: mean=1.5 +/- 0.04, sdev=0.7 +/- 0.03, npairs=1152,
# ngenos=4783 +/- 24 (min=1000, max=5000)" and "sibpairs: s1=fid1,iid1,
# s2=fid2,iid2, mean=1.82, sdev=0.7, ngeno=4487, relmean=1.85,
# relsdev=0.65") plus the closing svg elements; markup lost in extraction.
SVG_FOOTER = '''...''' # markup lost in extraction


DEFAULT_MAX_SAMPLE_SIZE = 5000 # note: overrides the 10000 assigned earlier in this module

# these re-definitions repeat the values assigned above, unchanged
REF_COUNT_HOM1 = 3
REF_COUNT_HET = 2
REF_COUNT_HOM2 = 1
MISSING = 0

MARKER_PAIRS_PER_SECOND_SLOW = 15000000
MARKER_PAIRS_PER_SECOND_FAST = 70000000

POLYGONS = {
    REL_UNRELATED: ((1.360, 0.655), (1.385, 0.730), (1.620, 0.575), (1.610, 0.505)),
    REL_HALFSIBS: ((1.630, 0.500), (1.630, 0.550), (1.648, 0.540), (1.648, 0.490)),
    REL_SIBS: ((1.660, 0.510), (1.665, 0.560), (1.820, 0.410), (1.820, 0.390)),
    REL_PARENTCHILD: ((1.650, 0.470), (1.650, 0.490), (1.750, 0.440), (1.750, 0.420)),
    REL_DUPE: ((1.970, 0.000), (1.970, 0.150), (2.000, 0.150), (2.000, 0.000)),
    }

import math # needed by distance() below; missing from the imports at the top of this module

def distance(point1, point2):
    """ Calculate the distance between two points
    """
    (x1,y1) = [float(d) for d in point1]
    (x2,y2) = [float(d) for d in point2]
    dx = abs(x1 - x2)
    dy = abs(y1 - y2)
    return math.sqrt(dx**2 + dy**2)

def point_inside_polygon(x, y, poly):
    """ Determine if a point (x,y) is inside a given polygon or not
        poly is a list of (x,y) pairs.

        Taken from: http://www.ariel.com.au/a/python-point-int-poly.html
    """
    n = len(poly)
    inside = False
    p1x,p1y = poly[0]
    for i in range(n+1):
        p2x,p2y = poly[i % n]
        if y > min(p1y,p2y):
            if y <= max(p1y,p2y):
                if x <= max(p1x,p2x):
                    if p1y != p2y:
                        xinters = (y-p1y)*(p2x-p1x)/(p2y-p1y)+p1x
                    if p1x == p2x or x <= xinters:
                        inside = not inside
        p1x,p1y = p2x,p2y
    return inside

def readMap(pedfile):
    """
    """
    mapfile = pedfile.replace('.ped', '.map')
    marker_list = []
    if os.path.exists(mapfile):
        print 'readMap: %s' % (mapfile)
        fh = file(mapfile, 'r')
        for line in fh:
            marker_list.append(line.strip().split())
        fh.close()
    print 'readMap: %s markers' % (len(marker_list))
    return marker_list

def calcMeanSD(useme):
    """
    A numerically stable algorithm is given below. It also computes the mean.
- This algorithm is due to Knuth,[1] who cites Welford.[2] - n = 0 - mean = 0 - M2 = 0 - - foreach x in data: - n = n + 1 - delta = x - mean - mean = mean + delta/n - M2 = M2 + delta*(x - mean) // This expression uses the new value of mean - end for - - variance_n = M2/n - variance = M2/(n - 1) - """ - mean = 0.0 - M2 = 0.0 - sd = 0.0 - n = len(useme) - if n > 1: - for i,x in enumerate(useme): - delta = x - mean - mean = mean + delta/(i+1) # knuth uses n+=1 at start - M2 = M2 + delta*(x - mean) # This expression uses the new value of mean - variance = M2/(n-1) # assume is sample so lose 1 DOF - sd = pow(variance,0.5) - return mean,sd - - -def doIBSpy(ped=None,basename='',outdir=None,logf=None, - nrsSamples=10000,title='title',pdftoo=0,Zcutoff=2.0): - #def doIBS(pedName, title, nrsSamples=None, pdftoo=False): - """ started with snpmatrix but GRR uses actual IBS counts and sd's - """ - repOut = [] # text strings to add to the html display - refallele = {} - tblf = '%s_table.xls' % (title) - tbl = file(os.path.join(outdir,tblf), 'w') - tbl.write(TABLE_HEADER) - svgf = '%s.svg' % (title) - svg = file(os.path.join(outdir,svgf), 'w') - - nMarkers = len(ped._markers) - if nMarkers < 5: - print sys.stderr, '### ERROR - %d is too few markers for reliable estimation in %s - terminating' % (nMarkers,PROGNAME) - sys.exit(1) - nSubjects = len(ped._subjects) - nrsSamples = min(nMarkers, nrsSamples) - if opts and opts.use_mito: - markers = range(nMarkers) - nrsSamples = min(len(markers), nrsSamples) - sampleIndexes = sorted(random.sample(markers, nrsSamples)) - else: - autosomals = ped.autosomal_indices() - nrsSamples = min(len(autosomals), nrsSamples) - sampleIndexes = sorted(random.sample(autosomals, nrsSamples)) - - print '' - print 'Getting random.sample of %s from %s total' % (nrsSamples, nMarkers) - npairs = (nSubjects*(nSubjects-1))/2 # total rows in table - newfiles=[svgf,tblf] - explanations = ['rgGRR Plot (requires SVG)','Mean by SD alleles shared - %d rows' % npairs] - # these go with the output file links in the html file - s = 'Reading genotypes for %s subjects and %s markers\n' % (nSubjects, nrsSamples) - logf.write(s) - minUsegenos = nrsSamples/2 # must have half? - nGenotypes = nSubjects*nrsSamples - stime = time.time() - emptyRows = set() - genos = numpy.zeros((nSubjects, nrsSamples), dtype=int) - for s in xrange(nSubjects): - nValid = 0 - #getGenotypesByIndices(self, s, mlist, format) - genos[s] = ped.getGenotypesByIndices(s, sampleIndexes, format='ref') - nValid = sum([1 for g in genos[s] if g]) - if not nValid: - emptyRows.add(s) - sub = ped.getSubject(s) - print 'All missing for row %d (%s)' % (s, sub) - logf.write('All missing for row %d (%s)\n' % (s, sub)) - rtime = time.time() - stime - if verbose: - print '@@Read %s genotypes in %s seconds' % (nGenotypes, rtime) - - - ### Now the expensive part. For each pair of subjects, we get the mean number - ### and standard deviation of shared alleles over all of the markers where both - ### subjects have a known genotype. Identical subjects should have mean shared - ### alleles very close to 2.0 with a standard deviation very close to 0.0. 
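    ### (illustrative arithmetic, added note) N subjects yield N*(N-1)/2
    ### pairs: 800 subjects -> 319,600 pairs; at 5,000 sampled markers that
    ### is roughly 1.6e9 genotype comparisons, which is why the per-pair
    ### inner loop below is pushed into compiled C via weave.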
- tot = nSubjects*(nSubjects-1)/2 - nprog = tot/10 - nMarkerpairs = tot * nrsSamples - estimatedTimeSlow = nMarkerpairs/MARKER_PAIRS_PER_SECOND_SLOW - estimatedTimeFast = nMarkerpairs/MARKER_PAIRS_PER_SECOND_FAST - - pairs = [] - pair_data = {} - means = [] ## Mean IBS for each pair - ngenoL = [] ## Count of comparable genotypes for each pair - sdevs = [] ## Standard dev for each pair - rels = [] ## A relationship code for each pair - zmeans = [0.0 for x in xrange(tot)] ## zmean score for each pair for the relgroup - zstds = [0.0 for x in xrange(tot)] ## zstd score for each pair for the relgrp - skip = set() - ndone = 0 ## How many have been done so far - - logf.write('Calculating %d pairs...\n' % (tot)) - logf.write('Estimated time is %2.2f to %2.2f seconds ...\n' % (estimatedTimeFast, estimatedTimeSlow)) - - t1sum = 0 - t2sum = 0 - t3sum = 0 - now = time.time() - scache = {} - _founder_cache = {} - C_CODE = """ - #include "math.h" - int i; - int sumibs = 0; - int ssqibs = 0; - int ngeno = 0; - float mean = 0; - float M2 = 0; - float delta = 0; - float sdev=0; - float variance=0; - for (i=0; i 1) { - variance = M2/(ngeno-1); - sdev = sqrt(variance); - //printf("OK: %d %3.2f %3.2f\\n", ngeno, mean, sdev); - } - //printf("%d %d %d %1.2f %1.2f\\n", ngeno, sumibs, ssqibs, mean, sdev); - result[0] = ngeno; - result[1] = mean; - result[2] = sdev; - return_val = ngeno; - """ - started = time.time() - for s1 in xrange(nSubjects): - if s1 in emptyRows: - continue - (fid1,iid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache.setdefault(s1, ped.getSubject(s1)) - - isFounder1 = _founder_cache.setdefault(s1, (did1==mid1)) - g1 = genos[s1] - - for s2 in xrange(s1+1, nSubjects): - if s2 in emptyRows: - continue - t1s = time.time() - - (fid2,iid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache.setdefault(s2, ped.getSubject(s2)) - - g2 = genos[s2] - isFounder2 = _founder_cache.setdefault(s2, (did2==mid2)) - - # Determine the relationship for this pair - relcode = REL_UNKNOWN - if (fid2 == fid1): - if iid1 == iid2: - relcode = REL_DUPE - elif (did2 == did1) and (mid2 == mid1) and did1 != mid1: - relcode = REL_SIBS - elif (iid1 == mid2) or (iid1 == did2) or (iid2 == mid1) or (iid2 == did1): - relcode = REL_PARENTCHILD - elif (str(did1) != '0' and (did2 == did1)) or (str(mid1) != '0' and (mid2 == mid1)): - relcode = REL_HALFSIBS - else: - # People in the same family should be marked as some other - # form of related. In general, these people will have a - # pretty random spread of similarity. This distinction is - # probably not very useful most of the time - relcode = REL_RELATED - else: - ### Different families - relcode = REL_UNRELATED - - t1e = time.time() - t1sum += t1e-t1s - - - ### Calculate sum(2-abs(a1-a2)) and sum((2-abs(a1-a2))**2) and count - ### the number of contributing genotypes. These values are not actually - ### calculated here, but instead are looked up in a table for speed. - ### FIXME: This is still too slow ... 
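    ### (sketch, not in the original) a pure-Python equivalent of the weave
    ### kernel invoked below, using the genotype coding 0=missing, 1..3 otherwise:
    ###   shared = [2 - abs(a - b) for a, b in zip(g1, g2) if a and b]
    ###   ngeno = len(shared)
    ###   mean = scipy.mean(shared)
    ###   sdev = scipy.std(shared)  # note: the C code uses the n-1 sample variance
    ### kept as comments so the compiled path remains the one exercised.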
- result = [0.0, 0.0, 0.0] - ngeno = weave.inline(C_CODE, ['g1', 'g2', 'nrsSamples', 'result']) - if ngeno >= minUsegenos: - _, mean, sdev = result - means.append(mean) - sdevs.append(sdev) - ngenoL.append(ngeno) - pairs.append((s1, s2)) - rels.append(relcode) - else: - skip.add(ndone) # signal no comparable genotypes for this pair - ndone += 1 - t2e = time.time() - t2sum += t2e-t1e - t3e = time.time() - t3sum += t3e-t2e - - logme = [ 'T1: %s' % (t1sum), 'T2: %s' % (t2sum), 'T3: %s' % (t3sum),'TOT: %s' % (t3e-now), - '%s pairs with no (or not enough) comparable genotypes (%3.1f%%)' % (len(skip), - float(len(skip))/float(tot)*100)] - logf.write('%s\n' % '\t'.join(logme)) - ### Calculate mean and standard deviation of scores on a per relationship - ### type basis, allowing us to flag outliers for each particular relationship - ### type - relstats = {} - relCounts = {} - outlierFiles = {} - for relCode, relInfo in REL_LOOKUP.items(): - relName, relColor, relStyle = relInfo - useme = [means[x] for x in xrange(len(means)) if rels[x] == relCode] - relCounts[relCode] = len(useme) - mm = scipy.mean(useme) - ms = scipy.std(useme) - useme = [sdevs[x] for x in xrange(len(sdevs)) if rels[x] == relCode] - sm = scipy.mean(useme) - ss = scipy.std(useme) - relstats[relCode] = {'sd':(sm,ss), 'mean':(mm,ms)} - s = 'Relstate %s (n=%d): mean(mean)=%3.2f sdev(mean)=%3.2f, mean(sdev)=%3.2f sdev(sdev)=%3.2f\n' % \ - (relName,relCounts[relCode], mm, ms, sm, ss) - logf.write(s) - - ### now fake z scores for each subject like abecasis recommends max(|zmu|,|zsd|) - ### within each group, for each pair, z=(groupmean-pairmean)/groupsd - available = len(means) - logf.write('%d pairs are available of %d\n' % (available, tot)) - ### s = '\nOutliers:\nrelationship\tzmean\tzsd\tped1\tped2\tmean\tsd\trmeanmean\trmeansd\trsdmean\trsdsd\n' - ### logf.write(s) - pairnum = 0 - offset = 0 - nOutliers = 0 - cexs = [] - outlierRecords = dict([(r, []) for r in range(N_RELATIONSHIP_TYPES)]) - zsdmax = 0 - for s1 in range(nSubjects): - if s1 in emptyRows: - continue - (fid1,iid1,did1,mid1,sex1,aff1,ok1,d_sid1,m_sid1) = scache[s1] - for s2 in range(s1+1, nSubjects): - if s2 in emptyRows: - continue - if pairnum not in skip: - ### Get group stats for this relationship - (fid2,iid2,did2,mid2,sex2,aff2,ok2,d_sid2,m_sid2) = scache[s2] - try: - r = rels[offset] - except IndexError: - logf.write('###OOPS offset %d available %d pairnum %d len(rels) %d', offset, available, pairnum, len(rels)) - notfound = ('?',('?','0','0')) - relInfo = REL_LOOKUP.get(r,notfound) - relName, relColor, relStyle = relInfo - rmm,rmd = relstats[r]['mean'] # group mean, group meansd alleles shared - rdm,rdd = relstats[r]['sd'] # group sdmean, group sdsd alleles shared - - try: - zsd = (sdevs[offset] - rdm)/rdd # distance from group mean in group sd units - except: - zsd = 1 - if abs(zsd) > zsdmax: - zsdmax = zsd # keep for sort scaling - try: - zmean = (means[offset] - rmm)/rmd # distance from group mean - except: - zmean = 1 - zmeans[offset] = zmean - zstds[offset] = zsd - pid=(s1,s2) - zrad = max(zsd,zmean) - if zrad < 4: - zrad = 2 - elif 4 < zrad < 15: - zrad = 3 # to 9 - else: # > 15 6=24+ - zrad=zrad/4 - zrad = min(zrad,6) # scale limit - zrad = max(2,max(zsd,zmean)) # as > 2, z grows - pair_data[pid] = (zmean,zsd,r,zrad) - if max(zsd,zmean) > Zcutoff: # is potentially interesting - mean = means[offset] - sdev = sdevs[offset] - outlierRecords[r].append((mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd,did1,mid1,did2,mid2)) - nOutliers += 1 - 
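            # (reading note, not in the original source: the assignment
            # zrad = max(2,max(zsd,zmean)) above overwrites the <4 / 4-15
            # radius binning computed just before it, so pair_data stores the
            # raw z magnitude; the per-point drawing code further down
            # re-bins it before sizing the svg circles.)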
                    tbl.write('%s_%s\t%s_%s\t%f\t%f\t%f\t%f\t%d\t%s\t%s\t%s\t%s\t%s\n' % \
                      (fid1, iid1, fid2, iid2, mean, sdev, zmean, zsd, ngeno, relName, did1, mid1, did2, mid2))
                offset += 1
            pairnum += 1
    logf.write('Outliers: %s\n' % (nOutliers))

    ### Write outlier files for each relationship type
    repOut.append('Outliers in tab delimited files linked above are also listed below') # heading markup lost in extraction
    lzsd = round(numpy.log10(zsdmax)) + 1
    scalefactor = 10**lzsd
    for relCode, relInfo in REL_LOOKUP.items():
        relName, _, _ = relInfo
        outliers = outlierRecords[relCode]
        if not outliers:
            continue
        outliers = [(scalefactor*int(abs(x[3])) + int(abs(x[2])), x) for x in outliers] # decorate
        outliers.sort()
        outliers.reverse() # largest deviation first
        outliers = [x[1] for x in outliers] # undecorate
        nrows = len(outliers)
        truncated = 0
        if nrows > MAX_SHOW_ROWS:
            s = '%s outlying pairs (top %d of %d) from %s' % \
              (relName, MAX_SHOW_ROWS, nrows, title) # heading markup lost in extraction
            truncated = nrows - MAX_SHOW_ROWS
        else:
            s = '%s outlying pairs (n=%d) from %s' % (relName, nrows, title) # heading markup lost in extraction
        repOut.append(s)
        fhname = '%s_rgGRR_%s_outliers.xls' % (title, relName)
        fhpath = os.path.join(outdir, fhname)
        fh = open(fhpath, 'w')
        newfiles.append(fhname)
        explanations.append('%s Outlier Pairs %s, N=%d, Cutoff SD=%f' % (relName, title, len(outliers), Zcutoff))
        fh.write(OUTLIERS_HEADER)
        s = ''.join(['%s' % x for x in OUTLIERS_HEADER_list]) # table header cell markup lost in extraction
        repOut.append('%s' % s)
        for n, rec in enumerate(outliers):
            #(mean, sdev, zmean, zsd, fid1, iid1, fid2, iid2, rmm, rmd, rdm, rdd) = rec
            s = '%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t%f\t%f\t%f\t%f\t%s\t%s\t%s\t%s\t' % tuple(rec)
            fh.write('%s%s\n' % (s, relName))
            # the matching html row template for repOut lost its markup in
            # extraction; only the field order (the same 16 values plus
            # relName) is recoverable
            s = '%f %f %f %f %s %s %s %s %f %f %f %f %s %s %s %s' % tuple(rec)
            s = '%s %s' % (s, relName)
            if n < MAX_SHOW_ROWS:
                repOut.append('%s' % s)
        if truncated > 0:
            repOut.append('WARNING: %d rows truncated - see outlier file for all %d rows' % (truncated,
              nrows))
        fh.close()
    repOut.append('') # table-closing markup lost in extraction

    ### Now, draw the plot in jpeg and svg formats, and optionally in the PDF format
    ### if requested
    logf.write('Plotting ...')
    pointColors = [REL_COLORS[rel] for rel in rels]
    pointStyles = [REL_POINTS[rel] for rel in rels]

    mainTitle = '%s (%s subjects, %d snp)' % (title, nSubjects, nrsSamples)
    svg.write(SVG_HEADER % (SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[0],SVG_COLORS[0],SVG_COLORS[1],SVG_COLORS[1],
        SVG_COLORS[2],SVG_COLORS[2],SVG_COLORS[3],SVG_COLORS[3],SVG_COLORS[4],SVG_COLORS[4],
        SVG_COLORS[5],SVG_COLORS[5],SVG_COLORS[6],SVG_COLORS[6],mainTitle))
    #rpy.r.jpeg(filename='%s.jpg' % (title), width=1600, height=1200, pointsize=12, quality=100, bg='white')
    #rpy.r.par(mai=(1,1,1,0.5))
    #rpy.r('par(xaxs="i",yaxs="i")')
    #rpy.r.plot(means, sdevs, main=mainTitle, ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
    #rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
    #rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
    #rpy.r.dev_off()

    ### We will now go through each relationship type to partition plot points
    ### into "bulk" and "outlier" groups. Bulk points will represent common
    ### mean/sdev pairs and will cover the majority of the points in the plot --
    ### they will use generic tooltip information about all of the pairs
    ### represented by that point. "Outlier" points will be uncommon pairs,
    ### with very specific information in their tooltips. It would be nice to
    ### keep the total number of plotted points in the SVG representation to
    ### ~10000 (certainly less than 100000?)
    pointMap = {}
    orderedRels = [y[1] for y in reversed(sorted([(relCounts.get(x, 0), x) for x in REL_LOOKUP.keys()]))]
    # do we really want this?
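    # (illustration, hypothetical counts) orderedRels sorts relationship
    # codes by descending pair count, e.g. {unrelated: 1152, sibpairs: 40,
    # dupe: 2} gives draw order unrelated, sibpairs, dupe - the bulk cloud is
    # painted first and the rarer classes land on top.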
I want out of zone points last and big - for relCode in orderedRels: - svgColor = SVG_COLORS[relCode] - relName, relColor, relStyle = REL_LOOKUP[relCode] - svg.write('\n' % (relName, svgColor, svgColor)) - pMap = pointMap.setdefault(relCode, {}) - nPoints = 0 - rpairs=[] - rgenos=[] - rmeans=[] - rsdevs=[] - rz = [] - for x,rel in enumerate(rels): # all pairs - if rel == relCode: - s1,s2 = pairs[x] - pid=(s1,s2) - zmean,zsd,r,zrad = pair_data[pid][:4] - rpairs.append(pairs[x]) - rgenos.append(ngenoL[x]) - rmeans.append(means[x]) - rsdevs.append(sdevs[x]) - rz.append(zrad) - ### Now add the svg point group for this relationship to the svg file - for x in range(len(rmeans)): - svgX = '%d' % ((rmeans[x] - 1.0) * PLOT_WIDTH) # changed so mean scale is 1-2 - svgY = '%d' % (PLOT_HEIGHT - (rsdevs[x] * PLOT_HEIGHT)) # changed so sd scale is 0-1 - s1, s2 = rpairs[x] - (fid1,uid1,did1,mid1,sex1,phe1,iid1,d_sid1,m_sid1) = scache[s1] - (fid2,uid2,did2,mid2,sex2,phe2,iid2,d_sid2,m_sid2) = scache[s2] - ngenos = rgenos[x] - nPoints += 1 - point = pMap.setdefault((svgX, svgY), []) - point.append((rmeans[x], rsdevs[x], fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos,rz[x])) - for (svgX, svgY) in pMap: - points = pMap[(svgX, svgY)] - svgX = int(svgX) - svgY = int(svgY) - if len(points) > 1: - mmean,dmean = calcMeanSD([p[0] for p in points]) - msdev,dsdev = calcMeanSD([p[1] for p in points]) - mgeno,dgeno = calcMeanSD([p[-1] for p in points]) - mingeno = min([p[-1] for p in points]) - maxgeno = max([p[-1] for p in points]) - svg.write("""\n""" % (svgX, svgY, relCode, mmean, dmean, msdev, dsdev, len(points), mgeno, dgeno, mingeno, maxgeno)) - else: - mean, sdev, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, ngenos, zrad = points[0][:12] - rmean = float(relstats[relCode]['mean'][0]) - rsdev = float(relstats[relCode]['sd'][0]) - if zrad < 4: - zrad = 2 - elif 4 < zrad < 9: - zrad = 3 # to 9 - else: # > 9 5=15+ - zrad=zrad/3 - zrad = min(zrad,5) # scale limit - if zrad <= 3: - svg.write('\n' % (svgX, svgY, zrad, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) - else: # highlight pairs a long way from expectation by outlining circle in red - svg.write("""\n""" % \ - (svgX, svgY, zrad, svgColor, relCode, fid1, iid1, did1, mid1, fid2, iid2, did2, mid2, mean, sdev, ngenos, rmean, rsdev)) - svg.write('\n') - - ### Create a pdf as well if indicated on the command line - ### WARNING! for framingham share, with about 50M pairs, this is a 5.5GB pdf! 
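    ### (illustrative arithmetic) the pointMap bucketing above collapses
    ### pairs that share a plot pixel: with PLOT_WIDTH=1150 and
    ### PLOT_HEIGHT=600, a pair at mean=1.53, sd=0.70 maps to
    ### svgX = int((1.53-1.0)*1150) = 609 and svgY = int(600 - 0.70*600) = 180;
    ### all pairs landing on (609,180) are drawn as a single circle with
    ### pooled tooltip statistics, keeping the SVG near the ~10000 point
    ### budget mentioned above.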
##    if pdftoo:
##        pdfname = '%s.pdf' % (title)
##        rpy.r.pdf(pdfname, 6, 6)
##        rpy.r.par(mai=(1,1,1,0.5))
##        rpy.r('par(xaxs="i",yaxs="i")')
##        rpy.r.plot(means, sdevs, main='%s, %d snp' % (title, nSamples), ylab=Y_AXIS_LABEL, xlab=X_AXIS_LABEL, cex=cexs, col=pointColors, pch=pointStyles, xlim=(0,2), ylim=(0,2))
##        rpy.r.legend(LEGEND_ALIGN, legend=REL_STATES, pch=REL_POINTS, col=REL_COLORS, title=LEGEND_TITLE)
##        rpy.r.grid(nx=10, ny=10, col='lightgray', lty='dotted')
##        rpy.r.dev_off()

    ### Draw polygons
    if showPolygons:
        # the svg group and polygon element markup in the three writes below was lost in extraction
        svg.write('\n')
        for rel, poly in POLYGONS.items():
            points = ' '.join(['%s,%s' % ((p[0]-1.0)*float(PLOT_WIDTH), (PLOT_HEIGHT - p[1]*PLOT_HEIGHT)) for p in poly])
            svg.write('\n' % (points, SVG_COLORS[rel]))
        svg.write('\n')


    svg.write(SVG_FOOTER)
    svg.close()
    return newfiles, explanations, repOut

def doIBS(n=100):
    """parse parameters from galaxy
    expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n'

    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    """
    u = """
    rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name"
    '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z'
    """
    if len(sys.argv) < 7:
        print >> sys.stdout, 'Need pbed inpath, basename, out_htmlname, outpath, title, logpath, nSNP, Zcutoff on command line please'
        print >> sys.stdout, u
        sys.exit(1)
    ts = '%s%s' % (string.punctuation, string.whitespace)
    ptran = string.maketrans(ts, '_'*len(ts))
    inpath = sys.argv[1]
    ldinpath = os.path.split(inpath)[0]
    basename = sys.argv[2]
    outhtml = sys.argv[3]
    newfilepath = sys.argv[4]
    title = sys.argv[5].translate(ptran)
    logfname = 'Log_%s.txt' % title
    logpath = os.path.join(newfilepath, logfname) # log was a child - make part of html extra_files_path zoo
    n = int(sys.argv[6])
    try:
        Zcutoff = float(sys.argv[7])
    except:
        Zcutoff = 2.0
    try:
        os.makedirs(newfilepath)
    except:
        pass
    logf = file(logpath, 'w')
    efp, ibase_name = os.path.split(inpath) # need to use these for outputs in files_path
    ped = plinkbinJZ.BPed(inpath)
    ped.parse(quick=True)
    if ped == None:
        print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (inpath, basename) # bug fix: was ldreduced, which is undefined here
        sys.exit(1)
    newfiles, explanations, repOut = doIBSpy(ped=ped, basename=basename, outdir=newfilepath,
        logf=logf, nrsSamples=n, title=title, pdftoo=0, Zcutoff=Zcutoff)
    logf.close()
    logfs = file(logpath, 'r').readlines()
    lf = file(outhtml, 'w')
    lf.write(galhtmlprefix % PROGNAME)
    # this is a mess. todo clean up - should each datatype have its own directory? Yes
    # probably. Then titles are universal - but userId libraries are separate.
    s = 'Output from %s run at %s\n' % (PROGNAME, timenow()) # heading markup lost in extraction
    lf.write('%s\n' % s) # wrapper markup lost in extraction
    fixed = ["'%s'" % x for x in sys.argv] # add quotes just in case
    s = 'If you need to rerun this analysis, the command line was\n%s\n' % (' '.join(fixed)) # markup lost in extraction
    lf.write(s)
    # various ways of displaying svg - experiments related to missing svg mimetype on test (!)
    # (the two embedding templates below lost their object/embed markup and
    # format slots in extraction; each was filled with newfiles[0],
    # PLOT_WIDTH and PLOT_HEIGHT)
    #s = """
    #
    #    """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT,newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    s = """ """ % (newfiles[0],PLOT_WIDTH,PLOT_HEIGHT)
    #s = """