Mercurial > repos > devteam > dna_filtering
changeset 0:a6f0d355b05f draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 28 Jul 2014 11:55:47 -0400 |
parents | |
children | 549d2cb4c6f2 |
files | histogram.py histogram2.png histogram2.xml plot_filter.py plotter.py test-data/histogram_in1.tabular test-data/histogram_out1.pdf tool_dependencies.xml |
diffstat | 8 files changed, 850 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# File: histogram.py
# Galaxy tool script: draw a histogram (PDF) of one numeric column of a
# tab-delimited dataset using R through rpy.
#
# Usage:
#   histogram.py in_file out_file column title xlab breaks density [frequency]
#
#Greg Von Kuster

import sys
from rpy import *

assert sys.version_info[:2] >= ( 2, 4 )


def stop_err(msg):
    """Write msg to stderr and terminate the script with a failing exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def _parse_field(line, column):
    """Extract the numeric value of one input line.

    line   -- the line with its line terminator already removed
    column -- 0-based index of the tab-separated field to read

    Returns a (value, raw) pair.  value is a float, or None when the line
    has to be skipped (blank line, '#' comment, missing column, "NA" or a
    non-numeric field).  raw is the text of the field when the column was
    present, otherwise ''.
    """
    if not line or line.startswith('#'):
        return None, ''
    fields = line.split("\t")
    try:
        raw = fields[column]
    except IndexError:
        return None, ''
    if raw.lower() == "na":
        # Missing values are skipped rather than handed to R.
        return None, raw
    try:
        return float(raw), raw
    except ValueError:
        return None, raw


def _read_column(in_fname, column):
    """Read the floats found in one column of a tab-delimited file.

    Returns (values, total_lines, skipped_lines, first_invalid_line,
    invalid_value): the list of parsed floats, the number of lines read,
    the number of lines skipped, the 1-based number of the first skipped
    line (0 if none) and the offending field text of that line ('' when
    the line was blank, a comment, or too short).
    """
    values = []
    total_lines = 0
    skipped_lines = 0
    first_invalid_line = 0
    invalid_value = ''
    in_file = open(in_fname)
    try:
        for index, line in enumerate(in_file):
            total_lines = index + 1
            value, raw = _parse_field(line.rstrip('\r\n'), column)
            if value is None:
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = index + 1
                    invalid_value = raw
            else:
                values.append(value)
    finally:
        in_file.close()
    return values, total_lines, skipped_lines, first_invalid_line, invalid_value


def main():
    """Parse the command line, read the column and plot its histogram to a PDF."""
    # Handle input params
    in_fname = sys.argv[1]
    out_fname = sys.argv[2]
    try:
        column = int(sys.argv[3]) - 1
    except (IndexError, ValueError):
        stop_err("Column not specified, your query does not contain a column of numerical data.")
    title = sys.argv[4]
    xlab = sys.argv[5]
    breaks = int(sys.argv[6])
    if breaks == 0:
        # 0 means "let R choose": pass the name of R's default rule instead.
        breaks = "Sturges"
    show_density = sys.argv[7] == "true"
    # The frequency flag is optional on the command line.
    frequency = len(sys.argv) >= 9 and sys.argv[8] == "true"

    (matrix, total_lines, skipped_lines,
     first_invalid_line, invalid_value) = _read_column(in_fname, column)

    # Decide on the data actually collected, not on the last line index:
    # comparing against the index misreported one-line datasets as empty.
    if not matrix:
        if total_lines == 0:
            stop_err("Input dataset is empty.")
        else:
            stop_err("All values in column %s are non-numeric." % sys.argv[3])

    try:
        a = r.array(matrix)
        r.pdf(out_fname, 8, 8)
        histogram = r.hist(a, probability=not frequency, main=title, xlab=xlab, breaks=breaks)
        if show_density:
            density_curve = r.density(a)
            if frequency:
                # Rescale the density to counts using the (uniform) bin width.
                # The width is taken from the first two break points, which
                # exist even when the histogram has a single bin and therefore
                # only one midpoint.
                bin_width = histogram['breaks'][1] - histogram['breaks'][0]
                scale_factor = len(matrix) * bin_width
                density_curve['y'] = [y * scale_factor for y in density_curve['y']]
            r.lines(density_curve)
        r.dev_off()
    except Exception:
        # sys.exc_info() keeps this handler valid on every Python >= 2.4.
        stop_err("%s" % str(sys.exc_info()[1]))

    print("Histogram of column %s. " % sys.argv[3])
    if skipped_lines > 0:
        print("Skipped %d invalid lines starting with line #%d, '%s'." % (skipped_lines, first_invalid_line, invalid_value))

    r.quit(save="no")


if __name__ == "__main__":
    main()
<!-- File: histogram2.xml - Galaxy tool definition wrapping histogram.py -->
<tool id="histogram_rpy" name="Histogram" version="1.0.3">
  <description>of a numeric column</description>
  <requirements>
    <requirement type="package" version="1.0.3">rpy</requirement>
    <requirement type="package" version="2.11.0">R</requirement>
  </requirements>
  <!-- Argument order must match the sys.argv positions read by histogram.py -->
  <command interpreter="python">histogram.py $input $out_file1 $numerical_column "$title" "$xlab" $breaks $density $frequency</command>
  <inputs>
    <param name="input" type="data" format="tabular" label="Dataset" help="Dataset missing? See TIP below"/>
    <param name="numerical_column" type="data_column" data_ref="input" numerical="True" label="Numerical column for x axis" />
    <param name="breaks" type="integer" size="4" value="0" label="Number of breaks (bars)"/>
    <param name="title" type="text" size="30" value="Histogram" label="Plot title"/>
    <param name="xlab" type="text" size="30" value="V1" label="Label for x axis"/>
    <param name="density" type="boolean" checked="yes" label="Include smoothed density"/>
    <param name="frequency" type="boolean" checked="no" label="Plot as frequency (counts)"/>
  </inputs>
  <outputs>
    <data format="pdf" name="out_file1" />
  </outputs>
  <tests>
    <test>
      <param name="input" value="histogram_in1.tabular" ftype="tabular"/>
      <param name="numerical_column" value="2"/>
      <param name="breaks" value="0"/>
      <param name="title" value="Histogram"/>
      <param name="xlab" value="V1"/>
      <param name="density" value="true"/>
      <param name="frequency" value="false"/>
      <output name="out_file1" file="histogram_out1.pdf"/>
    </test>
  </tests>
  <help>

.. class:: infomark

**TIP:** To remove comment lines that do not begin with a *#* character, use *Text Manipulation->Remove beginning*

.. class:: infomark

**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*

-----

**Syntax**

This tool computes a histogram of the numerical values in a column of a dataset.

- All invalid, blank and comment lines in the dataset are skipped. The number of skipped lines is displayed in the resulting history item.
- **Numerical column for x axis** - only numerical columns are possible.
- **Number of breaks (bars)** - breakpoints between histogram cells. A value of '0' lets the breaks be determined automatically.
- **Plot title** - the histogram title.
- **Label for x axis** - the label of the x axis for the histogram.
- **Include smoothed density** - if checked, a smoothed density estimate of the values is drawn as a curve on top of the histogram.
- **Plot as frequency (counts)** - if checked, the y axis shows the number of values in each bar instead of the density.

-----

**Example**

- Input file::

    1   68  4.1
    2   71  4.6
    3   62  3.8
    4   75  4.4
    5   58  3.2
    6   60  3.1
    7   67  3.8
    8   68  4.1
    9   71  4.3
    10  69  3.7

- Create a histogram on column 2 of the above dataset.

.. image:: histogram2.png

  </help>
</tool>
# File: plot_filter.py

def validate(incoming):
    """Validator for the plotting program.

    incoming -- mapping of submitted parameters; the "bins" and "col"
                entries are read with .get().

    Returns None when both values are present, convertible to int and the
    number of bins lies strictly between 1 and 100.

    Raises Exception with a user-readable message when a value is missing,
    is not a valid integer, or the number of bins is out of range.
    """
    bins = incoming.get("bins", "")
    col = incoming.get("col", "")

    if not bins or not col:
        raise Exception("You need to specify a number for bins and columns")

    try:
        bins = int(bins)
        col = int(col)
    except (TypeError, ValueError):
        # Only conversion failures are reported as invalid parameters;
        # anything else (e.g. KeyboardInterrupt) propagates untouched.
        raise Exception("Parameters are not valid numbers, columns:%s, bins:%s" % (col, bins))

    if not 1 < bins < 100:
        raise Exception("The number of bins %s must be a number between 1 and 100" % bins)
#!/usr/bin/env python
# File: plotter.py
# Galaxy tool script: histogram or scatter plot of one column of a dataset.
#
# Usage: python plotter.py input_file column bins_or_style output_file mode
#   mode 'hist'    -> argument 3 is the number of bins
#   any other mode -> argument 3 is the plot style: 'P' (points),
#                     'LP' (points joined by lines), anything else (lines)

import os
import sys

import matplotlib
matplotlib.use('Agg')  # backend is selected before pylab is imported

from pylab import *

assert sys.version_info[:2] >= ( 2, 4 )

# Maps the style code given on the command line to a matplotlib format string.
_STYLES = {'P': 'o', 'LP': 'o-'}


def stop_err(msg):
    """Write msg to stderr and terminate the script with a failing exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def _read_values(inp_file, col):
    """Return the floats in the 1-based, whitespace-separated column col.

    Blank lines and lines starting with '#' are ignored.  The script is
    stopped with an error naming the line and column when a field is
    missing or not numeric, or when the file cannot be opened.
    """
    try:
        handle = open(inp_file)
    except IOError:
        stop_err('%s' % sys.exc_info()[1])
    values = []
    count = 0
    try:
        for line in handle:
            count += 1
            line = line.strip()
            if line and line[0] != '#':
                try:
                    values.append(float(line.split()[col - 1]))
                except (ValueError, IndexError):
                    stop_err("Non numerical data at line %d, column %d" % (count, col))
    finally:
        handle.close()
    return values


def main():
    """Parse the arguments, read the column and write the plot as PNG data."""
    if len(sys.argv) != 6:
        stop_err('Usage: python histogram.py input_file column bins output_file style')

    inp_file = sys.argv[1]
    out_file = sys.argv[4]
    mode = sys.argv[5]
    HIST = mode == 'hist'

    num_bins = None
    style = None
    try:
        col = int(float(sys.argv[2]))
        if HIST:
            num_bins = int(float(sys.argv[3]))
    except (ValueError, OverflowError):
        # Report the two arguments that were actually being parsed.
        stop_err('Parameters were not numbers %s, %s' % (sys.argv[2], sys.argv[3]))
    if not HIST:
        # hack, this parameter is the plotting style for scatter plots
        style = _STYLES.get(sys.argv[3], '-')

    if HIST:
        print("Histogram on column %s (%s bins)" % (col, num_bins))
    else:
        print("Scatterplot on column %s" % col)

    values = _read_values(inp_file, col)
    if not values:
        stop_err("No numerical data found in column %d" % col)

    # plot the data
    if HIST:
        # hist() returns the bin edges, i.e. one more item than there are bins.
        counts, edges, patches = hist(values, bins=num_bins)
    else:
        plot(values, style)

    xlabel('values')
    ylabel('counts')

    if HIST:
        title('Histogram of values over column %s (%s bins)' % (col, len(edges) - 1))
    else:
        title('Scatterplot over column %s' % col)
    grid(True)

    # the plotter detects types by file extension
    png_out = out_file + '.png'  # force it to png
    savefig(png_out)

    # shuffle it back and clean up; the data is copied into out_file rather
    # than renaming the temporary file over it
    source = open(png_out, 'rb')
    try:
        data = source.read()
    finally:
        source.close()
    target = open(out_file, 'wb')
    try:
        target.write(data)
    finally:
        target.close()
    os.remove(png_out)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/histogram_in1.tabular Mon Jul 28 11:55:47 2014 -0400 @@ -0,0 +1,10 @@ +1 68 4.1 +2 71 4.6 +3 62 3.8 +4 75 4.4 +5 58 3.2 +6 60 3.1 +7 67 3.8 +8 68 4.1 +9 71 4.3 +10 69 3.7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/histogram_out1.pdf Mon Jul 28 11:55:47 2014 -0400 @@ -0,0 +1,545 @@ +%PDF-1.4 +%ρ\r +1 0 obj +<< +/CreationDate (D:20110303082028) +/ModDate (D:20110303082028) +/Title (R Graphics Output) +/Producer (R 2.11.0) +/Creator (R) +>> +endobj +2 0 obj +<< +/Type /Catalog +/Pages 3 0 R +>> +endobj +5 0 obj +<< +/Type /Page +/Parent 3 0 R +/Contents 6 0 R +/Resources 4 0 R +>> +endobj +6 0 obj +<< +/Length 7 0 R +>> +stream +1 J 1 j q +Q q +BT +0.000 0.000 0.000 rg +/F3 1 Tf 14.00 0.00 -0.00 14.00 267.78 541.45 Tm (Histogram) Tj +ET +BT +/F2 1 Tf 12.00 0.00 -0.00 12.00 295.06 18.72 Tm (V1) Tj +ET +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 12.96 275.20 Tm (Density) Tj +ET +Q q +0.000 0.000 0.000 RG +0.75 w +[] 0 d +1 J +1 j +10.00 M +77.07 73.44 m 527.73 73.44 l S +77.07 73.44 m 77.07 66.24 l S +189.73 73.44 m 189.73 66.24 l S +302.40 73.44 m 302.40 66.24 l S +415.07 73.44 m 415.07 66.24 l S +527.73 73.44 m 527.73 66.24 l S +BT +0.000 0.000 0.000 rg +/F2 1 Tf 12.00 0.00 -0.00 12.00 70.39 47.52 Tm (55) Tj +ET +BT +/F2 1 Tf 12.00 0.00 -0.00 12.00 183.06 47.52 Tm (60) Tj +ET +BT +/F2 1 Tf 12.00 0.00 -0.00 12.00 295.73 47.52 Tm (65) Tj +ET +BT +/F2 1 Tf 12.00 0.00 -0.00 12.00 408.39 47.52 Tm (70) Tj +ET +BT +/F2 1 Tf 12.00 0.00 -0.00 12.00 521.06 47.52 Tm (75) Tj +ET +59.04 89.87 m 59.04 500.53 l S +59.04 89.87 m 51.84 89.87 l S +59.04 192.53 m 51.84 192.53 l S +59.04 295.20 m 51.84 295.20 l S +59.04 397.87 m 51.84 397.87 l S +59.04 500.53 m 51.84 500.53 l S +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 78.19 Tm (0.00) Tj +ET +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 180.86 Tm (0.02) Tj +ET +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 283.52 Tm (0.04) Tj +ET +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 386.19 Tm (0.06) Tj +ET +BT +/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 488.86 Tm (0.08) Tj +ET +Q q 59.04 73.44 486.72 443.52 re W n +0.000 0.000 0.000 RG +0.75 w +[] 0 d +1 J +1 j +10.00 M +77.07 89.87 112.67 205.33 re S +189.73 89.87 
112.67 102.67 re S +302.40 89.87 112.67 410.67 re S +415.07 89.87 112.67 308.00 re S +0.00 98.74 m +0.12 98.77 l +1.67 99.23 l +3.22 99.72 l +4.77 100.22 l +6.33 100.74 l +7.88 101.29 l +9.43 101.85 l +10.98 102.44 l +12.53 103.05 l +14.09 103.69 l +15.64 104.35 l +17.19 105.03 l +18.74 105.74 l +20.30 106.47 l +21.85 107.23 l +23.40 108.02 l +24.95 108.83 l +26.51 109.67 l +28.06 110.53 l +29.61 111.43 l +31.16 112.35 l +32.72 113.31 l +34.27 114.29 l +35.82 115.30 l +37.37 116.35 l +38.92 117.42 l +40.48 118.52 l +42.03 119.66 l +43.58 120.82 l +45.13 122.02 l +46.69 123.24 l +48.24 124.51 l +49.79 125.80 l +51.34 127.11 l +52.90 128.48 l +54.45 129.85 l +56.00 131.28 l +57.55 132.73 l +59.11 134.20 l +60.66 135.72 l +62.21 137.25 l +63.76 138.83 l +65.31 140.43 l +66.87 142.07 l +68.42 143.73 l +69.97 145.41 l +71.52 147.14 l +73.08 148.89 l +74.63 150.66 l +76.18 152.47 l +77.73 154.29 l +79.29 156.15 l +80.84 158.02 l +82.39 159.93 l +83.94 161.85 l +85.50 163.79 l +87.05 165.77 l +88.60 167.75 l +90.15 169.76 l +91.70 171.78 l +93.26 173.82 l +94.81 175.88 l +96.36 177.95 l +97.91 180.03 l +99.47 182.13 l +101.02 184.23 l +102.57 186.35 l +104.12 188.47 l +105.68 190.60 l +107.23 192.73 l +108.78 194.87 l +110.33 197.01 l +111.89 199.14 l +113.44 201.28 l +114.99 203.42 l +116.54 205.55 l +118.09 207.67 l +119.65 209.79 l +121.20 211.89 l +122.75 213.99 l +124.30 216.08 l +125.86 218.15 l +127.41 220.21 l +128.96 222.24 l +130.51 224.27 l +132.07 226.26 l +133.62 228.24 l +135.17 230.20 l +136.72 232.13 l +138.28 234.04 l +139.83 235.91 l +141.38 237.77 l +142.93 239.59 l +144.48 241.37 l +146.04 243.14 l +147.59 244.85 l +149.14 246.54 l +150.69 248.19 l +152.25 249.80 l +153.80 251.39 l +155.35 252.91 l +156.90 254.41 l +158.46 255.87 l +160.01 257.28 l +161.56 258.66 l +163.11 259.98 l +164.67 261.28 l +166.22 262.51 l +167.77 263.71 l +169.32 264.88 l +170.87 265.98 l +172.43 267.06 l +173.98 268.07 l +175.53 269.05 l +177.08 269.98 l +178.64 270.87 l 
+180.19 271.73 l +181.74 272.52 l +183.29 273.28 l +184.85 274.00 l +186.40 274.67 l +187.95 275.32 l +189.50 275.90 l +191.06 276.47 l +192.61 276.99 l +194.16 277.47 l +195.71 277.93 l +197.26 278.33 l +198.82 278.72 l +200.37 279.07 l +201.92 279.39 l +203.47 279.69 l +205.03 279.96 l +206.58 280.21 l +208.13 280.43 l +209.68 280.63 l +211.24 280.82 l +212.79 280.99 l +214.34 281.15 l +215.89 281.29 l +217.45 281.43 l +219.00 281.56 l +220.55 281.69 l +222.10 281.81 l +223.65 281.93 l +225.21 282.06 l +226.76 282.19 l +228.31 282.34 l +229.86 282.48 l +231.42 282.65 l +232.97 282.83 l +234.52 283.03 l +236.07 283.25 l +237.63 283.49 l +239.18 283.77 l +240.73 284.06 l +242.28 284.39 l +243.84 284.75 l +245.39 285.15 l +246.94 285.59 l +248.49 286.06 l +250.04 286.59 l +251.60 287.15 l +253.15 287.76 l +254.70 288.42 l +256.25 289.12 l +257.81 289.90 l +259.36 290.71 l +260.91 291.59 l +262.46 292.53 l +264.02 293.50 l +265.57 294.58 l +267.12 295.68 l +268.67 296.87 l +270.23 298.11 l +271.78 299.40 l +273.33 300.80 l +274.88 302.22 l +276.43 303.74 l +277.99 305.31 l +279.54 306.94 l +281.09 308.66 l +282.64 310.42 l +284.20 312.27 l +285.75 314.17 l +287.30 316.14 l +288.85 318.19 l +290.41 320.27 l +291.96 322.45 l +293.51 324.67 l +295.06 326.95 l +296.62 329.30 l +298.17 331.68 l +299.72 334.16 l +301.27 336.66 l +302.82 339.22 l +304.38 341.83 l +305.93 344.47 l +307.48 347.18 l +309.03 349.91 l +310.59 352.69 l +312.14 355.51 l +313.69 358.34 l +315.24 361.22 l +316.80 364.12 l +318.35 367.04 l +319.90 369.99 l +321.45 372.94 l +323.01 375.91 l +324.56 378.89 l +326.11 381.87 l +327.66 384.86 l +329.21 387.84 l +330.77 390.81 l +332.32 393.78 l +333.87 396.72 l +335.42 399.66 l +336.98 402.57 l +338.53 405.45 l +340.08 408.31 l +341.63 411.12 l +343.19 413.90 l +344.74 416.64 l +346.29 419.33 l +347.84 421.99 l +349.39 424.56 l +350.95 427.11 l +352.50 429.58 l +354.05 431.98 l +355.60 434.34 l +357.16 436.60 l +358.71 438.81 l +360.26 440.92 l +361.81 
442.96 l +363.37 444.93 l +364.92 446.79 l +366.47 448.60 l +368.02 450.27 l +369.58 451.88 l +371.13 453.39 l +372.68 454.78 l +374.23 456.12 l +375.78 457.29 l +377.34 458.41 l +378.89 459.40 l +380.44 460.28 l +381.99 461.09 l +383.55 461.74 l +385.10 462.32 l +386.65 462.76 l +388.20 463.10 l +389.76 463.35 l +391.31 463.45 l +392.86 463.48 l +394.41 463.35 l +395.97 463.14 l +397.52 462.83 l +399.07 462.36 l +400.62 461.84 l +402.17 461.16 l +403.73 460.40 l +405.28 459.53 l +406.83 458.53 l +408.38 457.47 l +409.94 456.25 l +411.49 454.96 l +413.04 453.57 l +414.59 452.06 l +416.15 450.50 l +417.70 448.78 l +419.25 447.01 l +420.80 445.13 l +422.36 443.17 l +423.91 441.14 l +425.46 438.99 l +427.01 436.79 l +428.56 434.48 l +430.12 432.11 l +431.67 429.68 l +433.22 427.15 l +434.77 424.58 l +436.33 421.92 l +437.88 419.21 l +439.43 416.44 l +440.98 413.60 l +442.54 410.73 l +444.09 407.78 l +445.64 404.80 l +447.19 401.77 l +448.75 398.69 l +450.30 395.58 l +451.85 392.41 l +453.40 389.23 l +454.95 386.00 l +456.51 382.75 l +458.06 379.48 l +459.61 376.16 l +461.16 372.84 l +462.72 369.49 l +464.27 366.13 l +465.82 362.75 l +467.37 359.36 l +468.93 355.96 l +470.48 352.55 l +472.03 349.14 l +473.58 345.72 l +475.14 342.30 l +476.69 338.88 l +478.24 335.46 l +479.79 332.05 l +481.34 328.64 l +482.90 325.24 l +484.45 321.85 l +486.00 318.47 l +487.55 315.10 l +489.11 311.75 l +490.66 308.41 l +492.21 305.08 l +493.76 301.78 l +495.32 298.49 l +496.87 295.22 l +498.42 291.97 l +499.97 288.74 l +501.53 285.54 l +503.08 282.35 l +504.63 279.19 l +506.18 276.06 l +507.73 272.94 l +509.29 269.86 l +510.84 266.79 l +512.39 263.76 l +513.94 260.76 l +515.50 257.77 l +517.05 254.82 l +518.60 251.89 l +520.15 249.00 l +521.71 246.13 l +523.26 243.28 l +524.81 240.47 l +526.36 237.68 l +527.92 234.93 l +529.47 232.20 l +531.02 229.50 l +532.57 226.83 l +534.12 224.18 l +535.68 221.58 l +537.23 218.99 l +538.78 216.43 l +540.33 213.91 l +541.89 211.40 l +543.44 208.94 l 
+544.99 206.49 l +546.54 204.08 l +548.10 201.69 l +549.65 199.33 l +551.20 197.01 l +552.75 194.70 l +554.31 192.43 l +555.86 190.18 l +557.41 187.96 l +558.96 185.77 l +560.51 183.60 l +562.07 181.48 l +563.62 179.36 l +565.17 177.28 l +566.72 175.23 l +568.28 173.20 l +569.83 171.21 l +571.38 169.24 l +572.93 167.30 l +574.49 165.38 l +576.00 163.54 l +S +Q +endstream +endobj +7 0 obj +7425 +endobj +3 0 obj +<< +/Type /Pages +/Kids [ +5 0 R +] +/Count 1 +/MediaBox [0 0 576 576] +>> +endobj +4 0 obj +<< +/ProcSet [/PDF /Text] +/Font <</F2 9 0 R /F3 10 0 R >> +/ExtGState << >> +>> +endobj +8 0 obj +<< +/Type /Encoding +/BaseEncoding /WinAnsiEncoding +/Differences [ 45/minus 96/quoteleft +144/dotlessi /grave /acute /circumflex /tilde /macron /breve /dotaccent +/dieresis /.notdef /ring /cedilla /.notdef /hungarumlaut /ogonek /caron /space] +>> +endobj +9 0 obj << +/Type /Font +/Subtype /Type1 +/Name /F2 +/BaseFont /Helvetica +/Encoding 8 0 R +>> endobj +10 0 obj << +/Type /Font +/Subtype /Type1 +/Name /F3 +/BaseFont /Helvetica-Bold +/Encoding 8 0 R +>> endobj +xref +0 11 +0000000000 65535 f +0000000021 00000 n +0000000164 00000 n +0000007791 00000 n +0000007874 00000 n +0000000213 00000 n +0000000293 00000 n +0000007771 00000 n +0000007966 00000 n +0000008223 00000 n +0000008319 00000 n +trailer +<< +/Size 11 +/Info 1 0 R +/Root 2 0 R +>> +startxref +8421 +%%EOF
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Jul 28 11:55:47 2014 -0400 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="rpy" version="1.0.3"> + <repository changeset_revision="82170c94ca7c" name="package_rpy_1_0_3" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="R" version="2.11.0"> + <repository changeset_revision="5824d2b3bc8b" name="package_r_2_11_0" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>